In [13]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups

newsgroups_train = fetch_20newsgroups(subset='train')
X_train = newsgroups_train.data
y_train = newsgroups_train.target
targets = newsgroups_train.target_names
print(targets)

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

vectorizer = TfidfVectorizer(min_df=3, max_features=None,
                                token_pattern=r'\w+',
                                strip_accents='unicode', analyzer='word',
                                ngram_range=(1, 2), use_idf=1, smooth_idf=1, sublinear_tf=1,
                                stop_words='english')

clf = LogisticRegression(C=1000, multi_class="auto")
pipeline = make_pipeline(vectorizer, clf)
pipeline.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Pipeline(steps=[('tfidfvectorizer',
                 TfidfVectorizer(min_df=3, ngram_range=(1, 2), smooth_idf=1,
                                 stop_words='english', strip_accents='unicode',
                                 sublinear_tf=1, token_pattern='\\w+',
                                 use_idf=1)),
                ('logisticregression', LogisticRegression(C=1000))])

In [15]:
from sklearn.metrics import accuracy_score, classification_report, f1_score
newsgroups_test = fetch_20newsgroups(subset='test')
X_test = newsgroups_test.data
y_test = newsgroups_test.target
y_pred = pipeline.predict(X_test)

print('ACCURACY :', accuracy_score(y_pred, y_test))
print('F1       :', f1_score(y_pred, y_test, average='macro'))

ACCURACY : 0.8619224641529474
F1       : 0.8571851807862799


In [16]:
def predict(text):
    idx = pipeline.predict([text])[0]
    return targets[idx]

In [17]:
predict('Windows is an operating system')

'comp.os.ms-windows.misc'

In [18]:
predict('I sell my soul')

'misc.forsale'

In [19]:
predict('Which is the fastest car?')

'rec.autos'

In [20]:
predict('Which is the fastest bike?')

'rec.motorcycles'

In [23]:
#from sklearn.externals import joblib
#from joblib import dump, load
import joblib
!rm -rf models
!mkdir models/
joblib.dump(pipeline, 'models/pipeline.pkl')
joblib.dump(targets, 'models/targets.pkl')

['models/targets.pkl1']

In [22]:
joblib.load('models/pipeline.pkl')

Pipeline(steps=[('tfidfvectorizer',
                 TfidfVectorizer(min_df=3, ngram_range=(1, 2), smooth_idf=1,
                                 stop_words='english', strip_accents='unicode',
                                 sublinear_tf=1, token_pattern='\\w+',
                                 use_idf=1)),
                ('logisticregression', LogisticRegression(C=1000))])