In [46]:
import numpy as np

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

In [2]:
# The four newsgroups used for both the training and the test split below.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

In [4]:
# Load the training split restricted to `categories`; the fixed seed makes
# the shuffled order repeatable between runs.
twenty_train = fetch_20newsgroups(
    subset='train', categories=categories, shuffle=True, random_state=42
)

No handlers could be found for logger "sklearn.datasets.twenty_newsgroups"


In [27]:
# Sanity-check the import: show the first three lines of the first post,
# followed by the name of the newsgroup it is labelled with.
first_post_head = twenty_train.data[0].split('\n')[:3]
print('\n'.join(first_post_head))
print(twenty_train.target_names[twenty_train.target[0]])

# Newsgroup names of the first ten posts, one per line.
print('\n'.join(twenty_train.target_names[t] for t in twenty_train.target[:10]))

From: sd345@city.ac.uk (Michael Collier)
Subject: Converting images to HP LaserJet III?
Nntp-Posting-Host: hampton
comp.graphics


In [16]:
# Vectorizer that turns raw documents into token-count features (default settings).
count_vect = CountVectorizer()

In [17]:
# Learn the vocabulary from the training posts and build their document-term count matrix.
X_train_counts = count_vect.fit_transform(twenty_train.data)

In [21]:
# Dimensions of the count matrix: (number of documents, vocabulary size).
X_train_counts.shape

(2257, 35788)

In [22]:
# Column index the fitted vocabulary assigns to the term 'algorithm'
# (`.get` yields None if the term was never seen).
count_vect.vocabulary_.get(u'algorithm')

4690

In [25]:
# Fit the TF-IDF weighting on the raw counts, then re-weight that same matrix.
tfidf_transformer = TfidfTransformer().fit(X_train_counts)
X_train_tfidf = tfidf_transformer.transform(X_train_counts)

In [26]:
# Dimensions of the TF-IDF matrix; they match the count matrix it was built from.
# (The matrix created in the previous cell is `X_train_tfidf`; `X_train_tf` is
# never defined and raises NameError on a fresh kernel.)
X_train_tfidf.shape

(2257, 35788)

In [28]:
# Train a multinomial naive Bayes classifier on the TF-IDF features.
clf = MultinomialNB()
clf.fit(X_train_tfidf, twenty_train.target)

In [29]:
# Classify two unseen snippets. They must go through the *already fitted*
# vectorizer and TF-IDF transformer (transform only, no refit) so that the
# feature columns line up with what the classifier was trained on.
docs_new = ['God is love', 'OpenGL on the GPU is fast']
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
predicted = clf.predict(X_new_tfidf)

# Show each snippet next to the name of its predicted newsgroup.
for doc, category in zip(docs_new, predicted):
    predicted_name = twenty_train.target_names[category]
    print('{} => {}'.format(doc, predicted_name))

In [33]:
# Chain vectorizing, TF-IDF weighting and the naive Bayes classifier into a
# single estimator so raw text can be passed straight to fit/predict.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

In [34]:
# Train every stage of the pipeline on the raw training posts and their labels.
# The result of `fit` is bound back to `text_clf`, so the cell shows no output.
text_clf = text_clf.fit(twenty_train.data, twenty_train.target)

In [36]:
# Load the held-out test split for the same categories and measure accuracy,
# i.e. the fraction of test posts whose predicted label equals the true one.
twenty_test = fetch_20newsgroups(
    subset='test', categories=categories, shuffle=True, random_state=42
)
docs_test = twenty_test.data
predicted = text_clf.predict(docs_test)
(predicted == twenty_test.target).mean()

0.83488681757656458

In [41]:
# Retrain the model using a linear SVM (hinge loss, L2 penalty) fitted by SGD.
# `n_iter` was deprecated in scikit-learn 0.19 and removed in 0.21, so the
# epoch count is given as `max_iter`; `tol=None` disables early stopping so
# exactly five passes over the data are made, as `n_iter=5` did.
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='hinge', penalty='l2',
                          alpha=1e-3, max_iter=5, tol=None,
                          random_state=42))
])

In [42]:
# Fit the SVM pipeline on the training posts and report test-set accuracy.
# The fit result is bound to a throwaway name; only the accuracy is displayed.
_ = text_clf.fit(twenty_train.data, twenty_train.target)
predicted = text_clf.predict(docs_test)
(predicted == twenty_test.target).mean()

0.9127829560585885

In [44]:
# Per-class precision, recall, F1 and support for the latest predictions.
print(metrics.classification_report(
    y_true=twenty_test.target,
    y_pred=predicted,
    target_names=twenty_test.target_names,
))

                        precision    recall  f1-score   support

           alt.atheism       0.95      0.81      0.87       319
         comp.graphics       0.88      0.97      0.92       389
               sci.med       0.94      0.90      0.92       396
soc.religion.christian       0.90      0.95      0.93       398

           avg / total       0.92      0.91      0.91      1502



In [45]:
# Confusion matrix: rows are true classes, columns are predicted classes.
metrics.confusion_matrix(y_true=twenty_test.target, y_pred=predicted)

array([[258,  11,  15,  35],
       [  4, 379,   3,   3],
       [  5,  33, 355,   3],
       [  5,  10,   4, 379]])

In [47]:
# Parameter tuning using grid search

# Search grid. Keys follow the `<pipeline step>__<parameter>` convention so
# each entry is routed to the matching stage of `text_clf`.
parameters = dict(
    vect__ngram_range=[(1, 1), (1, 2)],  # unigrams only vs. unigrams + bigrams
    tfidf__use_idf=(True, False),        # with / without IDF re-weighting
    clf__alpha=(1e-2, 1e-3),             # regularisation strength of the classifier
)

# Exhaustive search over `parameters` for the current pipeline; n_jobs=-1
# lets the search run its fits in parallel on all available cores.
gs_clf = GridSearchCV(estimator=text_clf, param_grid=parameters, n_jobs=-1)

In [48]:
# Run the grid search on only the first 400 training posts and their labels
# (presumably a subset chosen to keep the search quick -- confirm).
gs_clf = gs_clf.fit(twenty_train.data[:400], twenty_train.target[:400])

In [49]:
# Classify one new snippet with the fitted search object and translate the
# predicted class index into its newsgroup name.
twenty_train.target_names[gs_clf.predict(['God is love'])[0]]

'soc.religion.christian'

In [50]:
# Mean cross-validated score achieved by the best parameter combination.
gs_clf.best_score_

0.90000000000000002

In [51]:
# List the winning value for every searched parameter, in alphabetical order.
for param_name in sorted(parameters):
    best_value = gs_clf.best_params_[param_name]
    print('{}: {}'.format(param_name, best_value))

clf__alpha: 0.001
tfidf__use_idf: True
vect__ngram_range: (1, 1)


In [52]:
# Full per-candidate results of the search: fit/score timings, test and train
# scores, and the parameter values tried. The dict is large, so expect long output.
gs_clf.cv_results_

{'mean_fit_time': array([ 0.28681056,  0.97328528,  0.23728005,  0.88248459,  0.25288232,
         0.88470101,  0.25161171,  0.7535766 ]),
 'mean_score_time': array([ 0.13608511,  0.26327976,  0.11387897,  0.23682404,  0.09485873,
         0.236655  ,  0.09347471,  0.18427698]),
 'mean_test_score': array([ 0.8775,  0.875 ,  0.765 ,  0.78  ,  0.9   ,  0.89  ,  0.7675,  0.81  ]),
 'mean_train_score': array([ 0.99374372,  1.        ,  0.94123886,  0.97623272,  1.        ,
         1.        ,  0.98499057,  1.        ]),
 'param_clf__alpha': masked_array(data = [0.01 0.01 0.01 0.01 0.001 0.001 0.001 0.001],
              mask = [False False False False False False False False],
        fill_value = ?),
 'param_tfidf__use_idf': masked_array(data = [True True False False True True False False],
              mask = [False False False False False False False False],
        fill_value = ?),
 'param_vect__ngram_range': masked_array(data = [(1, 1) (1, 2) (1, 1) (1, 2) (1, 1) (1, 2) (1, 1) (1, 2