In [1]:
from __future__ import print_function

import numpy as np

from sklearn import __version__ as sklearn_version
print('Sklearn version:', sklearn_version)

Sklearn version: 0.19.1


# The data

The 20 newsgroups dataset comprises around 18000 newsgroups posts on 20 topics split in two subsets: one for training (or development) and the other one for testing (or for performance evaluation). The split between the train and test set is based upon a messages posted before and after a specific date.


In [2]:
from sklearn.datasets import fetch_20newsgroups

categories = ['alt.atheism', 'soc.religion.christian',
              'comp.graphics', 'sci.med']

twenty_train = fetch_20newsgroups(subset='train',
                 remove=('headers', 'footers', 'quotes'),
                 categories=categories, shuffle=True, random_state=42)


## Build a pipeline

In [3]:
#Define the pipeline

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

text_clf = Pipeline([('vect', CountVectorizer(max_df=.95, min_df=2, max_features=5000, stop_words='english')),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB())
                    ])

# Fit all the pipeline
text_clf.fit(twenty_train.data, twenty_train.target)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.95, max_features=5000, min_df=2,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
       ...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [4]:
#Evaluate test data
twenty_test = fetch_20newsgroups(subset='test',
                    remove=('headers', 'footers', 'quotes'),
                    categories=categories, 
                    shuffle=True, random_state=42)

predicted = text_clf.predict(twenty_test.data)

print('Test accuracy:', np.mean(predicted == twenty_test.target))

Test accuracy: 0.7989347536617842


## Change classifier in the pipeline
    - LinearSVC
    - k-NN
    - Random forest

In [5]:
from sklearn.svm import LinearSVC
text_clf_svm = Pipeline([('vect', CountVectorizer(max_df=0.95, min_df=2, max_features=5000, stop_words='english')),
                     ('tfidf', TfidfTransformer()),
                     ('clf', LinearSVC())
                    ])

#Fit
_ = text_clf_svm.fit(twenty_train.data, twenty_train.target)

# Predict
predicted = text_clf_svm.predict(twenty_test.data)

# Evaluate accuracy
print('Test accuracy:', np.mean(predicted == twenty_test.target))        

Test accuracy: 0.8089214380825566


In [6]:
from sklearn.neighbors import KNeighborsClassifier

text_clf = Pipeline([('vect', CountVectorizer(max_df=0.95, min_df=2, max_features=5000, stop_words='english')),
                     ('tfidf', TfidfTransformer()),
                     ('clf', KNeighborsClassifier())
                    ])

_ = text_clf.fit(twenty_train.data, twenty_train.target)
predicted = text_clf.predict(twenty_test.data)
print('Test accuracy:', np.mean(predicted == twenty_test.target))

Test accuracy: 0.27363515312916115


In [7]:
from sklearn.ensemble import RandomForestClassifier

text_clf = Pipeline([('vect', CountVectorizer(max_df=0.95, min_df=2, max_features=5000, stop_words='english')),
                     ('tfidf', TfidfTransformer()),
                     ('clf', RandomForestClassifier())
                    ])

_ = text_clf.fit(twenty_train.data, twenty_train.target)
predicted = text_clf.predict(twenty_test.data)
print('Test accuracy:', np.mean(predicted == twenty_test.target))

Test accuracy: 0.6904127829560586


## Use features from a factorization instead the provided by the tf-idf

In [8]:
# Text preprocessing, tokenizing and filtering of stopwords
from sklearn.feature_extraction.text import CountVectorizer

tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                max_features=5000,
                                stop_words='english')
X_train_counts = tf_vectorizer.fit_transform(twenty_train.data)
X_train_counts.shape

(2257, 5000)

In [9]:
%%time

from sklearn.decomposition import LatentDirichletAllocation

n_components = 6
n_top_words = 20

lda = LatentDirichletAllocation(n_components=n_components,
                                max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
lda.fit(X_train_counts)

Wall time: 3.53 s


In [10]:
lda.transform(X_train_counts).shape

(2257, 6)

In [11]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()

tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)

Topic #0:
church pope catholic marriage authority married orthodox canon schism mass liturgy bishop ceremony st churches catholics does priest jurisdiction coptic
Topic #1:
image file jpeg use program files images gif color know format does thanks graphics software using version bit available like
Topic #2:
edu com graphics mail send pub keyboard ftp data computer information cs systems software ca faq available gov contact pc
Topic #3:
god people think don jesus just does believe know say like time bible way things good true life christian question
Topic #4:
health use medical years people disease food msg new patients like don doctor research time 1993 10 day know just
Topic #5:
banks gordon skepticism edu soon pitt geb intellect chastity n3jxp dsl shameful cadre surrender father spirit son holy int col



## Pipeline with factorization

In [12]:
%%time

from sklearn.neighbors import KNeighborsClassifier

text_lda_knn = Pipeline([('vect', CountVectorizer(max_df=0.95, min_df=2, max_features=10000, stop_words='english')),
                         ('lda', LatentDirichletAllocation(n_components=150,
                                                           max_iter=15,
                                                           learning_method='online',
                                                           learning_offset=200.,
                                                           random_state=0)),
                         ('clf', KNeighborsClassifier(n_neighbors=10))
                        ])
                         
_ = text_lda_knn.fit(twenty_train.data, twenty_train.target)
predicted = text_lda_knn.predict(twenty_test.data)
print('Test accuracy:', np.mean(predicted == twenty_test.target))

Test accuracy: 0.7003994673768309
Wall time: 56 s


In [13]:
%%time

from sklearn.ensemble import RandomForestClassifier

text_lda_rf = Pipeline([('vect', CountVectorizer(max_df=0.95, min_df=2, max_features=10000, stop_words='english')),
                         ('lda', LatentDirichletAllocation(n_components=150,
                                                           max_iter=15,
                                                           learning_method='online',
                                                           learning_offset=200.,
                                                           random_state=0)),
                         ('clf', RandomForestClassifier()),
                        ])
                         
_ = text_lda_rf.fit(twenty_train.data, twenty_train.target)

predicted = text_lda_rf.predict(twenty_test.data)
print('Test accuracy:', np.mean(predicted == twenty_test.target))

Test accuracy: 0.6731025299600533
Wall time: 56.3 s


## Optimize a pipeline

In [14]:
%%time

from sklearn.model_selection import RandomizedSearchCV

# Define estimator. No parameters of the search
clf = Pipeline([('vect', CountVectorizer(max_df=.95, min_df=2)),
                ('tfidf', TfidfTransformer()), # tf-idf
                ('clf', LinearSVC()), # LinearSVC
                ])

# Specify parameters and distributions to sample from
# Parameters of pipelines can be set using ‘__’ separated parameter names:
param_dist = {"vect__max_features": [1000, 2500, 5000, 7500, 10000, None], 
              "vect__stop_words": ['english', None], 
              "clf__C": [.1, .5, 1., 1.5, 2.]}

# Define randomized search
n_iter_search = 10
random_search = RandomizedSearchCV(clf, param_distributions=param_dist, n_iter=n_iter_search, return_train_score=True)

# Run the randomized search
random_search.fit(twenty_train.data, twenty_train.target)

print("Done!")

Done!
Wall time: 21.5 s


In [15]:
# Load dictionary of search results to a Pandas dataframe
import pandas as pd

df_cv_results = pd.DataFrame.from_dict(random_search.cv_results_)
df_cv_results

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_clf__C,param_vect__max_features,param_vect__stop_words,params,rank_test_score,split0_test_score,split0_train_score,split1_test_score,split1_train_score,split2_test_score,split2_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
0,0.345919,0.115975,0.787328,0.977182,1.5,1000.0,,"{'vect__stop_words': None, 'vect__max_features...",8,0.802125,0.975399,0.791501,0.978723,0.768309,0.977424,0.010835,0.009997,0.014114,0.001368
1,0.30717,0.11129,0.868409,0.9825,0.5,10000.0,english,"{'vect__stop_words': 'english', 'vect__max_fea...",1,0.883134,0.983378,0.855246,0.984043,0.866844,0.98008,0.011461,0.003739,0.011444,0.001733
2,0.34426,0.109288,0.797519,0.961011,0.5,1000.0,english,"{'vect__stop_words': 'english', 'vect__max_fea...",7,0.803453,0.957447,0.798141,0.966755,0.790945,0.958831,0.044103,0.011611,0.005124,0.004101
3,0.393379,0.115328,0.865751,0.982057,0.5,7500.0,english,"{'vect__stop_words': 'english', 'vect__max_fea...",2,0.881806,0.982048,0.855246,0.984043,0.860186,0.98008,0.073303,0.00356,0.011538,0.001618
4,0.364981,0.128676,0.828977,0.951043,0.1,,,"{'vect__stop_words': None, 'vect__max_features...",5,0.843293,0.954122,0.816733,0.951463,0.826897,0.947543,0.060972,0.015227,0.010947,0.002702
5,0.304154,0.11331,0.781568,0.899203,0.1,1000.0,,"{'vect__stop_words': None, 'vect__max_features...",10,0.804781,0.895612,0.772908,0.902926,0.766977,0.89907,0.006826,0.006714,0.016602,0.002987
6,0.322188,0.110955,0.852902,0.964777,0.1,7500.0,english,"{'vect__stop_words': 'english', 'vect__max_fea...",4,0.861886,0.964761,0.851262,0.965426,0.845539,0.964143,0.04529,0.008211,0.006772,0.000524
7,0.408427,0.118642,0.858219,0.98117,0.5,7500.0,,"{'vect__stop_words': None, 'vect__max_features...",3,0.867198,0.980053,0.848606,0.983378,0.858855,0.98008,0.080976,0.005324,0.007607,0.001561
8,0.293113,0.105941,0.782898,0.974302,1.5,1000.0,english,"{'vect__stop_words': 'english', 'vect__max_fea...",9,0.796813,0.972739,0.786189,0.975399,0.765646,0.974768,0.00251,0.00695,0.012932,0.001135
9,0.325203,0.111954,0.821444,0.982499,1.5,2500.0,,"{'vect__stop_words': None, 'vect__max_features...",6,0.827357,0.982048,0.819389,0.984043,0.817577,0.981408,0.002478,0.006239,0.004249,0.001122


In [16]:
print('Best params:', random_search.best_params_)

Best params: {'vect__stop_words': 'english', 'vect__max_features': 10000, 'clf__C': 0.5}


In [17]:
# Score & evaluate test data using the best estimator

predicted = random_search.predict(twenty_test.data)

print('Test accuracy:', np.mean(predicted == twenty_test.target))

Test accuracy: 0.8202396804260985


## Aditional metrics for multiclass classification

In [18]:
from sklearn import metrics

print(metrics.classification_report(twenty_test.target, 
                                    predicted,
                                    target_names=twenty_test.target_names))

                        precision    recall  f1-score   support

           alt.atheism       0.78      0.61      0.68       319
         comp.graphics       0.82      0.93      0.87       389
               sci.med       0.87      0.86      0.87       396
soc.religion.christian       0.80      0.84      0.82       398

           avg / total       0.82      0.82      0.82      1502



In [19]:
metrics.confusion_matrix(twenty_test.target, predicted)

array([[193,  21,  29,  76],
       [ 13, 361,  14,   1],
       [ 12,  33, 342,   9],
       [ 31,  23,   8, 336]], dtype=int64)