In [1]:
# newsgroups to load: a four-class subset of the 20 newsgroups corpus
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

In [2]:
from sklearn.datasets import fetch_20newsgroups

# training split restricted to the selected categories; shuffled with a fixed
# seed so the document order is reproducible between runs
twenty_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)



Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [4]:
# number of training documents loaded for the selected categories
len(twenty_train.data)

2257

In [13]:
# show only the first three lines (the header) of the first training post
print(*twenty_train.data[0].split("\n")[:3], sep="\n")

From: sd345@city.ac.uk (Michael Collier)
Subject: Converting images to HP LaserJet III?
Nntp-Posting-Host: hampton


In [19]:
# twenty_train.target (not twenty_train itself) is a numpy array of integer
# category ids for the training documents
twenty_train.target

array([1, 1, 3, ..., 2, 2, 2])

In [20]:
# category names; each integer in twenty_train.target indexes into this list
twenty_train.target_names

['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']

In [21]:
# translate the first ten integer labels into their category names
first_ten_labels = twenty_train.target[:10]
for label_id in first_ten_labels:
    print(twenty_train.target_names[label_id])

comp.graphics
comp.graphics
soc.religion.christian
soc.religion.christian
soc.religion.christian
soc.religion.christian
soc.religion.christian
sci.med
sci.med
sci.med


In [22]:
# CountVectorizer tokenizes each document and counts token occurrences.
# NOTE: with the default arguments used below, no stop-word filtering is
# applied (stop_words defaults to None).
from sklearn.feature_extraction.text import CountVectorizer

# instantiate the vectorizer with its default settings
count_vect = CountVectorizer()

# learn the vocabulary from the training documents and build the
# document-term count matrix in a single call
X_train_counts = count_vect.fit_transform(twenty_train.data)

# shape is (number of documents, number of vocabulary terms)
X_train_counts.shape # (2257, 35788)


(2257, 35788)

In [23]:
from sklearn.feature_extraction.text import TfidfTransformer

# fit the transformer on the training counts; use_idf=False turns off the
# inverse-document-frequency weighting, so this yields term frequencies ("tf")
# rather than full tf-idf values
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)

# convert the raw count matrix into its term-frequency representation
X_train_tf = tf_transformer.transform(X_train_counts)

# same shape as the count matrix
X_train_tf.shape # (2257, 35788)


(2257, 35788)

In [24]:
# fit and transform in one call with fit_transform, instead of the separate
# fit(...) / transform(...) calls used for tf_transformer.
# NOTE: this transformer keeps the default use_idf=True, so unlike X_train_tf
# the result is a full tf-idf matrix, not just term frequencies.

tfidf_transformer = TfidfTransformer()

X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

X_train_tfidf.shape # (2257, 35788)


(2257, 35788)

# Training our Classifier with Naive Bayes (NB)

In [25]:
# Multinomial Naive Bayes serves as a quick baseline model
from sklearn.naive_bayes import MultinomialNB

# features: the tf-idf matrix; labels: the integer category ids in twenty_train.target
clf = MultinomialNB().fit(X=X_train_tfidf, y=twenty_train.target)

To try to predict the outcome on a new document we need to extract the features using almost the same feature extracting chain as before. **The difference is that we call transform instead of fit_transform on the transformers, since they have already been fit to the training set:**

In [26]:
# two unseen example documents to classify
docs_new = ['God is love', 'OpenGL on the GPU is fast']

# reuse the already-fitted vectorizer: transform only (no refit), so the
# vocabulary learned from the training data is kept
X_new_counts = count_vect.transform(docs_new)

# apply the already-fitted tf-idf transformer in the same way
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

# predicted integer category ids, one per entry in docs_new
predicted = clf.predict(X_new_tfidf)

In [27]:
# pair every new document with the name of its predicted category
for text, label_id in zip(docs_new, predicted):
    label_name = twenty_train.target_names[label_id]
    print('%r => %s' % (text, label_name))

'God is love' => soc.religion.christian
'OpenGL on the GPU is fast' => comp.graphics


## Building a pipeline for NB model


Building a pipeline so that the vectorizer => transformer => classifier chain is easier to work with

In [28]:
from sklearn.pipeline import Pipeline

# chain the three stages; the step names ('vect', 'tfidf', 'clf') are the
# prefixes used to address each stage's parameters, e.g. in a grid search
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

In [29]:
# train the whole pipeline (vectorizer, tf-idf transformer, classifier) with a
# single fit call on the raw training documents and their labels

text_clf.fit(twenty_train.data, twenty_train.target)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

## Evaluation of Performance of NB Model

In [31]:
import numpy as np

# load the held-out test split for the same categories
twenty_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=True,
    random_state=42,
)

# raw test documents
docs_test = twenty_test.data

# run the fitted pipeline on the unseen documents
predicted = text_clf.predict(docs_test)

# accuracy: the fraction of predictions that equal the true label
np.mean(np.equal(predicted, twenty_test.target))

0.8348868175765646

# Seeing if we can train a better classifier with an SVM

In [32]:
# swap the final pipeline step for an SGD-trained linear classifier with hinge
# loss, refit on the training data, and score on the same test documents.
# NOTE: this rebinds text_clf and predicted, replacing the Naive Bayes versions.

from sklearn.linear_model import SGDClassifier

text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(
        loss='hinge',
        penalty='l2',
        alpha=1e-3,
        random_state=42,
        max_iter=5,
        tol=None,
    )),
])

text_clf.fit(twenty_train.data, twenty_train.target)

predicted = text_clf.predict(docs_test)

# accuracy: the fraction of predictions that equal the true label
np.mean(np.equal(predicted, twenty_test.target))




0.9127829560585885

In [34]:
# per-class precision / recall / F1 for the most recent predictions

from sklearn import metrics

print(
    metrics.classification_report(
        y_true=twenty_test.target,
        y_pred=predicted,
        target_names=twenty_test.target_names,
    )
)

                        precision    recall  f1-score   support

           alt.atheism       0.95      0.81      0.87       319
         comp.graphics       0.88      0.97      0.92       389
               sci.med       0.94      0.90      0.92       396
soc.religion.christian       0.90      0.95      0.93       398

             micro avg       0.91      0.91      0.91      1502
             macro avg       0.92      0.91      0.91      1502
          weighted avg       0.92      0.91      0.91      1502



# SVM Was The Better Classifier, Now Let's Tune Our Parameters Using GridSearch

In [35]:
# creating our grid

from sklearn.model_selection import GridSearchCV

# search grid: each key is '<pipeline step name>__<parameter name>' and each
# value lists the candidate settings to try for that parameter
parameters = dict(
    vect__ngram_range=[(1, 1), (1, 2)],
    tfidf__use_idf=(True, False),
    clf__alpha=(1e-2, 1e-3),
)

In [36]:
# build the grid-search object (nothing is trained until .fit is called):
# 5-fold cross-validation over every combination in `parameters`, using all
# available CPU cores (n_jobs=-1).
# NOTE: the `iid` argument was deprecated in scikit-learn 0.22 and removed in
# 0.24, where passing it raises a TypeError, so it is no longer passed here.
# On those versions the score is the plain mean across folds, which is what
# iid=False used to request.

gs_clf = GridSearchCV(text_clf, parameters, cv=5, n_jobs=-1)

In [38]:
# fit the grid search on only the first 400 training documents to keep the
# computation short; the resulting scores and best parameters therefore
# reflect this subset rather than the full training set

gs_clf = gs_clf.fit(twenty_train.data[:400], twenty_train.target[:400])



In [39]:
# After fitting, the GridSearchCV object can be used directly to predict.
# predict is given a one-element list, so take element 0 of the result and
# look up the corresponding category name.

twenty_train.target_names[gs_clf.predict(['God is love'])[0]]


'soc.religion.christian'

In [40]:
# mean cross-validated score of the best parameter combination found
# (presumably accuracy, since no `scoring` argument was given — confirm)

gs_clf.best_score_ 

0.9151349867929058

In [41]:
# report the winning value for every parameter in the grid, in alphabetical order

for name in sorted(parameters):
    best_value = gs_clf.best_params_[name]
    print("%s: %r" % (name, best_value))

clf__alpha: 0.001
tfidf__use_idf: True
vect__ngram_range: (1, 2)
