In [2]:
import numpy as np

from sklearn import metrics
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [3]:
# Restrict the 20 Newsgroups corpus to four categories to keep the example small.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

# Load the training split; shuffling with a fixed random_state keeps the order reproducible.
train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=8888,
)

In [4]:
# List the attributes exposed by the fetched dataset object.
print(dir(train))

['DESCR', 'data', 'filenames', 'target', 'target_names']


In [7]:
# Integer class label of each training document; each value indexes into train.target_names.
train.target

array([2, 2, 1, ..., 0, 3, 0], dtype=int64)

In [12]:
# Category names; a name's position in this list is the integer label stored in train.target.
train.target_names

['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']

In [13]:
# Number of training documents (train.data holds the raw text of each post).
len(train.data)

2257

In [14]:
# Number of source filenames; expected to equal len(train.data) (one file per document).
len(train.filenames)

2257

In [17]:
# Preview the first 10 lines of the first loaded document (headers plus the start of the body).
print(*train.data[0].split("\n")[:10], sep="\n")

From: young@serum.kodak.com (Rich Young)
Subject: Re: Is MSG sensitivity superstition?
Originator: young@sasquatch
Nntp-Posting-Host: sasquatch
Reply-To: young@serum.kodak.com
Organization: Clinical Diagnostics Division, Eastman Kodak Company
Lines: 86

>>In article <1qnns0$4l3@agate.berkeley.edu> spp@zabriskie.berkeley.edu (Steve Pope) writes:
>>The mass of anectdotal evidence, combined with the lack of


In [21]:
# Map the first document's integer label back to its category name.
print(train.target_names[train.target[0]])

sci.med


In order to perform machine learning on text documents, we first need to turn the text content into numerical feature vectors.



In [23]:
# Bag of words.
# CountVectorizer handles text preprocessing and tokenizing, builds a dictionary of
# features (the vocabulary) and transforms documents into count feature vectors.
# Note: stop-word filtering is optional and is NOT applied here, because the
# vectorizer is created with its defaults (stop_words=None).

count_vect = CountVectorizer()
# fit_transform learns the vocabulary from the training text and returns the count matrix.
X_train_counts = count_vect.fit_transform(train.data)
# Shape is (number of documents, number of features).
X_train_counts.shape

(2257, 35788)

In [24]:
# Inspect the attributes and methods available on the fitted vectorizer.
print(dir(count_vect))

['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_char_ngrams', '_char_wb_ngrams', '_check_stop_words_consistency', '_check_vocabulary', '_count_vocab', '_get_param_names', '_get_tags', '_limit_features', '_more_tags', '_sort_features', '_stop_words_id', '_validate_custom_analyzer', '_validate_params', '_validate_vocabulary', '_white_spaces', '_word_ngrams', 'analyzer', 'binary', 'build_analyzer', 'build_preprocessor', 'build_tokenizer', 'decode', 'decode_error', 'dtype', 'encoding', 'fit', 'fit_transform', 'fixed_vocabulary_', 'get_feature_names', 'get_params', 'get_stop_words', 'input', 'inverse_transform', 'lowercase', 'max_df', 'max_features', 'min_df', 'ngra

In [26]:
# Inspect the bound `get` method of the vocabulary_ mapping (the lookup used in the next cell).
# NOTE(review): this lists attributes of the method object itself, not of the vocabulary;
# dir(count_vect.vocabulary_) may have been intended — confirm or drop this exploration cell.
print(dir(count_vect.vocabulary_.get))

['__call__', '__class__', '__delattr__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__name__', '__ne__', '__new__', '__qualname__', '__reduce__', '__reduce_ex__', '__repr__', '__self__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__text_signature__']


In [31]:
# Look up the feature (column) index that the fitted vocabulary assigns to the token 'dog'.
count_vect.vocabulary_.get('dog')

12055

In [39]:
# Re-weight the raw token counts with TF-IDF.
tfidf_transformer = TfidfTransformer()
# fit_transform fits the transformer on the training counts and returns the re-weighted matrix.
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
# Same shape as the count matrix: the features are re-weighted, not added or removed.
X_train_tfidf.shape

(2257, 35788)

Training a classifier

In [41]:
# Train a multinomial naive Bayes classifier on the TF-IDF features and the integer labels.
clf = MultinomialNB().fit(X_train_tfidf, train.target)

In [42]:
# Try the classifier on two hand-written documents.
docs_new = ['God is love', 'OpenGL on the GPU is fast']
# Reuse the already-fitted vectorizer and transformer: call transform (not fit_transform)
# so the new text is mapped onto the vocabulary and weights learned from the training set.
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

In [44]:
# Predict an integer label for each new document.
predicted = clf.predict(X_new_tfidf)

# Show each document next to the name of its predicted category.
for doc, category in zip(docs_new, predicted):
    label = train.target_names[category]
    print('%r => %s' % (doc, label))

'God is love' => soc.religion.christian
'OpenGL on the GPU is fast' => comp.graphics


Building a pipeline

In [46]:
# To make the vectorizer => transformer => classifier chain easier to work with,
# scikit-learn provides a Pipeline class that behaves like a single compound classifier.
# Each step is a (name, estimator) pair; the steps run in the order listed.
# Pipeline is imported with the other dependencies in the notebook's first cell.
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

In [48]:
# Fit every pipeline step on the raw training text and labels in a single call.
text_clf.fit(train.data, train.target)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

Evaluation of the performance on the test set

In [50]:
# Load the held-out test split for the same four categories.
test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=True,
    random_state=8888,
)
docs_test = test.data

# Accuracy = fraction of test documents whose predicted label equals the true label.
predicted = text_clf.predict(docs_test)
(predicted == test.target).mean()


0.8348868175765646

Linear SVM (SGDClassifier with hinge loss)

In [53]:
# Same vectorizer => TF-IDF front end as before, with the naive Bayes step replaced by
# a linear model trained with stochastic gradient descent.
# loss='hinge' with an L2 penalty is the linear SVM objective; alpha is the
# regularization strength. max_iter=5 with tol=None caps training at five passes over
# the data with the tolerance-based stopping criterion turned off, and random_state
# fixes the shuffling so the score is reproducible.
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='hinge', penalty='l2',
                          alpha=1e-3, random_state=8888,
                          max_iter=5, tol=None)),
])

text_clf.fit(train.data, train.target)

# Accuracy on the held-out test documents loaded earlier (docs_test / test.target).
predicted = text_clf.predict(docs_test)
np.mean(predicted == test.target)


0.90745672436751

More detailed metrics

In [56]:
# Per-class precision, recall and F1 for whatever `predicted` currently holds
# (the SGD pipeline's test-set predictions when the cells are run in order).
print(metrics.classification_report(test.target, predicted, target_names=test.target_names))

                        precision    recall  f1-score   support

           alt.atheism       0.95      0.79      0.87       319
         comp.graphics       0.87      0.97      0.92       389
               sci.med       0.96      0.88      0.92       396
soc.religion.christian       0.87      0.96      0.91       398

              accuracy                           0.91      1502
             macro avg       0.91      0.90      0.90      1502
          weighted avg       0.91      0.91      0.91      1502



In [57]:
# Confusion matrix of true test labels vs. predictions; with this argument order
# rows are true classes and columns are predicted classes, in test.target_names order.
metrics.confusion_matrix(test.target, predicted)

array([[253,  11,  12,  43],
       [  2, 379,   2,   6],
       [  5,  35, 350,   6],
       [  5,  10,   2, 381]], dtype=int64)

Hyperparameter tuning