In [34]:
from sklearn.datasets import fetch_20newsgroups
import numpy as np
import pickle

## Loading a 'Bunch' of Data

In [35]:
## Only examine 4 of the total 20 categories
import os

categories = ['alt.atheism', 'soc.religion.christian','comp.graphics', 'sci.med']

## Location of the local pickle cache of the training split.
## Defaults to a file next to the notebook so the cell runs on any machine;
## set the TWENTY_TRAIN_CACHE environment variable to point somewhere else.
## Delete the cache file if you change `categories`, otherwise stale data is loaded.
TWENTY_TRAIN_CACHE = os.environ.get("TWENTY_TRAIN_CACHE", "twentytrain.pkl")

if os.path.exists(TWENTY_TRAIN_CACHE):
    ## NOTE: pickle.load can execute arbitrary code -- only load a cache file you created yourself
    with open(TWENTY_TRAIN_CACHE, 'rb') as pfile:
        twenty_train = pickle.load(pfile)
else:
    ## No cache yet: fetch the data, then save it for the next run
    twenty_train = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=42)
    with open(TWENTY_TRAIN_CACHE, 'wb') as pfile:
        pickle.dump(twenty_train, pfile)

print("{} datasets with {} targets:\n {}".format(len(twenty_train.data),len(twenty_train.target_names),twenty_train.target_names))

2257 datasets with 4 targets:
 ['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']


In [36]:
## Handy lookups on the loaded data:
##   twenty_train.data[100]                               ==> text of one email
##   twenty_train.target[100]                             ==> its target (the integer answer)
##   twenty_train.target_names[twenty_train.target[100]]  ==> its category name
print("First 10 email Category Names")
for target_idx in twenty_train.target[:10]:
    category_name = twenty_train.target_names[target_idx]
    print("\ttarget {} ==> {}".format(target_idx, category_name))

First 10 email Category Names
	target 1 ==> comp.graphics
	target 1 ==> comp.graphics
	target 3 ==> soc.religion.christian
	target 3 ==> soc.religion.christian
	target 3 ==> soc.religion.christian
	target 3 ==> soc.religion.christian
	target 3 ==> soc.religion.christian
	target 2 ==> sci.med
	target 2 ==> sci.med
	target 2 ==> sci.med


## Extract Text Features

In [4]:
## Tokenizing: learn the vocabulary and build the document-term count matrix
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
## Shape is (number of documents, number of distinct tokens)
X_train_counts.shape
## count_vect.vocabulary_['word'] ==> column index of 'word' in X_train_counts
## X_train_counts[i, j]           ==> number of times token j occurs in document i

(2257, 35788)

### Converting counts to frequencies
* Normalize counts per document by total number of words in a document
* Downscale weights of words that occur across many documents, AKA common ones
* Called `tf-idf`: "Term Frequency times Inverse Document Frequency"

In [5]:
## Re-weight the raw counts into tf-idf features. fit_transform learns the
## weights from X_train_counts and returns the transformed matrix in one call.
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
## Same shape as the count matrix -- only the values change
X_train_tfidf.shape

(2257, 35788)

## Training a Classifier

In [6]:
## Multinomial naive Bayes: evidently a good fit for word-count style features
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
## fit() returns the estimator itself; re-assigning keeps the cell from echoing it
clf = clf.fit(X_train_tfidf, twenty_train.target)

In [7]:
## Classify two unseen snippets. They go through the already-fitted vectorizer
## and tf-idf transformer (transform only, no refitting) before prediction.
docs_new = ['God is love','OpenGL on the GPU is fast']
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)
for idx, doc in enumerate(docs_new):
    category_name = twenty_train.target_names[predicted[idx]]
    print("{} ==> {}".format(doc, category_name))

God is love ==> soc.religion.christian
OpenGL on the GPU is fast ==> comp.graphics


# Pipeline!
This puts it all together so that the

    text ==> vectorizer ==> transformer ==> classifier

workflow is easier to work with.

### Create a single pipeline

In [8]:
from sklearn.pipeline import Pipeline

## vectorizer -> tf-idf -> classifier chained into one estimator, then trained
## directly on the raw text (fit() returns the fitted pipeline itself)
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
]).fit(twenty_train.data, twenty_train.target)

### Test out new data!

In [9]:
import numpy as np

## Held-out split for the same four categories
twenty_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=True,
    random_state=42,
)
docs_test = twenty_test.data
predicted = text_clf.predict(docs_test)

## Accuracy = fraction of test documents whose predicted label matches the true one
accuracy = np.mean(twenty_test.target == predicted)
print("Text classifier accuracy on test set  is %{:3.3}".format(accuracy*100))

Text classifier accuracy on test set  is %83.5


*83%* kinda sucks. But SVMs (support vector machines) are normally really good text classifiers!
* Plug an SGD classifier (a linear SVM when trained with hinge loss) into our pipeline

In [10]:
from sklearn.linear_model import SGDClassifier

## Same vectorizer / tf-idf front end as before, with a linear SVM (hinge loss)
## trained by stochastic gradient descent as the final step.
## `n_iter` was deprecated in scikit-learn 0.19 and removed in 0.21.
## `max_iter=5, tol=None` is its replacement: exactly 5 passes over the data,
## with no early stopping.
sgd_clf = Pipeline([('vect', CountVectorizer()),
                    ('tfidf', TfidfTransformer()),
                    ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                          alpha=1e-3, max_iter=5, tol=None,
                                          random_state=42))
                    ])

## Train (a mid-cell expression is not echoed, so no dummy assignment is needed)
sgd_clf.fit(twenty_train.data, twenty_train.target)
predicted = sgd_clf.predict(docs_test)

## How did we do?
accuracy = np.mean(predicted==twenty_test.target)
print("SGD accuracy is %{:3.3}".format(accuracy*100))


SGD accuracy is %91.3


In [11]:
## Predicted label for each test document, as an integer index into the target names
predicted

array([2, 2, 2, ..., 2, 2, 1])

### Performance Metrics

In [12]:
from sklearn import metrics
## Per-category precision / recall / f1-score on the test set
print(metrics.classification_report(twenty_test.target, predicted, target_names=twenty_test.target_names))

### Confusion matrix: entry [i, j] counts the test documents whose true category
### is i and whose predicted category is j (the diagonal holds the correct predictions)
metrics.confusion_matrix(twenty_test.target, predicted)

                        precision    recall  f1-score   support

           alt.atheism       0.95      0.81      0.87       319
         comp.graphics       0.88      0.97      0.92       389
               sci.med       0.94      0.90      0.92       396
soc.religion.christian       0.90      0.95      0.93       398

           avg / total       0.92      0.91      0.91      1502



array([[258,  11,  15,  35],
       [  4, 379,   3,   3],
       [  5,  33, 355,   3],
       [  5,  10,   4, 379]])

## Parameter Tuning Using Grid Search

In [28]:
## Search a grid of candidate settings systematically instead of tuning by hand
from sklearn.model_selection import GridSearchCV

## Keys follow the "<pipeline step>__<parameter name>" convention
parameters = dict(
    vect__ngram_range=[(1,1),(1,2)],
    tfidf__use_idf=(True,False),
    clf__alpha=(1e-2,1e-3,1e-4,1e-5),
)

## n_jobs=-1 lets the program determine how many cores are available
gs_clf = GridSearchCV(estimator=sgd_clf, param_grid=parameters, n_jobs=-1)

In [29]:
## Run the search. Only the first 400 training documents are used
## (presumably to keep the search quick -- confirm).
## The fitted result behaves like a normal sklearn model; it is tried out in the next cell.
gs_clf = gs_clf.fit(twenty_train.data[:400], twenty_train.target[:400])

In [30]:
## Spot-check the tuned model on a couple of hand-written snippets
sample_docs = ['God is love',
               'God is love but graphics on GPUs run much faster on multi cores']
for sample_doc in sample_docs:
    predicted_idx = gs_clf.predict([sample_doc])[0]
    print(twenty_train.target_names[predicted_idx])

soc.religion.christian
comp.graphics


In [33]:
## Only the last expression of a cell is echoed, so a bare `gs_clf.best_score_`
## here would be evaluated and thrown away -- print it explicitly instead.
print("best score: {}".format(gs_clf.best_score_))

## Best value found for each parameter that was searched
for param_name in sorted(parameters):
    print("{}: {}".format(param_name,gs_clf.best_params_[param_name]))

## More detailed summary
# gs_clf.cv_results_

clf__alpha: 0.001
tfidf__use_idf: True
vect__ngram_range: (1, 1)
