In [1]:
import numpy as np
import pandas as pd
import sklearn.datasets
import sklearn.feature_extraction.text
import sklearn.metrics
import sklearn.model_selection
import sklearn.multiclass
import sklearn.svm

In [2]:
def get_train_test(remove=('headers',)):
    """Fetch the train and test splits of the 20 newsgroups corpus.

    Parameters
    ----------
    remove : tuple of str, optional
        Parts of each post to strip before returning it; forwarded unchanged
        to ``fetch_20newsgroups`` for both splits. Defaults to
        ``('headers',)``. Note the trailing comma: ``('headers')`` without it
        is just the string ``'headers'``, not a tuple.

    Returns
    -------
    tuple
        ``(train, test)`` as returned by ``fetch_20newsgroups`` for
        ``subset='train'`` and ``subset='test'`` respectively.
    """
    train = sklearn.datasets.fetch_20newsgroups(subset='train', remove=remove)
    test = sklearn.datasets.fetch_20newsgroups(subset='test', remove=remove)
    return train, test


# Load both splits of the corpus through the helper defined above.
train, test = get_train_test()

# Show the container type, then the fields each split exposes.
print(type(train))
for label, split in (("Train", train), ("test", test)):
    print(label, split.keys())


<class 'sklearn.utils.Bunch'>
Train dict_keys(['data', 'filenames', 'DESCR', 'target', 'target_names', 'description'])
test dict_keys(['data', 'filenames', 'DESCR', 'target', 'target_names', 'description'])


In [3]:
# One-line description of the dataset, followed by the newsgroup names.
print(train.description)
print("All categories: ",train.target_names)
# DESCR is presumably the slot for a long-form dataset description;
# in the recorded run below it printed None -- TODO confirm with a newer scikit-learn.
print(train.DESCR)

the 20 newsgroups by date dataset
All categories:  ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']
None


### Select only some categories

In [4]:
from sklearn.datasets import fetch_20newsgroups

# Candidate label subsets: four topics, two topics, or the complete list.
categories_4 = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']
categories_2 = ['alt.atheism', 'comp.graphics']
categories_all = [
    'alt.atheism',
    'comp.graphics',
    'comp.os.ms-windows.misc',
    'comp.sys.ibm.pc.hardware',
    'comp.sys.mac.hardware',
    'comp.windows.x',
    'misc.forsale',
    'rec.autos',
    'rec.motorcycles',
    'rec.sport.baseball',
    'rec.sport.hockey',
    'sci.crypt',
    'sci.electronics',
    'sci.med',
    'sci.space',
    'soc.religion.christian',
    'talk.politics.guns',
    'talk.politics.mideast',
    'talk.politics.misc',
    'talk.religion.misc',
]

# The subset actually used by the cells that follow.
categories = categories_2

# Fixed seed so the shuffled order is the same on every run.
twenty_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)
print("Categories", twenty_train['target_names'])
print(
    "The data is of type ", type(twenty_train['data']),
    " with", len(twenty_train['data']), "samples loaded",
)


Categories ['alt.atheism', 'comp.graphics']
The data is of type  <class 'list'>  with 1064 samples loaded


In [5]:
# Translate the first ten integer labels into their newsgroup names.
first_ten_names = [twenty_train.target_names[idx] for idx in twenty_train.target[:10]]
print("\n".join(first_ten_names))

alt.atheism
comp.graphics
comp.graphics
comp.graphics
alt.atheism
comp.graphics
comp.graphics
comp.graphics
comp.graphics
comp.graphics


In [6]:
# Look at one raw training document together with its label.
this_sample = 0
this_target_int = twenty_train.target[this_sample]
this_target_name = twenty_train.target_names[this_target_int]

banner = "\n\n *** Sample {}, with target category of {} ({}) ***\n"
print(banner.format(this_sample, this_target_int, this_target_name))
print("The data:", twenty_train.data[this_sample])



 *** Sample 0, with target category of 0 (alt.atheism) ***

The data: From: frank@D012S658.uucp (Frank O'Dwyer)
Subject: Re: After 2000 years, can we say that Christian Morality is
Organization: Siemens-Nixdorf AG
Lines: 28
NNTP-Posting-Host: d012s658.ap.mchp.sni.de

In article <1993Apr15.125245.12872@abo.fi> MANDTBACKA@FINABO.ABO.FI (Mats Andtbacka) writes:
|In <1qie61$fkt@horus.ap.mchp.sni.de> frank@D012S658.uucp writes:
|> In article <30114@ursa.bear.com> halat@pooh.bears (Jim Halat) writes:
|
|> #I'm one of those people who does not know what the word objective means 
|> #when put next to the word morality.  I assume its an idiom and cannot
|> #be defined by its separate terms.
|> #
|> #Give it a try.
|> 
|> Objective morality is morality built from objective values.
|
|      "And these objective values are ... ?"
|Please be specific, and more importantly, motivate.

I'll take a wild guess and say Freedom is objectively valuable.  I base
this on the assumption that if everyone in

The most intuitive way to turn text documents into numerical feature vectors is the bag-of-words representation:

1. assign a fixed integer id to each word occurring in any document of the training set (for instance by building a dictionary from words to integer indices).
2. for each document #i, count the number of occurrences of each word w and store it in X[i, j] as the value of feature #j where j is the index of word w in the dictionary

The result is a sparse matrix with all possible words as the header. For each sample document, a row is created which lists the count for each word (column). This also means that the word order is lost. 

Each document is stored as a single row of length N, where N is the number of distinct words in the vocabulary built from the training set. 

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

# Build the vocabulary and the document-term count matrix in a single pass.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)

n_docs, n_tokens = X_train_counts.shape
print("X_train_counts is a Bag of Words sparse matrix: {}".format(type(X_train_counts)))
print("Number rows (sample documents):", n_docs, "number words (tokens?):", n_tokens)
# Last expression: the (documents, vocabulary size) shape is the cell output.
X_train_counts.shape

X_train_counts is a Bag of Words sparse matrix: <class 'scipy.sparse.csr.csr_matrix'>
Number rows (sample documents): 1064 number words (tokens?): 21366


(1064, 21366)

In [8]:
# Look up the column index the vectorizer assigned to a couple of words.
print("Test query the word - index dictionary")
for word in ('algorithm', 'hi'):
    print(word, count_vect.vocabulary_.get(word))

Test query the word - index dictionary
algorithm 2853
hi 9949


In [9]:
# Print the non-zero (row, column) -> count entries of the first document,
# restricted to the first n vocabulary columns.
n=10000
print("First row, first {} entries:\n{} ".format(n, X_train_counts[0,0:n]))
print("(Note the sparseness!)")
print()

# Map a column index back to its word. get_feature_names() no longer exists in
# recent scikit-learn releases, so prefer get_feature_names_out() when the
# installed version provides it and fall back to the old name otherwise.
if hasattr(count_vect, "get_feature_names_out"):
    names = count_vect.get_feature_names_out()
else:
    names = count_vect.get_feature_names()
print(names[8696])

First row, first 10000 entries:
  (0, 5756)	1
  (0, 8116)	1
  (0, 9906)	1
  (0, 9777)	1
  (0, 8229)	1
  (0, 5707)	1
  (0, 8814)	2
  (0, 4706)	1
  (0, 6513)	1
  (0, 2445)	1
  (0, 3370)	3
  (0, 4071)	1
  (0, 3889)	1
  (0, 3408)	2
  (0, 5594)	1
  (0, 2872)	1
  (0, 2907)	2
  (0, 5892)	1
  (0, 2550)	1
  (0, 8128)	2
  (0, 6781)	1
  (0, 8131)	2
  (0, 3435)	1
  (0, 3760)	1
  (0, 8946)	3
  :	:
  (0, 5452)	1
  (0, 3815)	1
  (0, 1156)	1
  (0, 8687)	1
  (0, 779)	1
  (0, 3048)	1
  (0, 8630)	1
  (0, 8581)	2
  (0, 2410)	2
  (0, 333)	1
  (0, 324)	1
  (0, 674)	1
  (0, 3354)	2
  (0, 6535)	2
  (0, 3151)	2
  (0, 1078)	1
  (0, 2735)	1
  (0, 5135)	1
  (0, 4697)	2
  (0, 842)	1
  (0, 2730)	1
  (0, 7505)	2
  (0, 6392)	3
  (0, 8925)	4
  (0, 8991)	3 
(Note the sparseness!)

flames


## Now, use tf-idf

Problem: Longer documents may have higher counts of words than shorter ones, just due to length. Therefore, frequencies (count of word divided by total word count) are better. These are term frequencies (`tf`). 

Furthermore, we can downscale the terms that appear in most of the documents, since these terms are less useful for telling documents apart. This weighting is the inverse document frequency (`idf`).

Combining the two gives `tf-idf`.

In [10]:
from sklearn.feature_extraction.text import TfidfTransformer

# Step 1: create the transformer with IDF weighting switched off and fit it
# on the raw counts.
tf_transformer = TfidfTransformer(use_idf=False)
tf_transformer.fit(X_train_counts)

# Step 2: apply the fitted transformer to obtain the term-frequency matrix.
X_train_tf = tf_transformer.transform(X_train_counts)
X_train_tf.shape


(1064, 21366)

In [11]:
# Fit and transform in one call. The transformer is created with its default
# settings here, unlike the use_idf=False variant in the previous cell.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(1064, 21366)

### Naive Bayes

Now that we have a nice vector representation of all sample documents, let's apply a classifier. 

In [12]:
from sklearn.naive_bayes import MultinomialNB

# Train a multinomial Naive Bayes classifier on the tf-idf features, using the
# integer newsgroup labels as targets.
clf = MultinomialNB().fit(
    X_train_tfidf,
    twenty_train.target,
)

In [13]:
# A handful of one-line probe documents for trying out the classifier.
docs_short = [
    'God is love',
    'OpenGL on the GPU is fast',
    'The fish is in the kitchen',
    'Satan is my master',
    'Richard Dawkins',
    'Science and statistics',
]

# Two long probe documents. The second is essentially the first with the word
# "God" swapped for "science". This list is only used if docs_mine is pointed
# at it; the assignment just below selects docs_short instead.
docs_long =            ["""
            A preeminent scientist -- and the world's most prominent atheist -- asserts the irrationality of belief in God and the grievous harm religion has inflicted on society, from the Crusades to 9/11.

With rigor and wit, Dawkins examines God in all his forms, from the sex-obsessed tyrant of the Old Testament to the more benign (but still illogical) Celestial Watchmaker favored by some Enlightenment thinkers. He eviscerates the major arguments for religion and demonstrates the supreme improbability of a supreme being. He shows how religion fuels war, foments bigotry, and abuses children, buttressing his points with historical and contemporary evidence. The God Delusion makes a compelling case that belief in God is not just wrong but potentially deadly. It also offers exhilarating insight into the advantages of atheism to the individual and society, not the least of which is a clearer, truer appreciation of the universe's wonders than any faith could ever muster.""",

"""
A preeminent scientist -- and the world s most prominent atheist -- asserts the irrationality of belief in science and the grievous harm religion has inflicted on society, from the Crusades to 9/11.
With rigor and wit, Dawkins examines science in all his forms, from the sex-obsessed tyrant of the Old Testament to the more benign (but still illogical) Celestial Watchmaker favored by some Enlightenment thinkers. He eviscerates the major arguments for religion and demonstrates the supreme improbability of a supreme being. He shows how religion fuels war, foments bigotry, and abuses children, buttressing his points with historical and contemporary evidence. The science Delusion makes a compelling case that belief in science is not just wrong but potentially deadly. It also offers exhilarating insight into the advantages of atheism to the individual and society, not the least of which is a clearer, truer appreciation of the universe's wonders than any faith could ever muster.""",
           ]

# Choose which set of probe documents to classify.
docs_mine = docs_short

# New text goes through the already-fitted vectorizer and tf-idf transformer:
# transform() only, so the training vocabulary and weights are reused.
X_new_counts = count_vect.transform(docs_mine)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

# clf is the classifier fitted on the training data; predict() yields integer
# labels that index into target_names.
predicted = clf.predict(X_new_tfidf)
print("Data is trained on the following categories: \n{}".format(categories))
print()

row_template = "{:<30} -> {:>30}"
for doc, category in zip(docs_mine, predicted):
    print(row_template.format(doc, twenty_train.target_names[category]))

Data is trained on the following categories: 
['alt.atheism', 'comp.graphics']

God is love                    ->                    alt.atheism
OpenGL on the GPU is fast      ->                  comp.graphics
The fish is in the kitchen     ->                    alt.atheism
Satan is my master             ->                    alt.atheism
Richard Dawkins                ->                  comp.graphics
Science and statistics         ->                  comp.graphics


## Pipeline

In [14]:
from sklearn.pipeline import Pipeline

# Chain counting, tf-idf weighting and the classifier so that raw text can be
# passed straight to fit() / predict().
nb_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
]
text_clf_piped = Pipeline(nb_steps)
print(text_clf_piped)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])


In [15]:
# Fit every stage of the pipeline on the raw training texts and their labels.
# The fitted pipeline is the value of the cell and is echoed as its output.
text_clf_piped.fit(twenty_train.data, twenty_train.target)  

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

## Testing

In [16]:
# First, get the test data: same categories and shuffle seed as the training
# split, but taken from the held-out 'test' subset.
twenty_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=True,
    random_state=42,
)
docs_test = twenty_test.data

In [17]:
# Classify the held-out documents and report the fraction labelled correctly.
predicted = text_clf_piped.predict(docs_test)
is_correct = predicted == twenty_test.target
acc = np.mean(is_correct)
print("Accuracy: {:0.1%}".format(acc))

Accuracy: 97.0%


In [18]:
from sklearn.linear_model import SGDClassifier

# Same vectorizer + tf-idf front end, but with an SGD-trained classifier
# (hinge loss, L2 penalty) as the final step. The seed is fixed for
# repeatability.
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(
        loss='hinge',
        penalty='l2',
        alpha=1e-3,
        random_state=42,
        max_iter=5,
        tol=None,
    )),
])
text_clf.fit(twenty_train.data, twenty_train.target)

# Score on the held-out documents.
predicted = text_clf.predict(docs_test)
is_correct = predicted == twenty_test.target
acc = np.mean(is_correct)
print("Accuracy: {:0.1%}".format(acc))

Accuracy: 97.6%


In [19]:
from sklearn import metrics

# Per-class precision / recall / f1 for the most recent predictions.
report = metrics.classification_report(
    twenty_test.target,
    predicted,
    target_names=twenty_test.target_names,
)
print(report)

               precision    recall  f1-score   support

  alt.atheism       0.99      0.96      0.97       319
comp.graphics       0.97      0.99      0.98       389

  avg / total       0.98      0.98      0.98       708



In [20]:
# Confusion matrix of true labels (first argument) against predicted labels.
this_CM = metrics.confusion_matrix(
    twenty_test.target,
    predicted,
)
print(this_CM)

[[306  13]
 [  4 385]]


In [21]:
# Label both axes with the dataset's own class names. target_names is the list
# the integer labels index into (see its use for the classification report),
# whereas the hand-written `categories` list only lines up with the matrix when
# it happens to be in the same order -- categories_4, for example, is not.
class_names = twenty_test.target_names
pd.DataFrame(this_CM, columns=class_names, index=class_names)

Unnamed: 0,alt.atheism,comp.graphics
alt.atheism,306,13
comp.graphics,4,385


## Now with SGD

In [22]:
from sklearn.pipeline import Pipeline

# SGD-trained classifier (hinge loss, L2 penalty, fixed seed) as the last step
# of a count -> tf-idf -> classifier pipeline.
sgd_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(
        loss='hinge',
        penalty='l2',
        alpha=1e-3,
        random_state=42,
        max_iter=5,
        tol=None,
    )),
]
text_clf_SGD_piped = Pipeline(sgd_steps)

print(text_clf_SGD_piped)

# Train on the raw training texts, then score on the held-out documents.
text_clf_SGD_piped.fit(twenty_train.data, twenty_train.target)
predicted = text_clf_SGD_piped.predict(docs_test)
is_correct = predicted == twenty_test.target
acc = np.mean(is_correct)
print("Accuracy: {:0.1%}".format(acc))

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...ty='l2', power_t=0.5, random_state=42, shuffle=True,
       tol=None, verbose=0, warm_start=False))])
Accuracy: 97.6%


## SVC

In [60]:
from sklearn.pipeline import Pipeline

# Every SVC argument is spelled out so the full configuration is visible in
# one place.
svc_settings = dict(
    C=1.0,
    kernel='rbf',
    degree=3,
    # Kernel coefficient for 'rbf', 'poly' and 'sigmoid'. With 'auto',
    # 1/n_features would be used instead of this explicit value.
    gamma=1,
    coef0=0.0,
    shrinking=True,
    # Enables probability estimates. Must be set before calling fit, and
    # slows fit down.
    probability=True,
    tol=0.001,
    cache_size=200,
    class_weight=None,
    verbose=False,
    max_iter=-1,
    decision_function_shape='ovr',
    random_state=None,
)
my_SVC = sklearn.svm.SVC(**svc_settings)

text_clf_SVC_piped = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', my_SVC),
])

print(text_clf_SVC_piped)
# List every parameter name the pipeline exposes; the `step__param` form is
# the one used as keys of the grid-search parameter dictionaries.
for param in text_clf_SVC_piped.get_params():
    print(param)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])
tfidf__smooth_idf
clf__degree
clf__decision_function_shape
vect__dtype
clf__kernel
vect__decode_error
clf__class_weight
vect__max_df
vect
clf__shrinking
clf__verbose
clf
vect__input
clf__cache_size
vect__lowercase
clf__max_iter
clf__tol
clf__gamma
tfidf__sublinear_tf
tfidf__norm
vect__encoding
clf__random_state
vect__max_features
tfidf
vect__binary
vect__stop_words
tfidf__use_idf
vect__token_pattern
memory
vect__strip_accents
steps
vect__preprocessor
clf__probability
clf__coef0
vect__tokenizer
vect__analyzer
vect__vocabulary
clf__C
vect__ngram_range
vect__min_df

In [78]:
%%time
# Train the whole SVC pipeline on the raw training texts; the cell is timed by
# the %%time magic on its first line.
text_clf_SVC_piped.fit(twenty_train.data, twenty_train.target)  

Wall time: 6.4 s


Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [64]:
# Hard class predictions on the held-out documents and the resulting accuracy.
predicted = text_clf_SVC_piped.predict(docs_test)
acc = np.mean(predicted == twenty_test.target)
print("Accuracy: {:0.1%}".format(acc))

# Per-class probability estimates (the SVC was constructed with
# probability=True). acc is unchanged from above, so it is not recomputed.
prob = text_clf_SVC_piped.predict_proba(docs_test)

print(prob,acc)

Accuracy: 95.8%
[[  5.66441125e-01   4.33558875e-01]
 [  1.76658148e-02   9.82334185e-01]
 [  9.99992782e-01   7.21810630e-06]
 ..., 
 [  2.17007882e-06   9.99997830e-01]
 [  5.81867556e-06   9.99994181e-01]
 [  2.85675547e-08   9.99999971e-01]] 0.957627118644


In [67]:
# Compare the class probabilities for a real name with those for a nonsense
# string. In the recorded output below the two are nearly identical.
for probe_text in ('Richard Dawkins', 'asdfasdf asdfasdf asddfsa'):
    prob = text_clf_SVC_piped.predict_proba([probe_text])
    print(prob)


[[ 0.00947431  0.99052569]]
[[ 0.00846231  0.99153769]]


In [75]:
# Class probabilities for a short probe phrase.
prob = text_clf_SVC_piped.predict_proba(
    ['God atheist']
)
# Pull the fitted SVC out of the pipeline by step name and read its intercept.
fitted_svc = text_clf_SVC_piped.named_steps['clf']
my_int = fitted_svc.intercept_

Get the model ...

In [77]:
# This is the intercept of the model, read from the fitted SVC stored under
# the 'clf' step of the pipeline. It is displayed as the cell output.
text_clf_SVC_piped.named_steps['clf'].intercept_

array([ 0.65781663])

## Grid search

A search consists of:

1. an estimator (regressor or classifier such as sklearn.svm.SVC());
1. a parameter space;
1. a method for searching or sampling candidates;
1. a cross-validation scheme; and
1. a score function.

In [41]:
# Narrow search space: four values for the SVC's gamma, addressed through the
# pipeline step name ('clf') using the step__parameter convention.
param_grid = [
    {
        'clf__gamma': [0.1, 0.01, 0.001, 0.0001],
    },
]

In [53]:
# Wider sweep over gamma: eight values, one per decade from 1000 down to 0.0001.
params_list = {
    'clf__gamma': [1000, 100, 10, 1, 0.1, 0.01, 0.001, 0.0001],
}

In [54]:
# Select the grid to search: the eight-value gamma sweep in params_list
# rather than the four-value param_grid.
parameters = params_list

In [79]:
%%time
# Exhaustive search over `parameters` for the SVC pipeline, using all
# available cores (n_jobs=-1) and printing progress (verbose=1).
grid_search = sklearn.model_selection.GridSearchCV(
    text_clf_SVC_piped,
    parameters,
    n_jobs=-1,
    verbose=1,
)

# Run the search on the raw training texts. The trailing semicolon is meant to
# keep the fitted object's repr out of the cell output.
grid_search.fit(twenty_train.data, twenty_train.target);

Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:   25.9s finished


Wall time: 33.3 s


In [56]:
# Score of the best parameter combination found by the grid search.
grid_search.best_score_

0.99248120300751874

In [57]:
# Look up the winning value of each searched parameter on the best estimator.
best_parameters = grid_search.best_estimator_.get_params()

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
for param_name in sorted(parameters):
    chosen_value = best_parameters[param_name]
    print("\t%s: %r" % (param_name, chosen_value))

Best score: 0.992
Best parameters set:
	clf__gamma: 1


# NOTE(review): repeats the best-score lookup from an earlier cell; looks like
# a leftover scratch cell -- confirm whether it is still needed.
grid_search.best_score_