# Support vector machines and machine learning on documents

In [1]:
from sklearn.datasets import load_files

# Load the training split of the 20 Newsgroups corpus from disk.
# 'latin1' is used so every byte decodes without raising.
train_dir = '12-twenty_newsgroups/20news-bydate-train/'
twenty_train = load_files(train_dir, encoding='latin1')

# Display the class labels discovered by the loader.
twenty_train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

# Extracting features from text files

## Tokenizing text

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words with default settings: learn the vocabulary from the training
# documents and build the sparse document-term count matrix in one pass.
count_vect = CountVectorizer()
x_train_counts = count_vect.fit_transform(twenty_train.data)
# (n_documents, n_distinct_tokens)
x_train_counts.shape

(11314, 130107)

In [4]:
# Column index assigned to the token 'for' in the fitted vocabulary
# (.get returns None if the token was never seen).
count_vect.vocabulary_.get('for')

56283

In [5]:
# ngram_range=(1, 5) extracts every word 1-gram up to 5-gram, so the feature
# space grows from ~130k columns to several million (see the shape below).
ngram_count_vect = CountVectorizer(ngram_range = (1, 5))
xx_train_counts = ngram_count_vect.fit_transform(twenty_train.data)
xx_train_counts.shape

(11314, 8069416)

In [6]:
# Multi-word n-grams are stored as space-joined strings; look up the column
# index of the bigram 'algorithm for'.
ngram_count_vect.vocabulary_.get('algorithm for')

627642

## TF-IDF

In [8]:
from sklearn.feature_extraction.text import TfidfTransformer

# use_idf=False disables inverse-document-frequency weighting, so the raw
# counts are only rescaled into (normalised) term frequencies.
tf_transformer = TfidfTransformer(use_idf=False)
x_train_tf = tf_transformer.fit_transform(x_train_counts)

# Same shape as the count matrix: only the values change, not the columns.
x_train_tf.shape

(11314, 130107)

In [9]:
# Default TfidfTransformer (use_idf=True): re-weight the raw counts by
# inverse document frequency, fitting and transforming in one call.
tfidf_transformer = TfidfTransformer()
x_train_tfidf = tfidf_transformer.fit_transform(x_train_counts)
x_train_tfidf.shape

(11314, 130107)

## TF-IDF Vectorizer

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TfidfVectorizer goes straight from raw text to TF-IDF features, combining
# the counting and re-weighting steps performed separately above.
tfidf_vectorizer = TfidfVectorizer()
# NOTE: this rebinds x_train_tfidf, replacing the matrix produced by
# tfidf_transformer in the previous cell.
x_train_tfidf = tfidf_vectorizer.fit_transform(twenty_train.data)
x_train_tfidf.shape

(11314, 130107)

# Building a pipeline

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

# Chain vectorisation and classification so that raw documents can be passed
# directly to fit/predict. The step names ('vect', 'clf') are what the grid
# search later uses to address each step's parameters.
pipeline_steps = [
    ('vect', TfidfVectorizer()),
    ('clf', LinearSVC()),
]
text_clf = Pipeline(pipeline_steps)

# fit() returns the fitted pipeline, which the notebook displays.
text_clf.fit(twenty_train.data, twenty_train.target)

Pipeline(steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
  ...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

## Load test data

In [12]:
import numpy as np

# Load the held-out test split with the same decoding as the training split.
test_dir = '12-twenty_newsgroups/20news-bydate-test/'
twenty_test = load_files(test_dir, encoding='latin1')
predicted = text_clf.predict(twenty_test.data)

# Accuracy: fraction of test documents whose predicted label equals the
# true label.
(predicted == twenty_test.target).mean()

0.85315985130111527

In [14]:
from sklearn import metrics

# Per-class precision / recall / F1 for the baseline pipeline's predictions.
report = metrics.classification_report(
    twenty_test.target,
    predicted,
    target_names=twenty_test.target_names,
)
print(report)

                          precision    recall  f1-score   support

             alt.atheism       0.82      0.80      0.81       319
           comp.graphics       0.76      0.80      0.78       389
 comp.os.ms-windows.misc       0.77      0.73      0.75       394
comp.sys.ibm.pc.hardware       0.71      0.76      0.74       392
   comp.sys.mac.hardware       0.84      0.86      0.85       385
          comp.windows.x       0.87      0.76      0.81       395
            misc.forsale       0.83      0.91      0.87       390
               rec.autos       0.92      0.91      0.91       396
         rec.motorcycles       0.95      0.95      0.95       398
      rec.sport.baseball       0.92      0.95      0.93       397
        rec.sport.hockey       0.96      0.98      0.97       399
               sci.crypt       0.93      0.94      0.93       396
         sci.electronics       0.81      0.79      0.80       393
                 sci.med       0.90      0.87      0.88       396
         

## Parameter tuning using grid search

In [15]:
from sklearn.model_selection import GridSearchCV

# Search space. Keys follow the '<step name>__<parameter>' convention so each
# value list is routed to the matching step of the text_clf pipeline.
parameters = dict(
    vect__ngram_range=[(1, 1), (1, 2)],
    vect__use_idf=(True, False),
    clf__C=(1.0, 0.1, 1e-2, 1e-3),
)

# n_jobs=-1 runs the candidate fits in parallel on all available cores.
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)

In [16]:
# Run the exhaustive search; fit() returns the search object itself, so no
# rebinding of gs_clf is required.
gs_clf.fit(twenty_train.data, twenty_train.target)

# Sanity check: classify a single hand-written sentence and map the predicted
# class index back to its newsgroup name.
sample_label = gs_clf.predict(['God is love'])[0]
twenty_train.target_names[sample_label]

'soc.religion.christian'

In [17]:
# Mean cross-validated score of the best parameter combination. This is
# measured on training-set folds, not on the held-out test set.
gs_clf.best_score_

0.9231041187908785

In [18]:
# Report the winning value for every searched parameter, in alphabetical order.
best_params = gs_clf.best_params_
for param_name in sorted(parameters):
    print('%s: %r' % (param_name, best_params[param_name]))

clf__C: 1.0
vect__ngram_range: (1, 2)
vect__use_idf: True


In [19]:
# Pipeline refitted on the full training set with the best parameters found.
clf = gs_clf.best_estimator_

# NOTE: rebinds `predicted`, replacing the baseline pipeline's predictions.
predicted = clf.predict(twenty_test.data)

# Test-set accuracy of the tuned model.
(predicted == twenty_test.target).mean()

0.85740839086563991

In [20]:
# Per-class precision / recall / F1 for the tuned model's predictions.
report = metrics.classification_report(
    twenty_test.target,
    predicted,
    target_names=twenty_test.target_names,
)
print(report)

                          precision    recall  f1-score   support

             alt.atheism       0.83      0.79      0.81       319
           comp.graphics       0.74      0.80      0.77       389
 comp.os.ms-windows.misc       0.77      0.77      0.77       394
comp.sys.ibm.pc.hardware       0.73      0.76      0.74       392
   comp.sys.mac.hardware       0.83      0.86      0.85       385
          comp.windows.x       0.87      0.76      0.81       395
            misc.forsale       0.84      0.91      0.87       390
               rec.autos       0.94      0.91      0.92       396
         rec.motorcycles       0.96      0.97      0.96       398
      rec.sport.baseball       0.91      0.94      0.93       397
        rec.sport.hockey       0.95      0.98      0.97       399
               sci.crypt       0.93      0.95      0.94       396
         sci.electronics       0.82      0.78      0.80       393
                 sci.med       0.90      0.86      0.88       396
         

In [21]:
# Raw per-candidate cross-validation results: a dict of arrays (fit/score
# times, mean train/test scores, ...), one entry per parameter combination.
# NOTE(review): this dumps the whole dict; a tabular view of selected columns
# would be easier to read.
gs_clf.cv_results_

{'mean_fit_time': array([ 16.53635852,  19.62261383,  96.99286644,  96.68818943,
         20.87671137,  20.36050256,  88.02994792,  81.66136734,
         20.71324174,  21.03106475,  84.9218154 ,  79.23255499,
         19.22479065,  19.10487374,  80.187289  ,  61.71674959]),
 'mean_score_time': array([  4.55368876,   5.07420142,  17.2919755 ,  16.61340459,
          6.85288127,   6.06176074,  18.33446638,  17.31005239,
          6.95669993,   6.65740387,  19.3032848 ,  17.12737862,
          6.48946404,   6.40022119,  22.01428517,  12.98279421]),
 'mean_test_score': array([ 0.91894997,  0.8902245 ,  0.92310412,  0.89985858,  0.8959696 ,
         0.82950327,  0.89853279,  0.8432915 ,  0.82791232,  0.64221319,
         0.83515998,  0.65494078,  0.70673502,  0.39208061,  0.71813682,
         0.38403748]),
 'mean_train_score': array([ 0.99929296,  0.99500614,  0.99973487,  0.99907207,  0.98258771,
         0.93167821,  0.9913825 ,  0.96199421,  0.90573668,  0.71398305,
         0.94175374, 