# sai: spooky author identification
## analysis 3: TF-IDF Vectorizer

## strategy
This will apply TF-IDF vectorizer with the following classifiers:

* Multinomial Naive Bayes 
* Logistic Regression
* Random Forest

The same practices done with CountVectorizer, will occur here as well.  Therefore, it will create about 6 experiments:

* Split, vectorize TF-IDF + MultinomialNB()
* Vectorize TF-IDF, split, MultinomialNB()
* Split, vectorize TF-IDF + LogisticRegression()
* Vectorize TF-IDF, split, LogisticRegression()
* Split, vectorize TF-IDF + RandomForestClassifier()
* Vectorize TF-IDF, split, RandomForestClassifier()

Runtime will be long.  RandomizedSearchCV will be used for all to save time.

## code
### preliminaries
This is the 'de facto' run, where it loads libraries and necessary modules to perform the analysis.  Afterwards, it will read a simple csv file into a dataframe called 'texts.'  

In [69]:
import pandas as pd
import numpy as np

# plotting
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# sklearn
from sklearn.cross_validation import train_test_split             # cross-validation
from sklearn.feature_extraction.text import TfidfVectorizer      # vectorizer
from sklearn.naive_bayes import MultinomialNB                     # classifier
from sklearn.linear_model import LogisticRegression               # classifier
from sklearn.ensemble import RandomForestClassifier               # classifier
from sklearn.ensemble import ExtraTreesClassifier                 # classifier
from sklearn.model_selection import GridSearchCV                  # parameter tuning
from sklearn.model_selection import RandomizedSearchCV            # parameter tuning
from sklearn.pipeline import Pipeline                             # pipeline
from sklearn import metrics                                       # metrics

# other modules
from stop_words import get_stop_words
from scipy.stats import randint as sp_randint
import string
from pprint import pprint

# Read training texts: texts
texts = pd.read_csv('train.csv')

### Vectorize + Naive Bayes = Experiment 1 (Split / Vectorize)

In [16]:
# Feature Selection
X = texts.text
y = texts.author

# Train Test Split
X_train, X_test, y_train, y_test = train_test_split(X,y,stratify=y)

In [17]:
# pipeline
pipeline_A1 = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('nb', MultinomialNB()),
])

# parameters
parameters_A1 = dict(
    tfidf__ngram_range = [(1,1), (1,2), (1,3)],
    tfidf__max_df = (0.5, 0.75, 1.0),
    nb__alpha = [0.05, 0.1, 1.0, 2.0]
)

In [18]:
# cv=5 academically proven as best fold, kept n_jobs (jobs running parallel)= 1, output time
rand_search_A1 = RandomizedSearchCV(pipeline_A1, 
                           parameters_A1, 
                           n_jobs=1, 
                           cv=5
                )

In [19]:
# fit model for best parameters
print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline_A1.steps])
print("parameters:")
pprint(parameters_A1, depth=2)

%time rand_search_A1.fit(X_train, y_train)

print("Best score: %0.3f" % rand_search_A1.best_score_)
print("Best parameters set:")

best_parameters = rand_search_A1.best_estimator_.get_params()

for param_name in sorted(parameters_A1.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing grid search...
pipeline: ['tfidf', 'nb']
parameters:
{'nb__alpha': [0.05, 0.1, 1.0, 2.0],
 'tfidf__max_df': (0.5, 0.75, 1.0),
 'tfidf__ngram_range': [(...), (...), (...)]}
CPU times: user 2min 21s, sys: 4.33 s, total: 2min 25s
Wall time: 2min 26s
Best score: 0.846
Best parameters set:
	nb__alpha: 0.05
	tfidf__max_df: 0.75
	tfidf__ngram_range: (1, 2)


In [22]:
# instantiate estimator
vect_A1 = TfidfVectorizer(binary=True, ngram_range=(1,2), stop_words=None, max_df=0.75)
nb_A1 = MultinomialNB(alpha=0.05)

In [23]:
# fit & transform with vectorizer
X_train_dtm = vect_A1.fit_transform(X_train)

# fit with classifer
nb_A1.fit(X_train_dtm, y_train)

# predict with classifier
y_pred_train = nb_A1.predict(X_train_dtm)

# transform with vectorizer, then predict with classifier
X_test_dtm = vect_A1.transform(X_test)
y_pred_test = nb_A1.predict(X_test_dtm)

In [24]:
print(metrics.accuracy_score(y_pred_train, y_train))

0.999182783983


In [25]:
print(metrics.accuracy_score(y_pred_test, y_test))

0.852706843718


In [26]:
print(metrics.confusion_matrix(y_pred_train, y_train))

[[5922    3    3]
 [   0 4220    0]
 [   3    3 4530]]


In [27]:
print(metrics.confusion_matrix(y_pred_test, y_test))

[[1745  184  166]
 [  79 1128   44]
 [ 151   97 1301]]


In [28]:
print(metrics.classification_report(y_pred_train, y_train))

             precision    recall  f1-score   support

        EAP       1.00      1.00      1.00      5928
        HPL       1.00      1.00      1.00      4220
        MWS       1.00      1.00      1.00      4536

avg / total       1.00      1.00      1.00     14684



In [29]:
print(metrics.classification_report(y_pred_test,y_test))

             precision    recall  f1-score   support

        EAP       0.88      0.83      0.86      2095
        HPL       0.80      0.90      0.85      1251
        MWS       0.86      0.84      0.85      1549

avg / total       0.86      0.85      0.85      4895



### Vectorize + Naive Bayes = Experiment 2 (Vectorize / Split)


In [30]:
# fit model for best parameters
print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline_A1.steps])
print("parameters:")
pprint(parameters_A1, depth=2)

%time rand_search_A1.fit(X, y)

print("Best score: %0.3f" % rand_search_A1.best_score_)
print("Best parameters set:")

best_parameters = rand_search_A1.best_estimator_.get_params()

for param_name in sorted(parameters_A1.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing grid search...
pipeline: ['tfidf', 'nb']
parameters:
{'nb__alpha': [0.05, 0.1, 1.0, 2.0],
 'tfidf__max_df': (0.5, 0.75, 1.0),
 'tfidf__ngram_range': [(...), (...), (...)]}
CPU times: user 3min 19s, sys: 5.9 s, total: 3min 25s
Wall time: 3min 25s
Best score: 0.854
Best parameters set:
	nb__alpha: 0.1
	tfidf__max_df: 1.0
	tfidf__ngram_range: (1, 2)


In [32]:
# instantiate estimator
vect_A2 = TfidfVectorizer(binary=True, ngram_range=(1,2), stop_words=None, max_df=1.0)
nb_A2 = MultinomialNB(alpha=0.1)

In [33]:
# fit with vectorizer
vect_A2.fit(X)

# transform with vectorizer
X_train_dtm = vect_A2.transform(X_train)

# fit with classifer
nb_A2.fit(X_train_dtm, y_train)

# predict with classifier
y_pred_train = nb_A2.predict(X_train_dtm)

# transform with vectorizer, then predict with classifier
X_test_dtm = vect_A2.transform(X_test)
y_pred_test = nb_A2.predict(X_test_dtm)

In [34]:
print(metrics.accuracy_score(y_pred_train, y_train))

0.999046581313


In [35]:
print(metrics.accuracy_score(y_pred_test, y_test))

0.85393258427


In [36]:
print(metrics.confusion_matrix(y_pred_train, y_train))

[[5923    5    4]
 [   0 4218    0]
 [   2    3 4529]]


In [37]:
print(metrics.confusion_matrix(y_pred_test, y_test))

[[1748  188  157]
 [  77 1121   43]
 [ 150  100 1311]]


In [38]:
print(metrics.classification_report(y_pred_train, y_train))

             precision    recall  f1-score   support

        EAP       1.00      1.00      1.00      5932
        HPL       1.00      1.00      1.00      4218
        MWS       1.00      1.00      1.00      4534

avg / total       1.00      1.00      1.00     14684



In [39]:
print(metrics.classification_report(y_pred_test,y_test))

             precision    recall  f1-score   support

        EAP       0.89      0.84      0.86      2093
        HPL       0.80      0.90      0.85      1241
        MWS       0.87      0.84      0.85      1561

avg / total       0.86      0.85      0.85      4895



### Vectorize + Logistic Regression = Experiment 1 (Split / Vectorize)

In [45]:
# pipeline
pipeline_B1 = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('log_reg', LogisticRegression()),
])

# parameters
parameters_B1 = dict(
    tfidf__ngram_range = [(1,1), (1,2), (1,3)],
    tfidf__max_df = (0.5, 0.75, 1.0),
    log_reg__C = [0.05, 0.1, 1.0, 2.0]
)

In [48]:
# cv=5 academically proven as best fold, kept n_jobs (jobs running parallel)= 1, output time
rand_search_B1 = RandomizedSearchCV(pipeline_B1, 
                           parameters_B1, 
                           n_jobs=1, 
                           cv=5
                )

In [49]:
# fit model for best parameters
print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline_B1.steps])
print("parameters:")
pprint(parameters_B1, depth=2)

%time rand_search_B1.fit(X_train, y_train)

print("Best score: %0.3f" % rand_search_B1.best_score_)
print("Best parameters set:")

best_parameters = rand_search_B1.best_estimator_.get_params()

for param_name in sorted(parameters_B1.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing grid search...
pipeline: ['tfidf', 'log_reg']
parameters:
{'log_reg__C': [0.05, 0.1, 1.0, 2.0],
 'tfidf__max_df': (0.5, 0.75, 1.0),
 'tfidf__ngram_range': [(...), (...), (...)]}
CPU times: user 4min, sys: 3.72 s, total: 4min 4s
Wall time: 2min 32s
Best score: 0.795
Best parameters set:
	log_reg__C: 2.0
	tfidf__max_df: 0.5
	tfidf__ngram_range: (1, 2)


In [50]:
# instantiate estimator
vect_B1 = TfidfVectorizer(binary=True, ngram_range=(1,2), stop_words=None, max_df=0.5)
logreg_B1 = LogisticRegression(C=2.0)

In [51]:
# fit & transform with vectorizer
X_train_dtm = vect_B1.fit_transform(X_train)

# fit with classifer
logreg_B1.fit(X_train_dtm, y_train)

# predict with classifier
y_pred_train = logreg_B1.predict(X_train_dtm)

# transform with vectorizer, then predict with classifier
X_test_dtm = vect_B1.transform(X_test)
y_pred_test = logreg_B1.predict(X_test_dtm)

In [52]:
print(metrics.accuracy_score(y_pred_train, y_train))

0.983587578317


In [53]:
print(metrics.accuracy_score(y_pred_test, y_test))

0.812257405516


In [54]:
print(metrics.confusion_matrix(y_pred_train, y_train))

[[5873   66   78]
 [  20 4142   27]
 [  32   18 4428]]


In [55]:
print(metrics.confusion_matrix(y_pred_test, y_test))

[[1673  241  190]
 [ 135 1070   88]
 [ 167   98 1233]]


In [56]:
print(metrics.classification_report(y_pred_train, y_train))

             precision    recall  f1-score   support

        EAP       0.99      0.98      0.98      6017
        HPL       0.98      0.99      0.98      4189
        MWS       0.98      0.99      0.98      4478

avg / total       0.98      0.98      0.98     14684



In [57]:
print(metrics.classification_report(y_pred_test,y_test))

             precision    recall  f1-score   support

        EAP       0.85      0.80      0.82      2104
        HPL       0.76      0.83      0.79      1293
        MWS       0.82      0.82      0.82      1498

avg / total       0.81      0.81      0.81      4895



### Vectorize + Logistic Regression = Experiment 2 (Vectorize / Split)

In [58]:
# fit model for best parameters
print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline_B1.steps])
print("parameters:")
pprint(parameters_B1, depth=2)

%time rand_search_B1.fit(X, y)

print("Best score: %0.3f" % rand_search_B1.best_score_)
print("Best parameters set:")

best_parameters = rand_search_B1.best_estimator_.get_params()

for param_name in sorted(parameters_B1.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing grid search...
pipeline: ['tfidf', 'log_reg']
parameters:
{'log_reg__C': [0.05, 0.1, 1.0, 2.0],
 'tfidf__max_df': (0.5, 0.75, 1.0),
 'tfidf__ngram_range': [(...), (...), (...)]}
CPU times: user 5min 25s, sys: 7.69 s, total: 5min 33s
Wall time: 3min 39s
Best score: 0.819
Best parameters set:
	log_reg__C: 2.0
	tfidf__max_df: 0.5
	tfidf__ngram_range: (1, 1)


In [59]:
# instantiate estimator
vect_B2 = TfidfVectorizer(binary=True, ngram_range=(1,1), stop_words=None, max_df=0.5)
logreg_B2 = LogisticRegression(C=2.0)

In [60]:
# fit with vectorizer
vect_B2.fit(X)

# transform with vectorizer
X_train_dtm = vect_B2.transform(X_train)

# fit with classifer
logreg_B2.fit(X_train_dtm, y_train)

# predict with classifier
y_pred_train = logreg_B2.predict(X_train_dtm)

# transform with vectorizer, then predict with classifier
X_test_dtm = vect_B2.transform(X_test)
y_pred_test = logreg_B2.predict(X_test_dtm)

In [61]:
print(metrics.accuracy_score(y_pred_train, y_train))

0.937278670662


In [62]:
print(metrics.accuracy_score(y_pred_test, y_test))

0.811031664964


In [63]:
print(metrics.confusion_matrix(y_pred_train, y_train))

[[5638  214  249]
 [ 106 3940   99]
 [ 181   72 4185]]


In [64]:
print(metrics.confusion_matrix(y_pred_test, y_test))

[[1699  252  233]
 [ 128 1081   88]
 [ 148   76 1190]]


In [65]:
print(metrics.classification_report(y_pred_train, y_train))

             precision    recall  f1-score   support

        EAP       0.95      0.92      0.94      6101
        HPL       0.93      0.95      0.94      4145
        MWS       0.92      0.94      0.93      4438

avg / total       0.94      0.94      0.94     14684



In [66]:
print(metrics.classification_report(y_pred_test,y_test))

             precision    recall  f1-score   support

        EAP       0.86      0.78      0.82      2184
        HPL       0.77      0.83      0.80      1297
        MWS       0.79      0.84      0.81      1414

avg / total       0.81      0.81      0.81      4895



### Vectorize + Random Forest = Experiment 1 (Split / Vectorize)

In [70]:
# tuning
# pipeline
pipeline = Pipeline([
    ('vect', TfidfVectorizer(binary=True, stop_words=None)),
    ('rf', RandomForestClassifier(oob_score=True, 
                                  random_state=1234, 
                                  warm_start=True,
                                  bootstrap=True))
])

# parameters (please note too many)
parameters = dict(
    rf__max_features = ['sqrt','log2'],
    rf__criterion = ["gini", "entropy"],
    rf__max_depth = [3, None],
    rf__min_samples_split = sp_randint(2, 11),
    rf__min_samples_leaf =  sp_randint(1, 11),
    rf__n_estimators = [10, 25, 50, 75, 100, 125, 150, 175, 200],
    vect__max_df = (0.5, 0.75, 1.0),
    vect__ngram_range = [(1,1), (1,2), (1,3)]
)

In [71]:
# cv=5 academically proven as best fold, kept n_jobs (jobs running parallel)= 1, output time
rand_search = RandomizedSearchCV(pipeline, 
                           parameters, 
                           n_jobs=1, 
                           cv=5
                )

In [72]:
# fit model for best parameters
print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
pprint(parameters, depth=2)

%time rand_search.fit(X_train, y_train)

Performing grid search...
pipeline: ['vect', 'rf']
parameters:
{'rf__criterion': ['gini', 'entropy'],
 'rf__max_depth': [3, None],
 'rf__max_features': ['sqrt', 'log2'],
 'rf__min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x1121079b0>,
 'rf__min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x114cbe9b0>,
 'rf__n_estimators': [10, 25, 50, 75, 100, 125, 150, 175, 200],
 'vect__max_df': (0.5, 0.75, 1.0),
 'vect__ngram_range': [(...), (...), (...)]}


  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])


CPU times: user 8min 52s, sys: 12 s, total: 9min 4s
Wall time: 9min 5s


RandomizedSearchCV(cv=5, error_score='raise',
          estimator=Pipeline(steps=[('vect', TfidfVectorizer(analyzer='word', binary=True, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
   ...stimators=10, n_jobs=1, oob_score=True, random_state=1234,
            verbose=0, warm_start=True))]),
          fit_params={}, iid=True, n_iter=10, n_jobs=1,
          param_distributions={'rf__max_features': ['sqrt', 'log2'], 'rf__criterion': ['gini', 'entropy'], 'rf__max_depth': [3, None], 'rf__min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x114cbe9b0>, 'rf__min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x1121079b0>, 'rf__n_estimators': [10, 25, 50, 75, 100, 125, 150, 175, 200], 'vect__max_df': (0.5, 0.75, 1.0), 'vect__ngram_range': [(1, 1), (1, 2), (2, 2)

In [74]:
print("Best score: %0.3f" % rand_search.best_score_)
print("Best parameters set:")

best_parameters = rand_search.best_estimator_.get_params()

for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Best score: 0.654
Best parameters set:
	rf__criterion: 'entropy'
	rf__max_depth: None
	rf__max_features: 'sqrt'
	rf__min_samples_leaf: 6
	rf__min_samples_split: 4
	rf__n_estimators: 175
	vect__max_df: 0.5
	vect__ngram_range: (1, 2)


In [116]:
# import and instantiate CountVectorizer (with the default parameters)
vect_C1 = TfidfVectorizer(max_df=0.5, ngram_range=(1,2))

# random forest classifier; please note this is NOT including parameters
rf_C1 = RandomForestClassifier(oob_score=True, 
                               warm_start=True,
                               max_depth=None,
                               criterion='entropy',
                               max_features='sqrt',
                               min_samples_leaf=6,
                               min_samples_split=4,
                               n_estimators=175)

In [117]:
# fit & transform with vectorizer
X_train_dtm = vect_C1.fit_transform(X_train)

# transform test set
X_test_dtm = vect_C1.transform(X_test)

In [118]:
# fit with classifer
%time rf_C1.fit(X_train_dtm,y_train)

CPU times: user 7.49 s, sys: 268 ms, total: 7.75 s
Wall time: 7.76 s


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=6,
            min_samples_split=4, min_weight_fraction_leaf=0.0,
            n_estimators=175, n_jobs=1, oob_score=True, random_state=None,
            verbose=0, warm_start=True)

In [119]:
# predict with training class
y_train_pred = rf_C1.predict(X_train_dtm)

In [120]:
# predict with testing class
y_test_pred = rf_C1.predict(X_test_dtm)

In [121]:
# accuracy score, train
metrics.accuracy_score(y_train, y_train_pred)

0.74135113048215751

In [122]:
# accuracy score, test
metrics.accuracy_score(y_test, y_test_pred)

0.66864147088866188

In [123]:
# confusion matrix, train
metrics.confusion_matrix(y_train, y_train_pred)

array([[5699,   70,  156],
       [1662, 2473,   91],
       [1671,  148, 2714]])

In [124]:
# print confusion matrix
metrics.confusion_matrix(y_test, y_test_pred)

array([[1855,   33,   87],
       [ 735,  619,   55],
       [ 653,   59,  799]])

In [125]:
# classification report, train
print(metrics.classification_report(y_train, y_train_pred))

             precision    recall  f1-score   support

        EAP       0.63      0.96      0.76      5925
        HPL       0.92      0.59      0.72      4226
        MWS       0.92      0.60      0.72      4533

avg / total       0.80      0.74      0.74     14684



In [126]:
# classification report, test
print(metrics.classification_report(y_test, y_test_pred))

             precision    recall  f1-score   support

        EAP       0.57      0.94      0.71      1975
        HPL       0.87      0.44      0.58      1409
        MWS       0.85      0.53      0.65      1511

avg / total       0.74      0.67      0.66      4895



In [127]:
# OOB Score
print(rf_C1.oob_score_)

0.679310814492


### Vectorize + Random Forest = Experiment 2 (Vectorize / Split)

In [106]:
# fit model for best parameters
print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
pprint(parameters, depth=2)

%time rand_search.fit(X, y)

Performing grid search...
pipeline: ['vect', 'rf']
parameters:
{'rf__criterion': ['gini', 'entropy'],
 'rf__max_depth': [3, None],
 'rf__max_features': ['sqrt', 'log2'],
 'rf__min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x1121079b0>,
 'rf__min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x114cbe9b0>,
 'rf__n_estimators': [10, 25, 50, 75, 100, 125, 150, 175, 200],
 'vect__max_df': (0.5, 0.75, 1.0),
 'vect__ngram_range': [(...), (...), (...)]}


  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])


CPU times: user 8min 31s, sys: 14.1 s, total: 8min 45s
Wall time: 8min 46s


RandomizedSearchCV(cv=5, error_score='raise',
          estimator=Pipeline(steps=[('vect', TfidfVectorizer(analyzer='word', binary=True, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
   ...stimators=10, n_jobs=1, oob_score=True, random_state=1234,
            verbose=0, warm_start=True))]),
          fit_params={}, iid=True, n_iter=10, n_jobs=1,
          param_distributions={'rf__max_features': ['sqrt', 'log2'], 'rf__criterion': ['gini', 'entropy'], 'rf__max_depth': [3, None], 'rf__min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x114cbe9b0>, 'rf__min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x1121079b0>, 'rf__n_estimators': [10, 25, 50, 75, 100, 125, 150, 175, 200], 'vect__max_df': (0.5, 0.75, 1.0), 'vect__ngram_range': [(1, 1), (1, 2), (2, 2)

In [107]:
print("Best score: %0.3f" % rand_search.best_score_)
print("Best parameters set:")

best_parameters = rand_search.best_estimator_.get_params()

for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Best score: 0.666
Best parameters set:
	rf__criterion: 'gini'
	rf__max_depth: None
	rf__max_features: 'sqrt'
	rf__min_samples_leaf: 1
	rf__min_samples_split: 7
	rf__n_estimators: 75
	vect__max_df: 1.0
	vect__ngram_range: (1, 2)


In [128]:
# import and instantiate CountVectorizer (with the default parameters)
vect_C2 = TfidfVectorizer(max_df=1.0, ngram_range=(1,2))

# random forest classifier; please note this is NOT including parameters
rf_C2 = RandomForestClassifier(oob_score=True, 
                               warm_start=True,
                               max_depth=None,
                               criterion='gini',
                               max_features='sqrt',
                               min_samples_leaf=1,
                               min_samples_split=7,
                               n_estimators=75)

In [129]:
# fit with vectorizer
vect_C2.fit(X)

# transform train set
X_train_dtm = vect_C2.transform(X_train)

# transform test set
X_test_dtm = vect_C2.transform(X_test)

In [134]:
# fit with training class
%time rf_C2.fit(X_train_dtm, y_train)

  warn("Warm-start fitting without increasing n_estimators does not "


CPU times: user 928 ms, sys: 78.9 ms, total: 1.01 s
Wall time: 1.01 s


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=7, min_weight_fraction_leaf=0.0,
            n_estimators=75, n_jobs=1, oob_score=True, random_state=None,
            verbose=0, warm_start=True)

In [135]:
# predict with training class
y_train_pred = rf_C2.predict(X_train_dtm)

# predict with testing class
y_test_pred = rf_C2.predict(X_test_dtm)

In [136]:
# accuracy score, train
metrics.accuracy_score(y_train, y_train_pred)

1.0

In [139]:
# accuracy score, test
metrics.accuracy_score(y_test, y_test_pred)

0.67150153217568953

In [140]:
# accuracy score, test
metrics.confusion_matrix(y_train, y_train_pred)

array([[5925,    0,    0],
       [   0, 4226,    0],
       [   0,    0, 4533]])

In [145]:
# accuracy score, test
metrics.confusion_matrix(y_test, y_test_pred)

array([[1795,   72,  108],
       [ 661,  694,   54],
       [ 607,  106,  798]])

In [143]:
# accuracy score, test
print(metrics.classification_report(y_train, y_train_pred))

             precision    recall  f1-score   support

        EAP       1.00      1.00      1.00      5925
        HPL       1.00      1.00      1.00      4226
        MWS       1.00      1.00      1.00      4533

avg / total       1.00      1.00      1.00     14684



In [144]:
# accuracy score, test
print(metrics.classification_report(y_test, y_test_pred))

             precision    recall  f1-score   support

        EAP       0.59      0.91      0.71      1975
        HPL       0.80      0.49      0.61      1409
        MWS       0.83      0.53      0.65      1511

avg / total       0.72      0.67      0.66      4895



In [146]:
# OOB Score
print(rf_C2.oob_score_)

0.66391991283
