# sai: spooky author identification
## analysis 3: TF-IDF Vectorizer

## strategy
This will apply TF-IDF vectorizer with the following classifiers:

* Multinomial Naive Bayes 
* Logistic Regression
* Random Forest

The same practices done with CountVectorizer, will occur here as well.  Therefore, it will create about 6 experiments:

* Split, vectorize TF-IDF + MultinomialNB()
* Vectorize TF-IDF, split, MultinomialNB()
* Split, vectorize TF-IDF + LogisticRegression()
* Vectorize TF-IDF, split, LogisticRegression()
* Split, vectorize TF-IDF + RandomForestClassifier()
* Vectorize TF-IDF, split, RandomForestClassifier()

Runtime will be long.  RandomizedSearchCV will be used for all to save time.

## code
### preliminaries
This is the 'de facto' run, where it loads libraries and necessary modules to perform the analysis.  Afterwards, it will read a simple csv file into a dataframe called 'texts.'  

In [1]:
import pandas as pd
import numpy as np

# plotting
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# sklearn
from sklearn.cross_validation import train_test_split             # cross-validation
from sklearn.feature_extraction.text import TfidfVectorizer      # vectorizer
from sklearn.naive_bayes import MultinomialNB                     # classifier
from sklearn.linear_model import LogisticRegression               # classifier
from sklearn.ensemble import RandomForestClassifier               # classifier
from sklearn.ensemble import ExtraTreesClassifier                 # classifier
from sklearn.model_selection import GridSearchCV                  # parameter tuning
from sklearn.model_selection import RandomizedSearchCV            # parameter tuning
from sklearn.pipeline import Pipeline                             # pipeline
from sklearn import metrics                                       # metrics

# other modules
from stop_words import get_stop_words
from scipy.stats import randint as sp_randint
import string
from pprint import pprint

# Read training texts: texts
texts = pd.read_csv('train.csv')



### Vectorize + Naive Bayes = Experiment 1 (Split / Vectorize)

In [2]:
# Feature Selection
X = texts.text
y = texts.author

# Train Test Split
X_train, X_test, y_train, y_test = train_test_split(X,y,stratify=y)

In [3]:
# pipeline
pipeline_A1 = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('nb', MultinomialNB()),
])

# parameters
parameters_A1 = dict(
    tfidf__ngram_range = [(1,1), (1,2), (1,3)],
    tfidf__max_df = (0.5, 0.75, 1.0),
    nb__alpha = [0.05, 0.1, 1.0, 2.0]
)

In [4]:
# cv=5 academically proven as best fold, kept n_jobs (jobs running parallel)= 1, output time
rand_search_A1 = RandomizedSearchCV(pipeline_A1, 
                           parameters_A1, 
                           n_jobs=1, 
                           cv=5
                )

In [5]:
# fit model for best parameters
print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline_A1.steps])
print("parameters:")
pprint(parameters_A1, depth=2)

%time rand_search_A1.fit(X_train, y_train)

print("Best score: %0.3f" % rand_search_A1.best_score_)
print("Best parameters set:")

best_parameters = rand_search_A1.best_estimator_.get_params()

for param_name in sorted(parameters_A1.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing grid search...
pipeline: ['tfidf', 'nb']
parameters:
{'nb__alpha': [0.05, 0.1, 1.0, 2.0],
 'tfidf__max_df': (0.5, 0.75, 1.0),
 'tfidf__ngram_range': [(...), (...), (...)]}
CPU times: user 2min 42s, sys: 5.28 s, total: 2min 47s
Wall time: 2min 48s
Best score: 0.845
Best parameters set:
	nb__alpha: 0.05
	tfidf__max_df: 0.5
	tfidf__ngram_range: (1, 2)


In [22]:
# instantiate estimator
stop_word = get_stop_words('english')
vect_A1 = TfidfVectorizer(binary=True, ngram_range=(1,2), stop_words=stop_word, max_df=0.75)
nb_A1 = MultinomialNB(alpha=0.05)

In [23]:
# fit & transform with vectorizer
X_train_dtm = vect_A1.fit_transform(X_train)

# fit with classifer
nb_A1.fit(X_train_dtm, y_train)

# predict with classifier
y_pred_train = nb_A1.predict(X_train_dtm)

# transform with vectorizer, then predict with classifier
X_test_dtm = vect_A1.transform(X_test)
y_pred_test = nb_A1.predict(X_test_dtm)

In [24]:
print(metrics.accuracy_score(y_pred_train, y_train))

0.998637973304


In [25]:
print(metrics.accuracy_score(y_pred_test, y_test))

0.834729315628


In [26]:
print(metrics.confusion_matrix(y_pred_train, y_train))

[[5922    9    5]
 [   2 4217    3]
 [   1    0 4525]]


In [27]:
print(metrics.confusion_matrix(y_pred_test, y_test))

[[1679  192  183]
 [ 128 1136   57]
 [ 168   81 1271]]


In [28]:
print(metrics.classification_report(y_pred_train, y_train))

             precision    recall  f1-score   support

        EAP       1.00      1.00      1.00      5936
        HPL       1.00      1.00      1.00      4222
        MWS       1.00      1.00      1.00      4526

avg / total       1.00      1.00      1.00     14684



In [29]:
print(metrics.classification_report(y_pred_test,y_test))

             precision    recall  f1-score   support

        EAP       0.85      0.82      0.83      2054
        HPL       0.81      0.86      0.83      1321
        MWS       0.84      0.84      0.84      1520

avg / total       0.84      0.83      0.83      4895



In [30]:
# store vocabulary of X_train
X_train_tokens = vect_A1.get_feature_names()

# rows represent classes
print(len(X_train_tokens), nb_A1.feature_count_.shape)

181713 (3, 181713)


In [31]:
# number of times each token appears across all EAP messages
EAP_token_count = nb_A1.feature_count_[0,:]

# number of times each token appears across all HPL messages
HPL_token_count = nb_A1.feature_count_[1,:]

# number of times each token appears across all MWS messages
MWS_token_count = nb_A1.feature_count_[2,:]

In [32]:
# DataFrame of tokens with their separate ham and spam counts
tokens = pd.DataFrame({'token':X_train_tokens, 'EAP':EAP_token_count, 'HPL':HPL_token_count, 'MWS':MWS_token_count}).set_index('token')
tokens.sample(5, random_state=6)

Unnamed: 0_level_0,EAP,HPL,MWS
token,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
entirely covered,0.184236,0.0,0.0
rummer,0.244745,0.0,0.0
pelf centuried,0.0,0.23297,0.0
felix brother,0.0,0.0,0.306676
retain,0.557495,0.449432,0.0


In [33]:
# avoid dividing by 0 (class imbalance)
tokens['EAP'] = tokens.EAP + 1
tokens['HPL'] = tokens.HPL + 1
tokens['MWS'] = tokens.MWS + 1

In [34]:
# convert into frequencies 
tokens['EAP'] = tokens.EAP / nb_A1.class_count_[0]
tokens['HPL'] = tokens.HPL / nb_A1.class_count_[1]
tokens['MWS'] = tokens.MWS / nb_A1.class_count_[2]

In [35]:
# examine DataFrame by author
for name in tokens.columns:
    print(tokens[name].nlargest(10),'\n')

token
upon       0.009923
one        0.006662
now        0.005922
said       0.005776
will       0.005269
say        0.004377
however    0.004355
little     0.003919
well       0.003866
made       0.003659
Name: EAP, dtype: float64 

token
one       0.007390
old       0.006728
man       0.005220
now       0.005176
night     0.005148
seemed    0.005108
though    0.004830
like      0.004737
saw       0.004720
came      0.004646
Name: HPL, dtype: float64 

token
will       0.006415
raymond    0.006166
one        0.005755
life       0.005631
now        0.005615
love       0.005389
yet        0.005303
heart      0.004890
perdita    0.004564
us         0.004437
Name: MWS, dtype: float64 



### Vectorize + Naive Bayes = Experiment 2 (Vectorize / Split)


In [30]:
# fit model for best parameters
print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline_A1.steps])
print("parameters:")
pprint(parameters_A1, depth=2)

%time rand_search_A1.fit(X, y)

print("Best score: %0.3f" % rand_search_A1.best_score_)
print("Best parameters set:")

best_parameters = rand_search_A1.best_estimator_.get_params()

for param_name in sorted(parameters_A1.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing grid search...
pipeline: ['tfidf', 'nb']
parameters:
{'nb__alpha': [0.05, 0.1, 1.0, 2.0],
 'tfidf__max_df': (0.5, 0.75, 1.0),
 'tfidf__ngram_range': [(...), (...), (...)]}
CPU times: user 3min 19s, sys: 5.9 s, total: 3min 25s
Wall time: 3min 25s
Best score: 0.854
Best parameters set:
	nb__alpha: 0.1
	tfidf__max_df: 1.0
	tfidf__ngram_range: (1, 2)


In [36]:
# instantiate estimator
vect_A2 = TfidfVectorizer(binary=True, ngram_range=(1,2), stop_words=stop_word, max_df=1.0)
nb_A2 = MultinomialNB(alpha=0.1)

In [37]:
# fit with vectorizer
vect_A2.fit(X)

# transform with vectorizer
X_train_dtm = vect_A2.transform(X_train)

# fit with classifer
nb_A2.fit(X_train_dtm, y_train)

# predict with classifier
y_pred_train = nb_A2.predict(X_train_dtm)

# transform with vectorizer, then predict with classifier
X_test_dtm = vect_A2.transform(X_test)
y_pred_test = nb_A2.predict(X_test_dtm)

In [38]:
print(metrics.accuracy_score(y_pred_train, y_train))

0.998501770635


In [39]:
print(metrics.accuracy_score(y_pred_test, y_test))

0.841675178754


In [40]:
print(metrics.confusion_matrix(y_pred_train, y_train))

[[5922    9    6]
 [   2 4216    3]
 [   1    1 4524]]


In [41]:
print(metrics.confusion_matrix(y_pred_test, y_test))

[[1689  185  165]
 [ 113 1136   51]
 [ 173   88 1295]]


In [42]:
print(metrics.classification_report(y_pred_train, y_train))

             precision    recall  f1-score   support

        EAP       1.00      1.00      1.00      5937
        HPL       1.00      1.00      1.00      4221
        MWS       1.00      1.00      1.00      4526

avg / total       1.00      1.00      1.00     14684



In [43]:
print(metrics.classification_report(y_pred_test,y_test))

             precision    recall  f1-score   support

        EAP       0.86      0.83      0.84      2039
        HPL       0.81      0.87      0.84      1300
        MWS       0.86      0.83      0.84      1556

avg / total       0.84      0.84      0.84      4895



In [46]:
# store vocabulary of X_train
X_train_tokens = vect_A2.get_feature_names()

# rows represent classes
print(len(X_train_tokens), nb_A2.feature_count_.shape)

234396 (3, 234396)


In [47]:
# number of times each token appears across all EAP messages
EAP_token_count = nb_A2.feature_count_[0,:]

# number of times each token appears across all HPL messages
HPL_token_count = nb_A2.feature_count_[1,:]

# number of times each token appears across all MWS messages
MWS_token_count = nb_A2.feature_count_[2,:]

In [48]:
# DataFrame of tokens with their separate ham and spam counts
tokens = pd.DataFrame({'token':X_train_tokens, 'EAP':EAP_token_count, 'HPL':HPL_token_count, 'MWS':MWS_token_count}).set_index('token')

In [49]:
# avoid dividing by 0 (class imbalance)
tokens['EAP'] = tokens.EAP + 1
tokens['HPL'] = tokens.HPL + 1
tokens['MWS'] = tokens.MWS + 1

In [50]:
# convert into frequencies 
tokens['EAP'] = tokens.EAP / nb_A1.class_count_[0]
tokens['HPL'] = tokens.HPL / nb_A1.class_count_[1]
tokens['MWS'] = tokens.MWS / nb_A1.class_count_[2]

In [51]:
# examine DataFrame by author
for name in tokens.columns:
    print(tokens[name].nlargest(10),'\n')

token
upon       0.009757
one        0.006578
now        0.005828
said       0.005638
will       0.005171
say        0.004312
however    0.004254
little     0.003846
well       0.003798
found      0.003588
Name: EAP, dtype: float64 

token
one       0.007299
old       0.006582
man       0.005133
now       0.005093
night     0.005036
seemed    0.004988
though    0.004724
saw       0.004658
like      0.004654
came      0.004572
Name: HPL, dtype: float64 

token
will       0.006296
raymond    0.006035
one        0.005683
life       0.005552
now        0.005526
love       0.005292
yet        0.005195
heart      0.004812
perdita    0.004476
us         0.004365
Name: MWS, dtype: float64 



### Vectorize + Logistic Regression = Experiment 1 (Split / Vectorize)

In [52]:
# pipeline
pipeline_B1 = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('log_reg', LogisticRegression()),
])

# parameters
parameters_B1 = dict(
    tfidf__ngram_range = [(1,1), (1,2), (1,3)],
    tfidf__max_df = (0.5, 0.75, 1.0),
    log_reg__C = [0.05, 0.1, 1.0, 2.0]
)

In [53]:
# cv=5 academically proven as best fold, kept n_jobs (jobs running parallel)= 1, output time
rand_search_B1 = RandomizedSearchCV(pipeline_B1, 
                           parameters_B1, 
                           n_jobs=1, 
                           cv=5
                )

In [54]:
# fit model for best parameters
print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline_B1.steps])
print("parameters:")
pprint(parameters_B1, depth=2)

%time rand_search_B1.fit(X, y)

print("Best score: %0.3f" % rand_search_B1.best_score_)
print("Best parameters set:")

best_parameters = rand_search_B1.best_estimator_.get_params()

for param_name in sorted(parameters_B1.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing grid search...
pipeline: ['tfidf', 'log_reg']
parameters:
{'log_reg__C': [0.05, 0.1, 1.0, 2.0],
 'tfidf__max_df': (0.5, 0.75, 1.0),
 'tfidf__ngram_range': [(...), (...), (...)]}
CPU times: user 3min 58s, sys: 32 s, total: 4min 30s
Wall time: 2min 56s
Best score: 0.806
Best parameters set:
	log_reg__C: 2.0
	tfidf__max_df: 1.0
	tfidf__ngram_range: (1, 1)


In [71]:
# instantiate estimator
vect_B1 = TfidfVectorizer(binary=True, ngram_range=(1,1), stop_words=stop_word, max_df=1.0)
logreg_B1 = LogisticRegression(C=2.0)

In [72]:
# fit & transform with vectorizer
X_train_dtm = vect_B1.fit_transform(X_train)

# fit with classifer
logreg_B1.fit(X_train_dtm, y_train)

# predict with classifier
y_pred_train = logreg_B1.predict(X_train_dtm)

# transform with vectorizer, then predict with classifier
X_test_dtm = vect_B1.transform(X_test)
y_pred_test = logreg_B1.predict(X_test_dtm)

In [73]:
print(metrics.accuracy_score(y_pred_train, y_train))

0.944701716154


In [74]:
print(metrics.accuracy_score(y_pred_test, y_test))

0.81695607763


In [75]:
print(metrics.confusion_matrix(y_pred_train, y_train))

[[5700  210  228]
 [  97 3947   80]
 [ 128   69 4225]]


In [76]:
print(metrics.confusion_matrix(y_pred_test, y_test))

[[1708  221  254]
 [ 127 1102   68]
 [ 140   86 1189]]


In [77]:
print(metrics.classification_report(y_pred_train, y_train))

             precision    recall  f1-score   support

        EAP       0.96      0.93      0.95      6138
        HPL       0.93      0.96      0.95      4124
        MWS       0.93      0.96      0.94      4422

avg / total       0.95      0.94      0.94     14684



In [78]:
print(metrics.classification_report(y_pred_test,y_test))

             precision    recall  f1-score   support

        EAP       0.86      0.78      0.82      2183
        HPL       0.78      0.85      0.81      1297
        MWS       0.79      0.84      0.81      1415

avg / total       0.82      0.82      0.82      4895



In [79]:
# Get feature names
X_train_tokens = vect_B1.get_feature_names()

# Calculate coefficient into odd, calculate odd into probability
log_reg_prob=np.exp(logreg_B1.coef_)/(np.exp(logreg_B1.coef_)+1)

In [80]:
# coefficients each token appears across all classes
EAP_token_prob = log_reg_prob[0,:]
HPL_token_prob = log_reg_prob[1,:]
MWS_token_prob = log_reg_prob[2,:]

# number of times each token appears
EAP_token_coef = logreg_B1.coef_[0,:]
HPL_token_coef = logreg_B1.coef_[1,:]
MWS_token_coef = logreg_B1.coef_[2,:]

In [81]:
# Coefficient DataFrame
coefs = pd.DataFrame({'token':X_train_tokens, 
                       'EAP':EAP_token_coef, 
                       'HPL':HPL_token_coef, 
                       'MWS':MWS_token_coef}).set_index('token')

In [82]:
# examine DataFrame by author
for name in coefs.columns:
    print(coefs[name].nlargest(10),'\n')

token
upon          6.562724
madame        3.764638
however       3.619355
dupin         3.466446
matter        3.426787
lady          3.340734
character     3.201394
altogether    3.184059
mr            3.156012
although      3.146067
Name: EAP, dtype: float64 

token
though       6.083431
west         4.823636
street       4.567506
later        4.501265
gilman       4.205996
despite      4.125355
old          3.986580
innsmouth    3.831145
men          3.678563
whilst       3.507929
Name: HPL, dtype: float64 

token
raymond    8.191651
perdita    6.080660
adrian     5.910498
towards    5.586380
love       4.787638
idris      4.634267
plague     4.205655
misery     4.176011
sister     4.062894
cottage    3.972599
Name: MWS, dtype: float64 



In [83]:
# Probability DataFrame
probs = pd.DataFrame({'token':X_train_tokens, 
                       'EAP':EAP_token_prob, 
                       'HPL':HPL_token_prob, 
                       'MWS':MWS_token_prob}).set_index('token')

In [84]:
# examine DataFrame by author
for name in probs.columns:
    print(probs[name].nlargest(10),'\n')

token
upon          0.998590
madame        0.977349
however       0.973900
dupin         0.969718
matter        0.968531
lady          0.965800
character     0.960887
altogether    0.960230
mr            0.959145
although      0.958753
Name: EAP, dtype: float64 

token
though       0.997725
west         0.992027
street       0.989723
later        0.989027
gilman       0.985313
despite      0.984099
old          0.981775
innsmouth    0.978775
men          0.975363
whilst       0.970913
Name: HPL, dtype: float64 

token
raymond    0.999723
perdita    0.997719
adrian     0.997296
towards    0.996265
love       0.991737
idris      0.990380
plague     0.985308
misery     0.984873
sister     0.983092
cottage    0.981523
Name: MWS, dtype: float64 



### Vectorize + Logistic Regression = Experiment 2 (Vectorize / Split)

In [85]:
# fit model for best parameters
print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline_B1.steps])
print("parameters:")
pprint(parameters_B1, depth=2)

%time rand_search_B1.fit(X, y)

print("Best score: %0.3f" % rand_search_B1.best_score_)
print("Best parameters set:")

best_parameters = rand_search_B1.best_estimator_.get_params()

for param_name in sorted(parameters_B1.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing grid search...
pipeline: ['tfidf', 'log_reg']
parameters:
{'log_reg__C': [0.05, 0.1, 1.0, 2.0],
 'tfidf__max_df': (0.5, 0.75, 1.0),
 'tfidf__ngram_range': [(...), (...), (...)]}
CPU times: user 4min 50s, sys: 39.4 s, total: 5min 30s
Wall time: 3min 39s
Best score: 0.821
Best parameters set:
	log_reg__C: 2.0
	tfidf__max_df: 0.75
	tfidf__ngram_range: (1, 1)


In [86]:
# instantiate estimator
vect_B2 = TfidfVectorizer(binary=True, ngram_range=(1,1), stop_words=None, max_df=0.5)
logreg_B2 = LogisticRegression(C=2.0)

In [87]:
# fit with vectorizer
vect_B2.fit(X)

# transform with vectorizer
X_train_dtm = vect_B2.transform(X_train)

# fit with classifer
logreg_B2.fit(X_train_dtm, y_train)

# predict with classifier
y_pred_train = logreg_B2.predict(X_train_dtm)

# transform with vectorizer, then predict with classifier
X_test_dtm = vect_B2.transform(X_test)
y_pred_test = logreg_B2.predict(X_test_dtm)

In [88]:
print(metrics.accuracy_score(y_pred_train, y_train))

0.937278670662


In [89]:
print(metrics.accuracy_score(y_pred_test, y_test))

0.823493360572


In [90]:
print(metrics.confusion_matrix(y_pred_train, y_train))

[[5638  201  248]
 [ 108 3940  100]
 [ 179   85 4185]]


In [91]:
print(metrics.confusion_matrix(y_pred_test, y_test))

[[1696  211  222]
 [ 142 1128   82]
 [ 137   70 1207]]


In [92]:
print(metrics.classification_report(y_pred_train, y_train))

             precision    recall  f1-score   support

        EAP       0.95      0.93      0.94      6087
        HPL       0.93      0.95      0.94      4148
        MWS       0.92      0.94      0.93      4449

avg / total       0.94      0.94      0.94     14684



In [93]:
print(metrics.classification_report(y_pred_test,y_test))

             precision    recall  f1-score   support

        EAP       0.86      0.80      0.83      2129
        HPL       0.80      0.83      0.82      1352
        MWS       0.80      0.85      0.83      1414

avg / total       0.83      0.82      0.82      4895



In [94]:
# Get feature names
X_train_tokens = vect_B2.get_feature_names()

# Calculate coefficient into odd, calculate odd into probability
log_reg_prob=np.exp(logreg_B2.coef_)/(np.exp(logreg_B2.coef_)+1)

In [95]:
# coefficients each token appears across all classes
EAP_token_prob = log_reg_prob[0,:]
HPL_token_prob = log_reg_prob[1,:]
MWS_token_prob = log_reg_prob[2,:]

# number of times each token appears
EAP_token_coef = logreg_B2.coef_[0,:]
HPL_token_coef = logreg_B2.coef_[1,:]
MWS_token_coef = logreg_B2.coef_[2,:]

In [96]:
# Coefficient DataFrame
coefs = pd.DataFrame({'token':X_train_tokens, 
                       'EAP':EAP_token_coef, 
                       'HPL':HPL_token_coef, 
                       'MWS':MWS_token_coef}).set_index('token')

In [97]:
# examine DataFrame by author
for name in coefs.columns:
    print(coefs[name].nlargest(10),'\n')

token
upon         6.746061
is           3.889144
however      3.838754
madame       3.821593
lady         3.658483
dupin        3.278529
minutes      3.242017
although     3.234435
marie        3.210714
character    3.124110
Name: EAP, dtype: float64 

token
though       6.164898
west         4.582290
later        4.435798
street       4.219803
despite      4.102376
old          4.017074
uncle        3.968330
gilman       3.936558
men          3.906260
innsmouth    3.705354
Name: HPL, dtype: float64 

token
raymond    8.194954
adrian     5.920245
perdita    5.732935
her        5.660064
towards    5.109025
love       4.825883
my         4.687107
idris      4.593774
she        4.478862
plague     4.407684
Name: MWS, dtype: float64 



In [98]:
# Probability DataFrame
probs = pd.DataFrame({'token':X_train_tokens, 
                       'EAP':EAP_token_prob, 
                       'HPL':HPL_token_prob, 
                       'MWS':MWS_token_prob}).set_index('token')

In [99]:
# examine DataFrame by author
for name in probs.columns:
    print(probs[name].nlargest(10),'\n')

token
upon         0.998826
is           0.979947
however      0.978933
madame       0.978576
lady         0.974876
dupin        0.963685
minutes      0.962385
although     0.962110
marie        0.961235
character    0.957876
Name: EAP, dtype: float64 

token
though       0.997902
west         0.989872
later        0.988293
street       0.985511
despite      0.983736
old          0.982313
uncle        0.981446
gilman       0.980858
men          0.980281
innsmouth    0.975999
Name: HPL, dtype: float64 

token
raymond    0.999724
adrian     0.997323
perdita    0.996773
her        0.996530
towards    0.993994
love       0.992044
my         0.990871
idris      0.989987
she        0.988781
plague     0.987963
Name: MWS, dtype: float64 



### Vectorize + Random Forest = Experiment 1 (Split / Vectorize)

In [100]:
# tuning
# pipeline
pipeline = Pipeline([
    ('vect', TfidfVectorizer(binary=True, stop_words=None)),
    ('rf', RandomForestClassifier(oob_score=True, 
                                  random_state=1234, 
                                  warm_start=True,
                                  bootstrap=True))
])

# parameters (please note too many)
parameters = dict(
    rf__max_features = ['sqrt','log2'],
    rf__criterion = ["gini", "entropy"],
    rf__max_depth = [3, None],
    rf__min_samples_split = sp_randint(2, 11),
    rf__min_samples_leaf =  sp_randint(1, 11),
    rf__n_estimators = [10, 25, 50, 75, 100, 125, 150, 175, 200],
    vect__max_df = (0.5, 0.75, 1.0),
    vect__ngram_range = [(1,1), (1,2), (1,3)]
)

In [101]:
# cv=5 academically proven as best fold, kept n_jobs (jobs running parallel)= 1, output time
rand_search = RandomizedSearchCV(pipeline, 
                           parameters, 
                           n_jobs=1, 
                           cv=5
                )

In [102]:
# fit model for best parameters
print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
pprint(parameters, depth=2)

%time rand_search.fit(X_train, y_train)

Performing grid search...
pipeline: ['vect', 'rf']
parameters:
{'rf__criterion': ['gini', 'entropy'],
 'rf__max_depth': [3, None],
 'rf__max_features': ['sqrt', 'log2'],
 'rf__min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x11176b9b0>,
 'rf__min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x11176b710>,
 'rf__n_estimators': [10, 25, 50, 75, 100, 125, 150, 175, 200],
 'vect__max_df': (0.5, 0.75, 1.0),
 'vect__ngram_range': [(...), (...), (...)]}


  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])


CPU times: user 3min 30s, sys: 16.2 s, total: 3min 46s
Wall time: 3min 47s


RandomizedSearchCV(cv=5, error_score='raise',
          estimator=Pipeline(steps=[('vect', TfidfVectorizer(analyzer='word', binary=True, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
   ...stimators=10, n_jobs=1, oob_score=True, random_state=1234,
            verbose=0, warm_start=True))]),
          fit_params={}, iid=True, n_iter=10, n_jobs=1,
          param_distributions={'rf__max_features': ['sqrt', 'log2'], 'rf__criterion': ['gini', 'entropy'], 'rf__max_depth': [3, None], 'rf__min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x11176b710>, 'rf__min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x11176b9b0>, 'rf__n_estimators': [10, 25, 50, 75, 100, 125, 150, 175, 200], 'vect__max_df': (0.5, 0.75, 1.0), 'vect__ngram_range': [(1, 1), (1, 2), (1, 3)

In [103]:
print("Best score: %0.3f" % rand_search.best_score_)
print("Best parameters set:")

best_parameters = rand_search.best_estimator_.get_params()

for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Best score: 0.642
Best parameters set:
	rf__criterion: 'entropy'
	rf__max_depth: None
	rf__max_features: 'log2'
	rf__min_samples_leaf: 2
	rf__min_samples_split: 2
	rf__n_estimators: 125
	vect__max_df: 0.5
	vect__ngram_range: (1, 1)


In [104]:
# import and instantiate CountVectorizer (with the default parameters)
vect_C1 = TfidfVectorizer(max_df=0.5, 
                          ngram_range=(1,2), 
                          stop_words=None)

# random forest classifier; please note this is NOT including parameters
rf_C1 = RandomForestClassifier(oob_score=True, 
                               warm_start=True,
                               max_depth=None,
                               criterion='entropy',
                               max_features='sqrt',
                               min_samples_leaf=6,
                               min_samples_split=4,
                               n_estimators=175)

In [105]:
# fit & transform with vectorizer
X_train_dtm = vect_C1.fit_transform(X_train)

# transform test set
X_test_dtm = vect_C1.transform(X_test)

In [106]:
# fit with classifer
%time rf_C1.fit(X_train_dtm,y_train)

CPU times: user 8.85 s, sys: 344 ms, total: 9.19 s
Wall time: 9.22 s


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=6,
            min_samples_split=4, min_weight_fraction_leaf=0.0,
            n_estimators=175, n_jobs=1, oob_score=True, random_state=None,
            verbose=0, warm_start=True)

In [107]:
# predict with training class
y_train_pred = rf_C1.predict(X_train_dtm)

In [108]:
# predict with testing class
y_test_pred = rf_C1.predict(X_test_dtm)

In [109]:
# accuracy score, train
metrics.accuracy_score(y_train, y_train_pred)

0.74891037864342136

In [110]:
# accuracy score, test
metrics.accuracy_score(y_test, y_test_pred)

0.68702757916241064

In [111]:
# confusion matrix, train
metrics.confusion_matrix(y_train, y_train_pred)

array([[5692,   72,  161],
       [1631, 2485,  110],
       [1563,  150, 2820]])

In [112]:
# print confusion matrix
metrics.confusion_matrix(y_test, y_test_pred)

array([[1870,   29,   76],
       [ 717,  650,   42],
       [ 620,   48,  843]])

In [113]:
# classification report, train
print(metrics.classification_report(y_train, y_train_pred))

             precision    recall  f1-score   support

        EAP       0.64      0.96      0.77      5925
        HPL       0.92      0.59      0.72      4226
        MWS       0.91      0.62      0.74      4533

avg / total       0.80      0.75      0.74     14684



In [114]:
# classification report, test
print(metrics.classification_report(y_test, y_test_pred))

             precision    recall  f1-score   support

        EAP       0.58      0.95      0.72      1975
        HPL       0.89      0.46      0.61      1409
        MWS       0.88      0.56      0.68      1511

avg / total       0.76      0.69      0.68      4895



In [115]:
# OOB Score
print(rf_C1.oob_score_)

0.680400435849


In [116]:
# vocabulary (same as first trial)
X_tokens = vect_C1.get_feature_names()

In [117]:
# feature importance
tokens_rf = pd.DataFrame({'token':X_tokens, 
                          'metric':rf_C1.feature_importances_}).set_index('token')

In [118]:
print(tokens_rf['metric'].nlargest(10),'\n')

token
her        0.021132
raymond    0.011684
my         0.010527
upon       0.010472
is         0.010361
on         0.009368
though     0.007773
old        0.007448
love       0.006973
perdita    0.006906
Name: metric, dtype: float64 



### Vectorize + Random Forest = Experiment 2 (Vectorize / Split)

In [119]:
# fit model for best parameters
print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
pprint(parameters, depth=2)

%time rand_search.fit(X, y)

Performing grid search...
pipeline: ['vect', 'rf']
parameters:
{'rf__criterion': ['gini', 'entropy'],
 'rf__max_depth': [3, None],
 'rf__max_features': ['sqrt', 'log2'],
 'rf__min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x11176b9b0>,
 'rf__min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x11176b710>,
 'rf__n_estimators': [10, 25, 50, 75, 100, 125, 150, 175, 200],
 'vect__max_df': (0.5, 0.75, 1.0),
 'vect__ngram_range': [(...), (...), (...)]}


  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])


CPU times: user 4min 40s, sys: 28.5 s, total: 5min 8s
Wall time: 5min 9s


RandomizedSearchCV(cv=5, error_score='raise',
          estimator=Pipeline(steps=[('vect', TfidfVectorizer(analyzer='word', binary=True, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
   ...stimators=10, n_jobs=1, oob_score=True, random_state=1234,
            verbose=0, warm_start=True))]),
          fit_params={}, iid=True, n_iter=10, n_jobs=1,
          param_distributions={'rf__max_features': ['sqrt', 'log2'], 'rf__criterion': ['gini', 'entropy'], 'rf__max_depth': [3, None], 'rf__min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x11176b710>, 'rf__min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x11176b9b0>, 'rf__n_estimators': [10, 25, 50, 75, 100, 125, 150, 175, 200], 'vect__max_df': (0.5, 0.75, 1.0), 'vect__ngram_range': [(1, 1), (1, 2), (1, 3)

In [120]:
print("Best score: %0.3f" % rand_search.best_score_)
print("Best parameters set:")

best_parameters = rand_search.best_estimator_.get_params()

for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Best score: 0.665
Best parameters set:
	rf__criterion: 'entropy'
	rf__max_depth: None
	rf__max_features: 'sqrt'
	rf__min_samples_leaf: 9
	rf__min_samples_split: 4
	rf__n_estimators: 50
	vect__max_df: 0.5
	vect__ngram_range: (1, 1)


In [122]:
# import and instantiate CountVectorizer (with the default parameters)
vect_C2 = TfidfVectorizer(max_df=1.0, 
                          ngram_range=(1,2),
                          stop_words=stop_word)

# random forest classifier; please note this is NOT including parameters
rf_C2 = RandomForestClassifier(oob_score=True, 
                               warm_start=True,
                               max_depth=None,
                               criterion='gini',
                               max_features='sqrt',
                               min_samples_leaf=1,
                               min_samples_split=7,
                               n_estimators=75)

In [123]:
# fit with vectorizer
vect_C2.fit(X)

# transform train set
X_train_dtm = vect_C2.transform(X_train)

# transform test set
X_test_dtm = vect_C2.transform(X_test)

In [124]:
# fit with training class
%time rf_C2.fit(X_train_dtm, y_train)

CPU times: user 4min 17s, sys: 411 ms, total: 4min 18s
Wall time: 4min 18s


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=7, min_weight_fraction_leaf=0.0,
            n_estimators=75, n_jobs=1, oob_score=True, random_state=None,
            verbose=0, warm_start=True)

In [125]:
# predict with training class
y_train_pred = rf_C2.predict(X_train_dtm)

# predict with testing class
y_test_pred = rf_C2.predict(X_test_dtm)

In [126]:
# accuracy score, train
metrics.accuracy_score(y_train, y_train_pred)

0.99870607463906291

In [127]:
# accuracy score, test
metrics.accuracy_score(y_test, y_test_pred)

0.72339121552604702

In [128]:
# accuracy score, test
metrics.confusion_matrix(y_train, y_train_pred)

array([[5922,    0,    3],
       [   5, 4214,    7],
       [   4,    0, 4529]])

In [129]:
# accuracy score, test
metrics.confusion_matrix(y_test, y_test_pred)

array([[1784,   83,  108],
       [ 489,  825,   95],
       [ 495,   84,  932]])

In [130]:
# accuracy score, test
print(metrics.classification_report(y_train, y_train_pred))

             precision    recall  f1-score   support

        EAP       1.00      1.00      1.00      5925
        HPL       1.00      1.00      1.00      4226
        MWS       1.00      1.00      1.00      4533

avg / total       1.00      1.00      1.00     14684



In [131]:
# accuracy score, test
print(metrics.classification_report(y_test, y_test_pred))

             precision    recall  f1-score   support

        EAP       0.64      0.90      0.75      1975
        HPL       0.83      0.59      0.69      1409
        MWS       0.82      0.62      0.70      1511

avg / total       0.75      0.72      0.72      4895



In [132]:
# OOB Score
print(rf_C2.oob_score_)

0.718060473985


In [133]:
# vocabulary (same as first trial)
X_tokens = vect_C2.get_feature_names()

In [134]:
# feature importance
tokens_rf = pd.DataFrame({'token':X_tokens, 
                          'metric':rf_C2.feature_importances_}).set_index('token')

In [135]:
print(tokens_rf['metric'].nlargest(10),'\n')

token
upon       0.006144
raymond    0.005534
perdita    0.003854
though     0.003818
love       0.003631
adrian     0.003392
old        0.003087
father     0.002661
towards    0.002489
life       0.002411
Name: metric, dtype: float64 

