# sai: spooky author identification
## analysis 3: random forest

## discussion
In the previous code, the author identification analysis was performed with Naive Bayes Classification and Logistic Regression.  This used CountVectorizer to identify the author based on the text prose and split the training set into two areas: split and vectorize, or vectorize and split.  The later proved productive; therefore, this experiment will use random forests with hyperparameter tuning with CountVectorizer to obtain better predictions.  


## code
### preliminaries
Import libraries and modules.  Read csv file.  And show what's in it!

In [1]:
import pandas as pd
import numpy as np

# plotting
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# sklearn
from sklearn.cross_validation import train_test_split             # cross-validation
from sklearn.feature_extraction.text import CountVectorizer       # vectorizer
from sklearn.ensemble import RandomForestClassifier               # classifier
from sklearn.ensemble import ExtraTreesClassifier                 # classifier
from sklearn.model_selection import GridSearchCV                  # parameter tuning
from sklearn.model_selection import RandomizedSearchCV            # parameter tuning
from sklearn.pipeline import Pipeline                             # pipeline
from sklearn import metrics                                       # metrics

# other modules
from stop_words import get_stop_words
from scipy.stats import randint as sp_randint
import string
from pprint import pprint


# Read training texts: texts
texts = pd.read_csv('train.csv')
texts.head()



Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


### Simple Trial (no hyperparameter tuning)
#### feature selection

In [2]:
# choose X and y
X = texts.text
y = texts.author

#### vectorize

In [3]:
stop_word = get_stop_words('english')

In [4]:
vect = CountVectorizer(stop_words=stop_word)

In [5]:
# document-term matrix
X_dtm = vect.fit_transform(X)

#### split

In [6]:
# train-test split
X_train, X_test, y_train, y_test = train_test_split(X,y)

In [7]:
# transform training case
X_train_dtm = vect.transform(X_train)

# transform test case
X_test_dtm = vect.transform(X_test)

#### fit and predict

In [8]:
# random forest classifier; please note this is NOT including parameters
rf = RandomForestClassifier(oob_score = True)

In [9]:
# fit the model
%time rf.fit(X_dtm, y)

CPU times: user 5.2 s, sys: 31.1 ms, total: 5.23 s
Wall time: 5.23 s


  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=True, random_state=None,
            verbose=0, warm_start=False)

In [10]:
# predict with training class
y_train_pred = rf.predict(X_train_dtm)

In [11]:
# predict with testing class
y_test_pred = rf.predict(X_test_dtm)

#### evaluate

In [12]:
# oob score
print(rf.oob_score_)

0.610858572961


In [13]:
# accuracy score, train
metrics.accuracy_score(y_train, y_train_pred)

0.98896758376464178

In [14]:
# accuracy score, test
metrics.accuracy_score(y_test, y_test_pred)

0.9893769152196118

In [15]:
# confusion matrix, train
metrics.confusion_matrix(y_train, y_train_pred)

array([[5942,   10,   12],
       [  62, 4118,   11],
       [  50,   17, 4462]])

In [16]:
# print confusion matrix
metrics.confusion_matrix(y_test, y_test_pred)

array([[1923,    8,    5],
       [  17, 1426,    1],
       [  13,    8, 1494]])

In [17]:
# classification report, train
print(metrics.classification_report(y_train, y_train_pred))

             precision    recall  f1-score   support

        EAP       0.98      1.00      0.99      5964
        HPL       0.99      0.98      0.99      4191
        MWS       0.99      0.99      0.99      4529

avg / total       0.99      0.99      0.99     14684



In [18]:
# classification report, test
print(metrics.classification_report(y_test, y_test_pred))

             precision    recall  f1-score   support

        EAP       0.98      0.99      0.99      1936
        HPL       0.99      0.99      0.99      1444
        MWS       1.00      0.99      0.99      1515

avg / total       0.99      0.99      0.99      4895



In [19]:
# vocabulary
X_tokens = vect.get_feature_names()

In [20]:
# feature importance
tokens_rf = pd.DataFrame({'token':X_tokens, 'metric':rf.feature_importances_}).set_index('token')

In [21]:
print(tokens_rf['metric'].nlargest(10),'\n')

token
raymond    0.009076
upon       0.007015
though     0.006953
perdita    0.005213
love       0.005178
old        0.004164
adrian     0.003842
father     0.003819
life       0.003697
towards    0.002850
Name: metric, dtype: float64 



### updated parameters 
This will use the code from sci-kit learn developers to estimate the OOB (out of bag) error and the best feature handler for this dataset.  Here, the n_estimators is 100; n_estimators  

In [22]:
model =  RandomForestClassifier(n_estimators = 100, oob_score = True, warm_start = True)

In [23]:
%time model.fit(X_dtm, y)
y_train_pred = model.predict(X_train_dtm)
y_test_pred = model.predict(X_test_dtm)

CPU times: user 51.8 s, sys: 322 ms, total: 52.1 s
Wall time: 52.3 s


In [24]:
print(metrics.confusion_matrix(y_train, y_train_pred))

[[5964    0    0]
 [   1 4190    0]
 [   1    0 4528]]


In [25]:
print(metrics.classification_report(y_train, y_train_pred))

             precision    recall  f1-score   support

        EAP       1.00      1.00      1.00      5964
        HPL       1.00      1.00      1.00      4191
        MWS       1.00      1.00      1.00      4529

avg / total       1.00      1.00      1.00     14684



In [26]:
y_test_pred = model.predict(X_test_dtm)

In [27]:
print(metrics.accuracy_score(y_test, y_test_pred))

0.999795709908


In [28]:
# oob score
print(rf.oob_score_)

0.610858572961


In [29]:
print(metrics.confusion_matrix(y_test, y_test_pred))

[[1936    0    0]
 [   0 1444    0]
 [   1    0 1514]]


In [30]:
print(metrics.classification_report(y_test, y_test_pred))

             precision    recall  f1-score   support

        EAP       1.00      1.00      1.00      1936
        HPL       1.00      1.00      1.00      1444
        MWS       1.00      1.00      1.00      1515

avg / total       1.00      1.00      1.00      4895



In [33]:
# vocabulary (same as first trial)
X_tokens = vect.get_feature_names()

In [34]:
# feature importance
tokens_rf = pd.DataFrame({'token':X_tokens, 'metric':model.feature_importances_}).set_index('token')

In [35]:
print(tokens_rf['metric'].nlargest(10),'\n')

token
raymond    0.008629
upon       0.008079
perdita    0.005141
love       0.004937
though     0.004735
adrian     0.004516
old        0.004420
father     0.003796
west       0.003255
towards    0.003008
Name: metric, dtype: float64 



## hyperparameter tuning
### discussion
Random Forests may be the best estimator; however, tuning them is not!  Below is a simple RandomForestClassifier model:

`
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)
`


There are five different parameters to evaluate for a better score.  And with computational time an issue, it is best to break down each parameter and promote one with the best estimate parameters.  

* max_features=`['sqrt','none','log2']`
* criterion=`['gini','entropy']`
* n_estimators=`[50, 100, 150, 200]`
* min_samples_leaf=`[50,100,150]`

Other parameters will make it easy to train; this includes:

* random_state=42
* oob_score=True 
* warm_start=True (use previous build to build new set)


### 1 - `n_estimators` 
This will use the code from sci-kit learn developers to estimate the OOB (out of bag) error and the best feature handler for this dataset.  Here, the n_estimators is 10; n_estimators will increase every 10 steps per iteration so the final n will be 410 (40 * 10 + 10).  

In [None]:
errors = []
oob_error = []
growing_rf = RandomForestClassifier(n_estimators=10, 
                                    n_jobs=-1,  
                                    warm_start=True,
                                    oob_score=True,
                                    random_state=1514)

for i in range(40):
    growing_rf.fit(X_train_dtm, y_train)
    errors.append(metrics.log_loss(y_test, growing_rf.predict_proba(X_test_dtm)))
    oob_error.append([growing_rf.n_estimators, 1 - growing_rf.oob_score_])
    growing_rf.n_estimators += 10

In [None]:
x, y = [], []
for i in range(len(oob_error)):
    y.append(oob_error[i][1])
    x.append(oob_error[i][0])

In [None]:
_ = plt.figure(figsize=(20,5))
_ = plt.plot(x,y, '-r')
_ = plt.title('OOB Error by Time', size=14)
_ = plt.xlabel('n_estimators', size=14)
_ = plt.ylabel('OOB Error', size=14)

So far, within the first 5 iterations (where n_estimators = 10 to 60), the error goes down significantly but afterwards, it hovers around 0.77-0.79.  So it steadied.

### 2 - `n_estimators`  +  `max_features`
This will use the same code but will also use `max_features`.  This comes from the following link: http://scikit-learn.org/stable/auto_examples/ensemble/plot_ensemble_oob.html#sphx-glr-auto-examples-ensemble-plot-ensemble-oob-py.  This is essential to see how one feature does when another feature gets tweaked.  And it will determine a better choice for `max_features`.  

In [31]:
RANDOM_STATE = 123

In [None]:
# NOTE: Setting the `warm_start` construction parameter to `True` disables
# support for parallelized ensembles but is necessary for tracking the OOB
# error trajectory during training.

ensemble_clfs = [
    ("RandomForestClassifier, max_features='sqrt'",
        RandomForestClassifier(warm_start=True, 
                               oob_score=True,
                               max_features="sqrt",
                               n_estimators=10,
                               random_state=RANDOM_STATE)),
    ("RandomForestClassifier, max_features='log2'",
        RandomForestClassifier(warm_start=True, 
                               max_features='log2',
                               n_estimators=10,
                               oob_score=True,
                               random_state=RANDOM_STATE)),
    ("RandomForestClassifier, max_features=None",
        RandomForestClassifier(warm_start=True, 
                               max_features=None,
                               n_estimators=10,
                               oob_score=True,
                               random_state=RANDOM_STATE))
]

In [None]:
from collections import OrderedDict

# Map a classifier name to a list of (<n_estimators>, <error rate>) pairs.
error_rate = OrderedDict((label, []) for label, _ in ensemble_clfs)
oob_rate = OrderedDict((label, []) for label, _ in ensemble_clfs)

for label, clf in ensemble_clfs:
    for i in range(40):
        clf.fit(X_train_dtm, y_train)
        
        # Record the OOB error for each `n_estimators=i` setting.
        oob_error = 1 - clf.oob_score_
        log_loss_error = metrics.log_loss(y_test, clf.predict_proba(X_test_dtm))
        
        # Enter error rate details
        error_rate[label].append((i, clf.n_estimators, log_loss_error))
        oob_rate[label].append((i, clf.n_estimators, oob_error))
        
        # Update n_estimators 
        clf.n_estimators += 10

In [None]:
# Generate the "OOB error rate" vs. "n_estimators" plot.
plt.figure(figsize=(20,5))

for label, clf_err in error_rate.items():
    xs, ys, zs = zip(*clf_err)
    plt.plot(ys, zs, label=label)

plt.xlabel("n_estimators", fontsize=14)
plt.ylabel("log loss rate", fontsize=14)
plt.title('Log Loss vs n_estimators by max_feature', size=14)
plt.legend(loc="upper right", fontsize=14)
plt.show()

Like above, the first five estimators start with high errors.  However, the error goes down.  However, with NO maximum features, it steadied around 0.38-0.39.  With the other two maximum features, the 'sqrt' random forest had a slightly higher error than the 'log2' random forest but the 'log2' overtook it, having a smaller error over the long term.  This would continue for another 50 n_estimators.  

Working with n_estimators, the log2 random forest does well with this training set than the sqrt random forest and especially the random forest where there are no maximum features.  Even in a quick grid search, `sqrt` did the best with the training set although the best values were not present.  

In [None]:
# Generate the "log loss" vs. "n_estimators" plot.
plt.figure(figsize=(20,5))

for label, clf_err in oob_rate.items():
    xs, ys, zs  = zip(*clf_err)
    plt.plot(ys, zs, label=label)

plt.xlabel("n_estimators", fontsize=14)
plt.ylabel("OOB error rate", fontsize=14)
plt.title('OOB Error vs n_estimators by max_feature', size=14)
plt.legend(loc="upper right", fontsize=14)
plt.show()

Using the same evaluation metric, log_loss, the 'None' random forest has the same curvature as above.  However, the two have steady log-loss error rates below 0.80.  'log2', which did well in the OOB error curve, had a larger error by 0.01-0.02 against 'sqrt', as to why the pipeline preferred 'sqrt' as the best.  But also, it doesn't seem that one parameter is influencing the other.  While this is a simple choice, it will remain the same.  

### 3 - `min_samples_leaf`
This will now test the minimum sample leafs using the information obtained above.  Seeing that the n_estimators perform well after 50 and that the 'log2' random forest and the 'sqrt' random forest did better than having no maximum features, it would be best to continue on this route.  Hypothetically if having 15 possibilities per feature to check, then with n features, it would recur 15 * n, and this is not including cross-validation.  

In [None]:
sample_leaf_options = list(range(1,100))

In [None]:
leaf_error = []
oob_rate_leaf = []

for leaf_size in sample_leaf_options:
    clf = RandomForestClassifier(n_estimators = 100, 
                                oob_score = True, 
                                warm_start = True,
                                n_jobs = -1,
                                random_state = RANDOM_STATE, 
                                max_features = "log2", 
                                min_samples_leaf = leaf_size)
    
    clf.fit(X_train_dtm, y_train)
    
    # OOB and log-loss error
    
    log_loss_error = metrics.log_loss(y_test, clf.predict_proba(X_test_dtm))
    
        
    # Enter error rate details
    leaf_error.append((leaf_size, log_loss_error))
    oob_rate_leaf.append((leaf_size, clf.oob_score_))

In [None]:
x, y = [], []
for i in range(len(oob_rate_leaf)):
    y.append(oob_rate_leaf[i][1])
    x.append(oob_rate_leaf[i][0])

In [None]:
_ = plt.figure(figsize=(20,5))
_ = plt.plot(x,y, '-r')
_ = plt.title('OOB Error by Time', size=14)
_ = plt.xlabel('min_samples_leaf', size=14)
_ = plt.ylabel('OOB Error', size=14)

The default leaf sample size (= 1) seems to be the best choice.  As the minimal sample increases so does the leaf error (the log-loss error when changing the number).  Also, after the ninth iteration, the error will remain stabilized.  It will still compute but it will stabilize.  Thus the min_samples_leaf is best left alone.

### 4 - Other Tuning Points

In [57]:
# tuning
# pipeline
pipeline = Pipeline([
    ('vect', CountVectorizer(binary=True, stop_words=None)),
    ('rf', RandomForestClassifier(oob_score=True, 
                                  random_state=RANDOM_STATE, 
                                  warm_start=True,
                                 bootstrap=True))
])

# parameters (please note too many)
parameters = dict(
    rf__max_features = ['sqrt','log2'],
    rf__criterion = ["gini", "entropy"],
    rf__max_depth = [3, None],
    rf__min_samples_split = sp_randint(2, 11),
    rf__min_samples_leaf =  sp_randint(1, 11),
    rf__n_estimators = [10, 25, 50, 75, 100, 125, 150, 175, 200],
    vect__max_df = (0.5, 0.75, 1.0),
    vect__ngram_range = [(1,1), (1,2), (2,2)]
)

In [58]:
# cv=5 academically proven as best fold, kept n_jobs (jobs running parallel)= 1, output time
rand_search = RandomizedSearchCV(pipeline, 
                           parameters, 
                           n_jobs=1, 
                           cv=5
                )

In [59]:
# fit model for best parameters
print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
pprint(parameters, depth=2)

%time rand_search.fit(X_train, y_train)

Performing grid search...
pipeline: ['vect', 'rf']
parameters:
{'rf__criterion': ['gini', 'entropy'],
 'rf__max_depth': [3, None],
 'rf__max_features': ['sqrt', 'log2'],
 'rf__min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x10f88e2b0>,
 'rf__min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x10f8690f0>,
 'rf__n_estimators': [10, 25, 50, 75, 100, 125, 150, 175, 200],
 'vect__max_df': (0.5, 0.75, 1.0),
 'vect__ngram_range': [(...), (...), (...)]}


  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])


CPU times: user 2min 47s, sys: 7.84 s, total: 2min 55s
Wall time: 2min 55s


RandomizedSearchCV(cv=5, error_score='raise',
          estimator=Pipeline(steps=[('vect', CountVectorizer(analyzer='word', binary=True, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_...estimators=10, n_jobs=1, oob_score=True, random_state=123,
            verbose=0, warm_start=True))]),
          fit_params={}, iid=True, n_iter=10, n_jobs=1,
          param_distributions={'rf__max_features': ['sqrt', 'log2'], 'rf__criterion': ['gini', 'entropy'], 'rf__max_depth': [3, None], 'rf__min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x10f8690f0>, 'rf__min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x10f88e2b0>, 'rf__n_estimators': [10, 25, 50, 75, 100, 125, 150, 175, 200], 'vect__max_df': (0.5, 0.75, 1.0), 'vect__ngram_range': [(1, 1), (1, 2), (2, 2)

In [60]:
print("Best score: %0.3f" % rand_search.best_score_)
print("Best parameters set:")

best_parameters = rand_search.best_estimator_.get_params()

for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Best score: 0.678
Best parameters set:
	rf__criterion: 'entropy'
	rf__max_depth: None
	rf__max_features: 'sqrt'
	rf__min_samples_leaf: 4
	rf__min_samples_split: 5
	rf__n_estimators: 100
	vect__max_df: 0.5
	vect__ngram_range: (1, 2)


In [61]:
# fit model for best parameters
print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
pprint(parameters, depth=2)

%time rand_search.fit(X, y)

Performing grid search...
pipeline: ['vect', 'rf']
parameters:
{'rf__criterion': ['gini', 'entropy'],
 'rf__max_depth': [3, None],
 'rf__max_features': ['sqrt', 'log2'],
 'rf__min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x10f88e2b0>,
 'rf__min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x10f8690f0>,
 'rf__n_estimators': [10, 25, 50, 75, 100, 125, 150, 175, 200],
 'vect__max_df': (0.5, 0.75, 1.0),
 'vect__ngram_range': [(...), (...), (...)]}
CPU times: user 10min 30s, sys: 24.2 s, total: 10min 54s
Wall time: 10min 59s


RandomizedSearchCV(cv=5, error_score='raise',
          estimator=Pipeline(steps=[('vect', CountVectorizer(analyzer='word', binary=True, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_...estimators=10, n_jobs=1, oob_score=True, random_state=123,
            verbose=0, warm_start=True))]),
          fit_params={}, iid=True, n_iter=10, n_jobs=1,
          param_distributions={'rf__max_features': ['sqrt', 'log2'], 'rf__criterion': ['gini', 'entropy'], 'rf__max_depth': [3, None], 'rf__min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x10f8690f0>, 'rf__min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x10f88e2b0>, 'rf__n_estimators': [10, 25, 50, 75, 100, 125, 150, 175, 200], 'vect__max_df': (0.5, 0.75, 1.0), 'vect__ngram_range': [(1, 1), (1, 2), (2, 2)

In [62]:
print("Best score: %0.3f" % rand_search.best_score_)
print("Best parameters set:")

best_parameters = rand_search.best_estimator_.get_params()

for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Best score: 0.663
Best parameters set:
	rf__criterion: 'entropy'
	rf__max_depth: None
	rf__max_features: 'sqrt'
	rf__min_samples_leaf: 8
	rf__min_samples_split: 5
	rf__n_estimators: 100
	vect__max_df: 0.5
	vect__ngram_range: (1, 1)


## experiment 1 - split / vectorize

In [63]:
# import and instantiate CountVectorizer (with the default parameters)
        
vect_rf1 = CountVectorizer(max_df=0.5, 
                          ngram_range=(1,2),
                          stop_words=stop_word)

# random forest classifier; please note this is NOT including parameters
rf3 = RandomForestClassifier(oob_score = True,
                            criterion = 'entropy',
                            max_features = 'sqrt',
                            min_samples_leaf = 4,
                            min_samples_split = 5,
                            n_estimators = 5)

In [65]:
# Fit and transform
vect_rf1.fit(X_train)
X_train_dtm = vect_rf1.transform(X_train)
X_test_dtm = vect_rf1.transform(X_test)

# Fit and predict
rf3.fit(X_train_dtm, y_train)
y_train_pred = rf3.predict(X_train_dtm)
y_test_pred = rf3.predict(X_test_dtm)

  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])


In [66]:
# oob score
print(rf3.oob_score_)

0.583492236448


In [67]:
# accuracy score, test
metrics.accuracy_score(y_test, y_test_pred)

0.66557711950970377

In [68]:
# confusion matrix, train
metrics.confusion_matrix(y_train, y_train_pred)

array([[5193,  403,  329],
       [1299, 2673,  254],
       [1453,  367, 2713]])

In [69]:
# print confusion matrix
metrics.confusion_matrix(y_test, y_test_pred)

array([[1658,  173,  144],
       [ 519,  780,  110],
       [ 530,  161,  820]])

In [70]:
# classification report, train
print(metrics.classification_report(y_train, y_train_pred))

             precision    recall  f1-score   support

        EAP       0.65      0.88      0.75      5925
        HPL       0.78      0.63      0.70      4226
        MWS       0.82      0.60      0.69      4533

avg / total       0.74      0.72      0.72     14684



In [71]:
# classification report, test
print(metrics.classification_report(y_test, y_test_pred))

             precision    recall  f1-score   support

        EAP       0.61      0.84      0.71      1975
        HPL       0.70      0.55      0.62      1409
        MWS       0.76      0.54      0.63      1511

avg / total       0.68      0.67      0.66      4895



In [72]:
# vocabulary
X_tokens = vect_rf1.get_feature_names()

In [73]:
# feature importance
tokens_rf = pd.DataFrame({'token':X_tokens, 'metric':rf3.feature_importances_}).set_index('token')

In [74]:
print(tokens_rf['metric'].nlargest(10),'\n')

token
raymond    0.015360
father     0.013091
windsor    0.012269
though     0.010241
thus       0.009343
upon       0.008020
adrian     0.007948
idris      0.006922
thing      0.006913
sister     0.006606
Name: metric, dtype: float64 



## experiment 2 - vectorize / split

In [36]:
# import and instantiate CountVectorizer (with the default parameters)
vect_rf = CountVectorizer(max_df=1.0, 
                          ngram_range=(1,2),
                          stop_words=stop_word)

In [37]:
# learn the 'vocabulary' of the training data (occurs in-place)
X_dtm = vect_rf.fit_transform(X)

In [38]:
# cross validation
X_train, X_test, y_train, y_test = train_test_split(X,y,stratify=y)

In [39]:
# transform 
X_train_dtm = vect_rf.transform(X_train)
X_test_dtm = vect_rf.transform(X_test)

In [40]:
# random forest classifier; please note this is NOT including parameters
rf = RandomForestClassifier(oob_score=True, 
                            warm_start=True,
                            max_features='log2',
                            min_samples_split=4,
                            n_estimators=175
                           )

In [41]:
# fit the model
%time rf.fit(X_dtm, y)

CPU times: user 3min 50s, sys: 1.34 s, total: 3min 51s
Wall time: 3min 52s


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='log2', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=4, min_weight_fraction_leaf=0.0,
            n_estimators=175, n_jobs=1, oob_score=True, random_state=None,
            verbose=0, warm_start=True)

In [42]:
# predict with training class
y_train_pred = rf.predict(X_train_dtm)

In [43]:
# predict with testing class
y_test_pred = rf.predict(X_test_dtm)

In [44]:
# calculate predicted probabilities for X_test_dtm
y_pred_prob = rf.predict_proba(X_test_dtm)

In [45]:
# accuracy score, train
metrics.accuracy_score(y_train, y_train_pred)

0.999591391991283

In [46]:
# oob score
print(rf.oob_score_)

0.71341743705


In [47]:
# accuracy score, test
metrics.accuracy_score(y_test, y_test_pred)

0.99959141981613897

In [48]:
# confusion matrix, train
metrics.confusion_matrix(y_train, y_train_pred)

array([[5925,    0,    0],
       [   5, 4221,    0],
       [   1,    0, 4532]])

In [49]:
# print confusion matrix
metrics.confusion_matrix(y_test, y_test_pred)

array([[1975,    0,    0],
       [   1, 1408,    0],
       [   1,    0, 1510]])

In [50]:
# classification report, train
print(metrics.classification_report(y_train, y_train_pred))

             precision    recall  f1-score   support

        EAP       1.00      1.00      1.00      5925
        HPL       1.00      1.00      1.00      4226
        MWS       1.00      1.00      1.00      4533

avg / total       1.00      1.00      1.00     14684



In [51]:
# classification report, test
print(metrics.classification_report(y_test, y_test_pred))

             precision    recall  f1-score   support

        EAP       1.00      1.00      1.00      1975
        HPL       1.00      1.00      1.00      1409
        MWS       1.00      1.00      1.00      1511

avg / total       1.00      1.00      1.00      4895



### feature importance

In [52]:
# vocabulary
X_tokens = vect_rf.get_feature_names()

In [53]:
# feature importance
tokens_rf = pd.DataFrame({'token':X_tokens, 'metric':rf.feature_importances_}).set_index('token')

In [54]:
print(tokens_rf['metric'].nlargest(10),'\n')

token
upon       0.001877
raymond    0.001590
love       0.001330
perdita    0.001265
adrian     0.001253
father     0.000988
though     0.000964
old        0.000926
heart      0.000900
life       0.000838
Name: metric, dtype: float64 

