In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pandas as pd
import numpy as np
import os

In [2]:
# Sanity check: show what sits next to this notebook (listdir defaults to the
# current working directory).
os.listdir()

['random_forests.ipynb', '.ipynb_checkpoints']

In [3]:
# Location of the dataset file, expressed relative to the notebook's working
# directory.
data_file_name = 'spambase.data'
data_path = '../data/spam/' + data_file_name

In [4]:
# Read the raw CSV. header=None tells pandas the file has no header row, so
# the columns receive integer labels starting at 0.
spam_data = pd.read_csv(filepath_or_buffer=data_path, header=None)

# Leave the frame as the last expression so the notebook renders it.
spam_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,57
0,0.00,0.64,0.64,0.0,0.32,0.00,0.00,0.00,0.00,0.00,...,0.000,0.000,0.0,0.778,0.000,0.000,3.756,61,278,1
1,0.21,0.28,0.50,0.0,0.14,0.28,0.21,0.07,0.00,0.94,...,0.000,0.132,0.0,0.372,0.180,0.048,5.114,101,1028,1
2,0.06,0.00,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.010,0.143,0.0,0.276,0.184,0.010,9.821,485,2259,1
3,0.00,0.00,0.00,0.0,0.63,0.00,0.31,0.63,0.31,0.63,...,0.000,0.137,0.0,0.137,0.000,0.000,3.537,40,191,1
4,0.00,0.00,0.00,0.0,0.63,0.00,0.31,0.63,0.31,0.63,...,0.000,0.135,0.0,0.135,0.000,0.000,3.537,40,191,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4596,0.31,0.00,0.62,0.0,0.00,0.31,0.00,0.00,0.00,0.00,...,0.000,0.232,0.0,0.000,0.000,0.000,1.142,3,88,0
4597,0.00,0.00,0.00,0.0,0.00,0.00,0.00,0.00,0.00,0.00,...,0.000,0.000,0.0,0.353,0.000,0.000,1.555,4,14,0
4598,0.30,0.00,0.30,0.0,0.00,0.00,0.00,0.00,0.00,0.00,...,0.102,0.718,0.0,0.000,0.000,0.000,1.404,6,118,0
4599,0.96,0.00,0.00,0.0,0.32,0.00,0.00,0.00,0.00,0.00,...,0.000,0.057,0.0,0.000,0.000,0.000,1.147,5,78,0


## Comparison I: OOB estimate (70% train) vs. cross-validation (70% train) + hold-out test (30%)

In [5]:
# Column 57 (the last one) is used as the target; every other column is a
# feature.
label_column = 57
full_y_data = spam_data[label_column]
full_x_data = spam_data.drop(columns=label_column)

# Print the feature-matrix shape first, then the target shape.
for part in (full_x_data, full_y_data):
    print(part.shape)

(4601, 57)
(4601,)


In [6]:
# Hold out 30% of the rows as a test set; random_state pins the shuffle so the
# split is the same on every run.
# NOTE(review): the split is not stratified on the label — consider
# stratify=full_y_data if class balance across the two splits matters.
x_train, x_test, y_train, y_test = train_test_split(
    full_x_data,
    full_y_data,
    test_size=0.3,
    random_state=42,
)

In [7]:
# Search grid: only max_features (how many features each split may consider)
# is tuned.
# NOTE(review): with 57 feature columns 'sqrt' presumably resolves to 7 and
# would duplicate the explicit 7 below — confirm against the scikit-learn docs.
max_features_candidates = ('sqrt', 'log2', 1, 3, 7, 15, 35, 57)
parameters = dict(max_features=max_features_candidates)

In [8]:
# Base forest shared by every grid-search candidate; max_features is the only
# setting the search overrides.
#
# oob_score=True asks for an out-of-bag estimate after fitting. It relies on
# bootstrap sampling (left at the estimator default here); warm_start is not
# needed for it and is therefore not set.
# verbose=1 makes fit/predict emit joblib progress lines.
# NOTE(review): the GridSearchCV wrapping this estimator also uses n_jobs=-1;
# nesting two parallel levels may oversubscribe cores — consider keeping only
# one of them parallel.
rf_settings = dict(
    n_estimators=200,
    criterion="gini",
    max_depth=20,
    oob_score=True,
    random_state=42,
    n_jobs=-1,
    verbose=1,
)
rf_clf = RandomForestClassifier(**rf_settings)

In [12]:
# Exhaustive search over `parameters` using 10-fold cross-validation on the
# training split, ranking candidates by F1.
# NOTE(review): the output saved with this cell reports 6 candidates, while
# `parameters` currently lists 8 — re-run the cell to refresh the output.
g_clf = GridSearchCV(
    estimator=rf_clf,
    param_grid=parameters,
    scoring='f1',
    cv=10,
    n_jobs=-1,
    verbose=4,
)

# Kept as the last expression so the notebook shows the fitted search object.
g_clf.fit(x_train, y_train)

Fitting 10 folds for each of 6 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:    0.6s finished


GridSearchCV(cv=10,
             estimator=RandomForestClassifier(max_depth=20, n_estimators=200,
                                              n_jobs=-1, oob_score=True,
                                              random_state=42, verbose=1),
             n_jobs=-1, param_grid={'max_features': (1, 3, 7, 15, 35, 57)},
             scoring='f1', verbose=4)

In [13]:
# --- Winning hyper-parameters -------------------------------------------------
print("Best parameters set found on development set:", end="\n\n")
print(g_clf.best_params_, end="\n\n")

# --- Cross-validated score of every candidate ---------------------------------
# One line per candidate: mean CV score, twice the standard deviation across
# folds, and the parameter setting.
print("Grid scores on development set:", end="\n\n")
cv_results = g_clf.cv_results_
means = cv_results['mean_test_score']
stds = cv_results['std_test_score']
for idx, params in enumerate(cv_results['params']):
    mean, std = means[idx], stds[idx]
    print("%0.3f (+/-%0.03f) for %r"
          % (mean, std * 2, params))
print()

# --- Hold-out evaluation ------------------------------------------------------
print("Detailed classification report:", end="\n\n")
print("The model is trained on the full development set.")
print("The scores are computed on the full evaluation set.", end="\n\n")

# Predict on the held-out test split with the fitted search object and compare
# against the true labels.
y_true = y_test
y_pred = g_clf.predict(x_test)
print(classification_report(y_true, y_pred), end="\n\n")

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.


Best parameters set found on development set:

{'max_features': 7}

Grid scores on development set:

0.918 (+/-0.055) for {'max_features': 1}
0.931 (+/-0.047) for {'max_features': 3}
0.931 (+/-0.039) for {'max_features': 7}
0.928 (+/-0.044) for {'max_features': 15}
0.921 (+/-0.032) for {'max_features': 35}
0.919 (+/-0.026) for {'max_features': 57}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

              precision    recall  f1-score   support

           0       0.95      0.97      0.96       804
           1       0.96      0.93      0.94       577

    accuracy                           0.95      1381
   macro avg       0.95      0.95      0.95      1381
weighted avg       0.95      0.95      0.95      1381




[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 200 out of 200 | elapsed:    0.1s finished
