## Load Data

In [3]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
df = pd.read_csv('../data/processed/mushrooms_pca.csv')
y = df['class']
X = df.drop(columns=['class']).values
X.view()

array([[ 0.24373845,  0.16963259,  0.18727096, ..., -0.2621439 ,
        -0.04811481,  0.10087257],
       [ 1.5713711 , -0.39656834,  0.45390158, ..., -0.11453773,
         0.41043087, -0.28486641],
       [ 0.01171017,  0.71284725, -0.7501287 , ...,  0.00313137,
        -0.25922659, -0.06733154],
       ...,
       [-0.89697748, -0.1697835 ,  0.3055329 , ...,  0.53583814,
         0.06813399, -0.06614521],
       [-1.88993255, -0.39236622, -0.58529202, ...,  0.1682014 ,
         0.37251319, -0.41726506],
       [ 0.66668421,  0.4509778 , -0.09989701, ..., -0.18299176,
        -0.0747118 , -0.36911983]])

In [57]:
def get_param_df(results):
    lines = []
    for _,r in results.iterrows():
        line = r['params']
        line['time'] = r['mean_fit_time']
        line['f1_score'] = r['mean_test_score']
        lines.append(line)
    return pd.DataFrame(lines)

## Random Search

In [25]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Number of trees in random forest
n_estimators = [10, 100, 200, 400, 600, 800, 1000, 1200, 1400]
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}

random_grid

{'n_estimators': [10, 100, 200, 400, 600, 800, 1000, 1200, 1400],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'min_samples_split': [2, 5, 10],
 'min_samples_leaf': [1, 2, 4]}

In [26]:
# define a RF estimator
rf = RandomForestClassifier()

# set up randomized search with cross-validation
rf_random = RandomizedSearchCV(estimator = rf, 
                               param_distributions = random_grid, 
                               cv = 3, 
                               n_iter = 100,
                               verbose=20, 
                               scoring='f1',
                               n_jobs = -1)

# fit on the data
rf_random.fit(X, y)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   49.0s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   49.6s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:   50.4s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:   58.7s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   59.7s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done  15 tasks      | elapsed:  2

[Parallel(n_jobs=-1)]: Done 132 tasks      | elapsed: 15.5min
[Parallel(n_jobs=-1)]: Done 133 tasks      | elapsed: 15.5min
[Parallel(n_jobs=-1)]: Done 134 tasks      | elapsed: 15.5min
[Parallel(n_jobs=-1)]: Done 135 tasks      | elapsed: 15.6min
[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed: 15.7min
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed: 15.8min
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed: 15.8min
[Parallel(n_jobs=-1)]: Done 139 tasks      | elapsed: 15.8min
[Parallel(n_jobs=-1)]: Done 140 tasks      | elapsed: 15.8min
[Parallel(n_jobs=-1)]: Done 141 tasks      | elapsed: 15.9min
[Parallel(n_jobs=-1)]: Done 142 tasks      | elapsed: 15.9min
[Parallel(n_jobs=-1)]: Done 143 tasks      | elapsed: 15.9min
[Parallel(n_jobs=-1)]: Done 144 tasks      | elapsed: 15.9min
[Parallel(n_jobs=-1)]: Done 145 tasks      | elapsed: 15.9min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed: 16.0min
[Parallel(n_jobs=-1)]: Done 147 tasks      | elapsed: 16.0min
[Paralle

[Parallel(n_jobs=-1)]: Done 265 tasks      | elapsed: 29.1min
[Parallel(n_jobs=-1)]: Done 266 tasks      | elapsed: 29.1min
[Parallel(n_jobs=-1)]: Done 267 tasks      | elapsed: 29.2min
[Parallel(n_jobs=-1)]: Done 268 tasks      | elapsed: 29.4min
[Parallel(n_jobs=-1)]: Done 269 tasks      | elapsed: 29.4min
[Parallel(n_jobs=-1)]: Done 270 tasks      | elapsed: 29.4min
[Parallel(n_jobs=-1)]: Done 271 tasks      | elapsed: 29.4min
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed: 29.4min
[Parallel(n_jobs=-1)]: Done 273 tasks      | elapsed: 29.6min
[Parallel(n_jobs=-1)]: Done 274 tasks      | elapsed: 29.6min
[Parallel(n_jobs=-1)]: Done 275 tasks      | elapsed: 29.7min
[Parallel(n_jobs=-1)]: Done 276 tasks      | elapsed: 29.7min
[Parallel(n_jobs=-1)]: Done 277 tasks      | elapsed: 30.0min
[Parallel(n_jobs=-1)]: Done 278 tasks      | elapsed: 30.1min
[Parallel(n_jobs=-1)]: Done 279 tasks      | elapsed: 30.7min
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed: 30.7min
[Paralle

RandomizedSearchCV(cv=3, error_score='raise-deprecating',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          fit_params=None, iid='warn', n_iter=100, n_jobs=-1,
          param_distributions={'n_estimators': [10, 100, 200, 400, 600, 800, 1000, 1200, 1400], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring='f1', verbose=20)

In [27]:
print(rf_random.best_estimator_)
print(rf_random.best_score_)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1400, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
0.7851887810204893


In [28]:
import pickle
pickle.dump(rf_random, open( "../models/rf-random-search.p", "wb" ) )

In [55]:
import warnings
warnings.filterwarnings('ignore')
param_df = get_param_df(pd.DataFrame(rf_random.cv_results_))
print(param_df.sort_values(by=['score'], ascending=False).reset_index(drop=True).head().to_latex())

\begin{tabular}{lrrrrrr}
\toprule
{} &  max\_depth &  min\_samples\_leaf &  min\_samples\_split &  n\_estimators &     score &        time \\
\midrule
0 &        NaN &                 1 &                  2 &          1400 &  0.785189 &  118.201951 \\
1 &       20.0 &                 1 &                  2 &          1000 &  0.785053 &   81.395167 \\
2 &       50.0 &                 1 &                  2 &          1000 &  0.784630 &   84.439326 \\
3 &       70.0 &                 1 &                 10 &          1000 &  0.784376 &   82.668020 \\
4 &        NaN &                 1 &                  5 &           600 &  0.783916 &   45.652530 \\
\bottomrule
\end{tabular}



## Grid Search

In [33]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Number of trees in random forest
n_estimators = [1400, 1600, 1800, 2000]
# Maximum number of levels in tree
max_depth = [None]
# Minimum number of samples required to split a node
min_samples_split = [2, 3, 4]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}

random_grid

{'n_estimators': [1400, 1600, 1800, 2000],
 'max_depth': [None],
 'min_samples_split': [2, 3, 4],
 'min_samples_leaf': [1, 2]}

In [34]:
# define a RF estimator
rf = RandomForestClassifier()

# set up randomized search with cross-validation
rf_grid = GridSearchCV(estimator = rf, 
                               param_grid = random_grid, 
                               cv = 10, 
                               verbose=20, 
                               scoring='f1',
                               n_jobs = -1)

# fit on the data
rf_grid.fit(X, y)

Fitting 10 folds for each of 24 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  5.7min
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  5.8min
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done  15 tasks      | elapsed:  6

[Parallel(n_jobs=-1)]: Done 132 tasks      | elapsed: 57.8min
[Parallel(n_jobs=-1)]: Done 133 tasks      | elapsed: 59.1min
[Parallel(n_jobs=-1)]: Done 134 tasks      | elapsed: 59.1min
[Parallel(n_jobs=-1)]: Done 135 tasks      | elapsed: 59.3min
[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed: 59.3min
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed: 60.3min
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed: 60.4min
[Parallel(n_jobs=-1)]: Done 139 tasks      | elapsed: 60.9min
[Parallel(n_jobs=-1)]: Done 140 tasks      | elapsed: 60.9min
[Parallel(n_jobs=-1)]: Done 141 tasks      | elapsed: 62.6min
[Parallel(n_jobs=-1)]: Done 142 tasks      | elapsed: 62.6min
[Parallel(n_jobs=-1)]: Done 143 tasks      | elapsed: 62.9min
[Parallel(n_jobs=-1)]: Done 144 tasks      | elapsed: 62.9min
[Parallel(n_jobs=-1)]: Done 145 tasks      | elapsed: 63.8min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed: 63.9min
[Parallel(n_jobs=-1)]: Done 147 tasks      | elapsed: 64.4min
[Paralle

GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'n_estimators': [1400, 1600, 1800, 2000], 'max_depth': [None], 'min_samples_split': [2, 3, 4], 'min_samples_leaf': [1, 2]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='f1', verbose=20)

In [35]:
import pickle
pickle.dump(rf_grid, open( "../models/rf-grid-search.p", "wb" ) )

In [36]:
print(rf_grid.best_estimator_)
print(rf_grid.best_score_)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=3,
            min_weight_fraction_leaf=0.0, n_estimators=2000, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
0.8076609162889906


In [56]:
import warnings
warnings.filterwarnings('ignore')
param_df = get_param_df(pd.DataFrame(rf_grid.cv_results_))
print(param_df.sort_values(by=['score'], ascending=False).reset_index(drop=True).head().to_latex())

\begin{tabular}{llrrrrr}
\toprule
{} & max\_depth &  min\_samples\_leaf &  min\_samples\_split &  n\_estimators &     score &        time \\
\midrule
0 &      None &                 1 &                  3 &          2000 &  0.807661 &  242.734366 \\
1 &      None &                 1 &                  3 &          1600 &  0.807331 &  191.676363 \\
2 &      None &                 1 &                  4 &          1400 &  0.806906 &  170.072244 \\
3 &      None &                 1 &                  4 &          1800 &  0.806871 &  216.219033 \\
4 &      None &                 2 &                  3 &          1600 &  0.806436 &  182.417167 \\
\bottomrule
\end{tabular}



## Confusion matrix

In [58]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix

y_pred = cross_val_predict(rf_grid.best_estimator_, X, y, cv=10)
conf_mat = confusion_matrix(y, y_pred)

In [59]:
conf_mat

array([[3694,  514],
       [ 828, 3088]])

## Classification

In [64]:
first_ten = X[:10,]
pd.DataFrame(first_ten)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,74,75,76,77,78,79,80,81,82,83
0,0.243738,0.169633,0.187271,1.376409,-0.106942,-0.878097,-0.210952,-0.687473,-1.018611,-0.662291,...,-0.441639,0.042792,-0.063609,-0.120454,0.507655,0.023254,0.222428,-0.262144,-0.048115,0.100873
1,1.571371,-0.396568,0.453902,-0.311216,-1.197749,0.324961,-0.004409,-0.342539,-0.37538,0.408702,...,0.352448,-0.049725,0.265986,-0.026997,0.042791,-0.083705,-0.486171,-0.114538,0.410431,-0.284866
2,0.01171,0.712847,-0.750129,-0.960438,-0.482845,-1.151565,0.035126,0.225844,0.366011,0.077844,...,0.366079,-0.188762,0.210489,0.006704,0.176389,0.223179,0.171282,0.003131,-0.259227,-0.067332
3,-0.582489,-0.780651,1.257254,-0.378224,0.070664,-0.385106,0.960738,-1.132339,-0.243777,-0.334009,...,-0.312745,-0.489868,0.084006,0.331273,-0.240141,0.242239,0.228877,-0.268884,-0.078804,-0.339742
4,-0.534264,0.601503,1.705691,-0.109213,0.405468,0.472674,-0.394598,-0.405106,0.541237,1.374447,...,0.153536,-0.014964,0.05249,0.104576,0.26149,0.152714,0.093212,0.101715,0.029535,-0.002264
5,-1.204925,0.884577,-0.394593,-0.503846,-0.251984,-0.343648,-0.294517,-0.046666,-0.096009,-0.005519,...,0.191728,-0.481568,0.260171,-0.742385,0.139754,0.116191,-0.568823,-0.259215,0.364157,-0.107321
6,1.337115,0.585181,0.052322,-0.439883,0.592823,-0.591784,0.196912,-0.696469,-0.883731,-0.502377,...,0.43905,-0.195,0.331904,-0.056237,0.210434,0.069898,-0.397399,-0.101941,0.298627,-0.350048
7,-0.462326,-0.08898,0.838394,0.756321,-0.68089,-1.515351,-0.060695,-0.09949,-0.523774,-0.452852,...,0.065353,-0.058422,-0.102397,0.297188,-0.11298,0.162091,0.523398,-0.090301,-0.206897,0.179758
8,-1.272053,0.66887,-0.470649,1.165287,-0.314027,-0.300755,0.542034,-0.709432,-0.36477,-0.049232,...,-0.159938,0.418124,-0.478721,-0.080912,0.032135,0.018549,0.184515,-0.267636,0.124479,-0.314951
9,1.395281,0.239368,0.434368,-0.667229,-0.949681,0.228731,-0.372862,-0.874304,-0.429802,-0.171727,...,0.177853,-0.241379,0.149066,-0.044175,0.189669,0.146438,-0.284725,-0.093229,0.127044,-0.256531


## Random scribblings

In [39]:
btrue = params_time[params_time['bootstrap'] == True]
print(btrue.groupby(['max_depth'])['score'].mean())
print(btrue.groupby(['min_samples_split'])['score'].mean())
print(btrue.groupby(['n_estimators'])['score'].mean())

max_depth
10.0    -0.591961
50.0    -0.542059
100.0   -0.550178
Name: score, dtype: float64
min_samples_split
2    -0.567002
5    -0.557997
10   -0.555188
Name: score, dtype: float64
n_estimators
10    -0.625875
50    -0.534197
100   -0.520114
Name: score, dtype: float64
