In [2]:
from sklearn.metrics import confusion_matrix, f1_score
import matplotlib.pyplot as plt
import seaborn as sns 
import pandas as pd
import numpy as np
import joblib

import warnings
warnings.filterwarnings('ignore')

In [3]:
# Directory holding the train/test split CSVs (relative to this notebook).
root_directory = '../../../data/train_test_split/'

# Feature matrices (the x_*_rf.csv files).
X_train_transformed = pd.read_csv(f'{root_directory}x_train_rf.csv')
X_test_transformed = pd.read_csv(f'{root_directory}x_test_rf.csv')

# Targets: keep only the `is_drafted` column of each label file, as a Series.
y_train = pd.read_csv(f'{root_directory}y_train.csv')['is_drafted']
y_test = pd.read_csv(f'{root_directory}y_test.csv')['is_drafted']

In [17]:
# Sensitivity analysis: randomized hyper-parameter search with cross-validation.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, f1_score

# Seed the forest itself. RandomizedSearchCV's `random_state` only fixes which
# parameter settings get sampled; without a seed here every fit (and therefore
# every CV score below) would change from run to run.
clf = RandomForestClassifier(random_state=0)

# Candidate values sampled by RandomizedSearchCV. Only the first four
# parameters vary in this sweep; the rest are pinned to a single value.
param_dist = {
    'n_estimators': [50, 75, 100, 150],  # Number of trees in the forest
    'max_depth': [10, 15, 20, 30, 50, 75],  # Maximum depth of each tree
    'min_samples_split': [10, 15, 20],  # Minimum samples required to split an internal node
    'min_samples_leaf': [3, 8, 10],  # Minimum samples required at a leaf node
    'bootstrap': [True],  # Whether bootstrap samples are used to build trees
    'max_features': ['sqrt'],  # Number of features considered at each split
    'criterion': ['entropy'],  # Split-quality criterion
    'class_weight': ['balanced'],  # Re-weight classes inversely to their frequency
}

# Randomized search with cross-validation.
# NOTE(review): for single-label classification, micro-averaged F1 is the same
# number as accuracy. If the positive-class F1 is what should be optimized,
# consider scoring="f1" -- confirm the intent before changing.
random_search = RandomizedSearchCV(
    clf,
    param_distributions=param_dist,
    n_iter=100,  # Number of parameter settings sampled
    scoring="f1_micro",  # Micro-averaged F1
    cv=3,  # 3-fold cross-validation
    random_state=0,  # Fixes which parameter settings are sampled
    n_jobs=-1,  # Use all available cores
)

random_search.fit(X_train_transformed, y_train)

# One row per sampled parameter setting, taken from RandomizedSearchCV.cv_results_.
results = pd.DataFrame(random_search.cv_results_)





In [18]:
# Sanity check: expect one row per sampled parameter setting (n_iter=100 in the search above).
results.shape

(100, 19)

In [19]:
# Keep only the four varied hyper-parameters plus the mean CV score and its rank.
cols = [
    'param_n_estimators',
    'param_min_samples_split',
    'param_min_samples_leaf',
    'param_max_depth',
    'mean_test_score',
    'rank_test_score',
]

results = results.loc[:, cols]

In [20]:
# Preview the first rows of the trimmed results table.
results.head()

Unnamed: 0,param_n_estimators,param_min_samples_split,param_min_samples_leaf,param_max_depth,mean_test_score,rank_test_score
0,150,15,8,75,0.884253,34
1,75,10,3,15,0.892998,24
2,75,15,8,20,0.883975,35
3,75,10,10,50,0.873893,68
4,50,20,3,50,0.894558,21


In [21]:
import altair as alt

In [108]:
# Shorter column names for plotting; `rank_test_score` is left unchanged.
rename_dict = {
    'param_n_estimators': 'num_trees',
    'param_min_samples_split': 'min_samples_split',
    'param_min_samples_leaf': 'min_samples_leaf',
    'param_max_depth': 'max_depth',
    'mean_test_score': 'f1 score',
}

results = results.rename(columns=rename_dict)
results.head()

Unnamed: 0,num_trees,min_samples_split,min_samples_leaf,max_depth,f1 score,rank_test_score
0,150,15,8,75,0.884253,34
1,75,10,3,15,0.892998,24
2,75,15,8,20,0.883975,35
3,75,10,10,50,0.873893,68
4,50,20,3,50,0.894558,21


In [118]:
# Mean f1 score for each (max_depth, other parameter) pair:
# one 200x200 panel per remaining hyper-parameter, laid out side by side.
a, b, c = (
    alt.Chart(results).mark_rect().encode(
        x='max_depth:O',
        y=f'{other_param}:Q',
        color='mean(f1 score)',
    ).properties(height=200, width=200)
    for other_param in ('num_trees', 'min_samples_split', 'min_samples_leaf')
)

x = a | b | c

# Title and font sizes apply to the concatenated chart as a whole.
x.properties(
    title='Average f1 Score of Models: Broken Down by Max Depth Value vs All Other Parameters*'
).configure_title(
    fontSize=20
).configure_axis(
    labelFontSize=16,
    titleFontSize=16,
)

In [34]:
# Mean CV f1 score for each (num_trees, other parameter) pair.
# The score column of `results` is called 'f1 score' (it was renamed from
# 'mean_test_score'), so the colour channel must reference that name;
# 'mean_test_score' no longer exists in `results` on a top-to-bottom run.
a = alt.Chart(results).mark_rect().encode(
    x='num_trees:O',
    y='min_samples_split:O',
    color='mean(f1 score)'
    ).properties(height = 200, width = 200)

b = alt.Chart(results).mark_rect().encode(
    x='num_trees:O',
    y='min_samples_leaf:O',
    color='mean(f1 score)'
    ).properties(height = 200, width = 200)

c = alt.Chart(results).mark_rect().encode(
    x='num_trees:O',
    y='max_depth:O',
    color='mean(f1 score)'
    ).properties(height = 200, width = 200)

a|b|c

In [35]:
# Mean CV f1 score for each (min_samples_split, other parameter) pair.
# Chart `a` previously plotted min_samples_leaf against itself, which only
# fills the diagonal; min_samples_split vs min_samples_leaf is the pairing
# not covered by the other heatmap cells.
# The score column of `results` is called 'f1 score' (renamed from
# 'mean_test_score'), so the colour channel references that name.
a = alt.Chart(results).mark_rect().encode(
    x='min_samples_split:O',
    y='min_samples_leaf:O',
    color='mean(f1 score)'
    ).properties(height = 200, width = 200)

b = alt.Chart(results).mark_rect().encode(
    x='min_samples_split:O',
    y='max_depth:O',
    color='mean(f1 score)'
    ).properties(height = 200, width = 200)


a|b

In [36]:
# Mean CV f1 score for each (min_samples_leaf, max_depth) pair.
# The score column of `results` is called 'f1 score' (renamed from
# 'mean_test_score'), so the colour channel references that name.
alt.Chart(results).mark_rect().encode(
    x='min_samples_leaf:O',
    y='max_depth:O',
    color='mean(f1 score)'
    ).properties(height = 200, width = 200)

In [38]:
# Sensitivity analysis: randomized hyper-parameter search with cross-validation,
# this time also varying bootstrap, max_features and criterion.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, f1_score

# Seed the forest itself. RandomizedSearchCV's `random_state` only fixes which
# parameter settings get sampled; without a seed here every fit (and therefore
# every CV score below) would change from run to run.
clf = RandomForestClassifier(random_state=0)

# Candidate values sampled by RandomizedSearchCV.
param_dist = {
    'n_estimators': [50, 75, 100, 150],  # Number of trees in the forest
    'max_depth': [10, 15, 20, 30],  # Maximum depth of each tree
    'min_samples_split': [10, 15, 20],  # Minimum samples required to split an internal node
    'min_samples_leaf': [8, 10],  # Minimum samples required at a leaf node
    'bootstrap': [True, False],  # Whether bootstrap samples are used to build trees
    # Number of features considered at each split (None = all features).
    # Fixed: the opening quote of 'sqrt' was missing, which was a SyntaxError.
    'max_features': ['sqrt', 'log2', None],
    'criterion': ['gini', 'entropy'],  # Split-quality criterion
    'class_weight': ['balanced'],  # Re-weight classes inversely to their frequency
}

# Randomized search with cross-validation.
# NOTE(review): for single-label classification, micro-averaged F1 is the same
# number as accuracy. If the positive-class F1 is what should be optimized,
# consider scoring="f1" -- confirm the intent before changing.
random_search = RandomizedSearchCV(
    clf,
    param_distributions=param_dist,
    n_iter=100,  # Number of parameter settings sampled
    scoring="f1_micro",  # Micro-averaged F1
    cv=3,  # 3-fold cross-validation
    random_state=0,  # Fixes which parameter settings are sampled
    n_jobs=-1,  # Use all available cores
)

random_search.fit(X_train_transformed, y_train)

# One row per sampled parameter setting, taken from RandomizedSearchCV.cv_results_.
results_all_params = pd.DataFrame(random_search.cv_results_)

results_all_params.head()


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_min_samples_split,param_min_samples_leaf,param_max_features,param_max_depth,param_criterion,param_class_weight,param_bootstrap,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,3.14597,0.091797,0.114244,0.006125,100,20,8,sqrt,15,entropy,balanced,False,"{'n_estimators': 100, 'min_samples_split': 20,...",0.856307,0.860127,0.844084,0.853506,0.006842,59
1,3.131213,0.058297,0.156384,0.00842,150,10,8,sqrt,15,entropy,balanced,True,"{'n_estimators': 150, 'min_samples_split': 10,...",0.880368,0.880849,0.871825,0.87768,0.004145,16
2,5.032048,0.082188,0.158257,0.011457,150,10,10,sqrt,30,entropy,balanced,False,"{'n_estimators': 150, 'min_samples_split': 10,...",0.865664,0.868483,0.859291,0.864479,0.003845,37
3,2.355835,0.07112,0.110611,0.002601,100,15,8,log2,15,gini,balanced,False,"{'n_estimators': 100, 'min_samples_split': 15,...",0.854637,0.85996,0.847761,0.854119,0.004994,58
4,2.522644,0.007489,0.111255,0.007847,150,20,10,sqrt,10,entropy,balanced,True,"{'n_estimators': 150, 'min_samples_split': 20,...",0.829574,0.835561,0.818349,0.827828,0.007135,70


In [56]:
# List every column available in the full cv_results_ table.
# NOTE(review): the saved output includes 'f1_rounded', which is first assigned
# in a later cell -- the notebook was executed out of order; re-run top to bottom.
results_all_params.columns

Index(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time',
       'param_n_estimators', 'param_min_samples_split',
       'param_min_samples_leaf', 'param_max_features', 'param_max_depth',
       'param_criterion', 'param_class_weight', 'param_bootstrap', 'params',
       'split0_test_score', 'split1_test_score', 'split2_test_score',
       'mean_test_score', 'std_test_score', 'rank_test_score', 'f1_rounded'],
      dtype='object')

In [102]:
# 80% fall between f1-scores of .80 and .90

# Bucket each model by its mean CV score rounded to two decimals.
# The new column is written onto `results_all_params` because the max-depth
# histogram cell below reads `f1_rounded` from it.
results_all_params['f1_rounded'] = results_all_params['mean_test_score'].round(2)
a = results_all_params.groupby(by = 'f1_rounded').agg({'params':'count'}).reset_index()

# Share of all evaluated models per bucket. Divide by the actual number of
# rows rather than a hard-coded 100 so the figure stays correct if n_iter changes.
a['pct_total'] = a['params'] / len(results_all_params)

# x uses the X channel class (it was alt.Y before); the y axis is formatted as
# a percentage so the tick labels match the '% of Models' title.
alt.Chart(a).mark_bar(size = 10).encode(
    x = alt.X('f1_rounded', title = 'Mean CV f1 score (3-fold)'),
    y = alt.Y('pct_total', title = '% of Models', axis = alt.Axis(format = '%'))).properties(
    width = 320, height = 400, title = 'Distribution of f1-scores on 100 trained models').configure_title(fontSize=18)

In [90]:
# 21 of the 30 worst-performing models had max depth of 10.

# Stacked histogram of rounded mean CV scores, coloured by max_depth.
# Reads the `f1_rounded` column added to `results_all_params` in the cell above.
# x uses the X channel class (it was alt.Y before).
alt.Chart(results_all_params).mark_bar().encode(
    x = alt.X('f1_rounded', title = 'Mean CV f1 score (3-fold)', bin=alt.Bin(maxbins=25)),
    y = alt.Y('count()', title = '# of Models'),
    color = 'param_max_depth:N').properties(height = 200, width = 400, title = 'Model Performance by Max Depth Param Values').configure_title(fontSize=18)

In [82]:
# The 30 lowest-scoring models (ascending mean CV score), showing only the
# hyper-parameter columns that produced them.
(
    results_all_params
    .sort_values(by='mean_test_score')
    .loc[:, [
        'param_n_estimators',
        'param_min_samples_split',
        'param_min_samples_leaf',
        'param_max_features',
        'param_max_depth',
        'param_criterion',
        'param_class_weight',
        'param_bootstrap',
    ]]
    .head(30)
)

Unnamed: 0,param_n_estimators,param_min_samples_split,param_min_samples_leaf,param_max_features,param_max_depth,param_criterion,param_class_weight,param_bootstrap
59,100,20,8,,10,gini,balanced,False
72,50,20,8,,10,gini,balanced,False
75,100,15,8,,10,gini,balanced,False
42,75,15,10,,10,gini,balanced,False
51,150,10,8,,10,entropy,balanced,False
89,50,10,10,,10,entropy,balanced,False
19,50,15,8,,10,entropy,balanced,False
29,100,15,10,,15,entropy,balanced,False
76,50,20,8,,15,entropy,balanced,False
43,150,10,8,,15,gini,balanced,False


In [97]:
# The 30 lowest-scoring models with every cv_results_ column, worst first.
(
    results_all_params
    .sort_values(by='mean_test_score', ascending=True)
    .head(30)
)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_min_samples_split,param_min_samples_leaf,param_max_features,param_max_depth,param_criterion,...,param_bootstrap,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,f1_rounded,shallow_tree_flag
59,14.404406,0.244905,0.085834,0.004404,100,20,8,,10,gini,...,False,"{'n_estimators': 100, 'min_samples_split': 20,...",0.664996,0.651237,0.656751,0.657661,0.005654,100,0.66,1
72,7.611088,0.272133,0.055501,0.005492,50,20,8,,10,gini,...,False,"{'n_estimators': 50, 'min_samples_split': 20, ...",0.664996,0.651237,0.65742,0.657884,0.005627,99,0.66,1
75,14.524488,0.090089,0.097533,0.003668,100,15,8,,10,gini,...,False,"{'n_estimators': 100, 'min_samples_split': 15,...",0.665998,0.652574,0.657253,0.658608,0.005564,98,0.66,1
42,11.34499,0.261275,0.076888,0.003026,75,15,10,,10,gini,...,False,"{'n_estimators': 75, 'min_samples_split': 15, ...",0.672682,0.647894,0.657754,0.659443,0.01019,97,0.66,1
51,24.998441,0.426494,0.137282,0.010245,150,10,8,,10,entropy,...,False,"{'n_estimators': 150, 'min_samples_split': 10,...",0.715288,0.637032,0.704378,0.685566,0.034607,96,0.69,1
89,7.998067,0.089562,0.042825,0.003081,50,10,10,,10,entropy,...,False,"{'n_estimators': 50, 'min_samples_split': 10, ...",0.713784,0.638536,0.704378,0.685566,0.033476,95,0.69,1
19,7.87302,0.087363,0.048242,0.011468,50,15,8,,10,entropy,...,False,"{'n_estimators': 50, 'min_samples_split': 15, ...",0.715789,0.637032,0.704378,0.685733,0.034751,94,0.69,1
29,19.480526,0.353396,0.098593,0.006116,100,15,10,,15,entropy,...,False,"{'n_estimators': 100, 'min_samples_split': 15,...",0.762072,0.719251,0.691845,0.724389,0.028899,93,0.72,0
76,10.870728,0.182828,0.052533,0.00337,50,20,8,,15,entropy,...,False,"{'n_estimators': 50, 'min_samples_split': 20, ...",0.763743,0.724766,0.696524,0.728344,0.027558,92,0.73,0
43,26.908381,0.187222,0.169345,0.017107,150,10,8,,15,gini,...,False,"{'n_estimators': 150, 'min_samples_split': 10,...",0.768588,0.731618,0.711063,0.73709,0.023801,91,0.74,0
