In [1]:
from sklearn.metrics import confusion_matrix, f1_score
import matplotlib.pyplot as plt
import seaborn as sns 
import pandas as pd
import numpy as np
import joblib

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Folder holding the pre-split train/test CSV files.
root_directory = '../../../data/train_test_split/'

# Feature tables for the train and test splits.
X_train_transformed, X_test_transformed = (
    pd.read_csv(f'{root_directory}{file_name}')
    for file_name in ('x_train_rf.csv', 'x_test_rf.csv')
)

# Matching target column ('is_drafted') for each split.
y_train, y_test = (
    pd.read_csv(f'{root_directory}{file_name}')['is_drafted']
    for file_name in ('y_train.csv', 'y_test.csv')
)

In [3]:
# Sensitivity analysis: randomized hyper-parameter search with cross-validation.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, f1_score

# Seed the forest itself: the search's random_state only fixes WHICH settings
# are sampled, not the bootstrap/feature sampling inside each forest, so
# without this the scores change from run to run.
clf = RandomForestClassifier(random_state=0)

# Candidate values for RandomizedSearchCV. Four parameters vary
# (4 * 6 * 3 * 3 = 216 combinations); the remaining four are pinned to one value.
param_dist = {
    'n_estimators': [50, 75, 100, 150],  # Number of trees in the forest
    'max_depth': [10, 15, 20, 30, 50, 75],  # Maximum depth of each tree
    'min_samples_split': [10, 15, 20],  # Minimum samples needed to split a node
    'min_samples_leaf': [3, 8, 10],  # Minimum samples required at a leaf
    'bootstrap': [True],  # Fixed: trees are fit on bootstrap samples
    'max_features': ['sqrt'],  # Fixed: features considered at each split
    'criterion': ['entropy'],  # Fixed: split-quality criterion
    'class_weight': ['balanced']  # Fixed: class re-weighting mode
}


# Randomized search with cross-validation.
# NOTE(review): for single-label targets micro-averaged F1 is the same number as
# accuracy -- confirm this is the intended metric given class_weight='balanced'.
random_search = RandomizedSearchCV(
    clf,
    param_distributions=param_dist,
    n_iter=100,  # Number of parameter settings sampled
    scoring="f1_micro",  # Micro-averaged F1 score
    cv=3,  # 3-fold cross-validation
    random_state=0,  # Fixes which parameter settings get sampled
    n_jobs=-1,  # Use all available cores
)

# Fit on the training split loaded earlier in the notebook.
random_search.fit(X_train_transformed, y_train)

# One row per sampled setting, taken from RandomizedSearchCV.cv_results_.
results = pd.DataFrame(random_search.cv_results_)

In [4]:
# (rows, columns) of the cv_results_ table built from the search.
results.shape

(100, 19)

In [5]:
# Keep just the four varied hyper-parameters plus the mean CV score and its rank.
cols = [
    f'param_{hyper_param}'
    for hyper_param in ('n_estimators', 'min_samples_split', 'min_samples_leaf', 'max_depth')
]
cols += ['mean_test_score', 'rank_test_score']

results = results[cols]

In [6]:
# Preview the first rows of the results table.
results.head()

Unnamed: 0,param_n_estimators,param_min_samples_split,param_min_samples_leaf,param_max_depth,mean_test_score,rank_test_score
0,150,15,8,75,0.885479,27
1,75,10,3,15,0.893444,22
2,75,15,8,20,0.882582,40
3,75,10,10,50,0.873392,66
4,50,20,3,50,0.894558,19


In [7]:
import altair as alt

In [8]:
# Shorter, plot-friendly labels for the cv_results_ columns kept above.
rename_dict = dict(
    param_n_estimators='num_trees',
    param_min_samples_split='min_samples_split',
    param_min_samples_leaf='min_samples_leaf',
    param_max_depth='max_depth',
    mean_test_score='f1 score',
)

results = results.rename(columns=rename_dict)
results.head()

Unnamed: 0,num_trees,min_samples_split,min_samples_leaf,max_depth,f1 score,rank_test_score
0,150,15,8,75,0.885479,27
1,75,10,3,15,0.893444,22
2,75,15,8,20,0.882582,40
3,75,10,10,50,0.873392,66
4,50,20,3,50,0.894558,19


In [9]:
# Mean CV score for every (max_depth, other parameter) pairing, one heatmap per
# parameter. Both axes are ordinal: the sampled values are a handful of discrete
# settings, and rect marks need a discrete band on each axis to draw proper
# heatmap cells (a quantitative y axis gives them none).
a = alt.Chart(results).mark_rect().encode(
    x='max_depth:O',
    y='num_trees:O',
    color='mean(f1 score)'
    ).properties(height = 200, width = 200)

b = alt.Chart(results).mark_rect().encode(
    x='max_depth:O',
    y='min_samples_split:O',
    color='mean(f1 score)'
    ).properties(height = 200, width = 200)

c = alt.Chart(results).mark_rect().encode(
    x='max_depth:O',
    y='min_samples_leaf:O',
    color='mean(f1 score)'
    ).properties(height = 200, width = 200)

# Lay the three panels out side by side.
x = a|b|c

# Title and font sizes are applied to the combined chart, not the single panels.
x.properties(title = 'Average f1 Score of Models: Broken Down by Max Depth Value vs All Other Parameters*').configure_title(fontSize=20)\
    .configure_axis(
    labelFontSize=16,
    titleFontSize=16
)

In [10]:
# Display the results table with its renamed columns.
results

Unnamed: 0,num_trees,min_samples_split,min_samples_leaf,max_depth,f1 score,rank_test_score
0,150,15,8,75,0.885479,27
1,75,10,3,15,0.893444,22
2,75,15,8,20,0.882582,40
3,75,10,10,50,0.873392,66
4,50,20,3,50,0.894558,19
...,...,...,...,...,...,...
95,150,20,3,50,0.896340,16
96,50,15,8,30,0.884922,31
97,100,10,10,15,0.867710,78
98,150,10,3,30,0.907369,2


In [11]:
# Three heatmaps of mean CV score: max_depth on x, one other tuned parameter on y.
a, b, c = (
    alt.Chart(results)
    .mark_rect()
    .encode(x='max_depth:O', y=f'{other_param}:O', color='mean(f1 score)')
    .properties(width=200, height=200)
    for other_param in ('num_trees', 'min_samples_leaf', 'min_samples_split')
)

a | b | c

In [12]:
# Two heatmaps of mean CV score with max_depth on y and a leaf/split setting on x.
a, b = (
    alt.Chart(results)
    .mark_rect()
    .encode(x=f'{other_param}:O', y='max_depth:O', color='mean(f1 score)')
    .properties(width=200, height=200)
    for other_param in ('min_samples_leaf', 'min_samples_split')
)

a | b

In [13]:
# Single heatmap: mean CV score for each (min_samples_leaf, max_depth) pairing.
alt.Chart(results).mark_rect().encode(
    alt.X('min_samples_leaf:O'),
    alt.Y('max_depth:O'),
    alt.Color('mean(f1 score)'),
).properties(width=200, height=200)

In [14]:
# # Sensitivity Analysis - Randomized Search with CV
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import RandomizedSearchCV
# from sklearn.metrics import make_scorer, f1_score

# clf = RandomForestClassifier()

# # Define the parameter grid for RandomizedSearchCV
# # Expanded parameter grid for RandomizedSearchCV
# param_dist = {
#     'n_estimators': [50, 75, 100, 150],  # How many trees
#     'max_depth': [10, 15, 20, 30],  # How many splits in the trees are allowed
#     'min_samples_split': [10, 15, 20],  # More options for minimum samples split
#     'min_samples_leaf': [8, 10],  # How many samples are allowed to comprise a leaf
#     'bootstrap': [True, False],  # Whether bootstrap samples are used
#     'max_features': ['sqrt', 'log2', None],  # Different ways to limit the number of features considered at each split
#     'criterion': ['gini', 'entropy'],  # Different criteria for splitting nodes
#     'class_weight': ['balanced']
# }


# # Randomized Search cross validation

# random_search = RandomizedSearchCV(
#     clf, 
#     param_distributions=param_dist, 
#     n_iter=100,  # Number of parameter settings sampled
#     scoring="f1_micro",  # Optimize for micro-averaged f1 score
#     cv=3,  # 3-fold cross-validation
#     random_state=0, 
#     n_jobs=-1,  # Use all available cores
# )

# # # Assuming you have X_train and y_train already defined
# random_search.fit(X_train_transformed, y_train)

# # Extracting results from RandomizedSearchCV
# results_all_params = pd.DataFrame(random_search.cv_results_)

# results_all_params.head()


In [16]:
# 80% fall between f1-scores of .80 and .90

# Bucket each model's mean CV score to two decimals. The column is added to
# `results` itself because later cells read `f1_rounded`.
results['f1_rounded'] = results['f1 score'].round(2)

# Number of models per bucket (counting any always-present column gives the row count).
a = results.groupby(by = 'f1_rounded').agg({'rank_test_score':'count'}).reset_index()

# Share of all evaluated models in each bucket. The denominator comes from the
# data so the shares stay correct if the number of sampled settings changes.
a['pct_total'] = a['rank_test_score'] / len(results)

# x uses the X channel class; the y axis is formatted as a percentage so the
# tick labels agree with the '% of Models' title.
alt.Chart(a).mark_bar(size = 20).encode(
    x = alt.X('f1_rounded', title = 'Mean CV f1 score (3-fold)'),
    y = alt.Y('pct_total', title = '% of Models', axis = alt.Axis(format = '%'))).properties(
    width = 320, height = 400,
    title = f'Distribution of f1-scores on {len(results)} trained models').configure_title(fontSize=18)

In [18]:
# 21 of the 30 worst-performing models had max depth of 10.

# Stacked histogram of rounded mean CV scores, coloured by max_depth (nominal).
# The x channel is built with alt.X so the channel class matches the axis it
# is assigned to.
alt.Chart(results).mark_bar().encode(
    x = alt.X('f1_rounded', title = 'Mean CV f1 score (3-fold)', bin=alt.Bin(maxbins=25)),
    y = alt.Y('count()', title = '# of Models'),
    color = 'max_depth:N').properties(height = 200, width = 400, title = 'Model Performance by Max Depth Param Values').configure_title(fontSize=18)

In [23]:
# The 30 lowest-scoring parameter settings, worst first.

results.sort_values('f1 score').iloc[:30]

Unnamed: 0,num_trees,min_samples_split,min_samples_leaf,max_depth,f1 score,rank_test_score,f1_rounded
67,50,10,10,10,0.82599,100,0.83
68,100,15,10,10,0.826268,99,0.83
36,75,20,10,10,0.82755,98,0.83
88,150,10,10,10,0.828274,97,0.83
61,50,20,3,10,0.829053,96,0.83
46,100,20,8,10,0.83,95,0.83
64,75,10,8,10,0.830001,94,0.83
57,100,10,10,10,0.830056,93,0.83
17,50,10,8,10,0.830279,92,0.83
43,50,15,8,10,0.830613,91,0.83
