# Hyperparameter search for a random forest classifier on the patient dataset

## Load our data and environment from `Data-pre-processing.ipynb`

In [6]:
%%capture
%run Data-pre-processing.ipynb

In [7]:
# Preview the training frame. `train` is not defined in this notebook;
# presumably it is created by the %run of Data-pre-processing.ipynb -- TODO confirm.
train

Unnamed: 0,hospital_id,age,bmi,elective_surgery,ethnicity,gender,height,icu_admit_source,icu_id,icu_stay_type,...,cirrhosis,diabetes_mellitus,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis,apache_3j_bodysystem,apache_2_bodysystem,hospital_death
58446,176,87.0,37.664970,0,2,0,167.600006,0,628,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9,0,1
39507,199,56.0,31.506144,1,2,0,167.600006,2,471,0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0,0,0
8172,89,68.0,23.072186,1,2,0,162.600006,2,133,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7,4,0
85185,188,69.0,27.447725,0,2,0,167.600006,0,840,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8,6,0
27860,147,75.0,24.948097,0,2,1,170.000000,1,394,0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,8,6,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75366,166,85.0,27.808533,0,2,0,155.000000,0,700,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8,6,1
19145,161,43.0,29.197634,0,2,0,162.500000,0,324,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7,4,0
8041,118,60.0,38.293205,0,0,1,182.899994,0,114,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0
52241,26,27.0,24.547794,0,-1,0,154.899994,3,550,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7,4,0


## First Grid search

In [9]:
# Import the estimator explicitly instead of relying on `sklearn.ensemble`
# having been loaded as a side effect of the pre-processing notebook.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# First exhaustive grid search for the random forest.
# Grid size: 4 (n_estimators) * 4 (max_depth) * 3 (min_samples_split)
# * 3 (min_samples_leaf) = 144 candidates; with cv=3 that is 432 fits.

# Number of trees in the forest
n_estimators = [300, 900, 1200, 1500]
# Number of features to consider at every split
max_features = ['sqrt']
# Maximum number of levels in a tree; None leaves the depth unbounded
max_depth = [2, 5, 10, None]
# Minimum number of samples required to split a node
min_samples_split = [5, 10, 20]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True]

# Parameter grid. The name `random_grid` is kept for compatibility with the
# rest of the notebook, but every combination is evaluated (no random sampling).
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap,
               # Fixed parameters
               'criterion': ['gini'], 'class_weight': ['balanced']
              }
print(random_grid)

# Base model to tune. A fixed random_state makes the bootstrap samples and
# feature sub-sampling repeatable, so re-running the search gives the same
# scores and the same best_params_.
rf = RandomForestClassifier(random_state=42)

# Exhaustive search with 3-fold cross validation on up to 6 parallel workers.
# NOTE(review): no `scoring` is passed, so candidates are ranked by the
# estimator's default score (accuracy for classifiers). The use of
# class_weight='balanced' suggests imbalanced classes -- confirm accuracy is
# the intended selection metric.
rf_random = GridSearchCV(rf, random_grid, cv=3, n_jobs=6, verbose=10)
rf_random.fit(train_x, train_y)


{'n_estimators': [300, 900, 1200, 1500], 'max_features': ['sqrt'], 'max_depth': [2, 5, 10, None], 'min_samples_split': [5, 10, 20], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True], 'criterion': ['gini'], 'class_weight': ['balanced']}
Fitting 3 folds for each of 144 candidates, totalling 432 fits


GridSearchCV(cv=3, estimator=RandomForestClassifier(), n_jobs=6,
             param_grid={'bootstrap': [True], 'class_weight': ['balanced'],
                         'criterion': ['gini'], 'max_depth': [2, 5, 10, None],
                         'max_features': ['sqrt'],
                         'min_samples_leaf': [1, 2, 4],
                         'min_samples_split': [5, 10, 20],
                         'n_estimators': [300, 900, 1200, 1500]},
             verbose=10)

In [10]:
# Parameter combination that achieved the best mean cross-validation score
# in the first grid search.
rf_random.best_params_

{'bootstrap': True,
 'class_weight': 'balanced',
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'min_samples_leaf': 4,
 'min_samples_split': 5,
 'n_estimators': 1500}

In [12]:
# The grid search is slow, so persist the per-candidate cross-validation
# results to disk; they can then be analysed later without re-running it.
df_results = pd.DataFrame(data=rf_random.cv_results_)
df_results.to_csv(path_or_buf='grid-search-results.csv', index=False)

### Second iteration
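The first search selected the largest `n_estimators` value in its grid (1500), so a better model may exist with more trees. This iteration extends the range to 1500–2000 estimators and keeps all other parameters unchanged.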

In [3]:
# Import the estimator explicitly instead of relying on `sklearn.ensemble`
# having been loaded as a side effect of the pre-processing notebook.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Second exhaustive grid search for the random forest, with a higher range
# of tree counts.
# Grid size: 3 (n_estimators) * 4 (max_depth) * 3 (min_samples_split)
# * 3 (min_samples_leaf) = 108 candidates; with cv=3 that is 324 fits.

# Number of trees in the forest
n_estimators = [1500, 1750, 2000]
# Number of features to consider at every split
max_features = ['sqrt']
# Maximum number of levels in a tree; None leaves the depth unbounded
max_depth = [2, 5, 10, None]
# Minimum number of samples required to split a node
min_samples_split = [5, 10, 20]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True]

# Parameter grid. The name `random_grid` is kept for compatibility with the
# rest of the notebook, but every combination is evaluated (no random sampling).
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap,
               # Fixed parameters
               'criterion': ['gini'], 'class_weight': ['balanced']
              }
print(random_grid)

# Base model to tune. A fixed random_state makes the bootstrap samples and
# feature sub-sampling repeatable, so re-running the search gives the same
# scores and the same best_params_.
rf = RandomForestClassifier(random_state=42)

# Exhaustive search with 3-fold cross validation on up to 6 parallel workers.
# NOTE(review): no `scoring` is passed, so candidates are ranked by the
# estimator's default score (accuracy for classifiers). The use of
# class_weight='balanced' suggests imbalanced classes -- confirm accuracy is
# the intended selection metric.
rf_random = GridSearchCV(rf, random_grid, cv=3, n_jobs=6, verbose=10)
rf_random.fit(train_x, train_y)


{'n_estimators': [1500, 1750, 2000], 'max_features': ['sqrt'], 'max_depth': [2, 5, 10, None], 'min_samples_split': [5, 10, 20], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True], 'criterion': ['gini'], 'class_weight': ['balanced']}
Fitting 3 folds for each of 108 candidates, totalling 324 fits


GridSearchCV(cv=3, estimator=RandomForestClassifier(), n_jobs=6,
             param_grid={'bootstrap': [True], 'class_weight': ['balanced'],
                         'criterion': ['gini'], 'max_depth': [2, 5, 10, None],
                         'max_features': ['sqrt'],
                         'min_samples_leaf': [1, 2, 4],
                         'min_samples_split': [5, 10, 20],
                         'n_estimators': [1500, 1750, 2000]},
             verbose=10)

In [16]:
# Parameter combination that achieved the best mean cross-validation score
# in the second random-forest grid search.
# NOTE(review): the saved output of this cell reports criterion='entropy' and
# has no n_estimators / max_features / bootstrap entries, which the
# random-forest grid cannot produce -- it looks like a stale copy of the
# decision-tree result. Re-run this cell to refresh it.
rf_random.best_params_

{'class_weight': 'balanced',
 'criterion': 'entropy',
 'max_depth': None,
 'min_samples_leaf': 2,
 'min_samples_split': 5}

In [17]:
# The grid search is slow, so persist the per-candidate cross-validation
# results to disk; they can then be analysed later without re-running it.
# These are the results of the second random-forest iteration, so they get
# their own file: writing to 'grid-search-tree-results.csv' here would be
# overwritten by the decision-tree search further down, which saves to that
# same name.
df_results = pd.DataFrame(rf_random.cv_results_)
df_results.to_csv('grid-search-results-second-iteration.csv', index=False)

In [15]:
# Import the estimator explicitly instead of relying on `sklearn.tree`
# having been loaded as a side effect of the pre-processing notebook.
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# Exhaustive grid search for a single decision tree.
# Grid size: 2 (criterion) * 5 (max_depth) * 3 (min_samples_split)
# * 2 (min_samples_leaf) = 60 candidates; with cv=3 that is 180 fits.

# Function used to measure the quality of a split
criterion = ["gini", "entropy"]
# Maximum number of levels in the tree; None leaves the depth unbounded
max_depth = [2, 5, 10, 20, None]
# Minimum number of samples required to split a node
min_samples_split = [5, 10, 20]
# Minimum number of samples required at each leaf node
min_samples_leaf = [2, 4]

# Parameter grid. The name `random_grid` is kept for compatibility with the
# rest of the notebook, but every combination is evaluated (no random sampling).
random_grid = {'criterion': criterion,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               # Fixed parameters
               'class_weight': ['balanced']
              }
print(random_grid)

# Base model to tune. The variable names `rf` / `rf_random` are reused from
# the random-forest cells even though this is a decision tree. A fixed
# random_state makes the search repeatable across runs.
rf = DecisionTreeClassifier(random_state=42)

# Exhaustive search with 3-fold cross validation on up to 6 parallel workers.
# NOTE(review): no `scoring` is passed, so candidates are ranked by the
# estimator's default score (accuracy for classifiers). The use of
# class_weight='balanced' suggests imbalanced classes -- confirm accuracy is
# the intended selection metric.
rf_random = GridSearchCV(rf, random_grid, cv=3, n_jobs=6, verbose=10)
rf_random.fit(train_x, train_y)


{'criterion': ['gini', 'entropy'], 'max_depth': [2, 5, 10, 20, None], 'min_samples_split': [5, 10, 20], 'min_samples_leaf': [2, 4], 'class_weight': ['balanced']}
Fitting 3 folds for each of 60 candidates, totalling 180 fits


GridSearchCV(cv=3, estimator=DecisionTreeClassifier(), n_jobs=6,
             param_grid={'class_weight': ['balanced'],
                         'criterion': ['gini', 'entropy'],
                         'max_depth': [2, 5, 10, 20, None],
                         'min_samples_leaf': [2, 4],
                         'min_samples_split': [5, 10, 20]},
             verbose=10)

In [16]:
# Parameter combination that achieved the best mean cross-validation score
# in the decision-tree grid search.
rf_random.best_params_

{'class_weight': 'balanced',
 'criterion': 'entropy',
 'max_depth': None,
 'min_samples_leaf': 2,
 'min_samples_split': 5}

In [17]:
# The grid search is slow, so persist the per-candidate cross-validation
# results to disk; they can then be analysed later without re-running it.
df_results = pd.DataFrame(data=rf_random.cv_results_)
df_results.to_csv(path_or_buf='grid-search-tree-results.csv', index=False)