# Random forrest classifier for patient dataset

## Load our data and environment from `Data-pre-processing.ipynb`

In [8]:
%%capture
%run Data-pre-processing.ipynb

In [7]:
train

Unnamed: 0,encounter_id,patient_id,hospital_id,age,bmi,elective_surgery,ethnicity,gender,height,icu_admit_source,...,cirrhosis,diabetes_mellitus,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis,apache_3j_bodysystem,apache_2_bodysystem,hospital_death
58446,40166,68493,176,87.0,37.664970,0,2,0,167.600006,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9,0,1
39507,107417,124638,199,56.0,31.506144,1,2,0,167.600006,2,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0,0,0
8172,56424,46242,89,68.0,23.072186,1,2,0,162.600006,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7,4,0
85185,94204,11082,188,69.0,27.447725,0,2,0,167.600006,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8,6,0
27860,72715,88152,147,75.0,24.948097,0,2,1,170.000000,1,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,8,6,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75366,120284,122901,166,85.0,27.808533,0,2,0,155.000000,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8,6,1
19145,49126,75091,161,43.0,29.197634,0,2,0,162.500000,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7,4,0
8041,125259,74471,118,60.0,38.293205,0,0,1,182.899994,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0
52241,69272,122563,26,27.0,24.547794,0,-1,0,154.899994,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7,4,0


In [9]:
from sklearn.model_selection import GridSearchCV

# Number of trees in random forest
n_estimators = [300, 900, 1200, 1500]
# Number of features to consider at every split
max_features = ['sqrt']
# Maximum number of levels in tree
max_depth = [2, 5, 10]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [5, 10, 20]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap,
               # Fixed parameters
               'criterion':['gini'], 'class_weight': ['balanced']
              }
print(random_grid)

# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = sklearn.ensemble.RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = GridSearchCV(rf, random_grid, cv=3, n_jobs=6, verbose=10)
rf_random.fit(train_x, train_y)


{'n_estimators': [300, 900, 1200, 1500], 'max_features': ['sqrt'], 'max_depth': [2, 5, 10, None], 'min_samples_split': [5, 10, 20], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True], 'criterion': ['gini'], 'class_weight': ['balanced']}
Fitting 3 folds for each of 144 candidates, totalling 432 fits


GridSearchCV(cv=3, estimator=RandomForestClassifier(), n_jobs=6,
             param_grid={'bootstrap': [True], 'class_weight': ['balanced'],
                         'criterion': ['gini'], 'max_depth': [2, 5, 10, None],
                         'max_features': ['sqrt'],
                         'min_samples_leaf': [1, 2, 4],
                         'min_samples_split': [5, 10, 20],
                         'n_estimators': [300, 900, 1200, 1500]},
             verbose=10)

In [10]:
rf_random.best_params_

{'bootstrap': True,
 'class_weight': 'balanced',
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'min_samples_leaf': 4,
 'min_samples_split': 5,
 'n_estimators': 1500}

In [12]:
# This takes really long, so we save the results to be able to analyse them alter on
df_results = pd.DataFrame(rf_random.cv_results_)
df_results.to_csv('grid-search-results.csv', index=False)

### Second iteration
There might be a more optimal model with more estimators

In [None]:
from sklearn.model_selection import GridSearchCV

# Number of trees in random forest
n_estimators = [1500, 1750, 2000]
# Number of features to consider at every split
max_features = ['sqrt']
# Maximum number of levels in tree
max_depth = [2, 5, 10]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [5, 10, 20]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap,
               # Fixed parameters
               'criterion':['gini'], 'class_weight': ['balanced']
              }
print(random_grid)

# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = sklearn.ensemble.RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = GridSearchCV(rf, random_grid, cv=3, n_jobs=6, verbose=10)
rf_random.fit(train_x, train_y)


{'n_estimators': [1500, 1750, 2000], 'max_features': ['sqrt'], 'max_depth': [2, 5, 10, None], 'min_samples_split': [5, 10, 20], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True], 'criterion': ['gini'], 'class_weight': ['balanced']}
Fitting 3 folds for each of 108 candidates, totalling 324 fits


In [None]:
rf_random.best_params_

In [None]:
# This takes really long, so we save the results to be able to analyse them alter on
df_results = pd.DataFrame(rf_random.cv_results_)
df_results.to_csv('grid-search-2-results.csv', index=False)