## Random Forest  - Model Development & Tuning

**Results**
- Accuracy and F1 on train data both reaches 0.980 in final model.
- When switched to test data, accuracy is 0.675, F1 dropped to 0.637.
- Tuned on n_estimators from 500-1500 (223.2min）, improved accuracy on test only to 0.677, overfitting still exists.

**Hyperparameters Tuning**

- bootstrap: [True, False],
- max_depth
- max_features : ['auto', 'sqrt']
- min_samples_leaf
- min_samples_split
- n_estimators

In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.model_selection import (
    RandomizedSearchCV, 
    GridSearchCV
)
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

In [3]:
X_train = pd.read_csv('X_train.csv', index_col=0)
X_test = pd.read_csv('X_test.csv', index_col=0)
y_train = pd.read_csv('y_train.csv', index_col=0).squeeze()
y_test = pd.read_csv('y_test.csv', index_col=0).squeeze()

In [4]:
X_train.head()

Unnamed: 0,HeartRate_Mean,SysBP_Mean,DiasBP_Mean,TempC_Max,RespRate_Mean,Glucose_Mean,ICU_LOS,age,ANIONGAP_min,ANIONGAP_max,...,CHEST PAIN,CONGESTIVE HEART FAILURE,CORONARY ARTERY DISEASE,CORONARY ARTERY DISEASE\CORONARY ARTERY BYPASS GRAFT /SDA,GASTROINTESTINAL BLEED,INTRACRANIAL HEMORRHAGE,PNEUMONIA,SEPSIS,GENDER_F,GENDER_M
0,89.764706,121.882353,59.411765,37.833332,12.954545,148.0,0.9278,59.0,12.0,12.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,171.0,119.248137,60.741501,37.507355,18.672821,140.6299,0.1715,0.0,12.976617,15.560846,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,74.423077,117.56,65.44,37.055556,15.344828,110.0,2.7068,51.0,8.0,11.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,86.846154,137.208333,50.833333,37.222222,20.035714,87.0,4.795,70.0,9.0,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,127.962963,119.248137,60.741501,37.507355,18.672821,140.6299,10.1448,0.0,17.0,17.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [5]:
X_test.head()

Unnamed: 0,HeartRate_Mean,SysBP_Mean,DiasBP_Mean,TempC_Max,RespRate_Mean,Glucose_Mean,ICU_LOS,age,ANIONGAP_min,ANIONGAP_max,...,CHEST PAIN,CONGESTIVE HEART FAILURE,CORONARY ARTERY DISEASE,CORONARY ARTERY DISEASE\CORONARY ARTERY BYPASS GRAFT /SDA,GASTROINTESTINAL BLEED,INTRACRANIAL HEMORRHAGE,PNEUMONIA,SEPSIS,GENDER_F,GENDER_M
0,69.47619,144.333333,57.714286,36.555557,18.380952,144.0,24.6678,63.0,16.0,16.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,141.0,119.248137,60.741501,37.507355,18.672821,89.0,0.1286,0.0,12.976617,15.560846,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,100.571429,109.210526,66.368421,39.222222,22.857143,168.0,0.8547,34.0,17.0,17.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,75.857143,108.75,63.142857,39.0,17.793103,128.294118,1.0936,40.0,9.0,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,153.28,119.248137,60.741501,37.507355,18.672821,140.6299,8.1058,0.0,12.976617,15.560846,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [86]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

RandomForestClassifier()

In [87]:
pred_tr = rf.predict(X_train)
pred_te = rf.predict(X_test)

In [88]:
print('Train accuracy:{}'.format(round(accuracy_score(y_train, pred_tr),5)))
print('Train F1:{}'.format(round(f1_score(y_train, pred_tr, average='weighted'),5)))
print('Test accuracy:{}'.format(round(accuracy_score(y_test, pred_te),5)))
print('Test F1:{}'.format(round(f1_score(y_test, pred_te, average='weighted'),5)))

Train accuracy:0.99949
Train F1:0.99949
Test accuracy:0.67315
Test F1:0.63448


## Hyper parameter tuning

In [3]:
from numpy import mean
from numpy import arange
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report

In [4]:
# Number of trees in random forest
n_estimators = [int(x) for x in range(500,1600,200)]
#n_estimators = [1150, 1200, 1250, 1300,1350]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]


In [5]:
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

{'n_estimators': [500, 700, 900, 1100, 1300, 1500], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [6]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 61.0min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 134.1min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 223.2min finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                                                    n_estimators='warn',
                                                    n_jobs=None

In [9]:
rf_random.best_params_

{'n_estimators': 1100,
 'min_samples_split': 10,
 'min_samples_leaf': 2,
 'max_features': 'sqrt',
 'max_depth': 50,
 'bootstrap': False}

In [10]:
def evaluate(model, test_features, test_labels, train_features, train_labels):
    pred_te = model.predict(test_features)
    pred_tr = model.predict(train_features)
    
    target_names = ['HOME', 'SNF', 'Others','Dead']
    print('------------------- Train -------------------')
    print('Train accuracy:{}'.format(round(accuracy_score(y_train, pred_tr),5)))
    print('Train F1:{}'.format(round(f1_score(y_train, pred_tr,average='weighted'),5)))
    print(classification_report(train_labels, pred_tr, target_names=target_names))
    
    print('------------------- Test ------------------- ')   
    print('Test accuracy:{}'.format(round(accuracy_score(y_test, pred_te),5)))
    print('Test F1:{}'.format(round(f1_score(y_test, pred_te,average='weighted'),5)))
    print(classification_report(test_labels, pred_te, target_names=target_names))

best_random = rf_random.best_estimator_
evaluate(best_random, X_test, y_test, X_train, y_train)

------------------- Train -------------------
Train accuracy:0.9889
Train F1:0.98887
              precision    recall  f1-score   support

        HOME       0.98      1.00      0.99     21858
         SNF       1.00      0.97      0.98      4437
      Others       1.00      0.97      0.98      7407
        Dead       1.00      0.99      0.99      3514

    accuracy                           0.99     37216
   macro avg       0.99      0.98      0.99     37216
weighted avg       0.99      0.99      0.99     37216

------------------- Test ------------------- 
Test accuracy:0.67767
Test F1:0.64019
              precision    recall  f1-score   support

        HOME       0.73      0.94      0.82      5464
         SNF       0.45      0.29      0.35      1109
      Others       0.52      0.29      0.37      1852
        Dead       0.65      0.35      0.45       879

    accuracy                           0.68      9304
   macro avg       0.59      0.47      0.50      9304
weighted avg    

## Optimal Model

In [6]:
rf_final = RandomForestClassifier(n_estimators = 1100,
 min_samples_split= 10,
 min_samples_leaf= 2,
 max_features= 'sqrt',
 max_depth= 50,
 bootstrap= False)
rf_final.fit(X_train, y_train)

RandomForestClassifier(bootstrap=False, max_depth=50, max_features='sqrt',
                       min_samples_leaf=2, min_samples_split=10,
                       n_estimators=1100)

In [7]:
pred_tr_fl = rf_final.predict(X_train)
pred_te_fl = rf_final.predict(X_test)

In [8]:
print('Train accuracy:{}'.format(round(accuracy_score(y_train, pred_tr_fl),5)))
print('Train F1:{}'.format(round(f1_score(y_train, pred_tr_fl, average='weighted'),5)))
print('Test accuracy:{}'.format(round(accuracy_score(y_test, pred_te_fl),5)))
print('Test F1:{}'.format(round(f1_score(y_test, pred_te_fl, average='weighted'),5)))

Train accuracy:0.98917
Train F1:0.98914
Test accuracy:0.67885
Test F1:0.64147


In [9]:
import pickle

In [11]:
def model_save(model, pickle_file):
    """Save compiled models for reuse."""
    
    with open(pickle_file, 'wb') as f:
        pickle.dump(model, f)

model_save(rf_final, 'rf.pickle')