# Random Forest

- Ensemble of decision trees
- Trained using bagging method (repeated sampling with replacement)
- Use uncorrelated trees

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import seaborn as sns

In [8]:
df = sns.load_dataset('titanic')
df.dropna(inplace=True)
X = df[['pclass', 'sex', 'age']].copy()
le = preprocessing.LabelEncoder()
X['sex'] = le.fit_transform(df['sex'])
y = df['survived'].copy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

In [6]:
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix, roc_auc_score

def printScore(clf, X_train, X_test, y_train, y_test, train=True):
    lb = preprocessing.LabelBinarizer()
    lb.fit(y_train)
    if train:
        res = clf.predict(X_train)
        print('Train Results:\n')
        print('Accuracy: %.2f\n' % accuracy_score(y_train, res))
        print('Classification Report: \n {} \n'.format(classification_report(y_train, res)))
        print('Confusion Matrix: \n {} \n'.format(confusion_matrix(y_train, res)))
        print('ROC AUC: {0:.4f}\n'.format(roc_auc_score(lb.transform(y_train), lb.transform(res))))
    else:
        res_test = clf.predict(X_test)
        print('Test Results:\n')
        print('Accuracy: %.2f\n' % accuracy_score(y_test, res_test))
        print('Classification Report: \n {} \n'.format(classification_report(y_test, res_test)))
        print('Confusion Matrix: \n {} \n'.format(confusion_matrix(y_test, res_test)))
        print('ROC AUC: {0:.4f}\n'.format(roc_auc_score(lb.transform(y_test), lb.transform(res_test))))

In [10]:
rf_clf = RandomForestClassifier(random_state=42, n_estimators=100)
rf_clf.fit(X_train, y_train);

In [13]:
printScore(rf_clf, X_train, X_test, y_train, y_test)
printScore(rf_clf, X_train, X_test, y_train, y_test, train=False)


Train Results:

Accuracy: 0.93

Classification Report: 
               precision    recall  f1-score   support

           0       0.91      0.85      0.88        48
           1       0.94      0.97      0.95       115

    accuracy                           0.93       163
   macro avg       0.93      0.91      0.92       163
weighted avg       0.93      0.93      0.93       163
 

Confusion Matrix: 
 [[ 41   7]
 [  4 111]] 

ROC AUC: 0.9097

Test Results:

Accuracy: 0.79

Classification Report: 
               precision    recall  f1-score   support

           0       1.00      0.64      0.78        11
           1       0.67      1.00      0.80         8

    accuracy                           0.79        19
   macro avg       0.83      0.82      0.79        19
weighted avg       0.86      0.79      0.79        19
 

Confusion Matrix: 
 [[7 4]
 [0 8]] 

ROC AUC: 0.8182



In [15]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [16]:
rf_clf = RandomForestClassifier(random_state=42, n_estimators=100)

In [19]:
params_grid = {
    'max_depth' : [3, None],
    'min_samples_split' : [2, 3, 10],
    'min_samples_leaf' : [1, 3, 10],
    'bootstrap' : [True, False],
    'criterion' : ['gini', 'entropy']
}
grid_search = GridSearchCV(
    rf_clf, params_grid, n_jobs=-1, cv=5, verbose=1, 
    scoring='accuracy'
)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 72 candidates, totalling 360 fits


GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42), n_jobs=-1,
             param_grid={'bootstrap': [True, False],
                         'criterion': ['gini', 'entropy'],
                         'max_depth': [3, None], 'min_samples_leaf': [1, 3, 10],
                         'min_samples_split': [2, 3, 10]},
             scoring='accuracy', verbose=1)

In [20]:
grid_search.best_score_

0.803409090909091

In [22]:
grid_search.best_estimator_.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

In [23]:
printScore(grid_search, X_train, X_test, y_train, y_test)
printScore(grid_search, X_train, X_test, y_train, y_test, train=False)

Train Results:

Accuracy: 0.93

Classification Report: 
               precision    recall  f1-score   support

           0       0.91      0.85      0.88        48
           1       0.94      0.97      0.95       115

    accuracy                           0.93       163
   macro avg       0.93      0.91      0.92       163
weighted avg       0.93      0.93      0.93       163
 

Confusion Matrix: 
 [[ 41   7]
 [  4 111]] 

ROC AUC: 0.9097

Test Results:

Accuracy: 0.79

Classification Report: 
               precision    recall  f1-score   support

           0       1.00      0.64      0.78        11
           1       0.67      1.00      0.80         8

    accuracy                           0.79        19
   macro avg       0.83      0.82      0.79        19
weighted avg       0.86      0.79      0.79        19
 

Confusion Matrix: 
 [[7 4]
 [0 8]] 

ROC AUC: 0.8182

