# Random Forest Algorithm

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd

In [None]:
# Load seaborn's bundled 'titanic' example dataset.
df = sns.load_dataset('titanic')

In [None]:
# Preview the first rows to inspect the available columns and their values.
df.head()

In [None]:
# Drop only the rows that are missing a value in a column the model uses
# (the three features and the target). A bare dropna() would also discard
# every row with a gap in any *unused* column, needlessly shrinking the data.
df.dropna(subset=['pclass', 'sex', 'age', 'survived'], inplace=True)

## Data Pre-processing

In [None]:
# Feature matrix: passenger class, sex and age.
# Take an explicit copy so that the later in-place encoding of X['sex']
# modifies an independent frame and does not raise SettingWithCopyWarning.
X = df[['pclass', 'sex', 'age']].copy()

In [None]:
from sklearn import preprocessing
# Binarizer used below to encode the categorical 'sex' column numerically.
lb = preprocessing.LabelBinarizer()

In [None]:
# Fit the binarizer on 'sex' and overwrite the column with its encoded values.
X['sex'] = lb.fit_transform(X['sex'])

In [None]:
# Target vector: the 'survived' column.
y = df['survived']

In [None]:
# Count the samples in each target class to check the class balance.
y.value_counts()

***

* Ensemble of Decision Trees

* Training via the bagging method (Repeated sampling with replacement)
  * Bagging: Sample observations (rows) from the training sample, with replacement
  * RF: Sample from predictors. $m=\sqrt{p}$ for classification and $m=p/3$ for regression problems, where $p$ is the total number of predictors and $m$ the number sampled.

* Utilise uncorrelated trees

Random Forest
* Sample both observations and features of training data

Bagging
* Samples only observations at random
* Each decision tree selects the best feature when splitting a node

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 10% of the data for testing. The split is seeded so that the
# train/test partition (and therefore every score below) is reproducible
# across runs, matching the seeded classifier.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42)

In [None]:
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
def print_score(clf, X_train, y_train, X_test, y_test, train=True):
    '''
    Print the accuracy score, classification report and confusion matrix
    of a fitted classifier.

    Parameters
    ----------
    clf : fitted estimator exposing ``predict``
    X_train, y_train : training features and labels
    X_test, y_test : test features and labels
    train : bool, default True
        If truthy, report on the training data and additionally print the
        mean and standard deviation of a 10-fold cross-validated accuracy
        computed on the training data. Otherwise report on the test data.

    Returns
    -------
    None. All results are printed.
    '''
    # Select the data set to report on. A plain ``else`` is used so that any
    # falsy ``train`` value (False, 0, None) yields the test report instead
    # of silently printing nothing.
    if train:
        header, features, y_true = "Train Result:\n", X_train, y_train
    else:
        header, features, y_true = "Test Result:\n", X_test, y_test

    # Predict once and reuse the result for all three metrics.
    y_pred = clf.predict(features)

    print(header)
    print("accuracy score: {0:.4f}\n".format(accuracy_score(y_true, y_pred)))
    print("Classification Report: \n {}\n".format(classification_report(y_true, y_pred)))
    print("Confusion Matrix: \n {}\n".format(confusion_matrix(y_true, y_pred)))

    if train:
        # Cross-validated accuracy on the training data only.
        res = cross_val_score(clf, X_train, y_train, cv=10, scoring='accuracy')
        print("Average Accuracy: \t {0:.4f}".format(np.mean(res)))
        print("Accuracy SD: \t\t {0:.4f}".format(np.std(res)))
        

In [None]:
# Instantiate the random forest classifier with default hyper-parameters,
# fixing only random_state so that the fitted model is reproducible.
rf_clf = RandomForestClassifier(random_state=42)

In [None]:
# Fit the model on the training split.
rf_clf.fit(X_train, y_train)

In [None]:
# Report training-set performance, including the 10-fold cross-validated accuracy.
print_score(rf_clf, X_train, y_train, X_test, y_test, train=True)

In [None]:
# Report performance on the held-out test split.
print_score(rf_clf, X_train, y_train, X_test, y_test, train=False)

## Grid Search

To find good hyper-parameter values systematically rather than by guesswork, we use Grid Search, which evaluates every combination of the candidate values we supply.

In [None]:
from sklearn.pipeline import Pipeline

In [None]:
# importing grid search cross validation
from sklearn.model_selection import GridSearchCV

In [None]:
# Fresh, unfitted classifier that serves as the base estimator for the grid search.
rf_clf = RandomForestClassifier(random_state=42)

In [None]:
# Hyper-parameters to tune and the candidate values tried for each one.
# The grid search evaluates every combination of these values.
params_grid = {
    "max_depth": [3, None],            # 3 or None
    "min_samples_split": [2, 3, 10],   # 2, 3 or 10
    "min_samples_leaf": [1, 3, 10],    # 1, 3 or 10
    "bootstrap": [True, False],        # with or without bootstrap sampling
    "criterion": ['gini', 'entropy'],  # split-quality measure
}

In [None]:
# Exhaustive search over params_grid with 5-fold cross-validation, ranking
# candidates by accuracy. n_jobs=-1 runs the fits in parallel on all
# available processors; verbose=1 prints progress messages.
grid_search = GridSearchCV(rf_clf, params_grid,
                           n_jobs=-1, cv=5,
                           verbose=1, scoring='accuracy')

In [None]:
# Run the grid search on the training split.
grid_search.fit(X_train, y_train)

In [None]:
# Mean cross-validated accuracy of the best parameter combination.
grid_search.best_score_

In [None]:
# Hyper-parameters of the best estimator found by the search.
grid_search.best_estimator_.get_params()

In [None]:
# Report training-set performance of the tuned model.
# NOTE(review): print_score cross-validates the estimator it receives, so
# passing the GridSearchCV object reruns the whole grid search inside each of
# the 10 folds (nested CV). Confirm this is intended; otherwise pass
# grid_search.best_estimator_ to evaluate only the tuned model.
print_score(grid_search, X_train, y_train, X_test, y_test, train=True)

In [None]:
# Report performance of the tuned model on the held-out test split.
print_score(grid_search, X_train, y_train, X_test, y_test, train=False)

***

end