### Example workflow of random forest in python sci-kit learn

I choose random forest as the underlying algorithm since it's a powerful and widely used algorithm in industries as well as competitions like Kaggle

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
os.chdir('C:/Users/Bangda/Desktop')
iris = pd.read_csv('iris.csv')
iris.head()

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, roc_auc_score
from sklearn.preprocessing import LabelEncoder

#### Data preparation

In [4]:
X = iris.drop(['Species'], axis = 1).values
y = iris['Species'].values
X.shape, y.shape

((150L, 4L), (150L,))

In [5]:
y_encoder = LabelEncoder()
y_encoder.fit(y)
y = y_encoder.transform(y)
np.unique(y)

array([0, 1, 2], dtype=int64)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 233)
X_train.shape, X_test.shape

((120L, 4L), (30L, 4L))

#### Basic and naive modeling

In [7]:
rf_default = RandomForestClassifier()
rf_default.fit(X_train, y_train)
y_pred_train = rf_default.predict(X_train)
confusion_matrix(y_train, y_pred_train)

array([[41,  0,  0],
       [ 0, 38,  0],
       [ 0,  2, 39]])

In [8]:
y_pred_test = rf_default.predict(X_test)
confusion_matrix(y_test, y_pred_test)

array([[ 9,  0,  0],
       [ 0, 12,  0],
       [ 0,  1,  8]])

In [9]:
print(classification_report(y_test, y_pred_test))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00         9
          1       0.92      1.00      0.96        12
          2       1.00      0.89      0.94         9

avg / total       0.97      0.97      0.97        30



In [10]:
y_pred_test_prob = rf_default.predict_proba(X_test)
y_pred_test_prob[:5, :]

array([[ 0. ,  1. ,  0. ],
       [ 0. ,  0. ,  1. ],
       [ 0. ,  0.9,  0.1],
       [ 0. ,  0. ,  1. ],
       [ 1. ,  0. ,  0. ]])

#### Introduing cross-validation, hyperparameters tuning and grid search

In [11]:
rf_tuned = RandomForestClassifier()
params = {'n_estimators': [10, 50, 100],
          'max_depth': [3, 4, 5],
          'min_samples_split': [2, 3],
          'max_features': [2, 3]}

In [12]:
rf_cv = GridSearchCV(rf_tuned, params, cv = 5)

In [13]:
rf_cv.fit(X_train, y_train) # take one minute or two for this step

GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [10, 50, 100], 'min_samples_split': [2, 3], 'max_depth': [3, 4, 5], 'max_features': [2, 3]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [14]:
rf_cv.best_params_

{'max_depth': 3, 'max_features': 3, 'min_samples_split': 2, 'n_estimators': 50}

In [15]:
rf_cv.best_score_

0.96666666666666667

In [16]:
rf_tuned = RandomForestClassifier(max_depth = 3, max_features = 3, min_samples_split = 2, n_estimators = 10)
rf_tuned.fit(X_train, y_train)
y_pred_test = rf_tuned.predict(X_test)
confusion_matrix(y_test, y_pred_test)

array([[ 9,  0,  0],
       [ 0, 12,  0],
       [ 0,  1,  8]])