# Random Forest

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV 
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

%cd "G:/Archive"

G:\Archive


## Data Prep

In [2]:
#load data
data = pd.read_csv("data/wine quality red.csv", header = 0)

#aggregate into binary classification problem
data["quality"] = pd.Categorical(np.where(data["quality"] > 5, "above_avg", "below_avg"))

#split data
X = data.iloc[:, :11]
y = data.iloc[:, 11]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1234)

## Searching Optimal Parameters

In [4]:
#by default, sklearn combines trees by averaging their probabilistic prediction, 
#instead of counting major vote as designed by Breiman
#two tuning parameters: n_estimators and max_features
#10-fold cv with accuracy as target metrics
#use OOB sample to estimate accuracy of each tree
grid1 = {"n_estimators": 100 * np.arange(1, 4), 
         "max_features": range(1, 12)}
model1 = GridSearchCV(RandomForestClassifier(oob_score = True), param_grid = grid1, cv = 10, scoring = "accuracy")
model1.fit(X_train, y_train)

#show tuning results
print("Best score: ", model1.best_score_)
print("Corresponding parms: ", model1.best_params_)
print("Model description: ", model1.best_estimator_)

#predict on test set
pred1 = model1.predict(X_test)

#show prediction performance
print("Accuracy: {0:.4f}".format(metrics.accuracy_score(y_test, pred1)))
print("Recall: {0:.4f}".format(metrics.recall_score(y_test, pred1, pos_label = "above_avg", average = "binary")))
print("Precision: {0:.4f}".format(metrics.precision_score(y_test, pred1, pos_label = "above_avg", average = "binary")))
print("F1 score: {0:.4f}".format(metrics.f1_score(y_test, pred1, pos_label = "above_avg", average = "binary")))

Best score:  0.8053088090551181
Corresponding parms:  {'max_features': 5, 'n_estimators': 100}
Model description:  RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features=5,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=True, random_state=None,
                       verbose=0, warm_start=False)
Accuracy: 0.8438
Recall: 0.8721
Precision: 0.8427
F1 score: 0.8571


## Visualization