In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn import cross_validation, grid_search
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

from utilities import visualize_classifier



In [2]:
#load data
input_file = "data_random_forests.txt"
data = np.loadtxt(input_file, delimiter=",")
X, Y = data[:, :-1], data[:, -1]

In [3]:
#split data by class
class_0 = np.array(X[Y==0])
class_1 = np.array(X[Y==1])
class_2 = np.array(X[Y==2])

In [4]:
#split into training and test set
X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(X, Y, test_size=0.25, random_state=5)

In [25]:
#setting range of parameter for grid_search ; grid means "range of parameter"
#exactly, It's not range, but expected value list for each parameter
parameter_grid = [{"n_estimators":[100], "max_depth":[2,4,7,12,16]}, 
                  {"max_depth":[4], "n_estimators":[25,50,100,250]}]

In [26]:
metrics = ['precision_weighted', 'recall_weighted']

In [28]:
for metric in metrics : 
    print("\n#### Searching optimal parameters for", metric)
    classifier = grid_search.GridSearchCV(ExtraTreesClassifier(random_state=0), parameter_grid, cv=5, scoring=metric)
    classifier.fit(X_train, Y_train)
    print("\nGrid scores for the parameter grid:")
    for params, avg_score, _ in classifier.grid_scores_ : 
        print(params, '-->', round(avg_score, 3))
    print("\nBest parameters:", classifier.best_params_)
    
    Y_pred = classifier.predict(X_test)
    print("\nPerformance report:\n")
    print(classification_report(Y_test, Y_pred))


#### Searching optimal parameters for precision_weighted

Grid scores for the parameter grid:
{'n_estimators': 100, 'max_depth': 2} --> 0.847
{'n_estimators': 100, 'max_depth': 4} --> 0.841
{'n_estimators': 100, 'max_depth': 7} --> 0.844
{'n_estimators': 100, 'max_depth': 12} --> 0.836
{'n_estimators': 100, 'max_depth': 16} --> 0.818
{'n_estimators': 25, 'max_depth': 4} --> 0.846
{'n_estimators': 50, 'max_depth': 4} --> 0.84
{'n_estimators': 100, 'max_depth': 4} --> 0.841
{'n_estimators': 250, 'max_depth': 4} --> 0.845

Best parameters: {'n_estimators': 100, 'max_depth': 2}

Performance report:

             precision    recall  f1-score   support

        0.0       0.94      0.81      0.87        79
        1.0       0.81      0.86      0.83        70
        2.0       0.83      0.91      0.87        76

avg / total       0.86      0.86      0.86       225


#### Searching optimal parameters for recall_weighted

Grid scores for the parameter grid:
{'n_estimators': 100, 'max_depth': 2