In [1]:
import numpy as np
import pandas as pd
from scipy.stats import mode

from sklearn.datasets import make_moons
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Synthetic two-moons dataset (10,000 noisy samples), split into train and test sets.
# Both steps are seeded so the data and the split are identical on every run.
X_moons, y_moons = make_moons(n_samples=10000, noise=0.4, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X_moons, y_moons, random_state=42)

# Base estimator handed to the grid search. The seed is fixed because a decision tree
# permutes the features at every split, so equally good candidate splits could otherwise
# be resolved differently from one run to the next.
tree_clf = DecisionTreeClassifier(random_state=42)

In [3]:
# Hyperparameter grid: every (max_leaf_nodes, max_depth) combination is cross-validated.
param_grid = [
    {
        'max_leaf_nodes': [2 ** power for power in range(1, 8)],  # 2, 4, 8, ..., 128
        'max_depth': [2 ** power for power in range(1, 6)],       # 2, 4, 8, 16, 32
    }
]

# Exhaustive search over the grid, ranking candidates by accuracy.
grid_search_cv = GridSearchCV(estimator=tree_clf, param_grid=param_grid, scoring='accuracy')
grid_search_cv.fit(X_train, y_train)

In [4]:
# Best hyperparameter combination found by the grid search
print(grid_search_cv.best_params_)

# Per-candidate cross-validation results as a table, best-ranked candidates first
cv_res = (
    pd.DataFrame(grid_search_cv.cv_results_)
    .sort_values(by="rank_test_score", ascending=True)
)
cv_res.head()

{'max_depth': 8, 'max_leaf_nodes': 32}


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_max_leaf_nodes,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
18,0.005679,0.000234,0.000278,2.3e-05,8,32,"{'max_depth': 8, 'max_leaf_nodes': 32}",0.848667,0.867333,0.866,0.856667,0.858667,0.859467,0.006778,1
32,0.005661,0.000212,0.000301,5.3e-05,32,32,"{'max_depth': 32, 'max_leaf_nodes': 32}",0.847333,0.871333,0.863333,0.856667,0.858,0.859333,0.007911,2
25,0.005769,0.000281,0.000309,3.2e-05,16,32,"{'max_depth': 16, 'max_leaf_nodes': 32}",0.847333,0.869333,0.863333,0.856667,0.858,0.858933,0.007322,3
17,0.004997,6.4e-05,0.000279,2.9e-05,8,16,"{'max_depth': 8, 'max_leaf_nodes': 16}",0.854667,0.858667,0.861333,0.850667,0.856667,0.8564,0.003617,4
24,0.004972,4.7e-05,0.000309,4e-05,16,16,"{'max_depth': 16, 'max_leaf_nodes': 16}",0.854667,0.858667,0.861333,0.850667,0.856667,0.8564,0.003617,4


In [6]:
# The grid search above was created without overriding `refit`, so (per GridSearchCV's
# default) best_estimator_ has already been retrained on all of X_train with the winning
# hyperparameters. Fitting it again would only repeat that work and mutate the estimator
# owned by grid_search_cv, so it is evaluated on the test set as-is.
best_tree = grid_search_cv.best_estimator_
best_tree.score(X_test, y_test)

0.8644

### Homemade Random Forest

In [7]:
# Homemade forest: many trees, each trained on a different random subset of the training
# split created at the top of the notebook (X_train / y_train are reused as-is rather than
# regenerated, so there is a single source of truth for the data).
N_TREES = 1000

# ShuffleSplit yields one (train_index, test_index) pair of random index arrays per split.
# Only the train indices are used; no sizes are passed, so scikit-learn's default split
# proportions apply.
split = ShuffleSplit(n_splits=N_TREES, random_state=42)

random_forest = []
for train_index, _ in split.split(X_train):
    # Same hyperparameters as the best grid-search candidate printed above.
    # A dedicated name is used so the grid search's base estimator `tree_clf` is not rebound.
    subset_tree = DecisionTreeClassifier(max_depth=8, max_leaf_nodes=32, random_state=42)
    subset_tree.fit(X_train[train_index], y_train[train_index])
    random_forest.append(subset_tree)

In [8]:
# Predicting the accuracy score of each tree individually on the test set and getting the mean
accuracy_scores = [tree_clf.score(X_test, y_test) for tree_clf in random_forest]
np.mean(accuracy_scores)

0.8595176

In [9]:
# Ensemble method
# Getting predictions from all tree for each test example and choosing the most frequent example
y_pred_all = []
for tree_clf in random_forest:
  y_pred = tree_clf.predict(X_test)
  y_pred_all.append(y_pred)

y_pred_all = np.array(y_pred_all)

In [10]:
# Ensemble method, step 2: majority vote.
# y_pred_all has one row per tree and one column per test sample: cell (i, j) is the
# prediction of tree i for test sample j.
print(y_pred_all.shape)

# For each test sample keep the most frequent prediction across all trees (axis 0).
# The result is flattened explicitly because SciPy releases differ on whether mode()
# keeps the reduced axis, and accuracy_score needs exactly one label per test sample.
y_pred_combined = mode(y_pred_all, axis=0).mode.reshape(-1)
print(y_pred_combined.shape)

accuracy_score(y_test, y_pred_combined)

(1000, 2500)
(2500,)


0.8608