In [45]:
import numpy as np
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import ShuffleSplit
from sklearn.base import clone
from scipy.stats import mode

In [48]:
# Get data and split into test train
X, y = make_moons(n_samples=10000, noise=0.4, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [38]:
# Use GridSearchCV to find best params
param_grid = {
    'max_leaf_nodes': list(range(2,100)),
    'min_samples_split': list(range(1,10)),
}
search = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid=param_grid, n_jobs=-1, verbose=1, cv=5)
search.fit(X_train, y_train)
print(search.best_params_)

Fitting 5 folds for each of 882 candidates, totalling 4410 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 2536 tasks      | elapsed:    2.6s
{'max_leaf_nodes': 23, 'min_samples_split': 2}
[Parallel(n_jobs=-1)]: Done 4410 out of 4410 | elapsed:    4.5s finished


In [49]:
# Use best estimator to get accuracy
y_pred = search.predict(X_test)
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')

Accuracy: 0.8735


In [54]:
# Grow a forest
# Split
n_trees = 1000
n_instances = 100
mini_sets = []
rs = ShuffleSplit(n_splits=n_trees, test_size=len(X_train) - n_instances, random_state=42)
for mini_train_index, mini_test_index in rs.split(X_train):
    X_mini_train = X_train[mini_train_index]
    y_mini_train = y_train[mini_train_index]
    mini_sets.append([X_mini_train, y_mini_train])

In [61]:
forest = [clone(search.best_estimator_) for _ in range(n_trees)]
accuracy_scores = []
for tree, (X_mini_train, y_mini_train) in zip(forest, mini_sets):
    tree.fit(X_mini_train, y_mini_train)
    y_pred = tree.predict(X_test)
    accuracy_scores.append(accuracy_score(y_test, y_pred))
print(np.mean(accuracy_scores))

0.7988824999999999


In [46]:
Y_pred = np.empty([n_trees, len(X_test)], dtype=np.uint8)
for tree_index, tree in enumerate(forest):
    Y_pred[tree_index] = tree.predict(X_test)
y_pred_majority_votes, n_votes = mode(Y_pred, axis=0)

In [47]:
accuracy_score(y_test, y_pred_majority_votes.reshape([-1]))

0.8735