In [56]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sbs
import numpy as np
from sklearn.datasets import make_moons
from sklearn.model_selection import ShuffleSplit, train_test_split

# Synthetic two-class "moons" dataset; the fixed seed makes the noisy
# sample reproducible across kernel restarts.
X, y = make_moons(
    n_samples=10000,
    noise=0.4,
    random_state=42,
)

# Hold out 20% of the samples as a test set, again with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [57]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# Hyper-parameter grid for the decision tree; the search below tries every
# combination of these three lists.
params = dict(
    max_leaf_nodes=[*range(2, 100)],
    max_depth=[*range(1, 7)],
    min_samples_split=[2, 3, 4],
)

# 3-fold cross-validated grid search over a seeded decision tree.
tree_search = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=params,
    cv=3,
)
tree_search.fit(X_train, y_train)

In [58]:
from sklearn.metrics import accuracy_score

# Score the fitted grid search on the held-out test set; the accuracy is the
# cell's last expression so the notebook displays it.
y_pred = tree_search.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)


0.8595

In [59]:
from sklearn.base import clone

# Grow a "forest": many clones of the best tree found by the grid search, each
# fitted on its own small random subset of the training set, then scored
# individually on the test set.
n_trees = 1000      # number of trees, and therefore of random subsets
n_instances = 100   # training rows handed to each tree

# ShuffleSplit yields (train_idx, test_idx) pairs. Only the train side is used
# here, so the "test" side is sized to leave exactly n_instances rows for it.
indices = ShuffleSplit(n_splits=n_trees, test_size=len(X_train) - n_instances, random_state=42)
scores = []
split_sets = []
forest = [clone(tree_search.best_estimator_) for _ in range(n_trees)]

# One pass: draw a subset, remember it, fit the matching tree, score it.
# n_splits and len(forest) both come from n_trees, so zip cannot silently
# drop trees or subsets.
for tree, (train_split_index, test_split_index) in zip(forest, indices.split(X_train)):
    X_train_split = X_train[train_split_index]
    y_train_split = y_train[train_split_index]
    split_sets.append((X_train_split, y_train_split))

    tree.fit(X_train_split, y_train_split)
    y_pred = tree.predict(X_test)
    # Ground truth first, prediction second, matching the other
    # accuracy_score calls in this notebook.
    scores.append(accuracy_score(y_test, y_pred))

# Mean test accuracy of the individual small trees.
np.mean(scores)

0.805671

In [60]:
from scipy.stats import mode

# Majority vote: store each tree's test-set predictions as one row, then take
# the most frequent label in every column (i.e. per test instance).
Y_pred = np.empty([len(forest), len(X_test)])  # sized from the forest itself
accuracy_scores = []  # not filled in by this cell

for idx, tree in enumerate(forest):
    Y_pred[idx] = tree.predict(X_test)

# keepdims is passed explicitly so the result keeps the reduced axis, shape
# (1, len(X_test)), instead of depending on the SciPy version's default; this
# also avoids the warning about that default changing.
# NOTE(review): the keepdims argument needs SciPy >= 1.9 - confirm the environment.
Y_pred_majority, votes = mode(Y_pred, axis=0, keepdims=True)

  Y_pred_majority, votes = mode(Y_pred, axis=0)


In [61]:
# Accuracy of the forest's majority vote. ravel() hands the metric a flat 1-D
# label vector whether the vote array is shaped (1, n) or (n,).
accuracy_score(y_test, Y_pred_majority.ravel())

0.873