In [47]:
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
import scipy as scipy

# plot options
plt.rcParams.update({'font.size': 11})
# plt.rcParams['figure.figsize'] = [16, 10] # for big plots
# %matplotlib notebook

#print option
pd.set_option("display.max_columns", 500)
# pd.set_option('display.max_rows',100)
# clear output in Jupyter cell
from IPython.display import clear_output

In [35]:
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split, GridSearchCV, ShuffleSplit
import sklearn.preprocessing as sklrnpp
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

Get the moons data

In [4]:
X,y=make_moons(n_samples=1000,noise=0.4)

# Grid search

In [12]:
params={ 'max_leaf_nodes': range(2, 100),
        'min_samples_split': [2, 3, 4]
        }   

clf = DecisionTreeClassifier()
grid_clf = GridSearchCV(clf, param_grid=params,cv=3,scoring='accuracy',verbose=10,n_jobs=-1)
grid_clf.fit(X,y)

print('Grid best score accuracy = {:.3f}'.format(grid_clf.best_score_))
print('Grid best parameters: ', grid_clf.best_params_)

Fitting 3 folds for each of 294 candidates, totalling 882 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1736s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0170s.) Setting batch_size=4.
[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0279s.) Setting batch_size=8.
[Parallel(n_jobs=-1)]: Done  68 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0439s.) Setting batch_size=16.
[Parallel(n_jobs=-1)]: Done 156 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0728s.) Setting batch_size=32.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1476s.) Setting batc

Grid best score accuracy = 0.838
Grid best parameters:  {'max_leaf_nodes': 11, 'min_samples_split': 2}


[Parallel(n_jobs=-1)]: Batch computation too fast (0.1681s.) Setting batch_size=128.
[Parallel(n_jobs=-1)]: Done 666 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done 754 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done 882 out of 882 | elapsed:    2.6s finished


# Forest

In [39]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [40]:
rs = ShuffleSplit(n_splits=1000, test_size=X_train.shape[0] - 100)

In [42]:
ind=0
preds=np.empty((1000,len(y_test)))
accs=[]
for train_index, _ in rs.split(X_train):
    # reset the classifier
    clf=DecisionTreeClassifier(max_leaf_nodes=11, min_samples_split=2)
    # fit the classifier
    clf.fit(X_train[train_index],y_train[train_index])
    # get the predictions
    cur_preds=clf.predict(X_test)
    # store the accuracy
    accs.append(accuracy_score(y_test,cur_preds))
    # store the predicitions
    preds[ind,:]=cur_preds
    
    # increase the index
    ind+=1    

### Accuracies

Take the mode of the predictions for the forest

In [58]:
mode_preds=scipy.stats.mode(preds)[0][0]

In [59]:
print('Accuracy of individual trees')
print('\t mean = {:.3f}'.format(np.mean(accs)))
print('\t median = {:.3f}'.format(np.median(accs)))
print('\t std = {:.3f}'.format(np.std(accs)))
print('\n Accuracy of forest = {:.3f}'.format(accuracy_score(y_test,mode_preds)))

Accuracy of individual trees
	 mean = 0.806
	 median = 0.810
	 std = 0.029

 Accuracy of forest = 0.850
