## Train and fine-tune a Decision Tree for the moons dataset

In [1]:
from sklearn.datasets import make_moons

moons = make_moons(n_samples = 10000, noise= 0.4)
moons

(array([[ 0.72421123,  0.54211831],
        [-0.32988897,  1.18780642],
        [-0.93047904,  0.38015665],
        ...,
        [ 1.02806188,  0.53426052],
        [ 0.70049764,  0.46673958],
        [ 0.96015176,  0.39724371]]),
 array([0, 0, 0, ..., 1, 0, 0]))

In [2]:
moons[0]

array([[ 0.46654404,  1.22651313],
       [ 0.58662884, -0.18575817],
       [ 0.7280187 ,  1.0420609 ],
       ...,
       [-0.17366043,  0.58585676],
       [ 0.86602821,  0.25804901],
       [-1.82357241, -0.03429702]])

In [2]:
#Lets split the dataset in test and train datasets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(moons[0], moons[1])


In [3]:
#Now we'll do grid search cross validation to look for the best hyperparameters for a decision tree classifier
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

param_grid = {'max_leaf_nodes':[2,5,7,10]}
DTreeClassifier = DecisionTreeClassifier()

grid_search = GridSearchCV(DTreeClassifier, param_grid)

grid_search.fit(X_train, y_train)


GridSearchCV(estimator=DecisionTreeClassifier(),
             param_grid={'max_leaf_nodes': [2, 5, 7, 10]})

In [4]:
from sklearn.metrics import mean_squared_error
import numpy as np

y_predicted = grid_search.predict(X_train)
mse = mean_squared_error(y_train, y_predicted)
np.sqrt(mse)


0.37327380477785116

In [5]:
from sklearn.metrics import accuracy_score

accuracy_score(y_train,y_predicted)

0.8606666666666667

## Now grow a forest

In [7]:
#We'll generate 1000 subsets first with 100 instances in each:
from sklearn.model_selection import ShuffleSplit

rs = ShuffleSplit(n_splits=1000, train_size=100, random_state=0)
print(rs)


ShuffleSplit(n_splits=1000, random_state=0, test_size=None, train_size=100)


In [8]:
#Let's see how the shuffle split works:
for train_index,test_index in rs.split(X_train):
    print(f'Train : {train_index}  Test : {test_index}')

5455 5093 3663 4346 5113
 5430 6080 6891 3270 7011 2853 6887   73 3685 4038 4112 2729 6677 3614
 4325 4208   26 2360 1768 3828 1654  523  477 2300 3233 2282 5456 2642
 4410 3717 5448 5358 5580 3968 2512 7052 1058  358 3682 7161 7008 2132
 1650  728 6055 4753 2551 1490  118 2964  993 3376 4700 6440 2133 2089
 3967 2914]  Test : [1738 4293 7136 ... 5576 4429 2894]
Train : [6480 4902  540 3375 1391 5519  564 2050 2258 4036 5386 3386 6204  179
  673 1365 3484 4290 2296 1118 3602 7403 6436 3476 2018 4331 6793 2133
  594 5186 2106 4069 5487 4153 1949 4887 3096  929 5503 5339 7450 4801
 6381 1777 6543 5675 1561 6116 6447 5286  359 5933 5090 7163 7098 6335
 4174 2288 5408 4823  781 1589 3261 1952  818 2679 3201 4157 2815   16
 3266 5871 1076  453 4160 3293  178 2898  688 4125 6659 3751 2161 4654
 1326  258  180 6867   14 5293   15 7334 2187 1748  353 3741 6938 2719
 4704 5860]  Test : [3627 5566  346 ... 5468 2931 6764]
Train : [ 770 3508 3501 7362 4271 5181 1173 5550 5435 3321 1269 2855 3245 

In [11]:
#Now, we want to use those indexes as masks on the training set and then save those mini subsets

subsets = []
for train_index, test_index in rs.split(X_train):
    subsets.append([X_train[train_index],y_train[train_index]])


In [19]:
#Let's take a look at the structure of each subset
print(f'Instances : {subsets[0][0]}')
print(f'Labels : {subsets[0][1]}')

Instances : [[-6.31133500e-02  2.54608017e-01]
 [ 6.74796497e-01  6.40993741e-01]
 [ 8.55144319e-01 -9.08019262e-01]
 [ 2.14973498e+00 -7.36523899e-01]
 [ 7.62969473e-02  9.99801356e-01]
 [ 1.71371352e+00  3.91760997e-03]
 [ 9.72627865e-01 -1.36327132e+00]
 [ 8.45395321e-01 -1.72215914e-01]
 [-2.78699593e-01  1.68340402e-01]
 [-7.13836932e-01  2.44085491e-01]
 [ 9.01435905e-01  9.32643505e-01]
 [ 7.18722991e-01  1.48512173e+00]
 [ 1.11136765e+00  1.39499363e+00]
 [ 1.48802103e+00  4.61411610e-01]
 [ 1.12797620e+00  2.73833566e-02]
 [-2.52602418e-01  3.03876257e-01]
 [ 1.33328605e+00 -4.11302660e-01]
 [ 2.33243800e+00  1.81983046e-04]
 [-6.15051236e-01 -2.17198902e-01]
 [ 1.05312941e-01  1.31028847e+00]
 [ 3.53143944e-01  4.77547662e-01]
 [ 2.16977686e+00  4.81907079e-01]
 [ 1.49213910e+00  2.84530025e-01]
 [-1.45785937e+00  5.93821016e-01]
 [ 8.13821872e-01  1.13291121e-01]
 [ 4.94138926e-01  3.76153568e-02]
 [ 8.77065347e-01 -5.32788565e-01]
 [ 7.56773998e-01 -2.83673222e-01]
 [ 4.508

In [21]:
#We now will train 1000 decision trees on these subsets using the parameters provided by the grid search
accuracy_scores = []
for subset in subsets:
    tree = DecisionTreeClassifier()
    tree.set_params(**grid_search.best_params_)
    tree.fit(subset[0],subset[1])
    prediction = tree.predict(X_test)
    accuracy_scores.append(accuracy_score(y_test,prediction))



In [23]:
#Computing the mean of the accuracy scores:
np.mean(accuracy_scores)

0.799048

In [32]:
#For each instance of the test set generate the predictions of 1000 Decision Trees and keep the mode of the predictions
#We may want to have all our trees in a forest:
forest = []
for subset in subsets:
    single_tree = DecisionTreeClassifier()
    single_tree.set_params(**grid_search.best_params_)
    single_tree.fit(subset[0],subset[1])

    forest.append(single_tree)


In [34]:
#Now that we have a forest, we can generate the predictions:
from scipy.stats import mode

predictions=[]
temp_pred = []
for instance in X_test:
    for tree in forest:
        
        temp_pred.append(tree.predict(instance.reshape(1,-1)))
    predictions.append(mode(temp_pred)[0])
    temp_pred.clear()




The previous method is not recommendable. I did it because I didn't want to copy the method used in the book

In [42]:
predictions = np.array(predictions).reshape([-1])

In [43]:
#Finally let's test the accuracy of the predictions
accuracy_score(y_test,predictions)

0.8496

5% more accuracy is not bad at all