In [1]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
import numpy as np

iris = load_iris()

In [2]:
X = iris.data[:, 2:] # petal length and width
y = iris.target

In [3]:
tree_clf = DecisionTreeClassifier(max_depth=2)
tree_clf.fit(X, y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [4]:
from sklearn.tree import export_graphviz

In [5]:
import os

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "decision_trees"

def image_path(fig_id):
    return os.path.join(PROJECT_ROOT_DIR, fig_id)

In [6]:
export_graphviz(tree_clf,
               out_file=image_path("iris_tree.dot"),
                feature_names=iris.feature_names[2:],
                class_names=iris.target_names,
                rounded=True,
                filled=True
               )


In [7]:
tree_clf.predict_proba([[5, 1.5]])

array([[0.        , 0.90740741, 0.09259259]])

In [8]:
tree_clf.predict([[5, 1.5]])

array([1])

In [9]:
##############
# Regression #
##############

In [10]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor(max_depth=2)
tree_reg.fit(X, y)


DecisionTreeRegressor(criterion='mse', max_depth=2, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')

In [11]:
# Quadratic training set + noise
np.random.seed(42)
m = 200
X = np.random.rand(m, 1)
y = 4 * (X - 0.5) ** 2
y = y + np.random.randn(m, 1) / 10

In [12]:
tree_reg1 = DecisionTreeRegressor(random_state=42, max_depth=2)
tree_reg1.fit(X, y)

DecisionTreeRegressor(criterion='mse', max_depth=2, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=42, splitter='best')

In [13]:
export_graphviz(
        tree_reg1,
        out_file=image_path("regression_tree.dot"),
        feature_names=["x1"],
        rounded=True,
        filled=True
    )

In [14]:
#############
# Exercises #
#############

In [15]:

# 7. Train and fine-tune a Decision Tree for the moons dataset.


In [16]:
# a. Generate a moons dataset using make_moons(n_samples=10000, noise=0.4)

from sklearn.datasets import make_moons
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures

In [17]:
X, y = make_moons(n_samples=10000, noise=0.4)

In [18]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [19]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

decision_tree_clf = DecisionTreeClassifier()
decision_tree_clf.fit(X_train,y_train)


DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [20]:
param_grid = [
    {'max_leaf_nodes': [2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 30], 'max_depth': [None, 2, 4, 6, 8, 10],
    'max_features': [1, 2], 'presort': [True, False]}
]

grid_search = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=5, scoring='neg_mean_squared_error')

grid_search.fit(X_train, y_train)


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=None,
                                              splitter='best'),
             iid='warn', n_jobs=None,
             param_grid=[{'max_depth': [None, 2, 4, 6, 8, 10],
                          'max_features': [1, 2],
                       

In [21]:
grid_search.best_params_
# {'max_depth': None, 'max_features': 2, 'max_leaf_nodes': 10, 'presort': True}

{'max_depth': 8, 'max_features': 1, 'max_leaf_nodes': 14, 'presort': True}

In [22]:
grid_search.best_estimator_

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=8,
                       max_features=1, max_leaf_nodes=14,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=True,
                       random_state=None, splitter='best')

In [23]:
grid_search.best_estimator_.feature_importances_
# array([0.40794317, 0.59205683])

array([0.32513054, 0.67486946])

In [24]:
from sklearn.model_selection import cross_val_score

In [25]:
cross_val_score(grid_search.best_estimator_, X_train, y_train, cv=3, scoring="accuracy")
# array([0.84632684, 0.86459115, 0.84283571])

array([0.8200225 , 0.856018  , 0.82745686])

In [26]:
final_predictions = grid_search.best_estimator_.predict(X_test)

In [27]:
from sklearn.metrics import accuracy_score

In [28]:
final_accuracy = accuracy_score(y_test, final_predictions)
print(final_accuracy) # 0.8625

0.848


In [29]:
# 8. Grow a forest.
    # a. Continuing the previous exercise, 
        # generate 1,000 subsets of the training set, 
        # each containing 100 instances selected randomly. 
        # Hint: you can use Scikit- Learn’s ShuffleSplit class for this.

In [30]:
from sklearn.model_selection import ShuffleSplit

In [31]:
shuffle_split = ShuffleSplit(n_splits=1000, train_size=100, random_state=0)

In [32]:
shuffle_split.get_n_splits(X_train)

1000

In [33]:
from sklearn.model_selection import GridSearchCV

params = {'max_leaf_nodes': list(range(2, 100)), 'min_samples_split': [2, 3, 4]}
grid_search_cv = GridSearchCV(DecisionTreeClassifier(random_state=42), params, n_jobs=-1, verbose=1, cv=3)

grid_search_cv.fit(X_train, y_train)

Fitting 3 folds for each of 294 candidates, totalling 882 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  68 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done 882 out of 882 | elapsed:    5.6s finished


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=42,
                                              splitter='best'),
             iid='warn', n_jobs=-1,
             param_grid={'max_leaf_nodes': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                            13, 14, 15, 16, 

In [34]:
mini_sets = []

rs = ShuffleSplit(n_splits=1000, test_size=100, random_state=42)
# for train_index, test_index in shuffle_split.split(X_train):
for mini_train_index, mini_test_index in rs.split(X_train):
    mini_X_train = X_train[mini_train_index]
    mini_y_train = y_train[mini_train_index]
    mini_sets.append((mini_X_train, mini_y_train))

In [35]:
from sklearn.base import clone
scores_ = []
forest = [clone(grid_search_cv.best_estimator_) for _ in range(0, 100)]

for tree, (mini_X, mini_y) in zip(forest, mini_sets):
    
    tree.fit(mini_X, mini_y)
    y_pred = tree.predict(mini_X)
    
    mini_accuracy = accuracy_score(mini_y, y_pred)
#     print('mini', mini_accuracy)
    scores_.append(mini_accuracy)
    
np.mean(scores_)

0.8645746835443039

In [43]:
from sklearn.base import clone
_scores_ = []
# forest = [clone(grid_search_cv.best_estimator_) for _ in range(0, 100)]

for mini_X, mini_y in mini_sets:
    # clone can occur inside for loop or outside
    tree2 = clone(grid_search_cv.best_estimator_) 
    tree2.fit(mini_X, mini_y)
    y_pred = tree2.predict(mini_X)
    
    mini_accuracy = accuracy_score(mini_y, y_pred)
#     print('mini', mini_accuracy)
    _scores_.append(mini_accuracy)
    
np.mean(_scores_)

0.8645265822784811

In [37]:
## b. Train one Decision Tree on each subset, 
# using the best hyperparameter values found above. 
# Evaluate these 1,000 Decision Trees on the test set. 
# Since they were trained on smaller sets, these Decision Trees will likely perform worse than the first Decision Tree,
# achieving only about 80% accuracy.

In [38]:
# Initial attempt - not correct
# scores = []

# for train_index_list, test_index_list in shuffle_split.split(X_train):
#     new_X_train = [X_train[index] for index in train_index_list]
#     new_y_train = [y_train[index] for index in train_index_list]
    
#     new_X_test = [X_train[index] for index in test_index_list]
#     new_y_test = [y_train[index] for index in test_index_list]
    
#     new_decision_tree = grid_search.best_estimator_.fit(new_X_train, new_y_train)
    
#     new_final_predictions = grid_search.best_estimator_.predict(new_X_test)
#     decision_tree_accuracy = accuracy_score(new_y_test, new_final_predictions)

#     scores.append(decision_tree_accuracy)

# #     cvs = cross_val_score(new_decision_tree, new_X, new_y, cv=3, scoring="accuracy")
# #     max_score = max(cvs)
# #     mean_score = np.mean(cvs)

In [39]:
np.mean(scores)

0.7896198734177214

In [40]:
grid_search = GridSearchCV(decision_tree_clf, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=1, n_jobs=-1)

grid_search.fit(X_train, y_train)


Fitting 5 folds for each of 360 candidates, totalling 1800 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done 1800 out of 1800 | elapsed:    5.5s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=None,
                                              splitter='best'),
             iid='warn', n_jobs=-1,
             param_grid=[{'max_depth': [None, 2, 4, 6, 8, 10],
                          'max_features': [1, 2],
                         

In [41]:
# Classifier Algorithm vs
# Trained Classifier vs
# Training Set


In [None]:
# c. 
# For each test set instance, generate the predictions of the 1,000 Decision Trees,
# and keep only the most frequent prediction (you can use SciPy’s mode() function for this).
# This gives you majority-vote predictions over the test set.

In [None]:
from scipy.stats import mode

In [None]:
mode(scores)
# 0.7821519

In [None]:
##########
# ANSWER #
##########

from sklearn.model_selection import ShuffleSplit

n_trees = 1000
n_instances = 100

mini_sets = []

rs = ShuffleSplit(n_splits=n_trees, test_size=len(X_train) - n_instances, random_state=42)
for mini_train_index, mini_test_index in rs.split(X_train):
    X_mini_train = X_train[mini_train_index]
    y_mini_train = y_train[mini_train_index]
    mini_sets.append((X_mini_train, y_mini_train))

In [None]:
from sklearn.model_selection import GridSearchCV

params = {'max_leaf_nodes': list(range(2, 100)), 'min_samples_split': [2, 3, 4]}
grid_search_cv = GridSearchCV(DecisionTreeClassifier(random_state=42), params, n_jobs=-1, verbose=1, cv=3)

grid_search_cv.fit(X_train, y_train)

In [None]:
from sklearn.base import clone

forest = [clone(grid_search_cv.best_estimator_) for _ in range(n_trees)]

accuracy_scores = []

for tree, (X_mini_train, y_mini_train) in zip(forest, mini_sets):
    tree.fit(X_mini_train, y_mini_train)
    
    y_pred = tree.predict(X_test)
    accuracy_scores.append(accuracy_score(y_test, y_pred))

np.mean(accuracy_scores)

In [None]:
from sklearn.metrics import accuracy_score

y_pred = grid_search_cv.predict(X_test)
accuracy_score(y_test, y_pred)