# Training and Visualizing a Decision Tree

In [1]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

iris = load_iris()
X = iris.data[:, 2:] # all instances, petal length and width features
y = iris.target

tree_clf = DecisionTreeClassifier(max_depth=2)
tree_clf.fit(X, y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [2]:
from sklearn.tree import export_graphviz

export_graphviz(
    tree_clf,
    out_file='iris_tree.dot',
    feature_names=iris.feature_names[2:],
    class_names=iris.target_names,
    rounded=True,
    filled=True,
)

![Generated decision tree](iris_tree.png)

# Estimating Class Probabilities

In [3]:
tree_clf.predict_proba([[5, 1.5]]) # Probabilities for each class for a single instance with length=5 and width=1.5

array([[0.        , 0.90740741, 0.09259259]])

In [4]:
iris.target_names[tree_clf.predict([[5, 1.5]])[0]]

'versicolor'

# Decision Tree for Moons dataset
## Exercise 7: Train and fine-tune a Decision Tree for the moons dataset

In [8]:
# Make moons dataset

from sklearn.datasets import make_moons
moons = make_moons(n_samples=10000, noise=0.4)
moons


(array([[ 0.59578787,  0.65276125],
        [ 0.46187279,  0.43994396],
        [ 0.60315338, -0.36345955],
        ...,
        [ 2.0115901 , -0.1365683 ],
        [ 0.71606259, -0.20057932],
        [ 1.40514576,  0.25021496]]), array([0, 1, 1, ..., 1, 0, 0]))

In [42]:
# Split into training and test sets

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(moons[0], moons[1], test_size=0.2, random_state=42)

In [43]:
# Create a basic Decision Tree classifier

from sklearn.tree import DecisionTreeClassifier
moons_clf = DecisionTreeClassifier()
moons_clf.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [51]:
# Manually evaluate performance
num_instances = y_test.shape[0]
num_correct = 0

for i in range(num_instances):
    if moons_clf.predict([X_test[i]]) == y_test[i]:
        num_correct += 1

print('Accuracy rate:', num_correct / num_instances)

Accuracy rate: 0.821


In [55]:
# Built-in evaluation
print('Score on test set:', moons_clf.score(X_test, y_test))
print('Score on training:', moons_clf.score(X_train, y_train)) # If this is much higher than we're seeing overfitting

Score on test set: 0.821
Score on training: 1.0


In [66]:
# Grid search for hyperparameter tuning - let's generalize more better!

import numpy as np
from sklearn.model_selection import GridSearchCV

parameters = {'max_depth':np.arange(1,11)} # Try out different values of max_depth
clf = GridSearchCV(moons_clf, parameters, cv=5)

In [67]:
clf.fit(X_train, y_train)

# Should be closer together
print('GridSearchCV score on test:', clf.score(X_test, y_test))
print('GridSearchCV score on train:', clf.score(X_train, y_train))

GridSearchCV score on test: 0.8585
GridSearchCV score on train: 0.858125


In [68]:
# Which values 1-10 performed best?
clf.cv_results_['rank_test_score']

array([10,  4,  4,  1,  6,  3,  1,  7,  8,  9], dtype=int32)

## Exercise 8: Grow a forest