# Training and Visualizing a Decision Tree

In [1]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

iris = load_iris()
X = iris.data[:, 2:] # all instances, petal length and width features
y = iris.target

tree_clf = DecisionTreeClassifier(max_depth=2)
tree_clf.fit(X, y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [2]:
from sklearn.tree import export_graphviz

export_graphviz(
    tree_clf,
    out_file='iris_tree.dot',
    feature_names=iris.feature_names[2:],
    class_names=iris.target_names,
    rounded=True,
    filled=True,
)

![Generated decision tree](iris_tree.png)

# Estimating Class Probabilities

In [3]:
tree_clf.predict_proba([[5, 1.5]]) # Probabilities for each class for a single instance with length=5 and width=1.5

array([[0.        , 0.90740741, 0.09259259]])

In [4]:
iris.target_names[tree_clf.predict([[5, 1.5]])[0]]

'versicolor'

# Decision Tree for Moons dataset
## Exercise 7: Train and fine-tune a Decision Tree for the moons dataset

In [5]:
# Make moons dataset

from sklearn.datasets import make_moons
moons = make_moons(n_samples=10000, noise=0.4)
moons


(array([[-1.22633058,  0.65229144],
        [-0.17756477,  0.73007452],
        [ 0.61244218,  1.16114535],
        ...,
        [-0.38454732,  0.90153922],
        [-1.26379663,  0.49008327],
        [-0.59130673,  0.11966608]]), array([0, 0, 0, ..., 0, 0, 0]))

In [6]:
# Split into training and test sets

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(moons[0], moons[1], test_size=0.2, random_state=42)

In [7]:
# Create a basic Decision Tree classifier

from sklearn.tree import DecisionTreeClassifier
moons_clf = DecisionTreeClassifier()
moons_clf.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [8]:
# Manually evaluate performance
num_instances = y_test.shape[0]
num_correct = 0

for i in range(num_instances):
    if moons_clf.predict([X_test[i]]) == y_test[i]:
        num_correct += 1

print('Accuracy rate:', num_correct / num_instances)

Accuracy rate: 0.8035


In [9]:
# Built-in evaluation
print('Score on test set:', moons_clf.score(X_test, y_test))
print('Score on training:', moons_clf.score(X_train, y_train)) # If this is much higher than we're seeing overfitting

Score on test set: 0.8035
Score on training: 1.0


In [10]:
# Grid search for hyperparameter tuning - let's generalize more better!

import numpy as np
from sklearn.model_selection import GridSearchCV

parameters = {'max_depth':np.arange(1,11)} # Try out different values of max_depth
clf = GridSearchCV(moons_clf, parameters, cv=5)

In [11]:
clf.fit(X_train, y_train)

# Should be closer together
print('GridSearchCV score on test:', clf.score(X_test, y_test))
print('GridSearchCV score on train:', clf.score(X_train, y_train))

GridSearchCV score on test: 0.8565
GridSearchCV score on train: 0.859125


In [12]:
# Which values 1-10 performed best?
clf.cv_results_['rank_test_score']

array([10,  1,  1,  4,  5,  6,  3,  7,  8,  9], dtype=int32)

## Exercise 8: Grow a forest

In [13]:
# Create 1000 training sets of 100 random examples from the moons dataset

from sklearn.model_selection import ShuffleSplit
shuffleSplit = ShuffleSplit(n_splits=1000, train_size=0.01)



In [20]:
# Create 1000 decision trees based off the mini datasets

decisionTrees = []

for train_indices, _ in shuffleSplit.split(moons[0]):
    small_X = []
    small_y = []
    
    small_clf = DecisionTreeClassifier()
    
    for train_index in train_indices:
        small_X.append(moons[0][train_index])
        small_y.append(moons[1][train_index])
        
    small_clf.fit(small_X, small_y)
    decisionTrees.append(small_clf)

In [46]:
# Make predictions with each mini decision tree

correct_predictions = 0
num_test = len(X_test)

for i in range(num_test):
    prediction = 0
    pred_count = 0
    
    for decisionTree in decisionTrees:
        pred_count += decisionTree.predict([X_test[i]])[0]
    
    if pred_count >= len(decisionTrees) / 2:
        prediction = 1
    
    if prediction == y_test[i]:
        correct_predictions += 1

print('Accuracy:', correct_predictions/num_test)

Accuracy: 0.8695
