# Decision Trees

In [1]:
%pylab inline
from sklearn import datasets, metrics, tree, cross_validation
from matplotlib import pyplot as plt
%matplotlib inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
#use the iris dataset
iris = datasets.load_iris()


In [3]:
#start and instance of a tree
myTree = tree.DecisionTreeClassifier()

In [4]:
#fit the model
myTree_fit = myTree.fit(iris.data, iris.target)

In [5]:
#do the prediction
y_pred = myTree_fit.predict(iris.data)

In [6]:
y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [7]:
#how did we do?
print('Number of mislabelled points: %d' % (iris.target != y_pred).sum())

Number of mislabelled points: 0


In [8]:
print('Absolutely ridiculous overfit score: %d' % (myTree_fit.score(iris.data, iris.target)))

Absolutely ridiculous overfit score: 1


In [9]:
metrics.confusion_matrix(iris.target, y_pred)

array([[50,  0,  0],
       [ 0, 50,  0],
       [ 0,  0, 50]])

In [10]:
print metrics.classification_report(iris.target, y_pred)

             precision    recall  f1-score   support

          0       1.00      1.00      1.00        50
          1       1.00      1.00      1.00        50
          2       1.00      1.00      1.00        50

avg / total       1.00      1.00      1.00       150



In [11]:
#generalising the tree
clf = tree.DecisionTreeClassifier()

x_train, x_test, y_train, y_test = cross_validation.train_test_split(iris.data, iris.target, test_size=.3)

clf.fit(x_train, y_train)

metrics.confusion_matrix(y_train, clf.predict(x_train))

print metrics.classification_report(y_train, clf.predict(x_train))

metrics.confusion_matrix(y_test, clf.predict(x_test))

print metrics.classification_report(y_test, clf.predict(x_test))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00        34
          1       1.00      1.00      1.00        40
          2       1.00      1.00      1.00        31

avg / total       1.00      1.00      1.00       105

             precision    recall  f1-score   support

          0       1.00      1.00      1.00        16
          1       0.77      1.00      0.87        10
          2       1.00      0.84      0.91        19

avg / total       0.95      0.93      0.93        45



In [12]:
#can prune the model by changing min_samples_leaf and max_depth
clf.set_params(min_samples_leaf = 5)
clf.set_params(max_depth = 5)
clf.fit(x_train, y_train)
metrics.confusion_matrix(y_train, clf.predict(x_train))
metrics.confusion_matrix(y_test, clf.predict(x_test))

array([[16,  0,  0],
       [ 0, 10,  0],
       [ 0,  2, 17]])

In [13]:
#get the parents and children out of the tree structure
def get_lineage(tree, feature_names):
    left = tree.tree_.children_left
    right = tree.tree_.children_right
    threshold = tree.tree_.threshold
    features = [feature_names[i] for i in tree.tree_.feature]
    #get ids of child nodes
    idx = np.argwhere(left == -1)[:,0]
    def recurse(left, right, child, lineage=None):
        if lineage is None:
            lineage = [child]
        
        if child in left:
            parent = np.where(left==child)[0].item()
            split = 'l'
        
        else:
            parent = np.where(right==child)[0].item()
            split = 'r'
        
        lineage.append((parent, split, threshold[parent], features[parent]))
        
        if parent == 0:
            lineage.reverse()
            return lineage
        else: 
            return recurse(left, right, parent, lineage)
        
    for child in idx:
        for node in recurse(left, right, child):
            print node

In [14]:
get_lineage(myTree_fit, iris.feature_names)

(0, 'l', 0.80000001192092896, 'petal width (cm)')
1
(0, 'r', 0.80000001192092896, 'petal width (cm)')
(2, 'l', 1.75, 'petal width (cm)')
(3, 'l', 4.9499998092651367, 'petal length (cm)')
(4, 'l', 1.6500000953674316, 'petal width (cm)')
5
(0, 'r', 0.80000001192092896, 'petal width (cm)')
(2, 'l', 1.75, 'petal width (cm)')
(3, 'l', 4.9499998092651367, 'petal length (cm)')
(4, 'r', 1.6500000953674316, 'petal width (cm)')
6
(0, 'r', 0.80000001192092896, 'petal width (cm)')
(2, 'l', 1.75, 'petal width (cm)')
(3, 'r', 4.9499998092651367, 'petal length (cm)')
(7, 'l', 1.5499999523162842, 'petal width (cm)')
8
(0, 'r', 0.80000001192092896, 'petal width (cm)')
(2, 'l', 1.75, 'petal width (cm)')
(3, 'r', 4.9499998092651367, 'petal length (cm)')
(7, 'r', 1.5499999523162842, 'petal width (cm)')
(9, 'l', 6.9499998092651367, 'sepal length (cm)')
10
(0, 'r', 0.80000001192092896, 'petal width (cm)')
(2, 'l', 1.75, 'petal width (cm)')
(3, 'r', 4.9499998092651367, 'petal length (cm)')
(7, 'r', 1.5499999

In [15]:
#use pydot
import StringIO

#install pydot
import pydot
import pyparsing

In [16]:
dot_data = StringIO.StringIO()
tree.export_graphviz(myTree_fit, out_file=dot_data)
graph = pydot.graph_from_dot_data(dot_data.getvalue())
graph.write_pdf('iris_decisiontree.pdf')
print '\nimage created!'


image created!
