In [1]:
import json
import networkx
import pydot

import numpy as np
import pandas as pd

from networkx.readwrite import json_graph
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, export_graphviz

In [2]:
def dot2json(file_in):
    """
    Translate .dot file to .json output (required for javascript consumption).
    """
    graph_netx = networkx.drawing.nx_pydot.read_dot(file_in)
    graph_json = json_graph.node_link_data( graph_netx )
    return json_graph.node_link_data(graph_netx)

In [3]:
data = load_iris()
X = data.data
y = data.target
display(X.shape, y.shape)

(150, 4)

(150,)

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 50, test_size = 0.25)

In [325]:
clf = DecisionTreeClassifier(max_depth=4)
clf.fit(X_train,y_train)

DecisionTreeClassifier(max_depth=4)

In [326]:
y_pred = clf.predict(X_test)

In [327]:
print('Accuracy Score on train data (using Default criterionas gini): ', accuracy_score(y_true=y_train, y_pred=clf.predict(X_train)))
print('Accuracy Score on test data (using Default criterionas gini): ', accuracy_score(y_true=y_test, y_pred=y_pred))

Accuracy Score on train data (using Default criterionas gini):  1.0
Accuracy Score on test data (using Default criterionas gini):  0.9473684210526315


In [328]:
# Changing the Criterion to Entropy
clf_entropy = DecisionTreeClassifier(criterion = 'entropy')
clf_entropy.fit(X_train, y_train)
y_pred_entropy =  clf_entropy.predict(X_test)
print('Accuracy Score on train data (using Default criterion as Entropy): ', accuracy_score(y_true=y_train, y_pred =clf_entropy.predict(X_train)))
print('Accuracy Score on test data (using Default criterion as Entropy): ', accuracy_score(y_true=y_test, y_pred =y_pred_entropy))

Accuracy Score on train data (using Default criterion as Entropy):  1.0
Accuracy Score on test data (using Default criterion as Entropy):  0.9473684210526315


In [329]:
# # export decisiont tree rules to .dot graph
# dotfile_name = 'dtree2.dot'
# dotfile = open(dotfile_name, 'w')
# export_graphviz(clf, out_file = dotfile, filled=True, rounded=True,
#                 special_characters=True,feature_names = data.feature_names,class_names=data.target_names)
# dotfile.close()
# json_data = dot2json(dotfile_name)
# jsonfile_name = 'treedata.json'
# with open(jsonfile_name, 'w') as f:
#     json.dump(json_data, f)

In [330]:
from sklearn.tree import export_text
print(export_text(clf))
[ 2 -2  3  2 -2  0 -2 -2  2  1 -2 -2 -2]

|--- feature_2 <= 2.60
|   |--- class: 0
|--- feature_2 >  2.60
|   |--- feature_3 <= 1.65
|   |   |--- feature_2 <= 5.00
|   |   |   |--- class: 1
|   |   |--- feature_2 >  5.00
|   |   |   |--- feature_0 <= 6.05
|   |   |   |   |--- class: 1
|   |   |   |--- feature_0 >  6.05
|   |   |   |   |--- class: 2
|   |--- feature_3 >  1.65
|   |   |--- feature_2 <= 4.85
|   |   |   |--- feature_1 <= 3.10
|   |   |   |   |--- class: 2
|   |   |   |--- feature_1 >  3.10
|   |   |   |   |--- class: 1
|   |   |--- feature_2 >  4.85
|   |   |   |--- class: 2



In [331]:
# sklearn PR version
def export_dict(decision_tree, feature_names=None):
    js = {}
    def node_to_str(tree, node_id, criterion):

        if not isinstance(criterion, six.string_types):
            criterion = "impurity"
        value = tree.tree_.value[node_id]
        if tree.tree_.n_outputs == 1:
            value = value[0, :]
        if tree.tree_.children_left[node_id] == sklearn.tree._tree.TREE_LEAF:

            return {"id": str(node_id),
#                     "criterion": criterion,
                    "impurity": str(tree.tree_.impurity[node_id]),
                    "samples": str(tree.tree_.n_node_samples[node_id]),
                    "value": str(value),
                   "class":  str(value.argmax(axis=0))
                   } 
        else:
            if feature_names is not None:
                feature = feature_names[tree.tree_.feature[node_id]]
            else:
                feature = tree.tree_.feature[node_id]
            return {"id": str(node_id),
                    "rule": f"{feature} <= {tree.tree_.threshold[node_id]:.4f}",
                    criterion: str(tree.tree_.impurity[node_id]),
                    "samples": str(tree.tree_.n_node_samples[node_id])
                   } 
            
    def recurse(tree, node_id, criterion, parent=None):
        left_child = tree.tree_.children_left[node_id]
        right_child = tree.tree_.children_right[node_id]
        js = node_to_str(tree, node_id, criterion)
        if left_child != sklearn.tree._tree.TREE_LEAF:
            js["left"] = recurse(tree, left_child, criterion=criterion, parent=node_id)
            js['right'] = recurse(tree,right_child,criterion=criterion,parent=node_id)
            
        return js
    
    
    if isinstance(decision_tree, sklearn.tree.DecisionTreeClassifier):
        js = recurse(decision_tree, 0, criterion="impurity")
    else:
        js = recurse(decision_tree.tree_, 0, criterion=decision_tree.criterion)

    return js 

In [360]:
#d3 version
def export_dict(decision_tree, feature_names=None):
    js = {}
    def node2str(tree, node_id, criterion):

        if not isinstance(criterion, six.string_types):
            criterion = "impurity"
        value = tree.tree_.value[node_id]
        if tree.tree_.n_outputs == 1:
            value = value[0, :]
        if tree.tree_.children_left[node_id] == sklearn.tree._tree.TREE_LEAF:
            # if end node
            return {"id": str(node_id),
                    "impurity": str(tree.tree_.impurity[node_id]),
                    "samples": str(tree.tree_.n_node_samples[node_id]),
                    "value": str(value),
                   "class":  str(value.argmax(axis=0))
                   } 
        else:
            if feature_names is not None:
                feature = feature_names[tree.tree_.feature[node_id]]
            else:
                feature = tree.tree_.feature[node_id]
            return {"id": str(node_id),
                    'children':[],
                    "name": f"{feature} <= {tree.tree_.threshold[node_id]:.4f}",
                    criterion: str(tree.tree_.impurity[node_id]),
                    "samples": str(tree.tree_.n_node_samples[node_id])
                   } 
            
    def recurse(tree, node_id, criterion, parent=None):
        left_child = tree.tree_.children_left[node_id]
        right_child = tree.tree_.children_right[node_id]
        js = node2str(tree, node_id, criterion)
        # if not end node, append left and right to children
        if left_child != sklearn.tree._tree.TREE_LEAF: 
            js['children'].append(recurse(tree, left_child, criterion=criterion, parent=node_id))
            js['children'].append(recurse(tree, right_child,criterion=criterion,parent=node_id))
        return js
    
    
    if isinstance(decision_tree, sklearn.tree.DecisionTreeClassifier):
        js = recurse(decision_tree, 0, criterion="impurity")
    else:
        js = recurse(decision_tree.tree_, 0, criterion=decision_tree.criterion)

    return js

In [361]:
cols = dict()
for i, c in enumerate(data.feature_names):
    cols[i] = c
tt = export_dict(clf, feature_names=cols)
# print(tt)
import pprint
pp = pprint.PrettyPrinter(indent=0)
pp.pprint(tt)

# save to json
jsonfile_name = 'treedata.json'
with open(jsonfile_name, 'w') as f:
    json.dump(tt, f)

cols {0: 'sepal length (cm)', 1: 'sepal width (cm)', 2: 'petal length (cm)', 3: 'petal width (cm)'}
{'children': [{'class': '0',
            'id': '1',
            'impurity': '0.0',
            'samples': '39',
            'self': '0',
            'value': '[39.  0.  0.]'},
            {'children': [{'children': [{'class': '1',
                                    'id': '4',
                                    'impurity': '0.0',
                                    'samples': '33',
                                    'self': '1',
                                    'value': '[ 0. 33.  0.]'},
                                    {'children': [{'class': '1',
                                                'id': '6',
                                                'impurity': '0.0',
                                                'samples': '1',
                                                'self': '1',
                                                'value': '[0. 1. 0.]'},
              

In [335]:
def viz(decision_tree, feature_names=None):
    js = ""
    def node_to_str(tree, node_id, criterion):

        if not isinstance(criterion, six.string_types):
            criterion = "impurity"
        value = tree.tree_.value[node_id]
        if tree.tree_.n_outputs == 1:
            value = value[0, :]
        if tree.tree_.children_left[node_id] == sklearn.tree._tree.TREE_LEAF:

            return '{"id": "%s", "criterion": "%s", "impurity": "%s", "samples": "%s", "value": "%s"}' \
                 % (node_id, 
                    criterion,
                    tree.tree_.impurity[node_id],
                    tree.tree_.n_node_samples[node_id],
                    value)
        else:
            if feature_names is not None:
                feature = feature_names[tree.tree_.feature[node_id]]
            else:
                feature = tree.tree_.feature[node_id]
            return '"id": "%s", "rule": "%s <= %.4f", "%s": "%s", "samples": "%s"' \
                 % (node_id, 
                    feature,
                    tree.tree_.threshold[node_id],
                    criterion,
                    tree.tree_.impurity[node_id],
                    tree.tree_.n_node_samples[node_id])
            
    def recurse(tree, node_id, criterion, parent=None, depth=0):
        tabs = "  " * depth
        left_child = tree.tree_.children_left[node_id]
        right_child = tree.tree_.children_right[node_id]
        js = js + "\n" + \
             tabs + "{\n" + \
             tabs + "  " + node_to_str(tree, node_id, criterion)
        if left_child != sklearn.tree._tree.TREE_LEAF and depth < 6:
            js = js + ",\n" + \
               tabs + '  "left": ' + \
            recurse(tree, \
                       left_child, \
                       criterion=criterion, \
                       parent=node_id, \
                       depth=depth + 1) + ",\n" + \
               tabs + '  "right": ' + \
               recurse(tree, \
                       right_child, \
                       criterion=criterion, \
                       parent=node_id,
                       depth=depth + 1)

        js = js + tabs + "\n" + \
             tabs + "}"
        
        return js
    if isinstance(decision_tree, sklearn.tree.DecisionTreeClassifier):
        js = js + recurse(decision_tree, 0, criterion="impurity")
    else:
        js = js + recurse(decision_tree.tree_, 0, criterion=decision_tree.criterion)

    return js

In [363]:
data.target_names

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')