In [1]:
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
import tempfile
import zipfile
import os
from sklearn.metrics import auc, accuracy_score
from sklearn.model_selection import cross_val_score,train_test_split
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier


In [2]:
data = pd.read_csv('/content/drive/MyDrive/inversion/train_test_data.csv')

In [3]:
data.shape

(1308954, 53)

In [4]:
X = data.drop(['output'], axis='columns')
y = data['output']

In [5]:
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

In [6]:
model_dt = tree.DecisionTreeClassifier(max_depth=78)
model_dt.fit(X_train, Y_train)

DecisionTreeClassifier(max_depth=78)

In [7]:
model_score = model_dt.score(X_test, Y_test)
predictions_test= model_dt.predict(X_test)

In [8]:
print('samples', len(X_test))
print('match', np.count_nonzero(predictions_test==Y_test))
print('mismatch', (len(X_test) - (np.count_nonzero(predictions_test==Y_test))))
print('error', ((len(X_test) - (np.count_nonzero(predictions_test==Y_test))))/(len(X_test)))
print('model score', model_dt.score(X_test, Y_test)*100)

samples 392687
match 353478
mismatch 39209
error 0.09984797052105111
model score 90.01520294789489


**Convert the tree to decision table**

This tree has 128189 nodes. Which is a very large decision tree. Below I have demonstrated how any tree can be converted into a decision table and use that table to make predictions.

In [28]:
n_nodes = model_dt.tree_.node_count
n_nodes

128189

In [9]:
n_nodes = model_dt.tree_.node_count
children_left = model_dt.tree_.children_left
children_right = model_dt.tree_.children_right
feature = model_dt.tree_.feature
threshold = model_dt.tree_.threshold
label = model_dt.tree_.value


node_depth = np.zeros(shape=n_nodes, dtype=np.int64)
is_leaves = np.zeros(shape=n_nodes, dtype=bool)
stack = [(0, -1)]  
while len(stack) > 0:
    node_id, parent_depth = stack.pop()
    node_depth[node_id] = parent_depth + 1

   
    if (children_left[node_id] != children_right[node_id]):
        stack.append((children_left[node_id], parent_depth + 1))
        stack.append((children_right[node_id], parent_depth + 1))
    else:
        is_leaves[node_id] = True
empty_array = np.empty((0, 7))
print("The binary tree structure has %s nodes and has "
      "the following tree structure printed into decision table:"
      % n_nodes)
print("[Node number], [Input feature index], [Value to compare], [Next node if ture], [Next node if false], [is leaf], [Output]")
for i in range(n_nodes):
    if is_leaves[i]:
        n = np.array(label[i])
        outputs_index = np.argmax(n)
        empty_array = np.append(empty_array, np.array([[i, -1, -1, -1, -1, 1, outputs_index]]), axis=0)
    else:
        empty_array = np.append(empty_array, np.array([[i, feature[i], threshold[i], children_left[i], children_right[i], 0, -1]]), axis=0)
print("\n")
print(empty_array)

The binary tree structure has 128189 nodes and has the following tree structure printed into decision table:
[Node number], [Input feature index], [Value to compare], [Next node if ture], [Next node if false], [is leaf], [Output]


[[ 0.00000e+00  1.10000e+01 -5.15500e+02 ...  4.04200e+03  0.00000e+00
  -1.00000e+00]
 [ 1.00000e+00  1.30000e+01 -5.14500e+02 ...  1.05300e+03  0.00000e+00
  -1.00000e+00]
 [ 2.00000e+00  1.70000e+01 -5.15500e+02 ...  4.18000e+02  0.00000e+00
  -1.00000e+00]
 ...
 [ 1.28186e+05 -1.00000e+00 -1.00000e+00 ... -1.00000e+00  1.00000e+00
   1.00000e+00]
 [ 1.28187e+05 -1.00000e+00 -1.00000e+00 ... -1.00000e+00  1.00000e+00
   0.00000e+00]
 [ 1.28188e+05 -1.00000e+00 -1.00000e+00 ... -1.00000e+00  1.00000e+00
   1.00000e+00]]


In [None]:
file = open("/content/drive/MyDrive/inversion/decision_table.txt", "w+")
content = str(empty_array)
file.write(content)
file.close()

**The converted Pandas DataFrame have 128189 row which is the same as the number of the nodes in the decision tree. Every row has the necessary information to make predictions.**

In [12]:
df = pd.DataFrame(empty_array, 
             columns=['Node number', 
                      'Input feature index',
                      'Value to compare',
                      'Next node if ture',
                      'Next node if false',
                      'is leaf',
                      'Output'])

df

Unnamed: 0,Node number,Input feature index,Value to compare,Next node if ture,Next node if false,is leaf,Output
0,0.0,11.0,-515.5,1.0,4042.0,0.0,-1.0
1,1.0,13.0,-514.5,2.0,1053.0,0.0,-1.0
2,2.0,17.0,-515.5,3.0,418.0,0.0,-1.0
3,3.0,36.0,497.5,4.0,245.0,0.0,-1.0
4,4.0,9.0,504.5,5.0,168.0,0.0,-1.0
...,...,...,...,...,...,...,...
128184,128184.0,-1.0,-1.0,-1.0,-1.0,1.0,0.0
128185,128185.0,28.0,-511.0,128186.0,128187.0,0.0,-1.0
128186,128186.0,-1.0,-1.0,-1.0,-1.0,1.0,1.0
128187,128187.0,-1.0,-1.0,-1.0,-1.0,1.0,0.0


In [None]:
df.to_csv('/content/drive/MyDrive/inversion/decision_table.csv', index=False) 

**Function for making predictions**

In [26]:
def leaf(node):
    i = node
    leaf = df.loc[df['Node number'] == i]['is leaf'].values[0]
    if leaf == 0:
        node_op(i, variable)
    else:
        global predicted_class
        predicted_class = generate_output(i)
       
    return predicted_class

def node_op(node, variable):
    i = node
    variable_index = df.loc[df['Node number'] == i]['Input feature index'].values[0]
    compare_value = df.loc[df['Node number'] == i]['Value to compare'].values[0]
    if variable_index == np.nan:
        leaf(i)
    else:
        variable_index = int(variable_index)
        variable = variable[variable_index]
    

    if compare_value == np.nan:
        leaf(i)
    else:
        if variable < compare_value:
            node = df.loc[df['Node number'] == i]['Next node if ture'].values[0]
        else: 
            node = df.loc[df['Node number'] == i]['Next node if false'].values[0]

    final_output_from_tree = leaf(node)

    return final_output_from_tree

def generate_output(node):
    i = node
    predicted_class = df.loc[df['Node number'] == i]['Output'].values[0]
    return predicted_class

In [21]:
lable = Y_test.to_numpy()
test = X_test.to_numpy()
test[2]

array([ 210.,  446., -226., -478.,  187., -516., -225.,  447.,  212.,
        441.,  215., -477., -225., -433., -204.,  423., -240., -524.,
       -215., -477.,  160., -428.,  185., -463., -220., -482.,  440.,
       -228., -502.,  168.,  490.,  138., -451., -230.,  445., -170.,
        489.,  192., -478.,  151., -439., -209., -434.,  121., -474.,
        168.,  507.,  160.,  502.,  171., -438.,  175.])

In [27]:
node = 0
sample = 2
variable = test[sample] #input for test
x = node_op(node, variable)
print("Prediction for the inputs", x)
print("Actual output", lable[sample])

Prediction for the inputs 0.0
Actual output 0.0
