# Importing Modules

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, label_binarize
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree._tree import TREE_LEAF, TREE_UNDEFINED
from sklearn.metrics import accuracy_score,precision_score,recall_score,roc_curve,auc

# Importing The Dataset

In [2]:
# Load the dataset; 'data_1.csv' is resolved relative to the notebook's working directory.
df = pd.read_csv('data_1.csv')

In [3]:
# Preview the first rows to sanity-check the column names and value ranges.
df.head()

Unnamed: 0,baseline value,accelerations,fetal_movement,uterine_contractions,light_decelerations,severe_decelerations,prolongued_decelerations,abnormal_short_term_variability,mean_value_of_short_term_variability,percentage_of_time_with_abnormal_long_term_variability,mean_value_of_long_term_variability,histogram_width,histogram_min,histogram_max,histogram_number_of_peaks,histogram_number_of_zeroes,histogram_mode,histogram_mean,histogram_median,histogram_variance,histogram_tendency,fetal_health
0,132,0.006,0.0,0.006,0.003,0.0,0.0,17,2.1,0,10.4,130,68,198,6,1,141,136,140,12,0,1
1,133,0.003,0.0,0.008,0.003,0.0,0.0,16,2.1,0,13.4,130,68,198,5,1,141,135,138,13,0,1
2,134,0.003,0.0,0.008,0.003,0.0,0.0,16,2.4,0,23.0,117,53,170,11,0,137,134,137,13,1,1
3,132,0.007,0.0,0.008,0.0,0.0,0.0,16,2.4,0,19.9,117,53,170,9,0,137,136,138,11,1,1
4,131,0.005,0.072,0.008,0.003,0.0,0.0,28,1.4,0,12.9,66,88,154,5,0,135,134,137,7,1,1


In [4]:
# Summarise each column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1450 entries, 0 to 1449
Data columns (total 22 columns):
 #   Column                                                  Non-Null Count  Dtype  
---  ------                                                  --------------  -----  
 0   baseline value                                          1450 non-null   int64  
 1   accelerations                                           1450 non-null   float64
 2   fetal_movement                                          1450 non-null   float64
 3   uterine_contractions                                    1450 non-null   float64
 4   light_decelerations                                     1450 non-null   float64
 5   severe_decelerations                                    1450 non-null   float64
 6   prolongued_decelerations                                1450 non-null   float64
 7   abnormal_short_term_variability                         1450 non-null   int64  
 8   mean_value_of_short_term_variability  

The dataset contains no missing values; the check below confirms this.

In [5]:
# First .any() reduces each column, second reduces across columns:
# the result is True if at least one cell in the frame is missing.
df.isna().any().any()

False

# Splitting the dataset

In [6]:
# Every column except the last is a feature; the last column is the target.
X = df.iloc[:, :-1].to_numpy()
# Turn the class labels 1/2/3 into indicator columns, one per class.
y = label_binarize(df.iloc[:, -1].to_numpy(), classes=[1, 2, 3])
# Hold out 20% of the rows for evaluation; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

# Classifying using DecisionTreeClassifier

In [7]:
def compute_metrics(model, X=None, y=None):
  """Print accuracy, micro-averaged precision and recall, and tree depth if available.

  Parameters
  ----------
  model : fitted estimator exposing ``predict``.
  X, y : optional evaluation features and labels. When omitted, the
      notebook-level ``X_test`` / ``y_test`` are used, which is what the
      original one-argument call did.

  Returns
  -------
  None. The metrics are only printed.
  """
  eval_X = X_test if X is None else X
  eval_y = y_test if y is None else y

  y_pred = model.predict(eval_X)
  print("Accuracy ",accuracy_score(eval_y,y_pred))
  print("Precision ", precision_score(eval_y,y_pred,average='micro'))
  print("Recall ",recall_score(eval_y,y_pred,average='micro'))

  # The saved cell outputs contain a "Depth" line that the previous code never
  # printed; emitting it here lets a fresh run reproduce them.
  # NOTE(review): the saved outputs show the same depth before and after
  # pruning, so this value appears not to reflect in-place pruning - confirm.
  if hasattr(model, "get_depth"):
    print("Depth ", model.get_depth())

In [8]:
# Fit an entropy-criterion tree (random_state fixed for repeatability), then
# report its metrics on the held-out split.
decision_tree = DecisionTreeClassifier(
    criterion='entropy',
    random_state=0,
).fit(X_train, y_train)
compute_metrics(decision_tree)

Accuracy  0.9655172413793104
Precision  0.9655172413793104
Recall  0.9655172413793104
Depth  14


In [9]:
from random import randint,seed
from math import log2
from copy import deepcopy

In [10]:
# Per-node counters filled by dfs(): innernodes[i] / leafnodes[i] hold the number
# of internal / leaf nodes in the subtree rooted at node id i.
# Sized from the fitted tree (never below the original 1000 slots) so that node
# ids above 999 cannot raise IndexError.
innernodes = [0] * max(1000, decision_tree.tree_.node_count)
leafnodes = [0] * max(1000, decision_tree.tree_.node_count)
# Pin the `random` stream consumed below by error_t and hash().
seed(2)

def is_leaf(tree, index):
  """Return whether node ``index`` of ``tree`` has both child pointers set to TREE_LEAF."""
  right_child = tree.children_right[index]
  left_child = tree.children_left[index]
  return right_child == TREE_LEAF and left_child == TREE_LEAF

def dfs(tree, index, data):
  """Record, for every node under ``index``, how many internal and leaf nodes its subtree holds.

  Results are written to the module-level ``innernodes`` and ``leafnodes``
  lists, indexed by node id.

  Parameters
  ----------
  tree : object exposing ``children_left`` / ``children_right`` indexed by node id.
  index : node id at which counting starts.
  data : unused; kept so existing three-argument calls keep working. The earlier
      version split these rows at every node, but the counts never depended on
      the split, so that work has been dropped.
  """
  if is_leaf(tree, index):
    # Assign rather than accumulate so a second traversal cannot double-count.
    leafnodes[index] = 1
    innernodes[index] = 0
    return

  left_child = tree.children_left[index]
  right_child = tree.children_right[index]

  dfs(tree, left_child, data)
  dfs(tree, right_child, data)

  # An internal node counts itself plus everything below it.
  innernodes[index] = 1 + innernodes[left_child] + innernodes[right_child]
  leafnodes[index] = leafnodes[left_child] + leafnodes[right_child]

# cost[i] is filled by cal_cost() with the cost of the subtree rooted at node id i.
# Sized from the fitted tree (never below the original 1000 slots) so that node
# ids above 999 cannot raise IndexError.
cost = [0] * max(1000, decision_tree.tree_.node_count)
# Per-row error weight used in cal_cost(); drawn at random from 1..10, so its
# value depends on the seed set earlier in this cell.
error_t = randint(1,10)

def hash(v):
  """Scale ``v`` by a random signed ratio and return the result.

  Three ``randint(1,100)`` draws are consumed, in this order: one whose parity
  picks the sign (odd means negative), one multiplier, one divisor. The output
  therefore depends on the state of the ``random`` generator, not only on ``v``.

  Note: this name shadows Python's built-in ``hash`` for the rest of the notebook.
  """
  negate = randint(1,100) % 2
  multiplier = randint(1,100)
  if negate:
    multiplier = -1 * multiplier
  return (multiplier * v) / randint(1,100)

def find_encoding(v):
  """Return the encoding value for ``v``.

  Currently a thin wrapper: the value comes straight from this notebook's own
  ``hash`` helper (which is not Python's built-in ``hash``).
  """
  encoded = hash(v)
  return encoded


def cal_cost(tree, index, data):
  """Store in ``cost[index]`` the cost of the subtree rooted at ``index``, then recurse.

  The stored value is the sum of:
    * internal-node count of the subtree * ``cost_encoding_internalnodes``
    * leaf count of the subtree * ``cost_encoding_leafnodes``
    * ``error_t`` * number of rows in ``data``
    * ``find_encoding`` of this node's threshold

  Reads ``innernodes`` / ``leafnodes``, so ``dfs`` must have filled them first.

  Parameters
  ----------
  tree : object exposing ``feature``, ``threshold``, ``children_left`` and
      ``children_right`` indexed by node id.
  index : node id to cost.
  data : rows that reach this node; each row is indexed by feature number.
  """
  cost[index] = (innernodes[index] * cost_encoding_internalnodes
                 + leafnodes[index] * cost_encoding_leafnodes
                 + error_t * len(data)
                 + find_encoding(tree.threshold[index]))

  # A leaf has no split, so there is nothing to route and no child to visit.
  # Checking first also avoids indexing rows with a leaf's feature marker.
  if is_leaf(tree, index):
    return

  split_feature = tree.feature[index]
  split_threshold = tree.threshold[index]

  leftdata = []
  rightdata = []
  for row in data:
    # scikit-learn sends a sample to the left child when value <= threshold,
    # so a row sitting exactly on the threshold belongs on the left.
    # NOTE(review): scikit-learn compares float32-cast features; confirm
    # whether exact agreement on boundary values matters here.
    if row[split_feature] <= split_threshold:
      leftdata.append(row)
    else:
      rightdata.append(row)

  cal_cost(tree, tree.children_left[index], leftdata)
  cal_cost(tree, tree.children_right[index], rightdata)

# Not referenced by any code in the visible cells; cost_encoding_internalnodes
# (set further down in this cell) is assigned the same value.
# NOTE(review): the meaning of the constant 20 is not shown here - confirm.
L_test_internal = log2(20)
def pruning(tree, index):
  """Collapse subtrees top-down wherever a node costs less than its two children combined.

  Works on ``tree`` in place using the values in the module-level ``cost`` list.
  A collapsed node gets both child pointers set to ``TREE_LEAF`` and its former
  descendants are not visited. Any other internal node is kept and both of its
  children are examined in turn.
  """
  if is_leaf(tree, index):
    return

  left_child = tree.children_left[index]
  right_child = tree.children_right[index]
  children_cost = cost[left_child] + cost[right_child]

  if cost[index] < children_cost:
    # Cheaper as a single node: cut both branches off here.
    tree.children_left[index] = TREE_LEAF
    tree.children_right[index] = TREE_LEAF
    return

  # Split is worth keeping; look for prunable nodes further down.
  pruning(tree, left_child)
  pruning(tree, right_child)

# Pruning pipeline: count nodes, derive encoding costs, cost every subtree, prune.
# NOTE: pruning() edits decision_tree.tree_ in place, so re-running this cell
# operates on the already-pruned tree.
cost_encoding_internalnodes = log2(20)

X_train_copy = deepcopy(X_train)
dfs(decision_tree.tree_, 0, X_train_copy)

# The leaf count of the whole tree is only known once dfs() has run.
cost_encoding_leafnodes = log2(leafnodes[0])

cal_cost(decision_tree.tree_, 0, X_train_copy)
pruning(decision_tree.tree_, 0)


2.8085106382978724
13.255813953488373
5.076923076923077
187.2
-50.285714285714285
19.294117647058822
-16.5
10.523076923076923
-15.0
-17.5609756097561


In [11]:
# Same classifier object as before: pruning() modified its tree_ in place, so
# these metrics describe the pruned tree.
compute_metrics(decision_tree)

Accuracy  0.9344827586206896
Precision  0.9344827586206896
Recall  0.9344827586206896
Depth  14
