# Decision Tree (self made)

### 1. Importing libraries

In [17]:
import numpy as np
import matplotlib.pyplot as plt
import os
import pandas as pd
from sklearn.metrics import r2_score
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn import metrics

### 2. Data Preprocessing

In [18]:
pima = pd.read_csv("diabetes.csv")

pima.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [19]:
pima.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [20]:
pima.loc[1]

Pregnancies                  1.000
Glucose                     85.000
BloodPressure               66.000
SkinThickness               29.000
Insulin                      0.000
BMI                         26.600
DiabetesPedigreeFunction     0.351
Age                         31.000
Outcome                      0.000
Name: 1, dtype: float64

In [21]:
all_cols = list(pima.columns)

output_cols = all_cols[-1]

### 3. Initialization and Setup

In [22]:
def entropy(col):
    
    counts = np.unique(col,return_counts=True)
    sz = float(col.shape[0])
    
    ent = 0.0
    
    for ix in counts[1]:
        p  = ix/sz
        ent += (-1.0*p*np.log2(p))
    
    return ent
    
def divide_data(x_data,fkey,fval):
    
    x_right = pd.DataFrame([],columns=x_data.columns)
    x_left = pd.DataFrame([],columns=x_data.columns)
    
    for ix in range(x_data.shape[0]):
        val = x_data[fkey].loc[ix]
        
        if val > fval:
            x_right = x_right.append(x_data.loc[ix])
        else:
            x_left = x_left.append(x_data.loc[ix])
            
    return x_left,x_right

def information_gain(x_data,fkey,fval):
    
    left,right = divide_data(x_data,fkey,fval)
    
    l = float(left.shape[0])/x_data.shape[0]
    r = float(right.shape[0])/x_data.shape[0]
    
    if left.shape[0] == 0 or right.shape[0] ==0:
        return 1000000 
    
    i_gain = entropy(x_data.Outcome) - (l*entropy(left.Outcome)+r*entropy(right.Outcome))
    return i_gain




### 4. Decision Tree Class

In [23]:
class DecisionTree:
    
    #Constructor
    def __init__(self,depth=0,max_depth=5):
        self.left = None
        self.right = None
        self.fkey = None
        self.fval = None
        self.max_depth = max_depth
        self.depth = depth
        self.target = None
        
    def train(self,X_train):
        
        features = all_cols[:-1]
        info_gains = []
    
        for ix in features:
            i_gain = information_gain(X_train,ix,X_train[ix].mean())
            info_gains.append(i_gain)
            
        self.fkey = features[np.argmax(info_gains)]
        self.fval = X_train[self.fkey].mean()
        print("The chosen tree feature is: ",self.fkey)
        
        #Split Data
        data_left,data_right = divide_data(X_train,self.fkey,self.fval)
        data_left = data_left.reset_index(drop=True)
        data_right = data_right.reset_index(drop=True)
         
        #pure population found on one node
        if data_left.shape[0]  == 0 or data_right.shape[0] ==0:
            if X_train.Outcome.mean() >= 0.5:
                self.target = 1
            else:
                self.target = 0
            return
        
        #Stop earyly when depth >=max depth
        if(self.depth>=self.max_depth):
            if X_train.Outcome.mean() >= 0.5:
                self.target = 1
            else:
                self.target = 0
            return
        
        #Recursive Case
        self.left = DecisionTree(depth=self.depth+1,max_depth=self.max_depth)
        self.left.train(data_left)
        
        self.right = DecisionTree(depth=self.depth+1,max_depth=self.max_depth)
        self.right.train(data_right)
        
        
        if X_train.Outcome.mean() >= 0.5:
            self.target = 1
        else:
            self.target = 0
        return
    
    def predict(self,test):
        
        #print(self.fkey)
        if test[self.fkey]>self.fval:
            
            if self.right is None:
                return self.target
            return self.right.predict(test)
        else:
            
            if self.left is None:
                return self.target
            return self.left.predict(test)

### 5. Train-Test Split

In [24]:
split = int(0.7*pima.shape[0])
train_data = pima[:split]
test_data = pima[split:]
test_data = test_data.reset_index(drop=True)

In [25]:
print(train_data.shape,test_data.shape)

(537, 9) (231, 9)


### 6. Training

In [26]:
dt = DecisionTree()

In [27]:
dt.train(train_data)

The chosen tree feature is:  Glucose
The chosen tree feature is:  Age
The chosen tree feature is:  BMI
The chosen tree feature is:  Pregnancies
The chosen tree feature is:  Pregnancies
The chosen tree feature is:  Pregnancies
The chosen tree feature is:  Pregnancies
The chosen tree feature is:  Glucose
The chosen tree feature is:  Pregnancies
The chosen tree feature is:  SkinThickness
The chosen tree feature is:  Age
The chosen tree feature is:  Glucose
The chosen tree feature is:  BMI
The chosen tree feature is:  SkinThickness
The chosen tree feature is:  DiabetesPedigreeFunction
The chosen tree feature is:  Pregnancies
The chosen tree feature is:  Insulin
The chosen tree feature is:  Glucose
The chosen tree feature is:  SkinThickness
The chosen tree feature is:  BloodPressure
The chosen tree feature is:  Insulin
The chosen tree feature is:  Pregnancies
The chosen tree feature is:  Insulin
The chosen tree feature is:  Insulin
The chosen tree feature is:  Insulin
The chosen tree featur

In [28]:
y_actual = test_data[output_cols]
y_actual = np.array(y_actual)

print(type(y_actual))

<class 'numpy.ndarray'>


### 7. Prediction

In [29]:
y_pred = []
for ix in range(test_data.shape[0]):
    y_pred.append(dt.predict(test_data.loc[ix]))

In [30]:
y_pred = np.array(y_pred).reshape((-1,1))
print(y_pred.shape)
print(type(y_pred))

(231, 1)
<class 'numpy.ndarray'>


### 8. Scoring

In [31]:
acc = 0

sz = y_pred.shape[0]

for i in range(sz):
    
    if y_pred[i][0] == y_actual[i] :
        acc = acc + 1
        
acc = acc / sz

In [32]:
print(acc)

0.7532467532467533
