## PROJECT- Decision Tree Implementation
##### BY: ADITI DONA

In [1]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from math import log2

In [2]:
# Load the Iris dataset; its .data (features) and .target (class labels) are used below.
iris = datasets.load_iris()

In [3]:
# Feature matrix as a DataFrame with short column names
# (sl/sw = sepal length/width, pl/pw = petal length/width).
df = pd.DataFrame(iris.data, columns=["sl", "sw", "pl", "pw"])

In [4]:
#Function to find the label for a value
#if MIN_Value <= val < (MIN_Value + Mean_Value)/2 then it is assigned label a
#if (MIN_Value + Mean_Value)/2 <= val < Mean_Value then it is assigned label b
#if Mean_Value <= val < (Mean_Value + MAX_Value)/2 then it is assigned label c
#if (Mean_Value + MAX_Value)/2 <= val <= MAX_Value then it is assigned label d

def label(val, *boundaries):
    """Map a value to one of the bin labels 'a', 'b', 'c' or 'd'.

    The value is compared, in order, against the first three entries of
    ``boundaries`` and receives the label of the first one it is strictly
    below.  A value that is not below any of the three is labelled 'd'.
    """
    for position, tag in enumerate('abc'):
        if val < boundaries[position]:
            return tag
    return 'd'

#Function to convert continuous data into labelled data
#There are 4 labels - a, b, c, d
def toLabel(df, old_feature_name):
    """Discretise the column ``old_feature_name`` of ``df`` into labels 'a'-'d'.

    Three cut points are derived from the column itself: the midpoint between
    its minimum and its mean, the mean, and the midpoint between its mean and
    its maximum.  Every value is then passed to ``label`` with those cut points.

    Returns the labelled values as a new Series; ``df`` is not modified.
    """
    column = df[old_feature_name]
    mean_value = column.mean()
    lower_cut = (column.min() + mean_value)/2
    upper_cut = (column.max() + mean_value)/2
    return column.apply(label, args=(lower_cut, mean_value, upper_cut))

In [5]:
#Add a '<name>_labeled' column holding the a/b/c/d label for every feature
for feature in ('sl', 'sw', 'pl', 'pw'):
    df[feature + '_labeled'] = toLabel(df, feature)
df

Unnamed: 0,sl,sw,pl,pw,sl_labeled,sw_labeled,pl_labeled,pw_labeled
0,5.1,3.5,1.4,0.2,b,c,a,a
1,4.9,3.0,1.4,0.2,a,b,a,a
2,4.7,3.2,1.3,0.2,a,c,a,a
3,4.6,3.1,1.5,0.2,a,c,a,a
4,5.0,3.6,1.4,0.2,a,c,a,a
...,...,...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,c,b,c,d
146,6.3,2.5,5.0,1.9,c,a,c,d
147,6.5,3.0,5.2,2.0,c,b,c,d
148,6.2,3.4,5.4,2.3,c,c,d,d


In [6]:
# Keep only the labelled columns; the continuous originals are no longer needed.
df = df.drop(columns=['sl', 'sw', 'pl', 'pw'])

In [7]:
# Rename the four remaining (labelled) columns by position: sl, sw, pl, pw.
# NOTE(review): 'saple' looks like a typo for 'sepal'; the names are left
# unchanged here because the recorded outputs below use them.
df.columns=['sapleLength','sapleWidth','petalLength','petalWidth']

In [8]:
# Preview the first ten rows of the labelled feature table.
df.head(10)

Unnamed: 0,sapleLength,sapleWidth,petalLength,petalWidth
0,b,c,a,a
1,a,b,a,a
2,a,c,a,a
3,a,c,a,a
4,a,c,a,a
5,b,d,a,a
6,a,c,a,a
7,a,c,a,a
8,a,b,a,a
9,a,c,a,a


In [9]:
# Class labels taken from the dataset (integer codes 0, 1 and 2 in the output below).
y=iris.target
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

## Entropy

In [10]:
def entropy(y):
    """Return the Shannon entropy (base 2) of the class labels in ``y``.

    ``y`` must support ``len`` and boolean-mask indexing (``y[y == c]``),
    e.g. a NumPy array or a pandas Series.  An empty ``y`` gives 0.0.
    """
    n_samples = len(y)
    result = 0.0
    # One term per distinct class: -p * log2(p), p being the class frequency.
    for cls in set(y):
        proportion = len(y[y == cls]) / n_samples
        result += -proportion * log2(proportion)
    return result

## Gain Ratio

In [11]:
def gainRatio(df,y,selectedFeature):
    """Return the gain ratio of splitting the current node on ``selectedFeature``.

    Parameters
    ----------
    df : DataFrame with the samples of the current node.  It must contain the
        column ``selectedFeature`` and a 'target' column with the class labels.
    y : class labels of the current node; used only for the entropy before
        the split (the per-branch entropies are read from ``df['target']``).
    selectedFeature : name of the column to evaluate.

    Returns
    -------
    Information gain divided by split information.  0.0 is returned when the
    split information is zero (the feature has a single value in this node,
    so it cannot split it) or when ``df`` has no rows.
    """
    # Entropy before the split
    initial_entropy=entropy(y)

    # Number of rows; len() avoids positional indexing into df.count()
    total=len(df)
    if total==0:
        return 0.0

    # Weighted entropy of the branches and split information
    final_entropy=0.0
    split_info=0.0
    for value in set(df[selectedFeature]):
        subset=df[df[selectedFeature]==value]
        if len(subset)==0:
            # e.g. a NaN value never equals itself; nothing to account for
            continue
        weight=len(subset)/total
        final_entropy+=weight*entropy(subset['target'])
        split_info-=weight*log2(weight)

    # A single-valued feature has zero split information: report "no gain"
    # instead of dividing 0 by 0.
    if split_info==0:
        return 0.0

    info_gain=initial_entropy-final_entropy
    return info_gain/split_info

In [12]:
# Function to print the details of a particular node using Y
def nodeDetails(y,level):
    """Print the level, the sample count of every class and the entropy of a node.

    ``y`` holds the class labels of the node and ``level`` its depth in the tree.
    """
    print('Level :',level)
    for cls in set(y):
        members=y[y==cls]
        print('Count of ',cls,' = ',len(members))
    print('Current entropy is : ',entropy(y))

## Function to build the tree

In [16]:
def build_tree(df,y,unused_features,level):
    """Recursively print a decision tree grown with the gain-ratio criterion.

    Parameters
    ----------
    df : DataFrame with the samples of the current node; it must contain the
        feature columns and a 'target' column with the class labels.
    y : class labels of the samples in ``df``.
    unused_features : collection of feature names that have not been used on
        the path from the root to this node.  It is not modified; every child
        receives its own reduced copy, so a feature consumed in one branch
        stays available to sibling branches.
    level : depth of the current node (the root is 0).

    Returns None; the tree is reported through print().
    """
# Base case 1: y contains only one distinct value -----> pure node reached
    if len(set(y))==1:
        nodeDetails(y,level)  # Print the details
        print('Reached Leaf Node')
        print()
        return

# Base case 2: unused_features is empty -----> no more splitting possible
    if(len(unused_features)==0):
        nodeDetails(y,level)  # Print the details
        print('No more splitting possible')
        print()
        return

# Pick the unused feature with the highest (strictly positive) gain ratio
    nodeDetails(y,level)                    # Print the node details
    best_feature=None                       # Stays None if no feature helps
    maxGainRatio=0                          # To store the max gain ratio
    for feature in unused_features:
        feature_gain=gainRatio(df,y,feature)
        if(feature_gain>maxGainRatio):
            maxGainRatio=feature_gain
            best_feature=feature

# Base case 3: no remaining feature separates the classes of this node
    if best_feature is None:
        print('No more splitting possible')
        print()
        return

# Print the best feature and the gain ratio obtained
    print('Splitting on feature ','"',best_feature,'"',' with gain ratio ',maxGainRatio)
    print()

# Features left for the children: a copy, so neither the caller's collection
# nor the sibling branches are affected
    remaining_features=set(unused_features)
    remaining_features.discard(best_feature)

# One child per distinct value of the selected feature
    for value in set(df[best_feature]):
        child_df=df[df[best_feature]==value]
        child_y=child_df['target']
        build_tree(child_df,child_y,remaining_features,level+1)  # Recursive call

In [17]:
def main(df,y):
    """Attach the class labels to the feature table and print the decision tree.

    Note: a 'target' column holding ``y`` is added to (or overwritten in) the
    ``df`` that is passed in.
    """
    # build_tree expects features and target together in one frame
    df['target']=y

    # Every column except the target is a candidate feature for splitting
    unused_features = {column for column in df.columns if column != 'target'}

    # Start the recursion at the root (level 0)
    build_tree(df,y,unused_features,0)

In [18]:
# Build and print the decision tree for the labelled features and their classes.
main(df,y)

Level : 0
Count of  0  =  50
Count of  1  =  50
Count of  2  =  50
Current entropy is :  1.584962500721156
Splitting on feature  " petalWidth "  with gain ratio  0.699638203622209

Level : 1
Count of  2  =  34
Current entropy is :  0.0
Reached Leaf Node

Level : 1
Count of  0  =  50
Current entropy is :  0.0
Reached Leaf Node

Level : 1
Count of  1  =  40
Count of  2  =  16
Current entropy is :  0.863120568566631
Splitting on feature  " petalLength "  with gain ratio  0.4334099495621067

Level : 2
Count of  2  =  8
Current entropy is :  0.0
Reached Leaf Node

Level : 2
Count of  1  =  39
Count of  2  =  8
Current entropy is :  0.6581912658132185
Splitting on feature  " sapleLength "  with gain ratio  0.12674503775809332

Level : 3
Count of  2  =  1
Current entropy is :  0.0
Reached Leaf Node

Level : 3
Count of  1  =  23
Count of  2  =  7
Current entropy is :  0.783776947484701
Splitting on feature  " sapleWidth "  with gain ratio  0.07092036405148876

Level : 4
Count of  1  =  3
Count