In [1]:
from sklearn import datasets
import pandas as pd
import math as m
import numpy as np

In [112]:
#load data
iris = datasets.load_iris()

In [113]:
#convert loaded data into dataframe
#and change the name of the columns
df = pd.DataFrame(iris.data)
df.columns = ['sl', 'sw', 'pl', 'pw']

In [114]:
#Function to find label for a value
#if MIN_Value <=val < (m + Mean_Value) / 2 then it is assigned label a
#if (m + Mean_Value) <=val < Mean_Value then it is assigned label b
#if (Mean_Value) <=val < (Mean_Value + MAX_Value)/2 then it is assigned label c
#if (Mean_Value + MAX_Value)/2 <=val <= MAX_Value  then it is assigned label d

def label(val, *boundaries):
    if (val < boundaries[0]):
        return 'a'
    elif (val < boundaries[1]):
        return 'b'
    elif (val < boundaries[2]):
        return 'c'
    else:
        return 'd'

#Function to convert a continuous data into labelled data
#There are 4 lables  - a, b, c, d
def toLabel(df, o):
    second = df[o].mean()
    minimum = df[o].min()
    first = (minimum + second)/2
    maximum = df[o].max()
    third = (maximum + second)/2
    return df[o].apply(label, args= (first, second, third))

In [115]:
#Convert all columns to labelled data
df['sl_labeled'] = toLabel(df, 'sl')
df['sw_labeled'] = toLabel(df, 'sw')
df['pl_labeled'] = toLabel(df, 'pl')
df['pw_labeled'] = toLabel(df, 'pw')

In [116]:
#drop all columns with numerical values
df.drop(['sl', 'sw', 'pl', 'pw'], axis = 1, inplace = True)

In [117]:
set(df['sl_labeled'])

{'a', 'b', 'c', 'd'}

In [118]:
(df['pw_labeled']=='d').sum()

34

In [119]:
#define function en for calculation of entropy
def en(y):
    pro = 0
    
    #loop over all values of y['class']
    for i in set(y['class']):
        a1 = (y['class']==i) 
        
        #step for calculating entropy 
        #for each label in y['class']
        pro += (len(y[a1])/len(y))*(m.log((len(y[a1])/len(y)),2))
    #check the value of pro("net entropy")
    #if pro is 0 then return pro
    #because log base 2 will return positive number
    #else return -pro
    if pro==0:
        return pro
    else:
        return (-1)*pro

In [120]:
#define class for implementation of decision tree

class Tree(object):
    
    #constructor for tree class
    def __init__(self, df):
        self.data = len(df)
        for i in set(df):
            self.i = None
            
    #set value function to print values of each step
    #and to return the actual value for final tree
    def setchildvalues(self,df): 
        print(self.data)
        for i in set(df):
            self.i = (df==i).sum()
            print(i,":",self.i,end= ",")
            
        li = []
        for i in set(df):
            li.append((df==i).sum())
        print('\n')
        return self.data,li
   

In [124]:
#displayTree function final tree
#taking list as an argument
def displayTree(li):
    print("-----FINAL TREE-----")
    for i in range(len(li)):
        print("Root ",li[i][0],'\n')
        print("Child ",li[i][1])
        print('\n')

In [122]:
def build_tree(df, y, unused_columns,li,level):
    
    #determine value of total number of 0,1,2
    #in passed y in build_tree function
    count0,count1,count2 = 0,0,0
    a = (np.array(y)).ravel()
    for i in range(a.shape[0]):
        if a[i]==0:
            count0+=1
        elif a[i]==1:
            count1+=1
        else:
            count2+=1
        
    
    #base case
    # 1. unused_columns is empty
    # 2. y contains only one distinct value
    if (len(set(y['class']))==1) or (len(unused_columns)==0):
        
        #print out the required information at that level
        print("level : ",level)
        print("count of 0 : ",count0)
        print("count of 1 : ",count1)
        print("count of 2 : ",count2)
        print("entropy : ",en(y))
        print("reached leaf node")
        print('\n')
        #update value at each level
        level += 1
        
    else:
        best_feature = ""
        max_gain = 0
        for f in unused_columns:
            possible_values = set(df[f])
            entropy = 0
            split = 0
            for i in possible_values:
                a = (df[f]==i)
                # loop over possible values : val
                # find subset of df & y with f == val
                #calculate net value of entropy and split info
                #at each labeled value
                entropy += (len(df[a])/len(df[f]))*en(y[a])
                split += (len(df[a])/len(df[f]))*(m.log((len(df[a])/len(df[f])),2))
            split *= (-1)
            
            #calculate value of gr and gi for each column
            #gi stand for gain info 
            #gr stand for gain ratio
            gi = en(y) - entropy
            gr = gi/split
            
            #compare max_gain with calculated gain at each label value
            #update max gain if calculate gain is more 
            if gr>max_gain:
                max_gain = gr
                best_feature = f
                
        #print out all the required value
        print("Best Feature ", best_feature)
        print("level : ", level)
        print("count of 0 : ",count0)
        print("count of 1 : ",count1)
        print("count of 2 : ",count2)
        print("entropy : ",en(y))
        print("gain ratio :",gr)
        print('\n')
        
        #update value of level
        level += 1
        
        #root object for Tree class 
        #append value in the li list and print each split
        #for best_feature
        root = Tree(df[best_feature])
        li.append(root.setchildvalues(df[best_feature]))
        
        # remove best feature from unused features
        # loop over possible values of best feature
        # call build tree recursively for each label value
        unused_columns = unused_columns - {best_feature}  
        for j in set(df[best_feature]):
            b = (df[best_feature]==j)
            build_tree(df[b],y[b],unused_columns,li,level)   
            
    #return final list
    return li

In [125]:
y = pd.DataFrame(iris.target)
y.columns = ['class']
unused_columns = set(df.columns)
li = []
#first call to build_tree function with level=0
#d receive the final list from build_tree function
d = build_tree(df, y, unused_columns,li,level=0)

#function call for displayTree 
#with final list passed as argument
displayTree(d)

Best Feature  pw_labeled
level :  0
count of 0 :  50
count of 1 :  50
count of 2 :  50
entropy :  1.584962500721156
gain ratio : 0.1729048730313931


150
d : 34,c : 56,b : 10,a : 50,

level :  1
count of 0 :  0
count of 1 :  0
count of 2 :  34
entropy :  0.0
reached leaf node


Best Feature  pl_labeled
level :  1
count of 0 :  0
count of 1 :  40
count of 2 :  16
entropy :  0.863120568566631
gain ratio : 0.0072109375996113525


56
d : 8,c : 47,b : 1,

level :  2
count of 0 :  0
count of 1 :  0
count of 2 :  8
entropy :  0.0
reached leaf node


Best Feature  sl_labeled
level :  2
count of 0 :  0
count of 1 :  39
count of 2 :  8
entropy :  0.6581912658132185
gain ratio : 0.04553496474578601


47
d : 2,c : 30,b : 14,a : 1,

level :  3
count of 0 :  0
count of 1 :  2
count of 2 :  0
entropy :  0.0
reached leaf node


Best Feature  sw_labeled
level :  3
count of 0 :  0
count of 1 :  23
count of 2 :  7
entropy :  0.783776947484701
gain ratio : 0.07092036405148876


30
c : 6,b : 20,a : 4,

lev