In [1]:
from sklearn import datasets
import pandas as pd
import numpy as np
import queue

In [2]:
iris = datasets.load_iris()

In [3]:
df = pd.DataFrame(iris.data)
df.columns = ["sl", "sw", 'pl', 'pw']

In [4]:
#Function to find label for a value
#if MIN_Value <=val < (m + Mean_Value) / 2 then it is assigned label a
#if (m + Mean_Value) <=val < Mean_Value then it is assigned label b
#if (Mean_Value) <=val < (Mean_Value + MAX_Value)/2 then it is assigned label c
#if (Mean_Value + MAX_Value)/2 <=val <= MAX_Value  then it is assigned label d

def label(val, *boundaries):
    if (val < boundaries[0]):
        return 'a'
    elif (val < boundaries[1]):
        return 'b'
    elif (val < boundaries[2]):
        return 'c'
    else:
        return 'd'

#Function to convert a continuous data into labelled data
#There are 4 lables  - a, b, c, d
def toLabel(df, old_feature_name):
    second = df[old_feature_name].mean()
    minimum = df[old_feature_name].min()
    first = (minimum + second)/2
    maximum = df[old_feature_name].max()
    third = (maximum + second)/2
    return df[old_feature_name].apply(label, args= (first, second, third))

In [5]:
#Convert all columns to labelled data
df['sl_labeled'] = toLabel(df, 'sl')
df['sw_labeled'] = toLabel(df, 'sw')
df['pl_labeled'] = toLabel(df, 'pl')
df['pw_labeled'] = toLabel(df, 'pw')

In [6]:
df.drop(['sl', 'sw', 'pl', 'pw'], axis = 1, inplace = True)

In [7]:
set(df['sl_labeled'])

{'a', 'b', 'c', 'd'}

In [8]:
class tree:
    def __init__(self,x,y,z,isleaf,f = ''):
        self.count1 = x                     ## Count of Iris-Setosa(0)
        self.count2 = y                     ## Count of Iris-Versicolor(1)
        self.count3 = z                     ## Count of Iris-Virginica(2)
        self.isleaf  = isleaf               ## For checking if the node is leaf or not
        self.feature_selected = f
        self.children = []                  ## Storing children of a node
    
    def add_children(self,child):
        self.children.append(child)

In [9]:
## Calculate split info i.e entropy after the split
def split(df,f,y):
    split_info = 0
    unique_feature_values = set(df[f])
    for i in unique_feature_values:
        split_info += (len(df[df[f] == i])/len(df))*entropy(df[df[f] == i],y[df[f] == i])
    return split_info

In [10]:
## Calculate Entropy
def entropy(df,y):
    entr = 0
    for i in set(y.Predicted):
        entr += -(len(y[y["Predicted"] == i])/len(df))*(np.log2(len(y[y["Predicted"] == i])/len(df)))
    return entr

In [11]:
## Calculate gain entropy
def gain(df,f,y):
    total = len(df)
    entr = entropy(df,y)
    split_info = split(df,f,y)
    
    gain = entr/split_info
    return f,gain,entr

In [12]:
## Calculate on which particular feature to split the dataset
def split_feature(df,features,y):
    gain_of_feature = 0
    for i in features:
        selected_feature,gain_ratio,entr = gain(df,i,y)
        if gain_of_feature < gain_ratio:
            gain_of_feature = gain_ratio
            feature_to_split_upon = selected_feature
    
    features.remove(feature_to_split_upon)
    return feature_to_split_upon,gain_of_feature,entr

In [13]:
## Combined function to implement the tree and
## printing the output in required format

def build_tree(df, y,features,n):
    ## Checking for leaf node
    ## Condition 1:  If there are no more features to split upon
    ## Condition 2:  If the node is a pure node i.e contains only one type of data
    if(len(features) == 0 or len(set(y.Predicted)) == 1 ):  
        print("Level ",n)
        count1,count2,count3 = 0,0,0
        if len(y[y.Predicted == 0].Predicted.value_counts().values) != 0:
            print("Count of Iris-Setosa = ",y[y.Predicted == 0].Predicted.value_counts().values[0])
            count1 = y[y.Predicted == 0].Predicted.value_counts().values[0]
        else:
            print("Count of Iris-Setosa = 0")
        if len(y[y.Predicted == 1].Predicted.value_counts().values) != 0:
            print("Count of Iris-versicolor = ",y[y.Predicted == 1].Predicted.value_counts().values[0])
            count2 = y[y.Predicted == 1].Predicted.value_counts().values[0]
        else:
            print("Count of Iris-Versicolor = 0")
        if len(y[y.Predicted == 2].Predicted.value_counts().values) != 0:
            print("Count of Iris-Virginica = ",y[y.Predicted == 2].Predicted.value_counts().values[0])
            count3 = y[y.Predicted == 2].Predicted.value_counts().values[0]
        else:
            print("Count of Iris-Virginica = 0")
        print("Reached Leaf Node")
        print()
        return tree(count1,count2,count3,True)
        
    feature_to_split_upon,gain_of_feature,info_gain = split_feature(df,features,y)
    
    l,m,k = 0,0,0
    print("Level ",n)
    if len(y[y.Predicted == 0].Predicted.value_counts().values) != 0:
        print("Count of Iris-Setosa = ",y[y.Predicted == 0].Predicted.value_counts().values[0])
        l = y[y.Predicted == 0].Predicted.value_counts().values[0]
    else:
        print("Count of Iris-Setosa = 0")
    if len(y[y.Predicted == 1].Predicted.value_counts().values) != 0:
        print("Count of Iris-Versicolor = ",y[y.Predicted == 1].Predicted.value_counts().values[0])
        m = y[y.Predicted == 1].Predicted.value_counts().values[0]
    else:
        print("Count of Iris-Versicolor = 0")
    if len(y[y.Predicted == 2].Predicted.value_counts().values) != 0:
        print("Count of Iris-Virginica = ",y[y.Predicted == 2].Predicted.value_counts().values[0])
        k = y[y.Predicted == 2].Predicted.value_counts().values[0]
    else:
        print("Count of Iris-Virginica = 0")
    
    node = tree(l,m,k,False,feature_to_split_upon)
    print("Current Entropy is :",info_gain)
    print("Splitting on feature ",feature_to_split_upon," with gain ratio",gain_of_feature)
    
    print()

    for i in set(df[feature_to_split_upon]):
        listtemp = list(features)
        sub_tree = build_tree(df[df[feature_to_split_upon] == i],y[df[feature_to_split_upon] == i],listtemp,n+1)
        node.add_children([sub_tree,i])
        
    return node

In [14]:
y = pd.DataFrame(iris.target,columns = ["Predicted"])
unused_features = list(df.columns)
root = build_tree(df, y, unused_features,0)

Level  0
Count of Iris-Setosa =  50
Count of Iris-Versicolor =  50
Count of Iris-Virginica =  50
Current Entropy is : 1.584962500721156
Splitting on feature  pw_labeled  with gain ratio 4.918704784013118

Level  1
Count of Iris-Setosa = 0
Count of Iris-versicolor =  10
Count of Iris-Virginica = 0
Reached Leaf Node

Level  1
Count of Iris-Setosa = 0
Count of Iris-Versicolor = 0
Count of Iris-Virginica =  34
Reached Leaf Node

Level  1
Count of Iris-Setosa =  50
Count of Iris-Versicolor = 0
Count of Iris-Virginica = 0
Reached Leaf Node

Level  1
Count of Iris-Setosa = 0
Count of Iris-Versicolor =  40
Count of Iris-Virginica =  16
Current Entropy is : 0.863120568566631
Splitting on feature  pl_labeled  with gain ratio 1.5624622032059474

Level  2
Count of Iris-Setosa = 0
Count of Iris-versicolor =  1
Count of Iris-Virginica = 0
Reached Leaf Node

Level  2
Count of Iris-Setosa = 0
Count of Iris-Versicolor = 0
Count of Iris-Virginica =  8
Reached Leaf Node

Level  2
Count of Iris-Setosa = 0

In [15]:
def print_tree(root):
    if root is None:
        return
    t = 0
    q = queue.Queue(maxsize = 100)   
    q.put([root,t])
    while not q.empty(): 
        n = q.qsize()
        while n > 0: 
            p = q.get()
            if(p[0].isleaf):
                out = max(p[0].count1,max(p[0].count2,p[0].count3))
                if out == p[0].count1:
                    print("Node:",p[1],"Leaf Node with Output:Iris-Setosa(0)")
                if out == p[0].count2:
                    print("Node:",p[1],"Leaf Node with Output:Iris-Versicolor(1)")
                if out == p[0].count3:
                    print("Node:",p[1],"Leaf Node with Output:Iris-Virginica(2)")
                print()
            else:
                print("Node:",p[1],"Feature on which it is splitted: " ,p[0].feature_selected)
                print("Acc. to the values of this feature its children are : ",end ="")
                for i in range(len(p[0].children)): 
                    t = t + 1
                    print("Node",t,":",p[0].children[i][1],",",end = " ")
                    q.put([p[0].children[i][0],t])
                    
                print()
                print()
            n = n - 1 
        
    

In [16]:
print_tree(root)

Node: 0 Feature on which it is splitted:  pw_labeled
Acc. to the values of this feature its children are : Node 1 : b , Node 2 : d , Node 3 : a , Node 4 : c , 

Node: 1 Leaf Node with Output:Iris-Versicolor(1)

Node: 2 Leaf Node with Output:Iris-Virginica(2)

Node: 3 Leaf Node with Output:Iris-Setosa(0)

Node: 4 Feature on which it is splitted:  pl_labeled
Acc. to the values of this feature its children are : Node 5 : b , Node 6 : d , Node 7 : c , 

Node: 5 Leaf Node with Output:Iris-Versicolor(1)

Node: 6 Leaf Node with Output:Iris-Virginica(2)

Node: 7 Feature on which it is splitted:  sl_labeled
Acc. to the values of this feature its children are : Node 8 : c , Node 9 : b , Node 10 : a , Node 11 : d , 

Node: 8 Feature on which it is splitted:  sw_labeled
Acc. to the values of this feature its children are : Node 12 : b , Node 13 : a , Node 14 : c , 

Node: 9 Leaf Node with Output:Iris-Versicolor(1)

Node: 10 Leaf Node with Output:Iris-Virginica(2)

Node: 11 Leaf Node with Output:Ir