In [1]:
import numpy as np

# Part 1 of project: 

# `Printing the Decision Tree`

## functions used :

### 1. ````DT :```` main function to print all the details
### 2. ````find_info_gain :````  to find information gain
### 3. ````find_split_info:````  to get split info
### 4. ````find_best_feature :```` to find best feature to split on based on best gain ratio
### 5. ````entropy :````  to get entropy 


In [2]:
def DT(X,Y,features,level):
    """Recursively print a gain-ratio decision tree for the given data.

    For every node the level, the count of each class, and the entropy are
    printed. Internal nodes additionally print the feature chosen for the
    split and its gain ratio; leaves print why the recursion stopped.

    Parameters
    ----------
    X : 2-D array indexed as ``X[:, feature]`` and by boolean row masks.
    Y : 1-D array of class labels; must support ``(Y == label).sum()``.
    features : array of column indices of X that may still be split on.
    level : depth of the current node (the caller passes 0 for the root).

    Returns
    -------
    None. The tree is only printed, never stored.
    """
    # Unique output classes present in the current node
    classes = set(Y)

    # Pure node: a single class is left, so the entropy is zero by definition
    if len(classes) == 1:
        print("Level ",level)
        print("Count of ",list(classes)[0]," = ",len(Y))
        print("Current Entropy is = 0.0")
        print("Reached leaf Node")
        print()
        return

    # Evaluate the candidate splits exactly once per node. A best feature of
    # -1 means "no usable split": either no features are left, or every
    # remaining feature takes a single value in this node.
    if len(features) == 0:
        best_feature, gain_ratio = -1, -1
    else:
        best_feature, gain_ratio = find_best_feature(X,Y,features)

    # Header shared by impure leaves and internal nodes
    print("Level ",level)
    for current_class in classes:
        print("Count of ",current_class," = ",(Y == current_class).sum())
    print("Current Entropy is = ",entropy(Y))

    # Impure leaf: nothing left to split on
    if best_feature == -1:
        # The same message is printed for both "no usable split" causes
        print("No more features left")
        print("Reached leaf Node")
        print()
        return

    print("Splitting on feature",best_feature,"with gain ratio :",gain_ratio)
    print()

    # The features handed to the children do not depend on the branch value,
    # so they are computed once instead of once per child.
    remaining_features = np.delete(features,np.where(features == best_feature))

    # One child per distinct value of the best feature in the current data
    best_column = X[:,best_feature]
    for current in set(best_column):
        rows = (best_column == current)
        DT(X[rows],Y[rows],remaining_features,level + 1)

    return
            

In [3]:
def find_info_gain(X,Y,feature):
    """Return the information gain obtained by splitting on ``feature``.

    The gain is the entropy of ``Y`` minus the size-weighted average entropy
    of the subsets of ``Y`` induced by each distinct value in column
    ``feature`` of ``X``.
    """
    parent_entropy = entropy(Y)
    n_samples = len(Y)
    column = X[:,feature]

    # Accumulate the entropy of every partition, weighted by its share of rows
    children_entropy = 0
    for value in set(column):
        subset = Y[(column == value)]
        children_entropy += (entropy(subset)*len(subset))/n_samples

    return parent_entropy - children_entropy

In [4]:
def find_split_info(X, Y,feature):
    """Return the split information of column ``feature`` of ``X``.

    This is the entropy of the partition sizes: minus the sum over every
    distinct value of the feature of ``p * log2(p)``, where ``p`` is the
    fraction of the ``len(Y)`` rows that take that value.
    """
    n_samples = len(Y)
    column = X[:,feature]

    accumulated = 0
    for value in set(column):
        # Fraction of the rows that fall into this branch
        fraction = len(Y[(column == value)]) / n_samples
        accumulated += (fraction * np.log2(fraction))

    return (-1 * accumulated)

In [5]:
def find_best_feature(X, Y,features):
    """Pick the feature with the highest gain ratio.

    Returns a ``(feature, gain_ratio)`` tuple. A feature whose split info is
    zero is scored -1, and because only a strictly greater ratio replaces the
    current best, ``(-1, -1)`` is returned when no feature scores above -1
    (including when ``features`` is empty).
    """
    winner, winner_ratio = -1, -1

    for candidate in features:
        split = find_split_info(X, Y,candidate)
        gain = find_info_gain(X, Y,candidate)

        # Gain ratio is undefined for a zero split info; use the sentinel -1
        ratio = -1 if split == 0 else gain/split

        if ratio > winner_ratio:
            winner, winner_ratio = candidate, ratio

    return (winner,winner_ratio)

In [6]:
def entropy(Y):
    """Return the Shannon entropy (base 2) of the labels in ``Y``.

    ``Y`` must support ``(Y == label).sum()``, i.e. behave like a NumPy
    array. An empty ``Y`` has no classes to sum over and yields 0.
    """
    n_samples = len(Y)

    accumulated = 0
    for label in set(Y):
        # Relative frequency of this label among all samples
        p = (Y == label).sum()/n_samples
        accumulated += (p * np.log2(p))

    return (-1 * accumulated)

In [7]:
# Loading and Manipulation of  Iris Data

from sklearn import datasets
iris = datasets.load_iris()
# Work on a copy: the features are discretized in place further down, and
# aliasing iris.data would silently overwrite the loaded measurements too.
x = iris.data.copy()
y = iris.target

# To convert values of features into discrete values
def cont_to_discrete(feature):
    """Bucket a continuous feature into the codes 0, 1, 2, 3 **in place**.

    The thresholds are derived from the feature mean ``m``:

    * ``value < 0.5 * m``  -> 0
    * ``value < m``        -> 1
    * ``value < 1.5 * m``  -> 2
    * otherwise            -> 3

    Parameters
    ----------
    feature : 1-D NumPy array (may be a view such as ``x[:, i]``). Its
        contents are overwritten with the bucket codes, so passing a view
        also modifies the array the view belongs to.

    Returns
    -------
    The same ``feature`` object that was passed in.
    """
    LOW_FACTOR, HIGH_FACTOR = 0.5, 1.5

    # All thresholds are fixed before any value is overwritten
    second = feature.mean()
    first = second * LOW_FACTOR
    third = second * HIGH_FACTOR

    # np.select takes the first matching condition, mirroring an
    # if / elif / elif / else chain evaluated on the original values.
    codes = np.select(
        [feature < first, feature < second, feature < third],
        [0, 1, 2],
        default=3,
    )

    # Slice assignment keeps the in-place contract (views stay connected)
    feature[:] = codes
    return feature

# Discretize every feature column in place; each row of x.T is a view onto
# one column of x, so the changes land in x itself.
for column in x.T:
    cont_to_discrete(column)

# Column indices of all features that are available for splitting
features = np.arange(x.shape[1])




### `Output of Part 1 of Project ( Printing the Decision Tree )` 

In [8]:
# Calling the function to print our Decision Tree
DT(x,y,features,0)

Level  0
Count of  0  =  50
Count of  1  =  50
Count of  2  =  50
Current Entropy is =  1.584962500721156
Splitting on feature 3 with gain ratio : 0.7350016280496154

Level  1
Count of  0  =  49
Current Entropy is = 0.0
Reached leaf Node

Level  1
Count of  0  =  1
Count of  1  =  10
Current Entropy is =  0.4394969869215134
Splitting on feature 1 with gain ratio : 1.0

Level  2
Count of  1  =  10
Current Entropy is = 0.0
Reached leaf Node

Level  2
Count of  0  =  1
Current Entropy is = 0.0
Reached leaf Node

Level  1
Count of  1  =  39
Count of  2  =  5
Current Entropy is =  0.5107878229540133
Splitting on feature 2 with gain ratio : 0.2488471906913508

Level  2
Count of  1  =  1
Current Entropy is = 0.0
Reached leaf Node

Level  2
Count of  1  =  38
Count of  2  =  4
Current Entropy is =  0.4537163391869448
Splitting on feature 1 with gain ratio : 0.04070432026142338

Level  3
Count of  1  =  31
Count of  2  =  4
Current Entropy is =  0.512709142030877
Splitting on feature 0 with gai

# Part 2 of Project

# `BUILDING THE DECISION TREE`

## functions used:

### 1. `findMaxClass :` to find class with majority of data
### 2. `buildDT :` to build the decision tree and return the root of the resultant Decision Tree
### 3. `printTree :` to print the details of all Nodes given a Root Node of a Decision Tree

## classes used:

### `Node :` this class defines a tree node that stores its entropy, sample count, per-class counts, children, current prediction, split feature and gain ratio

In [9]:
# this function would find the class with maximum count
def findMaxClass(Y):
    """Return the label that occurs most often in ``Y``.

    ``Y`` must support ``(Y == label).sum()``. When several labels share the
    highest count, the one met first while iterating ``set(Y)`` is returned.
    ``None`` is returned for an empty ``Y``.
    """
    return max(set(Y), key=lambda label: (Y == label).sum(), default=None)

In [10]:
def buildDT(X,Y,features):
    """Recursively build a gain-ratio decision tree and return its root Node.

    Parameters
    ----------
    X : 2-D array indexed as ``X[:, feature]`` and by boolean row masks.
    Y : 1-D array of class labels; must support ``(Y == label).sum()``.
    features : array of column indices of X that may still be split on.

    Returns
    -------
    Node
        A leaf when the node is pure, when no features are left, or when no
        remaining feature can split the data (find_best_feature returns -1);
        otherwise an internal node with one child per distinct value of the
        chosen feature.
    """
    classes = set(Y)

    # Pure node: only one class is present
    if len(classes) == 1:
        leafNode = Node()
        leafNode.entropy = 0
        leafNode.data = len(Y)
        # 'prediction' duplicates 'current_prediction'; it is kept because
        # this branch has always set it and external code may read it.
        leafNode.prediction = Y[0]
        leafNode.class_split[Y[0]] = len(Y)
        leafNode.current_prediction = Y[0]
        return leafNode

    # Every impure node, leaf or internal, carries the same statistics
    node = Node()
    node.entropy = entropy(Y)
    node.data = len(Y)
    node.current_prediction = findMaxClass(Y)  # majority class
    for label in classes:
        node.class_split[label] = (Y == label).sum()

    # Impure leaf: no feature is left to split on
    if len(features) == 0:
        return node

    # Evaluate the candidate splits once per node. A best feature of -1
    # means every remaining feature takes a single value in this node.
    best_feature,gain_ratio = find_best_feature(X,Y,features)
    if best_feature == -1:
        return node

    node.split_feature = best_feature
    node.gain_ratio = gain_ratio

    # The features handed to the children do not depend on the branch value,
    # so they are computed once instead of once per child.
    remaining_features = np.delete(features,np.where(features == best_feature))

    # One child per distinct value of the best feature in the current data.
    # NOTE(review): the feature value leading to each child is not stored on
    # the child, so the tree can be printed but cannot route a new sample.
    best_column = X[:,best_feature]
    for current in set(best_column):
        rows = (best_column == current)
        node.children.append(buildDT(X[rows],Y[rows],remaining_features))

    return node

In [11]:
class Node():
    """A single node of the decision tree."""

    # INFO ABOUT PROPERTIES:
    # entropy: entropy of node
    # data: total number of data points in current node
    # class_split: dictionary having classes as keys and the number of data points of the class as value
    # children: contains the children of current node
    # current_prediction: class predicted at this node (-1 until it is set)
    # split_feature: the feature we are splitting on ( for internal nodes, None for leaves )
    # gain_ratio: gain ratio taken while splitting

    def __init__(self,entropy=0,data=0,class_split=None,children=None,current_prediction=-1,split_feature=None,gain_ratio=None):
        """Create a node.

        ``class_split`` and ``children`` default to None rather than to a
        dict/list literal so that no mutable default is shared between calls;
        every node gets its own fresh containers. Values passed by the caller
        are copied and stored instead of being discarded.
        """
        self.data = data
        self.entropy = entropy
        self.class_split = dict(class_split) if class_split is not None else dict()
        self.children = list(children) if children is not None else list()
        self.current_prediction = current_prediction
        self.split_feature = split_feature
        self.gain_ratio = gain_ratio

    def addChild(self, c):
        """Append ``c`` to this node's children."""
        self.children.append(c)

    def getDetailsNode(self,feature_list,target):
        """Print this node's statistics.

        ``feature_list`` is indexed by ``split_feature`` and ``target`` is
        indexed by class label to turn both into printable names.
        """
        print("Samples : ",self.data)
        print("Entropy : ",self.entropy)
        print("Current Prediction : ",target[self.current_prediction])

        for curr_class in self.class_split.keys():
            print("Class ",target[curr_class]," has Count",self.class_split[curr_class] )

        # Identity test: feature index 0 is a valid split and must not be
        # confused with "no split".
        if(self.split_feature is not None):
            print("Splitting on feature :", feature_list[self.split_feature],", with gain ratio : ",self.gain_ratio)
        else:
            print("Leaf Node")

        print()
        return


In [12]:
def printTree(root,feature_list,target):
    """Print the details of ``root`` and then of every descendant, depth first.

    Each node is printed through its own ``getDetailsNode`` before any of its
    children; a node without children simply ends the recursion.
    """
    subtrees = root.children

    # The node itself always comes first (pre-order traversal)
    root.getDetailsNode(feature_list,target)

    for subtree in subtrees:
        printTree(subtree,feature_list,target)

### `Output of Part 2 of Project ( Building the Decision Tree )` 

In [13]:
# Getting the root of the built tree and further printing the details of that Decision Tree from its Root Node 

root_for_iris_dataset = buildDT(x,y,features)
printTree(root_for_iris_dataset,iris.feature_names,iris.target_names)

Samples :  150
Entropy :  1.584962500721156
Current Prediction :  setosa
Class  setosa  has Count 50
Class  versicolor  has Count 50
Class  virginica  has Count 50
Splitting on feature : petal width (cm) , with gain ratio :  0.7350016280496154

Samples :  49
Entropy :  0
Current Prediction :  setosa
Class  setosa  has Count 49
Leaf Node

Samples :  11
Entropy :  0.4394969869215134
Current Prediction :  versicolor
Class  setosa  has Count 1
Class  versicolor  has Count 10
Splitting on feature : sepal width (cm) , with gain ratio :  1.0

Samples :  10
Entropy :  0
Current Prediction :  versicolor
Class  versicolor  has Count 10
Leaf Node

Samples :  1
Entropy :  0
Current Prediction :  setosa
Class  setosa  has Count 1
Leaf Node

Samples :  44
Entropy :  0.5107878229540133
Current Prediction :  versicolor
Class  versicolor  has Count 39
Class  virginica  has Count 5
Splitting on feature : petal length (cm) , with gain ratio :  0.2488471906913508

Samples :  1
Entropy :  0
Current Predict

## Project Submitted by:
### ` Ritambhra Vatsya` vritambhara@gmail.com