In [1]:
# importing required modules
import numpy as np
import pydotplus
import pandas as pd
import sklearn.datasets as Datasets
from sklearn import tree
from sklearn import model_selection as cv

## Modifying Iris Dataset to change it to Labeled Data¶

In [2]:
# function to convert continuous data to discrete values
def makeLabelled(column):
    mean = column.mean()
    for i in range (0,len(column)):
        column[i] = int(column[i]>=mean) 
    return column

In [3]:
iris = Datasets.load_iris()
df = pd.DataFrame(iris.data)
df.shape

(150, 4)

In [4]:
X = df.values
Y = iris.target

In [5]:
# converting data to labelled form
for i in range(0,X.shape[-1]):
    X[:,i] = makeLabelled(X[:,i])

## Implementation of Decision Tree

In [6]:
# function to calculates entropy
def entropy(Y):
    classes = set(Y)
    value = 0
    for i in classes:
        p = (len(Y[Y==i])/len(Y))
        value = value - (p*np.log2(p))
    return value

In [7]:
# function to find gain ration for a selected feature
def CalcGainRatio(X, Y, selectedFeature):
    differentLabels = set(X[:, selectedFeature])
    entropyBeforeSplitting = entropy(Y)
    entropyAfterSplitting = 0
    splitInfo = 0
    for i in differentLabels:
        newNodeY = Y[(X[:,selectedFeature] == i)]
        weightOfSamples = (len(newNodeY)/len(Y))
        entropyAfterSplitting = entropyAfterSplitting + (weightOfSamples*entropy(newNodeY))
        splitInfo = splitInfo - (weightOfSamples*np.log2(weightOfSamples))
    gain = entropyBeforeSplitting - entropyAfterSplitting
    gainRatio = gain
    return gainRatio

In [8]:
# function to print count of classes
def printClassCount(Y):
    classes = set(Y)
    for i in classes:
        print("Count of ",i," = ",len(Y[Y==i]))

In [9]:
# function to print nodes of decision tree
def decisionTree(X, Y, available_features):
    print(" ")
    printClassCount(Y)
    print("Current Entropy is = ",entropy(Y))
    if(available_features == 0 or (entropy(Y) == 0)):
        print("Reached leaf Node")
        return
    selectedFeature = 0
    max_value = -float('inf')
    
    # finding gain ratio for all possible features on which we can split and then choosing the feature with maximum gain.
    for i in range(0,X.shape[-1]):
        value = CalcGainRatio(X, Y, i)
        if(value >= max_value):
            selectedFeature = i
            max_value = value
    print("Splitting on feature ", selectedFeature)
    
    # find all possible unique labels for the selected feature.
    differentLabels = set(X[:, selectedFeature])
    for i in differentLabels:
        newDataSamples = (X[:, selectedFeature] == i)
        newX = X[newDataSamples]
        newY = Y[newDataSamples]
        decisionTree(newX, newY, available_features - 1)
    return

## Applying descision tree

In [10]:
# testing on OR dataset
X = np.array([[0,0],[0,1],[1,0],[1,1]])
Y = np.array([0,1,1,1])
decisionTree(X,Y,2)

 
Count of  0  =  1
Count of  1  =  3
Current Entropy is =  0.8112781244591328
Splitting on feature  1
 
Count of  0  =  1
Count of  1  =  1
Current Entropy is =  1.0
Splitting on feature  0
 
Count of  0  =  1
Current Entropy is =  0.0
Reached leaf Node
 
Count of  1  =  1
Current Entropy is =  0.0
Reached leaf Node
 
Count of  1  =  2
Current Entropy is =  0.0
Reached leaf Node


In [11]:
# testing on iris dataset
decisionTree(X,Y,4)

 
Count of  0  =  1
Count of  1  =  3
Current Entropy is =  0.8112781244591328
Splitting on feature  1
 
Count of  0  =  1
Count of  1  =  1
Current Entropy is =  1.0
Splitting on feature  0
 
Count of  0  =  1
Current Entropy is =  0.0
Reached leaf Node
 
Count of  1  =  1
Current Entropy is =  0.0
Reached leaf Node
 
Count of  1  =  2
Current Entropy is =  0.0
Reached leaf Node
