In [1]:
import numpy as np
import pandas as pd

In [2]:
# Column labels applied to the CSV columns; the file's own header row is
# dropped with skiprows=1 and replaced by these names.
# NOTE(review): the rendered output shows one extra leading column, so the
# file appears to hold one more column than there are names and pandas uses
# it as the row index -- confirm against Iris.csv.
col_names = [
    'sepal_length',
    'sepal_width',
    'petal_length',
    'petal_width',
    'type',
]
data = pd.read_csv(
    "Iris.csv",
    skiprows=1,
    header=None,
    names=col_names,
)
# Preview the first ten rows.
data.head(10)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,type
1,5.1,3.5,1.4,0.2,Iris-setosa
2,4.9,3.0,1.4,0.2,Iris-setosa
3,4.7,3.2,1.3,0.2,Iris-setosa
4,4.6,3.1,1.5,0.2,Iris-setosa
5,5.0,3.6,1.4,0.2,Iris-setosa
6,5.4,3.9,1.7,0.4,Iris-setosa
7,4.6,3.4,1.4,0.3,Iris-setosa
8,5.0,3.4,1.5,0.2,Iris-setosa
9,4.4,2.9,1.4,0.2,Iris-setosa
10,4.9,3.1,1.5,0.1,Iris-setosa


# Node class

In [22]:
class Node:
    """A single vertex of the decision tree.

    All fields default to ``None``. In this file a decision node is built
    with ``feat_ind``, ``thr``, ``left``, ``right`` and ``info_gain``, while
    a leaf is built with ``value`` only.

    Attributes:
        feat_ind: Index of the feature column tested at this node.
        thr: Threshold the feature value is compared against.
        left: Child subtree.
        right: Child subtree.
        info_gain: Gain recorded for the split stored in this node.
        value: Label stored at a leaf.
    """

    def __init__(self, feat_ind=None, thr=None, left=None, right=None, info_gain=None, value=None):
        # Split description.
        self.feat_ind, self.thr = feat_ind, thr
        # Child subtrees.
        self.left, self.right = left, right
        # Gain of the stored split.
        self.info_gain = info_gain
        # Leaf payload.
        self.value = value

In [16]:
class DecisionTreeClassifier():
    """Binary decision-tree classifier grown greedily on Gini gain.

    Training rows are held in one 2-D array whose last column is the label;
    every other column is a feature that is compared against a threshold
    with ``<=`` (left branch) or ``>`` (right branch).
    """

    def __init__(self, min=2, max_depth=2):
        """Create an unfitted tree.

        Args:
            min: Minimum number of samples a node must hold before a split is
                attempted. The name shadows the builtin inside this method
                only; it is kept so existing ``min=...`` keyword calls work.
            max_depth: Largest node depth (root is depth 0) at which a split
                is still attempted. A node at depth ``max_depth`` may
                therefore split, producing children one level deeper.
        """
        self.root = None
        self.min = min
        self.max_depth = max_depth

    def build_tree(self, dataset, curr_depth=0):
        """Recursively grow a subtree and return its root ``Node``.

        Args:
            dataset: 2-D array, features in all columns but the last, labels
                in the last column.
            curr_depth: Depth of the node being built (root is 0).

        Returns:
            A decision ``Node`` when a split with positive gain exists and the
            stopping conditions allow it, otherwise a leaf ``Node`` holding
            the most frequent label.
        """
        X, Y = dataset[:, :-1], dataset[:, -1]
        num_sam, num_feat = np.shape(X)
        if num_sam >= self.min and curr_depth <= self.max_depth:
            best_split = self.get_best_split(dataset, num_sam, num_feat)
            # get_best_split returns {} when no threshold leaves rows on both
            # sides (e.g. all feature rows identical); treat that as "no gain"
            # instead of raising KeyError.
            if best_split.get("info_gain", 0) > 0:
                left_subtree = self.build_tree(best_split["data_left"], curr_depth + 1)
                right_subtree = self.build_tree(best_split["data_right"], curr_depth + 1)
                return Node(
                    best_split["feat_ind"],
                    best_split["thr"],
                    left_subtree,
                    right_subtree,
                    best_split["info_gain"],
                )
        return Node(value=self.leaf_val(Y))

    def get_best_split(self, dataset, num_sam, num_feat):
        """Search every feature/threshold pair for the highest Gini gain.

        Args:
            dataset: 2-D array with labels in the last column.
            num_sam: Number of rows (accepted for interface compatibility;
                not used by the search).
            num_feat: Number of feature columns to scan.

        Returns:
            A dict with keys ``feat_ind``, ``thr``, ``data_left``,
            ``data_right`` and ``info_gain`` for the best split, or an empty
            dict when no threshold puts at least one row on each side.
        """
        best_split = {}
        max_info_gain = -float("inf")
        y = dataset[:, -1]  # parent labels do not change inside the loops
        for feat_ind in range(num_feat):
            # Every distinct observed value is a candidate threshold.
            for thr in np.unique(dataset[:, feat_ind]):
                dataset_left, dataset_right = self.split(dataset, feat_ind, thr)
                if len(dataset_left) == 0 or len(dataset_right) == 0:
                    continue
                curr_info_gain = self.information_gain(
                    y, dataset_left[:, -1], dataset_right[:, -1]
                )
                # Strict '>' keeps the first split found among equal gains.
                if curr_info_gain > max_info_gain:
                    best_split = {
                        "feat_ind": feat_ind,
                        "thr": thr,
                        "data_left": dataset_left,
                        "data_right": dataset_right,
                        "info_gain": curr_info_gain,
                    }
                    max_info_gain = curr_info_gain
        return best_split

    def split(self, dataset, feat_ind, thr):
        """Partition rows by ``dataset[:, feat_ind] <= thr``.

        Args:
            dataset: 2-D NumPy array of rows.
            feat_ind: Column index to test.
            thr: Threshold value.

        Returns:
            ``(dataset_left, dataset_right)``: rows with value ``<= thr`` and
            rows with value ``> thr``, in their original order. An empty side
            keeps its 2-D shape ``(0, n_columns)``.
        """
        column = dataset[:, feat_ind]
        # Two independent comparisons (rather than mask / ~mask) so a value
        # that satisfies neither, such as NaN, lands on neither side.
        goes_left = np.asarray(column <= thr, dtype=bool)
        goes_right = np.asarray(column > thr, dtype=bool)
        return dataset[goes_left], dataset[goes_right]

    def information_gain(self, parent, l_child, r_child):
        """Return the Gini impurity decrease of splitting ``parent``.

        Args:
            parent: Labels before the split.
            l_child: Labels sent to the left child.
            r_child: Labels sent to the right child.

        Returns:
            Parent Gini minus the size-weighted Gini of the two children.
        """
        weight_l = len(l_child) / len(parent)
        weight_r = len(r_child) / len(parent)
        children_gini = weight_l * self.gini_ind(l_child) + weight_r * self.gini_ind(r_child)
        return self.gini_ind(parent) - children_gini

    def gini_ind(self, y):
        """Return the Gini impurity ``1 - sum(p_c ** 2)`` of the labels ``y``.

        An empty ``y`` yields 1 because no class contributes to the sum.
        """
        total = len(y)
        # One pass to count every class instead of rescanning y per class.
        _, counts = np.unique(y, return_counts=True)
        gini = 0
        for count in counts.tolist():
            gini += (count / total) ** 2
        return 1 - gini

    def leaf_val(self, y):
        """Return the most frequent label in ``y``.

        Ties go to the label that appears first in ``y``.

        Raises:
            ValueError: If ``y`` is empty.
        """
        # dict preserves first-seen order, so max() resolves ties the same
        # way a scan over the raw label list would.
        counts = {}
        for label in y:
            counts[label] = counts.get(label, 0) + 1
        return max(counts, key=counts.get)

    def fit(self, x, y):
        """Grow the tree from training data and store it in ``self.root``.

        Args:
            x: 2-D array-like of feature values, one row per sample.
            y: Labels, either 1-D of length ``n_samples`` or 2-D with one
                column.

        Raises:
            ValueError: If ``x`` contains no samples.
        """
        features = np.asarray(x)
        labels = np.asarray(y)
        if len(features) == 0:
            raise ValueError("cannot fit a decision tree on an empty dataset")
        if labels.ndim == 1:
            labels = labels.reshape(-1, 1)
        # Keep every cell as a Python object so a str-dtype label column can
        # never force the numeric feature columns into a string dtype.
        dataset = np.concatenate(
            (features.astype(object), labels.astype(object)), axis=1
        )
        self.root = self.build_tree(dataset)

    def predict(self, x):
        """Return a list with the predicted label for every row of ``x``."""
        return [self.make_prediction(sample, self.root) for sample in x]

    def make_prediction(self, x, tree):
        """Route one sample ``x`` from ``tree`` down to a leaf.

        Args:
            x: Indexable feature vector of a single sample.
            tree: ``Node`` to start from.

        Returns:
            The ``value`` of the leaf that is reached.
        """
        node = tree
        # Identity check: a leaf is any node whose value was set.
        while node.value is None:
            if x[node.feat_ind] <= node.thr:
                node = node.left
            else:
                node = node.right
        return node.value

    def print_tree(self, tree=None, indent=" "):
        """Print the tree to stdout, one node per line.

        Args:
            tree: Subtree to print; defaults to ``self.root``.
            indent: Prefix for this level's child lines. It is doubled at
                every level, so nesting widens as 1, 2, 4, 8, ... spaces.
        """
        if tree is None:
            tree = self.root
        if tree.value is not None:
            print(tree.value)
        else:
            print("X_" + str(tree.feat_ind), "<=", tree.thr, "?", tree.info_gain)
            print("%sleft:" % (indent), end="")
            self.print_tree(tree.left, indent + indent)
            print("%sright:" % (indent), end="")
            self.print_tree(tree.right, indent + indent)

# Train-Test split

In [6]:
from sklearn.model_selection import train_test_split

# Features: every column except the last. Labels: the last column, kept as a
# single-column 2-D array.
X = data.iloc[:, :-1].to_numpy()
Y = data.iloc[:, -1].to_numpy().reshape(-1, 1)

# Hold out 20% of the rows for testing; random_state fixes the shuffle.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=.2,
    random_state=41,
)

In [23]:
# Fit the tree on the training split (nodes need at least 3 samples to be
# split), then print the learned structure.
classifier = DecisionTreeClassifier(min=3, max_depth=3)
classifier.fit(X_train, Y_train)
classifier.print_tree()

X_2 <= 1.9 ? 0.33741385372714494
 left:Iris-setosa
 right:X_3 <= 1.5 ? 0.427106638180289
  left:X_2 <= 4.9 ? 0.05124653739612173
    left:Iris-versicolor
    right:Iris-virginica
  right:X_2 <= 5.0 ? 0.019631171921475288
    left:X_1 <= 2.8 ? 0.20833333333333334
        left:Iris-virginica
        right:Iris-versicolor
    right:Iris-virginica


In [24]:
from sklearn.metrics import accuracy_score

# Predict the held-out rows and display the fraction labelled correctly.
Y_pred = classifier.predict(X_test)
accuracy_score(Y_test, Y_pred)

0.9333333333333333