This notebook implements the ID3 algorithm for building decision trees and evaluates it on three datasets.

In [None]:
def find_entropy(df):
    """Return the base-2 Shannon entropy of the target column of ``df``.

    The target is taken to be the last column of ``df`` so the function works
    regardless of what the class column is called.
    """
    target = df.keys()[-1]
    labels = df[target]
    label_counts = labels.value_counts()
    n_rows = len(labels)

    entropy = 0
    # Walk the labels in order of first appearance and add -p*log2(p) for each.
    for label in labels.unique():
        p = label_counts[label] / n_rows
        entropy += -p * np.log2(p)
    return entropy

def find_entropy_attribute(df,attribute):
    """Return the conditional entropy of the target given ``attribute``.

    For every distinct value of ``attribute`` the base-2 entropy of the target
    labels among the matching rows is computed, and these entropies are
    averaged with weights equal to the share of rows holding that value.

    Parameters
    ----------
    df : pandas.DataFrame
        Data whose last column is the target.
    attribute : str
        Name of the column to condition on.

    Returns
    -------
    float
        Weighted average entropy in bits (0.0 when ``df`` has no rows).
    """
    Class = df.keys()[-1]   #To make the code generic, changing target variable class name
    attribute_column = df[attribute]
    class_column = df[Class]
    total = len(df)

    weighted_entropy = 0.0
    for variable in attribute_column.unique():
        # A single boolean mask built from the same frame keeps rows aligned.
        subset = class_column[attribute_column == variable]
        den = len(subset)
        if den == 0:
            # A value that matches no row (NaN never equals itself) adds nothing.
            continue
        entropy = 0.0
        for count in subset.value_counts():
            if count == 0:
                # Skip empty classes so 0*log(0) is never evaluated.
                continue
            fraction = count / den
            entropy -= fraction * log(fraction)
        weighted_entropy += (den / total) * entropy
    return weighted_entropy

def find_winner(df):
    """Return the name of the feature column with the highest information gain.

    Every column except the last is a candidate feature; the last column is
    the target. Information gain is the entropy of the target minus the
    conditional entropy of the target given the feature. Ties go to the
    earliest column because ``np.argmax`` returns the first maximum.

    Raises
    ------
    ValueError
        If ``df`` has no feature columns to choose from.
    """
    features = df.keys()[:-1]
    if len(features) == 0:
        raise ValueError("find_winner needs at least one feature column besides the target")
    # The target entropy does not depend on the candidate, so compute it once.
    base_entropy = find_entropy(df)
    IG = [base_entropy - find_entropy_attribute(df, key) for key in features]
    return features[np.argmax(IG)]

def get_subtable(df, node,value):
    """Return the rows of ``df`` whose ``node`` column equals ``value``.

    The result is re-indexed from 0; the old index is discarded.
    """
    matches = df[node] == value
    subtable = df.loc[matches]
    return subtable.reset_index(drop=True)

def buildTree(df,targetClass,tree=None):
    """Recursively build an ID3 decision tree as nested dictionaries.

    The tree has the shape ``{attribute: {value: subtree_or_label}}``.

    Parameters
    ----------
    df : pandas.DataFrame
        Training rows. ``find_winner`` treats the last column as the target,
        so ``targetClass`` should name that last column.
    targetClass : str
        Name of the target column used to test subset purity.
    tree : dict, optional
        Existing dictionary to add this subtree to; a new one is created
        when omitted.

    Returns
    -------
    dict
        The (sub)tree rooted at the attribute with maximum information gain.
    """
    # Attribute with maximum information gain becomes the root of this subtree.
    node = find_winner(df)
    # One branch per distinct value of that attribute
    # (e.g. Salary is the node and Low, Med and High are the values).
    attValue = np.unique(df[node])
    if tree is None:
        tree = {}
    tree.setdefault(node, {})
    for value in attValue:
        subtable = get_subtable(df,node,value)
        clValue,counts = np.unique(subtable[targetClass],return_counts=True)
        if len(counts)==1:
            # Pure subset: store the single class label as a leaf.
            tree[node][value] = clValue[0]
        elif len(subtable) == len(df):
            # The split removed no rows, so recursing would rebuild the same
            # problem forever (e.g. identical feature rows with conflicting
            # labels). Stop with the majority class instead.
            tree[node][value] = clValue[np.argmax(counts)]
        else:
            # Impure subset that did shrink: grow a subtree for it.
            tree[node][value] = buildTree(subtable,targetClass)
    return tree

def predict(test, tree, default=None):
    """Classify one sample by walking a nested-dictionary decision tree.

    Parameters
    ----------
    test : mapping
        The sample; indexed by attribute name (a dict or a DataFrame row).
    tree : dict
        Tree shaped like ``{attribute: {value: subtree_or_label}}``.
    default : object, optional
        Returned when the sample holds a value with no branch in the tree,
        at any depth.

    Returns
    -------
    object
        The label stored at the leaf that was reached, or ``default``.
    """
    node = tree
    # Descend until a leaf (anything that is not a dict) is reached.
    while isinstance(node, dict):
        attribute = next(iter(node))
        branches = node[attribute]
        value = test[attribute]
        if value not in branches:
            # Value not seen during training: honour the caller's default
            # at every level, not only at the root.
            return default
        node = branches[value]
    return node


In [None]:
class Util:
    """Helpers for scoring predicted class labels against the true labels."""

    @staticmethod
    def calculateAccuracy(predictedLabels, targetLabels):
        """Return the percentage (0-100) of positions where both labels agree.

        Positions are taken from ``predictedLabels``; ``targetLabels`` is
        indexed at the same positions. Returns 0.0 for empty input.
        """
        total = len(predictedLabels)
        if total == 0:
            return 0.0
        count = sum(1 for i in range(total) if targetLabels[i] == predictedLabels[i])
        return (count/total)*100

    @staticmethod
    def getPerformanceMetrics(targetLabels,predictedLabels,trueLabels):
        """Return ``(accuracy, recall, specificity, precision, f1)`` as fractions.

        A sample is positive when its target label is in ``trueLabels``.
        A correct prediction counts as TP for a positive sample and TN
        otherwise; a wrong prediction counts as FN for a positive sample and
        FP otherwise. Any metric whose denominator is zero is reported as 0.
        """
        tp = tn = fp = fn = 0
        for i in range(len(predictedLabels)):
            is_positive = targetLabels[i] in trueLabels
            if targetLabels[i] == predictedLabels[i]:
                if is_positive:
                    tp += 1
                else:
                    tn += 1
            elif is_positive:
                fn += 1
            else:
                fp += 1

        total = tp + fn + fp + tn
        accuracy = (tp+tn)/total if total != 0 else 0
        recall = tp/(tp+fn) if (tp+fn) != 0 else 0
        specificity = tn/(fp+tn) if (fp+tn) != 0 else 0
        precision = tp/(tp+fp) if (tp+fp) != 0 else 0
        f1 = 0
        if precision + recall != 0:
            f1 = 2 *((precision * recall)/(precision + recall))
        return (accuracy, recall, specificity, precision, f1)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from warnings import filterwarnings
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from numpy import log2 as log
import pprint

eps = np.finfo(float).eps

def decisionTree(df,train,targetClass):
  """Build an ID3 tree from ``train`` and return it.

  ``df`` is accepted but not used; only ``train`` and ``targetClass`` are
  passed on to ``buildTree``.
  """
  return buildTree(train, targetClass)

def makePrediction(tree,df,test,targetClass,trueLabels):
  """Predict every row of ``test`` with ``tree`` and print the scores.

  Parameters
  ----------
  tree : dict
      Decision tree as produced by ``buildTree``.
  df : pandas.DataFrame
      Accepted but not used.
  test : pandas.DataFrame
      Rows to classify; must contain the ``targetClass`` column.
  targetClass : str
      Name of the column holding the true labels.
  trueLabels : sequence
      Labels that ``Util.getPerformanceMetrics`` counts as positive.

  Prints the accuracy as a percentage, then accuracy, recall, specificity,
  precision and F1 as fractions. Returns nothing.
  """
  predicted_values = []
  target_values = []
  for _, row in test.iterrows():
      predicted_values.append(predict(row,tree))
      target_values.append(row[targetClass])

  testAccuracy = Util.calculateAccuracy(predicted_values, target_values)
  print('Accuracy is: %f' % testAccuracy)

  accuracy, recall,specificity, precision, f1 = Util.getPerformanceMetrics(target_values,predicted_values,trueLabels)

  print('Performance Metrics of Test Data : Accuracy : %f , Recall : %f ,Specificity: %f , Precision : %f, F1 Score: %f' % (accuracy, recall,specificity, precision, f1) )



In [None]:
# Call on heart dataset which has a wide range of numerical values.
# buildTree creates one branch per distinct value of the chosen column, so a
# numerical column yields one branch for every value seen in training.

df=pd.read_csv("heart.csv")


# Hold-out split (not cross validation): 70% of the rows train, 30% test, fixed seed
train, test = train_test_split(df, test_size=0.3 , random_state=1)


# 'a1p2' is the target column; find_winner treats the last column as the
# target, so this assumes 'a1p2' is the last column of heart.csv - TODO confirm
tree = decisionTree(df,train,'a1p2')
pprint.pprint(tree)
# Target value(s) counted as the positive class by Util.getPerformanceMetrics
trueLabels = [1]

makePrediction(tree,df,test,'a1p2',trueLabels)



{'sc': {126: 1,
        149: {'age': {49: 2, 71: 1}},
        160: 1,
        164: 2,
        166: 2,
        167: 2,
        168: 1,
        172: 2,
        174: 2,
        175: 1,
        177: {'age': {43: 2, 46: 1, 59: 2, 65: 1}},
        178: 1,
        180: 1,
        182: 1,
        183: 1,
        185: 2,
        188: 2,
        192: 1,
        193: 1,
        195: 1,
        196: 1,
        197: {'age': {44: 2, 53: 1}},
        198: {'age': {35: 2, 41: 1}},
        199: 1,
        201: 1,
        203: 1,
        204: 1,
        205: 1,
        206: 2,
        207: 1,
        209: 1,
        210: 1,
        211: 1,
        212: 1,
        214: 1,
        215: 1,
        216: 2,
        217: 2,
        218: 2,
        219: {'age': {39: 2, 44: 1, 50: 1}},
        220: 1,
        221: 1,
        222: 1,
        223: {'age': {40: 2, 67: 1}},
        224: 2,
        225: 2,
        226: 1,
        228: {'sex': {0: 2, 1: 1}},
        229: 2,
        230: 2,
        231: {'age': {38: 2

In [None]:
# Call on credit dataset

df = pd.read_csv('credit.csv')

# 'amount' and 'age' are numerical columns. Per the original author's note, they
# lower the accuracy of the decision tree when chosen as descriptive features, and
# their high information gain means they are chosen unless they are removed.
df = df.drop("amount" , axis=1) 
df = df.drop("age" , axis=1) 

# Hold-out split (not cross validation): 70% of the rows train, 30% test, fixed seed
train, test = train_test_split(df, test_size=0.3 , random_state=1)

# 'default' is the target column; find_winner treats the last column as the
# target, so this assumes 'default' is the last column - TODO confirm
tree = decisionTree(df,train,'default')
# pprint.pprint(tree)

# Target value(s) counted as the positive class by Util.getPerformanceMetrics
trueLabels = ['yes']
makePrediction(tree,df,test,'default',trueLabels)


Accuracy is: 61.000000
Performance Metrics of Test Data : Accuracy : 0.610000 , Recall : 0.372093 ,Specificity: 0.705607 , Precision : 0.336842, F1 Score: 0.353591


ID3 Performance on Multiclass classification
Zoo dataset - https://archive.ics.uci.edu/ml/datasets/zoo

In [None]:
# Call on UCI zoo dataset for multiclass

df = pd.read_csv('zoo.csv',
                      names=['animal_name','hair','feathers','eggs','milk',
                                                   'airbone','aquatic','predator','toothed','backbone',
                                                  'breathes','venomous','fins','legs','tail','domestic','catsize','class',])



#We drop the animal names since this is not a good feature to split the data on
df=df.drop('animal_name',axis=1)


# Hold-out split (not cross validation): 70% of the rows train, 30% test, fixed seed
train, test = train_test_split(df, test_size=0.3 , random_state=1)

tree = decisionTree(df,train,'class')
# pprint.pprint(tree)

# Every class label is passed as "positive". Read the labels without pop() so
# df keeps its 'class' column and the cell does not mutate its own input.
# With all labels positive, getPerformanceMetrics can never count a TN or FP,
# so the reported specificity is always 0.
trueLabels = np.unique(np.array(df["class"].to_list()))

makePrediction(tree,df,test,'class',trueLabels)



Accuracy is: 96.774194
Performance Metrics of Test Data : Accuracy : 0.967742 , Recall : 0.967742 ,Specificity: 0.000000 , Precision : 1.000000, F1 Score: 0.983607
