This notebook contains code for the C4.5 decision tree algorithm.

In [None]:
class Util:
    """Evaluation helpers that compare predicted labels against target labels.

    Both helpers are static: they can be called as ``Util.name(...)`` or on an
    instance, and neither reads any class or instance state.
    """

    @staticmethod
    def calculateAccuracy(predictedLabels, targetLabels):
        """Return the percentage of positions where prediction equals target.

        Parameters
        ----------
        predictedLabels : sequence
            Predicted label for each sample.
        targetLabels : sequence
            Expected label for each sample, indexed in step with
            ``predictedLabels``.

        Returns
        -------
        float
            Accuracy as a percentage in the range 0-100; ``0.0`` when
            ``predictedLabels`` is empty.

        Raises
        ------
        IndexError
            If ``targetLabels`` is shorter than ``predictedLabels``.
        """
        total = len(predictedLabels)
        # No predictions: report 0% instead of dividing by zero.
        if total == 0:
            return 0.0
        matches = sum(
            1 for i in range(total) if targetLabels[i] == predictedLabels[i]
        )
        return (matches / total) * 100

    @staticmethod
    def getPerformanceMetrics(targetLabels, predictedLabels, trueLabels):
        """Compute accuracy, recall, specificity, precision and F1 score.

        Each sample is classified from its *target* label:

        * prediction == target and target in ``trueLabels``      -> true positive
        * prediction == target and target not in ``trueLabels``  -> true negative
        * prediction != target and target in ``trueLabels``      -> false negative
        * prediction != target and target not in ``trueLabels``  -> false positive

        Parameters
        ----------
        targetLabels : sequence
            Expected label for each sample.
        predictedLabels : sequence
            Predicted label for each sample.
        trueLabels : container
            Labels that count as the positive class.

        Returns
        -------
        tuple
            ``(accuracy, recall, specificity, precision, f1)`` as fractions in
            the range 0-1. Any metric whose denominator is zero is reported as
            0, so empty input yields all zeros.

        Raises
        ------
        IndexError
            If ``targetLabels`` is shorter than ``predictedLabels``.
        """
        tp = tn = fp = fn = 0

        for i in range(len(predictedLabels)):
            target = targetLabels[i]
            is_positive = target in trueLabels
            if target == predictedLabels[i]:
                if is_positive:
                    tp += 1
                else:
                    tn += 1
            elif is_positive:
                fn += 1
            else:
                fp += 1

        total = tp + tn + fp + fn
        # Every ratio falls back to 0 when its denominator is zero.
        accuracy = (tp + tn) / total if total != 0 else 0.0
        recall = tp / (tp + fn) if (tp + fn) != 0 else 0.0
        specificity = tn / (fp + tn) if (fp + tn) != 0 else 0.0
        precision = tp / (tp + fp) if (tp + fp) != 0 else 0.0
        if precision + recall != 0:
            f1 = 2 * ((precision * recall) / (precision + recall))
        else:
            f1 = 0.0
        return (accuracy, recall, specificity, precision, f1)

In [None]:
!pip install chefboost

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from warnings import filterwarnings
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from numpy import log2 as log
import pprint
from chefboost import Chefboost as chef

# Machine epsilon of the float type; no code visible in this notebook references it.
eps = np.finfo(float).eps

def decisionTree(df,train):
  """Fit a C4.5 decision tree with chefboost and return the fitted model.

  Parameters
  ----------
  df : pandas.DataFrame
      The full dataset. Only used as a fallback when ``train`` is None.
  train : pandas.DataFrame or None
      The training split the tree is fitted on.

  Returns
  -------
  The value returned by ``chef.fit`` for the fitted tree.
  """
  # Fit on the training split only. Fitting on the full frame would let the
  # tree see the held-out test rows and inflate the reported test metrics.
  training_frame = df if train is None else train
  config = {'algorithm' : 'C4.5'}
  # Pass a copy with a fresh 0..n-1 index so the caller's frame is left
  # untouched and the shuffled split index does not reach the fit.
  tree = chef.fit(training_frame.copy().reset_index(drop=True),config)
  return tree

def makePrediction(tree,df,test,targetClass,trueLabels):
  """Predict every row of ``test`` with ``tree`` and print evaluation metrics.

  Parameters
  ----------
  tree :
      Fitted model, passed unchanged to ``chef.predict``.
  df : pandas.DataFrame
      Not used; kept so existing calls keep working.
  test : pandas.DataFrame
      Rows to predict; must contain the ``targetClass`` column.
  targetClass : str
      Name of the column holding the expected label.
  trueLabels : container
      Labels treated as the positive class by ``Util.getPerformanceMetrics``.

  Returns
  -------
  None
      The accuracy percentage and the metric summary are printed.
  """
  predicted_values = []
  target_values = []

  # NOTE(review): each row still contains the target column when it is handed
  # to chef.predict - confirm that chefboost ignores the extra value.
  for _, row in test.iterrows():
      predicted_values.append(chef.predict(tree,row))
      target_values.append(row[targetClass])

  # Accuracy as a percentage (0-100).
  testAccuracy = Util.calculateAccuracy(predicted_values, target_values)
  print('Accuracy is: %f' % testAccuracy)

  # Remaining metrics as fractions (0-1).
  accuracy, recall,specificity, precision, f1 = Util.getPerformanceMetrics(target_values,predicted_values,trueLabels)

  print('Performance Metrics of Test Data : Accuracy : %f , Recall : %f ,Specificity: %f , Precision : %f, F1 Score: %f' % (accuracy, recall,specificity, precision, f1) )





In [None]:
# Heart dataset: its features span a wide range of numerical values.

# Load the data and rename the target column to 'Decision', the name the
# prediction call below uses; its labels are converted to strings.
df = pd.read_csv("heart.csv").rename(columns={'a1p2': 'Decision'})
df['Decision'] = df['Decision'].apply(str)

# Hold-out split: 70% train / 30% test, seeded so the split is repeatable.
train, test = train_test_split(df, test_size=0.3, random_state=1)

tree = decisionTree(df, train)

# Label(s) counted as the positive class when computing the metrics.
trueLabels = ['1']

print(df['Decision'])

makePrediction(tree, df, test, 'Decision', trueLabels)



[INFO]:  1 CPU cores will be allocated in parallel running
C4.5  tree is going to be built...
-------------------------
finished in  16.125747203826904  seconds
-------------------------
Evaluate  train set
-------------------------
Accuracy:  94.81481481481481 % on  270  instances
Labels:  ['1' '2']
Confusion matrix:  [[146, 10], [4, 110]]
Precision:  93.5897 %, Recall:  97.3333 %, F1:  95.4248 %
0      1
1      1
2      1
3      2
4      2
      ..
265    1
266    1
267    1
268    1
269    2
Name: Decision, Length: 270, dtype: object
Accuracy is: 93.827160
Performance Metrics of Test Data : Accuracy : 0.938272 , Recall : 0.978723 ,Specificity: 0.882353 , Precision : 0.920000, F1 Score: 0.948454


In [None]:
# # Call on credit dataset

# df = pd.read_csv('credit.csv')

# #Numerical columns such as these reduce the accuracy of the decision tree when they are chosen as descriptive features.
# #Because of their high information gain they get chosen as descriptive features unless they are removed.
# df = df.drop("amount" , axis=1) 
# df = df.drop("age" , axis=1) 

# df.rename(columns = {'default':'Decision'}, inplace = True)
# # df['Decision'] = df['Decision'].apply(str)



# #split the data to train and test by using cross validation
# train, test = train_test_split(df, test_size=0.3 , random_state=1)

# tree = decisionTree(df,train)
# # pprint.pprint(tree)

# trueLabels = ['yes']
# makePrediction(tree,df,test,'Decision',trueLabels)


C4.5 Performance on Multiclass classification
Zoo dataset - https://archive.ics.uci.edu/ml/datasets/zoo

In [None]:
# UCI zoo dataset: multiclass classification.

df = pd.read_csv('zoo.csv',
                      names=['animal_name','hair','feathers','eggs','milk',
                                                   'airbone','aquatic','predator','toothed','backbone',
                                                  'breathes','venomous','fins','legs','tail','domestic','catsize','class',])



# We drop the animal names since this is not a good feature to split the data on
df=df.drop('animal_name',axis=1)

# Rename the target column to 'Decision' and store its labels as strings.
df.rename(columns = {'class':'Decision'}, inplace = True)
df['Decision'] = df['Decision'].apply(str)


# Hold-out split: 70% train / 30% test, seeded so the split is repeatable.
train, test = train_test_split(df, test_size=0.3 , random_state=1)

tree = decisionTree(df,train)
# pprint.pprint(tree)

# Read the distinct class labels without removing the column: df.pop() would
# strip 'Decision' from df and make this cell fail when it is run again.
# NOTE(review): every class is listed as a "true" label, so
# Util.getPerformanceMetrics can only count true positives and false
# negatives here; specificity is always reported as 0 and recall equals accuracy.
trueLabels = [str(x) for x in np.unique(np.array(df["Decision"].to_list()))]

makePrediction(tree,df,test,'Decision',trueLabels)



[INFO]:  1 CPU cores will be allocated in parallel running
C4.5  tree is going to be built...
-------------------------
finished in  3.2196712493896484  seconds
-------------------------
Evaluate  train set
-------------------------
Accuracy:  100.0 % on  101  instances
Labels:  ['1' '4' '2' '7' '6' '5' '3']
Confusion matrix:  [[41, 0, 0, 0, 0, 0, 0], [0, 13, 0, 0, 0, 0, 0], [0, 0, 20, 0, 0, 0, 0], [0, 0, 0, 10, 0, 0, 0], [0, 0, 0, 0, 8, 0, 0], [0, 0, 0, 0, 0, 4, 0], [0, 0, 0, 0, 0, 0, 5]]
Decision  1  => Accuray:  100.0 %, Precision:  100.0 %, Recall:  100.0 %, F1:  100.0 %
Decision  4  => Accuray:  100.0 %, Precision:  100.0 %, Recall:  100.0 %, F1:  100.0 %
Decision  2  => Accuray:  100.0 %, Precision:  100.0 %, Recall:  100.0 %, F1:  100.0 %
Decision  7  => Accuray:  100.0 %, Precision:  100.0 %, Recall:  100.0 %, F1:  100.0 %
Decision  6  => Accuray:  100.0 %, Precision:  100.0 %, Recall:  100.0 %, F1:  100.0 %
Decision  5  => Accuray:  100.0 %, Precision:  100.0 %, Recall:  100.0