In [9]:
import pprint

import numpy as np
import pandas as pd
from numpy import log2 as log
# Machine epsilon for float64. Entropy code further down adds it to
# denominators and log2 arguments to avoid 0/0 and log2(0).
eps = np.finfo(np.float64).eps

In [10]:
# Toy "buys a computer" training set: four categorical features followed by
# the target column (Buys_Computer), 14 rows in total.
# NOTE(review): 'Age' contains the label '>=30' (rows 8 and 9), which overlaps
# the '31-40' and '>40' bins - possibly a typo; confirm against the data source.
data = {
    'Age': [
        '<=30', '<=30', '31-40', '>40', '>40', '>40', '31-40',
        '<=30', '>=30', '>=30', '<=30', '31-40', '31-40', '>40',
    ],
    'Income': [
        'High', 'High', 'High', 'Medium', 'Low', 'Low', 'Low',
        'Medium', 'Low', 'Medium', 'Medium', 'Medium', 'High', 'Medium',
    ],
    'Student': [
        'No', 'No', 'No', 'No', 'Yes', 'Yes', 'Yes',
        'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'No',
    ],
    'Credit_Rating': [
        'F', 'E', 'F', 'F', 'F', 'E', 'E',
        'F', 'F', 'F', 'E', 'E', 'F', 'E',
    ],
    'Buys_Computer': [
        'No', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes',
        'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No',
    ],
}
# Column order follows the insertion order of the dict above.
df = pd.DataFrame(data, columns=list(data))
df

Unnamed: 0,Age,Income,Student,Credit_Rating,Buys_Computer
0,<=30,High,No,F,No
1,<=30,High,No,E,No
2,31-40,High,No,F,Yes
3,>40,Medium,No,F,Yes
4,>40,Low,Yes,F,Yes
5,>40,Low,Yes,E,No
6,31-40,Low,Yes,E,Yes
7,<=30,Medium,No,F,No
8,>=30,Low,Yes,F,Yes
9,>=30,Medium,Yes,F,Yes


In [11]:
# data = {'Taste':['Salty','Spicy','Spicy','Spicy','Spicy','Sweet','Salty','Sweet','Spicy','Salty'],
#        'Temperature':['Hot','Hot','Hot','Cold','Hot','Cold','Cold','Hot','Cold','Hot'],
#        'Texture':['Soft','Soft','Hard','Hard','Hard','Soft','Soft','Soft','Soft','Hard'],
#        'Eat':['No','No','Yes','No','Yes','Yes','No','Yes','Yes','Yes']}
# df = pd.DataFrame(data, columns=['Taste','Temperature','Texture','Eat'])
# df        

In [12]:
# Entropy of the target column (Buys_Computer) over the whole table.
target_vals = df.Buys_Computer.unique()
values = target_vals
class_counts = df.Buys_Computer.value_counts()  # counted once, reused per label
n_rows = len(df)

info = 0.0
for value in values:
    prob = class_counts[value] / n_rows
    info -= prob * np.log2(prob)
info

0.9402859586706311

In [13]:
# Feature columns considered for splitting (every column except the target).
cols = ['Age','Income','Student','Credit_Rating']

# Show the distinct categories of each feature.
for each_col in cols:
    print(df[each_col].unique())

['<=30' '31-40' '>40' '>=30']
['High' 'Medium' 'Low']
['No' 'Yes']
['F' 'E']


In [14]:
# First level: weighted entropy and information gain of every feature in
# `cols` with respect to Buys_Computer.
entropy_attributes = {}
gain_each = {}
n_rows = len(df)

for each_col in cols:
    entropy_att = 0

    for each_class in df[each_col].unique():
        in_class = df[each_col] == each_class
        denominator = int(in_class.sum())
        entropy_each_class = 0

        for target_val in target_vals:
            has_target = df.Buys_Computer == target_val
            numerator = int((in_class & has_target).sum())
            # eps keeps the ratio and the log finite when a class is absent.
            prob = numerator / (denominator + eps)
            entropy_each_class += abs(-prob * np.log2(prob + eps))

        # Weight the per-category entropy by the share of rows in that category.
        prob_outer = denominator / n_rows
        entropy_att += abs(-prob_outer * entropy_each_class)

    entropy_attributes[each_col] = entropy_att
    gain_each[each_col] = info - entropy_att

print(entropy_attributes)
print(gain_each)

{'Age': 0.5175080355597519, 'Income': 0.9110633930116756, 'Student': 0.7884504573082889, 'Credit_Rating': 0.892158928262361}
{'Age': 0.42277792311087925, 'Income': 0.029222565658955535, 'Student': 0.15183550136234225, 'Credit_Rating': 0.048127030408270155}


In [15]:
def find_entropy(df):
    """Return the Shannon entropy (base 2) of the last column of ``df``.

    The last column is treated as the class label. Each distinct label
    contributes ``-p * log2(p)``, where ``p`` is its relative frequency.
    An empty frame yields 0.
    """
    labels = df[df.keys()[-1]]
    label_counts = labels.value_counts()
    total = len(labels)

    entropy = 0
    for label in labels.unique():
        share = label_counts[label] / total
        entropy -= share * np.log2(share)
    return entropy

In [16]:
def find_entropy_attribute(df, attribute):
    """Return the entropy of the class column conditioned on ``attribute``.

    The last column of ``df`` is treated as the class label. For every
    distinct value of ``attribute`` the base-2 entropy of the labels among
    the matching rows is computed; the results are averaged, weighted by the
    share of rows having that value.

    Only labels that actually occur in a group contribute, so no epsilon
    fudge is needed to dodge ``log2(0)``: a perfectly separating attribute
    yields exactly 0 and the result is never negative.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose last column holds the class label.
    attribute : column label
        Column of ``df`` to condition on.

    Returns
    -------
    float
        Weighted conditional entropy in bits; 0.0 for an empty frame.
    """
    Class = df.keys()[-1]
    total = len(df)
    if total == 0:
        return 0.0

    labels = df[Class]
    conditional_entropy = 0.0
    for variable in df[attribute].unique():
        # A positional boolean mask avoids any reliance on index alignment,
        # so frames with a non-default or duplicated index work as well.
        in_group = (df[attribute] == variable).to_numpy()
        group_labels = labels[in_group]
        den = len(group_labels)
        if den == 0:
            # e.g. a NaN attribute value, which never compares equal to itself
            continue
        fractions = group_labels.value_counts().to_numpy() / den
        # Zero-count categories (categorical dtype) must not reach log2.
        fractions = fractions[fractions > 0]
        group_entropy = -np.sum(fractions * np.log2(fractions))
        conditional_entropy += (den / total) * group_entropy
    return conditional_entropy

In [17]:
def find_winner(df):
    """Return the name of the feature column with the highest information gain.

    Every column except the last (the class label) is a candidate. The gain
    of a candidate is the entropy of the label column minus the entropy
    conditioned on that candidate; ``np.argmax`` selects the first maximum.
    """
    candidates = df.keys()[:-1]
    base_entropy = find_entropy(df)
    gains = [
        base_entropy - find_entropy_attribute(df, candidate)
        for candidate in candidates
    ]
    return candidates[np.argmax(gains)]

In [18]:
def get_subtable(df, node, value):
    """Return the rows of ``df`` whose ``node`` column equals ``value``.

    The result is re-indexed from 0; ``df`` itself is left untouched.
    """
    matches = df[node] == value
    selected = df.loc[matches]
    return selected.reset_index(drop=True)

In [19]:
def buildTree(df, tree=None):
    """Recursively build a decision tree as nested dictionaries.

    The last column of ``df`` is the class label; every other column is a
    candidate split attribute. At each level the attribute chosen by
    ``find_winner`` becomes the node, and one branch is created per distinct
    value of that attribute.

    Parameters
    ----------
    df : pandas.DataFrame
        Training rows; the last column holds the class label.
    tree : dict, optional
        Existing dictionary to add this node to. A new one is created when
        omitted.

    Returns
    -------
    dict
        ``{attribute: {value: subtree_or_label, ...}}`` where a leaf is a
        class label and a subtree has the same shape as the return value.
    """
    # The label column is always the last one; no column name is hard-coded.
    Class = df.keys()[-1]

    # Start building the decision tree
    node = find_winner(df)  # attribute with maximum information gain
    attValue = np.unique(df[node])

    if tree is None:
        tree = {}
    # Also create the branch dict when a caller-supplied tree lacks this node.
    tree.setdefault(node, {})

    for value in attValue:
        subtable = get_subtable(df, node, value)
        clValue, counts = np.unique(subtable[Class], return_counts=True)

        if len(counts) == 1:
            # Pure branch: every remaining row has the same label.
            tree[node][value] = clValue[0]
        elif len(subtable) == len(df):
            # The winning attribute is constant, so the split removed no rows
            # and recursing would repeat this exact call forever. Fall back
            # to the majority label instead.
            tree[node][value] = clValue[np.argmax(counts)]
        else:
            tree[node][value] = buildTree(subtable)

    return tree

In [20]:
# Build the tree from the full training table and pretty-print it.
# (pprint is imported in the notebook's top import cell.)
tree = buildTree(df)
pprint.pprint(tree)

{'Age': {'31-40': 'Yes',
         '<=30': {'Student': {'No': 'No', 'Yes': 'Yes'}},
         '>40': {'Credit_Rating': {'E': 'No', 'F': 'Yes'}},
         '>=30': 'Yes'}}


In [21]:
# Prediction function
def predict(inst, tree):
    """Classify ``inst`` by walking ``tree`` from the root down to a leaf.

    Parameters
    ----------
    inst : mapping or pandas.Series
        Maps attribute names to the sample's values; indexed as
        ``inst[attribute]``.
    tree : dict or label
        Nested ``{attribute: {value: subtree_or_label}}`` structure. A
        non-dict value is treated as a leaf and returned unchanged.

    Returns
    -------
    The class label stored at the leaf that was reached.

    Raises
    ------
    KeyError
        If ``inst`` lacks a tested attribute, or holds a value for which the
        tree has no branch.
    ValueError
        If an empty dict is encountered where a node is expected.
    """
    current = tree
    while isinstance(current, dict):
        if not current:
            raise ValueError("cannot predict from an empty decision tree")
        # Each node dict carries a single attribute key.
        attribute = next(iter(current))
        current = current[attribute][inst[attribute]]
    return current

In [32]:
# Testing: one unlabeled sample holding the four feature values.
test_data = pd.Series(
    ['<=30', 'High', 'Yes', 'E'],
    index=['Age', 'Income', 'Student', 'Credit_Rating'],
)
test_data

Age              <=30
Income           High
Student           Yes
Credit_Rating       E
dtype: object

In [33]:
# Predict: classify the test sample with the tree built earlier.
pred = predict(inst=test_data, tree=tree)
pred

'Yes'