In [1]:
import numpy as np
import pandas as pd

In [2]:
# Machine epsilon of float64: a tiny positive constant that can be added to a
# denominator (or a log argument) so that it is never exactly zero.
eps = np.finfo(np.float64).eps
eps

2.220446049250313e-16

In [3]:
# Toy weather dataset: four categorical feature columns followed by the
# `play` label, which the functions below expect in the last column.
df = pd.DataFrame(
    data=[
        ['overcast', 'hot', 'high', 'FALSE', 'yes'],
        ['overcast', 'cool', 'normal', 'TRUE', 'yes'],
        ['overcast', 'mild', 'high', 'TRUE', 'yes'],
        ['overcast', 'hot', 'normal', 'FALSE', 'yes'],
        ['rainy', 'mild', 'high', 'FALSE', 'yes'],
        ['rainy', 'cool', 'normal', 'FALSE', 'yes'],
        ['rainy', 'cool', 'normal', 'TRUE', 'no'],
        ['rainy', 'mild', 'normal', 'FALSE', 'yes'],
        ['rainy', 'mild', 'high', 'TRUE', 'no'],
        ['sunny', 'hot', 'high', 'FALSE', 'no'],
        ['sunny', 'hot', 'high', 'TRUE', 'no'],
        ['sunny', 'mild', 'high', 'FALSE', 'no'],
        ['sunny', 'cool', 'normal', 'FALSE', 'yes'],
        ['sunny', 'mild', 'normal', 'TRUE', 'yes'],
    ],
    columns=['outlook', 'temp', 'humidity', 'windy', 'play'],
)

In [4]:
# Dataset dimensions as (rows, columns)
df.shape

(14, 5)

In [5]:
# Raw cell contents as a NumPy array; to_numpy() is the accessor pandas
# recommends in place of the older .values attribute.
df.to_numpy()

array([['overcast', 'hot', 'high', 'FALSE', 'yes'],
       ['overcast', 'cool', 'normal', 'TRUE', 'yes'],
       ['overcast', 'mild', 'high', 'TRUE', 'yes'],
       ['overcast', 'hot', 'normal', 'FALSE', 'yes'],
       ['rainy', 'mild', 'high', 'FALSE', 'yes'],
       ['rainy', 'cool', 'normal', 'FALSE', 'yes'],
       ['rainy', 'cool', 'normal', 'TRUE', 'no'],
       ['rainy', 'mild', 'normal', 'FALSE', 'yes'],
       ['rainy', 'mild', 'high', 'TRUE', 'no'],
       ['sunny', 'hot', 'high', 'FALSE', 'no'],
       ['sunny', 'hot', 'high', 'TRUE', 'no'],
       ['sunny', 'mild', 'high', 'FALSE', 'no'],
       ['sunny', 'cool', 'normal', 'FALSE', 'yes'],
       ['sunny', 'mild', 'normal', 'TRUE', 'yes']], dtype=object)

In [6]:
def find_entropy_whole(df):
    """Return the Shannon entropy (in bits) of the target column of ``df``.

    The target is taken to be the last column of the frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset whose last column holds the class labels.

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the distinct labels, where ``p`` is the
        relative frequency of each label. ``0`` when the frame has no rows.
    """
    labels = df[df.keys()[-1]]
    # Count every label once up front instead of re-running value_counts()
    # for each distinct value inside the loop.
    label_counts = labels.value_counts()
    n_rows = len(labels)
    overall_entropy = 0
    for value in labels.unique():
        p = label_counts[value] / n_rows
        overall_entropy += -p * np.log2(p)
    return overall_entropy

# Entropy of the target (last) column over the whole dataset
find_entropy_whole(df)

0.9402859586706311

In [7]:
def find_entropy_of_attribute(df, attribute):
    """Return the weighted entropy of the target after splitting on ``attribute``.

    For every distinct value of ``attribute`` the entropy (in bits) of the
    target column is computed on the matching rows, and those entropies are
    averaged with weights equal to each subset's share of the rows. The target
    is taken to be the last column of ``df``.

    Classes that do not occur in a subset are skipped (their contribution to
    the entropy is zero by definition) instead of being padded with a small
    epsilon, so a perfectly pure split yields exactly ``0``.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset whose last column holds the class labels.
    attribute : hashable
        Name of the column to split on.

    Returns
    -------
    float
        Non-negative weighted entropy; ``0.0`` when the frame has no rows.
    """
    target = df.keys()[-1]
    labels = df[target]
    column = df[attribute]
    total_rows = len(df)

    entropy_attribute = 0.0
    for value_in_attribute in column.unique():
        # One boolean mask selects the labels of this subset directly; no
        # chained indexing that relies on implicit index re-alignment.
        subset_labels = labels[column == value_in_attribute]
        subset_size = len(subset_labels)
        if subset_size == 0:
            # Happens for values that never compare equal to themselves (NaN).
            continue
        probabilities = subset_labels.value_counts().to_numpy() / subset_size
        # value_counts() can report zero counts (e.g. unused categories of a
        # categorical column); drop them so log2 is never evaluated at 0.
        probabilities = probabilities[probabilities > 0]
        subset_entropy = -np.sum(probabilities * np.log2(probabilities))
        entropy_attribute += (subset_size / total_rows) * subset_entropy
    return entropy_attribute

In [8]:
# Report the weighted entropy of the target for every feature column
# (all columns except the last, which holds the label).
for attribute in df.keys()[:-1]:
    attribute_entropy = find_entropy_of_attribute(df, attribute)
    print(f'Entropy of the attribute "{attribute}" is :', attribute_entropy)

Entropy of the attribute "outlook" is : 0.6935361388961914
Entropy of the attribute "temp" is : 0.9110633930116756
Entropy of the attribute "humidity" is : 0.7884504573082889
Entropy of the attribute "windy" is : 0.892158928262361


In [9]:
def find_best_attribute_to_divide(df):
    """Return the name of the attribute with the highest information gain.

    Information gain of an attribute is the entropy of the whole dataset
    minus the weighted entropy obtained after splitting on that attribute.
    Every column except the last (the target) is considered a candidate.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset whose last column holds the class labels.

    Returns
    -------
    hashable
        Column label of the best attribute. On ties the first candidate in
        column order wins, because ``np.argmax`` returns the first maximum.

    Raises
    ------
    ValueError
        Raised by ``np.argmax`` when ``df`` has no attribute columns.
    """
    all_attributes = df.keys()[:-1]
    # The dataset entropy is the same for every candidate, so compute it once
    # rather than once per attribute.
    dataset_entropy = find_entropy_whole(df)
    IG = [
        dataset_entropy - find_entropy_of_attribute(df, attribute)
        for attribute in all_attributes
    ]
    return all_attributes[np.argmax(IG)]
# Attribute with the highest information gain on the full dataset
find_best_attribute_to_divide(df)

'outlook'

In [10]:
def buildTree(df, tree=None):
    """Recursively build a decision tree as nested dictionaries.

    The frame is split on the attribute chosen by
    ``find_best_attribute_to_divide``; each value of that attribute becomes a
    branch. A branch whose rows all share one label becomes a leaf holding
    that label, otherwise the branch is expanded recursively.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset whose last column holds the class labels.
    tree : dict, optional
        Existing dictionary to add this node to. A new one is created when
        omitted.

    Returns
    -------
    dict
        ``{attribute: {value: label_or_subtree, ...}}``.
    """
    # The label column is the last one; use it instead of a hard-coded name.
    target = df.keys()[-1]
    node = find_best_attribute_to_divide(df)
    attValue = np.unique(df[node])
    if tree is None:
        tree = {}
    # setdefault so that a caller-supplied dict also gets an entry for this
    # node instead of failing with KeyError on the assignments below.
    tree.setdefault(node, {})
    for value in attValue:
        # The attribute just used is constant inside the subset, so drop it:
        # it can never be chosen again and every recursion step is guaranteed
        # to work on a strictly narrower table.
        subtable = df[df[node] == value].drop(columns=node).reset_index(drop=True)
        clValue, counts = np.unique(subtable[target], return_counts=True)
        if len(counts) == 1:  # Subset is pure: emit a leaf
            tree[node][value] = clValue[0]
        elif subtable.shape[1] == 1:
            # Only the target column is left but the labels still disagree
            # (identical attribute rows with different labels). Fall back to
            # the majority label; ties go to the first label in sorted order.
            tree[node][value] = clValue[np.argmax(counts)]
        else:
            tree[node][value] = buildTree(subtable)  # Expand the impure subset recursively
    return tree

# Build the decision tree for the full dataset; the result is a nested dict
buildTree(df)

{'outlook': {'overcast': 'yes',
  'rainy': {'windy': {'FALSE': 'yes', 'TRUE': 'no'}},
  'sunny': {'humidity': {'high': 'no', 'normal': 'yes'}}}}