In [54]:
import numpy as np
import math

from collections import Counter

In [55]:
# Toy "played football" dataset, one row per observation.
# Columns: 0 = Outlook, 1 = Temperature, 2 = Humidity, 3 = Wind,
#          4 = Played Football (the target, 'yes' or 'no').
data = [['sunny', 'hot', 'high', 'weak', 'no'],
 ['sunny', 'hot', 'high', 'strong', 'no'],
 ['overcast', 'hot', 'high', 'weak', 'yes'],
 ['rain', 'mild', 'high', 'weak', 'yes'],
 ['rain', 'cool', 'normal', 'weak', 'yes'],
 ['rain', 'cool', 'normal', 'strong', 'no'],
 ['overcast', 'cool', 'normal', 'strong', 'yes'],
 ['sunny', 'mild', 'high', 'weak', 'no'],
 ['sunny', 'cool', 'normal', 'weak', 'yes'],
 ['rain', 'mild', 'normal', 'weak', 'yes'],
 ['sunny', 'mild', 'normal', 'strong', 'yes'],
 ['overcast', 'mild', 'high', 'strong', 'yes'],
 ['overcast', 'hot', 'normal', 'weak', 'yes'],
 ['rain', 'mild', 'high', 'strong', 'no']]
# Convert to a 2-D string array so rows can be selected with boolean masks
# and columns with data[:, i].
data = np.array(data)
data.shape

(14, 5)

In [93]:
def entropy(t, f):
    """Return the Shannon entropy, in bits, of a two-class split.

    Args:
        t: Number of samples in the first class.
        f: Number of samples in the second class.

    Returns:
        0 when either count is zero (a pure split); otherwise
        -p_t * log2(p_t) - p_f * log2(p_f), where p_t and p_f are the
        class proportions.
    """
    # A pure split carries no uncertainty; returning early also avoids log2(0).
    if 0 in (t, f):
        return 0
    total = t + f
    return sum(-p * math.log2(p) for p in (t / total, f / total))

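## Classification using the ID3 algorithm

ID3 uses entropy and information gain as its metrics: at each node it splits on the feature with the highest information gain, i.e. the largest drop from the parent's entropy to the weighted average entropy of the resulting groups.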

In [57]:
# First step towards E(S), the entropy of the target variable:
# split the rows by label and count each class.
yes = data[data[:, -1] == 'yes']
no = data[data[:, -1] == 'no']
t = len(yes)
f = len(no)
t, f

(9, 5)

In [58]:
# E(S): entropy of the target over the whole dataset, from the yes/no counts above.
parent_entropy = entropy(t, f)
parent_entropy # 0.9402859586706311

0.9402859586706311

In [59]:
# Let's find the entropy for the outlook Sunny, E(Sunny).
# First, select the rows whose Outlook (column 0) is 'sunny'.
sunny = data[data[:, 0] == 'sunny']
sunny

array([['sunny', 'hot', 'high', 'weak', 'no'],
       ['sunny', 'hot', 'high', 'strong', 'no'],
       ['sunny', 'mild', 'high', 'weak', 'no'],
       ['sunny', 'cool', 'normal', 'weak', 'yes'],
       ['sunny', 'mild', 'normal', 'strong', 'yes']], dtype='<U8')

In [60]:
# Split the sunny rows by the target label and count each class.
yes = sunny[sunny[:, -1] == 'yes']
no = sunny[sunny[:, -1] == 'no']
t = len(yes)
f = len(no)
t, f

(2, 3)

In [61]:
# E(Sunny): entropy of the target within the sunny rows, using the counts above.
entropy(t, f)

0.9709505944546686

In [62]:
# Next: the entropy of the target given Outlook, E(S, Outlook).
# E(S, Outlook) is the average of the per-value entropies, each weighted by
# the share of rows that have that Outlook value.
# First, list every Outlook value together with its row count.

outlooks = list(Counter(data[:, 0]).items())
outlooks

[('sunny', 5), ('overcast', 4), ('rain', 5)]

In [107]:
def group_entropy(data, col, alg=entropy):
    """Return the weighted average impurity of the target after grouping by a column.

    Rows are grouped by their value in column ``col``. For each group the
    'yes' and 'no' labels in the last column are counted and scored with
    ``alg``; each score is weighted by the group's share of all rows.

    Args:
        data: Array indexed as ``data[:, col]``; its last column holds the
            'yes'/'no' target.
        col: Index of the column to group by.
        alg: Impurity function called as ``alg(yes_count, no_count)``.
            Defaults to ``entropy``.

    Returns:
        The sum over groups of ``alg(yes, no) * group_size / len(data)``.
    """
    n_rows = len(data)
    column = data[:, col]
    weighted = 0.0
    # Counter yields each distinct value in the column with its row count.
    for value, size in Counter(column).items():
        subset = data[column == value]
        labels = subset[:, -1]
        n_yes = len(subset[labels == 'yes'])
        n_no = len(subset[labels == 'no'])
        weighted += alg(n_yes, n_no) * size / n_rows
    return weighted

In [108]:
# E(S, Outlook): weighted average entropy after grouping by column 0 (Outlook).
group_entropy(data, col=0) # 0.6935361388961919

0.6935361388961919

In [71]:
# The information gain is the difference between the parent entropy E(S)
# and the average weighted entropy E(S, Outlook).
# E(S, Outlook) is computed here explicitly so that this cell does not rely on
# a variable that is only assigned by a later cell (which would raise a
# NameError on a fresh kernel).
average_weighted_entropy = group_entropy(data, col=0)
information_gain = parent_entropy - average_weighted_entropy
information_gain # 0.24674981977443922

0.24674981977443922

In [74]:
# Find the information gain for every feature and remember the best one.
# Read the column count from data.shape[1] directly: unpacking into `_` would
# rebind IPython's last-output variable and leave an unused global behind.
n_features = data.shape[1] - 1 # Exclude the last column, which is the target variable.
best_feature = 0
best_gain = -float('inf')

for i in range(n_features):
    # IG(S, Feature_i) = E(S) - E(S, Feature_i)
    average_weighted_entropy = group_entropy(data, col=i)
    information_gain = parent_entropy - average_weighted_entropy
    if information_gain > best_gain:
        best_gain = information_gain
        best_feature = i
    print(i, information_gain)

0 0.24674981977443922
1 0.02922256565895487
2 0.15183550136234159
3 0.04812703040826949


In [75]:
# Column 0 (Outlook) should come out as the feature with the highest
# information gain, so Outlook will form the root node.
best_feature, best_gain

(0, 0.24674981977443922)

In [76]:
#        Outlook
#    /      |     \
# sunny overcast rain

In [81]:
# Now let's find the nodes below Outlook sunny.
# The parent entropy E(Sunny) is the entropy of the target within the sunny
# rows. Compute it directly from the yes/no counts instead of grouping on
# column 0, which only gives the right answer because that column is constant
# in this subset.
outlook_sunny = data[data[:,0] == 'sunny']
sunny_yes = outlook_sunny[outlook_sunny[:,-1] == 'yes']
sunny_no = outlook_sunny[outlook_sunny[:,-1] == 'no']
sunny_parent_entropy = entropy(len(sunny_yes), len(sunny_no))
sunny_parent_entropy

0.9709505944546686

In [86]:
# Outlook (column 0) is already used at the root, so the candidates start at
# column 1. Index 0 of `labels` is a placeholder so that labels[i] lines up
# with column i.
labels = ['', 'IG(sunny,Temperature)', 'IG(sunny,Humidity)', 'IG(sunny,Windy)']

best_feature = 0
best_gain = float('-inf')

for i in range(1, n_features):
    # IG(Sunny, Feature_i) = E(Sunny) - E(Sunny, Feature_i)
    average_weighted_entropy = group_entropy(outlook_sunny, i)
    information_gain = sunny_parent_entropy - average_weighted_entropy
    if information_gain > best_gain:
        best_feature, best_gain = i, information_gain
    print(labels[i], information_gain)

IG(sunny,Temperature) 0.5709505944546686
IG(sunny,Humidity) 0.9709505944546686
IG(sunny,Windy) 0.01997309402197478


In [87]:
# Column 2 (Humidity) has the highest information gain among the sunny rows,
# so Humidity becomes the node under the sunny branch.
best_feature, best_gain

(2, 0.9709505944546686)

In [88]:
#        Outlook
#    /      |     \
# sunny overcast rain
#  /
# Humidity

In [91]:
# Distinct Humidity values (column 2) that occur among the sunny rows.
humidity_groups = set(outlook_sunny[:,2])
humidity_groups

{'high', 'normal'}

In [92]:
# A branch is a terminal (leaf) node when the entropy of its target is 0,
# i.e. every row in the branch has the same label.
# Measure the entropy of the target itself: grouping by the target column
# would always report 0, because each such group is pure by construction.
# Sort the values so the printed order is the same on every run.
for group in sorted(humidity_groups):
    humidity = outlook_sunny[outlook_sunny[:,2] == group]
    n_yes = len(humidity[humidity[:,-1] == 'yes'])
    n_no = len(humidity[humidity[:,-1] == 'no'])
    print(group, float(entropy(n_yes, n_no)))

high 0.0
normal 0.0


In [94]:
#        Outlook
#    /      |     \
# sunny overcast rain
#  /
# Humidity
#  /  \ 
# high normal

## Classification using CART

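CART uses Gini impurity instead of entropy. Rather than maximizing information gain (the reduction in entropy), we split on the feature with the highest Gini gain, i.e. the largest reduction from the parent's Gini impurity to the weighted average Gini impurity of the groups.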

In [105]:
def gini(t, f):
    """Return the Gini impurity of a two-class split.

    Args:
        t: Number of samples in the first class.
        f: Number of samples in the second class.

    Returns:
        0 when either count is zero; otherwise 1 - p_t**2 - p_f**2, where
        p_t and p_f are the class proportions.
    """
    # A split with an empty class has no impurity; returning early also
    # avoids dividing by zero when both counts are 0.
    if t == 0 or f == 0:
        return 0
    total = t + f
    p_t = t / total
    p_f = f / total
    return 1 - p_t ** 2 - p_f ** 2

In [106]:
# Sanity check using the class counts of the full dataset (9 yes, 5 no).
gini(9, 5) # 0.4591836734693877

0.4591836734693877

In [113]:
# Gini impurity of the target over the whole dataset (the parent node).
yes, no = data[data[:, -1] == 'yes'], data[data[:, -1] == 'no']
t = len(yes)
f = len(no)
parent_gini = gini(t, f)
parent_gini

0.4591836734693877

In [115]:
# Gini gain of a feature = parent Gini impurity - weighted Gini impurity of
# its groups. group_entropy does the weighting; passing gini swaps the metric.
# The feature with the highest Gini gain wins.
best_feature = 0
best_gain = float('-inf')
for i in range(n_features):
    group_gini = group_entropy(data, i, alg=gini)
    gini_gain = parent_gini - group_gini
    if gini_gain > best_gain:
        best_feature, best_gain = i, gini_gain
    print(i, gini_gain)

0 0.11632653061224485
1 0.01870748299319719
2 0.09183673469387743
3 0.030612244897959162


In [116]:
# Column 0 (Outlook) has the highest Gini gain, so CART also picks Outlook
# for the root split.
best_feature, best_gain

(0, 0.11632653061224485)

## References
https://medium.com/datadriveninvestor/decision-tree-algorithm-with-hands-on-example-e6c2afb40d38