In [384]:
import math
from collections import Counter
from functools import partial, reduce

# https://www.saedsayad.com/decision_tree.htm

In [367]:
# Outlook, Temperature, Humidity, Windy, Play Golf.
OUTLOOK = 0
TEMPERATURE = 1
HUMIDITY = 2
WINDY = 3
PLAY_GOLF = -1

data = """rainy hot high false no
rainy hot high true no
overcast hot high false yes
sunny mild high false yes
sunny cool normal false yes
sunny cool normal true no
overcast cool normal true yes
rainy mild high false no
rainy cool normal false yes
sunny mild normal false yes
rainy mild normal true yes
overcast mild high true yes
overcast hot normal false yes
sunny mild high true no"""

data = list(map(lambda s: s.split(' '), data.split('\n')))
data[:2]

[['rainy', 'hot', 'high', 'false', 'no'],
 ['rainy', 'hot', 'high', 'true', 'no']]

In [368]:
math.log(5) / math.log(2) == math.log2(5)

True

In [369]:
## Entropy

def entropy(t, f = 0):
    if f == 0: return 0
    """Calculates the entropy given the number of true and false instances."""
    n = t + f # The total instances.
    fn = lambda p: -p * math.log2(p) 
    return sum(map(fn, [t / n, f / n]))

In [370]:
def partition_by(data, col, val):
    result = [], []
    for row in data:
        result[0 if row[col] == val else 1].append(row)
    return result

In [371]:
def counter(data, col=-1):
    labels = [row[col] for row in data]
    return list(Counter(labels).items())

In [372]:
def pick(tup, col):
    """Picks the value from the given tuple."""
    return tup[col]

In [373]:
# Example using the frequency table of one attribute.
# Partition by the last column "Play Golf" with the value "yes" and "no".
# yes, no = partition_by(data, -1, 'yes')
# len(yes), len(no)
labels = counter(data)
labels

[('no', 5), ('yes', 9)]

In [374]:
# Handy method to select the second value of the tuple.
pick_value = partial(pick, col=1)

# To find the entropy of the target, E(PlayGold) = E(9, 5).
entropy_target = entropy(*map(pick_value, labels)) # entropy([9, 5]), # 0.9402859586706311
entropy_target

0.9402859586706311

In [375]:
def entropy_for(data, col):
    total = 0.0
    n = len(data)
    attributes = counter(data, col)
    for attribute, _ in attributes:
        o, _ = partition_by(data, col, attribute)
        print(f'{attribute:10s}: {counter(o)}')
        total += entropy(*map(pick_value, counter(o))) * len(o) / n
    return total

In [376]:
entropy_for(data, OUTLOOK)

rainy     : [('no', 3), ('yes', 2)]
overcast  : [('yes', 4)]
sunny     : [('yes', 3), ('no', 2)]


0.6935361388961919

In [377]:
entropy_for(data, HUMIDITY)

high      : [('no', 4), ('yes', 3)]
normal    : [('yes', 6), ('no', 1)]


0.7884504573082896

In [378]:
entropy_for(data, TEMPERATURE)

hot       : [('no', 2), ('yes', 2)]
mild      : [('yes', 4), ('no', 2)]
cool      : [('yes', 3), ('no', 1)]


0.9110633930116763

In [379]:
entropy_for(data, WINDY)

false     : [('no', 2), ('yes', 6)]
true      : [('no', 3), ('yes', 3)]


0.8921589282623617

In [380]:
entropy_for(data, PLAY_GOLF) # The target entropy has to be calculated separately.

no        : [('no', 5)]
yes       : [('yes', 9)]


0.0

## Information Gain

The information gained is based on the decrease in entropy after a dataset is split on an attribute. The branch is divided based on the attribute that returns the highest information gain. This means the attribute with the higher information gain is preferred.

In [381]:
# Gain(T, X) = Entropy(T) - Entropy(T, X)
# G(PlayGolf, Outlook) = E(PlayGolf) - E(PlayGolf, Outlook)
entropy_target - entropy_for(data, OUTLOOK) # 0.24674981977443922

rainy     : [('no', 3), ('yes', 2)]
overcast  : [('yes', 4)]
sunny     : [('yes', 3), ('no', 2)]


0.24674981977443922

In [418]:
# Information Gain.
# (('outlook', 0.24674981977443922),
#  ('humidity', 0.15183550136234159),
#  ('temperature', 0.02922256565895487),
#  ('windy', 0.04812703040826949))
labels = ['outlook', 'humidity', 'temperature', 'windy']
for i in range(4):
    print(labels[i],':', entropy_target - entropy_for(data, i))
    print()

rainy     : [('no', 3), ('yes', 2)]
overcast  : [('yes', 4)]
sunny     : [('yes', 3), ('no', 2)]
outlook : 0.24674981977443922

hot       : [('no', 2), ('yes', 2)]
mild      : [('yes', 4), ('no', 2)]
cool      : [('yes', 3), ('no', 1)]
humidity : 0.02922256565895487

high      : [('no', 4), ('yes', 3)]
normal    : [('yes', 6), ('no', 1)]
temperature : 0.15183550136234159

false     : [('no', 2), ('yes', 6)]
true      : [('no', 3), ('yes', 3)]
windy : 0.04812703040826949



## Gini Index

Gini index is a metric to measure how often a randomly chosen element would be incorrectly identified. Attributes with a lower gini index is preferred.

In [421]:
def gini(t, f=0):
    if f == 0: return 0
    fn = lambda p: p ** 2
    n = t + f
    return 1 - fn(t / n) - fn(f / n)

In [422]:
gini(5, 7)

0.486111111111111

In [409]:
def gini_for(data, col):
    total = 0.0
    n = len(data)
    attributes = counter(data, col)
    for attribute, _ in attributes:
        o, _ = partition_by(data, col, attribute)
        print(f'{attribute:10s}: {counter(o)}')
        total += gini(*map(pick_value, counter(o))) * len(o) / n
    return total

In [414]:
for i in range(4):
    print(gini_for(data, i))
    print()

rainy     : [('no', 3), ('yes', 2)]
overcast  : [('yes', 4)]
sunny     : [('yes', 3), ('no', 2)]
0.34285714285714286

hot       : [('no', 2), ('yes', 2)]
mild      : [('yes', 4), ('no', 2)]
cool      : [('yes', 3), ('no', 1)]
0.4404761904761905

high      : [('no', 4), ('yes', 3)]
normal    : [('yes', 6), ('no', 1)]
0.3673469387755103

false     : [('no', 2), ('yes', 6)]
true      : [('no', 3), ('yes', 3)]
0.42857142857142855

