<a href="https://colab.research.google.com/github/YujiaLIAO-1/housing/blob/main/id3_simple_starter.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Entropy
Entropy is defined as:

$$\sum_{i=1}^n -p(x_i)\log_2 p(x_i)$$

where $p(x_i)$ is the probability of class $x_i$ in the data

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('seaborn-v0_8')

In [5]:
df = pd.read_csv('golf-id3.csv')

In [None]:
df

Unnamed: 0,outlook,temp,humidity,windy,play
0,overcast,hot,high,False,yes
1,overcast,cool,normal,True,yes
2,overcast,mild,high,True,yes
3,overcast,hot,normal,False,yes
4,rainy,hot,high,False,no
5,rainy,hot,high,True,no
6,rainy,mild,high,False,no
7,rainy,cool,normal,False,yes
8,rainy,mild,normal,True,yes
9,sunny,cool,normal,True,no


In [6]:
# target feature "play"
df.play.value_counts()

Unnamed: 0_level_0,count
play,Unnamed: 1_level_1
yes,9
no,5


In [9]:
# entropy for target (play): E(play)
# play has (yes, no) values
# P(yes) is the probability of yes for play
# P(no) is the probability of no for play
# E(play) = E(9, 5)= -P(yes)*log2(P(yes)) - P(no)*log2(P(no))

e_play = - 5/14 * np.log2(5/14) - 9/14 * np.log2(9/14)
e_play

0.9402859586706311

In [7]:
# E(play, outlook)
# frequency table
# .to_frame() changes the result to dataframe for better formatting
df[['outlook', 'play']].groupby('outlook').value_counts().to_frame(name='count')

Unnamed: 0_level_0,Unnamed: 1_level_0,count
outlook,play,Unnamed: 2_level_1
overcast,yes,4
rainy,no,3
rainy,yes,2
sunny,yes,3
sunny,no,2


In [8]:
# given outlook, entropy for target (play)
# Entropy(play, outlook) means Entropy of play given outlook
# P
# E(play, outlook) = P(sunny)E(3, 2) + P(overcast)E(4, 0) + P(rainy)E(2, 3)
# E(play, outlook) = 5/14 * E(3, 2) + P(overcast)E(4, 0) + P(rainy)E(2, 3)

e_3_2 = (-3/5 * np.log2(3/5)) + (- 2/5 * np.log2(2/5))
e_4_0 = 0  # pure - entropy is 0, half/half entropy is 1
e_2_3 = e_3_2

e_play_outlook = 5/14 * e_3_2 + 4/14 * e_4_0 + 5/14 * e_2_3
print('Entropy(play, outlook):', e_play_outlook)

# information gain
gain_play_outlook = e_play - e_play_outlook
print('Info Gain from splitting using outlook:', gain_play_outlook)


Entropy(play, outlook): 0.6935361388961918
Info Gain from splitting using outlook: 0.24674981977443933


In [None]:
# determine which feature to use to split the tree using Information Gain

In [11]:
# E(play, temp)
df[['temp', 'play']].groupby('temp').value_counts().to_frame(name='count')

Unnamed: 0_level_0,Unnamed: 1_level_0,count
temp,play,Unnamed: 2_level_1
cool,yes,3
cool,no,1
hot,no,2
hot,yes,2
mild,yes,4
mild,no,2


In [12]:
# E(play, temp) = P(cool)E(3, 1) + P(hot)E(2, 2) + P(mild)E(4, 2)
e_3_1 = (-3/4 * np.log2(3/4)) + (- 1/4 * np.log2(1/4))
e_2_2 = 1  # pure - entropy is 0, half/half entropy is 1
e_4_2 = (-4/6 * np.log2(4/6)) + (- 2/6 * np.log2(2/6))

e_play_temp = 4/14 * e_3_1 + 4/14 * e_2_2 + 6/14 * e_4_2
print('Entropy(play, temp):', e_play_temp)

# information gain
gain_play_temp = e_play - e_play_temp
print('Info Gain from splitting using temp:', gain_play_temp)


Entropy(play, temp): 0.9110633930116763
Info Gain from splitting using temp: 0.02922256565895487


In [13]:
# E(play, humidity)
df[['humidity', 'play']].groupby('humidity').value_counts().to_frame(name='count')

Unnamed: 0_level_0,Unnamed: 1_level_0,count
humidity,play,Unnamed: 2_level_1
high,no,4
high,yes,3
normal,yes,6
normal,no,1


In [14]:
# E(play, humidity) = P(high)E(3, 4) + P(normal)E(6, 1)
e_3_4 = (-3/7 * np.log2(3/7)) + (- 4/7 * np.log2(4/7))
e_6_1 =(-6/7 * np.log2(6/7)) + (- 1/7 * np.log2(1/7))

e_play_humidity = 7/14 * e_3_4 + 7/14 * e_6_1
print('Entropy(play, humidity):', e_play_humidity)

# information gain
gain_play_humidity = e_play - e_play_humidity
print('Info Gain from splitting using humidity:', gain_play_humidity)


Entropy(play, humidity): 0.7884504573082896
Info Gain from splitting using humidity: 0.15183550136234159


In [15]:
# E(play, windy)

df[['windy', 'play']].groupby('windy').value_counts().to_frame(name='count')

Unnamed: 0_level_0,Unnamed: 1_level_0,count
windy,play,Unnamed: 2_level_1
False,yes,6
False,no,2
True,no,3
True,yes,3


In [16]:
# E(play, windy) = P(false)E(6, 2) + P(true)E(3, 3)
e_6_2 = (-6/8 * np.log2(6/8)) + (- 2/8 * np.log2(2/8))
e_3_3 = 1

e_play_windy = 8/14 * e_6_2 + 6/14 * e_3_3
print('Entropy(play, windy):', e_play_windy)

# information gain
gain_play_windy = e_play - e_play_windy
print('Info Gain from splitting using windy:', gain_play_windy)


Entropy(play, windy): 0.8921589282623617
Info Gain from splitting using windy: 0.04812703040826949


- ***Outlook* should be the first decision node**


In [25]:
# sunny and rainy need further spliting


# sunny 的条件下：
# E(play, temp)
df_sunny = df[df['outlook'] == 'sunny']
df_sunny[['temp', 'play']].groupby('temp').value_counts().to_frame(name='count')



Unnamed: 0_level_0,Unnamed: 1_level_0,count
temp,play,Unnamed: 2_level_1
cool,no,1
cool,yes,1
mild,yes,2
mild,no,1


In [28]:
# E(play, temp) = P(cool)E(1, 1) + P(mild)E(2, 1)
e_1_1 = 1
e_2_1 = (-2/3 * np.log2(2/3)) + (- 1/3 * np.log2(1/3))

e_play_temp = 2/5 * e_1_1 + 3/5 * e_2_1
print('Entropy(play, temp):', e_play_temp)

# information gain
gain_play_temp = e_play - e_play_temp
print('Info Gain from splitting using temp:', gain_play_temp)

Entropy(play, temp): 0.9509775004326937
Info Gain from splitting using temp: -0.01069154176206255


In [27]:
# E(play, humidity)
df_sunny = df[df['outlook'] == 'sunny']
df_sunny[['humidity', 'play']].groupby('humidity').value_counts().to_frame(name='count')



Unnamed: 0_level_0,Unnamed: 1_level_0,count
humidity,play,Unnamed: 2_level_1
high,no,1
high,yes,1
normal,yes,2
normal,no,1


In [29]:
# E(play, humidity) = P(high)E(1, 1) + P(normal)E(2, 1)
e_1_1 = 1
e_2_1 =(- 2/3 * np.log2(2/3)) + (- 1/3 * np.log2(1/3))

e_play_humidity = 2/5 * e_1_1 + 3/5 * e_2_1
print('Entropy(play, humidity):', e_play_humidity)

# information gain
gain_play_humidity = e_play - e_play_humidity
print('Info Gain from splitting using humidity:', gain_play_humidity)


Entropy(play, humidity): 0.9509775004326937
Info Gain from splitting using humidity: -0.01069154176206255


In [26]:
# E(play, windy)
df_sunny = df[df['outlook'] == 'sunny']
df_sunny[['windy', 'play']].groupby('windy').value_counts().to_frame(name='count')


Unnamed: 0_level_0,Unnamed: 1_level_0,count
windy,play,Unnamed: 2_level_1
False,yes,3
True,no,2


- ***Windy* is chosen to be the next decision node**

In [30]:
# rainy 的条件下：
# E(play, temp)
df_rainy = df[df['outlook'] == 'rainy']
df_rainy[['temp', 'play']].groupby('temp').value_counts().to_frame(name='count')

Unnamed: 0_level_0,Unnamed: 1_level_0,count
temp,play,Unnamed: 2_level_1
cool,yes,1
hot,no,2
mild,no,1
mild,yes,1


In [31]:
# E(play, humidity)
df_rainy = df[df['outlook'] == 'rainy']
df_rainy[['humidity', 'play']].groupby('humidity').value_counts().to_frame(name='count')


Unnamed: 0_level_0,Unnamed: 1_level_0,count
humidity,play,Unnamed: 2_level_1
high,no,3
normal,yes,2


- ***Humidity* is chosen to be the third decision node**

In [32]:
# E(play, windy)
df_rainy = df[df['outlook'] == 'rainy']
df_rainy[['windy', 'play']].groupby('windy').value_counts().to_frame(name='count')


Unnamed: 0_level_0,Unnamed: 1_level_0,count
windy,play,Unnamed: 2_level_1
False,no,2
False,yes,1
True,no,1
True,yes,1


In all, to split the tree using Information Gain, the first feature is **outlook** (divided by *sunny*, *overcast*, *rainy*), the second feature is **windy** (decision node of *sunny*), the third feature is **humidity** (decision node of *rainy*).