$Entropy(S) = \displaystyle \sum_{i=1}^{m} - p_i . log_2(p_i)$

$Gain(S, A) = Entropy(S) - \displaystyle \sum_{v_1}^{v_n} \frac{|S_v|}{|S|} Entropy(S_v)$

In [1]:
import math
import pandas as pd
import numpy as np
from sklearn import tree
from chefboost import Chefboost as chef

In [2]:
csv_file_path = "table.csv"
ds = pd.read_csv(csv_file_path)
dfs = pd.DataFrame(ds)
total = dfs.shape[0]
print(type(dfs), total)
dfs

<class 'pandas.core.frame.DataFrame'> 14


Unnamed: 0,age,income,student,credit_rating,buys_computer
0,youth,high,no,fair,no
1,youth,high,no,excellent,no
2,middleaged,high,no,fair,yes
3,senior,medium,no,fair,yes
4,senior,low,yes,fair,yes
5,senior,low,yes,excellent,no
6,middleaged,low,yes,excellent,yes
7,youth,medium,no,fair,no
8,youth,low,yes,fair,yes
9,senior,medium,yes,fair,yes


Entropy(S) = ∑ – p(i) . log2p(i)

Gain(S, A) = Entropy(S) – ∑ (p(S|A) . Entropy(S|A))

---
Gain(buys_computer, **credit_rating**) = Entropy(buys_computer) – ∑ (p(buys_computer|credit_rating) . Entropy(buys_computer|credit_rating))

Gain(buys_computer, **student**) = Entropy(buys_computer) – ∑ (p(buys_computer|student) . Entropy(buys_computer|student))

Gain(buys_computer, **income**) = Entropy(buys_computer) – ∑ (p(buys_computer|income) . Entropy(buys_computer|income))

Gain(buys_computer, **age**) = Entropy(buys_computer) – ∑ (p(buys_computer|age) . Entropy(buys_computer|age))

In [3]:
result = dict()
gains = dict()

In [4]:
print("buys_computer : yes?")
p_yes = dfs['buys_computer'].value_counts()['yes']
print("yes:", p_yes)
p_yes = p_yes / total

p_no = dfs['buys_computer'].value_counts()['no']
print("no:", p_no)
p_no = p_no / total

entropy_buys_computer = - p_yes * np.log(p_yes) - p_no * np.log(p_no)
print('Entropy(buys_computer) =', entropy_buys_computer)
result['buys_computer'] = entropy_buys_computer

buys_computer : yes?
yes: 9
no: 5
Entropy(buys_computer) = 0.6517565611726531


In [5]:
print("credit_rating : fair ?")

fair = dfs[(dfs['credit_rating'] == 'fair')]
print("fair: ", fair.shape[0])

fair_yes = dfs[(dfs['credit_rating'] == 'fair') & (dfs['buys_computer'] == 'yes')]
p_fair_yes = fair_yes.shape[0]
fair_no = dfs[(dfs['credit_rating'] == 'fair') & (dfs['buys_computer'] == 'no')]
p_fair_no = fair_no.shape[0]

print("yes:", p_fair_yes)
p_fair_yes = p_fair_yes / total
print("no:", p_fair_no)
p_fair_no = p_fair_no / total

entropy_credit_rating_fair = - p_fair_yes * np.log(p_fair_yes) - p_fair_no * np.log(p_fair_no)
print('Entropy(credit_rating=fair) =', entropy_credit_rating_fair)
result['credit_rating_fair'] = entropy_credit_rating_fair
fair

credit_rating : fair ?
fair:  8
yes: 6
no: 2
Entropy(credit_rating=fair) = 0.6411148186024177


Unnamed: 0,age,income,student,credit_rating,buys_computer
0,youth,high,no,fair,no
2,middleaged,high,no,fair,yes
3,senior,medium,no,fair,yes
4,senior,low,yes,fair,yes
7,youth,medium,no,fair,no
8,youth,low,yes,fair,yes
9,senior,medium,yes,fair,yes
12,middleaged,high,yes,fair,yes


In [6]:
print("credit_rating : excellent ?")

excellent = dfs[(dfs['credit_rating'] == 'excellent')]
print("excellent: ", excellent.shape[0])
excellent
excellent_yes = dfs[(dfs['credit_rating'] == 'excellent') & (dfs['buys_computer'] == 'yes')]
p_excellent_yes = excellent_yes.shape[0]
excellent_no = dfs[(dfs['credit_rating'] == 'excellent') & (dfs['buys_computer'] == 'no')]
p_excellent_no = excellent_no.shape[0]

print("yes:", p_excellent_yes)
p_excellent_yes = p_excellent_yes / total
print("no:", p_excellent_no)
p_excellent_no = p_excellent_no / total

entropy_credit_rating_excellent = - p_excellent_yes * np.log(p_excellent_yes) - p_excellent_no * np.log(p_excellent_no)
print('Entropy(credit_rating=excellent) =', entropy_credit_rating_excellent)
result['credit_rating_excellent'] = entropy_credit_rating_excellent
excellent

credit_rating : excellent ?
excellent:  6
yes: 3
no: 3
Entropy(credit_rating=excellent) = 0.6601907318344924


Unnamed: 0,age,income,student,credit_rating,buys_computer
1,youth,high,no,excellent,no
5,senior,low,yes,excellent,no
6,middleaged,low,yes,excellent,yes
10,youth,medium,yes,excellent,yes
11,middleaged,medium,no,excellent,yes
13,senior,medium,no,excellent,no


Entropy(buys_computer|credit_rating=fair) = – p(No) . log2p(No) – p(Yes) . log2p(Yes)

= – (2/8) . log2(2/8) – (6/8) . log2(6/8) = 0.6411148186024177

---
Entropy(buys_computer|credit_rating=excellent) = – p(No) . log2p(No) – p(Yes) . log2p(Yes)

= – (3/6) . log2(3/6) – (3/6) . log2(3/6) = 0.6601907318344924

**Gain(buys_computer, credit_rating)** = Entropy(buys_computer)

 – (p(buys_computer|credit_rating=fair) * Entropy(buys_computer|credit_rating=fair))
 
 – (p(buys_computer|credit_rating=excellent) * Entropy(buys_computer|credit_rating=excellent))

In [7]:
gain_buys_computer_credit_rating = 0.651 - ((8/14) * 0.641) - ((6/14) * 0.660)
print("Gain(buys_computer, credit_rating) = ", gain_buys_computer_credit_rating)
gains['gain_buys_computer_credit_rating'] = gain_buys_computer_credit_rating

Gain(buys_computer, credit_rating) =  0.0018571428571428905


In [8]:
print("student : yes ?")

yes = dfs[(dfs['student'] == 'yes')]
print("yes: ", yes.shape[0])

yes_yes = dfs[(dfs['student'] == 'yes') & (dfs['buys_computer'] == 'yes')]
p_yes_yes = yes_yes.shape[0]
yes_no = dfs[(dfs['student'] == 'yes') & (dfs['buys_computer'] == 'no')]
p_yes_no = yes_no.shape[0]

print("yes:", p_yes_yes)
p_yes_yes = p_yes_yes / total
print("no:", p_yes_no)
p_yes_no = p_yes_no / total

entropy_student_yes = - p_yes_yes * np.log(p_yes_yes) - p_yes_no * np.log(p_yes_no)
print('Entropy(student=yes) =', entropy_student_yes)
result['student_yes'] = entropy_student_yes
yes

student : yes ?
yes:  7
yes: 6
no: 1
Entropy(student=yes) = 0.5516317494241771


Unnamed: 0,age,income,student,credit_rating,buys_computer
4,senior,low,yes,fair,yes
5,senior,low,yes,excellent,no
6,middleaged,low,yes,excellent,yes
8,youth,low,yes,fair,yes
9,senior,medium,yes,fair,yes
10,youth,medium,yes,excellent,yes
12,middleaged,high,yes,fair,yes


In [9]:
print("student : no ?")

no = dfs[(dfs['student'] == 'no')]
print("no: ", no.shape[0])

no_yes = dfs[(dfs['student'] == 'no') & (dfs['buys_computer'] == 'yes')]
p_no_yes = no_yes.shape[0]
no_no = dfs[(dfs['student'] == 'no') & (dfs['buys_computer'] == 'no')]
p_no_no = no_no.shape[0]

print("yes:", p_no_yes)
p_no_yes = p_no_yes / total
print("no:", p_no_no)
p_no_no = p_no_no / total

entropy_student_no = - p_no_yes * np.log(p_no_yes) - p_no_no * np.log(p_no_no)
print('Entropy(student=no) =', entropy_student_no)
result['student_no'] = entropy_student_no
no

student : no ?
no:  7
yes: 3
no: 4
Entropy(student=no) = 0.6880276426302085


Unnamed: 0,age,income,student,credit_rating,buys_computer
0,youth,high,no,fair,no
1,youth,high,no,excellent,no
2,middleaged,high,no,fair,yes
3,senior,medium,no,fair,yes
7,youth,medium,no,fair,no
11,middleaged,medium,no,excellent,yes
13,senior,medium,no,excellent,no


Entropy(buys_computer|student=yes) = – p(No) . log2p(No) – p(Yes) . log2p(Yes)

= – (1/7) . log2(1/7) – (6/7) . log2(6/7) = 0.5516317494241771

---
Entropy(buys_computer|student=no) = – p(No) . log2p(No) – p(Yes) . log2p(Yes)

= – (4/7) . log2(4/7) – (3/7) . log2(3/7) = 0.6880276426302085

**Gain(buys_computer, student)** = Entropy(buys_computer)

 – (p(buys_computer|student=yes) * Entropy(buys_computer|student=yes))
 
 – (p(buys_computer|student=no) * Entropy(buys_computer|student=no))


In [10]:
gain_buys_computer_student = 0.940 - ((7/14) * 0.551) - ((7/14) * 0.688)
print("Gain(buys_computer, student) = ", gain_buys_computer_student)
gains['gain_buys_computer_student'] = gain_buys_computer_student

Gain(buys_computer, student) =  0.3204999999999999


In [11]:
print("income : low ?")

low = dfs[(dfs['income'] == 'low')]
print("low: ", low.shape[0])

low_yes = dfs[(dfs['income'] == 'low') & (dfs['buys_computer'] == 'yes')]
p_low_yes = low_yes.shape[0]
low_no = dfs[(dfs['income'] == 'low') & (dfs['buys_computer'] == 'no')]
p_low_no = low_no.shape[0]

print("yes:", p_low_yes)
p_low_yes = p_low_yes / total
print("no:", p_low_no)
p_low_no = p_low_no / total

entropy_income_low = - p_low_yes * np.log(p_low_yes) - p_low_no * np.log(p_low_no)
print('Entropy(income=low) =', entropy_income_low)
result['income_low'] = entropy_income_low
low

income : low ?
low:  4
yes: 3
no: 1
Entropy(income=low) = 0.5185994608897646


Unnamed: 0,age,income,student,credit_rating,buys_computer
4,senior,low,yes,fair,yes
5,senior,low,yes,excellent,no
6,middleaged,low,yes,excellent,yes
8,youth,low,yes,fair,yes


In [12]:
print("income : medium ?")

medium = dfs[(dfs['income'] == 'medium')]
print("medium: ", medium.shape[0])

medium_yes = dfs[(dfs['income'] == 'medium') & (dfs['buys_computer'] == 'yes')]
p_medium_yes = medium_yes.shape[0]
medium_no = dfs[(dfs['income'] == 'medium') & (dfs['buys_computer'] == 'no')]
p_medium_no = medium_no.shape[0]

print("yes:", p_medium_yes)
p_medium_yes = p_medium_yes / total
print("no:", p_medium_no)
p_medium_no = p_medium_no / total

entropy_income_medium = - p_medium_yes * np.log(p_medium_yes) - p_medium_no * np.log(p_medium_no)
print('Entropy(income=medium) =', entropy_income_medium)
result['income_medium'] = entropy_income_medium
medium

income : medium ?
medium:  6
yes: 4
no: 2
Entropy(income=medium) = 0.6359194408637213


Unnamed: 0,age,income,student,credit_rating,buys_computer
3,senior,medium,no,fair,yes
7,youth,medium,no,fair,no
9,senior,medium,yes,fair,yes
10,youth,medium,yes,excellent,yes
11,middleaged,medium,no,excellent,yes
13,senior,medium,no,excellent,no


In [13]:
print("income : high ?")

high = dfs[(dfs['income'] == 'high')]
print("high: ", high.shape[0])

high_yes = dfs[(dfs['income'] == 'high') & (dfs['buys_computer'] == 'yes')]
p_high_yes = high_yes.shape[0]
high_no = dfs[(dfs['income'] == 'high') & (dfs['buys_computer'] == 'no')]
p_high_no = high_no.shape[0]

print("yes:", p_high_yes)
p_high_yes = p_high_yes / total
print("no:", p_high_no)
p_high_no = p_high_no / total

entropy_income_high = - p_high_yes * np.log(p_high_yes) - p_high_no * np.log(p_high_no)
print('Entropy(income=high) =', entropy_income_high)
result['income_high'] = entropy_income_high
high

income : high ?
high:  4
yes: 2
no: 2
Entropy(income=high) = 0.555974328301518


Unnamed: 0,age,income,student,credit_rating,buys_computer
0,youth,high,no,fair,no
1,youth,high,no,excellent,no
2,middleaged,high,no,fair,yes
12,middleaged,high,yes,fair,yes


Entropy(buys_computer|income=low) = – p(No) . log2p(No) – p(Yes) . log2p(Yes)

= – (1/4) . log2(1/4) – (3/4) . log2(3/4) = 0.5185994608897646

---
Entropy(buys_computer|income=medium) = – p(No) . log2p(No) – p(Yes) . log2p(Yes)

= – (2/6) . log2(2/6) – (4/6) . log2(4/6) = 0.6359194408637213

---
Entropy(buys_computer|income=high) = – p(No) . log2p(No) – p(Yes) . log2p(Yes)

= – (2/4) . log2(2/4) – (2/4) . log2(2/4) = 0.555974328301518

**Gain(buys_computer, income)** = Entropy(buys_computer)

 – (p(buys_computer|income=low) * Entropy(buys_computer|income=low))
 
 – (p(buys_computer|income=medium) * Entropy(buys_computer|income=medium))
 
 – (p(buys_computer|income=high) * Entropy(buys_computer|income=high))

In [14]:
gain_buys_computer_income = 0.940 - ((4/14) * 0.518) - ((6/14) * 0.635) - ((4/14) * 0.555)
print("Gain(buys_computer, income) = ", gain_buys_computer_income)
gains['gain_buys_computer_income'] = gain_buys_computer_income

Gain(buys_computer, income) =  0.3612857142857142


In [15]:
print("age : youth ?")

youth = dfs[(dfs['age'] == 'youth')]
print("youth: ", youth.shape[0])

youth_yes = dfs[(dfs['age'] == 'youth') & (dfs['buys_computer'] == 'yes')]
p_youth_yes = youth_yes.shape[0]
youth_no = dfs[(dfs['age'] == 'youth') & (dfs['buys_computer'] == 'no')]
p_youth_no = youth_no.shape[0]

print("yes:", p_youth_yes)
p_youth_yes = p_youth_yes / total
print("no:", p_youth_no)
p_youth_no = p_youth_no / total

entropy_age_youth = - p_youth_yes * np.log(p_youth_yes) - p_youth_no * np.log(p_youth_no)
print('Entropy(age=youth) =', entropy_age_youth)
result['age_youth'] = entropy_age_youth
youth

age : youth ?
youth:  5
yes: 2
no: 3
Entropy(age=youth) = 0.6080825300680053


Unnamed: 0,age,income,student,credit_rating,buys_computer
0,youth,high,no,fair,no
1,youth,high,no,excellent,no
7,youth,medium,no,fair,no
8,youth,low,yes,fair,yes
10,youth,medium,yes,excellent,yes


In [17]:
print("age : middleaged ?")

middleaged = dfs[(dfs['age'] == 'middleaged')]
print("middleaged: ", middleaged.shape[0])

middleaged_yes = dfs[(dfs['age'] == 'middleaged') & (dfs['buys_computer'] == 'yes')]
p_middleaged_yes = middleaged_yes.shape[0]
middleaged_no = dfs[(dfs['age'] == 'middleaged') & (dfs['buys_computer'] == 'no')]
p_middleaged_no = middleaged_no.shape[0]

print("yes:", p_middleaged_yes)
p_middleaged_yes = p_middleaged_yes / total
print("no:", p_middleaged_no)
p_middleaged_no = p_middleaged_no / total

if p_middleaged_yes == 0:
    entropy_age_middleaged =- p_middleaged_no * np.log(p_middleaged_no)
elif p_middleaged_no == 0:
    entropy_age_middleaged = - p_middleaged_yes * np.log(p_middleaged_yes)
else:
    entropy_age_middleaged = - p_middleaged_yes * np.log(p_middleaged_yes) - p_middleaged_no * np.log(p_middleaged_no)
    
print('Entropy(age=middle) =', entropy_age_middleaged)
result['age_middle'] = entropy_age_middleaged
middleaged

age : middleaged ?
middleaged:  4
yes: 4
no: 0
Entropy(age=middle) = 0.3579322767129623


Unnamed: 0,age,income,student,credit_rating,buys_computer
2,middleaged,high,no,fair,yes
6,middleaged,low,yes,excellent,yes
11,middleaged,medium,no,excellent,yes
12,middleaged,high,yes,fair,yes


In [18]:
print("age : senior ?")

senior = dfs[(dfs['age'] == 'senior')]
print("senior: ", senior.shape[0])

senior_yes = dfs[(dfs['age'] == 'senior') & (dfs['buys_computer'] == 'yes')]
p_senior_yes = senior_yes.shape[0]
senior_no = dfs[(dfs['age'] == 'senior') & (dfs['buys_computer'] == 'no')]
p_senior_no = senior_no.shape[0]

print("yes:", p_senior_yes)
p_senior_yes = p_senior_yes / total
print("no:", p_senior_no)
p_senior_no = p_senior_no / total

entropy_age_senior = - p_senior_yes * np.log(p_senior_yes) - p_senior_no * np.log(p_senior_no)
print('Entropy(age=senior) =', entropy_age_senior)
result['age_senior'] = entropy_age_senior
senior

age : senior ?
senior:  5
yes: 3
no: 2
Entropy(age=senior) = 0.6080825300680053


Unnamed: 0,age,income,student,credit_rating,buys_computer
3,senior,medium,no,fair,yes
4,senior,low,yes,fair,yes
5,senior,low,yes,excellent,no
9,senior,medium,yes,fair,yes
13,senior,medium,no,excellent,no


Entropy(buys_computer|age=youth) = – p(No) . log2p(No) – p(Yes) . log2p(Yes)

= – (3/5) . log2(3/5) – (2/5) . log2(2/5) = 0.6080825300680053

---
Entropy(buys_computer|age=middleaged) = – p(No) . log2p(No) – p(Yes) . log2p(Yes)

= – (0/4) . log2(0/4) – (4/4) . log2(4/4) = 0.3579322767129623

---
Entropy(buys_computer|age=senior) = – p(No) . log2p(No) – p(Yes) . log2p(Yes)

= – (2/5) . log2(2/5) – (3/5) . log2(3/5) = 0.6080825300680053

**Gain(buys_computer, age)** = Entropy(buys_computer)

 – (p(buys_computer|age=youth) * Entropy(buys_computer|age=youth))
 
 – (p(buys_computer|age=middleaged) * Entropy(buys_computer|age=middleaged))
 
 – (p(buys_computer|age=senior) * Entropy(buys_computer|age=senior))

In [19]:
gain_buys_computer_age = 0.940 - ((5/14) * 0.608 ) - ((4/14) * 0.357) - ((5/14) * 0.608)
print("Gain(buys_computer, age) = ", gain_buys_computer_age)
gains['gain_buys_computer_age'] = gain_buys_computer_age

Gain(buys_computer, age) =  0.40371428571428575


In [20]:
result

{'buys_computer': 0.6517565611726531,
 'credit_rating_fair': 0.6411148186024177,
 'credit_rating_excellent': 0.6601907318344924,
 'student_yes': 0.5516317494241771,
 'student_no': 0.6880276426302085,
 'income_low': 0.5185994608897646,
 'income_medium': 0.6359194408637213,
 'income_high': 0.555974328301518,
 'age_youth': 0.6080825300680053,
 'age_middle': 0.3579322767129623,
 'age_senior': 0.6080825300680053}

In [21]:
gains

{'gain_buys_computer_credit_rating': 0.0018571428571428905,
 'gain_buys_computer_student': 0.3204999999999999,
 'gain_buys_computer_income': 0.3612857142857142,
 'gain_buys_computer_age': 0.40371428571428575}

In [27]:
min(gains, key=gains. get)

'gain_buys_computer_credit_rating'