In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.metrics import accuracy_score

In [2]:
# Load the tab-separated dataset; the path is relative to the notebook's
# working directory.
df = pd.read_csv("data/tennis.txt",sep='\t')


In [3]:
df

Unnamed: 0,outlook,temperature,humidity,wind,playtennis
0,sunny,hot,high,weak,no
1,sunny,hot,high,strong,no
2,overcast,hot,high,weak,yes
3,rain,mild,high,weak,yes
4,rain,cool,normal,weak,yes
5,rain,cool,normal,strong,no
6,overcast,cool,normal,strong,yes
7,sunny,mild,high,weak,no
8,sunny,cool,normal,weak,yes
9,rain,mild,normal,weak,yes


In [4]:
df.to_numpy()

array([['sunny', 'hot', 'high', 'weak', 'no'],
       ['sunny', 'hot', 'high', 'strong', 'no'],
       ['overcast', 'hot', 'high', 'weak', 'yes'],
       ['rain', 'mild', 'high', 'weak', 'yes'],
       ['rain', 'cool', 'normal', 'weak', 'yes'],
       ['rain', 'cool', 'normal', 'strong', 'no'],
       ['overcast', 'cool', 'normal', 'strong', 'yes'],
       ['sunny', 'mild', 'high', 'weak', 'no'],
       ['sunny', 'cool', 'normal', 'weak', 'yes'],
       ['rain', 'mild', 'normal', 'weak', 'yes'],
       ['sunny', 'mild', 'normal', 'strong', 'yes'],
       ['overcast', 'mild', 'high', 'strong', 'yes'],
       ['overcast', 'hot', 'normal', 'weak', 'yes'],
       ['rain', 'mild', 'high', 'strong', 'no']], dtype=object)

In [5]:
# All column names of df as a NumPy array (this includes the last column,
# which is split off as the target further down).
columnname = df.columns.to_numpy()

In [6]:
# Feature matrix: every column of df except the last one.
X = df.to_numpy()[:, :-1]
X

array([['sunny', 'hot', 'high', 'weak'],
       ['sunny', 'hot', 'high', 'strong'],
       ['overcast', 'hot', 'high', 'weak'],
       ['rain', 'mild', 'high', 'weak'],
       ['rain', 'cool', 'normal', 'weak'],
       ['rain', 'cool', 'normal', 'strong'],
       ['overcast', 'cool', 'normal', 'strong'],
       ['sunny', 'mild', 'high', 'weak'],
       ['sunny', 'cool', 'normal', 'weak'],
       ['rain', 'mild', 'normal', 'weak'],
       ['sunny', 'mild', 'normal', 'strong'],
       ['overcast', 'mild', 'high', 'strong'],
       ['overcast', 'hot', 'normal', 'weak'],
       ['rain', 'mild', 'high', 'strong']], dtype=object)

In [7]:
# label: sorted unique values found in the last column of df.
# y: for every row, the position of its value inside `label`
#    (return_inverse=True), i.e. an integer-encoded copy of that column.
label ,y = np.unique(df.to_numpy()[:,-1] , return_inverse=True)

In [8]:
label

array(['no', 'yes'], dtype=object)

In [9]:
height, width = X.shape

# Number of distinct classes. y holds the inverse indices returned by
# np.unique, i.e. integer codes 0..k-1, so the largest code plus one is k.
n_classes = int(y.max()) + 1 if height else 0

# counters[j][v]   -> per-class sample counts for value v of attribute j
#                     (a list of length n_classes, indexed by class code)
# collectors[j][v] -> row indices of the samples whose attribute j equals v
counters = [dict() for _ in range(width)]
collectors = [dict() for _ in range(width)]

for row_idx in range(height):
    # Deliberately not called `label`: that name already refers to the array
    # of class names, and reusing it here would overwrite that array.
    class_idx = y[row_idx]

    for attr_idx, value in enumerate(X[row_idx]):
        # setdefault inserts a fresh zeroed count vector / empty index list the
        # first time a value is seen, then returns the stored object.
        counters[attr_idx].setdefault(value, [0] * n_classes)[class_idx] += 1
        collectors[attr_idx].setdefault(value, []).append(row_idx)
        

In [10]:
collectors

[{'sunny': [0, 1, 7, 8, 10],
  'overcast': [2, 6, 11, 12],
  'rain': [3, 4, 5, 9, 13]},
 {'hot': [0, 1, 2, 12], 'mild': [3, 7, 9, 10, 11, 13], 'cool': [4, 5, 6, 8]},
 {'high': [0, 1, 2, 3, 7, 11, 13], 'normal': [4, 5, 6, 8, 9, 10, 12]},
 {'weak': [0, 2, 3, 4, 7, 8, 9, 12], 'strong': [1, 5, 6, 10, 11, 13]}]

In [11]:

def entropy(counts):
    """Return the Shannon entropy, in bits, of a vector of class counts.

    Parameters
    ----------
    counts : sequence of non-negative numbers
        Number of samples observed for each class. Any number of classes is
        accepted (the two-class case gives the same result as before).

    Returns
    -------
    float
        ``-sum(p_i * log2(p_i))`` taken over the classes with a non-zero
        count, where ``p_i = counts[i] / sum(counts)``. The result is 0 when
        at most one class is present, and also for empty or all-zero input.
    """
    counts = np.asarray(counts, dtype=np.float64)
    total = counts.sum()
    if total == 0:
        # No samples at all: define the entropy as 0 instead of dividing by 0.
        return 0.0
    # Classes with a zero count contribute nothing (lim p->0 of p*log2(p) = 0);
    # dropping them also avoids evaluating log2(0).
    probs = counts[counts > 0] / total
    # "0.0 - x" rather than "-x" so a pure node yields 0.0, not -0.0.
    return 0.0 - np.sum(probs * np.log2(probs))

In [12]:
counters

[{'sunny': [3, 2], 'overcast': [0, 4], 'rain': [2, 3]},
 {'hot': [2, 2], 'mild': [2, 4], 'cool': [1, 3]},
 {'high': [4, 3], 'normal': [1, 6]},
 {'weak': [2, 6], 'strong': [3, 3]}]

In [13]:
# For every attribute, compute the entropy of each of its values weighted by
# the fraction of samples having that value, and sum those terms.
# NOTE: despite the variable name, the stored numbers are these weighted
# entropies, not gains -- which is why the attribute is picked with argmin.
Information_Gain = []
for counter in counters:
    per_value_counts = list(counter.values())
    num_ele = np.sum(np.array(per_value_counts))
    weighted_entropy = sum(
        (sum(class_counts) / num_ele) * entropy(class_counts)
        for class_counts in per_value_counts
    )
    Information_Gain.append(weighted_entropy)
min_index = np.argmin(np.array(Information_Gain))

# Show the row indices, the class counts and the name of the chosen attribute.
print(collectors[min_index])
print(counters[min_index])
print(columnname[min_index])

{'sunny': [0, 1, 7, 8, 10], 'overcast': [2, 6, 11, 12], 'rain': [3, 4, 5, 9, 13]}
{'sunny': [3, 2], 'overcast': [0, 4], 'rain': [2, 3]}
outlook


In [14]:
# np.delete returns a new array without column `min_index`; X itself is left
# unchanged, and the result is only displayed here, not assigned to a name.
np.delete(X, min_index, axis = 1)

array([['hot', 'high', 'weak'],
       ['hot', 'high', 'strong'],
       ['hot', 'high', 'weak'],
       ['mild', 'high', 'weak'],
       ['cool', 'normal', 'weak'],
       ['cool', 'normal', 'strong'],
       ['cool', 'normal', 'strong'],
       ['mild', 'high', 'weak'],
       ['cool', 'normal', 'weak'],
       ['mild', 'normal', 'weak'],
       ['mild', 'normal', 'strong'],
       ['mild', 'high', 'strong'],
       ['hot', 'normal', 'weak'],
       ['mild', 'high', 'strong']], dtype=object)

In [15]:
# t holds the values and v the matching weights for the weighted mean
# computed in the next cell.
t = np.array([7.5, 7.8, 7.0, 6.5, 7.8])
v = np.array([4, 4, 4, 2, 4])

In [16]:
# Weighted mean of t with weights v: sum(t_i * v_i) / sum(v_i).
(t * v).sum() / v.sum()

7.411111111111111

In [17]:
# The 33 scores as a float64 vector, written as numeric literals.
score = np.array(
    [
        6, 8.5, 8.5, 7, 7.17, 7, 5.5, 6.5, 5, 7.5,
        7.5, 5, 8, 7, 8.5, 7, 6, 6.5, 7, 8.3, 8.5,
        9, 8.8, 8.5, 5.8, 7.4, 6.5, 8.8, 6.2, 7.5, 6.7,
        6.8, 8.8,
    ],
    dtype=np.float64,
)
# Disabled: would extend score with five more values.
#score = np.append(score,[7.5,7.8,7.,6.5,7.8],axis=0)
score

array([6.  , 8.5 , 8.5 , 7.  , 7.17, 7.  , 5.5 , 6.5 , 5.  , 7.5 , 7.5 ,
       5.  , 8.  , 7.  , 8.5 , 7.  , 6.  , 6.5 , 7.  , 8.3 , 8.5 , 9.  ,
       8.8 , 8.5 , 5.8 , 7.4 , 6.5 , 8.8 , 6.2 , 7.5 , 6.7 , 6.8 , 8.8 ])

In [18]:
# The 33 integer weights, one per entry of score, as an int64 vector.
c = np.array(
    [
        2, 4, 3, 4, 4, 3, 4, 4, 4, 4, 2, 4, 4,
        4, 2, 4, 4, 4, 2, 4, 4, 4, 2, 2, 2, 4,
        4, 4, 3, 4, 4, 4, 4,
    ],
    dtype=np.int64,
)
# Disabled: would extend c with five more weights.
#c = np.append(c,[4,4,4,2,4],axis=0)

In [19]:
c

array([2, 4, 3, 4, 4, 3, 4, 4, 4, 4, 2, 4, 4, 4, 2, 4, 4, 4, 2, 4, 4, 4,
       2, 2, 2, 4, 4, 4, 3, 4, 4, 4, 4], dtype=int64)

In [20]:
# Mean of score weighted by c: sum(score_i * c_i) / sum(c_i).
sumary = (score * c).sum() / c.sum()
# NOTE(review): the factors 0.1 and 4 are not explained anywhere in the
# notebook -- presumably a rescaling of the mean; confirm their meaning.
total = sumary * 0.1 * 4
total

2.8841043478260873