In [129]:
import numpy as np
import pandas as pd

In [130]:

# Raw observations, one record per line: (rain flag, time, walk label).
# Each tuple is expanded into a three-item list so `data` stays a list of lists.
data = [
    list(record)
    for record in (
        (1, 30, 'No'),
        (1, 15, 'No'),
        (1, 5, 'No'),
        (0, 10, 'No'),
        (0, 5, 'No'),
        (0, 15, 'Yes'),
        (0, 20, 'Yes'),
        (0, 25, 'Yes'),
        (0, 30, 'Yes'),
        (0, 30, 'Yes'),
    )
]

In [131]:
data

[[1, 30, 'No'],
 [1, 15, 'No'],
 [1, 5, 'No'],
 [0, 10, 'No'],
 [0, 5, 'No'],
 [0, 15, 'Yes'],
 [0, 20, 'Yes'],
 [0, 25, 'Yes'],
 [0, 30, 'Yes'],
 [0, 30, 'Yes']]

In [132]:
# Load the raw records into a DataFrame. No column names are passed here, so
# the columns carry the default integer labels 0, 1, 2 until they are renamed.
train_data = pd.DataFrame(data)

In [133]:
train_data

Unnamed: 0,0,1,2
0,1,30,No
1,1,15,No
2,1,5,No
3,0,10,No
4,0,5,No
5,0,15,Yes
6,0,20,Yes
7,0,25,Yes
8,0,30,Yes
9,0,30,Yes


In [134]:
# Name the three positional columns: 'Rain' holds the 0/1 flag, 'Time' the
# numeric value and 'Walk' the 'Yes'/'No' label from the raw records.
train_data.columns = ['Rain', 'Time', 'Walk']

In [135]:
train_data

Unnamed: 0,Rain,Time,Walk
0,1,30,No
1,1,15,No
2,1,5,No
3,0,10,No
4,0,5,No
5,0,15,Yes
6,0,20,Yes
7,0,25,Yes
8,0,30,Yes
9,0,30,Yes


In [136]:
# Encode the raw columns as 0/1 indicators:
#   'Time > 15' -> 1 when Time is strictly greater than 15, else 0
#   'Walk'      -> 1 when the label is 'Yes', else 0
# Explicit comparisons + astype(int) replace pd.get_dummies(..., drop_first=True):
#   * get_dummies returns bool columns on pandas >= 2.0, so the frame would show
#     True/False instead of the 0/1 values the rest of the notebook expects;
#   * drop_first=True silently drops the only dummy column when every row falls
#     into one category, after which the rename below would have nothing to rename.
# NOTE: this cell rebinds `train_data`, so it cannot be re-run on its own;
# re-run from the cell that builds the frame from `data`.
walk = (train_data['Walk'] == 'Yes').astype(int).rename('Walk')
time = (train_data['Time'] > 15).astype(int).rename('Time > 15')
train_data = pd.concat([time, train_data['Rain'], walk], axis=1)
train_data

Unnamed: 0,Time > 15,Rain,Walk
0,1,1,0
1,0,1,0
2,0,1,0
3,0,0,0
4,0,0,0
5,0,0,1
6,1,0,1
7,1,0,1
8,1,0,1
9,1,0,1


In [137]:
# The last column of the frame is the one Question uses as its prediction target.
train_data.columns[-1]

'Walk'

In [138]:
class Question:
    """Find the 0/1 attribute whose split yields the highest information gain.

    Every column of the frame except the last is treated as a binary (0/1)
    attribute; the last column is the binary (0/1) prediction target.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Frame to analyse. When omitted, the notebook-level ``train_data`` is
        used, so a plain ``Question()`` keeps working as before.
    """

    def __init__(self, data=None):
        # Fall back to the notebook global only when no frame is supplied.
        self.data = train_data if data is None else data
        self.attribute = None
        self.predict = self.data.columns[-1]
        self.true_values = []
        self.false_values = []
        self.best_info_gain = 0
        self.best_question = None

    def get_values(self, column):
        """Split the target values by whether ``column`` equals 1.

        Returns ``(true_values, false_values)``: the target values of the rows
        where ``column == 1`` and of all remaining rows. The lists are rebuilt
        on every call, so repeated calls do not accumulate stale values.
        """
        is_true = np.asarray(column) == 1
        target = self.data[self.predict].to_numpy()
        self.true_values = target[is_true].tolist()
        self.false_values = target[~is_true].tolist()
        return self.true_values, self.false_values

    def info_gain(self, true_values, false_values, attributes, predict):
        """Return the information gain of splitting ``predict`` on ``attributes``.

        ``true_values`` / ``false_values`` are the target values on each side
        of the split; ``attributes`` and ``predict`` are column names.
        """
        column = np.asarray(self.data[attributes], dtype=np.int64)
        if len(column) == 0:
            return 0.0
        # minlength=2 keeps ratio[1] valid when the attribute is 0 in every row.
        ratio = np.bincount(column, minlength=2) / len(column)
        E_yes = self.entropy(true_values)
        E_no = self.entropy(false_values)
        prediction = self.entropy(self.data[predict])
        total_ent = (ratio[0] * E_no) + (ratio[1] * E_yes)
        return prediction - float(total_ent)

    def entropy(self, y):
        """Return the Shannon entropy (base 2) of the integer labels in ``y``.

        An empty ``y`` has entropy 0.0 (it arises when one side of a split
        receives no rows).
        """
        if len(y) == 0:
            return 0.0
        counts = np.bincount(np.asarray(y, dtype=np.int64))
        ratio = counts / len(y)
        present = ratio[ratio > 0]  # skip empty classes: log2(0) is undefined
        return np.sum(-(present * np.log2(present)))

    def question(self):
        """Evaluate every attribute and return ``(best_question, best_info_gain)``.

        ``best_question`` is ``'Is <attribute>'`` for the attribute with the
        strictly highest positive information gain, or ``None`` when no
        attribute has a gain above 0. Ties keep the earliest column.
        """
        # Start from a clean slate so calling question() twice gives the same answer.
        self.best_info_gain = 0
        self.best_question = None
        best_attribute = None
        for attribute in self.data.columns[:-1]:
            true_values, false_values = self.get_values(self.data[attribute])
            gain = self.info_gain(true_values, false_values, attribute, self.predict)
            if gain > self.best_info_gain:
                self.best_info_gain = gain
                best_attribute = attribute
        self.true_values = []
        self.false_values = []
        # Report the winning attribute, not whichever column was visited last.
        self.attribute = best_attribute
        if best_attribute is not None:
            self.best_question = f'Is {best_attribute}'
        return self.best_question, self.best_info_gain
    




In [139]:
# Build a Question over the notebook-level `train_data` and evaluate every
# feature column; the result is a (question text, best information gain) tuple.
question = Question()
question.question()

('Is Rain', 0.3958156020033583)