In [21]:
import pandas as pd
import numpy as np
import warnings
import pandasql as ps

warnings.filterwarnings('ignore')

In [73]:
# Import the dataset.
df = pd.read_csv('https://raw.githubusercontent.com/AugustLONG/ML01/master/01decisiontree/AllElectronics.csv')

# Cleansing: drop the RID column and rename the target column.
# Chained without inplace=True so each step returns a new frame.
df = (
    df
    .drop(columns="RID")
    .rename(columns={'class_buys_computer': 'target_class'})
)

# Encode the target labels as integers: "no" -> 0, "yes" -> 1.
class_mapper = {"no": 0, "yes": 1}
df['target_class'] = df['target_class'].map(class_mapper)

# .map() turns any label missing from class_mapper into NaN without complaint;
# fail loudly here instead of letting NaN distort the entropy numbers below.
assert df['target_class'].notna().all(), "unexpected label in target column"

# Display the data.
df

Unnamed: 0,age,income,student,credit_rating,target_class
0,youth,high,no,fair,0
1,youth,high,no,excellent,0
2,middle_aged,high,no,fair,1
3,senior,medium,no,fair,1
4,senior,low,yes,fair,1
5,senior,low,yes,excellent,0
6,middle_aged,low,yes,excellent,1
7,youth,medium,no,fair,0
8,youth,low,yes,fair,1
9,senior,medium,yes,fair,1


In [235]:
def get_entropy(df, target):
    """Return the Shannon entropy (in bits) of the ``target`` column of ``df``.

    Every distinct label in the column is treated as its own class, so the
    function works for 0/1 targets as well as string or multi-class targets.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the target column.
    target : str
        Name of the column holding the class labels.

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the classes present. Returns ``0.0`` for an
        empty frame instead of dividing by zero.
    """
    total = len(df)
    if total == 0:
        # No rows means no uncertainty; also avoids a ZeroDivisionError.
        return 0.0

    # value_counts() skips missing labels; dividing by the full row count keeps
    # the probabilities relative to every row in the frame.
    class_probs = df[target].value_counts().to_numpy() / total

    # Drop zero probabilities (e.g. unused categories) so log2 never sees 0.
    class_probs = class_probs[class_probs > 0]

    return -np.sum(class_probs * np.log2(class_probs))


def get_attribute_entropy(df, attribute, target):
    """Return the weighted target entropy after splitting ``df`` on ``attribute``.

    For each distinct value of ``attribute`` the entropy of ``target`` within
    that subset is computed with ``get_entropy`` and weighted by the subset's
    share of all rows; the weighted values are summed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both columns.
    attribute : str
        Name of the column to split on.
    target : str
        Name of the column holding the class labels.

    Returns
    -------
    float
        Sum of ``(len(subset) / len(df)) * get_entropy(subset, target)`` over
        the subsets. ``0`` when there are no subsets (e.g. an empty frame).

    Notes
    -----
    Rows whose attribute value is missing form no subset: ``groupby`` leaves
    them out, so they no longer produce an empty subset that makes the
    entropy computation divide by zero.
    """
    total = len(df)

    # One grouping pass instead of re-filtering the frame per distinct value.
    # sort=False keeps groups in order of first appearance; observed=True
    # skips unused categories, which would otherwise yield empty subsets.
    weighted_entropies = [
        (len(subset) / total) * get_entropy(subset, target)
        for _, subset in df.groupby(attribute, sort=False, observed=True)
    ]

    return sum(weighted_entropies)


def attribute_entropy(df, target):
    """Return per-attribute tables counting positive and negative rows.

    For every column of ``df`` other than ``target``, one summary frame is
    built with a row per distinct value of that column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the attribute columns and the target column.
    target : str
        Name of the target column; rows equal to 1 count as "yes" and rows
        equal to 0 count as "no".

    Returns
    -------
    list of pandas.DataFrame
        One frame per attribute, in column order, each with the columns
        ``attribute`` (the column name), ``details`` (the attribute value),
        ``yes`` and ``no`` (row counts), sorted by ``details``.

    Notes
    -----
    The counts are computed directly in pandas rather than by formatting
    column names into a SQL string, so column names containing spaces,
    quotes or SQL keywords are handled. Missing attribute values are kept as
    their own group, which pandas sorts last.
    """
    # Indicator columns are the same for every attribute, so build them once.
    is_yes = (df[target] == 1).astype(int).to_numpy()
    is_no = (df[target] == 0).astype(int).to_numpy()

    summary_list = []

    for col in (name for name in df if name != target):
        # Fixed working column names avoid clashes with the caller's names.
        flags = pd.DataFrame({
            'details': df[col].to_numpy(),
            'yes': is_yes,
            'no': is_no,
        })

        # groupby sorts by key by default, giving the per-value ordering.
        summary_output = flags.groupby('details', dropna=False).sum().reset_index()
        summary_output.insert(0, 'attribute', col)
        summary_list.append(summary_output)

    return summary_list

In [236]:
# Information gain of each attribute:
#   gain(attribute) = H(target) - H(target | attribute)
target_col = 'target_class'

# The entropy of the whole dataset does not depend on the attribute, so
# compute it once instead of on every loop iteration.
base_entropy = get_entropy(df, target_col)

row_list = []

for col in df:
    if col == target_col:
        continue
    info_gain = base_entropy - get_attribute_entropy(df, col, target_col)
    row_list.append([col, round(info_gain, 2)])

pd.DataFrame(row_list, columns = ['attribute', 'information_gain'])

Unnamed: 0,attribute,information_gain
0,age,0.25
1,income,0.03
2,student,0.15
3,credit_rating,0.05


In [237]:
# Stack the per-attribute yes/no count tables into one frame, renumbering the
# rows so the combined index runs from 0 without repeats.
per_attribute_counts = attribute_entropy(df, 'target_class')
pd.concat(per_attribute_counts, ignore_index=True)

Unnamed: 0,attribute,details,yes,no
0,age,middle_aged,4,0
1,age,senior,3,2
2,age,youth,2,3
3,income,high,2,2
4,income,low,3,1
5,income,medium,4,2
6,student,no,3,4
7,student,yes,6,1
8,credit_rating,excellent,3,3
9,credit_rating,fair,6,2
