In [1]:
import pandas as pd
import math

# Load the banking dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("./dataset/banking.csv")
# Preview the first 10 rows as this cell's displayed output.
df.head(10)

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp_var_rate,cons_price_idx,cons_conf_idx,euribor3m,nr_employed,y
0,44,blue-collar,married,basic.4y,unknown,yes,no,cellular,aug,thu,...,1,999,0,nonexistent,1.4,93.444,-36.1,4.963,5228.1,0
1,53,technician,married,unknown,no,no,no,cellular,nov,fri,...,1,999,0,nonexistent,-0.1,93.2,-42.0,4.021,5195.8,0
2,28,management,single,university.degree,no,yes,no,cellular,jun,thu,...,3,6,2,success,-1.7,94.055,-39.8,0.729,4991.6,1
3,39,services,married,high.school,no,no,no,cellular,apr,fri,...,2,999,0,nonexistent,-1.8,93.075,-47.1,1.405,5099.1,0
4,55,retired,married,basic.4y,no,yes,no,cellular,aug,fri,...,1,3,1,success,-2.9,92.201,-31.4,0.869,5076.2,1
5,30,management,divorced,basic.4y,no,yes,no,cellular,jul,tue,...,8,999,0,nonexistent,1.4,93.918,-42.7,4.961,5228.1,0
6,37,blue-collar,married,basic.4y,no,yes,no,cellular,may,thu,...,1,999,0,nonexistent,-1.8,92.893,-46.2,1.327,5099.1,0
7,39,blue-collar,divorced,basic.9y,no,yes,no,cellular,may,fri,...,1,999,0,nonexistent,-1.8,92.893,-46.2,1.313,5099.1,0
8,36,admin.,married,university.degree,no,no,no,cellular,jun,mon,...,1,3,1,success,-2.9,92.963,-40.8,1.266,5076.2,1
9,27,blue-collar,single,basic.4y,no,yes,no,cellular,apr,thu,...,2,999,1,failure,-1.8,93.075,-47.1,1.41,5099.1,0


In [14]:
def entropy(df, target):
    """Return the Shannon entropy, in bits, of column ``target`` of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column to measure.
    target : str
        Name of the column whose value distribution is measured.

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the distinct values of the column, or
        ``0.0`` when the column holds no countable values.
    """
    counts = df[target].value_counts()

    # value_counts() leaves out missing values, so normalise by the number of
    # rows that were actually counted; dividing by len(df) would give
    # "probabilities" that do not sum to 1 whenever the column has NaNs.
    num_counted = int(counts.sum())
    if num_counted == 0:
        return 0.0

    result = 0.0
    for count in counts:
        # Zero counts can appear (e.g. unused categories); p * log2(p) -> 0 there.
        if count != 0:
            p = count / num_counted
            # Subtracting each term avoids returning -0.0 for a pure column.
            result -= p * math.log2(p)

    return result

In [15]:
entropy(df, 'poutcome')

0.68467946807691882

In [18]:
def average_entropy(df, target, feature):
    """Return the entropy of ``target`` averaged over the values of ``feature``.

    For every distinct value of ``df[feature]`` the rows carrying that value
    are selected, the entropy of ``target`` within them is computed with
    ``entropy``, and the results are summed weighted by each subset's share
    of the rows in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both columns.
    target : str
        Name of the column whose entropy is measured.
    feature : str
        Name of the column used to partition the rows.
    """
    total_rows = len(df)
    weighted_sum = 0

    for level in df[feature].unique():
        subset = df[df[feature] == level]
        weight = len(subset) / total_rows
        weighted_sum += weight * entropy(subset, target)

    return weighted_sum

In [19]:
average_entropy(df, 'poutcome', 'education')

0.68173863633163534

In [20]:
def information_gain(df, target, feature):
    """Return the reduction in entropy of ``target`` obtained by splitting on ``feature``.

    Computed as ``entropy(df, target)`` minus
    ``average_entropy(df, target, feature)``.
    """
    entropy_before = entropy(df, target)
    entropy_after = average_entropy(df, target, feature)
    return entropy_before - entropy_after

In [21]:
information_gain(df, 'poutcome', 'education')

0.0029408317452834787

In [26]:
def intrinsic_information(df, feature):
    """Return the intrinsic (split) information, in bits, of ``feature``.

    This is ``-sum(f * log2(f))`` where ``f`` is the fraction of the rows of
    ``df`` that carry each distinct value of ``df[feature]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column.
    feature : str
        Name of the column that defines the partitions.

    Returns
    -------
    float
        The split information; ``0.0`` for an empty frame or a column with a
        single distinct value.
    """
    num_rows = len(df)
    if num_rows == 0:
        return 0.0

    # A single counting pass replaces re-filtering the frame per value.
    # value_counts() leaves out missing values, so a NaN "value" never yields
    # an empty partition (and hence log(0)); rows with a missing feature still
    # count toward num_rows.
    partition_sizes = df[feature].value_counts()

    result = 0.0
    for size in partition_sizes:
        # Skip empty partitions (e.g. unused categories): f * log2(f) -> 0.
        if size != 0:
            fraction = size / num_rows
            # Subtracting each term avoids returning -0.0 for a single partition.
            result -= fraction * math.log2(fraction)

    return result

In [27]:
intrinsic_information(df, 'education')

2.5566438839448797

In [12]:
def gain_ratio(df, classifier, attribute):
    """Return the gain ratio of ``attribute`` with respect to ``classifier``.

    The gain ratio is ``information_gain(df, classifier, attribute)`` divided
    by ``intrinsic_information(df, attribute)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both columns.
    classifier : str
        Name of the target column.
    attribute : str
        Name of the column being evaluated as a split.

    Returns
    -------
    float
        The gain ratio, or ``0.0`` when the intrinsic information is zero.
    """
    # intrinsic_information takes (df, feature); it depends only on the
    # attribute's own distribution, so the classifier column is not passed.
    split_info = intrinsic_information(df, attribute)

    # Zero split information means the attribute does not partition the rows
    # at all, so report no gain instead of dividing by zero.
    if split_info == 0:
        return 0.0

    return information_gain(df, classifier, attribute) / split_info

In [13]:
gain_ratio(df, 'poutcome', 'education')

0.0011502703852308911