In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the dataset from the Excel workbook. The helper functions below take
# the class label from the last column (df.columns[-1]), so the label must be
# the last column of the sheet.

df = pd.read_excel('tennis.xlsx')
df.head()

Unnamed: 0,ID,Days,Outlook,Temperature,Humidity,Wind,Class
0,1,Mon,Sunny,Hot,High,Weak,No
1,2,Tue,Sunny,Hot,High,Strong,No
2,3,Wed,Overcast,Hot,High,Weak,No
3,4,Thu,Rain,Mild,High,Weak,Yes
4,5,Fri,Rain,Cool,Normal,Weak,Yes


In [3]:
def calc_entropy(feature_dict, df=None):
    """Compute per-group entropy and the conditional entropy of a feature.

    Parameters
    ----------
    feature_dict : dict
        Mapping ``{group: {label: count}}`` (the shape returned by
        ``feature_counts``).
    df : pandas.DataFrame, optional
        When given, ``df.shape[0]`` is used as the total row count ``m`` for
        weighting each group. When omitted, ``m`` is the sum of all counts in
        ``feature_dict``. The default is ``None`` rather than the notebook's
        global frame because a ``df=df`` default is bound once, when the
        ``def`` runs, and would go stale if ``df`` were reloaded later.

    Returns
    -------
    tuple
        ``(entropy_dict, conditional_entropy)``. ``entropy_dict`` maps each
        group to ``[entropy, group_total]``; ``conditional_entropy`` is the
        sum over groups of ``entropy * group_total / m``.
    """
    entropy_dict = {}
    for group, label_counts in feature_dict.items():
        group_total = sum(label_counts.values())
        group_entropy = 0.0
        for count in label_counts.values():
            # A zero count contributes nothing to the entropy; skipping it
            # avoids log2(0) and a 0/0 division for a group with no rows.
            if count:
                p = count / group_total
                group_entropy -= p * np.log2(p)
        entropy_dict[group] = [group_entropy, group_total]

    # Total number of rows used to weight each group's entropy.
    if df is not None:
        m = df.shape[0]
    else:
        m = sum(group_total for _, group_total in entropy_dict.values())

    conditional_entropy = 0.0
    if m:
        for group_entropy, group_total in entropy_dict.values():
            conditional_entropy += group_entropy * (group_total / m)

    return entropy_dict, conditional_entropy

In [4]:
def feature_counts(col, df=None):
    """Count rows per label within each distinct value of a feature column.

    Parameters
    ----------
    col : str
        Name of the feature column to process.
    df : pandas.DataFrame, optional
        Frame to read from; its last column is used as the label column.
        When omitted, the notebook-level ``df`` is looked up at call time.
        (A ``df=df`` default would instead be frozen when the ``def`` cell
        runs and would not follow later reassignments of ``df``.)

    Returns
    -------
    dict
        ``{group: {label: count}}``. Groups follow the order of
        ``df[col].value_counts()``; labels follow their order of first
        appearance in the label column. Every label appears under every
        group, with a count of 0 where the combination does not occur.

    Raises
    ------
    NameError
        If ``df`` is omitted and no global ``df`` exists.
    """
    if df is None:
        try:
            df = globals()["df"]
        except KeyError:
            raise NameError(
                "name 'df' is not defined; pass a DataFrame explicitly"
            ) from None

    label_col = df.columns[-1]  # the label is taken from the last column
    unique_labels = df[label_col].unique()  # supports non-binary labels
    group_values = df.loc[:, col].value_counts().index

    groups = {}
    for pos in range(len(group_values)):
        group = group_values[pos]
        # Build the row mask for this group once and reuse it for each label.
        in_group = df[col] == group
        groups[group] = {
            label: int((in_group & (df[label_col] == label)).sum())
            for label in unique_labels
        }

    return groups

In [5]:
def system_entropy(df):
    """Return the entropy of the label column of ``df``.

    The label is read from the last column of the frame. For each distinct
    label the share of rows carrying it is computed, and the terms
    ``-p * log2(p)`` are accumulated; a label with a zero share contributes
    nothing.
    """
    label_col = df.columns[-1]
    n_rows = df.shape[0]

    total = 0
    for label_value in df.loc[:, label_col].unique():
        share = len(df[df[label_col] == label_value]) / n_rows
        if share != 0:
            # -p * log2(p) for this label
            total += -(share) * np.log2(share)

    return total

In [6]:
def information_gain(calc_entropy_results, df=None):
    """Return the information gain of a feature.

    Information gain is the system entropy of the label column minus the
    feature's conditional entropy.

    Parameters
    ----------
    calc_entropy_results : sequence
        Result of ``calc_entropy``; its last element is read as the
        conditional entropy of the feature.
    df : pandas.DataFrame, optional
        Frame whose label column defines the system entropy. When omitted,
        the notebook-level ``df`` is looked up at call time, which is what
        this function did before the parameter existed.

    Returns
    -------
    float
        ``system_entropy(df) - conditional_entropy``.

    Raises
    ------
    NameError
        If ``df`` is omitted and no global ``df`` exists.
    """
    if df is None:
        try:
            df = globals()["df"]
        except KeyError:
            raise NameError(
                "name 'df' is not defined; pass a DataFrame explicitly"
            ) from None

    sys_entropy = system_entropy(df)
    feature_entropy = calc_entropy_results[-1]
    info_gain = sys_entropy - feature_entropy

    return info_gain

In [7]:
def split_information(calc_entropy_results, df=None):
    """Return the split information of a feature (for the gain ratio).

    Split information is ``sum(-p * log2(p))`` over the feature's groups,
    where ``p`` is the fraction of rows falling in each group.

    Parameters
    ----------
    calc_entropy_results : sequence
        Result of ``calc_entropy``; its first element is read as a dict
        mapping each group to ``[entropy, group_total]``.
    df : pandas.DataFrame, optional
        When given, ``df.shape[0]`` is the total row count used for ``p``.
        When omitted, the total is the sum of the group totals. The default
        is ``None`` because a ``df=df`` default is bound once, when the
        ``def`` runs, and would go stale if ``df`` were reloaded later.

    Returns
    -------
    float
        The split information; ``0.0`` when there are no groups.
    """
    entropy_dict = calc_entropy_results[0]
    group_totals = [value[1] for value in entropy_dict.values()]
    m = df.shape[0] if df is not None else sum(group_totals)

    split_info_lst = []
    for count in group_totals:
        # An empty group contributes nothing; skipping it avoids
        # 0 * log2(0), which evaluates to nan.
        if count and m:
            p = count / m
            split_info_lst.append(-(p) * np.log2(p))

    # Summed once, after the loop, so an empty dict yields 0.0 instead of
    # hitting an unassigned local.
    return np.sum(split_info_lst)

# split_information(calc_entropy(feature_counts("Days", df = df)))

In [8]:
def information_gain_ratio(conditional_entropy, split_information):
    """Return the information gain ratio for a feature.

    Parameters
    ----------
    conditional_entropy : float
        Numerator of the ratio. Despite the parameter name, ``get_entropy``
        passes the feature's information gain here.
    split_information : float
        Denominator of the ratio (the feature's split information).

    Returns
    -------
    float
        ``conditional_entropy / split_information``, or ``0.0`` when
        ``split_information`` is zero. Zero split information means the
        feature has a single group, so it cannot split the data and the
        ratio would otherwise be a division by zero.
    """
    if split_information == 0:
        return 0.0
    return conditional_entropy / split_information

In [11]:
def get_entropy(feature):
    """Print the entropy-based split metrics for one feature column.

    Reads the notebook-level ``df`` and prints, in order: the per-group label
    counts, the system entropy, the full result of ``calc_entropy``, the
    information gain, the split information and the information gain ratio.
    Nothing is returned.
    """
    counts_by_group = feature_counts(feature, df)
    print(f"Feature Counts: {counts_by_group}\n")

    label_entropy = system_entropy(df)
    print(f"System Entropy: {label_entropy}\n")

    # calc_entropy returns a (per-group entropies, conditional entropy)
    # tuple; the whole tuple is printed and passed on to the helpers below.
    entropy_results = calc_entropy(counts_by_group)
    print(f"Conditional Entropy: {entropy_results}\n")

    gain = information_gain(entropy_results)
    print(f"Information Gain: {gain}\n")

    split = split_information(entropy_results)
    print(f"Split Information: {split}\n")

    gain_ratio = information_gain_ratio(gain, split)
    print(f"Information Gain Ratio: {gain_ratio}")

In [842]:
# Spot-check a single feature before looping over every feature column.
# NOTE(review): the saved output of this cell reports Information Gain
# 0.0889, which does not equal System Entropy - Conditional Entropy
# (1.0 - 0.1429 = 0.8571, the value the later loop prints for "Days").
# Together with the out-of-sequence execution count, this looks like stale
# output from an earlier definition - re-run after Restart & Run All.
get_entropy("Days")

Feature Counts: {'Mon': {'No': 2, 'Yes': 0}, 'Tue': {'No': 2, 'Yes': 0}, 'Wed': {'No': 1, 'Yes': 1}, 'Thu': {'No': 0, 'Yes': 2}, 'Fri': {'No': 0, 'Yes': 2}, 'Sat': {'No': 2, 'Yes': 0}, 'Sun': {'No': 0, 'Yes': 2}}

System Entropy: 1.0

Conditional Entropy: ({'Mon': [0.0, 2], 'Tue': [0.0, 2], 'Wed': [1.0, 2], 'Thu': [0.0, 2], 'Fri': [0.0, 2], 'Sat': [0.0, 2], 'Sun': [0.0, 2]}, 0.14285714285714285)

Information Gain: 0.08893660698832373

Split Information: 2.807354922057604

Information Gain Ratio: 0.031679858606242464


In [13]:
# Report the metrics for every candidate feature. The slice [1:-1] skips the
# first column and the last column (the label column read by the helpers).
# NOTE(review): the first column is presumably the row ID - confirm.
for feature in df.columns[1:-1]:
    print(f"\nFeature: {feature}\n")
    get_entropy(feature)


Feature: Days

Feature Counts: {'Mon': {'No': 2, 'Yes': 0}, 'Tue': {'No': 2, 'Yes': 0}, 'Wed': {'No': 1, 'Yes': 1}, 'Thu': {'No': 0, 'Yes': 2}, 'Fri': {'No': 0, 'Yes': 2}, 'Sat': {'No': 2, 'Yes': 0}, 'Sun': {'No': 0, 'Yes': 2}}

System Entropy: 1.0

Conditional Entropy: ({'Mon': [0.0, 2], 'Tue': [0.0, 2], 'Wed': [1.0, 2], 'Thu': [0.0, 2], 'Fri': [0.0, 2], 'Sat': [0.0, 2], 'Sun': [0.0, 2]}, 0.14285714285714285)

Information Gain: 0.8571428571428572

Split Information: 2.807354922057604

Information Gain Ratio: 0.30532044609259046

Feature: Outlook

Feature Counts: {'Sunny': {'No': 4, 'Yes': 1}, 'Rain': {'No': 1, 'Yes': 4}, 'Overcast': {'No': 2, 'Yes': 2}}

System Entropy: 1.0

Conditional Entropy: ({'Sunny': [0.7219280948873623, 5], 'Rain': [0.7219280948873623, 5], 'Overcast': [1.0, 4]}, 0.8013772106338303)

Information Gain: 0.19862278936616973

Split Information: 1.5774062828523454

Information Gain Ratio: 0.12591733120715737

Feature: Temperature

Feature Counts: {'Mild': {'No': 2, 