# Information Gain

Information gain is the entropy lost when a decision tree splits a parent node into child nodes. Formally, it is the parent's entropy minus the weighted average of the children's entropies.

In [24]:
import numpy as np
import pandas as pd

def logb2(i):
    """Return the base-2 logarithm of ``i``.

    ``np.log2`` is used instead of ``np.log(i) / np.log(2)`` because the
    quotient of two rounded natural logs can be off by one ulp (for example
    for ``2 ** 29``).  Being a NumPy ufunc, it accepts scalars as well as
    arrays / pandas Series and then works elementwise.
    """
    return np.log2(i)

def entropy(opts):
    """Return the Shannon entropy, in bits, of a collection of class counts.

    Args:
        opts: Iterable with one count per class.  Each entry may be a plain
            number, or an array-like such as a pandas Series, in which case
            the entropy is computed elementwise and an object of the same
            kind is returned.

    Returns:
        ``-sum(p * log2(p))`` with ``p = count / total``.  Classes with a
        zero count contribute nothing (the limit of ``p * log2(p)`` as
        ``p -> 0`` is 0), and an empty or all-zero input yields 0 rather
        than raising ``ZeroDivisionError``.
    """
    total = sum(opts)
    # Replace a zero total by 1 so the division is always defined; every
    # count is then 0 as well, so all proportions (and the entropy) are 0.
    # Written arithmetically so it also works elementwise on Series/arrays.
    denom = total + (total == 0)

    ret = 0
    for count in opts:
        p = count / denom
        # log2(0) is -inf and 0 * -inf is nan.  Feeding 1 to the log wherever
        # p == 0 makes that term exactly 0 without branching on p.
        ret -= p * np.log2(p + (p == 0))
    return ret

# Fetch the ml-bugs dataset; this cell uses its 'Species' and 'Color' columns.
data = pd.read_csv(
    'https://s3.amazonaws.com/video.udacity-data.com/topher/2018/April/5ad940f6_ml-bugs/ml-bugs.csv'
)

# Entropy of the whole dataset (the parent node), from the [Mobug, Lobug]
# row counts.
mobugs = data[data['Species'] == 'Mobug']
lobugs = data[data['Species'] == 'Lobug']
initial_set = [mobugs.index.size, lobugs.index.size]
initial_entropy = entropy(initial_set)

# Per-colour tallies for each species: count() gives the non-null entries per
# column and iloc[:, 0] keeps only the first of those columns.
mobug_counts_color = mobugs.groupby('Color').count().iloc[:, 0]
lobug_counts_color = lobugs.groupby('Color').count().iloc[:, 0]

# Line the two tallies up by colour.  pd.merge defaults to an inner join, so
# only colours that occur for both species end up in the table.
color_counts = pd.merge(
    mobug_counts_color,
    lobug_counts_color,
    left_index=True,
    right_index=True,
)
color_counts.columns = ['Mobug', 'Lobug']

# entropy() is handed whole columns here, so it is evaluated row by row:
# one entropy value per colour.
color_counts['entropy'] = entropy(
    [color_counts['Mobug'], color_counts['Lobug']]
)
# Entropy drop from the full dataset to each single colour subset.
color_counts['information_gain'] = initial_entropy - color_counts['entropy']

print(color_counts)


def info_gain_length(l, lt=True):
    """Return the entropy drop from the full dataset to one length-based subset.

    Args:
        l: Length threshold, compared against the 'Length (mm)' column.
        lt: When true, keep rows with length strictly below ``l``; otherwise
            keep rows with length strictly above ``l``.  Rows whose length
            equals ``l`` are excluded either way.

    Returns:
        ``initial_entropy`` minus the entropy of the [Mobug, Lobug] counts in
        the selected subset.

    NOTE: only the selected child is compared with the parent; the other
    side of the split is not weighted in.
    """
    lengths = data['Length (mm)']
    species = data['Species']
    selected = (lengths < l) if lt else (lengths > l)

    # Class counts inside the selected subset, in [Mobug, Lobug] order.
    counts = [
        data[(species == name) & selected].index.size
        for name in ('Mobug', 'Lobug')
    ]
    return initial_entropy - entropy(counts)

# Report the information gain for the "length < 17 mm" subset.  The original
# line referenced mobug_under_17 / lobug_under_17, which are never defined
# (NameError), and printed an entropy under an "information gain" label;
# info_gain_length(17) computes the labelled quantity directly.
print("<17 information gain: " + str(info_gain_length(17)))
print("")




       Mobug  Lobug   entropy  information_gain
Color                                          
Blue       4      6  0.970951          0.008918
Brown      4      2  0.918296          0.061573
Green      2      6  0.811278          0.168591
