In [5]:
import pandas as pd
import numpy as np

For the following quiz, consider the data found in the file `ml-bugs.csv`, which describes twenty-four made-up insects by their length and color.
Which of the following splitting criteria provides the most information gain for discriminating Mobugs from Lobugs?
* Color = Brown
* Color = Blue
* Color = Green
* Length < 17.0 mm
* Length < 20.0 mm

In [6]:
# Read the insect measurements into a DataFrame and preview the first rows.
bugs = pd.read_csv(filepath_or_buffer="ml-bugs.csv")
bugs.head(n=5)

Unnamed: 0,Species,Color,Length (mm)
0,Mobug,Brown,11.6
1,Mobug,Blue,16.3
2,Lobug,Blue,15.1
3,Lobug,Green,23.7
4,Lobug,Blue,18.4


### Function to calculate the entropy of a group containing two classes

In [33]:
def twoGroupEntropy(first, total):
    """Return the Shannon entropy, in bits, of a group made of two classes.

    Parameters
    ----------
    first : number
        Number of members belonging to one of the two classes.
    total : number
        Total number of members in the group; the other class therefore has
        ``total - first`` members.

    Returns
    -------
    float
        ``-(p * log2(p) + q * log2(q))`` with ``p = first / total`` and
        ``q = (total - first) / total``. A class with no members contributes
        nothing, so a group containing a single class (or no members at all)
        has entropy 0.0 rather than NaN.
    """
    entropy = 0.0
    for count in (first, total - first):
        # By convention 0 * log2(0) is taken as 0 (its limit); evaluating it
        # directly yields 0 * -inf = nan and a RuntimeWarning.
        if count != 0:
            proportion = count / total
            entropy -= proportion * np.log2(proportion)
    return entropy

### Function to calculate the weighted entropy of the child groups (e.g. after a split on a feature)

In [34]:
def childEntropy(bugs_child1, bugs_child2, total_bugs):
    """Return the size-weighted entropy of the two child groups of a split.

    Parameters
    ----------
    bugs_child1, bugs_child2 : sequence of counts
        Per-class member counts of each child group, e.g. the result of
        ``Series.value_counts()`` or a plain list. Only the first count and
        the sum of the counts are used.
    total_bugs : number
        Number of members in the parent group, used to weight each child.

    Returns
    -------
    float
        Sum over both children of
        ``(child size / total_bugs) * twoGroupEntropy(first count, child size)``.
        A child with no members contributes 0.
    """
    weighted_entropy = 0.0
    for child in (bugs_child1, bugs_child2):
        # Read the counts by position. A value_counts() Series is labelled by
        # class name, so `child[0]` would rely on the deprecated integer-key
        # fallback of Series.__getitem__; a plain array works for lists too.
        counts = np.asarray(child)
        child_total = counts.sum()
        if child_total == 0:
            # An empty child has no first count and carries zero weight.
            continue
        weighted_entropy += child_total / total_bugs * twoGroupEntropy(counts[0], child_total)
    return weighted_entropy

### Initial entropy of the full group of bugs (both species)

In [40]:
# Number of bugs per species; the resulting Series is labelled by species name.
bug_species = bugs["Species"].value_counts()
total_bugs = bug_species.sum()

# Take the first count by position with .iloc: the labels are species names,
# so `bug_species[0]` would depend on the deprecated integer-key fallback of
# Series.__getitem__.
initial_ent = twoGroupEntropy(bug_species.iloc[0], total_bugs)
initial_ent

0.9798687566511528

### Split on Color: Brown

In [48]:
# Species counts for the brown bugs and for all remaining bugs.
bugs_brown = bugs.loc[bugs['Color'].eq('Brown'), 'Species'].value_counts()
bugs_not_brown = bugs.loc[bugs['Color'].ne('Brown'), 'Species'].value_counts()

# Information gain = parent entropy minus the weighted entropy of the children.
child_ent = childEntropy(bugs_brown, bugs_not_brown, total_bugs)
gain_ent = initial_ent - child_ent

gain_ent

0.06157292259666325

### Split on Color: Blue

In [47]:
# Species counts for the blue bugs and for all remaining bugs.
bugs_blue = bugs.loc[bugs['Color'].eq('Blue'), 'Species'].value_counts()
bugs_not_blue = bugs.loc[bugs['Color'].ne('Blue'), 'Species'].value_counts()

# Information gain = parent entropy minus the weighted entropy of the children.
child_ent = childEntropy(bugs_blue, bugs_not_blue, total_bugs)
gain_ent = initial_ent - child_ent

gain_ent

0.000589596275060833

### Split on Color: Green

In [44]:
# Species counts for the green bugs and for all remaining bugs.
bugs_green = bugs.loc[bugs['Color'].eq('Green'), 'Species'].value_counts()
bugs_not_green = bugs.loc[bugs['Color'].ne('Green'), 'Species'].value_counts()

# Information gain = parent entropy minus the weighted entropy of the children.
child_ent = childEntropy(bugs_green, bugs_not_green, total_bugs)
gain_ent = initial_ent - child_ent

gain_ent

0.042776048498108565

### Split on Length: >17.0 mm

In [55]:
# The criterion under test is "Length < 17.0 mm", so a bug measuring exactly
# 17.0 mm must fall in the longer group (>= 17.0), not the shorter one.
# bugs_l17 keeps holding the longer bugs and bugs_not_l17 the shorter ones.
bugs_l17 = bugs[bugs['Length (mm)'] >= 17.0]['Species'].value_counts()
bugs_not_l17 = bugs[bugs['Length (mm)'] < 17.0]['Species'].value_counts()

# Information gain = parent entropy minus the weighted entropy of the children.
child_ent = childEntropy(bugs_l17, bugs_not_l17, total_bugs)
gain_ent = initial_ent - child_ent

gain_ent

0.11260735516748976

### Split on Length: >20.0 mm

In [56]:
# The criterion under test is "Length < 20.0 mm", so a bug measuring exactly
# 20.0 mm must fall in the longer group (>= 20.0), not the shorter one.
# Dedicated names keep this cell from overwriting the 17 mm split's counts.
bugs_l20 = bugs[bugs['Length (mm)'] >= 20.0]['Species'].value_counts()
bugs_not_l20 = bugs[bugs['Length (mm)'] < 20.0]['Species'].value_counts()

# Information gain = parent entropy minus the weighted entropy of the children.
child_ent = childEntropy(bugs_l20, bugs_not_l20, total_bugs)
gain_ent = initial_ent - child_ent

gain_ent


0.10073322588651723