In [94]:
import pandas as pd
import math
import csv
import pylab as pl
import time

# 1. Small Decision Tree by maximizing information gain

## 1.1 entropy function

$$Entropy(S) = -\sum_{i} p_i \log_2 p_i$$

In [43]:
def get_entropy(data):
    """Return the Shannon entropy, in bits, of a collection of class counts.

    Parameters
    ----------
    data : iterable of numbers
        Number of observations in each class, e.g. ``[2, 2, 2]``.
        Any iterable is accepted, including one-shot generators.

    Returns
    -------
    float
        ``-sum(p * log2(p))`` where ``p = count / sum(data)``.
        Classes whose proportion is not positive contribute nothing, and an
        empty or all-zero input yields ``0.0``.
    """
    # Materialise once so the counts can be both summed and iterated.
    counts = list(data)
    total = float(sum(counts))

    # No observations at all: define the entropy as zero instead of dividing by 0.
    if total == 0:
        return 0.0

    entropy = 0.0
    for count in counts:
        p = count / total
        # log(p) is undefined for p <= 0; such classes add nothing to the sum.
        if p > 0:
            entropy -= p * math.log(p, 2)

    return entropy


In [37]:
# Sanity check: three equally frequent classes.
get_entropy([2,2,2])

1.584962500721156

## 1.2 Load data

In [22]:
# Load the Simpsons data set; the path is relative to the working directory.
simpson = pd.read_csv('data/simpson.csv')

In [23]:
# Display the whole frame.
simpson

Unnamed: 0,Person,Hair_Length,Weight,Age,Class
0,Homer,0,250,36,M
1,Marge,10,150,34,F
2,Bart,2,90,10,M
3,Lisa,6,78,8,F
4,Maggie,4,20,1,F
5,Abe,1,170,70,M
6,Selma,8,160,41,F
7,Otto,10,180,38,M
8,Krusty,6,200,45,M
9,Comic,8,290,38,M


In [25]:
# Summary statistics of the numeric columns.
simpson.describe()

Unnamed: 0,Hair_Length,Weight,Age
count,10.0,10.0,10.0
mean,5.5,158.8,32.1
std,3.62859,80.56026,20.566694
min,0.0,20.0,1.0
25%,2.5,105.0,16.0
50%,6.0,165.0,37.0
75%,8.0,195.0,40.25
max,10.0,290.0,70.0


## 1.3 get maximum information gain

In [78]:
# Scan the thresholds 0.0, 0.1, ..., 10.0 on Hair_Length and keep the split
# "Hair_Length <= k" that yields the largest information gain.
# The grid is built from integer steps instead of `pylab.frange`, which was
# removed from matplotlib in 3.1; integer steps also keep every threshold exact.
thresholds = [step / 10.0 for step in range(101)]

# The parent node is the same for every threshold, so its size, class counts
# and entropy are computed once, outside the loop.
sum_l = len(simpson)
M = len(simpson[simpson['Class']=='M'])
F = len(simpson[simpson['Class']=='F'])
current = get_entropy([M,F])

maxgain=-1
maxk=0
for k in thresholds:
    # s: rows at or below the threshold, b: all remaining rows.
    s = simpson[simpson['Hair_Length'] <= k]
    b = simpson.drop(s.index)

    s_l = len(s)
    b_l = len(b)
    s_M = len(s[s['Class']=='M'])
    s_F = len(s[s['Class']=='F'])
    b_M = len(b[b['Class']=='M'])
    b_F = len(b[b['Class']=='F'])

    # Entropy after the split: child entropies weighted by child size.
    child = get_entropy([s_M,s_F])*(s_l*1.0/sum_l) + get_entropy([b_M,b_F])*(b_l*1.0/sum_l)

    gain = current-child

    # Strict comparison: among equal gains the smallest threshold is kept.
    if gain>maxgain:
        maxk=k
        maxgain=gain

In [79]:
# Best Hair_Length threshold found by the scan and the information gain it achieves.
maxk, maxgain

(2.0, 0.2812908992306925)

# 2. Cross Validation

In [85]:
from sklearn import tree
from sklearn import metrics
try:
    # scikit-learn >= 0.18 provides the cross-validation helpers here.
    from sklearn.model_selection import cross_val_score
except ImportError:
    # Fallback for old releases that only ship the since-removed module.
    from sklearn.cross_validation import cross_val_score

In [80]:
# Load the iris data set; the path is relative to the working directory.
iris_data = pd.read_csv('data/iris.csv')

In [86]:
# Features are the first four columns; the label is the 'species' column.
iris_X = iris_data.loc[:, iris_data.columns[:4]]
iris_Y = iris_data.loc[:, 'species']

In [97]:
# For each fold count from 5 to 14, cross-validate a depth-5 decision tree
# and report the mean score together with the elapsed wall-clock time.
for i in range(5, 15):
    start = time.time()
    clf = tree.DecisionTreeClassifier(max_depth=5)
    scores = cross_val_score(estimator=clf, X=iris_X, y=iris_Y, cv=i, n_jobs=4)
    end = time.time()

    print("".join([
        "for ", str(i),
        "th fold cross ", str(scores.mean()),
        " and time spend ", str(end - start),
    ]))
    

for 5th fold cross 0.96 and time spend 0.180916070938
for 6th fold cross 0.959104938272 and time spend 0.163623094559
for 7th fold cross 0.946428571429 and time spend 0.176446914673
for 8th fold cross 0.96626984127 and time spend 0.162850856781
for 9th fold cross 0.954320987654 and time spend 0.171447038651
for 10th fold cross 0.953333333333 and time spend 0.167570114136
for 11th fold cross 0.962121212121 and time spend 0.176992177963
for 12th fold cross 0.952777777778 and time spend 0.176806926727
for 13th fold cross 0.961538461538 and time spend 0.176886796951
for 14th fold cross 0.954365079365 and time spend 0.306139945984


# 3. AdaBoost

In [92]:
from sklearn.ensemble import AdaBoostClassifier

In [99]:
# Cross-validate AdaBoost (100 estimators) for each fold count from 5 to 14 and
# report the mean score together with the elapsed wall-clock time.
# Every fold count is evaluated inside the loop, so this cell does not depend
# on a loop variable left over from another cell.
for i in range(5,15):
    start = time.time()
    clf = AdaBoostClassifier(n_estimators=100)
    scores = cross_val_score(estimator=clf, X=iris_X, y=iris_Y, cv=i, n_jobs=4)

    end = time.time()
    print ("for "+str(i)+"th fold cross "+str(scores.mean())+" and time spend "+str(end-start))
    

for 5th fold cross 0.946666666667 and time spend 0.566279888153
for 6th fold cross 0.945216049383 and time spend 0.571048974991
for 7th fold cross 0.953231292517 and time spend 0.67126083374
for 8th fold cross 0.945436507937 and time spend 0.778046131134
for 9th fold cross 0.948148148148 and time spend 0.880843877792
for 10th fold cross 0.946666666667 and time spend 0.877290964127
for 11th fold cross 0.948484848485 and time spend 0.988434076309
for 12th fold cross 0.945833333333 and time spend 1.07829189301
for 13th fold cross 0.948717948718 and time spend 1.28095078468
for 14th fold cross 0.948412698413 and time spend 1.81405186653
