In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt



In [2]:
from sklearn.tree import DecisionTreeClassifier

from collections import Counter
from scipy.stats.mstats import mode

# Running counters used to number saved figures. The plotting cells increment
# them with `+= 1`, so re-running a plotting cell advances the number.

# incremented by the bagging validation-curve plotting cell
bag_filenum = 0
# incremented by the AdaBoost validation-curve plotting cell (used in its savefig name)
boost_filenum = 0
# not referenced by any of the visible cells
balearn_num = 0
# incremented by the learning-curve plotting cell (used in its savefig name)
boolearn_num = 0


In [300]:
#bagging loop
#default tree depth = 1; uses stumps -- offers more direct comparison to stump-using AdaBoost
#although deeper trees are probably ideal.
def buildBagClassifier(X,y,K,treedepth = 1, counts_tiebreak = False):
    """Fit K randomized decision trees on (X, y) and return them as a list.

    Every tree is fitted on the full, unresampled training data; the only
    source of variation between trees is ``splitter = 'random'``.

    NOTE(review): no bootstrap resampling happens here, so this is not
    bagging in the usual sense -- confirm that this is intended.
    NOTE(review): ``presort`` is not accepted by newer scikit-learn
    releases -- confirm the installed version still supports it.

    Parameters
    ----------
    X, y : training inputs and labels, handed straight to ``fit``.
    K : number of trees to build.
    treedepth : ``max_depth`` of every tree. The default of 1 yields stumps,
        for a more direct comparison with the stump-based AdaBoost.
    counts_tiebreak : currently unused (tie-breaking is not implemented).

    Returns
    -------
    list of K fitted DecisionTreeClassifier objects.
    """
    # Presorting is only requested for shallow trees (depth 1 or 2), where it
    # is expected to speed up training.
    shallow = treedepth < 3

    # fit() returns the estimator itself, so each list entry is a fitted tree.
    return [
        DecisionTreeClassifier(max_depth = treedepth, presort = shallow, splitter = 'random').fit(X,y)
        for _ in range(0,K)
    ]

In [301]:
def get_max_ind(indices, inlist):
    """Return the element of ``indices`` whose entry in ``inlist`` is largest.

    Parameters
    ----------
    indices : non-empty sequence of candidate positions into ``inlist``.
    inlist : indexable collection of comparable values.

    Returns
    -------
    The candidate index with the greatest ``inlist`` value. On ties the
    candidate that appears first in ``indices`` wins.

    Raises
    ------
    IndexError if ``indices`` is empty.
    """
    # Start from the first candidate itself. Starting from a literal 0 would
    # return 0 whenever the first candidate already holds the maximum, even
    # if 0 is not one of the candidates.
    bestind = indices[0]
    bestval = inlist[bestind]
    for i in indices:
        if(inlist[i] > bestval):
            bestval = inlist[i]
            bestind = i
    return bestind
        
def get_mode(inlist,breakers=None):
    """Return the most common value in ``inlist``.

    ``breakers`` is accepted but not used. Ties between several modes are not
    broken randomly (or on priors); whichever value ``mode`` reports is
    returned, which is arbitrary but deterministic.
    """
    modal = mode(inlist)[0][0]
    # For 2-D input the first row of the mode is still a (masked) array;
    # unwrap its single entry. Anything else is handed back unchanged.
    return modal[0] if isinstance(modal, np.ma.MaskedArray) else modal

def bag_predict(x,bag,breakers=None):
    """Predict for ``x`` by majority vote over every tree in ``bag``.

    Each tree's ``predict(x)`` output is collected and the vote is resolved
    by ``get_mode``; ``breakers`` is forwarded to it unchanged.
    """
    votes = [member.predict(x) for member in bag]
    return get_mode(votes,breakers)
    
def bag_score(xs,ys,bag,breakers=None):
    """Return the fraction of samples in ``xs`` that ``bag`` classifies as ``ys``.

    Samples are scored one at a time: row ``i`` of ``xs`` is reshaped to a
    single-row 2-D array, classified with ``bag_predict`` and compared with
    ``ys[i]``.
    """
    total = len(ys)
    hits = sum(
        (1. for row in range(0,total)
         if bag_predict(xs[row].reshape(1,-1),bag,breakers) == ys[row]),
        0.)
    return hits/total

In [484]:
#AdaBoost loop
def buildAdaBoostClassifier(X,y,K):
    """Train K weighted decision stumps with AdaBoost.

    Each round fits a depth-1 tree using the current sample weights, sums the
    weights of the samples it misclassifies (``error``), multiplies the
    weights of the correctly classified samples by ``error/(1 - error)``,
    renormalises the weights, and records the stump together with its vote
    weight ``log((1 - error)/error)``.

    Parameters
    ----------
    X : array of training inputs, one row per sample.
    y : training labels, one per row of ``X``.
    K : number of boosting rounds (stumps) to build.

    Returns
    -------
    (stumps, z) : the list of fitted stumps and the list of their vote
        weights, in the order they were built.

    NOTE(review): ``presort`` is not accepted by newer scikit-learn
    releases -- confirm the installed version still supports it.
    """
    nsamps = len(y)
    labels = np.asarray(y).ravel()

    stumps = []
    z = []

    # start from uniform sample weights
    weights = np.ones(nsamps)/nsamps

    #build K stumps with updating weights
    for step in range(0,K):
        #fit stump based on weights
        #set presorting true to improve training speed
        newstump = DecisionTreeClassifier(max_depth = 1, presort = True)
        newstump.fit(X,y,sample_weight=weights)

        # One batched predict call replaces a separate predict call for
        # every training sample in every round.
        missed = newstump.predict(X) != labels

        # Builtin sum adds the misclassified weights left to right, i.e. in
        # the same order a per-sample loop would, so the error is unchanged.
        error = sum(weights[missed])

        #if ever get no error, assume error would occur on average once in twice as large dataset
        if(error == 0.):
            error = .5/nsamps

        #reduce weight of correctly classified samples (based on quality of stump)
        weights[~missed] *= error/(1.-error)

        #normalize weights
        weights = weights/sum(weights)

        #add stump-weight z and stump newstump to appropriate lists
        z.append(np.log((1.-error)/error))
        stumps.append(newstump)

    return stumps, z

In [305]:
def get_max_key(indict):
    """Return the key of ``indict`` with the largest value.

    When several keys share the largest value, the one met first while
    iterating over ``indict.keys()`` is returned. An empty mapping gives
    ``None``.
    """
    candidates = list(indict.keys())
    if not candidates:
        return None
    # max() keeps the first maximal element, matching a strict ">" scan.
    return max(candidates, key=lambda k: indict[k])
    
    
def boost_predict(x,stumps,z):
    """Predict the class of ``x`` by a weighted vote of the boosted stumps.

    Each stump's predicted class receives that stump's weight ``z[i]``; the
    class with the largest accumulated weight is returned (``None`` when
    ``stumps`` is empty).

    Parameters
    ----------
    x : a single sample in the form the stumps' ``predict`` accepts.
    stumps : sequence of fitted classifiers.
    z : sequence of vote weights, one per stump.
    """
    scores = {}
    for i, stump in enumerate(stumps):
        cpred = stump.predict(x)

        #if guesses are arrays, assume they are trivial
        #i.e. containing the single guess value (default behavior for numerical classes)
        if(isinstance(cpred,np.ndarray)):
            guess = cpred[0]
        #otherwise, assume guesses are good keys
        else:
            guess = cpred

        # dict.get supplies the starting score for a class seen for the first
        # time; no bare "except:" is needed, so unrelated errors are no longer
        # swallowed on the first attempt.
        scores[guess] = scores.get(guess, 0.) + z[i]
    return get_max_key(scores)
            
def boost_score(xs,ys,stumps, z):
    """Return the fraction of samples in ``xs`` the boosted ensemble gets right.

    Row ``i`` of ``xs`` is reshaped to a single-row 2-D array, classified with
    ``boost_predict`` and compared with ``ys[i]``.
    """
    total = len(ys)
    hits = sum(
        (1. for row in range(0,total)
         if boost_predict(xs[row].reshape(1,-1),stumps,z) == ys[row]),
        0.)
    return hits/total

In [168]:
# Column names for the arrhythmia table: Input_0 .. Input_278, then the label.
arr_names = ["Input_" + str(i) for i in range(0,279)]
arr_names.append("Output")

# The file has no header row; "?" marks a missing value.
dfa = pd.read_table("../HW4/arrhythmia.data", sep = ",", header = None, names=arr_names, na_values = ["?"])

In [169]:
from sklearn.preprocessing import Imputer
#impute mean for integer, float; impute mode otherwise


def categorical_to_int(cat_col):
    """Map each value of ``cat_col`` to an integer code and return the codes.

    Codes are handed out in order of first appearance (0, 1, 2, ...), and
    repeated values reuse the code they were given the first time.
    """
    codes = {}
    # len(codes) is always the next unused code, so setdefault either looks up
    # the existing code or registers a new one.
    return [codes.setdefault(item, len(codes)) for item in cat_col]

def imputed_version(dfin, excl_cols = []):
    """Return a copy of ``dfin`` with missing values filled in, column by column.

    * float64 / int64 columns: missing entries are replaced by the column
      mean; the column comes back as float64.
    * every other column: values are replaced by integer codes numbered in
      order of first appearance (stored as float64), and missing entries get
      the most frequent code (the smallest such code on ties).
    * columns listed in ``excl_cols`` are left untouched.

    Only pandas and numpy are used. ``Series.reshape`` and sklearn's
    ``Imputer``, which this function used to call, no longer exist in current
    pandas / scikit-learn releases.

    Parameters
    ----------
    dfin : DataFrame to impute; it is not modified.
    excl_cols : column names to skip (never modified here).

    Returns
    -------
    DataFrame with the same columns as ``dfin``.
    """
    df = dfin.copy()

    for col in df.columns:
        if(col in excl_cols):
            continue

        if(df[col].dtype == np.float64 or df[col].dtype == np.int64):
            # cast first so integer columns also come back as float64
            numeric = df[col].astype(np.float64)
            df[col] = numeric.fillna(numeric.mean())
        else:
            # factorize marks missing entries with -1 instead of giving them a
            # code of their own, so they can actually be imputed below.
            codes = pd.factorize(df[col])[0].astype(np.float64)
            missing = codes < 0
            if(missing.any()):
                observed = codes[~missing]
                if(observed.size):
                    # np.unique sorts its output and argmax returns the first
                    # maximum, so ties resolve to the smallest code.
                    vals, counts = np.unique(observed, return_counts=True)
                    codes[missing] = vals[counts.argmax()]
                else:
                    # nothing observed: there is no mode to impute with
                    codes[missing] = np.nan
            df[col] = codes
    return df


In [336]:
# Impute every column except the label. Note that this rebinds `dfa`, so the
# un-imputed frame is no longer available after this cell has run.
dfa = imputed_version(dfa, ['Output'])
# print(dfa['Input_13'][3])
# print(np.mean(dfa['Input_13'][:]))

# Feature matrix (the first 279 named columns) and label vector as numpy arrays.
Xa = np.asarray(dfa[arr_names[:279]])
ya = np.asarray(dfa['Output'])


In [183]:

# Collapse the original label into a binary target:
# 1 where the original class is <= thresh, 0 everywhere else.
thresh = 4
ya2 = np.where(ya <= thresh, 1, 0)

(452, 279)

In [194]:
# The first 350 rows form the training set; row 350 is kept as a single
# test sample, reshaped to one 2-D row.
Xatrain = Xa[:350]
testa = Xa[350].reshape(1,-1)

In [337]:
# Binary training labels for the same first 350 rows used for Xatrain.
yatrain = ya2[:350]

In [350]:
# Fit an ensemble of `treea` trees (default depth 1) and report its accuracy
# on the data it was trained on.
treea = 151
baga = buildBagClassifier(Xatrain,yatrain,treea)
train_bag_a = bag_score(Xatrain, yatrain, baga)
print(train_bag_a)

0.7771428571428571


In [351]:
# Fit AdaBoost with the same number of stumps (`treea`, set in an earlier
# cell) and report its accuracy on the training data.
boosta, za = buildAdaBoostClassifier(Xatrain,yatrain,treea)
train_boost_a = boost_score(Xatrain, yatrain, boosta, za)
print(train_boost_a)

0.9685714285714285


In [417]:
#Calculate validation curve values (training + test scores) on number K of learners

# old_ntrees = [1,3,5,11,31,51,75,101,151,201,251] + [125,175,275,301,325]
ntrees = [525,551,575,601]

# which ensembles to (re)compute in this run
bags = False
boost = True

# addto = False starts the score lists afresh.
# addto = True appends to the lists left by earlier runs of this cell, so it
# needs at least one earlier run with addto = False in the same kernel.
addto = True

# The x-axis lists are always rebuilt as new list objects. Binding them
# directly to `ntrees` would make the bagging and boosting lists the same
# object, and a later in-place extension would then lengthen that shared list
# once per ensemble, leaving it longer than the score lists it is plotted
# against.
if(bags):
    if(not(addto)):
        bag_trainscores = []
        bag_testscores = []
        ntrees_graph_bags = []
    ntrees_graph_bags = ntrees_graph_bags + list(ntrees)

if(boost):
    if(not(addto)):
        boost_trainscores = []
        boost_testscores = []
        ntrees_graph_boost = []
    ntrees_graph_boost = ntrees_graph_boost + list(ntrees)

for treea in ntrees:
    if(bags):
        # NOTE(review): `bagdepth` is not defined in any visible cell -- it
        # must already exist in the kernel when bags is True.
        baga = buildBagClassifier(Xatrain,yatrain,treea,treedepth = bagdepth)
        bag_trainscores.append(bag_score(Xatrain, yatrain, baga))
        bag_testscores.append(bag_score(Xa[350:],ya2[350:],baga))

    if(boost):
        boosta, za = buildAdaBoostClassifier(Xatrain,yatrain,treea)
        boost_trainscores.append(boost_score(Xatrain, yatrain, boosta, za))
        boost_testscores.append(boost_score(Xa[350:],ya2[350:],boosta,za))

    print(str(treea) + " done!")

525 done!
551 done!
575 done!
601 done!


In [414]:
#plot stump bag K validation curve
# Training scores are drawn as blue dots and test scores as red dots, against
# the number of trees in the ensemble.
# NOTE(review): `bagdepth` is not defined in any visible cell -- it must
# already exist in the kernel.
# The counter advances on every run of this cell, including re-runs.
bag_filenum += 1
plt.clf()
# ax = plt.axes()
plt.title("Bag Validation Curve")
plt.plot(ntrees_graph_bags,bag_trainscores,'b.',label = 'Bagging Training Scores')
plt.plot(ntrees_graph_bags,bag_testscores,'r.', label = 'Bagging Test Scores')
plt.xlabel("Number of Trees of Depth " + str(bagdepth) + " Used")
plt.ylabel("Score (Fraction Correct)")
plt.legend(loc=4)
plt.show()
# saving is currently disabled; the figure is only displayed
# plt.savefig('./7AdaBoost_and_StumpBag/bag_' + str(bag_filenum) + "_depth_" + str(bagdepth))

In [463]:
#plot AdaBoost K validation curve
# Training scores are drawn as blue dots and test scores as red dots, against
# the number of stumps.
# The counter advances on every run of this cell, so each run saves the figure
# under a new file name.
boost_filenum += 1
plt.clf()
plt.title("AdaBoost Validation Curve")
plt.plot(ntrees_graph_boost,boost_trainscores,'b.', label = "AdaBoost Training Scores")
plt.plot(ntrees_graph_boost,boost_testscores,'r.', label = "AdaBoost Test Scores")
plt.xlabel("Number of Stumps Used")
plt.ylabel("Score (Fraction Correct)")
plt.legend(loc=4)
# the figure is written to disk before it is shown
plt.savefig('./7AdaBoost_and_StumpBag/boost_' + str(boost_filenum))
plt.show()

In [485]:
#Learning curves: scores on test set vs size of train set
#(note final curves won't include train set scores - just trying to replicate the Russell + Norvig graph here)

#get AdaBoost test scores
# NOTE(review): the held-out rows here start at 360, while the validation-curve
# cell tests on rows from 350 onward -- confirm the different split is intended.
Kstar = 101
trainsizes = range(10,360,35)
boost_learnscores = []

for ntrain in trainsizes:
    # train on the first ntrain rows only, then score on the held-out rows
    currXa, currya = Xatrain[:ntrain], yatrain[:ntrain]
    boostal, zal = buildAdaBoostClassifier(currXa,currya,Kstar)
    boost_learnscores.append(boost_score(Xa[360:],ya2[360:],boostal,zal))



In [487]:
# number of trees in each bagged ensemble of the learning curve
Kbag = 101

#get bag of stumps test scores
bag_learnscores = []
for ntrain in trainsizes:
    # train on the first ntrain rows only, then score on the held-out rows
    currXa = Xatrain[:ntrain]
    currya = yatrain[:ntrain]
    # Use this cell's own Kbag; passing the AdaBoost cell's Kstar would leave
    # Kbag without any effect on the ensemble size.
    bagal = buildBagClassifier(currXa,currya,Kbag)
    bag_learnscores.append(bag_score(Xa[360:],ya2[360:],bagal))



In [488]:
#plot learning curves
# Draws the AdaBoost test scores (solid red line) and the bagging test scores
# (dashed black line) against the number of training examples.
# The counter advances on every run of this cell, so each run saves the figure
# under a new file name.
boolearn_num += 1
plt.clf()
plt.title("AdaBoost Learning Curve")
plt.plot(trainsizes,boost_learnscores,'r-', label = "AdaBoost Learning Scores")
plt.plot(trainsizes, bag_learnscores, 'k--', label = "Bagging Learning Scores")
plt.xlabel("Number of Training Examples")
plt.ylabel("Score (Fraction Correct) On Test Set")
plt.legend(loc=4)
# the figure is written to disk before it is shown
plt.savefig('./7AdaBoost_and_StumpBag/boost_learn_' + str(boolearn_num))
plt.show()

(452, 279)