In [2]:
from __future__ import division # For python 2.*

import numpy as np
import matplotlib.pyplot as plt
import mltools as ml

In [3]:
X = np.genfromtxt('data/X_train.txt', delimiter=None)
Y = np.genfromtxt('data/Y_train.txt', delimiter=None)
X,Y = ml.shuffleData(X,Y)

In [4]:
class BaggedTree(ml.base.classifier):
    def __init__(self, learners):
        """Constructs a BaggedTree class with a set of learners. """
        self.learners = learners
    
    def predictSoft(self, X):
        """Predicts the probabilities with each bagged learner and average over the results. """
        n_bags = len(self.learners)
        preds = [self.learners[l].predictSoft(X) for l in range(n_bags)]
        return np.mean(preds, axis=0)

In [19]:
# not split data, using random forest
Xtr = X
Ytr = Y
ensemble = [0]*25
Xte = np.genfromtxt("data/X_test.txt",delimiter=None)
Ypred = np.zeros((np.size(Xte,0),1))
for i in range(25):
    Xi,Yi = ml.bootstrapData(Xtr,Ytr)
    ensemble[i] = ml.dtree.treeClassify(Xi,Yi,maxDepth=8
            ,minLeaf=256,nFeatures=60,minParent=512)
    Ypred = Ypred + ensemble[i].predictSoft(Xte)
Ypred = Ypred/25
print("AUC after 25 learners = " + str(ensemble[24].auc(Xi,Yi)))
np.savetxt('Ypred_.txt',
np.vstack( (np.arange(len(Ypred)) , Ypred[:,1]) ).T,
'%d, %.2f',header='ID,Prob1',comments='',delimiter=',');


AUC after 25 learners = 0.694709343514


In [21]:
# not split data, using random forest
Xtr = X
Ytr = Y
ensemble = [0]*20
Xte = np.genfromtxt("data/X_test.txt",delimiter=None)
Ypred = np.zeros((np.size(Xte,0),1))
for i in range(20):
    Xi,Yi = ml.bootstrapData(Xtr,Ytr)
    ensemble[i] = ml.dtree.treeClassify(Xi,Yi,maxDepth=8
            ,minLeaf=256,nFeatures=60,minParent=512)
    Ypred = Ypred + ensemble[i].predictSoft(Xte)
Ypred = Ypred/20
print("AUC after 20 learners = " + str(ensemble[19].auc(Xi,Yi)))
# np.savetxt('Ypred_.txt',
# np.vstack( (np.arange(len(Ypred)) , Ypred[:,1]) ).T,
# '%d, %.2f',header='ID,Prob1',comments='',delimiter=',');


AUC after 20 learners = 0.695381852331


In [5]:
# not split data, using random forest
Xtr = X
Ytr = Y
ensemble = [0]*15
Xte = np.genfromtxt("data/X_test.txt",delimiter=None)
Ypred = np.zeros((np.size(Xte,0),1))
for i in range(15):
    Xi,Yi = ml.bootstrapData(Xtr,Ytr)
    ensemble[i] = ml.dtree.treeClassify(Xi,Yi,maxDepth=8
            ,minLeaf=256,nFeatures=60,minParent=512)
    Ypred = Ypred + ensemble[i].predictSoft(Xte)
Ypred = Ypred/15
print("AUC after 15 learners = " + str(ensemble[14].auc(Xi,Yi)))
np.savetxt('Ysubmit.txt',
np.vstack( (np.arange(len(Ypred)) , Ypred[:,1]) ).T,
'%d, %.2f',header='ID,Prob1',comments='',delimiter=',');


AUC after 15 learners = 0.693575158225


In [18]:
# spit data to 0.8, using radom forest
[Xtr,Xva,Ytr,Yva] = ml.splitData(X,Y,0.80)

ensemble1 = [0]*25
Xte = np.genfromtxt("data/X_test.txt",delimiter=None)
Ypred1 = np.zeros((np.size(Xte,0),1))
for i in range(25):
    Xi,Yi = ml.bootstrapData(Xtr,Ytr)
    ensemble1[i] = ml.dtree.treeClassify(Xi,Yi,maxDepth=8
            ,minLeaf=256,nFeatures=60,minParent=512)
    Ypred1 = Ypred1 + ensemble1[i].predictSoft(Xte)
Ypred1 = Ypred1/25
print("AUC after 25 learners = " + str(ensemble1[24].auc(Xi,Yi)))
# np.savetxt('Ypred1.txt',
# np.vstack( (np.arange(len(Ypred1)) , Ypred1[:,1]) ).T,
# '%d, %.2f',header='ID,Prob1',comments='',delimiter=',');


AUC after 25 learners = 0.692043573365


In [None]:
# use Ada Boost
nBoost = 25
for i in range(nBoost):
    learner[i] = ml.MyClassifier( X,Y, weights=wts ) # train a weighted classifier
    Yhat = learner[i].predict(X)
    e = wts.dot( Y != Yhat ) # compute weighted error rate
    alpha[i] = 0.5 * np.log( (1-e)/e )
    wts *= np.exp( -alpha[i] * Y * Yhat ) # update weights
    wts /= wts.sum() # and normalize them

In [None]:
# Final classifier:
predict = np.zeros( (mTest,) )
for i in range(nBoost):
    predict += alpha[i] * learner[i].predict(Xtest)# compute contribution of each
predict = np.sign(predict) # and convert to +1 / -1 decision
print("AUC after 25 learners = " + str(predict[24].auc(Xi,Yi)))