In [132]:
import numpy as np
import matplotlib.pyplot as plt

In [140]:
def loadSimpData():
    datMat = np.array([[ 1. ,  2.1],
        [ 2. ,  1.1],
        [ 1.3,  1. ],
        [ 1. ,  1. ],
        [ 2 ,  1. ]])
    classLabels = np.array([1, 1, -1, -1, 1])
    return datMat,classLabels


def loadDatasets(filename):
    fr = open(filename)
    lengthOfFeat = len(fr.readline().strip().split('\t'))
    datasets = []
    labels = []
    for line in fr.readlines():
        array = np.array(line.strip().split('\t'))
        datasets.append(array[:lengthOfFeat - 1])
        labels.append(float(array[-1]))
        
    n, p = np.shape(datasets)
    dataMat = np.zeros((n, p))
    for i in range(n):
        for j in range(p):
            dataMat[i][j] = float(datasets[i][j])
    return dataMat, np.array(labels)


def stumpClassify(dataMatrix,dimen,threshVal):#just classify the data
    retArray = np.ones((np.shape(dataMatrix)[0]))
    retArray[dataMatrix[:,dimen] <= threshVal] = -1.0
    return retArray
   
    
def buildStump(datasets, labels, D):
    n, dimension = np.shape(datasets)
    numStep = 10
    minError = np.inf # initialize minimal error
    bestClassifier = {'dimension': 0, 'threshold': 0}  # initialize the threshold
    for dim in range(dimension):  # for each dimension's feature set
        minData = np.min(datasets[:, dim])
        maxData = np.max(datasets[:, dim])
        stepSize = (maxData - minData) / numStep
        for j in range(-1, numStep + 2):
            errorArray = np.zeros(len(datasets))  # initialize error array for missclassification counting
            threshold = minData + j * stepSize
            labelArray = stumpClassify(datasets, dim, threshold)
            errorArray[labels != labelArray] = 1  # count the number of missclassification data
            error = np.sum(D * errorArray)
            if error < minError:
                minError = error
                classEst = labelArray
                bestClassifier['dimension'] = dim
                bestClassifier['threshold'] = threshold     
    return bestClassifier, classEst, minError


def adaBoostModel(datasets, labels, iteNum):
    n, p = np.shape(datasets)  # n: number of instances, p: dimension for each instance
    D = np.ones(n) / n  # initialize the weights array for each instance
    classEstimate = np.zeros(len(labels))  # initialize the final classifier
    predictLabel = np.ones(len(labels)) # initialize final predicted labels for each instance
    classifiersBuckets = [] # store each weak basic classifier
    alpha = np.zeros(iteNum)
    for ite in range(iteNum):
        bestClassifier, classEst, minError = buildStump(datasets, labels, D)
        alpha[ite] = 0.5 * np.log((1 - minError) / minError)  # update classifier linear coefficient alpha
        bestClassifier['D'] = D
        bestClassifier['alpha'] = alpha[ite]
        classifiersBuckets.append(bestClassifier)  # add this weak classifer into buckets
        Z = np.sum(D * np.exp(-alpha[ite] * labels * classEst))
        D = D * np.exp(-alpha[ite] * labels * classEst) / Z  # update weigth for each instance
        classEstimate += alpha[ite] * classEst
    classEstimate = np.sign(classEstimate)
    finalError = len(predictLabel[classEstimate != labels]) / len(labels)
    return classEstimate, finalError, classifiersBuckets


def accuracyAda(testSets, testLabels, classifiers):
    """
    input a single unkonwn test datasets instance and 
    output its final label using the train model classifiers
    """
    predictedLabels = 0
    for classify in classifiers:
        labels = stumpClassify(testSets,classify['dimension'],classify['threshold'])
        predictedLabels += classify['alpha'] * labels
    predictedLabels = np.sign(predictedLabels)
    accuracy = len(testLabels[predictedLabels == testLabels]) / len(testLabels)
    return accuracy

In [142]:
dataMat,classLabels = loadSimpData()
numIt = 3

classEstimate, minError, classifiersBuckets = adaBoostModel(dataMat, classLabels, numIt)
print(classEstimate)
print(minError)
print(classifiersBuckets)

[ 1.  1. -1. -1.  1.]
0.0
[{'dimension': 0, 'threshold': 1.3, 'D': array([0.2, 0.2, 0.2, 0.2, 0.2]), 'alpha': 0.6931471805599453}, {'dimension': 1, 'threshold': 1.0, 'D': array([0.5  , 0.125, 0.125, 0.125, 0.125]), 'alpha': 0.9729550745276565}, {'dimension': 0, 'threshold': 0.9, 'D': array([0.28571429, 0.07142857, 0.07142857, 0.07142857, 0.5       ]), 'alpha': 0.8958797346140273}]


In [143]:
accuracy = accuracyAda(np.array([[5, 5], [0, 0]]), np.array([1, -1]), classifiersBuckets)
accuracy

1.0

In [156]:
trainSets, trainLabels = loadDatasets('horseColicTraining2.txt')  # 298*21
testSets, testLabels = loadDatasets('horseColicTest2.txt')

numClassifier = [1, 10, 50, 100, 500, 1000]
for numIt in numClassifier:
    classEstimate, minError, classifiersBuckets = adaBoostModel(trainSets, trainLabels, numIt)
    trainAccuracy = 1 - minError
    testAccuracy = accuracyAda(testSets, testLabels, classifiersBuckets)
    print("Number of classifier:{}, train accuracy:{}, testAccuracy:{}".format(numIt, trainAccuracy, testAccuracy))


Number of classifier:1, train accuracy:0.6342281879194631, testAccuracy:0.7121212121212122
Number of classifier:10, train accuracy:0.6476510067114094, testAccuracy:0.6666666666666666
Number of classifier:50, train accuracy:0.6610738255033557, testAccuracy:0.6818181818181818
Number of classifier:100, train accuracy:0.6577181208053691, testAccuracy:0.696969696969697
Number of classifier:500, train accuracy:0.6577181208053691, testAccuracy:0.696969696969697
Number of classifier:1000, train accuracy:0.6577181208053691, testAccuracy:0.696969696969697
