In [237]:
import numpy as np
from pandas import read_csv
import operator

# Display names for the eight feature columns, in the order they appear in the file.
header = [
    'pregnancies',
    'glucose',
    'blood pressure',
    'skin thickness',
    'insulin',
    'bmi',
    'pedigree',
    'age',
]

# The CSV has no header row; columns 0-7 are used as features, column 8 as the label.
data = np.array(read_csv('pima-indians-diabetes.data', header=None))
m = len(data)  # number of samples (rows)

# Recode label 0 as -1 so the two classes are -1 / +1, as the boosting code expects.
# The write goes into `data` itself; `labels` below is a view onto that column.
data[data[:, 8] == 0, 8] = -1
features = data[:, :8]
labels = data[:, 8]

In [238]:
# Boosting state shared by the cells below:
#   classifiers - stumps picked so far, each (feature index, threshold, lessthan, alpha)
#   weights     - per-sample weights, starting uniform (1/m each, summing to 1)
classifiers = []
weights = np.full(m, 1.0) / m

def accuracy():
    """Print the accuracy of the current stump ensemble on the loaded samples.

    Reads the module-level ``classifiers``, ``features``, ``labels`` and ``m``.
    Each classifier is a tuple ``(feature index, threshold, lessthan, alpha)``:
    when ``lessthan`` is truthy the stump votes -1 for values below the
    threshold and +1 otherwise; when it is falsy the two sides are swapped.
    Every vote is scaled by ``alpha`` and the ensemble prediction is the sign
    of the vote total.  A total of exactly zero has sign 0, which matches
    neither a -1 nor a +1 label, so it is counted as a miss.
    """
    votes = np.zeros(len(features))
    for column, threshold, below_is_negative, alpha in classifiers:
        values = features[:, column]
        if below_is_negative:
            negative_side = values < threshold
        else:
            negative_side = values >= threshold
        # Accumulate one classifier at a time so each sample's total is built
        # in classifier order.
        votes += np.where(negative_side, -1, 1) * alpha
    hits = int(np.sum(np.sign(votes) == labels))
    print('- accuracy = %.2f%%' % ((hits/m)*100))
    
NUM_ROUNDS = 3      # boosting rounds; one decision stump is added per round
ERR_FLOOR = 1e-10   # keeps alpha finite when a stump's weighted error is exactly 0 or 1


def _stump_predict(values, threshold, lessthan):
    """Return a stump's -1/+1 prediction for every entry of ``values``.

    With ``lessthan`` true, entries below ``threshold`` map to -1 and the rest
    to +1; with ``lessthan`` false the two sides are swapped.  Used for both
    scoring candidates and re-weighting samples so the two can never disagree.
    """
    negative_side = values < threshold if lessthan else values >= threshold
    return np.where(negative_side, -1, 1)


for i in range(NUM_ROUNDS):
    print('\nRound', i)

    # Score every candidate stump (feature, threshold, polarity) by the total
    # weight of the samples it misclassifies.
    scores = []
    for col in range(features.shape[1]):
        values = features[:, col]
        for thresh in np.unique(values):
            for polarity in (True, False):
                missed = _stump_predict(values, thresh, polarity) != labels
                # Built-in sum adds strictly left to right; candidates are
                # compared on these totals, and min() keeps the first of any tie.
                scores.append((col, thresh, polarity, sum(missed * weights)))

    f, t, lessthan, err = min(scores, key=operator.itemgetter(3))

    # A perfect stump (err == 0) would otherwise give log(1/0) = inf and turn
    # the weights into NaN on the next update.
    err = np.clip(err, ERR_FLOOR, 1 - ERR_FLOOR)
    alpha = 0.5 * np.log((1 - err) / err)
    classifiers.append((f, t, lessthan, alpha))

    accuracy()
    print('- \\alpha_%d = %.4f' % (i, alpha))
    print('- classify negative if', header[f], '<' if lessthan else '>=', t)

    # AdaBoost re-weighting: w_i <- w_i * exp(-alpha * y_i * h(x_i)), then
    # renormalise.  h must be the stump that was actually selected, including
    # its polarity; using the '<' rule for a '>=' stump would shrink the
    # weights of the samples it got wrong instead of growing them.
    weights *= np.exp(-labels * alpha * _stump_predict(features[:, f], t, lessthan))
    weights /= np.sum(weights)


Round 0
- accuracy = 75.00%
- \alpha_0 = 0.5493
- classify negative if glucose < 144.0

Round 1
- accuracy = 75.00%
- \alpha_1 = 0.2759
- classify negative if age < 29.0

Round 2
- accuracy = 75.00%
- \alpha_2 = 0.2544
- classify negative if bmi < 30.0
