In [1]:
import numpy as np
import math

In [2]:
train_data, train_labels = [], []
test_data, test_labels = [], []
dictionary = []

with open("hw6train.txt") as train:
    for email in train.readlines():
        point = [int(n) for n in email.split()]
        train_data.append(point[:-1])
        train_labels.append(point[-1])

with open("hw6test.txt") as test:
    for email in test.readlines():
        point = [int(n) for n in email.split()]
        test_data.append(point[:-1])
        test_labels.append(point[:-1])

with open("hw6dictionary.txt") as d:
    dictionary = [email.rstrip() for email in d.readlines()]

In [3]:
print len(train_data[0]), len(dictionary)

4003 4003


In [4]:
pos_classifiers, neg_classifiers = [], []
for word_index in dictionary:
    pos_classifiers.append(lambda email: email[word_index] == 1)
    neg_classifiers.append(lambda email: email[word_index] != 1)

In [5]:
def predict(data, classifier_list):
    weighted_sum = 0
    for alpha, classifier in classifier_list:
        weighted_sum += alpha * classifier(data)
    return 1 if weighted_sum > 0 else -1

In [6]:
def error(data, labels, classifier_list):
    error_count = 0
    for point, label in zip(data, labels):
        if predict(point, classifier_list) != label:
            error_count += 1
    return error_count / len(data)

In [7]:
def adaboost_train(data, labels, iterations):
    weights = [1.0 / len(data) for _ in data]
    classifier_list = []
    for t in xrange(iterations):
        classifier, error, rescale_factors = get_best_classifier_error_rescale_factor(weights, data, labels)
        weights, alpha = rescale(weights, error, rescale_factors)
        classifier_list.append((alpha, classifier))
    return classifier_list

In [8]:
def rescale(weights, error, rescale_factors):
    new_weights = []
    alpha = 0.5 * math.log((1 - error) / error)
    for weight, factor in zip(weights, rescale_factors):
        new_weights.append(weight * math.exp(-alpha * factor))
    normalization_constant = sum(new_weights)
    return [weight / normalization_constant for weight in new_weights], alpha

In [9]:
def get_best_classifier_error_rescale_factor(weights, data, labels):
    best_classifier, best_error, best_rescale_factors = None, 0.5, []
    for classifier in pos_classifiers + neg_classifiers:
        error, rescale_factors = compute_error_and_rescale_factors(classifier, weights, data, labels)
        if error < best_error:
            best_classifier, best_error, best_rescale_factors = classifier, error, rescale_factors
    return best_classifier, best_error, best_rescale_factors

In [10]:
def compute_error_and_rescale_factors(classifier, weights, data, labels):
    error, rescale_factors = 0, weights[:]
    for weight, point, label in zip(weights, data, labels):
        if classifier(data) != label:
            error += weight
            rescale_factors.append(label * classifier(data))
    return error, rescale_factors

In [11]:
iterations = [3, 7, 10, 15, 20]
for t in iterations:
    classifier_list = adaboost_train(train_data, train_labels, t)
    print "Error after round", t, "training", error(train_data, train_labels, classifier_list), "test", error(test_data, test_labels, classifier_list)

TypeError: list indices must be integers, not str