In [None]:
import numpy as np
from IPython.display import clear_output
from google.colab import drive
# Mount Google Drive at /content/drive (Colab-only; a later cell moves the
# saved .npy files into ./drive/MyDrive/).
drive.mount('/content/drive')

In [2]:
def get_data(filename):
    """Load a whitespace-separated numeric data file.

    Every non-blank line holds the feature values followed by the label as
    its last field. Fields may be separated by any run of whitespace.

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple ``(x, y, n)``: ``x`` is a list of feature rows, each prefixed
        with a constant ``1``; ``y`` is the list of labels; ``n`` is the
        number of examples read.

    Raises:
        OSError: If the file cannot be opened.
        ValueError, SyntaxError: If a field is not a valid Python literal.
    """
    # Local import: literal_eval parses numeric literals exactly like eval()
    # did (ints stay ints, floats stay floats) but cannot execute arbitrary
    # expressions that might appear in the data file.
    from ast import literal_eval

    x, y = [], []
    # The context manager closes the file even if parsing a line fails.
    with open(filename, 'r') as file:
        for line in file:
            fields = line.split()
            if not fields:
                # Skip blank lines (e.g. a trailing newline at end of file).
                continue
            y.append(literal_eval(fields.pop()))
            x.append([1] + [literal_eval(field) for field in fields])
    return x, y, len(y)

def h(x, g):
    """Evaluate a decision stump on a single example.

    Args:
        x: Indexable feature vector.
        g: Stump parameters ``(s, i, theta)``: direction, feature index and
            threshold.

    Returns:
        ``s * sign(x[i] - theta)``. Note that this is 0 (neither class) when
        ``x[i]`` is exactly equal to ``theta``.
    """
    direction, feature, threshold = g
    margin = x[feature] - threshold
    return direction * np.sign(margin)

def Eg(data, g, weighted):
    """Error of a single decision stump over a data set.

    Args:
        data: Sequence of examples, each indexable as ``[x, y, u]``
            (features, label, weight). ``u`` is only read when ``weighted``
            is true.
        g: Stump parameters, passed through to ``h``.
        weighted: If true, return the sum of the weights of the misclassified
            examples; otherwise return the fraction of misclassified examples.

    Returns:
        The weighted or unweighted error; 0 for an empty ``data``.
    """
    if weighted:
        # Plain left-to-right accumulation: the stump search compares these
        # sums with ``<``, so the summation order is kept deterministic.
        err = 0
        for example in data:
            if example[1] != h(example[0], g):
                err += example[2]
        return err

    # Count first and divide once. Repeatedly adding 1 / N accumulates
    # rounding error (e.g. 0.3740000000000003 instead of 0.374).
    N = len(data)
    misses = sum(1 for example in data if example[1] != h(example[0], g))
    return misses / N if N else 0

def G(x, gs, factors, T, weighted):
    """Aggregate prediction of the first ``T`` stumps on one example.

    Args:
        x: Feature vector passed to ``h``.
        gs: Sequence of stump parameter triples.
        factors: Per-stump vote weights; only read when ``weighted`` is true.
        T: Number of leading stumps from ``gs`` to combine.
        weighted: If true each vote is scaled by its factor, otherwise every
            stump votes with weight 1.

    Returns:
        ``np.sign`` of the accumulated vote (0 when the votes cancel or
        ``T`` is 0).
    """
    if weighted:
        votes = (factors[t] * h(x, gs[t]) for t in range(T))
    else:
        votes = (h(x, gs[t]) for t in range(T))

    # Accumulate left to right, starting from integer 0.
    score = 0
    for vote in votes:
        score += vote
    return np.sign(score)

def EG(data, gs, factors, T, weighted):
    """Fraction of examples misclassified by the aggregate classifier ``G``.

    Args:
        data: Sequence of examples, each indexable as ``[x, y, ...]``.
        gs, factors, T, weighted: Forwarded unchanged to ``G``.

    Returns:
        Number of examples whose label differs from ``G``'s prediction,
        divided by ``len(data)``.
    """
    wrong = sum(
        1
        for example in data
        if example[1] != G(example[0], gs, factors, T, weighted)
    )
    return wrong / len(data)

def A(data):
    """Exhaustive decision-stump search minimising the weighted error.

    For every feature index the examples are sorted by that feature, and the
    candidate thresholds are ``-inf`` plus the midpoints between consecutive
    sorted values. Each candidate is tried with both directions ``-1`` and
    ``+1`` and scored with ``Eg(..., weighted=True)``.

    Note: ``data`` is sorted IN PLACE, so the caller's list is left ordered
    by the last feature index.

    Args:
        data: Non-empty list of examples, each indexable as ``[x, y, u]``.

    Returns:
        ``[s, i, theta]`` of the first candidate with the strictly smallest
        weighted error below 1, or ``[0, 0, 0]`` if no candidate scores
        below 1.
    """
    dim = len(data[0][0])
    best_err, best = 1, [0, 0, 0]
    for i in range(dim):
        data.sort(key=lambda example: example[0][i])
        # Thresholds depend only on the sorted column, so build them once and
        # reuse them for both directions.
        column = [example[0][i] for example in data]
        thresholds = [-np.inf] + [
            (lo + hi) / 2 for lo, hi in zip(column, column[1:])
        ]
        for s in (-1, 1):
            for theta in thresholds:
                candidate = [s, i, theta]
                err = Eg(data, candidate, True)
                # Strict comparison keeps the earliest candidate on ties.
                if err < best_err:
                    best_err, best = err, candidate
    return best

def update(data, g):
    """Re-weight the examples after one boosting round.

    Computes the weighted error rate ``e`` of stump ``g``, derives the
    scaling factor ``sqrt((1 - e) / e)``, multiplies the weight of every
    misclassified example by it and divides the weight of every correctly
    classified example by it.

    Args:
        data: Non-empty list of examples, each indexable as ``[x, y, u]``,
            with non-zero total weight. Every element is replaced by a new
            ``list`` copy carrying the updated weight (the list object
            itself is modified and returned).
        g: Stump parameters, passed through to ``h``.

    Returns:
        ``(data, alpha)`` where ``alpha = log(factor)`` is the stump's vote
        weight.
    """
    # Evaluate the stump once per example and reuse the outcome in both passes.
    missed = [example[1] != h(example[0], g) for example in data]

    total, err_total = 0, 0
    for example, miss in zip(data, missed):
        total += example[2]
        if miss:
            err_total += example[2]
    e = err_total / total

    # A stump that is perfectly right (e == 0) or perfectly wrong (e == 1)
    # would give an infinite or zero factor and wipe out every weight.
    # Clamping keeps the factor finite; ordinary error rates are unaffected.
    eps = np.finfo(float).eps
    e = min(max(e, eps), 1 - eps)
    factor = ((1 - e) / e) ** 0.5

    for n, miss in enumerate(missed):
        example = list(data[n])
        if miss:
            example[2] *= factor
        else:
            example[2] /= factor
        data[n] = example
    return data, np.log(factor)

def adaboost(data, T):
    """Run ``T`` rounds of boosting with decision stumps.

    Each round picks a stump with ``A`` on the current weighted examples and
    then re-weights a shallow copy of them with ``update``.

    Note: ``A`` sorts the list it receives in place, so in the first round
    the caller's ``data`` list is reordered.

    Args:
        data: List of examples, each indexable as ``[x, y, u]``.
        T: Number of boosting rounds.

    Returns:
        ``(gs, factors)``: the chosen stumps and their vote weights, in round
        order.
    """
    stumps, alphas = [], []
    current = data
    for _ in range(T):
        stump = A(current)
        current, alpha = update(list(current), stump)
        stumps += [stump]
        alphas += [alpha]
    return stumps, alphas

# Training set: feature rows (with leading constant 1), labels and uniform
# initial weights 1/N, zipped into (x, y, u) examples.
train_x, train_y, train_N = get_data("hw6_train.dat")
train_u = np.full(train_N, 1 / train_N)
train_data = list(zip(train_x, train_y, train_u))

# Test set, built the same way.
test_x, test_y, test_N = get_data("hw6_test.dat")
test_u = np.full(test_N, 1 / test_N)
test_data = list(zip(test_x, test_y, test_u))

# Number of boosting rounds.
T = 500
# Note: A() sorts the list it is handed in place, so train_data's order
# changes during training (its tuples and weights are left untouched).
gs, factors = adaboost(train_data, T)

491 492 493 494 495 496 497 498 499 

In [3]:
# Persist the learned stumps and their vote weights; np.save writes them as
# gs.npy and factors.npy in the current directory.
np.save('gs', gs)
np.save('factors', factors)
# To reuse a previous run instead of retraining, load the arrays back:
# gs = np.load('gs.npy')
# factors = np.load('factors.npy')
# Clear this cell's displayed output.
clear_output()
# Move the saved arrays into Google Drive.
# NOTE(review): the relative path assumes the working directory is /content,
# where the drive was mounted — confirm.
!mv ./gs.npy ./drive/MyDrive/
!mv ./factors.npy ./drive/MyDrive/

In [4]:
# Q11: unweighted error of the first stump gs[0] on the training data.

print(Eg(train_data, gs[0], False))

0.3740000000000003


In [5]:
# Q12: largest unweighted training error among the first T individual stumps.

maxE = 0
for t in range(T):
    err = Eg(train_data, gs[t], False)
    # max() keeps the current value on ties, like a strict "<" update.
    maxE = max(maxE, err)

print(maxE)

0.5910000000000004


In [6]:
# Q13: smallest number of stumps t (1 <= t <= T) whose weighted vote reaches
# a training error of at most 0.05.
#
# The loop runs over 1..T inclusive so that the full T-stump ensemble is
# checked and the meaningless empty ensemble (t = 0) is not. tau stays None
# when no prefix qualifies, so a "not found" result cannot be mistaken for a
# real answer.

tau = None
for t in range(1, T + 1):
    err = EG(train_data, gs, factors, t, True)
    if err <= 0.05:
        tau = t
        break

print(tau)

355


In [7]:
# Q14: unweighted error of the first stump gs[0] on the test data.

print(Eg(test_data, gs[0], False))

0.45500000000000035


In [8]:
# Q15: test error of the uniform (unweighted) vote over the first T stumps.

print(EG(test_data, gs, factors, T, False))

0.229


In [9]:
# Q16: test error of the vote over the first T stumps, each weighted by its
# entry in factors.

print(EG(test_data, gs, factors, T, True))

0.188
