In [55]:
import pandas as pd
import numpy as np
import tqdm as tqdm

In [56]:
# Remote locations of the training and test files (fetched on every run).
train_url = "https://www.csie.ntu.edu.tw/~htlin/course/ml21fall/hw6/hw6_train.dat"
test_url = "https://www.csie.ntu.edu.tw/~htlin/course/ml21fall/hw6/hw6_test.dat"

# Both files are parsed as space-separated tables without a header row.
train_df, test_df = (
    pd.read_csv(url, header=None, sep=' ') for url in (train_url, test_url)
)

# Column 10 is split off as the target; every other column is a feature.
x_train, y_train = train_df.drop(columns=[10]).to_numpy(), train_df[10].to_numpy()
x_test, y_test = test_df.drop(columns=[10]).to_numpy(), test_df[10].to_numpy()

In [57]:
# Number of boosting rounds; read as a global by Problem_11_12.
T = 500

## Problem 11 and 12

In [58]:
def _best_stump_on_feature(values, labels, weights, feature_idx):
    """Find the decision stump with the smallest weighted error on one feature.

    A stump ``(s, theta)`` predicts ``s`` where ``value > theta`` and ``-s``
    elsewhere. Candidate thresholds are ``-inf`` and the midpoints between
    consecutive *distinct* sorted feature values.

    Args:
        values: 1-D array with one feature value per sample.
        labels: 1-D array of targets (``+1`` marks the positive class).
        weights: 1-D array of the current sample weights.
        feature_idx: column index stored in the returned tuple.

    Returns:
        ``(s, feature_idx, theta, weighted_error)`` of the best stump.
    """
    order = values.argsort()
    x_sorted = values[order]
    y_sorted = labels[order]
    u_sorted = weights[order]
    n_spl = len(x_sorted)

    # theta = -inf: every sample lies above the threshold, so s=+1 errs on the
    # non-positive labels and s=-1 errs on the positive ones.
    pos_err = 0
    neg_err = 0
    for r in range(n_spl):
        if y_sorted[r] == +1:
            neg_err += u_sorted[r]
        else:
            pos_err += u_sorted[r]

    if pos_err < neg_err:
        best = (+1, feature_idx, -np.inf, pos_err)
    else:
        best = (-1, feature_idx, -np.inf, neg_err)

    # Sweep the threshold upward. Each step moves sample r to the
    # "<= theta" side, which flips the sign predicted for it.
    for r in range(n_spl - 1):
        if y_sorted[r] == +1:
            pos_err += u_sorted[r]
            neg_err -= u_sorted[r]
        else:
            pos_err -= u_sorted[r]
            neg_err += u_sorted[r]

        # Equal neighbours cannot be separated: a threshold at their shared
        # value would also push sample r+1 below it, so the running errors
        # would not describe the stump that is actually applied later.
        if x_sorted[r] == x_sorted[r + 1]:
            continue

        theta = (x_sorted[r] + x_sorted[r + 1]) / 2
        if pos_err < neg_err:
            candidate = (+1, feature_idx, theta, pos_err)
        else:
            candidate = (-1, feature_idx, theta, neg_err)

        if best[3] > candidate[3]:
            best = candidate

    return best


def Problem_11_12(x_train, y_train, n_iter=None, show_progress=True):
    """Run AdaBoost with decision stumps and record every round.

    Args:
        x_train: 2-D array of shape (n_samples, n_features).
        y_train: 1-D array of targets compared against ``+1`` / ``-1``.
        n_iter: number of boosting rounds; ``None`` falls back to the
            notebook-level constant ``T``.
        show_progress: wrap the round loop in a ``tqdm`` progress bar.

    Returns:
        ``(ein, alpha)`` where ``ein[t]`` is the 0/1 training error of the
        stump chosen in round ``t`` and ``alpha[t]`` is
        ``(ln(dt), (s, feature, theta, weighted_error))`` for that stump.
        Both lists are shorter than ``n_iter`` when a stump with zero
        weighted error is found, because boosting stops at that round.

    Raises:
        ValueError: if ``x_train`` has no samples or no features.
    """
    if n_iter is None:
        n_iter = T  # resolved at call time so the default stays tunable

    n_spl, n_fea = x_train.shape
    if n_spl == 0 or n_fea == 0:
        raise ValueError("x_train must contain at least one sample and one feature")

    u = np.full(n_spl, 1 / n_spl)

    rounds = range(n_iter)
    if show_progress:
        rounds = tqdm.tqdm(rounds)

    ein = []
    alpha = []
    for _t in rounds:
        # Best stump over all features under the current weights.
        best = None
        for i in range(n_fea):
            candidate = _best_stump_on_feature(x_train[:, i], y_train, u, i)
            if best is None or best[3] > candidate[3]:
                best = candidate

        # Normalised weighted error and the resulting scaling factor.
        et = best[3] / np.sum(u)
        perfect = et <= 0
        if perfect:
            # A zero error would give dt = inf and wipe out every weight;
            # clamp so that the recorded vote stays finite.
            et = np.finfo(float).eps
        dt = np.sqrt((1 - et) / et)

        s, i, theta, _ = best

        # Scale misclassified samples up and correct ones down.
        wrong = np.where(x_train[:, i] > theta, s, -s) != y_train
        u[wrong] *= dt
        u[~wrong] /= dt

        ein.append(int(np.count_nonzero(wrong)) / n_spl)
        alpha.append((np.log(dt), best))

        if perfect:
            # Nothing is left to reweight once a stump classifies everything.
            break

    return ein, alpha

In [59]:
# Boost on the training set: `ein` holds one 0/1 error per round and `alpha`
# one (vote weight, stump) pair per round.
ein, alpha = Problem_11_12(x_train, y_train)

100%|██████████| 500/500 [00:13<00:00, 36.74it/s]


In [60]:
# Smallest, then largest, per-round stump training error (one value per line).
print(np.min(ein), np.max(ein), sep="\n")

0.374
0.591


## Problem 13

In [61]:
def Problem_13(x_train, y_train, alpha):
    """Track the 0/1 error of the weighted ensemble as stumps are added.

    Args:
        x_train: 2-D array of shape (n_samples, n_features).
        y_train: targets, one per row of ``x_train``.
        alpha: sequence of ``(a, (s, i, theta, _))`` entries, where the stump
            votes ``s`` if feature ``i`` exceeds ``theta`` and ``-s``
            otherwise, with vote weight ``a``.

    Returns:
        A list with one entry per element of ``alpha``: the fraction of
        samples whose accumulated score has the opposite sign of the target
        after that stump has been added. A score of exactly zero is not
        counted as an error.
    """
    n_spl = x_train.shape[0]

    err_list = []
    score = np.zeros(n_spl)
    for a, (s, i, theta, _) in alpha:
        # Weighted vote of this stump for every sample.
        vote = np.where(x_train[:, i] > theta, s, -s).astype(float)
        score += a * vote

        # Strictly negative margin == misclassified.
        n_wrong = int(np.count_nonzero(score * y_train < 0))
        err_list.append(n_wrong / n_spl)

    return err_list

In [62]:
# Training error of the weighted ensemble after each boosting round.
err_list = Problem_13(x_train, y_train, alpha)

In [63]:
# Print the first 0-based position in err_list whose error is at most 0.05
# (the ensemble at that position contains position + 1 stumps).
for round_idx, round_err in enumerate(err_list):
    if round_err <= 0.05:
        print(round_idx)
        break

354


## Problem 14

In [64]:
def Problem_14(x_test, y_test, g1):
    """Compute the 0/1 error of a single decision stump.

    Args:
        x_test: 2-D array of shape (n_samples, n_features).
        y_test: targets, one per row of ``x_test``.
        g1: one ``(a, (s, i, theta, _))`` entry; only the stump parameters
            are used, the vote weight ``a`` is ignored.

    Returns:
        Fraction of samples for which the stump's prediction and the target
        have opposite signs.
    """
    n_spl = x_test.shape[0]
    _a, (s, i, theta, _err) = g1

    # The stump says s above the threshold and -s at or below it.
    pred = np.where(x_test[:, i] > theta, s, -s).astype(float)

    n_wrong = int(np.count_nonzero(pred * y_test < 0))
    return n_wrong / n_spl

In [65]:
# Test error of the stump chosen in the first boosting round (alpha[0]).
Problem_14(x_test, y_test, alpha[0])

0.455

## Problem 15

In [66]:
def Problem_15(x_test, y_test, alpha):
    """Compute the 0/1 error of the unweighted vote over all stumps.

    Args:
        x_test: 2-D array of shape (n_samples, n_features).
        y_test: targets, one per row of ``x_test``.
        alpha: sequence of ``(a, (s, i, theta, _))`` entries; the vote
            weights ``a`` are ignored, so every stump counts equally.

    Returns:
        Fraction of samples whose summed vote has the opposite sign of the
        target. A tied vote (sum of exactly zero) is not counted as an error.
    """
    n_spl = x_test.shape[0]

    tally = np.zeros(n_spl)
    for _a, (s, i, theta, _err) in alpha:
        # Each stump contributes +s above its threshold and -s otherwise.
        tally += np.where(x_test[:, i] > theta, s, -s).astype(float)

    n_wrong = int(np.count_nonzero(tally * y_test < 0))
    return n_wrong / n_spl

In [67]:
# Test error of the unweighted (uniform-vote) ensemble of all stumps.
err = Problem_15(x_test, y_test, alpha)
err

0.205

## Problem 16

In [68]:
def Problem_16(x_test, y_test, alpha):
    """Compute the 0/1 error of the weighted vote over all stumps.

    Args:
        x_test: 2-D array of shape (n_samples, n_features).
        y_test: targets, one per row of ``x_test``.
        alpha: sequence of ``(a, (s, i, theta, _))`` entries; each stump
            votes ``s`` if feature ``i`` exceeds ``theta`` and ``-s``
            otherwise, scaled by its weight ``a``.

    Returns:
        Fraction of samples whose weighted score has the opposite sign of
        the target. A score of exactly zero is not counted as an error.
    """
    n_spl = x_test.shape[0]

    score = np.zeros(n_spl)
    for a, (s, i, theta, _err) in alpha:
        vote = np.where(x_test[:, i] > theta, s, -s).astype(float)
        score += a * vote

    n_wrong = int(np.count_nonzero(score * y_test < 0))
    return n_wrong / n_spl

In [69]:
# Test error of the weighted ensemble built from all stumps.
err = Problem_16(x_test, y_test, alpha)
err

0.188

In [70]:
# Training error of the weighted ensemble per round; show the first 20 rounds.
ein_list = Problem_13(x_train, y_train, alpha)
print(ein_list[:20])

[0.374, 0.374, 0.318, 0.345, 0.304, 0.308, 0.302, 0.297, 0.291, 0.256, 0.254, 0.261, 0.249, 0.24, 0.233, 0.222, 0.225, 0.216, 0.204, 0.213]


In [71]:
# Same per-round ensemble error, evaluated on the test set; first 20 rounds.
eout_list = Problem_13(x_test, y_test, alpha)
print(eout_list[:20])

[0.455, 0.455, 0.393, 0.415, 0.383, 0.355, 0.382, 0.338, 0.344, 0.31, 0.307, 0.319, 0.311, 0.3, 0.283, 0.271, 0.278, 0.272, 0.263, 0.267]
