In [1]:
import numpy as np
import pandas as pd
import copy
import tqdm

In [2]:
train_url = "https://www.csie.ntu.edu.tw/~htlin/course/ml21fall/hw3/hw3_train.dat"
test_url = "https://www.csie.ntu.edu.tw/~htlin/course/ml21fall/hw3/hw3_test.dat"
# Both files are parsed the same way: tab-delimited, with no header row.
df_train, df_test = [
    pd.read_csv(url, sep='\t', header=None) for url in (train_url, test_url)
]

In [3]:
# Column 10 is split off as the target; the remaining columns become the
# feature array.  The tuple assignment evaluates both right-hand sides while
# df_train / df_test are still DataFrames, before the names are rebound.
# NOTE: because the names are rebound to ndarrays, this cell cannot be re-run
# without re-running the loading cell first.
df_train_target, df_train = df_train[10].to_numpy(), df_train.drop(columns=10).to_numpy()
df_test_target, df_test = df_test[10].to_numpy(), df_test.drop(columns=10).to_numpy()

In [4]:
def hete_transform(x:np.ndarray, Q=2):
    """Expand `x` into (1, x, x**2, ..., x**Q) with element-wise powers.

    Args:
        x: 1-D feature vector.
        Q: highest power to include; for Q <= 1 only (1, x) is returned.

    Returns:
        1-D array holding the constant 1.0, then `x`, then each element-wise
        power of `x` from 2 up to `Q`, in that order.
    """
    features = [1.0, *x.tolist()]
    # Running element-wise power; a copy so the caller's array is not modified.
    power = np.copy(x)
    for _ in range(Q - 1):
        power *= x
        features.extend(power)
    return np.array(features)

def df_transform(input:np.ndarray, transform, Q=2):
    """Apply `transform(row, Q)` to every row of `input` and stack the results.

    Args:
        input: iterable of rows (e.g. a 2-D array).
        transform: callable taking `(row, Q)` and returning the transformed row.
        Q: second argument forwarded unchanged to `transform`.

    Returns:
        Array built from the transformed rows, in the original row order.
    """
    return np.array([transform(row, Q) for row in input])

In [5]:
def linear_regression(features, target):
    """Least-squares weights computed as pinv(features) @ target.

    Args:
        features: design matrix with one row per example (ndarray or np.matrix).
        target: label vector / column with one entry per example.

    Returns:
        Weight vector; its container type follows the inputs (np.matrix inputs
        yield an np.matrix column, ndarray inputs yield an ndarray).
    """
    # `@` is always a matrix product.  The previous `*` was a matrix product
    # only for np.matrix operands and silently broadcast element-wise for
    # plain ndarrays, returning a wrong-shaped result.
    return np.linalg.pinv(features) @ target

In [6]:
def calculate_err(w, x, y, N):
    """Zero-one error of the linear classifier `w` on the first `N` examples.

    An example counts as an error only when `score * label` is strictly
    negative, where `score` is the inner product of `w` with the example row.

    Args:
        w: weight vector (1-D array, column array or np.matrix column).
        x: 2-D array / np.matrix with one example per row.
        y: labels, one per example (1-D array, column array or np.matrix column).
        N: number of leading examples to score.

    Returns:
        Fraction of the first `N` examples that are misclassified.

    Raises:
        IndexError: if `N` exceeds the number of rows in `x`.
    """
    # Convert to plain ndarrays so the arithmetic below does not depend on
    # np.matrix operator semantics (the old `*` only worked for np.matrix).
    x_all = np.asarray(x, dtype=float)
    if N > x_all.shape[0]:
        raise IndexError("N exceeds the number of examples in x")
    w_vec = np.asarray(w, dtype=float).reshape(-1)
    y_vec = np.asarray(y, dtype=float).reshape(-1)[:N]

    # One matrix-vector product replaces the per-row Python loop.
    scores = x_all[:N] @ w_vec
    return np.mean(scores * y_vec < 0)

In [7]:
def problem_12(x_train, y_train, x_test, y_test):
    """Print |train error - test error| for linear regression with Q=2.

    Both feature sets are expanded with `hete_transform` at Q=2, the weights
    are fitted on the training split with `linear_regression`, and the
    zero-one errors on both splits come from `calculate_err`.

    Args:
        x_train, x_test: 2-D arrays of raw features, one example per row.
        y_train, y_test: 1-D arrays of labels.
    """
    # np.asmatrix replaces the np.mat alias, which NumPy 2.0 removed; the
    # resulting objects are the same np.matrix values as before.
    z_train = np.asmatrix(df_transform(x_train, hete_transform, 2))
    y_train = np.asmatrix(y_train).transpose()  # labels as an (N, 1) column

    z_test = np.asmatrix(df_transform(x_test, hete_transform, 2))
    y_test = np.asmatrix(y_test).transpose()

    w_lin = linear_regression(z_train, y_train)

    zo_err_train = calculate_err(w_lin, z_train, y_train, z_train.shape[0])
    zo_err_test = calculate_err(w_lin, z_test, y_test, z_test.shape[0])

    print("Problem 12: ", np.abs(zo_err_train - zo_err_test))

# Run Problem 12 on the feature/label arrays prepared from the downloaded files.
problem_12(df_train, df_train_target, df_test, df_test_target)


Problem 12:  0.3263333333333333


In [8]:
def problem_13(x_train, y_train, x_test, y_test):
    """Print |train error - test error| for linear regression with Q=8.

    Same pipeline as Problem 12 but with `hete_transform` at Q=8: expand the
    features, fit on the training split with `linear_regression`, and compare
    the zero-one errors returned by `calculate_err` on both splits.

    Args:
        x_train, x_test: 2-D arrays of raw features, one example per row.
        y_train, y_test: 1-D arrays of labels.
    """
    # np.asmatrix replaces the np.mat alias, which NumPy 2.0 removed; the
    # resulting objects are the same np.matrix values as before.
    z_train = np.asmatrix(df_transform(x_train, hete_transform, 8))
    y_train = np.asmatrix(y_train).transpose()  # labels as an (N, 1) column

    z_test = np.asmatrix(df_transform(x_test, hete_transform, 8))
    y_test = np.asmatrix(y_test).transpose()

    w_lin = linear_regression(z_train, y_train)

    zo_err_train = calculate_err(w_lin, z_train, y_train, z_train.shape[0])
    zo_err_test = calculate_err(w_lin, z_test, y_test, z_test.shape[0])

    print("Problem 13: ", np.abs(zo_err_train - zo_err_test))

# Run Problem 13 on the feature/label arrays prepared from the downloaded files.
problem_13(df_train, df_train_target, df_test, df_test_target)



Problem 13:  0.4576666666666667


In [9]:
def full_order_2(x:np.ndarray, Q=None):
    """Full second-order polynomial expansion of a 1-D feature vector.

    Args:
        x: 1-D feature vector.
        Q: unused; accepted so the signature matches the other transforms
           passed to `df_transform`.

    Returns:
        1-D array laid out as: the constant 1.0, the entries of `x`, every
        product x[i] * x[j] with i < j (i-major order), then the squares of
        the entries of `x`.
    """
    dim = x.shape[0]
    cross_terms = [x[a] * x[b] for a in range(dim) for b in range(a + 1, dim)]
    squares = (x ** 2).tolist()
    return np.array([1.0] + x.tolist() + cross_terms + squares)

In [10]:
def problem_14(x_train, y_train, x_test, y_test):
    """Print |train error - test error| using the full second-order transform.

    Features are expanded with `full_order_2`, the weights are fitted on the
    training split with `linear_regression`, and the zero-one errors on both
    splits come from `calculate_err`.

    Args:
        x_train, x_test: 2-D arrays of raw features, one example per row.
        y_train, y_test: 1-D arrays of labels.
    """
    # np.asmatrix replaces the np.mat alias, which NumPy 2.0 removed; the
    # resulting objects are the same np.matrix values as before.
    z_train = np.asmatrix(df_transform(x_train, full_order_2, 2))
    y_train = np.asmatrix(y_train).transpose()  # labels as an (N, 1) column

    z_test = np.asmatrix(df_transform(x_test, full_order_2, 2))
    y_test = np.asmatrix(y_test).transpose()

    w_lin = linear_regression(z_train, y_train)

    zo_err_train = calculate_err(w_lin, z_train, y_train, z_train.shape[0])
    zo_err_test = calculate_err(w_lin, z_test, y_test, z_test.shape[0])

    print("Problem 14: ", np.abs(zo_err_train - zo_err_test))

# Run Problem 14 on the feature/label arrays prepared from the downloaded files.
problem_14(df_train, df_train_target, df_test, df_test_target)



Problem 14:  0.33866666666666667


In [11]:
def shrink(x:np.ndarray, Q=1):
    """Keep only the first `Q` entries of `x`, prefixed with the constant 1.0.

    Args:
        x: 1-D feature vector.
        Q: number of leading entries of `x` to keep.

    Returns:
        1-D array of the constant 1.0 followed by `x[:Q]`.
    """
    return np.array([1.0] + x[:Q].tolist())

In [12]:
def problem_15(x_train, y_train, x_test, y_test):
    """Print which leading-feature count gives the smallest |train - test| error gap.

    For every i from 1 to the number of raw features, only the first i
    features are kept (`shrink`), linear regression is fitted on the training
    split, and the absolute gap between the train and test zero-one errors is
    recorded.  The 1-based best i, its gap, and the full list of gaps are
    printed.

    Args:
        x_train, x_test: 2-D arrays of raw features, one example per row.
        y_train, y_test: 1-D arrays of labels.
    """
    diff = []
    # np.asmatrix replaces the np.mat alias, which NumPy 2.0 removed; the
    # resulting objects are the same np.matrix values as before.
    y_train = np.asmatrix(y_train).transpose()  # labels as an (N, 1) column
    y_test = np.asmatrix(y_test).transpose()

    for i in tqdm.tqdm(range(1, x_train[0].shape[0] + 1)):
        z_train = np.asmatrix(df_transform(x_train, shrink, i))
        z_test = np.asmatrix(df_transform(x_test, shrink, i))

        w_lin = linear_regression(z_train, y_train)

        zo_err_train = calculate_err(w_lin, z_train, y_train, z_train.shape[0])
        zo_err_test = calculate_err(w_lin, z_test, y_test, z_test.shape[0])

        diff.append(np.abs(zo_err_train - zo_err_test))

    # diff[k] belongs to i = k + 1, hence the +1 / -1 index shifts.
    idx = np.argmin(diff) + 1
    print("Problem 15: ", (idx, diff[idx - 1]))
    print(diff)

# Run Problem 15 on the feature/label arrays prepared from the downloaded files.
problem_15(df_train, df_train_target, df_test, df_test_target)

100%|██████████| 10/10 [00:01<00:00,  6.54it/s]

Problem 15:  (3, 0.1323333333333333)
[0.13666666666666666, 0.13433333333333336, 0.1323333333333333, 0.1443333333333333, 0.2523333333333333, 0.3223333333333333, 0.26466666666666666, 0.2653333333333333, 0.2483333333333333, 0.3226666666666666]





In [13]:
def sample_5(x:np.ndarray, Q:np.array):
    """Select the entries of `x` at the indices in `Q`, prefixed with 1.0.

    Args:
        x: 1-D feature vector.
        Q: iterable of indices into `x`; the output follows its order.

    Returns:
        1-D array of the constant 1.0 followed by `x[idx]` for each idx in `Q`.
    """
    return np.array([1.0] + [x[idx] for idx in Q])

In [14]:
def problem_16(x_train, y_train, x_test, y_test):
    """Print the mean |train - test| error gap over 200 random 5-feature subsets.

    For each seed 0..199, five distinct feature indices are drawn with
    `np.random.RandomState(seed)`, the data are reduced to those features
    (`sample_5`), linear regression is fitted on the training split, and the
    absolute gap between the train and test zero-one errors is recorded.

    Args:
        x_train, x_test: 2-D arrays of raw features, one example per row.
        y_train, y_test: 1-D arrays of labels.
    """
    diff = []
    # np.asmatrix replaces the np.mat alias, which NumPy 2.0 removed; the
    # resulting objects are the same np.matrix values as before.
    y_train = np.asmatrix(y_train).transpose()  # labels as an (N, 1) column
    y_test = np.asmatrix(y_test).transpose()

    # Candidate indices come from the data width instead of a hard-coded 10.
    feature_ids = np.arange(x_train.shape[1])

    for i in tqdm.tqdm(range(200)):
        # A fresh generator seeded by the iteration index makes each draw reproducible.
        rng = np.random.RandomState(i)
        sample = rng.choice(feature_ids, 5, replace=False)
        z_train = np.asmatrix(df_transform(x_train, sample_5, sample))
        z_test = np.asmatrix(df_transform(x_test, sample_5, sample))

        w_lin = linear_regression(z_train, y_train)

        zo_err_train = calculate_err(w_lin, z_train, y_train, z_train.shape[0])
        zo_err_test = calculate_err(w_lin, z_test, y_test, z_test.shape[0])

        diff.append(np.abs(zo_err_train - zo_err_test))

    print("Problem 16: ", np.average(diff))

# Run Problem 16 on the feature/label arrays prepared from the downloaded files.
problem_16(df_train, df_train_target, df_test, df_test_target)



100%|██████████| 200/200 [00:30<00:00,  6.53it/s]

Problem 16:  0.21406



