In [283]:
import pandas as pd
import numpy as np
import itertools
from liblinear import liblinearutil as ll

In [284]:
# Train/test splits are fetched straight from the course site; the files are
# space-separated text without a header row.
train_url = "http://www.csie.ntu.edu.tw/~htlin/course/ml21fall/hw4/hw4_train.dat"
test_url = "http://www.csie.ntu.edu.tw/~htlin/course/ml21fall/hw4/hw4_test.dat"

train = pd.read_csv(train_url, sep=" ", header=None)
test = pd.read_csv(test_url, sep=" ", header=None)

# Column 6 is used as the label; all remaining columns are the input features.
train_x = train.drop(columns=6).to_numpy()
train_y = train[6].to_numpy()
test_x = test.drop(columns=6).to_numpy()
test_y = test[6].to_numpy()

In [285]:
def third_poly(x):
    """Expand one feature vector into all monomials of degree 0 through 3.

    The result starts with the constant term ``1`` and is followed by every
    degree-1, then degree-2, then degree-3 product of the entries of ``x``
    (repetition allowed), in the order produced by
    ``itertools.combinations_with_replacement`` over the index positions.

    Args:
        x: Indexable sequence of numbers with a length.

    Returns:
        A list of monomial values; for an empty ``x`` this is ``[1]``.
    """
    positions = range(len(x))
    features = [1]
    for degree in (1, 2, 3):
        for combo in itertools.combinations_with_replacement(positions, degree):
            # Multiply left to right, starting from 1, one factor per index.
            product = 1
            for pos in combo:
                product *= x[pos]
            features.append(product)
    return features

In [286]:
def transform(all_x, func):
    """Apply ``func`` to every row of ``all_x`` and stack the results.

    Args:
        all_x: Iterable of examples.
        func: Callable mapping one example to its transformed representation.

    Returns:
        A NumPy array built from the list of transformed examples.
    """
    return np.array([func(row) for row in all_x])

In [287]:
# Map both splits through the third-order polynomial feature expansion.
tf_train_x, tf_test_x = (
    transform(split_x, third_poly) for split_x in (train_x, test_x)
)

In [288]:
def liblinear_data_format(x, y):
    """Render one example as a single ``label index:value ...`` text line.

    The label comes first, followed by one ``index:value`` pair per feature
    with 1-based indices. Every pair, including the last one, is followed by a
    single space, and the line is terminated by a newline character.

    Args:
        x: 1-D array of feature values (must expose ``shape``).
        y: Label; written using ``str(y)``.

    Returns:
        The formatted line as a string.
    """
    pairs = [
        str(pos + 1) + ":" + str(x[pos]) + " "
        for pos in range(x.shape[0])
    ]
    return str(y) + " " + "".join(pairs) + "\n"

In [289]:
def write_data_format(x, y, name=None):
    """Write examples to a text file, one formatted line per example.

    Each row ``x[i]`` is paired with ``y[i]`` and rendered by
    ``liblinear_data_format``.

    Args:
        x: Array of examples; ``x.shape[0]`` is the number of rows written.
        y: Indexable sequence of labels aligned with the rows of ``x``.
        name: Output file path. When ``None`` the function does nothing.

    Returns:
        None.
    """
    # Identity check rather than `== None`: equality can be redefined by the
    # argument's type (custom or elementwise __eq__), identity cannot.
    if name is None:
        return

    with open(name, 'w') as file:
        for i in range(x.shape[0]):
            file.write(liblinear_data_format(x[i], y[i]))

## Solving the Problems with LIBLINEAR

### Problem 12

In [290]:
def Problem12(train_y, train_x, test_y, test_x):
    """Train one model per lambda on the training set and score it on the test set.

    For each log10(lambda) in -4, -2, 0, 2, 4 a LIBLINEAR model is trained
    with options ``-s 0 -e 0.000001`` and cost ``c = 1 / (2 * lambda)``.
    ``ll.predict`` prints the accuracy line right after the lambda label.
    Nothing is returned.
    """
    problem = ll.problem(train_y.tolist(), train_x.tolist())
    # The evaluation data does not change between lambdas; convert it once.
    eval_y, eval_x = test_y.tolist(), test_x.tolist()

    for log_lam in (-4, -2, 0, 2, 4):
        print("log10 lambda =", log_lam, end="\t")

        # c = (1 / (2 * lambda))
        c = 1 / (2 * 10**(log_lam))
        model = ll.train(problem, '-s 0 -e 0.000001 -c ' + str(c))
        ll.predict(eval_y, eval_x, model)

# Evaluate on the polynomial-transformed features.
Problem12(train_y=train_y, train_x=tf_train_x, test_y=test_y, test_x=tf_test_x)

log10 lambda = -4	Accuracy = 87.375% (699/800) (classification)
log10 lambda = -2	Accuracy = 87.75% (702/800) (classification)
log10 lambda = 0	Accuracy = 92% (736/800) (classification)
log10 lambda = 2	Accuracy = 92.75% (742/800) (classification)
log10 lambda = 4	Accuracy = 83.5% (668/800) (classification)


### Problem 13

In [291]:
def Problem13(train_y, train_x):
    """Train one model per lambda and score it on the same training data.

    For each log10(lambda) in -4, -2, 0, 2, 4 a LIBLINEAR model is trained
    with options ``-s 0 -e 0.000001`` and cost ``c = 1 / (2 * lambda)``, then
    evaluated on the data it was trained on. ``ll.predict`` prints the
    accuracy line; nothing is returned.
    """
    labels, examples = train_y.tolist(), train_x.tolist()
    problem = ll.problem(labels, examples)

    for log_lam in (-4, -2, 0, 2, 4):
        print("log10 lambda =", log_lam, end="\t")

        # c = (1 / (2 * lambda))
        c = 1 / (2 * 10**(log_lam))
        model = ll.train(problem, '-s 0 -e 0.000001 -c ' + str(c))
        ll.predict(labels, examples, model)

# In-sample accuracy on the polynomial-transformed training features.
Problem13(train_y=train_y, train_x=tf_train_x)

log10 lambda = -4	Accuracy = 100% (200/200) (classification)
log10 lambda = -2	Accuracy = 100% (200/200) (classification)
log10 lambda = 0	Accuracy = 97.5% (195/200) (classification)
log10 lambda = 2	Accuracy = 95% (190/200) (classification)
log10 lambda = 4	Accuracy = 82% (164/200) (classification)


### Problem 14

In [292]:
def Problem14(train_y, train_x, test_y, test_x):
    """Pick lambda on a held-out validation split and report test accuracy.

    Rows 0..119 of the training data are used for fitting and rows 120..199
    for validation. One model is trained per log10(lambda) in
    -4, -2, 0, 2, 4 with cost ``c = 1 / (2 * lambda)``; the model with the
    highest validation accuracy is then evaluated on the test set.

    Args:
        train_y: Training labels (NumPy array).
        train_x: Training features (NumPy array), row-aligned with ``train_y``.
        test_y: Test labels (NumPy array).
        test_x: Test features (NumPy array).

    Returns:
        The selected log10(lambda). On ties the value tried first (the
        smaller lambda) is kept.
    """
    prob_train = ll.problem(train_y[0:120].tolist(), train_x[0:120].tolist())
    valid_y = train_y[120:200].tolist()
    valid_x = train_x[120:200].tolist()

    best_model = None
    best_acc = None
    best_lam = None

    # c = (1 / (2 * lambda))
    for log_lam in [-4, -2, 0, 2, 4]:
        print("log10 lambda =", log_lam, end="\t")

        c = 1 / (2 * 10**(log_lam))
        m = ll.train(prob_train, '-s 0 -e 0.000001 -c ' + str(c))

        print("+ Validation:", end="\t")
        _, acc, _ = ll.predict(valid_y, valid_x, m)

        # Always accept the first candidate so a model is selected even when
        # every validation accuracy is 0; afterwards only a strictly better
        # accuracy replaces the incumbent.
        if best_model is None or acc[0] > best_acc:
            best_model = m
            best_acc = acc[0]
            best_lam = log_lam

    print("\nbest lam: ", best_lam)
    print("+ Test:", end="\t")
    ll.predict(test_y.tolist(), test_x.tolist(), best_model)

    return best_lam

# Keep the selected log10(lambda) for reuse in later cells.
best_lam = Problem14(
    train_y=train_y, train_x=tf_train_x, test_y=test_y, test_x=tf_test_x
)

log10 lambda = -4	+ Validation:	Accuracy = 77.5% (62/80) (classification)
log10 lambda = -2	+ Validation:	Accuracy = 80% (64/80) (classification)
log10 lambda = 0	+ Validation:	Accuracy = 88.75% (71/80) (classification)
log10 lambda = 2	+ Validation:	Accuracy = 93.75% (75/80) (classification)
log10 lambda = 4	+ Validation:	Accuracy = 85% (68/80) (classification)

best lam:  2
+ Test:	Accuracy = 92.5% (740/800) (classification)


In [293]:
# 740 correct out of 800 is copied by hand from the test accuracy printed above.
print("0/1 Eout = ", 1 - 740 / 800)

0/1 Eout =  0.07499999999999996


### Problem 15

In [294]:
def Problem15(train_y, train_x, test_y, test_x, log_lam):
    """Train on the full training set with a given lambda and score on the test set.

    Args:
        train_y: Training labels (NumPy array).
        train_x: Training features (NumPy array).
        test_y: Test labels (NumPy array).
        test_x: Test features (NumPy array).
        log_lam: log10 of the regularization strength lambda; the LIBLINEAR
            cost is ``c = 1 / (2 * 10**log_lam)``.

    ``ll.predict`` prints the accuracy line; nothing is returned.
    """
    problem = ll.problem(train_y.tolist(), train_x.tolist())

    print("log10 lambda =", log_lam, end="\t")

    c = 1 / (2 * 10**(log_lam))
    options = '-s 0 -e 0.000001 -c ' + str(c)
    model = ll.train(problem, options)
    ll.predict(test_y.tolist(), test_x.tolist(), model)

# Retrain on all training rows with the lambda chosen by validation.
Problem15(
    train_y=train_y,
    train_x=tf_train_x,
    test_y=test_y,
    test_x=tf_test_x,
    log_lam=best_lam,
)

log10 lambda = 2	Accuracy = 92.75% (742/800) (classification)


In [295]:
# 742 correct out of 800 is copied by hand from the test accuracy printed above.
print("0/1 Eout = ", 1 - 742 / 800)

0/1 Eout =  0.07250000000000001


### Problem 16

In [296]:
def Problem16(train_y, train_x, test_y, test_x):
    """Run 5-fold cross-validation over lambda on the first 200 training rows.

    The rows are cut into five consecutive folds of 40. For each
    log10(lambda) in -4, -2, 0, 2, 4 a model is trained on four folds and
    scored on the remaining one; the five accuracies are averaged. The
    smallest cross-validation error, ``1 - best_mean_accuracy / 100``, is
    printed at the end.

    ``test_y`` and ``test_x`` are accepted but not used.

    Returns:
        Dict mapping each log10(lambda) to its mean validation accuracy
        (in percent, as reported by ``ll.predict``).
    """
    fold_size = 40

    # Build every train/validation split once; they are reused for each lambda.
    folds = []
    for k in range(5):
        lo = k * fold_size
        hi = lo + fold_size

        fit_y = train_y[:lo].tolist() + train_y[hi:200].tolist()
        fit_x = train_x[:lo].tolist() + train_x[hi:200].tolist()
        fit_problem = ll.problem(fit_y, fit_x)

        folds.append(
            (fit_problem, train_y[lo:hi].tolist(), train_x[lo:hi].tolist())
        )

    all_acc_dict = {}
    for log_lam in (-4, -2, 0, 2, 4):
        print("log10 lambda =", log_lam, end="\n")
        # c = (1 / (2 * lambda))
        c = 1 / (2 * 10**(log_lam))

        fold_acc = []
        for fit_problem, held_y, held_x in folds:
            model = ll.train(fit_problem, '-s 0 -e 0.000001 -c ' + str(c))

            print("+ Validation:", end="\t")
            _, acc, _ = ll.predict(held_y, held_x, model)
            fold_acc.append(acc[0])

        all_acc_dict[log_lam] = np.average(fold_acc)

    print("Ecv = ", 1 - max(all_acc_dict.values()) / 100)

    return all_acc_dict

# Mean validation accuracy per log10(lambda), keyed by log10(lambda).
acc_list = Problem16(
    train_y=train_y, train_x=tf_train_x, test_y=test_y, test_x=tf_test_x
)

log10 lambda = -4
+ Validation:	Accuracy = 82.5% (33/40) (classification)
+ Validation:	Accuracy = 82.5% (33/40) (classification)
+ Validation:	Accuracy = 70% (28/40) (classification)
+ Validation:	Accuracy = 82.5% (33/40) (classification)
+ Validation:	Accuracy = 87.5% (35/40) (classification)
log10 lambda = -2
+ Validation:	Accuracy = 82.5% (33/40) (classification)
+ Validation:	Accuracy = 82.5% (33/40) (classification)
+ Validation:	Accuracy = 75% (30/40) (classification)
+ Validation:	Accuracy = 85% (34/40) (classification)
+ Validation:	Accuracy = 90% (36/40) (classification)
log10 lambda = 0
+ Validation:	Accuracy = 85% (34/40) (classification)
+ Validation:	Accuracy = 87.5% (35/40) (classification)
+ Validation:	Accuracy = 85% (34/40) (classification)
+ Validation:	Accuracy = 92.5% (37/40) (classification)
+ Validation:	Accuracy = 92.5% (37/40) (classification)
log10 lambda = 2
+ Validation:	Accuracy = 90% (36/40) (classification)
+ Validation:	Accuracy = 90% (36/40) (classifica