In [74]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
import os

In [87]:
# 10% test, 90% training
def cross_val_binary(X, y, batch_size):
    error_arr = []
    subset_num = int(len(X)/batch_size)-1
    for i in range(subset_num):
        error = 0
        X_test = X[i*batch_size: (i+1)*batch_size]
        y_test = y[i*batch_size: (i+1)*batch_size]
        X_train = np.concatenate((X[0: i*batch_size], X[(i+1)*batch_size: len(X)]))
        y_train = np.concatenate((y[0: i*batch_size], y[(i+1)*batch_size: len(y)]))
        
        # Revised part, PCA fitting only applied to training set
        pca = PCA()
        X_train =  pca.fit_transform(X_train)
        X_test = pca.transform(X_test)
    
        w = least_squares_train(X_train, y_train)
        for i in range(len(X_test)):
            result = least_squares_predict(X_test[i], w, "binary")
            if result != y_test[i]:
                error = error + 1
        error_rate = error/batch_size
        error_arr.append(error_rate)
    
#     print ("Error rate of each iteration: " + str(error_arr))
    print ("Average error rate:" + str(np.average(error_arr)))
    
    return np.average(error_arr)

In [88]:
# Classify 10 different digit classes
def cross_val_multi(X, y, batch_size):
    error_arr = []
    subset_num = int(len(X) / batch_size) - 1
    
    for i in range(subset_num):
        print("batch: " + str(i))
        error = 0
        X_test = X[i * batch_size: (i + 1) * batch_size]
        y_test = y[i * batch_size: (i + 1) * batch_size]
        X_train = np.concatenate((X[0: i * batch_size], X[(i + 1) * batch_size: len(X)]))
        label_train = np.concatenate((y[0: i * batch_size], y[(i + 1) * batch_size: len(y)]))
        
        weight_list = []
        for m in range(10):
            print('building predictor for ' + str(m))
            y_train = []
            for j in range(len(label_train)):
                if label_train[j] == i:
                    y_train.append([1])
                else:
                    y_train.append([-1])
                    
            pca = PCA()
            X_train =  pca.fit_transform(X_train)
            X_test = pca.transform(X_test)

            print('running least squares')
            
            w = least_squares_train(X_train, y_train)
            weight_list.append(w)

        print('predicting')
        error = 0
        for j in range(len(X_test)):
            row_score = []
            for k in range(len(weight_list)):
                result = least_squares_predict(X_test[j], weight_list[k], "multiclass")
                dist = abs(result-1)
                row_score.append(dist)
                
            prediction = np.argmin(row_score)
            
            if prediction != y_test[j]:
                error = error + 1
        
        error_rate = error / batch_size
        error_arr.append(error_rate)
        print('error_rate: '+str(error_rate))
        print('***************************')

#     print("Error rate of each iteration: " + str(error_arr))
    print("Average error rate:" + str(np.average(error_arr)))
    return np.average(error_arr)

In [89]:
def least_squares_train(X, y):
    w = np.linalg.inv((X.transpose() @ X)) @ (X.transpose() @ y)
    return w

In [90]:
def least_squares_predict(X, w, classifier):
    raw_val = X.transpose() @ w
    if classifier == "binary":
        if raw_val >= 0:
            return 1
        if raw_val < 0:
            return -1
    if classifier == "multiclass":
        return raw_val

In [91]:
# Binary tasks
PATH = "./data_binary"
file_list = os.listdir(PATH)
file_list.sort()
task = []
result = []

for file in file_list:
    data = np.load(os.path.join(PATH, file))
    X = data['x']
    y = data['y']
    print(file)
    avg_err = cross_val_binary(X, y, int(len(X)/10))
    task.append(file.split(".")[0])
    result.append(avg_err)

least_squares_binary_result = {"task": task,"avg_err": result}
df = pd.DataFrame(least_squares_binary_result, columns = ["task","avg_err"])

df.to_csv("./result/least_squares_binary.csv", index=False)

all_0_vs_rest.npz
Average error rate:0.24915824915824913
all_1_vs_rest.npz
Average error rate:0.234006734006734
all_2_vs_rest.npz
Average error rate:0.19191919191919193
all_3_vs_rest.npz
Average error rate:0.29461279461279455
all_4_vs_rest.npz
Average error rate:0.2601010101010101
all_5_vs_rest.npz
Average error rate:0.2617845117845118
all_6_vs_rest.npz
Average error rate:0.2617845117845118
all_7_vs_rest.npz
Average error rate:0.28787878787878785
all_8_vs_rest.npz
Average error rate:0.28367003367003363
all_9_vs_rest.npz
Average error rate:0.2095959595959596
all_all_vs_rest.npz
Average error rate:0.43928738865447725
frontal_0_vs_rest.npz
Average error rate:0.28114478114478114
frontal_1_vs_rest.npz
Average error rate:0.20033670033670034
frontal_2_vs_rest.npz
Average error rate:0.18265993265993266
frontal_3_vs_rest.npz
Average error rate:0.2525252525252525
frontal_4_vs_rest.npz
Average error rate:0.20538720538720537
frontal_5_vs_rest.npz
Average error rate:0.24494949494949494
frontal_6_vs

In [92]:
# Multiclass tasks
task = "classify 10 digit seeing"
data = np.load("./data_multi/all_raw_digit.npz") 
x = data['x']
y = data['y']

avg_err = cross_val_multi(x, y, int(len(x)/10))

least_squares_multi_result = {"task": [task],"avg_err": [avg_err]}
df = pd.DataFrame(least_squares_multi_result, columns = ["task","avg_err"])

df.to_csv("./result/least_squares_multi.csv", index=False)

batch: 0
building predictor for 0
running least squares
building predictor for 1
running least squares
building predictor for 2
running least squares
building predictor for 3
running least squares
building predictor for 4
running least squares
building predictor for 5
running least squares
building predictor for 6
running least squares
building predictor for 7
running least squares
building predictor for 8
running least squares
building predictor for 9
running least squares
predicting
error_rate: 0.9102564102564102
***************************
batch: 1
building predictor for 0
running least squares
building predictor for 1
running least squares
building predictor for 2
running least squares
building predictor for 3
running least squares
building predictor for 4
running least squares
building predictor for 5
running least squares
building predictor for 6
running least squares
building predictor for 7
running least squares
building predictor for 8
running least squares
building predictor 