## Set Up

We import our code and any frequently used libraries, and set up our data.

In [4]:
# Path of the data file loaded with pd.read_csv further down (relative to this notebook).
DATA_PATH = '../data/zipcombo.dat'
# Directory that gets appended to sys.path so the local `src` package can be imported.
SRC_PATH = '..'

In [5]:
import os
import sys
# Make the directory named by SRC_PATH importable (needed for `from src.MLkNN import ...`).
# Guarded so that re-running the cell does not add the same entry twice.
module_path = os.path.abspath(SRC_PATH)
if not (module_path in sys.path):
    sys.path.append(module_path)
    

In [6]:
%load_ext autoreload
%autoreload 2

In [12]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split

In [8]:
from src.MLkNN import MLkNN

In [35]:
# Load the space-separated data file: column 0 holds the label, columns 1..256 the features.
# Column 257 is dropped (presumably an empty column caused by a trailing separator — TODO confirm).
df = pd.read_csv(DATA_PATH, sep=' ', header=None).drop(columns=[257])
df = df.rename(columns={0: 'label'})
X = df[list(range(1, 257))].values
# `np.int` was only an alias of the builtin `int` and was removed in NumPy 1.24;
# the builtin gives the same dtype and works on every NumPy version.
y = df['label'].values.astype(int)

In [36]:
# subsample draws an equal-sized random sample from each class; it is applied to the data in the next cell

def subsample(df, classes, sample_size=100):
    """Draw a class-balanced random subset of ``df`` and return it as arrays.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns plus a label column named ``'y'``.
    classes : iterable
        Label values to keep; ``sample_size`` rows are drawn for each of them.
    sample_size : int, default 100
        Number of rows drawn per class. ``DataFrame.sample`` draws without
        replacement by default, so pandas raises ``ValueError`` when a class
        has fewer rows than this.

    Returns
    -------
    (X_small, y_small) : tuple of numpy.ndarray
        The shuffled feature matrix (all columns except ``'y'``) and the
        matching label vector. Both are empty when ``classes`` is empty.
    """
    # Collect the per-class samples first and concatenate once:
    # DataFrame.append was removed in pandas 2.0, and growing a frame
    # inside a loop copies the accumulated data on every iteration.
    samples = [df[df['y'] == clazz].sample(sample_size) for clazz in classes]

    if not samples:
        # pd.concat rejects an empty list; return correctly shaped empty arrays instead.
        empty = df.iloc[0:0]
        return empty.drop(columns='y').values, empty['y'].values

    # shuffle so the rows are no longer grouped by class
    df_small = pd.concat(samples).sample(frac=1.)

    X_small = df_small.drop(columns='y').values
    y_small = df_small['y'].values

    return X_small, y_small

In [40]:
# Put features and labels in one frame, with the labels in column 'y' as subsample expects.
df = pd.DataFrame(X).assign(y=y)
# Keep 100 random rows for each class 0..9, then make an initial random 80/20 train/test split.
# NOTE: X and y are overwritten here, so every later cell works on the subsample.
X, y = subsample(df, classes=list(range(10)), sample_size=100)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, train_size=0.8)

In [19]:
def one_hot(y, classes=10):
    """One-hot encode integer class labels.

    Parameters
    ----------
    y : array-like of int
        Class labels in ``range(classes)``; any shape, flattened before encoding.
    classes : int, default 10
        Number of distinct classes, i.e. the width of the encoding.

    Returns
    -------
    numpy.ndarray of shape (y.size, classes)
        Row ``i`` is all zeros except for a 1.0 in column ``y[i]``.
    """
    # Encode the argument itself: the previous version ignored `y` and always
    # encoded the global `y_train`, whatever was passed in.
    labels = np.asarray(y).reshape(-1)
    # Row j of the identity matrix is the one-hot vector of label j.
    return np.eye(classes)[labels]

## Exercise

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

def error_score(y, y_pred):
    """Return the misclassification rate: one minus the accuracy of ``y_pred`` against ``y``."""
    accuracy = accuracy_score(y, y_pred)
    return 1 - accuracy

### 1. Basic Results
We split our data into 80%/20% train and test. We perform 20 runs for each $k$ in `list_ks` ($k = 1, ..., 4$), and report the mean test and training errors with their standard deviations.

In [25]:
# define basic run for part 1.1

def basic_run(X_train, X_test, y_train, y_test, k, s=1):
    """Fit one MLkNN model and measure its training and test error.

    ``k`` and ``s`` are passed straight through to the ``MLkNN`` constructor
    (presumably the neighbour count and a smoothing constant — confirm against
    ``src.MLkNN``).

    Returns a dict with keys ``'err_train'``, ``'err_test'`` and ``'model'``
    (the fitted ``MLkNN`` instance).
    """
    model = MLkNN(X_train, y_train, k, s)
    model.fit()

    def _error_on(features, labels):
        # predict() output is reduced to one label per row by taking the arg-max column
        predicted = np.argmax(model.predict(features), axis=1)
        return error_score(labels, predicted)

    train_error = _error_on(X_train, y_train)
    test_error = _error_on(X_test, y_test)

    return {'err_train': train_error, 'err_test': test_error, 'model': model}

In [45]:
# Part 1.1: repeat the experiment on fresh random splits and collect the errors for every k
iterations = 20
list_ks = [1, 2, 3, 4]
err_train = dict((k, []) for k in list_ks)
err_test = dict((k, []) for k in list_ks)

for iteration in tqdm(list(range(iterations))):
    # one random 80/20 split per iteration; every k is evaluated on that same split
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2)

    for k in list_ks:
        results = basic_run(X_train, X_test, y_train, y_test, k)
        err_train[k].append(results['err_train'])
        err_test[k].append(results['err_test'])

# aggregate over the iterations: mean and standard deviation of the errors for each k
err_train_mean, err_test_mean = {}, {}
err_train_std, err_test_std = {}, {}
for k in list_ks:
    err_train_mean[k] = np.mean(err_train[k])
    err_test_mean[k] = np.mean(err_test[k])
    err_train_std[k] = np.std(err_train[k])
    err_test_std[k] = np.std(err_test[k])


  0%|                                                                                           | 0/20 [00:00<?, ?it/s]
  5%|████▏                                                                              | 1/20 [00:05<01:40,  5.28s/it]
 10%|████████▎                                                                          | 2/20 [00:10<01:35,  5.30s/it]
 15%|████████████▍                                                                      | 3/20 [00:15<01:30,  5.31s/it]
 20%|████████████████▌                                                                  | 4/20 [00:21<01:25,  5.32s/it]
 25%|████████████████████▊                                                              | 5/20 [00:26<01:20,  5.33s/it]
 30%|████████████████████████▉                                                          | 6/20 [00:32<01:14,  5.34s/it]
 35%|█████████████████████████████                                                      | 7/20 [00:37<01:09,  5.35s/it]
 40%|█████████████████████████████████▏

In [46]:
# display in dataframe
# one row per k, one column per statistic
stats_by_name = {
    'train_mean': err_train_mean,
    'test_mean': err_test_mean,
    'train_std': err_train_std,
    'test_std': err_test_std,
}
df_err = pd.DataFrame(list(stats_by_name.values()),
                      index=list(stats_by_name),
                      columns=list_ks)
df_err = df_err.T
df_err

Unnamed: 0,train_mean,test_mean,train_std,test_std
1,0.097063,0.14475,0.005173,0.021064
2,0.115938,0.15225,0.011151,0.021649
3,0.129688,0.15975,0.01042,0.021359
4,0.119312,0.14525,0.011999,0.027133


___

### 2. Cross-validation

We split our data into 80%/20% train and test. We then use 5-fold cross-validation on the training set to choose the best parameter $d^* \in \{1, ..., 7\}$. We then retrain the kernelised perceptron with $d^*$ on the full training set and compute its test error, repeating the whole procedure over 20 runs. We report the mean and standard deviation of the test error and of $d^*$.

In [None]:
def make_fold_indices(n, k=5):
    """Randomly assign each of ``n`` samples to one of ``k`` folds.

    Returns a float array of length ``n`` whose i-th entry is the fold id
    (0 .. k-1) of sample i. Fold sizes differ by at most one, as produced by
    ``np.array_split``. Uses the global NumPy random state.
    """
    order = np.arange(n)
    np.random.shuffle(order)
    assignment = np.zeros(n)
    # consecutive chunks of the shuffled order become folds 0, 1, ..., k-1
    for fold_id, members in enumerate(np.array_split(order, k)):
        assignment[members] = fold_id
    return assignment

In [None]:
# generate k folds and perform cross-validation on them, returning error per fold.
def cross_validation_error(X, y, kernel, epochs=2, k=5):
    """Return the mean validation error over a random k-fold split of ``(X, y)``.

    Every row is assigned to one of ``k`` folds by ``make_fold_indices``. For
    each fold, a ``VectorizedOneVsOneKernelPerceptron`` is built on the other
    folds with ``kernel``, trained, and scored on the held-out fold with
    ``error_score``. The per-fold errors are averaged with ``np.mean``, so a
    single number is returned, not one error per fold.

    NOTE(review): ``VectorizedOneVsOneKernelPerceptron`` is neither defined
    nor imported in the visible part of this notebook, which otherwise uses
    ``MLkNN`` — confirm where it comes from before running this cell.
    """
    fold_ixs = make_fold_indices(len(X), k=k)

    cv_errs = []
    for fold_ix in np.unique(fold_ixs):
        # rows of the current fold form the validation set, all other rows the training set
        # (these local X_train / y_train shadow the notebook-level variables of the same name)
        X_val = X[fold_ixs == fold_ix]
        y_val = y[fold_ixs == fold_ix]
        X_train = X[fold_ixs != fold_ix]
        y_train = y[fold_ixs != fold_ix]
        
        #fit model
        mkp = VectorizedOneVsOneKernelPerceptron(X_train, y_train, kernel)
        # NOTE(review): trains for a hard-coded 5 epochs here and then again for `epochs`
        # below. Whether the second call continues or restarts training depends on
        # train_for_epochs — confirm which of the two calls is intended.
        mkp.train_for_epochs(epochs=5)
        
        #record validation fold error
        mkp.train_for_epochs(epochs)
        cv_errs.append(error_score(y_val, mkp.predict_all(X_val)))
        
    return np.mean(cv_errs)

In [None]:
# perform cross-validation runs

iterations = 20
# candidate values of d: 1, ..., 7
ds = list(range(1, 8))
# d -> mean cross-validation error; reused across iterations, every key in `ds` is overwritten each time
errs_cv = {}

# per-iteration results: the selected d and the test error obtained with it
d_stars = []
errs_test = []
# NOTE(review): never filled in this cell
confusion_matrices = []
for iteration in tqdm(list(range(iterations))):
    # split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2)
    
    # perform cross validations
    # NOTE(review): polynomial_kernel is not defined or imported in the visible part of this notebook
    for d in ds:
        errs_cv[d] = cross_validation_error(X_train, y_train, polynomial_kernel(d))
        
    # get best parameter
    # (the d with the smallest cross-validation error; on ties min() keeps the first one)
    d_star = min(errs_cv, key=errs_cv.get)
    d_stars.append(d_star)
    
    # get final error
    # NOTE(review): basic_run as defined in this notebook forwards its fifth argument to
    # MLkNN as `k`; here it receives a kernel instead — confirm which model is intended.
    results = basic_run(X_train, X_test, y_train, y_test, polynomial_kernel(d_star))
    errs_test.append(results['err_test'])

    
# compute results   
err_test_mean = np.mean(errs_test)
d_star_mean = np.mean(d_stars)
err_test_std = np.std(errs_test)
d_star_std = np.std(d_stars)

In [None]:
# display in dataframe
# rows: mean / std; columns: test error and selected d*
summary_rows = [
    [err_test_mean, err_test_std],
    [d_star_mean, d_star_std],
]
df_err = pd.DataFrame(summary_rows, index=['err_test', 'd_star'], columns=['mean', 'std'])
df_err = df_err.T
print("Answer to 2:")
df_err

In [None]:
# the d selected by cross-validation in each run
d_stars

In [None]:
# the test error obtained in each run
errs_test

___
___
___