## Set Up

We import our code and any frequently used libraries, and set up our data.

In [2]:
# Location of the data file that is loaded with pd.read_csv further down.
DATA_PATH = '../data/zipcombo.dat'
# Directory appended to sys.path so that the `src` package can be imported.
SRC_PATH = '..'

In [3]:
import os
import sys
# Put the directory named by SRC_PATH on the import path (once) so that
# `src.*` modules can be imported from this notebook.
module_path = os.path.abspath(SRC_PATH)
if module_path not in sys.path:
    sys.path.append(module_path)
    

In [4]:
%load_ext autoreload
%autoreload 2

In [5]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split

In [6]:
from src.kernel_knn import VectorizedKernelKNN
from src.kernels import polynomial_kernel

In [20]:
# Load the space-separated data file: column 0 holds the label and columns
# 1..256 hold the features. Column 257 is dropped - presumably an empty column
# created by a trailing separator on each row (TODO confirm against the file).
df = (
    pd.read_csv(DATA_PATH, sep=' ', header=None)
    .drop(columns=[257])
    .rename(columns={0: 'label'})
)
X = df[list(range(1, 257))].values
# `np.int` was only an alias of the builtin `int`; it was deprecated in
# NumPy 1.20 and removed in 1.24, so use the builtin directly.
y = df['label'].values.astype(int)

In [8]:
# we do not currently use subsampling, but we keep the function for testing purposes

def subsample(df, classes, sample_size=100, random_state=None):
    """Draw an equally sized random sample from each class and shuffle the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns plus a label column named ``'y'``.
    classes : iterable
        Label values to keep; ``sample_size`` rows are drawn for each one.
    sample_size : int, default 100
        Number of rows drawn (without replacement) per class.
    random_state : int or None, default None
        Seed for a reproducible draw. ``None`` keeps the previous behaviour of
        drawing from NumPy's global random state.

    Returns
    -------
    X_small : numpy.ndarray
        Values of every column except ``'y'`` for the sampled rows.
    y_small : numpy.ndarray
        Labels of the sampled rows, aligned row by row with ``X_small``.

    Raises
    ------
    ValueError
        Raised by ``DataFrame.sample`` when a class has fewer than
        ``sample_size`` rows.
    """
    # One generator shared by every draw, so a single seed fixes the whole result.
    rng = None if random_state is None else np.random.RandomState(random_state)

    # Collect the per-class samples and concatenate once: DataFrame.append was
    # removed in pandas 2.0, and growing a frame inside a loop is quadratic.
    samples = [
        df[df['y'] == clazz].sample(sample_size, random_state=rng)
        for clazz in classes
    ]
    if not samples:
        # No classes requested: return empty arrays with the right column count.
        empty = df.iloc[0:0]
        return empty.drop(columns='y').values, empty['y'].values

    # Shuffle so that the rows are no longer grouped by class.
    df_small = pd.concat(samples).sample(frac=1., random_state=rng)

    X_small = df_small.drop(columns='y').values
    y_small = df_small['y'].values

    return X_small, y_small

In [9]:
# Rebuild a frame with the features in the leading columns and the labels in 'y',
# then keep 50 random rows for each of the ten classes.
# Note: this overwrites X and y with the subsample, so the full data set is only
# available again after re-running the loading cell.
df = pd.DataFrame(X).assign(y=y)
X, y = subsample(df, list(range(10)), sample_size=50)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2)

## Exercise

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

def error_score(y, y_pred):
    """Return one minus the accuracy of ``y_pred`` measured against ``y``."""
    accuracy = accuracy_score(y, y_pred)
    return 1 - accuracy

### 1. Basic Results
We split our data into 80%/20% train and test. We perform 20 runs for $d = 1, ..., 7$, and report the mean test and training errors with their standard deviations.

In [11]:
# define basic run for part 1.1

def basic_run(X_train, X_test, y_train, y_test, kernel, k):
    """Build a ``VectorizedKernelKNN`` on the training split and score it on both splits.

    Returns a dict holding the training error (``'err_train'``), the test error
    (``'err_test'``) and the model object itself (``'model'``).
    """
    model = VectorizedKernelKNN(X_train, y_train, kernel, k)

    # predict on the training data first, then on the held-out data
    train_error = error_score(y_train, model.predict_all(X_train))
    test_error = error_score(y_test, model.predict_all(X_test))

    return dict(err_train=train_error, err_test=test_error, model=model)

In [14]:
# Part 1: repeat a random 80/20 split `iterations` times and evaluate every
# (k, d) combination on each split.
# NOTE(review): the saved output table further down lists d up to 7, so it appears
# to come from a run with a wider grid than the one configured here - confirm.
iterations = 20
list_ks = range(1, 5)
list_ds = range(1, 4)

# every (k, d) pair, k varying slowest; each pair gets one error per split
param_grid = [(k, d) for k in list_ks for d in list_ds]
err_train = {pair: [] for pair in param_grid}
err_test = {pair: [] for pair in param_grid}

for iteration in tqdm(range(iterations)):
    # fresh random split for this repetition
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2)

    for k, d in param_grid:
        results = basic_run(X_train, X_test, y_train, y_test, polynomial_kernel(d), k)
        err_train[k, d].append(results['err_train'])
        err_test[k, d].append(results['err_test'])

# mean and standard deviation over the repetitions, per (k, d) pair
err_train_mean = {pair: np.mean(err_train[pair]) for pair in param_grid}
err_test_mean = {pair: np.mean(err_test[pair]) for pair in param_grid}
err_train_std = {pair: np.std(err_train[pair]) for pair in param_grid}
err_test_std = {pair: np.std(err_test[pair]) for pair in param_grid}

100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:09<00:00,  2.12it/s]


In [13]:
# Gather the four summary statistics of every (k, d) pair into one table.
results_dict = {
    pair: {
        'err_mean_train': err_train_mean[pair],
        'err_mean_test': err_test_mean[pair],
        'err_std_train': err_train_std[pair],
        'err_std_test': err_test_std[pair],
    }
    for pair in err_train_mean
}
# transpose so that there is one row per (k, d) pair and one column per statistic
df_err = pd.DataFrame(results_dict).T
df_err.index = df_err.index.rename(['k', 'd'])
df_err

Unnamed: 0_level_0,Unnamed: 1_level_0,err_mean_test,err_mean_train,err_std_test,err_std_train
k,d,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1,0.121,0.0,0.026249,0.0
1,2,0.1085,0.0,0.032905,0.0
1,3,0.1005,0.0,0.024794,0.0
1,4,0.122,0.0,0.022935,0.0
1,5,0.2065,0.0,0.037984,0.0
1,6,0.3445,0.0,0.04189,0.0
1,7,0.4555,0.0,0.047799,0.0
2,1,0.1465,0.056,0.030212,0.007348
2,2,0.14,0.048125,0.037014,0.007495
2,3,0.133,0.041375,0.039636,0.007223


___

### 2. Cross-validation

We split our data into 80%/20% train and test. We then use 5-fold cross-validation on the training set to find our best parameters: the degree $d^* \in \{1, ..., 7\}$ together with the number of neighbours $k^*$. We then retrain the kernelised k-NN classifier with these parameters on the full training set, and calculate training and test errors over 20 runs. We report the mean test and training errors for this classifier, as well as their standard deviations.

In [15]:
def make_fold_indices(n, num_folds=5):
    """Randomly assign each of ``n`` samples to one of ``num_folds`` folds.

    Returns a length-``n`` float array whose i-th entry is the fold number
    (``0 .. num_folds - 1``) of sample ``i``. The shuffle draws from NumPy's
    global random state.
    """
    shuffled = np.array(range(n))
    np.random.shuffle(shuffled)

    fold_ixs = np.zeros(n)
    # array_split also copes with n not being divisible by num_folds
    for fold_id, members in enumerate(np.array_split(shuffled, num_folds)):
        fold_ixs[members] = fold_id
    return fold_ixs

In [16]:
def cross_validation_error(X, y, kernel, k, num_folds=5):
    """Return the mean validation error over ``num_folds`` random folds of ``(X, y)``.

    Each fold is held out once: a ``VectorizedKernelKNN`` is built from the
    remaining rows with the given ``kernel`` and ``k`` and scored on the
    held-out rows. The per-fold errors are averaged into a single number.
    """
    fold_of_row = make_fold_indices(len(X), num_folds=num_folds)

    fold_errors = []
    for fold in np.unique(fold_of_row):
        held_out = fold_of_row == fold

        # fit on every row outside the current fold
        model = VectorizedKernelKNN(X[~held_out], y[~held_out], kernel=kernel, k=k)

        # score on the held-out fold
        fold_errors.append(error_score(y[held_out], model.predict_all(X[held_out])))

    return np.mean(fold_errors)

In [21]:
# Part 2: on each random 80/20 split, pick the (k, d) pair with the lowest
# cross-validation error on the training part, then score that pair on the
# held-out test part.

iterations = 20
list_ks = range(1, 5)
list_ds = range(1, 4)
# cross-validation error per (k, d); every key is overwritten on each repetition
errs_cv = {}

d_stars = []      # selected (k, d) pair of each repetition
errs_test = []    # test error of the selected pair, per repetition
for iteration in tqdm(range(iterations)):
    # fresh random split for this repetition
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2)

    # cross-validate every candidate on the training part only
    for k in list_ks:
        for d in list_ds:
            errs_cv[(k, d)] = cross_validation_error(X_train, y_train, polynomial_kernel(d), k)

    # best candidate = lowest cross-validation error (first one wins on ties)
    d_star = min(errs_cv, key=lambda pair: errs_cv[pair])
    d_stars.append(d_star)

    # d_star is (k, d): its second entry is the degree, its first the neighbour count
    results = basic_run(X_train, X_test, y_train, y_test, polynomial_kernel(d_star[1]), d_star[0])
    errs_test.append(results['err_test'])


# summary over the repetitions; the (k, d) statistics are taken component-wise
err_test_mean = np.mean(errs_test)
err_test_std = np.std(errs_test)
d_star_mean = tuple(np.mean([pair[i] for pair in d_stars]) for i in (0, 1))
d_star_std = tuple(np.std([pair[i] for pair in d_stars]) for i in (0, 1))

100%|█████████████████████████████████████████████████████████████████████████████████| 20/20 [36:17<00:00, 111.57s/it]


In [22]:
# Per-repetition view: the (k, d) pair selected by cross-validation and its test error.
# NOTE(review): the 'sigma_stars' label looks inherited from another experiment -
# the values stored under it are (k, d) pairs.
cv_data = dict(sigma_stars=d_stars, test_errors=errs_test)
df = pd.DataFrame(cv_data)
df

Unnamed: 0,sigma_stars,test_errors
0,"(1, 3)",0.032258
1,"(1, 3)",0.031183
2,"(1, 2)",0.027419
3,"(1, 3)",0.033871
4,"(1, 3)",0.029032
5,"(1, 3)",0.032258
6,"(1, 3)",0.033871
7,"(1, 2)",0.037097
8,"(1, 3)",0.029032
9,"(1, 2)",0.031183


In [23]:
# Summary table: one column per quantity, with its mean and std as the two rows.
df_err = pd.DataFrame(
    data=[[err_test_mean, err_test_std], [d_star_mean, d_star_std]],
    index=['err_test', 'd_star'],
    columns=['mean', 'std'],
).transpose()
print("Answer to 2:")
df_err

Answer to 2:


Unnamed: 0,err_test,d_star
mean,0.0324462,"(1.1, 2.7)"
std,0.00276908,"(0.43588989435406744, 0.45825756949558405)"


___
___
___