## Set Up

We import our code and any frequently used libraries, and set up our data.

In [1]:
DATA_PATH = '../data/zipcombo.dat'
SRC_PATH = '..'

In [2]:
import os
import sys
module_path = os.path.abspath(os.path.join(SRC_PATH))
if module_path not in sys.path:
    sys.path.append(module_path) 
    

In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split

In [5]:
from sklearn.svm import SVC

In [47]:
df = pd.read_csv(DATA_PATH, sep=' ', header=None).drop(columns=[257])
df.rename(columns={0: 'label'}, inplace=True)
X = df[list(range(1, 257))].values
y = df['label'].values.astype(np.int)

In [7]:
# we do not currently use subsampling, but we keep the function for testing purposes

def subsample(df, classes, sample_size=100):
    # sampling
    df_small = pd.DataFrame()
    for clazz in classes:
        df_clazz = df[df['y'] == clazz]
        df_sample = df_clazz.sample(sample_size)
        df_small = df_small.append(df_sample)

    #shuffle
    df_small = df_small.sample(frac=1.)

    X_small = df_small.drop(columns='y').values
    y_small = df_small['y'].values
    
    return X_small, y_small

In [43]:
df = pd.DataFrame(X)
df['y'] = y
X, y = subsample(df, list(range(10)), sample_size=50)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2)

In [48]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2)

In [49]:
X_train.shape

(7438, 256)

## Exercise

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

def error_score(y, y_pred):
    return 1 - accuracy_score(y, y_pred)

### 1. Basic Results
We split our data into 80%/20% train and test. We perform 20 runs for $d = 1, ..., 7$, and report the mean test and training errors with their standard deviations.

In [38]:
# define basic run for part 1.1

def basic_run(X_train, X_test, y_train, y_test, c):   
    
    #fit model
    ksvm = SVC(C=c, kernel='rbf', gamma='scale', shrinking=False)
    ksvm.fit(X_train, y_train)
    
    #return errors
    error_train = error_score(y_train, ksvm.predict(X_train))
    error_test = error_score(y_test, ksvm.predict(X_test))
    
    return {'err_train': error_train, 'err_test': error_test, 'model': ksvm}

In [39]:
# perform basic runs
iterations = 20
cs = list(np.arange(1, 3.5, 0.5))
err_train = {c: [] for c in cs}
err_test = {c: [] for c in cs}

for iteration in tqdm(list(range(iterations))):
    # split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2)
    
    for c in cs:
        #split data
        results = basic_run(X_train, X_test, y_train, y_test, c)
        err_train[c].append(results['err_train'])
        err_test[c].append(results['err_test'])
    
err_train_mean = {c: np.mean(errs) for c, errs in err_train.items()}
err_test_mean = {c: np.mean(errs) for c, errs in err_test.items()}
err_train_std = {c: np.std(errs) for c, errs in err_train.items()}
err_test_std = {c: np.std(errs) for c, errs in err_test.items()}



  0%|                                                                                           | 0/20 [00:00<?, ?it/s]

  5%|████▏                                                                              | 1/20 [01:08<21:48, 68.88s/it]

 10%|████████▎                                                                          | 2/20 [02:17<20:40, 68.94s/it]

 15%|████████████▍                                                                      | 3/20 [03:26<19:29, 68.79s/it]

 20%|████████████████▌                                                                  | 4/20 [04:36<18:26, 69.15s/it]

 25%|████████████████████▊                                                              | 5/20 [05:45<17:16, 69.11s/it]

 30%|████████████████████████▉                                                          | 6/20 [06:52<16:00, 68.59s/it]

 35%|█████████████████████████████                                                      | 7/20 [08:02<14:54, 68.79s/it]

 40%|█████████████████████████

In [40]:
# display in dataframe
df_err = pd.DataFrame([err_train_mean, err_test_mean,
                       err_train_std, err_test_std], 
                       index=['train_mean', 'test_mean', 'train_std', 'test_std'], 
                       columns=cs).T
df_err

Unnamed: 0,train_mean,test_mean,train_std,test_std
1.0,0.00884,0.025968,0.000607,0.003575
1.5,0.004376,0.024032,0.000497,0.003706
2.0,0.002743,0.022876,0.00026,0.003515
2.5,0.001728,0.022688,0.000263,0.003321
3.0,0.001324,0.022204,0.000149,0.003489


___

### 2. Cross-validation

We split our data into 80%/20% train and test. We then use 5-fold cross validation to find our best $d^*$ parameter for $d^* \in \{1, ..., 7\}$. We then retrain our optimal kernelised perceptron on the full training set, and calculate training and test errors over 20 runs. We report the mean test and training errors for this perceptron, as well as its standard deviations.

In [41]:
def make_fold_indices(n, num_folds=5):
    ixs = np.array(range(n))
    np.random.shuffle(ixs)
    folds = np.array_split(ixs, num_folds)
    fold_ixs = np.zeros(n)
    for i in range(num_folds):
        fold_ixs[folds[i]] = i
    return fold_ixs

In [42]:
# generate k folds and perform cross-validation on them, returning error per fold.
def cross_validation_error(X, y, c, num_folds=5):
    fold_ixs = make_fold_indices(len(X), num_folds=num_folds)

    cv_errs = []
    for fold_ix in np.unique(fold_ixs):
        X_val = X[fold_ixs == fold_ix]
        y_val = y[fold_ixs == fold_ix]
        X_train = X[fold_ixs != fold_ix]
        y_train = y[fold_ixs != fold_ix]
        
        #fit model
        ksvm = SVC(C=c, kernel='rbf', gamma='scale')
        ksvm.fit(X_train, y_train)
        
        #record validation fold error
        cv_errs.append(error_score(y_val, ksvm.predict(X_val)))
        
    return np.mean(cv_errs)

In [50]:
# perform cross-validation runs

iterations = 20
cs = list(np.arange(1, 3.5, 0.5))
errs_cv = {}

c_stars = []
errs_test = []
for iteration in tqdm(list(range(iterations))):
    # split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2)
    
    # perform cross validations
    for c in cs:
        errs_cv[c] = cross_validation_error(X_train, y_train, c)
        
    # get best parameter
    c_star = min(errs_cv, key=errs_cv.get)
    c_stars.append(c_star)
    
    # get final error
    results = basic_run(X_train, X_test, y_train, y_test, c)
    errs_test.append(results['err_test'])
        
# compute results   
err_test_mean = np.mean(errs_test)
c_star_mean = np.mean(c_stars)
err_test_std = np.std(errs_test)
c_star_std = np.std(c_stars)



  0%|                                                                                           | 0/20 [00:00<?, ?it/s]

  5%|████                                                                              | 1/20 [02:22<45:00, 142.12s/it]

 10%|████████▏                                                                         | 2/20 [04:38<42:08, 140.48s/it]

 15%|████████████▎                                                                     | 3/20 [06:55<39:28, 139.32s/it]

 20%|████████████████▍                                                                 | 4/20 [09:11<36:51, 138.25s/it]

 25%|████████████████████▌                                                             | 5/20 [11:28<34:27, 137.85s/it]

 30%|████████████████████████▌                                                         | 6/20 [13:43<32:01, 137.25s/it]

 35%|████████████████████████████▋                                                     | 7/20 [16:02<29:49, 137.69s/it]

 40%|█████████████████████████

In [51]:
cv_data = {'sigma_stars': c_stars, 'test_errors': errs_test}
df = pd.DataFrame(data=cv_data)
df

Unnamed: 0,sigma_stars,test_errors
0,2.5,0.022043
1,3.0,0.024194
2,3.0,0.015591
3,3.0,0.024194
4,3.0,0.02043
5,2.0,0.026882
6,2.5,0.026344
7,2.5,0.019355
8,3.0,0.019892
9,2.0,0.023656


In [52]:
# display in dataframe
df_err = pd.DataFrame([[err_test_mean, err_test_std],
                       [c_star_mean, c_star_std]], 
                       columns=['mean', 'std'], index=['err_test', 'c_star']).T
print("Answer to 2:")
df_err

Answer to 2:


Unnamed: 0,err_test,c_star
mean,0.022124,2.675
std,0.002902,0.396074


___
___
___