In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from kernels import polynomial_kernel
from perceptrons import OneVsAllKernelPerceptron
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
import itertools

In [41]:
def subsample(df, classes, sample_size=100):
    # sampling
    df_small = pd.DataFrame()
    for clazz in classes:
        df_clazz = df[df['y'] == clazz]
        df_sample = df_clazz.sample(sample_size)
        df_small = df_small.append(df_sample)

    #shuffle
    df_small = df_small.sample(frac=1.)

    X_small = df_small.drop(columns='y').values
    y_small = df_small['y'].values
    
    return X_small, y_small

In [70]:
df = pd.read_csv('zipcombo.dat', sep=' ', header=None).drop(columns=[257])
df.rename(columns={0: 'label'}, inplace=True)
X = df[list(range(1, 257))].values
y = df['label'].values.astype(np.int)

In [42]:
df = pd.DataFrame(X)
df['y'] = y
X, y = subsample(df, list(range(10)), sample_size=300)

In [43]:
#split data
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2)

In [56]:
def train_run(X_train, y_train, X_test, y_test, c):
    mod = SVC(C=c, kernel='rbf', gamma='scale')
    mod.fit(X_train, y_train)
    y_pred_train = mod.predict(X_train)
    y_pred_test = mod.predict(X_test)
    train_err = np.sum(y_pred_train == y_train)
    test_err = np.sum(y_pred_test == y_test)
    return train_err, test_err

Do basic runs

In [71]:
# perform basic runs
iterations = 5
list_cs = [2, 3, 4, 5, 7]
err_train = {c: [] for c in list_cs}
err_test = {c: [] for c in list_cs}

for iteration in tqdm(list(range(iterations))):
    # split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2)
    
    for c in list_cs:
        #split data
        train_err, test_err = train_run(X_train, y_train, X_test, y_test, c)
        err_train[c].append(train_err)
        err_test[c].append(test_err)
    
err_train_mean = {d: np.mean(errs) for d, errs in err_train.items()}
err_test_mean = {d: np.mean(errs) for d, errs in err_test.items()}
err_train_std = {d: np.std(errs) for d, errs in err_train.items()}
err_test_std = {d: np.std(errs) for d, errs in err_test.items()}







  0%|                                                                                            | 0/5 [00:00<?, ?it/s]





 20%|████████████████▊                                                                   | 1/5 [01:15<05:01, 75.50s/it]





 40%|█████████████████████████████████▌                                                  | 2/5 [02:28<03:44, 74.88s/it]





 60%|██████████████████████████████████████████████████▍                                 | 3/5 [03:36<02:25, 72.83s/it]





 80%|███████████████████████████████████████████████████████████████████▏                | 4/5 [05:06<01:17, 77.80s/it]





100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [06:13<00:00, 74.58s/it]

In [72]:
# display in dataframe
df_err = pd.DataFrame([err_train_mean, err_test_mean,
                       err_train_std, err_test_std], 
                       index=['train_mean', 'test_mean', 'train_std', 'test_std'], 
                       columns=list_cs).T
df_err

Unnamed: 0,train_mean,test_mean,train_std,test_std
2,7417.8,1812.0,1.32665,6.324555
3,7428.4,1812.6,1.019804,6.916647
4,7430.4,1814.8,1.019804,6.764614
5,7431.2,1815.2,1.16619,6.079474
7,7434.8,1815.8,0.748331,5.912698


Setup cross validation

In [73]:
def make_fold_indices(n, k=5):
    ixs = np.array(range(n))
    np.random.shuffle(ixs)
    folds = np.array_split(ixs, k)
    fold_ixs = np.zeros(n)
    for i in range(k):
        fold_ixs[folds[i]] = i
    return fold_ixs

In [74]:
# generate k folds and perform cross-validation on them, returning error per fold.
def cross_validation_error_svm(X, y, c, k=5):
    fold_ixs = make_fold_indices(len(X), k=k)

    cv_errs = []
    for fold_ix in tqdm(np.unique(fold_ixs)):
        X_val = X[fold_ixs == fold_ix]
        y_val = y[fold_ixs == fold_ix]
        X_train = X[fold_ixs != fold_ix]
        y_train = y[fold_ixs != fold_ix]
        
        #fit model
        mod = SVC(C=c, kernel='rbf', gamma='scale')
        mod.fit(X_train, y_train)
        
        #record validation fold error
        y_pred = mod.predict(X_val)
        cv_errs.append(np.sum(y_pred == y_val))
        
    return np.mean(cv_errs)

Do cross-validation runs

In [75]:
# perform cross-validation runs

iterations = 5
list_cs = [2, 3, 4, 5, 7]
errs_cv = {}
d_stars = []
errs_test = []
confusion_matrices = []
for iteration in tqdm(list(range(iterations))):
    # split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2)
    
    # perform cross validations
    for c in list_cs:
        errs_cv[c] = cross_validation_error_svm(X_train, y_train, c)
        
    # get best parameter
    d_star = max(errs_cv, key=errs_cv.get)
    d_stars.append(d_star)
    
    # get final error
    train_err, test_err = train_run(X_train, y_train, X_test, y_test, d_star)
    errs_test.append(test_err)
    
# compute results   
err_test_mean = np.mean(errs_test)
d_star_mean = np.mean(d_stars)
err_test_std = np.std(errs_test)
d_star_std = np.std(d_stars)







  0%|                                                                                            | 0/5 [00:00<?, ?it/s]






  0%|                                                                                            | 0/5 [00:00<?, ?it/s]






 20%|████████████████▊                                                                   | 1/5 [00:05<00:20,  5.06s/it]






 40%|█████████████████████████████████▌                                                  | 2/5 [00:10<00:15,  5.08s/it]






 60%|██████████████████████████████████████████████████▍                                 | 3/5 [00:15<00:10,  5.13s/it]






 80%|███████████████████████████████████████████████████████████████████▏                | 4/5 [00:20<00:05,  5.11s/it]






100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:25<00:00,  5.09s/it]






  0%|                                                                                            | 0/5 [00:00<?,

 40%|█████████████████████████████████▌                                                  | 2/5 [00:10<00:15,  5.02s/it]






 60%|██████████████████████████████████████████████████▍                                 | 3/5 [00:15<00:10,  5.12s/it]






 80%|███████████████████████████████████████████████████████████████████▏                | 4/5 [00:22<00:05,  5.83s/it]






100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:29<00:00,  5.93s/it]






  0%|                                                                                            | 0/5 [00:00<?, ?it/s]






 20%|████████████████▊                                                                   | 1/5 [00:05<00:23,  5.81s/it]






 40%|█████████████████████████████████▌                                                  | 2/5 [00:11<00:17,  5.83s/it]






 60%|██████████████████████████████████████████████████▍                                 | 3/5 [00:17<00:11,  5.86s/it

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:42<00:00,  8.53s/it]






  0%|                                                                                            | 0/5 [00:00<?, ?it/s]






 20%|████████████████▊                                                                   | 1/5 [00:08<00:33,  8.47s/it]






 40%|█████████████████████████████████▌                                                  | 2/5 [00:16<00:25,  8.47s/it]






 60%|██████████████████████████████████████████████████▍                                 | 3/5 [00:24<00:16,  8.05s/it]






 80%|███████████████████████████████████████████████████████████████████▏                | 4/5 [00:29<00:07,  7.41s/it]






100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:36<00:00,  7.30s/it]






  0%|                                                                                            | 0/5 [00:00<?, ?it/s

In [55]:
# display in dataframe
df_err = pd.DataFrame([[err_test_mean, err_test_std],
                       [d_star_mean, d_star_std]], 
                       columns=['mean', 'std'], index=['err_test', 'd_star']).T
print("Answer to 2:")
df_err

Answer to 2:


Unnamed: 0,err_test,d_star
mean,577.6,6.6
std,5.161395,4.498889
