In [13]:
%load_ext autoreload
%autoreload 2

In [1]:
import numpy as np


In [2]:
def load_data(f):
    """
    Load a numeric, whitespace-delimited dataset and split it into
    features and labels.

    Assumed data format (one sample per row):
    feature1 feature2 ... label

    Parameters
    ----------
    f : str, path-like or file-like
        Passed straight to ``np.genfromtxt``.

    Returns
    -------
    X : np.ndarray
        All columns except the last, shape (n_samples, n_features).
    y : np.ndarray
        The last column cast to int, shape (n_samples,).
    """
    data = np.genfromtxt(f)
    # genfromtxt collapses a file with a single row into a 1-D array;
    # promote it back to one row so the column slicing below still works.
    data = np.atleast_2d(data)
    # every column except the last is a feature
    X = data[:, :-1]
    # the last column holds the label
    y = data[:, -1].astype(int)
    return X, y

In [3]:
def standardize(X, mean, std):
    """
    Standardize a dataset feature-wise: ``(X - mean) / std``.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Samples in rows, features in columns.
    mean, std : array-like with one entry per feature
        Per-feature statistics. Flat vectors and column vectors (such as
        the ``(n_features, 1)`` arrays returned by ``get_stats``) are both
        accepted; they are flattened before use.

    Returns
    -------
    np.ndarray
        Array with the same shape as ``X``.
    """
    X = np.asarray(X)
    # Flatten so the statistics broadcast across rows whether they were
    # supplied as (d,) or (d, 1).
    mu = np.ravel(mean)
    sigma = np.ravel(std)
    # Plain broadcasting replaces the per-row Python callback and also
    # handles an empty X, which np.apply_along_axis rejects.
    return (X - mu) / sigma

def get_stats(X, y=None):
    """
    Collect summary statistics of a dataset into a dictionary.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Samples in rows, features in columns.
    y : array-like, shape (n_samples,), optional
        Class labels. When given, per-class statistics are computed too.

    Returns
    -------
    dict with keys
        'mean'    : (n_features, 1) column vector of feature means
        'std'     : (n_features, 1) column vector of feature standard
                    deviations (``np.std`` defaults)
        'cov'     : (n_features, n_features) covariance of the whole
                    dataset (``np.cov`` defaults)
        'classes' : array of unique labels, or 0 when ``y`` is None
        'classn'  : number of unique labels, or 0 when ``y`` is None
        'covs'    : {label: (n_features, n_features) covariance of that class}
        'c_means' : {label: (n_features, 1) mean of that class}
    """
    X = np.asarray(X)
    covs, c_means = {}, {}      # per-class results, keyed by label
    classes, classn = 0, 0      # placeholders used when no labels are given

    mean = np.mean(X, axis=0).reshape(-1, 1)
    std = np.std(X, axis=0).reshape(-1, 1)
    # np.cov returns a 0-d array for a single feature; keep it a (1, 1)
    # matrix so it has the same form as the per-class covariances below.
    cov = np.atleast_2d(np.cov(X.T))

    if y is not None:
        y = np.asarray(y)
        classes = np.unique(y)  # sorted unique labels
        classn = len(classes)   # the number of classes in the dataset
        for c in classes:
            arr = X[y == c]     # rows belonging to class c
            covs[c] = np.atleast_2d(np.cov(arr.T))
            c_means[c] = np.mean(arr, axis=0).reshape(-1, 1)  # mean over rows

    stats = {
        'covs': covs,
        'c_means': c_means,
        'classes': classes,
        'classn': classn,
        'mean': mean,
        'std': std,
        'cov': cov
    }
    return stats

In [4]:
# Load the Pima training and test splits (features, then label in the last column).
X_train_pima, y_train_pima = load_data('pima.tr')
X_test_pima, y_test_pima = load_data('pima.te')
print(y_test_pima.shape)

# Report the dataset shapes between two separator rules.
# NOTE(review): the two shape messages say "synth" although this cell loads
# the Pima files; the text is kept unchanged so it matches the recorded output.
separator = '-' * 20
print(separator)
print(f"Pima dataset")
print(f"The dimension of the synth training data is: {X_train_pima.shape}")
print(f"The dimension of the synth testing data is: {X_test_pima.shape}")
print(separator)

# Standardize both splits with statistics estimated on the training split only.
pima_stats = get_stats(X_train_pima, y_train_pima)
pima_mean = pima_stats['mean'].flatten()
pima_std = pima_stats['std'].flatten()
nX_train_pima = standardize(X_train_pima, pima_mean, pima_std)
nX_test_pima = standardize(X_test_pima, pima_mean, pima_std)

(332,)
--------------------
Pima dataset
The dimension of the synth training data is: (200, 7)
The dimension of the synth testing data is: (332, 7)
--------------------


In [None]:
from svm import SVM
from evaluation import *

# Build an SVM with an RBF kernel and train it on the Pima training split.
# NOTE(review): this fits on the raw X_train_pima rather than the
# standardized nX_train_pima computed earlier — confirm that is intended.
svm = SVM(
    kernel='rbf',
    C=0.1,
    max_iter=10000,
)
svm.fit(X_train_pima, y_train_pima)

starting training
starting epoch # 0
