In [21]:
import os, sys, inspect
current_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parent_dir = os.path.dirname(current_dir)
sys.path.insert(0, parent_dir) 
from knn_regression_cf import *
from f1_measure_cf import *

from collections import defaultdict

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

import math

# Prepare data

In [22]:
geyser_dataset_path = '/home/artem/Documents/ITMO/Master/ML/Datasets/geyser.csv'
chips_dataset_path = '/home/artem/Documents/ITMO/Master/ML/Datasets/chips.csv'
dataset_columns = ['x', 'y', 'class']

geyser_dataset = pd.read_csv(geyser_dataset_path, sep=',', header=0, names=dataset_columns)
chips_dataset = pd.read_csv(chips_dataset_path, sep=',', header=0, names=dataset_columns)

### Geyser dataset

In [23]:
geyser_dataset.head()

Unnamed: 0,x,y,class
0,1,4.4,N
1,1,3.9,N
2,1,4.0,P
3,1,4.0,N
4,1,3.5,N


In [24]:
geyser_dataset.dtypes

x          int64
y        float64
class     object
dtype: object

In [25]:
geyser_dataset.describe()

Unnamed: 0,x,y
count,222.0,222.0
mean,12.297297,3.576126
std,7.862615,1.08395
min,1.0,1.7
25%,5.0,2.3
50%,16.0,4.0
75%,20.0,4.4
max,23.0,5.2


In [26]:
geyser_ds_targets = geyser_dataset['class'].unique()
geyser_ds_digit_targets = [-1, 1]

In [27]:
geyser_ds_x = geyser_dataset.drop('class', axis='columns')
geyser_ds_y = geyser_dataset['class']

In [28]:
geyser_ds_y_digits = geyser_ds_y.replace(geyser_ds_targets, geyser_ds_digit_targets)
geyser_ds_y_digits.describe()

count    222.000000
mean      -0.207207
std        0.980508
min       -1.000000
25%       -1.000000
50%       -1.000000
75%        1.000000
max        1.000000
Name: class, dtype: float64

In [29]:
geyser_ds_x_offset = geyser_ds_x.mean()
geyser_ds_x_scale = geyser_ds_x.std()
geyser_ds_x_norm = (geyser_ds_x - geyser_ds_x_offset) / geyser_ds_x_scale
geyser_ds_x_norm.describe()

Unnamed: 0,x,y
count,222.0,222.0
mean,0.0,-1.056212e-15
std,1.0,1.0
min,-1.436837,-1.730823
25%,-0.928101,-1.177292
50%,0.470925,0.3910456
75%,0.979662,0.7600662
max,1.361214,1.498108


In [30]:
geyser_ds_x_norm.isna().values.any()

False

### Chips dataset

In [31]:
chips_dataset.head()

Unnamed: 0,x,y,class
0,0.051267,0.69956,P
1,-0.092742,0.68494,P
2,-0.21371,0.69225,P
3,-0.375,0.50219,P
4,-0.51325,0.46564,P


In [32]:
chips_dataset.dtypes

x        float64
y        float64
class     object
dtype: object

In [33]:
chips_dataset.describe()

Unnamed: 0,x,y
count,118.0,118.0
mean,0.054779,0.183102
std,0.496654,0.519743
min,-0.83007,-0.76974
25%,-0.37212,-0.254385
50%,-0.006336,0.213455
75%,0.47897,0.646562
max,1.0709,1.1089


In [34]:
chips_ds_targets = chips_dataset['class'].unique()
chips_ds_digit_targets = [-1, 1]

In [35]:
chips_ds_x = chips_dataset.drop('class', axis='columns')
chips_ds_y = chips_dataset['class']

In [36]:
chips_ds_y_digits = chips_ds_y.replace(chips_ds_targets, chips_ds_digit_targets)
chips_ds_y_digits.describe()

count    118.000000
mean       0.016949
std        1.004120
min       -1.000000
25%       -1.000000
50%        1.000000
75%        1.000000
max        1.000000
Name: class, dtype: float64

In [37]:
chips_ds_x_offset = chips_ds_x.mean()
chips_ds_x_scale = chips_ds_x.std()
chips_ds_x_norm = (chips_ds_x - chips_ds_x_offset) / chips_ds_x_scale
chips_ds_x_norm.describe()

Unnamed: 0,x,y
count,118.0,118.0
mean,2.258081e-17,5.927462e-17
std,1.0,1.0
min,-1.781621,-1.833294
25%,-0.8595503,-0.8417365
50%,-0.1230541,0.05840087
75%,0.8540982,0.8917119
max,2.045934,1.781262


In [38]:
chips_ds_x_norm.isna().values.any()

False

# Train SVM

In [75]:
def train_svm(x_data, y_data, kernel_f, alphas, weights, w0, c=1.0):
    for i in range(1, len(x_data), 1):
        # current step data
        x1, x2 = x_data[i-1], x_data[i]
        y1, y2 = y_data[i-1], y_data[i]
        a1, a2 = alphas[i-1], alphas[i]
        
        # errors and derivative on current step
        error1 = calculate_error(predict(x1, x_data, y_data, kernel_f, alphas, w0), y1)
        error2 = calculate_error(predict(x2, x_data, y_data, kernel_f, alphas, w0), y2)
        d = calculate_derivative(x1, x2, kernel_f)
        
        # bounds
        l, h = calculate_bounds(a1, a2, c, y1, y2)
        
        # update alpha2
        a2_new = a2 + y2 * (error1 - error2) / (d + 1e-5)
        a2_new_clipped = max(l, min(h, a2_new))
        
        # update alpha1
        a1_new = a1 + y1 * y2 * (a2 - a2_new_clipped)
        
        # store new alphas
        alphas[i-1], alphas[i] = a1_new, a2_new_clipped
        
        # update weights
        weights = (alphas * y_data * x_data).sum(axis=0)
        for x, y, alpha in zip(x_data, y_data, alphas):
            if alpha > 0:
                w0 = x.dot(weights.T) - y
                break
    
    
def predict(x_t, x_data, y_data, kernel_f, alphas, w0):
    p = 0
    for x, y, a in zip(x_data, y_data, alphas):
        p += y * a * kernel_f(x_t, x) - w0
    return p


def calculate_error(p, y):
    return p - y
    
    
def calculate_derivative(x1, x2, kernel_f):
    return kernel_f(x1, x1) + kernel_f(x2, x2) - 2 * kernel_f(x1, x2)
    

def calculate_bounds(a1, a2, c, y1, y2):
    if y1 != y2:
        l = max(0, a2 - a1)
        h = min(c, c + a2 - a1)
    else:
        l = max(0, a2 + a1 - c)
        h = min(c, a2 + a1)
    return l, h

#### Kernel functions

In [76]:
def linear_kernel(x1, x2):
    return x1.dot(x2)


def polynomial_kernel(x1, x2, power=1):
    return (x1.dot(x2) + 1) * power


def gaussian_radial_kernel(x1, x2, phi=1):
    x_diff = x1 - x2
    return math.exp(-phi * x_diff.dot(x_diff))

#### Loss functions

In [77]:
def calculate_accuracy(y_predicted, y_test):
    return (y_predicted * y_test > 0).sum() / len(y_test)

### Geyser dataset

In [89]:
def examine_linear_kernel(ds_x, ds_y, c):
    weights = np.random.randn(geyser_ds_x_norm.shape[1])
    w0 = 0

    k_folds = 10
    ds_size = len(geyser_ds_x_norm)
    fold_size = int(ds_size / k_folds)

    test_errors = []

    for i in range(0, len(geyser_ds_x_norm), fold_size):
        test_indices = list(range(i, min(ds_size, i + fold_size), 1))

        x_train = geyser_ds_x_norm.drop(test_indices, axis='index').values
        y_train = geyser_ds_y_digits.drop(test_indices, axis='index').values.reshape(-1, 1)
        x_test = geyser_ds_x_norm.iloc[test_indices].values
        y_test = geyser_ds_y_digits.iloc[test_indices].values.reshape(-1, 1)

        alphas = np.random.randn(len(x_train)).reshape(-1, 1)
        train_svm(x_train, y_train, linear_kernel, alphas, weights, w0, c)

        y_ps = []
        for x_t in x_test:
            y_p = predict(x_t, x_train, y_train, linear_kernel, alphas, w0)
            y_ps.append(y_p)

        test_errors.append(calculate_accuracy(np.array(y_ps), y_test))

    print('C: {} | Mean error: {}'.format(c, np.array(test_errors).mean()))
    
    
cs = [0.05, 0.1, 0.5, 1.0, 5.0, 10.0, 50.0, 100.0]
for c in cs:
    examine_linear_kernel(geyser_ds_x_norm, geyser_ds_y_digits, c)

C: 0.05 | Mean error: 0.8884297520661156
C: 0.1 | Mean error: 0.8760330578512395
C: 0.5 | Mean error: 0.8801652892561983
C: 1.0 | Mean error: 0.884297520661157
C: 5.0 | Mean error: 0.8884297520661157
C: 10.0 | Mean error: 0.8760330578512397
C: 50.0 | Mean error: 0.847107438016529
C: 100.0 | Mean error: 0.7892561983471075


In [84]:
weights = np.random.randn(geyser_ds_x_norm.shape[1])
w0 = 0

c = 0.1

k_folds = 10
ds_size = len(geyser_ds_x_norm)
fold_size = int(ds_size / k_folds)

test_errors = []

for i in range(0, len(geyser_ds_x_norm), fold_size):
    test_indices = list(range(i, min(ds_size, i + fold_size), 1))
    
    x_train = geyser_ds_x_norm.drop(test_indices, axis='index').values
    y_train = geyser_ds_y_digits.drop(test_indices, axis='index').values.reshape(-1, 1)
    x_test = geyser_ds_x_norm.iloc[test_indices].values
    y_test = geyser_ds_y_digits.iloc[test_indices].values.reshape(-1, 1)
    
    alphas = np.random.randn(len(x_train)).reshape(-1, 1)
    train_svm(x_train, y_train, linear_kernel, alphas, weights, w0, c)
    
    y_ps = []
    for x_t in x_test:
        y_p = predict(x_t, x_train, y_train, linear_kernel, alphas, w0)
        y_ps.append(y_p)
        
    test_errors.append(calculate_accuracy(np.array(y_ps), y_test))
    
print('Mean error: {}'.format(np.array(test_errors).mean()))

Mean error: 0.884297520661157
