# Kernel machines

In this notebook we will use simple two-dimensional data sets to illustrate the behavior of the support vector machine and the Perceptron, when used with quadratic and RBF kernels.

### Import

In [None]:
%matplotlib inline
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from sklearn.svm import SVC

# Use a 14pt font for the tick labels on both axes of every figure.
matplotlib.rcParams.update({'xtick.labelsize': 14, 'ytick.labelsize': 14})

### Data

 - The two-dimensional data files `data1.txt` through `data5.txt` contain one data point per line: its two coordinates followed by a label (either -1 or 1).
 - Example: the line `3 8 -1` means that the point `x=(3,8)` has label `y=-1`.

In [None]:
# Paths of the 2-D data sets. Index 0 holds a placeholder so that
# data_files[k] is the path of data<k>.txt for k = 1..5.
data_files = ['dummy0'] + ['../../_data/data{}.txt'.format(k) for k in range(1, 6)]

## Kernel SVM

 1. load one of these data sets
 2. fit an `sklearn.svm.SVC` classifier
 3. plot the data, the support vectors and the decision boundary

Parameters:
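* `datafile` is the path of one of `data1.txt` ~ `data5.txt` (or another file in the same format)
* `classifier` is an unfitted `SVC`; its settings are chosen when it is constructed:
  * `kernel` is the kernel type, e.g. `'rbf'` (a quadratic kernel is `kernel='poly', degree=2`)
  * `C` is the setting of the soft-margin parameter `C` (default: 1.0)
  * `gamma` (for the RBF kernel) is derived from the scaling parameter `s` (sigma) as `1/s**2`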

Hyperparameter __`C`__ is the cost of misclassification. Reducing `C`:
 - lowers the misclassification cost, so expect more misclassified training points
 - increases the boundary margin
 - increases bias (more misclassifications)
 - lowers variance and, as a result, overfitting

The default value of `C` is 1.0.

For RBF kernel - hyperparameter __Sigma__ (std. deviation):

- sigma sets the length scale against which the distance between x and x' is measured
- when the distance between x and x' is much larger than sigma, the kernel value tends to zero
- if sigma is very small, only training points x within a short distance of the point being predicted can affect its prediction

As for the variance and bias explanation, 
 - smaller sigma => less bias and more variance 
 - larger sigma => less variance and more bias => more smooth boundary and less overfitting

### Helper for grid pairs

In [None]:
def xy_grid(x, y, ax_pad=0, density=0.1):
    """Build a regular 2-D grid covering the padded range of `x` and `y`.

    Args:
        x: sequence of first coordinates; only its min and max are used.
        y: sequence of second coordinates; only its min and max are used.
        ax_pad: margin subtracted from each minimum and added to each maximum.
        density: spacing between neighbouring grid points along each axis.

    Returns:
        dict with keys
        'array'  -- one (x, y) pair per grid point, stacked as rows,
        'xx', 'yy' -- the `np.meshgrid` coordinate matrices,
        'xmin', 'xmax', 'ymin', 'ymax' -- the padded limits.
    """
    x_lo, x_hi = min(x) - ax_pad, max(x) + ax_pad
    y_lo, y_hi = min(y) - ax_pad, max(y) + ax_pad

    # The stop value is extended by one step so the grid reaches the upper limit.
    x_ticks = np.arange(x_lo, x_hi + density, density)
    y_ticks = np.arange(y_lo, y_hi + density, density)
    xx, yy = np.meshgrid(x_ticks, y_ticks)

    pairs = np.column_stack((xx.ravel(), yy.ravel()))
    return {
        'array': pairs,
        'xx': xx,
        'yy': yy,
        'xmin': x_lo,
        'xmax': x_hi,
        'ymin': y_lo,
        'ymax': y_hi,
    }

In [None]:
def learn_and_display(datafile, classifier):
    """Fit `classifier` on a 2-D data file and plot the points and decision values.

    Args:
        datafile: path readable by `np.loadtxt`; columns 0-1 are used as the
            point coordinates and column 2 as the label.
        classifier: estimator whose `fit(x, y)` returns a fitted model that
            exposes `support_` (indices of the support vectors) and
            `decision_function` (e.g. an `SVC`).

    Side effects:
        Draws into the current matplotlib figure and calls `plt.show()`.
    """
    samples = np.loadtxt(datafile)
    n_points, _ = samples.shape

    # Training points and their labels
    features, labels = samples[:, 0:2], samples[:, 2]

    # Train the classifier
    model = classifier.fit(features, labels)

    # Boolean mask marking the training points that are support vectors
    is_sv = np.zeros(n_points, dtype=bool)
    is_sv[model.support_] = True

    # Axis limits: data range padded by 1, evaluation grid spaced by 0.05
    grid = xy_grid(features[:, 0], features[:, 1], ax_pad=1, density=0.05)
    plt.xlim(grid['xmin'], grid['xmax'])
    plt.ylim(grid['ymin'], grid['ymax'])

    # Label 1 is drawn as red circles, label -1 as black triangles;
    # support vectors get a larger marker.
    for label, marker in ((1, 'ro'), (-1, 'k^')):
        in_class = labels == label
        plain = in_class & ~is_sv
        support = in_class & is_sv
        plt.plot(features[plain, 0], features[plain, 1], marker)
        plt.plot(features[support, 0], features[support, 1], marker, markersize=10)

    # Decision value at every grid point, reshaped back to the grid layout
    scores = model.decision_function(grid['array']).reshape(grid['xx'].shape)

    # Show boundary and margin using a color plot
    plt.pcolormesh(grid['xx'], grid['yy'], scores, cmap=plt.cm.PRGn, vmin=-2, vmax=2, alpha=.8)
    plt.contourf(grid['xx'], grid['yy'], scores, cmap=plt.cm.PRGn, vmin=-2, vmax=2, alpha=.3)
    plt.show()

### SVM hyperparameter sweeps (RBF kernel)

#### Gamma - boundary width

In [None]:
# Sweep the RBF width sigma at a fixed misclassification cost C.
# SVC takes gamma rather than sigma; it is set to 1/sigma**2 here,
# so a smaller sigma means a larger gamma.
kernel = 'rbf'
C = 20.0        # misclassification cost, held fixed during the sweep

for data in data_files[3:4]:  # a one-element slice: only data_files[3]
    for sigma in [0.1, 1.0, 10.0, 100.0, 1000.0]:
        print(C, sigma)
        learn_and_display(data, SVC(kernel=kernel, C=C, gamma=1.0 / sigma**2))

#### C - misclassification tolerance

In [None]:
# Sweep the misclassification cost C at a fixed RBF width sigma.
# SVC takes gamma rather than sigma; it is set to 1/sigma**2 here.
kernel = 'rbf'
sigma = 10      # RBF width, held fixed during the sweep

for data in data_files[3:4]:  # a one-element slice: only data_files[3]
    for C in [0.1, 1.0, 10.0, 100.0, 1000.0]:
        print(C, sigma)
        learn_and_display(data, SVC(kernel=kernel, C=C, gamma=1.0 / sigma**2))

Also try the other data sets (`data_files[1]` through `data_files[5]`), and try other values of `C` to see how that affects the boundary and margin.

### SVM with the RBF kernel

In [None]:
# Fit an RBF-kernel SVM with one fixed (C, sigma) setting on every data set.
kernel = 'rbf'
C = 100.0        # soft-margin misclassification cost
sigma = 10.0     # RBF width; passed to SVC as gamma = 1/sigma**2

for data in data_files[1:]:  # skip the placeholder at index 0
    learn_and_display(data, SVC(kernel=kernel, C=C, gamma=1 / sigma**2))

## Kernel Perceptron

- the Perceptron algorithm does not always converge

### Kernels

In [None]:
def rbf(x, y, degree=None, sigma=5.0, denominator=1):
    """RBF (Gaussian) kernel: exp(-||x - y||^2 / (denominator * sigma^2)).

    Args:
        x: a vector with the same shape as `y`, or an array whose rows are
            each compared against `y`.
        y: the reference vector.
        degree: unused; accepted so that `rbf` can be called with the same
            positional arguments as `poly`.
        sigma: kernel width.
        denominator: factor multiplying sigma^2 (1 in the lecture notes,
            2 in the Wikipedia definition).

    Returns:
        A scalar when `x` and `y` have the same shape, otherwise an array
        holding one kernel value per row of `x`.
    """
    scale = denominator * (sigma**2)

    # 1. vector-vector distance
    if x.shape == y.shape:
        return np.exp(-np.linalg.norm(x - y)**2 / scale)

    # 2. matrix-vector distances, computed for all rows at once.
    # Reducing over every axis except the first yields one squared distance
    # per row of x.
    diff = x - y
    sq_dist = np.sum(diff * diff, axis=tuple(range(1, diff.ndim)))
    return np.exp(-sq_dist / scale)

In [None]:
def poly(x, y, degree=2, sigma=None):
    """Polynomial kernel (1 + x.y)**degree.

    `sigma` is not used; it is accepted so that `poly` takes the same
    arguments as `rbf`.
    """
    shifted_dot = 1 + x.dot(y)
    return shifted_dot**degree

In [None]:
def train_kernel(x, y, kernel, degree, sigma, n_iters=1000):
    """Train a kernel Perceptron on the points `x` with labels `y`.

    Args:
        x: 2-D array with one training point per row.
        y: array of labels, one per row of `x`. The update rule multiplies the
            label by the decision value, so labels are presumably -1/+1.
        kernel: callable invoked as `kernel(x[i], x[j], degree, sigma)`.
        degree: forwarded to `kernel`.
        sigma: forwarded to `kernel`.
        n_iters: maximum number of passes over the training set.

    Returns:
        (alpha, b, converged): `alpha[i]` counts the updates made on point i,
        `b` is the accumulated bias, and `converged` is True when the last
        pass in which a mistake occurred was not the final allowed pass.
    """
    n, _ = x.shape

    # nxn kernel similarity matrix
    K = np.zeros((n, n))
    for i in range(n):
        for j in range(n):
            K[i, j] = kernel(x[i], x[j], degree, sigma)

    # A private generator seeded with 0 yields the same permutations as
    # np.random.seed(0) would, without reseeding NumPy's global generator.
    rng = np.random.RandomState(0)

    convergence = 0
    alpha, b = np.zeros((n,)), 0
    for itr in range(n_iters):
        mistakes = 0
        for idx in rng.permutation(n):
            # NOTE(review): b is added to every term inside the sum, so it
            # contributes n*b to the decision value. The prediction code in
            # this file evaluates the same expression, so it is kept as is.
            if y[idx] * np.sum(alpha * y * K[:, idx] + b) <= 0:
                alpha[idx] += 1
                b = b + y[idx]
                mistakes += 1
        if mistakes == 0:
            # alpha and b did not change during this pass, so every later
            # pass would be mistake-free as well: stop early.
            break
        convergence = itr + 1

    print("kernel:{}, degree:{}, sigma:{}, {}/{} iterations for convergence".format(
        kernel.__name__, degree, sigma, convergence, n_iters))

    return alpha, b, convergence < n_iters

### Learn and plot kernel perceptrons

In [None]:
import ipywidgets as widgets
from IPython.display import display
from ipywidgets import interact, interactive, fixed, interact_manual, IntSlider

In [None]:
@interact_manual(datafile=(1, 5), 
                 kernel={'rbf':rbf, 'poly':poly}, 
                 degree=(1, 4, 1), 
                 sigma=widgets.FloatLogSlider(value=1, base=10, min=-3, max=3, step=1, description='sigma'),
                 iterations=(100, 1000, 100),
                 density=widgets.FloatSlider(value=.1, min=0.05, max=0.25, step=.05, description='plot density'))
def interactive_perceptron(datafile, kernel, degree, sigma, iterations, density):
    """Train a kernel Perceptron on one data set and plot its predicted regions.

    Args:
        datafile: index into `data_files` of the file to load.
        kernel: kernel function, called as `kernel(x, pt, degree, sigma)`.
        degree: forwarded to `train_kernel` and to `kernel`.
        sigma: forwarded to `train_kernel` and to `kernel`.
        iterations: maximum number of training passes.
        density: spacing of the grid on which the classifier is evaluated.

    Prints 'NOT CONVERGED' and returns without plotting when training did
    not converge.
    """
    # Training points (columns 0-1) and labels (column 2)
    samples = np.loadtxt(data_files[datafile])
    x, y = samples[:, 0:2], samples[:, 2]

    # Train a perceptron; nothing is plotted unless it converged
    alpha, b, converged = train_kernel(x, y, kernel, degree, sigma, iterations)
    if not converged:
        print('NOT CONVERGED')
        return

    # Axis limits: data range padded by 1.5 on every side
    pad = 1.5
    first, second = x[:, 0], x[:, 1]
    x1min, x1max = min(first) - pad, max(first) + pad
    x2min, x2max = min(second) - pad, max(second) + pad
    plt.xlim(x1min, x1max)
    plt.ylim(x2min, x2max)

    # Label 1 as red circles, label -1 as black triangles
    positive, negative = (y == 1), (y == -1)
    plt.plot(x[positive, 0], x[positive, 1], 'ro')
    plt.plot(x[negative, 0], x[negative, 1], 'k^')

    # Grid of evaluation points
    xx1, xx2 = np.meshgrid(np.arange(x1min, x1max, density),
                           np.arange(x2min, x2max, density))
    grid = np.c_[xx1.ravel(), xx2.ravel()]

    # Sign of the decision value at every grid point. b is added to every
    # term of the sum, the same expression train_kernel uses.
    scores = [sum(y * alpha * kernel(x, pt, degree, sigma) + b) for pt in grid]
    Z = np.sign(scores).reshape(xx1.shape)

    # Filled contours of the predicted sign
    plt.contourf(xx1, xx2, Z, cmap=plt.cm.PRGn, vmin=-2, vmax=2, alpha=.8)

    plt.show()

In [None]:
def learn_and_display_Perceptron(datafile, **kwargs):
    """Train a kernel Perceptron on `datafile` and plot its predicted regions.

    Args:
        datafile: path readable by `np.loadtxt`; columns 0-1 are used as the
            point coordinates and column 2 as the label.

    Keyword Args:
        kernel: kernel function, called as `kernel(x, pt, degree, sigma)`.
        degree: forwarded to `train_kernel` and to `kernel`.
        sigma: forwarded to `train_kernel` and to `kernel`.
        n_iters: maximum number of training passes (default 1000). The
            spelling `n_iter` is accepted as an alias.

    Raises:
        KeyError: if `kernel`, `degree` or `sigma` is missing.
        TypeError: if an unknown keyword argument is passed.

    Prints 'NOT CONVERGED' and returns without plotting when training did
    not converge.
    """
    # Read the options by name instead of relying on the order in which the
    # caller happened to write the keyword arguments.
    options = dict(kwargs)
    kernel = options.pop('kernel')
    degree = options.pop('degree')
    sigma = options.pop('sigma')
    n_iters = options.pop('n_iters', options.pop('n_iter', 1000))
    if options:
        raise TypeError('unexpected keyword arguments: {}'.format(sorted(options)))

    # Create training set x and labels y
    data = np.loadtxt(datafile)
    x = data[:, 0:2]
    y = data[:, 2]

    # Train a perceptron
    alpha, b, converged = train_kernel(x, y, kernel, degree, sigma, n_iters)

    # Plot if converged else stop
    if not converged:
        print('NOT CONVERGED')
        return

    # Determine the x1- and x2- limits of the plot
    ax_pad = 1.5
    x1min, x1max = min(x[:, 0]) - ax_pad, max(x[:, 0]) + ax_pad
    x2min, x2max = min(x[:, 1]) - ax_pad, max(x[:, 1]) + ax_pad
    plt.xlim(x1min, x1max)
    plt.ylim(x2min, x2max)

    # Plot the data points: label 1 as red circles, label -1 as black triangles
    plt.plot(x[(y == 1), 0], x[(y == 1), 1], 'ro')
    plt.plot(x[(y == -1), 0], x[(y == -1), 1], 'k^')

    # Construct a grid of points and evaluate the classifier at each of them
    density = 0.1  # grid spacing
    xx1, xx2 = np.meshgrid(np.arange(x1min, x1max + density, density),
                           np.arange(x2min, x2max + density, density))
    grid = np.c_[xx1.ravel(), xx2.ravel()]

    # Predict (Z). The per-point weights do not depend on the grid point, so
    # they are computed once. b is added to every term of the sum, the same
    # expression train_kernel uses.
    weights = y * alpha
    Z = np.sign([np.sum(weights * kernel(x, pt, degree, sigma) + b) for pt in grid])
    Z = Z.reshape(xx1.shape)

    # Filled contours of the predicted sign
    plt.contourf(xx1, xx2, Z, cmap=plt.cm.PRGn, vmin=-2, vmax=2, alpha=.8)

    plt.show()

#### Loop through hyperparameters

In [None]:

# Run both kernels on every data set for four paired (degree, sigma) settings.
# Each kernel uses only one value of the pair: rbf ignores degree and poly
# ignores sigma.
for data in data_files[1:]:
    for degree, sigma in zip([1, 2, 3, 4], [0.2, 1, 5, 25]):
        for kernel in [rbf, poly]:
            print(data)
            learn_and_display_Perceptron(data, kernel=kernel, degree=degree, sigma=sigma, n_iters=1000)

### Test distance metrics

#### Vector length and distances

In [None]:
# Print every intermediate result: a notebook cell only displays the value of
# its last expression, so bare expressions above it would be discarded.
print('norm([2, 2, 2]) =', np.linalg.norm(np.array([2, 2, 2])))
print('norm([1, 1, 1]) =', np.linalg.norm(np.array([1, 1, 1])))
print('norm(1)         =', np.linalg.norm(1))
print('1 - [2, 2, 2]   =', 1 - np.array([2, 2, 2]))
np.linalg.norm(1 - np.array([2, 2, 2]))  # broadcasting

#### Similarity/distance from vector to matrix

- a value of 1 means the two vectors are identical; the value falls towards 0 as they move apart

sigma is:
- as usually defined in a Gaussian Distribution, the standard deviation
- radius around support vectors
- [interactive demo](https://cs.stanford.edu/people/karpathy/svmjs/demo/)

In [None]:
def rbf_(x, y, sigma=3, denominator=2):
    """RBF similarity exp(-||x - y||^2 / (denominator * sigma^2)).

    Args:
        x: a vector with the same shape as `y`, or an array whose rows are
            each compared against `y`.
        y: the reference vector.
        sigma: kernel width.
        denominator: factor multiplying sigma^2.

    Returns:
        An unrounded scalar when `x` and `y` have the same shape; otherwise
        an array with one similarity per row of `x`, rounded to 3 decimals.
    """
    scale = denominator * (sigma**2)

    if x.shape == y.shape:
        return np.exp(-np.linalg.norm(x - y)**2 / scale)

    # All rows at once: reducing over every axis except the first yields one
    # squared distance per row of x.
    diff = x - y
    sq_dist = np.sum(diff * diff, axis=tuple(range(1, diff.ndim)))
    return np.exp(-sq_dist / scale).round(3)

In [None]:
# Similarity of the first ten points of the last data set to its third point,
# for increasing kernel widths.
data = np.loadtxt(data_files[5])
x, y = data[:, :2], data[:, 2]

for sigma in [0.3, 0.5, 1, 2, 4]:
    # Stored under its own name: assigning to `rbf` would replace the global
    # `rbf` kernel function used by the Perceptron cells.
    similarity = rbf_(x[:10], x[2], sigma)
    print('sigma: {:<3}   vector similarity: {}\t mean similarity: {:.3f}'.format(sigma, similarity, similarity.mean(0)))

In [None]:
# 