- [Lab: 9.6.1 Support Vector Classifier](#9.6.1-Support-Vector-Classifier)
- [Lab: 9.6.2 Support Vector Machine](#9.6.2-Support-Vector-Machine)
- [Lab: 9.6.3 ROC Curves](#9.6.3-ROC-Curves)
- [Lab: 9.6.4 SVM with Multiple Classes](#9.6.4-SVM-with-Multiple-Classes)
- [Lab: 9.6.5 Application to Gene Expression Data](#9.6.5-Application-to-Gene-Expression-Data)

# Chapter 9 - Support Vector Machines

In [None]:
# %load ../standard_import.txt
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import label_binarize
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC, LinearSVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import confusion_matrix, roc_curve, auc, classification_report
 
%matplotlib inline
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old names; use whichever name is installed.
plt.style.use('seaborn-v0_8-white' if 'seaborn-v0_8-white' in plt.style.available
              else 'seaborn-white')

### Helper functions

In [None]:
def confusion_df(clf, y_true, y_pred):
    """Return the confusion matrix as a labelled DataFrame.

    The matrix from ``confusion_matrix(y_true, y_pred)`` is transposed, so
    rows are the predicted classes and columns are the true classes.

    Parameters
    ----------
    clf : object
        Classifier whose ``classes_`` attribute, when available and of
        matching length, is used to label both axes.
    y_true : array-like
        True class labels.
    y_pred : array-like
        Predicted class labels.

    Returns
    -------
    pandas.DataFrame
        Transposed confusion matrix with the index named ``'Predicted'``
        and the columns named ``'True'``.
    """
    cm = confusion_matrix(y_true, y_pred)
    try:
        cm_df = pd.DataFrame(cm.T, index=clf.classes_, columns=clf.classes_)
    except (AttributeError, ValueError):
        # ``clf`` has no ``classes_`` or its length does not match the matrix
        # shape: fall back to default positional labels.
        cm_df = pd.DataFrame(cm.T)
    cm_df.index.name = 'Predicted'
    cm_df.columns.name = 'True'
    return cm_df

## Kernel SVM

Hyperparameter __`C`__ is the cost of misclassification. Reducing `C`:
 - means less misclassification cost, so expect more misclassifications
 - increases the boundary margin
 - increases bias (misclassifications)
 - lowers variance and, as a result, overfitting

The default value for parameter `C` is 1.0.
 
For RBF kernel - hyperparameter __Sigma__ (std. deviation):

- sigma acts as a scaling factor (amplifier) for the distance between x and x'
- when the distance between x and x' is much larger than sigma, the kernel function tends to zero.
- if sigma is very small, only the points x within a certain distance can affect the prediction at a given point.

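As for the variance and bias explanation, 
 - smaller sigma => less bias and more variance 
 - larger sigma => less variance and more bias => more smooth boundary and less overfitting

Note: scikit-learn parameterizes the RBF kernel with `gamma` instead of sigma. Writing the kernel as exp(-||x - x'||^2 / (2 sigma^2)) = exp(-gamma ||x - x'||^2) gives gamma = 1 / (2 sigma^2), so a larger `gamma` corresponds to a smaller sigma (a more flexible model).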

## LAB

### 9.6.1 Support Vector Classifier

Define a function to plot a classifier with support vectors.

In [None]:
def plot_svc(svc, X, y, h=0.02, pad=0.25):
    """Plot a fitted classifier's decision regions, data and support vectors.

    Parameters
    ----------
    svc : fitted classifier
        Must provide ``predict``, ``support_vectors_`` and ``support_``.
    X : array
        Observations; columns 0 and 1 (accessed as ``X[:, 0]`` and
        ``X[:, 1]``) are plotted on the horizontal and vertical axes.
    y : array-like
        Values used to colour the observations.
    h : float, optional
        Step of the grid on which the classifier is evaluated.
    pad : float, optional
        Margin added around the data range on both axes.

    Returns
    -------
    None
        Shows the figure and prints the number of support vectors.
    """
    x_min, x_max = X[:, 0].min()-pad, X[:, 0].max()+pad
    y_min, y_max = X[:, 1].min()-pad, X[:, 1].max()+pad

    # Classify every node of a regular grid covering the padded data range
    # and draw the predicted classes as filled regions.
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    Z = svc.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    plt.contourf(xx, yy, Z, cmap=plt.cm.Paired, alpha=0.3)

    plt.scatter(X[:,0], X[:,1], s=70, c=y, cmap=plt.cm.Paired)

    # Support vectors are marked in the plot with black '+' symbols.
    # ``linewidths`` is numeric (the original passed the string '1').
    sv = svc.support_vectors_
    plt.scatter(sv[:,0], sv[:,1], c='k', marker='+', s=100, linewidths=1)
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.show()
    print('Number of support vectors: ', svc.support_.size)

### Synthetic data

In [None]:
# Simulate 20 observations of 2 features, the first 10 labelled 1 and the
# last 10 labelled -1.
np.random.seed(5)
X = np.random.randn(20, 2)
y = np.repeat([1, -1], 10)

# Move the -1 class by +1 along both features.
X[y == -1] += 1

plt.scatter(X[:, 0], X[:, 1], s=70, c=y, cmap=plt.cm.Paired)
plt.xlabel('X1')
plt.ylabel('X2');

### Support Vector Classifier with linear kernel

In [None]:
svc = SVC(C= 1.0, kernel='linear')
svc.fit(X, y)

plot_svc(svc, X, y)

In [None]:
# When using a smaller cost parameter (C=0.1) the margin is wider, resulting in more support vectors.
svc2 = SVC(C=0.1, kernel='linear')
svc2.fit(X, y)
plot_svc(svc2, X, y)

### Select the optimal C parameter by cross-validation

In [None]:
# Candidate cost values, compared by 10-fold cross-validated accuracy.
tuned_parameters = [{'C': [0.001, 0.01, 0.1, 1, 5, 10, 100]}]
clf = GridSearchCV(
    SVC(kernel='linear'),
    tuned_parameters,
    cv=10,
    scoring='accuracy',
)
clf.fit(X, y)

In [None]:
# (params, mean CV score) pairs, best score first.
sorted(zip(clf.cv_results_['params'], clf.cv_results_['mean_test_score']),
       key=lambda pair: -pair[1])

#### Best hyperparameter (C)

In [None]:
clf.best_params_

### Synthetic test data

In [None]:
# Simulate 20 test observations with randomly drawn labels from {-1, 1}.
np.random.seed(1)
X_test = np.random.randn(20, 2)
y_test = np.random.choice([-1, 1], 20)
X_test[y_test == 1] -= 1  # shift X to make more separable

plt.scatter(X_test[:, 0], X_test[:, 1], s=70, c=y_test, cmap=plt.cm.Paired)
plt.xlabel('X1')
plt.ylabel('X2');

### Predict

In [None]:
# svc2 : C = 0.1
y_pred = svc2.predict(X_test)
confusion_df(svc2, y_test, y_pred)

In [None]:
svc3 = SVC(C=0.001, kernel='linear').fit(X, y)

In [None]:
# svc3 : C = 0.001
y_pred = svc3.predict(X_test)
confusion_df(svc3, y_test, y_pred)

In [None]:
# Changing the test data so that the classes are really separable with a hyperplane.
# X_test is modified in place, so re-running this cell shifts class 1 again.
X_test[y_test == 1] -= 1

plt.scatter(X_test[:, 0], X_test[:, 1], s=70, c=y_test, cmap=plt.cm.Paired)
plt.xlabel('X1')
plt.ylabel('X2');

In [None]:
svc4 = SVC(C=10.0, kernel='linear').fit(X_test, y_test)

In [None]:
plot_svc(svc4, X_test, y_test)

In [None]:
# C: 10.0 => 1.0, (less misclassification cost) increases the margin:
#  Now there is one misclassification: increased bias, lower variance.
svc5 = SVC(C=1.0, kernel='linear').fit(X_test, y_test)

In [None]:
plot_svc(svc5, X_test, y_test)

### 9.6.2 Support Vector Machine 

### Synthetic test data

In [None]:
# Simulate 200 observations of 2 features: rows 0-99 are shifted by +2 and
# rows 100-149 by -2 (together labelled -1); rows 150-199 are left unshifted
# (labelled 1).
np.random.seed(8)
X = np.random.randn(200,2)
X[:100] = X[:100] +2
# The slice starts at 100 (not 101) so that row 100 is shifted like the rest
# of its group; Python slices are 0-based.
X[100:150] = X[100:150] -2
y = np.concatenate([np.repeat(-1, 150), np.repeat(1,50)])

# Hold out half of the observations as a test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=2)

plt.scatter(X[:,0], X[:,1], s=70, c=y, cmap=plt.cm.Paired)
plt.xlabel('X1')
plt.ylabel('X2');

In [None]:
svm = SVC(C=1.0, kernel='rbf', gamma=1.0).fit(X_train, y_train)

In [None]:
plot_svc(svm, X_train, y_train)

In [None]:
# Increasing C parameter, allowing more flexibility
svm2 = SVC(C=100, kernel='rbf', gamma=1.0).fit(X_train, y_train)

In [None]:
plot_svc(svm2, X_train, y_train)

### Set the parameters by cross-validation

In [None]:
# Grid of cost and kernel-width values, compared by 10-fold CV accuracy
# on the training split.
tuned_parameters = [{
    'C': [0.01, 0.1, 1, 10, 100],
    'gamma': [0.5, 1, 2, 3, 4],
}]

clf = GridSearchCV(
    SVC(kernel='rbf'),
    tuned_parameters,
    cv=10,
    scoring='accuracy',
    return_train_score=True,
).fit(X_train, y_train)

#### Sorted list of CV scores and hyperparameters

In [None]:
# (params, mean CV score) pairs, best score first.
sorted(zip(clf.cv_results_['params'], clf.cv_results_['mean_test_score']),
       key=lambda pair: -pair[1])

### Best parameters - estimator

In [None]:
clf.best_params_

In [None]:
clf.best_estimator_

In [None]:
confusion_matrix(y_test, clf.best_estimator_.predict(X_test))

#### Test score

In [None]:
clf.best_estimator_.score(X_test, y_test)

### 9.6.3 ROC Curves

Comparing the ROC curves of two models on train/test data. One model is more flexible than the other.  
NOTE: multiclass format is not supported

In [None]:
svm3 = SVC(C=1, kernel='rbf', gamma=2).fit(X_train, y_train)

In [None]:
# More flexible model
svm4 = SVC(C=1, kernel='rbf', gamma=50).fit(X_train, y_train)

### Decision function - train set

In [None]:
# svm3 on the training set: decision scores -> ROC curve -> area under it.
y_train_score3 = svm3.decision_function(X_train)
false_pos_rate3, true_pos_rate3, _ = roc_curve(y_train, y_train_score3)
roc_auc3 = auc(false_pos_rate3, true_pos_rate3)

# Same steps for svm4.
y_train_score4 = svm4.decision_function(X_train)
false_pos_rate4, true_pos_rate4, _ = roc_curve(y_train, y_train_score4)
roc_auc4 = auc(false_pos_rate4, true_pos_rate4)

### Decision function - test set

In [None]:
# svm3 on the test set: decision scores -> ROC curve -> area under it.
y_test_score3_t = svm3.decision_function(X_test)
false_pos_rate3_t, true_pos_rate3_t, _ = roc_curve(y_test, y_test_score3_t)
roc_auc3_t = auc(false_pos_rate3_t, true_pos_rate3_t)

# Same steps for svm4.
y_test_score4_t = svm4.decision_function(X_test)
false_pos_rate4_t, true_pos_rate4_t, _ = roc_curve(y_test, y_test_score4_t)
roc_auc4_t = auc(false_pos_rate4_t, true_pos_rate4_t)

### Visualise

In [None]:
def label(g, area):
    """Build the legend entry for a ROC curve from gamma ``g`` and its AUC ``area``."""
    # Raw string: '\g' is not a valid escape sequence in a normal string
    # literal. The parameter is named ``area`` so that it does not shadow
    # the imported ``auc`` function.
    return r'SVM $\gamma = {}$ ROC curve (area = {:.2f})'.format(g, area)


# Left panel: training data, right panel: test data.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))
ax1.plot(false_pos_rate3, true_pos_rate3, label=label(2, roc_auc3), color='b')
ax1.plot(false_pos_rate4, true_pos_rate4, label=label(50, roc_auc4), color='r')
ax1.set_title('Training Data')

ax2.plot(false_pos_rate3_t, true_pos_rate3_t, label=label(2, roc_auc3_t), color='b')
ax2.plot(false_pos_rate4_t, true_pos_rate4_t, label=label(50, roc_auc4_t), color='r')
ax2.set_title('Test Data')

# Shared decoration: diagonal reference line, axis limits, labels and legend.
for ax in fig.axes:
    ax.plot([0, 1], [0, 1], 'k--')
    ax.set_xlim([-0.05, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.legend(loc="lower right")

As expected, the more flexible model scores better on training data but worse on the test data.

### 9.6.4 SVM with Multiple Classes

#### Multiclass synthetic data set

In [None]:
# Append 50 new observations as a third class (label 0) and move them
# by +4 along both features.
np.random.seed(8)
XX = np.vstack([X, np.random.randn(50, 2)])
yy = np.hstack([y, np.repeat(0, 50)])
XX[yy == 0] += 4

plt.scatter(XX[:, 0], XX[:, 1], s=70, c=yy, cmap=plt.cm.prism)
plt.xlabel('XX1')
plt.ylabel('XX2');

In [None]:
svm5 = SVC(C=1, kernel='rbf').fit(XX, yy)

In [None]:
plot_svc(svm5, XX, yy)

### 9.6.5 Application to Gene Expression Data

#### Data

In [None]:
!find ../../_data | grep -i khan_

In [None]:
# Load the Khan train/test splits. The 'Unnamed: 0' column is dropped from
# every file (presumably the row index written by the CSV export - confirm
# against the data files). The label frames are flattened to 1-D arrays.
# ``.values`` replaces ``DataFrame.as_matrix()``, which was removed in pandas 1.0.
X_train = pd.read_csv('../../_data/Khan_xtrain.csv').drop('Unnamed: 0', axis=1)
y_train = pd.read_csv('../../_data/Khan_ytrain.csv').drop('Unnamed: 0', axis=1).values.ravel()
X_test = pd.read_csv('../../_data/Khan_xtest.csv').drop('Unnamed: 0', axis=1)
y_test = pd.read_csv('../../_data/Khan_ytest.csv').drop('Unnamed: 0', axis=1).values.ravel()

In [None]:
# y_train counts
pd.Series(y_train).value_counts(sort=False)

In [None]:
# y_test counts
pd.Series(y_test).value_counts(sort=False)

In [None]:
# This model gives identical results to the svm() of the R package e1071, also based on libsvm library.
svc = SVC(kernel='linear').fit(X_train, y_train)

In [None]:
confusion_df(svc, y_train, svc.predict(X_train))

In [None]:
confusion_df(svc, y_test, svc.predict(X_test))