In [16]:
# Support Vector Machines

# SVMs can be used for linear or nonlinear classification,
# regression, and outlier detection

# Linear SVM
# creates a decision boundary with largest possible margin,
# margin is distance from line to nearest point of class
# having a margin allows for new instances to be classified better
# than if the separating hyperplane had a smaller margin
# svms are sensitive to feature scaling, use some scaler on data
# finds the widest possible street (margin) between the classes; the
# instances located on the edge of the street are called support vectors
# adding more training instances off the street will not affect the
# decision boundary at all; only instances on the edge of (or inside) the street can

In [17]:
# Soft Margin Classification

# hard margin classification is when all instances are off of 
# the street and are on the right side
# issues: will only work if data is linearly separable
# it is sensitive to outliers

# we can avoid these issues with a more flexible model
# find a good balance between keeping the margin wide and limiting
# margin violations. this is soft margin classification

# svm in scikit, we can specify hyperparams
# C value is how much we want to avoid misclassifying each example
# higher could potentially overfit

# we want fewer margin violations, but keep in mind bias/variance
# tradeoff in order to generalize better

# load iris, scale features, train linear SVM

import numpy as np
from sklearn import datasets
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC

iris = datasets.load_iris()

# Features: keep only columns 2 and 3 of the data matrix.
X = iris.data[:, [2, 3]]  # petal length and width

# Binary target: 1.0 where the class label equals 2 (Iris virginica), else 0.0.
y = (iris.target == 2).astype(np.float64)

# Standardize the features first (SVMs are sensitive to feature scaling),
# then fit a linear soft-margin classifier that uses the hinge loss.
svm_clf = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('linear_svc', LinearSVC(loss='hinge', C=1)),
])

svm_clf.fit(X, y)

Pipeline(steps=[('scaler', StandardScaler()),
                ('linear_svc', LinearSVC(C=1, loss='hinge'))])

In [18]:
# Classify a single new flower given as [petal length, petal width].
svm_clf.predict(
    [[5.5, 1.7]]
)
# does not output a probability like logistic regression,
# but simply classifies

# instead of LinearSVC, we could use SVC class with linear kernel
# or SGDClassifier with hinge loss and alpha = 1/(m*C)
# it will not converge as fast but will be useful to handle online
# classification tasks or huge datasets that don't fit in memory

# LinearSVC regularizes bias term, so center training set by first subtracting
# the mean. StandardScaler does this. hinge loss is not default, so remember to set it
# for performance, set dual to false unless there are more features than training
# instances. 

array([1.])

In [19]:
# Nonlinear SVM Classification

# you could add more features like we did previously, (x, x^2)

# we can do this in scikit with pipeline of PolynomialFeatures, 
# StandardScaler, and LinearSVC

# test on moons dataset, data points are shaped as two
# interleaving half-circles

from sklearn.datasets import make_moons
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures

# Seed the generator so the noisy moons sample -- and therefore every model
# fitted on X, y from here on -- is identical on each Restart & Run All.
# NOTE: this rebinds X and y, which previously held the iris features/labels.
X, y = make_moons(n_samples=100, noise=0.15, random_state=42)

# Expand the inputs into polynomial terms up to degree 3, standardize the
# expanded features, then fit a linear soft-margin SVM (hinge loss) on them.
polynomial_svm_clf = Pipeline([
    ('poly_features', PolynomialFeatures(degree=3)),
    ('scaler', StandardScaler()),
    ('svm_clf', LinearSVC(C=10, loss='hinge')),
])

polynomial_svm_clf.fit(X, y)


# adding polynomial features is simple and can work great with ml algorithms
# low degree will not likely help with complex data, and high degree creates
# too many features making the model too slow

# the kernel trick for SVMs gives the same result as if you had added many
# polynomial features, without actually having to add them. therefore there is
# no combinatorial explosion in the number of features

Pipeline(steps=[('poly_features', PolynomialFeatures(degree=3)),
                ('scaler', StandardScaler()),
                ('svm_clf', LinearSVC(C=10, loss='hinge'))])

In [20]:
from sklearn.svm import SVC

# Scale the features, then fit a kernelized SVC: a degree-3 polynomial kernel
# (coef0=1, C=5) is used instead of explicitly generated polynomial features.
poly_kernel_svm_clf = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('svm_clf', SVC(C=5, kernel='poly', degree=3, coef0=1)),
])
poly_kernel_svm_clf.fit(X, y)

# this trains an SVM using a third-degree polynomial kernel. if the model
# overfits, try reducing the degree; if it underfits, try increasing it.
# coef0 controls how much the model is influenced by high-degree
# polynomials versus low-degree polynomials

# can find best hyperparameters using grid search (chapter 2)

Pipeline(steps=[('scaler', StandardScaler()),
                ('svm_clf', SVC(C=5, coef0=1, kernel='poly'))])