In [1]:
# SVM is versatile in classification, regression and outlier detection for linear and nonlinear model. Very suitable for
# medium to small datasets

In [2]:
# hard margin classifier: that all the instances must be correctly separated. which works only for linearly separable data
# and sensitive to outliers. 
# soft margin classifier: regularization is added to not overfit the data, tradeoff between maximum margin and the violation numbers


In [3]:
import numpy as np
from sklearn import datasets
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC

In [4]:
iris = datasets.load_iris()

In [5]:
X = iris["data"][:,(2,3)]
y = (iris["target"] ==2).astype(np.float64)


In [6]:
svm_clf = Pipeline([
("scaler", StandardScaler()),
("linear_svc", LinearSVC(C=1, loss = 'hinge')),
])

In [7]:
svm_clf.fit(X,y)

Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('linear_svc', LinearSVC(C=1, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='hinge', max_iter=1000, multi_class='ovr',
     penalty='l2', random_state=None, tol=0.0001, verbose=0))])

In [8]:
svm_clf.predict([[5.5, 1.6]])

array([ 1.])

In [9]:
# nonlinear SVM: using nonlinear transformation like polynomial to make unseparable data separable
from sklearn.datasets import make_moons
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
polynomial_svm_clf = Pipeline((
("poly_features", PolynomialFeatures(degree = 3)),
("scaler", StandardScaler()),
("SVM_clf", LinearSVC(C = 10, loss ='hinge'))
))

In [10]:
polynomial_svm_clf.fit(X, y)

Pipeline(memory=None,
     steps=[('poly_features', PolynomialFeatures(degree=3, include_bias=True, interaction_only=False)), ('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('SVM_clf', LinearSVC(C=10, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='hinge', max_iter=1000, multi_class='ovr',
     penalty='l2', random_state=None, tol=0.0001, verbose=0))])

In [11]:
# polynomial features: kernal trick can be implemented by using SVC class
from sklearn.svm import SVC
poly_kernal_svm_clf = Pipeline((
("scaler", StandardScaler()),
("svm_clf", SVC(kernel = 'poly', degree = 3, coef0 = 1, C = 5))
))
# we can use a gridsearchcv to obtain the possible best hyperparameter: d, r and C

In [12]:
#RBF kernel: measurement of similarity feature: can map a linearly nonseparable dataset to separable data in transformed domain
rbf_kernel_svm_clf = Pipeline((
("scaler", StandardScaler()),
('svm_clf', SVC(kernel = 'rbf', gamma = 5, C = 0.001))
))
rbf_kernel_svm_clf.fit(X, y)
# note the hyperparameters gamma and C, which decide the shape of the gaussian function, hence decides the sensitivity of the
# decision boundary to individual data points

Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('svm_clf', SVC(C=0.001, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=5, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [13]:
# Support vector regression: modify the objective of SVC: which is to fit as many instance on the separating margin (street)
# as possible while maintaining limiting margin violations

In [14]:
# linear regression using support vector, equivalent to LinearSVC
from sklearn.svm import LinearSVR
svm_reg = LinearSVR(epsilon = 1.5)
svm_reg.fit(X, y)

LinearSVR(C=1.0, dual=True, epsilon=1.5, fit_intercept=True,
     intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=1000,
     random_state=None, tol=0.0001, verbose=0)

In [15]:
# we can also do non linear regression: use kernelized SVM model.
from sklearn.svm import SVR
svm_poly_reg = SVR(kernel = 'poly', degree = 2, C = 100, epsilon = 0.1)
svm_poly_reg.fit(X, y)

SVR(C=100, cache_size=200, coef0=0.0, degree=2, epsilon=0.1, gamma='auto',
  kernel='poly', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [16]:
# some exercise: using SVM to do mnist classification
from sklearn.datasets import fetch_mldata
mnist = fetch_mldata("MNIST original")
X = mnist["data"]
y = mnist["target"]

In [17]:
X_train = X[:60000]
X_test = X[60000:]
y_train = y[:60000]
y_test = y[60000:]

In [18]:
# shuffle the training and testing data of course
np.random.seed(42)
rnd_idx = np.random.permutation(60000)

In [19]:
X_train = X_train[rnd_idx]
y_train = y_train[rnd_idx]

In [20]:
# firstly use a linear SVM classifier, it will automatically turn 
# into a one vs all classifier
lin_clf = LinearSVC(random_state = 42)
lin_clf.fit(X_train, y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=42, tol=0.0001,
     verbose=0)

In [21]:
from sklearn.metrics import accuracy_score
y_predict = lin_clf.predict(X_train)
accuracy_score(y_predict,y_train)

0.85375000000000001

In [22]:
# 0.8 is bad. we can try to scale the data first before we train
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.astype(np.float32))
X_test_sclaed = scaler.transform(X_test.astype(np.float32))

In [23]:
# now we see the score
lin_clf = LinearSVC(random_state = 42)
lin_clf.fit(X_train_scaled, y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=42, tol=0.0001,
     verbose=0)

In [24]:
# now calculate the accuracy
y_predict = lin_clf.predict(X_train_scaled)
accuracy_score(y_train, y_predict)

0.9204

In [25]:
svm_clf = SVC(decision_function_shape="ovr")
svm_clf.fit(X_train_scaled[:10000], y_train[:10000])

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [26]:
# now we used RBF kernel using SVC which is default.
# see the accuracy
y_predict = svm_clf.predict(X_train_scaled)


NameError: name 'accuracy' is not defined

In [28]:
# much higher accuracy score using nonlinear kernel
accuracy_score(y_predict, y_train)

0.94615000000000005

In [29]:
# now we can perform a randomized search to optimize the hyperparameters
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import reciprocal, uniform
param_distributions = {"gamma": reciprocal(0.001, 0.1), "C": uniform(1, 10)}
rnd_search_cv = RandomizedSearchCV(svm_clf, param_distributions, n_iter= 10,
                                  verbose = 2)
rnd_search_cv.fit(X_train_scaled[:1000], y_train[:1000])

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] C=8.85231605842, gamma=0.00176607465048 .........................
[CV] .......... C=8.85231605842, gamma=0.00176607465048, total=   0.8s
[CV] C=8.85231605842, gamma=0.00176607465048 .........................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.3s remaining:    0.0s


[CV] .......... C=8.85231605842, gamma=0.00176607465048, total=   0.9s
[CV] C=8.85231605842, gamma=0.00176607465048 .........................
[CV] .......... C=8.85231605842, gamma=0.00176607465048, total=   0.8s
[CV] C=1.82719601047, gamma=0.00636473705545 .........................
[CV] .......... C=1.82719601047, gamma=0.00636473705545, total=   1.2s
[CV] C=1.82719601047, gamma=0.00636473705545 .........................
[CV] .......... C=1.82719601047, gamma=0.00636473705545, total=   1.0s
[CV] C=1.82719601047, gamma=0.00636473705545 .........................
[CV] .......... C=1.82719601047, gamma=0.00636473705545, total=   1.0s
[CV] C=9.87519919377, gamma=0.0513498334519 ..........................
[CV] ........... C=9.87519919377, gamma=0.0513498334519, total=   1.1s
[CV] C=9.87519919377, gamma=0.0513498334519 ..........................
[CV] ........... C=9.87519919377, gamma=0.0513498334519, total=   1.0s
[CV] C=9.87519919377, gamma=0.0513498334519 ..........................
[CV] .

[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:   46.2s finished


RandomizedSearchCV(cv=None, error_score='raise',
          estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
          fit_params=None, iid=True, n_iter=10, n_jobs=1,
          param_distributions={'gamma': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000019C800A5588>, 'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000019C800A5A90>},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=2)

In [31]:
# obtain the best estimator
# we can use this way to obtain the best estimator hyperparameter 
# combination
rnd_search_cv.best_estimator_

SVC(C=8.8523160584230869, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.001766074650481071,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [33]:
# obtain the best accuracy
rnd_search_cv.best_score_

0.85599999999999998

In [38]:
# we can use the best hyperparameter to predict the data
y_predict = rnd_search_cv.best_estimator_.fit(X_train_scaled, y_train)

In [40]:
y_predict = rnd_search_cv.best_estimator_.predict(X_train_scaled)

In [42]:
accuracy_score(y_train, y_predict)

0.99965000000000004

In [55]:
# scale the test data
scaler = StandardScaler()
X_test_scaled = scaler.fit_transform(X[60000:])
X_test_scaled
y_test.shape



(64000,)

In [56]:
# realize that even this number is close to 1, it is done on
# training data hence it could be overfitting, we need to see the 
# real testing data
y_prediction_test = rnd_search_cv.best_estimator_.predict(X_test_scaled)
accuracy_score(y[60000:], y_prediction_test)

0.97050000000000003

In [58]:
# now we see that the accuracy is not as high as training set

In [59]:
# train a SVM regressor on the California housing dataset
from sklearn.datasets import fetch_california_housing
housing = fetch_california_housing()
X = housing["data"]
y = housing["target"]

Downloading Cal. housing from https://ndownloader.figshare.com/files/5976036 to C:\Users\Leo Du\scikit_learn_data


In [60]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [63]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [64]:
from sklearn.svm import LinearSVR
lin_svr = LinearSVR(random_state= 42)
lin_svr.fit(X_train_scaled, y_train)

LinearSVR(C=1.0, dual=True, epsilon=0.0, fit_intercept=True,
     intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=1000,
     random_state=42, tol=0.0001, verbose=0)

In [66]:
from sklearn.metrics import mean_squared_error
y_prediction = lin_svr.predict(X_train_scaled)
mse = mean_squared_error(y_train, y_prediction)
mse

0.96128066532972734

In [68]:
# RMSE
np.sqrt(mse)

0.9804492160890983

In [69]:
# we can search for better hyperparameters
from sklearn.svm import SVR
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import reciprocal, uniform
param_distributions = {"gamma": reciprocal(0.001, 0.1),
                      "C": uniform(1, 10)}
rnd_search_cv = RandomizedSearchCV(SVR(), param_distributions, n_iter=10,
                                  verbose = 2, random_state = 42)
rnd_search_cv.fit(X_train_scaled, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] C=4.74540118847, gamma=0.0796945481864 ..........................
[CV] ........... C=4.74540118847, gamma=0.0796945481864, total=   5.9s
[CV] C=4.74540118847, gamma=0.0796945481864 ..........................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    7.8s remaining:    0.0s


[CV] ........... C=4.74540118847, gamma=0.0796945481864, total=   6.1s
[CV] C=4.74540118847, gamma=0.0796945481864 ..........................
[CV] ........... C=4.74540118847, gamma=0.0796945481864, total=   5.9s
[CV] C=8.31993941811, gamma=0.0157513204998 ..........................
[CV] ........... C=8.31993941811, gamma=0.0157513204998, total=   5.3s
[CV] C=8.31993941811, gamma=0.0157513204998 ..........................
[CV] ........... C=8.31993941811, gamma=0.0157513204998, total=   6.0s
[CV] C=8.31993941811, gamma=0.0157513204998 ..........................
[CV] ........... C=8.31993941811, gamma=0.0157513204998, total=   5.3s
[CV] C=2.56018640442, gamma=0.00205111041884 .........................
[CV] .......... C=2.56018640442, gamma=0.00205111041884, total=   4.7s
[CV] C=2.56018640442, gamma=0.00205111041884 .........................
[CV] .......... C=2.56018640442, gamma=0.00205111041884, total=   4.7s
[CV] C=2.56018640442, gamma=0.00205111041884 .........................
[CV] .

[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  3.6min finished


RandomizedSearchCV(cv=None, error_score='raise',
          estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False),
          fit_params=None, iid=True, n_iter=10, n_jobs=1,
          param_distributions={'gamma': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000019C8024FEF0>, 'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000019C800FFC18>},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring=None, verbose=2)

In [70]:
rnd_search_cv.best_estimator_

SVR(C=4.7454011884736254, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
  gamma=0.079694548186439285, kernel='rbf', max_iter=-1, shrinking=True,
  tol=0.001, verbose=False)

In [71]:
y_prediction = rnd_search_cv.best_estimator_.predict(X_train_scaled)

In [74]:
mse = mean_squared_error(y_train, y_prediction)
np.sqrt(mse)

0.57275247707853572

In [76]:
# now we use test data 
y_prediction = rnd_search_cv.best_estimator_.predict(X_test_scaled)

In [77]:
mse = mean_squared_error(y_test, y_prediction)
np.sqrt(mse)

0.59291683855287458