# 8

Train a LinearSVC on linearly separable dataset. Then train SVC and SGDClassifier on the same dataset.

Try and get them to produce roughly the same model.

In [10]:
import numpy as np
from sklearn import datasets
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [59]:
iris = datasets.load_iris()
X = iris["data"][:99, (0,1)]  # retain only setosa and versicolor, sepal width and length
Y = iris["target"][:99]

In [60]:
iris = datasets.load_iris()
X = iris["data"]
Y = iris["target"]

In [61]:
# There are three classes, Setosa class is linearly separable from the other two classes.
# Versicolor and Virginica classes are not linearly separable.
iris["target_names"]

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [62]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=42)

In [63]:
linsvc_clf = Pipeline([
    ('scaler', StandardScaler()),
    ('linear_svc', LinearSVC(C=1, loss='hinge'))
])

linsvc_clf.fit(X_train, y_train)
y_pred = linsvc_clf.predict(X_test)

print("Accuracy Score: {}".format(accuracy_score(y_test, y_pred)))

Accuracy Score: 0.9473684210526315


In [64]:
svc_clf = Pipeline([
    ('scaler', StandardScaler()),
    ('svc', SVC(kernel='linear', C=1))
])

svc_clf.fit(X_train, y_train)
y_pred = svc_clf.predict(X_test)
print("Accuracy Score: {}".format(accuracy_score(y_test, y_pred)))

Accuracy Score: 0.9736842105263158


In [65]:
sgd_clf = Pipeline([
    ('scaler', StandardScaler()),
    ('sgdclf', SGDClassifier(loss='hinge', alpha=.01))
])


sgd_clf.fit(X_train, y_train)
y_pred = sgd_clf.predict(X_test)
print("Accuracy Score: {}".format(accuracy_score(y_test, y_pred)))

Accuracy Score: 0.9736842105263158


# 9

Train SVM on MNIST. Since SVM is binary - need to use one-versus-rest to classify all 10 digits.

Tune hyperparameters using small validation sets to speed up process.

What accuracy can you reach?

In [2]:
# fetch MNIST
from sklearn.datasets import fetch_openml

In [3]:
mnist = fetch_openml('mnist_784', version=1)

In [4]:
mnist.keys()

dict_keys(['data', 'target', 'feature_names', 'DESCR', 'details', 'categories', 'url'])

In [5]:
X, y = mnist['data'], mnist['target']
print(f'X Shape: {X.shape}, Y Shape: {y.shape}')

X Shape: (70000, 784), Y Shape: (70000,)


In [23]:
# let's pop off a final test set (last 10,000 observations)
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]

In [6]:
# let's scale the data outside of a pipeline, since we don't need to do anything else to it
# fit scaler on training data only (not test) to not snoop into test set
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [24]:
print('Shape X Train: {}'.format(X_train.shape))
print('Shape Y Train: {}'.format(y_train.shape))
print('Shape X Test: {}'.format(X_test.shape))
print('Shape Y Test: {}'.format(y_test.shape))

Shape X Train: (60000, 784)
Shape Y Train: (60000,)
Shape X Test: (10000, 784)
Shape Y Test: (10000,)


In [7]:
# let's pop off the first 5000 examples for hyperparameter tuning.
X_trainval, y_trainval = X[:5000], y[:5000]

In [8]:
print('Shape X TrainValidate: {}'.format(X_trainval.shape))
print('Shape Y TrainValidate: {}'.format(y_trainval.shape))

Shape X TrainValidate: (5000, 784)
Shape Y TrainValidate: (5000,)


In [17]:
svc_mnist_clf = SVC(decision_function_shape='ovr')

In [18]:
# let's grid search
from sklearn.model_selection import GridSearchCV
param_grid = {'kernel': ['rbf'], 'C': [.1,1,3,5,10], 'gamma': [.001, .01, .1]}

In [19]:
grid_search = GridSearchCV(svc_mnist_clf, param_grid, cv=4, scoring='accuracy')

In [20]:
%time grid_search.fit(X_trainval, y_trainval)

CPU times: user 20min 22s, sys: 9.07 s, total: 20min 31s
Wall time: 21min 18s


GridSearchCV(cv=4, error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [0.1, 1, 3, 5, 10], 'gamma': [0.001, 0.01, 0.1],
                         'kernel': ['rbf']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [21]:
grid_search.best_score_

0.9262

In [22]:
grid_search.best_estimator_

SVC(C=3, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [25]:
# refit the bes SVC with the entire training data set
grid_search.best_estimator_.fit(X_train, y_train)

SVC(C=3, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [26]:
# get the training accuracy
y_train_pred = grid_search.best_estimator_.predict(X_train)
accuracy_score(y_train, y_train_pred)

0.9917

In [27]:
# get predictions on our test set
y_test_pred = grid_search.best_estimator_.predict(X_test)
accuracy_score(y_test, y_test_pred)

0.9704

# 10

Train an SVM Regressor on the California Housing Prices data set.

In [1]:
from sklearn.datasets import fetch_california_housing

In [2]:
housingdat = fetch_california_housing()

In [3]:
X = housingdat['data']
y = housingdat['target']

In [4]:
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [7]:
from sklearn.model_selection import train_test_split
# pop off a test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 42)

In [8]:
print('Shape X Train: {}'.format(X_train.shape))
print('Shape Y Train: {}'.format(y_train.shape))
print('Shape X Test: {}'.format(X_test.shape))
print('Shape Y Test: {}'.format(y_test.shape))

Shape X Train: (15480, 8)
Shape Y Train: (15480,)
Shape X Test: (5160, 8)
Shape Y Test: (5160,)


In [11]:
# fit the scaler on only the training data to not snoop into the test data
scaler_svr = StandardScaler()
X_train_scaled = scaler_svr.fit_transform(X_train)
X_test_scaled = scaler_svr.transform(X_test)

In [32]:
# let's run a baseline linear SVR to get an idea of how we could improve with kernalized
from sklearn.svm import LinearSVR

svm_reg = LinearSVR()
svm_reg.fit(X_train_scaled, y_train)

y_train_pred = svm_reg.predict(X_train_scaled)

print("RMSE on Linear SVR: {}".format(np.sqrt(mean_squared_error(y_train, y_train_pred))))

RMSE on Linear SVR: 0.9116271799245662




In [18]:
param_grid_svr = {'kernel': ['rbf'], 'C': [.1,1,3,5,10], 'epsilon': [.01, .1, .5, 1, 3]}

In [20]:
from sklearn.model_selection import GridSearchCV
gridsearch_svr = GridSearchCV(SVR(gamma='auto'), param_grid_svr, cv=4, scoring='neg_mean_squared_error')

In [21]:
gridsearch_svr.fit(X_train_scaled, y_train)

GridSearchCV(cv=4, error_score='raise-deprecating',
             estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3,
                           epsilon=0.1, gamma='auto', kernel='rbf', max_iter=-1,
                           shrinking=True, tol=0.001, verbose=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [0.1, 1, 3, 5, 10],
                         'epsilon': [0.01, 0.1, 0.5, 1, 3], 'kernel': ['rbf']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='neg_mean_squared_error', verbose=0)

In [22]:
gridsearch_svr.best_estimator_

SVR(C=10, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [23]:
# refit on best estimator found from grid search on entire train set
gridsearch_svr.best_estimator_.fit(X_train_scaled, y_train)

SVR(C=10, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [24]:
# get our predictions from the training set
y_pred = gridsearch_svr.best_estimator_.predict(X_train_scaled)

In [30]:
# RMSE, R2 on the training set
np.sqrt(mean_squared_error(y_train, y_pred))

0.5453177943714517

In [26]:
r2_score(y_train, y_pred)

0.7771354398781665

In [None]:
# much better than our linear SVM

In [31]:
# now let's get MSE, R2 on our test set
y_test_pred = gridsearch_svr.best_estimator_.predict(X_test_scaled)
np.sqrt(mean_squared_error(y_test, y_test_pred))

0.5659749559253856

In [28]:
r2_score(y_test, y_test_pred)

0.7579177238088236

In [None]:
# not bad...seems to generalize fairly well.