In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import mglearn
import sklearn
from IPython.display import display
%matplotlib inline

In [2]:
# For the cancer data:
# split the data, compute the minimum and maximum, scale the data,
# and train the SVM

from sklearn.svm import SVC
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import MinMaxScaler

# Load the breast-cancer dataset and hold out a test split
# (random_state=0 is passed so the split is seeded).
cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data,
    cancer.target,
    random_state=0,
)

# Fit the min/max scaler on the training split only, then apply that same
# fitted scaler to both the training and the test split.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train an SVM with default settings on the scaled training data.
svm = SVC()
svm.fit(X_train_scaled, y_train)

# Score the trained SVM on the scaled test data and report it.
test_score = svm.score(X_test_scaled, y_test)
print("Test score: {:.2f}".format(test_score))


Test score: 0.95


In [3]:
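# Naive way to find better parameters for SVC.
# Naive because the scaler was fit on all of the training data before
# cross-validation, so every validation fold was already used for scaling.
# The cross-validation scores therefore do not reflect how the model would
# behave when applied to new data.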

from sklearn.model_selection import GridSearchCV
# Illustration only -- do not copy this pattern: the search below runs on
# X_train_scaled, which was scaled before any cross-validation split is made.
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'gamma': [0.001, 0.01, 0.1, 1, 10, 100],
}

# 5-fold grid search over C and gamma on the pre-scaled training data.
grid = GridSearchCV(SVC(), param_grid=param_grid, cv=5)
grid.fit(X_train_scaled, y_train)

# Report the cross-validated score, the winning parameters, and finally the
# score on the scaled test set.
best_cv_accuracy = grid.best_score_
print("Best cross-validation accuracy: {:.2f}".format(best_cv_accuracy))
print("Best parameters: ", grid.best_params_)
test_accuracy = grid.score(X_test_scaled, y_test)
print("Test set accuracy: {:.2f}".format(test_accuracy))

Best cross-validation accuracy: 0.98
Best parameters:  {'gamma': 1, 'C': 1}
Test set accuracy: 0.97


In [4]:
# Use the Pipeline class to express the workflow for training an SVM after
# scaling the data with MinMaxScaler. First, build a Pipeline object
# by providing it with a list of steps.

from sklearn.pipeline import Pipeline
# The pipeline has two (name, estimator) steps: "scaler" followed by "svm".
pipe = Pipeline(steps=[
    ("scaler", MinMaxScaler()),
    ("svm", SVC()),
])

# Fit the whole pipeline on the raw (unscaled) training data.
pipe.fit(X_train, y_train)

# Evaluate the fitted pipeline on the raw test data and report the score.
pipeline_test_score = pipe.score(X_test, y_test)
print("Test score: {:.2f}".format(pipeline_test_score))

Test score: 0.95


In [5]:
# Now try using a pipeline in a grid search.
# Unlike the example in In[3], no information is leaked between the
# test and training parts of each cross-validation split.
# Parameter names are prefixed with "svm__" so they address the step that
# was named "svm" when the pipeline was built.
param_grid = {
    'svm__C': [0.001, 0.01, 0.1, 1, 10, 100],
    'svm__gamma': [0.001, 0.01, 0.1, 1, 10, 100],
}

# 5-fold grid search over the pipeline, fitted on the raw training data.
grid = GridSearchCV(pipe, param_grid=param_grid, cv=5)
grid.fit(X_train, y_train)

# Report the cross-validated score, the score on the raw test set, and the
# winning parameter combination.
best_cv_accuracy = grid.best_score_
print("Best cross-validation accuracy: {:.2f}".format(best_cv_accuracy))
test_set_score = grid.score(X_test, y_test)
print("Test set score: {:.2f}".format(test_set_score))
print("Best parameters: {}".format(grid.best_params_))

Best cross-validation accuracy: 0.98
Test set score: 0.97
Best parameters: {'svm__gamma': 1, 'svm__C': 1}
