In [1]:
import numpy as np
from datetime import datetime
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer
from sklearn.svm import SVC

# Get data

In [2]:
dataset = load_breast_cancer()
print("Features:")
print(dataset.feature_names)
print("\nTargets:")
print(dataset.target_names)

Features:
['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']

Targets:
['malignant' 'benign']


In [3]:
X = dataset.data
y = dataset.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [4]:
# Normalize features
scaler = StandardScaler()
x_train = scaler.fit_transform(X_train)
x_test  = scaler.fit_transform(X_test)

# Try different SVC

In [5]:
# Unscaled features are problematic for the rbf kernel
model = SVC(kernel = 'rbf', gamma='auto')

t0 = datetime.now()
model.fit(X_train, y_train)
print(f"Training time: {datetime.now() - t0}")
print(f"Train accuracy: {model.score(X_train, y_train):.2f}")
print(f"Test  accuracy: {model.score(X_test, y_test):.2f}")

Training time: 0:00:00.019290
Train accuracy: 1.00
Test  accuracy: 0.56


In [6]:
# The rbf kernel with gamma='scale' deals much better with unscaled features
model = SVC(kernel = 'rbf', gamma='scale')

t0 = datetime.now()
model.fit(X_train, y_train)
print(f"Training time: {datetime.now() - t0}")
print(f"Train accuracy: {model.score(X_train, y_train):.2f}")
print(f"Test  accuracy: {model.score(X_test, y_test):.2f}")

Training time: 0:00:00.009964
Train accuracy: 0.95
Test  accuracy: 0.94


In [7]:
# The rbf kernel performs much better with normalised features
model = SVC(kernel = 'rbf', gamma='auto')  # same as gamma='scale' with normalised features

t0 = datetime.now()
model.fit(x_train, y_train)
print(f"Training time: {datetime.now() - t0}")
print(f"Train accuracy: {model.score(x_train, y_train):.2f}")
print(f"Test  accuracy: {model.score(x_test, y_test):.2f}")

Training time: 0:00:00.006299
Train accuracy: 0.99
Test  accuracy: 0.96


In [8]:
# The linear model works fine even with unscaled features
model = SVC(kernel='linear')

t0 = datetime.now()
model.fit(X_train, y_train)
print(f"Training time: {datetime.now() - t0}")
print(f"Train accuracy: {model.score(X_train, y_train):.2f}")
print(f"Test  accuracy: {model.score(X_test, y_test):.2f}")

Training time: 0:00:00.812120
Train accuracy: 0.96
Test  accuracy: 0.96


In [9]:
# The linear model still performs slightly better with scaled features
model = SVC(kernel='linear')

t0 = datetime.now()
model.fit(x_train, y_train)
print(f"Training time: {datetime.now() - t0}")
print(f"Train accuracy: {model.score(x_train, y_train):.2f}")
print(f"Test  accuracy: {model.score(x_test, y_test):.2f}")

Training time: 0:00:00.005530
Train accuracy: 0.99
Test  accuracy: 0.96


# Better model comparison with cross validation

In order to properly compare different models, we do cross validation with sklearn.model_selection.cross_val_score.

Note that data preprocessing steps (such as normalization) must be learnt on the train set, and therefore repeated on each split. To achieve this, we use sklearn.pipeline.make_pipeline.

In [10]:
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score

In [11]:
K = 10  # K-fold cross validation

def my_cross_val(model, x, y, cv=K, scale_x=False, **kwargs):
    clf = model(**kwargs)
    if scale_x: clf = make_pipeline(StandardScaler(), clf)
    
    scores = cross_val_score(clf, x, y, cv=cv)
    print(f"Accuracy: {scores.mean():0.2f} (+/- {2 * scores.std():0.2f})")

In [12]:
params = {'scale_x': False, 'kernel': 'rbf', 'gamma': 'auto'}
my_cross_val(SVC, X, y, **params)

Accuracy: 0.63 (+/- 0.01)


In [13]:
params = {'scale_x': False, 'kernel': 'rbf', 'gamma': 'scale'}
my_cross_val(SVC, X, y, **params)

Accuracy: 0.94 (+/- 0.05)


In [14]:
params = {'scale_x': True, 'kernel': 'rbf', 'gamma': 'auto'}
my_cross_val(SVC, X, y, **params)

Accuracy: 0.98 (+/- 0.05)


In [15]:
params = {'scale_x': False, 'kernel': 'linear'}
my_cross_val(SVC, X, y, **params)

Accuracy: 0.95 (+/- 0.04)


In [16]:
params = {'scale_x': True, 'kernel': 'linear'}
my_cross_val(SVC, X, y, **params)

Accuracy: 0.98 (+/- 0.04)
