Andrew Carr

In [1]:
import numpy as np
from sklearn import datasets
from sklearn import svm
from sklearn import preprocessing

from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

import matplotlib.pyplot as plt

plt.style.use("ggplot")
%matplotlib inline

## Load and observe

In [2]:
data = datasets.load_breast_cancer()

In [3]:
print("we have {} as features".format(", ".join(data.feature_names))) 

we have mean radius, mean texture, mean perimeter, mean area, mean smoothness, mean compactness, mean concavity, mean concave points, mean symmetry, mean fractal dimension, radius error, texture error, perimeter error, area error, smoothness error, compactness error, concavity error, concave points error, symmetry error, fractal dimension error, worst radius, worst texture, worst perimeter, worst area, worst smoothness, worst compactness, worst concavity, worst concave points, worst symmetry, worst fractal dimension as features


In [4]:
print("there are {} features and {} targets".format(data.data.shape, data.target.shape))

there are (569, 30) features and (569,) targets


In [5]:
features, targets = data.data, data.target

Look at a single feature vector

In [6]:
features[0]

array([  1.79900000e+01,   1.03800000e+01,   1.22800000e+02,
         1.00100000e+03,   1.18400000e-01,   2.77600000e-01,
         3.00100000e-01,   1.47100000e-01,   2.41900000e-01,
         7.87100000e-02,   1.09500000e+00,   9.05300000e-01,
         8.58900000e+00,   1.53400000e+02,   6.39900000e-03,
         4.90400000e-02,   5.37300000e-02,   1.58700000e-02,
         3.00300000e-02,   6.19300000e-03,   2.53800000e+01,
         1.73300000e+01,   1.84600000e+02,   2.01900000e+03,
         1.62200000e-01,   6.65600000e-01,   7.11900000e-01,
         2.65400000e-01,   4.60100000e-01,   1.18900000e-01])

## Scale Data

In [7]:
print("original mean {} and std {}".format(np.mean(features), np.sqrt(np.var(features))))

original mean 61.890712339519624 and std 228.29740508276657


In [8]:
features = preprocessing.scale(features, with_std=True)

In [9]:
print("scaled mean {} and std {}".format(np.mean(features), np.sqrt(np.var(features))))

scaled mean -6.118909323768877e-16 and std 1.0


## Split data

In [10]:
#train_x, test_x, train_y, test_y = train_test_split(features, targets, train_size=0.7)

## RBF kernel

In [11]:
accuracy_rbf = []
for k in tqdm(range(-5,11)):
    for j in range(-5,11):
        C = 2**k
        gamma = 2**j
        model = svm.SVC(C=C, kernel='rbf', gamma=gamma)
        # perform 3 fold cross validation
        scores = cross_val_score(model, features, targets, cv=3)
        accuracy_rbf.append([C,gamma,np.mean(scores)])

100%|██████████| 16/16 [00:09<00:00,  1.69it/s]


## Polynomial Kernel

In [12]:
accuracy_poly = []
for k in tqdm(range(-5,11)):
    for j in range(k+1,11):
        for degree in range(1,6):
            for coef in [0,1,-1]:
                C = 2**k
                gamma = 2**j
                model = svm.SVC(C=C, kernel='poly', degree=degree, gamma=gamma, coef0=coef, cache_size=7000)
                scores = cross_val_score(model, features, targets, cv=3)
                accuracy_poly.append([degree,C, gamma, coef, np.mean(scores)])

        
        

100%|██████████| 16/16 [01:06<00:00,  4.15s/it]


In [13]:
accuracy_rbf = np.array(accuracy_rbf)
accuracy_poly = np.array(accuracy_poly)

In [14]:
print("C={} gamma={} give the best accuracy of {}".format(*accuracy_rbf[accuracy_rbf[:,2].argsort()[::-1]][0]))

C=4.0 gamma=0.03125 give the best accuracy of 0.9789195210247842


In [15]:
print("degree={} C={} gamma={} and coef0={} give the best accuracy of {}".format(*accuracy_poly[accuracy_poly[:,4].argsort()[::-1]][0]))

degree=2.0 C=0.125 gamma=0.25 and coef0=1.0 give the best accuracy of 0.9806831894551192


We can therefore conclude that the poly fit works very well, but seems to be slower (since the hyperparameter space is larger)