In [1]:
import numpy as np
from sklearn import svm
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score
import pandas as pd

Apply scikit-learn's SVM classifier with both an RBF and polynomial kernels to the (`scikitlearn`) cancer dataset by:

1. load and look at the data

2. normalize the data so that all features have mean 0 and variance 1 (this is easy with scikit learn's preprocessing package)

3. apply SVC with an RBF kernel.  Try all values of $C$ and $\gamma$ in the set $\{2^k | k =-5..+10\}$ (so 121 total pairs $(C,\gamma))$ and for each of these, do 3-fold cross validation.  Identify which pair gives the best cross-validation score.
 
4. Repeat #3 with a polynomial kernel.  Now you also need to search over possible choices of degree (say, 1...5) and coeff0 (say 0, 1, -1), for a total of 121 x 5 x 3 different models.

In [2]:
cancer = datasets.load_breast_cancer()
X, y = cancer.data, cancer.target

In [3]:
sample = pd.DataFrame(data= cancer.data,columns= cancer.feature_names)
sample.sample(5)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
66,9.465,21.01,60.11,269.4,0.1044,0.07773,0.02172,0.01504,0.1717,0.06899,...,10.41,31.56,67.03,330.7,0.1548,0.1664,0.09412,0.06517,0.2878,0.09211
475,12.83,15.73,82.89,506.9,0.0904,0.08269,0.05835,0.03078,0.1705,0.05913,...,14.09,19.35,93.22,605.8,0.1326,0.261,0.3476,0.09783,0.3006,0.07802
479,16.25,19.51,109.8,815.8,0.1026,0.1893,0.2236,0.09194,0.2151,0.06578,...,17.39,23.05,122.1,939.7,0.1377,0.4462,0.5897,0.1775,0.3318,0.09136
402,12.96,18.29,84.18,525.2,0.07351,0.07899,0.04057,0.01883,0.1874,0.05899,...,14.13,24.61,96.31,621.9,0.09329,0.2318,0.1604,0.06608,0.3207,0.07247
434,14.86,16.94,94.89,673.7,0.08924,0.07074,0.03346,0.02877,0.1573,0.05703,...,16.31,20.54,102.3,777.5,0.1218,0.155,0.122,0.07971,0.2525,0.06827


In [4]:
sample = pd.DataFrame(data= cancer.target,columns=['target'])
sample.sample(5)

Unnamed: 0,target
318,1
373,0
3,0
413,1
236,0


In [5]:
X_tran = preprocessing.normalize(X)
acc=[]
cgam=[]
for c in range(-5,11):
    for gam in range(-5,11):
        clf = svm.SVC(C=2**c, gamma=2**gam)
        clf.fit(X_tran, y)
        scores = cross_val_score(clf, X_tran, y, cv=3)
        acc.append(scores.mean())
        cgam.append([c,gam])
print("Accuracy=",max(acc), "with C and gamma = ",cgam[acc.index(max(acc))])

Accuracy= 0.950775085863 with C and gamma =  [10, 5]


In [None]:
acc=[]
cgam=[]
for c in range(-5,11):
    for gam in range(-5,11):
        for deg in range(1,6):
            for coef in [-1,0,1]:
                clf = svm.SVC(kernel='poly', C=2**c, gamma=2**gam, coef0=coef, degree=deg)
                clf.fit(X_tran, y)
                scores = cross_val_score(clf, X_tran, y, cv=3)
                acc.append(scores.mean())
                cgam.append([c,gam, deg, coef])
print("Accuracy=",max(acc), "with C, gamma, degree and coefficient = ",cgam[acc.index(max(acc))])