In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

from sklearn.metrics import *
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler

## Загрузим готовый датасет Breast Cancer из библиотеки Sklearn и разделим данные на train/test

In [2]:
# Fetch the bundled Breast Cancer dataset as a (features, labels) pair.
X, y = load_breast_cancer(return_X_y=True)

# Hold out 15% of the samples for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    shuffle=True,
    random_state=0,
)

# Standardize the features: the scaler is fitted on the training split only,
# and the same fitted scaler is then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## Попробуем SVM c линейным ядром - linear kernel SVM

In [3]:
# Linear-kernel SVM with C=1.
# A faster alternative is to use the LinearSVC() class directly.
svm_linear = SVC(kernel='linear', C=1).fit(X_train, y_train)

# Predict on both splits so train and test accuracy can be compared.
y_train_pred = svm_linear.predict(X_train)
y_test_pred = svm_linear.predict(X_test)

train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

print("Train accuracy: ", train_accuracy)
print("Test accuracy: ", test_accuracy)

Train accuracy:  0.9855072463768116
Test accuracy:  1.0


## Попробуем SVM c ядром Гаусса - Gaussian kernel (RBF) SVM

In [4]:
# Gaussian (RBF) kernel SVM with C=1.0.
# The model is stored as svm_rbf so the variable name matches the kernel being
# fitted and the linear model from the previous cell is not overwritten.
# NOTE(review): the recorded output of this cell is identical to the
# linear-kernel cell's output; re-run on a fresh kernel to confirm it is not stale.
svm_rbf = SVC(C=1.0, kernel='rbf')

svm_rbf.fit(X_train, y_train)

# Predict on both splits so train and test accuracy can be compared.
y_train_pred = svm_rbf.predict(X_train)
y_test_pred = svm_rbf.predict(X_test)

print("Train accuracy: ", accuracy_score(y_train, y_train_pred))
print("Test accuracy: ", accuracy_score(y_test, y_test_pred))

Train accuracy:  0.9855072463768116
Test accuracy:  1.0


## Попробуем SVM с полиномиальным ядром 3-й степени - 3rd degree polynomial kernel

In [5]:
# Polynomial-kernel SVM of degree 3 with C=1.
# The model is stored as svm_poly so the variable name reflects the kernel in
# use instead of reusing the linear model's name.
svm_poly = SVC(C=1, kernel='poly', degree=3)
svm_poly.fit(X_train, y_train)

# Predict on both splits so train and test accuracy can be compared.
y_train_pred = svm_poly.predict(X_train)
y_test_pred = svm_poly.predict(X_test)

print("Train accuracy: ", accuracy_score(y_train, y_train_pred))
print("Test accuracy: ", accuracy_score(y_test, y_test_pred))

Train accuracy:  0.9130434782608695
Test accuracy:  0.8953488372093024


In [6]:
# Попробуем подобрать лучшую комбинацию из коэффициента C и типа ядра

In [7]:
# Candidate kernel types and regularization strengths to compare.
kernels = ['linear', 'poly', 'rbf', 'sigmoid']
Cs = [0.6, 0.9, 1.0, 2, 5, 10]

# Visit every (kernel, C) pair in kernel-major order, so all C values for one
# kernel are reported together before moving on to the next kernel.
for kernel_name, c_value in ((k, c) for k in kernels for c in Cs):
    print("Kernel = ", kernel_name)
    print("C = ", c_value)

    # A fresh model is fitted for each combination.
    svm_linear = SVC(kernel=kernel_name, C=c_value)
    svm_linear.fit(X_train, y_train)

    y_train_pred = svm_linear.predict(X_train)
    y_test_pred = svm_linear.predict(X_test)

    print("Train accuracy: ", accuracy_score(y_train, y_train_pred))
    print("Test accuracy: ", accuracy_score(y_test, y_test_pred))

    print("-------------")

# Conclusion from the recorded run: the best balance between train and test
# accuracy is reached with kernel = 'linear' and C = 0.6, 0.9, 1.0, 2 or 5
# (with C = 10 the test accuracy drops).
# NOTE(review): this choice is made by looking at test accuracy, so the test
# score is no longer an unbiased estimate; consider selecting C and the kernel
# with cross-validation on the training split instead.

Kernel =  linear
C =  0.6
Train accuracy:  0.9855072463768116
Test accuracy:  1.0
-------------
Kernel =  linear
C =  0.9
Train accuracy:  0.9855072463768116
Test accuracy:  1.0
-------------
Kernel =  linear
C =  1.0
Train accuracy:  0.9855072463768116
Test accuracy:  1.0
-------------
Kernel =  linear
C =  2
Train accuracy:  0.9855072463768116
Test accuracy:  1.0
-------------
Kernel =  linear
C =  5
Train accuracy:  0.9875776397515528
Test accuracy:  1.0
-------------
Kernel =  linear
C =  10
Train accuracy:  0.9875776397515528
Test accuracy:  0.9651162790697675
-------------
Kernel =  poly
C =  0.6
Train accuracy:  0.9006211180124224
Test accuracy:  0.8837209302325582
-------------
Kernel =  poly
C =  0.9
Train accuracy:  0.9026915113871635
Test accuracy:  0.8837209302325582
-------------
Kernel =  poly
C =  1.0
Train accuracy:  0.9130434782608695
Test accuracy:  0.8953488372093024
-------------
Kernel =  poly
C =  2
Train accuracy:  0.9337474120082816
Test accuracy:  0.91860465116