In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
from sklearn import svm

In [2]:
# Load the breast-cancer dataset that ships with scikit-learn and carve out a
# held-out test split. Stratifying on the target keeps the class proportions
# roughly equal in both splits; the fixed random_state makes the shuffle
# repeatable across runs.
cancer_data = load_breast_cancer()
features, labels = cancer_data.data, cancer_data.target
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, stratify=labels, shuffle=True, random_state=144
)


In [3]:
# Fit a linear SVM on the raw (unscaled) features with weak regularization:
# in LinearSVC the regularization strength is inversely proportional to C,
# so C = 100.0 penalizes the weights only lightly.
# random_state pins the solver's internal shuffling so the scores printed
# here are the same on every Restart & Run All.
# NOTE(review): on unscaled inputs the solver may stop at max_iter without
# converging (ConvergenceWarning) -- confirm that is the intended baseline.
svm_model = svm.LinearSVC(C=100.0, random_state=144)
svm_model.fit(X_train, y_train)
# For a classifier, score() reports mean accuracy on the given data.
print("Training set score: {:.2f}".format(svm_model.score(X_train, y_train)))
print("Test set score: {:.2f}".format(svm_model.score(X_test, y_test)))

Training set score: 0.86
Test set score: 0.85


In [4]:
# Evaluate the model fitted on unscaled features against the held-out split.
class_names = ["malignant", "benign"]
pred_svm = svm_model.predict(X_test)

# Rows of the confusion matrix are true classes, columns are predictions.
confusion = confusion_matrix(y_true=y_test, y_pred=pred_svm)
report = classification_report(y_test, pred_svm, target_names=class_names)

print("Confusion matrix:\n{}".format(confusion))
print(report)

Confusion matrix:
[[49  4]
 [17 73]]
             precision    recall  f1-score   support

  malignant       0.74      0.92      0.82        53
     benign       0.95      0.81      0.87        90

avg / total       0.87      0.85      0.86       143



In [5]:
# Standardize the features before refitting. The scaler's statistics are
# learned from the training split only and then reused for the test split,
# so no information from the test data leaks into preprocessing.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [6]:
# Refit the linear SVM, this time on the standardized features.
# random_state pins the solver's internal shuffling so the accuracies printed
# here are reproducible on every Restart & Run All.
# NOTE(review): C is 1.0 here but 100.0 in the unscaled fit earlier in the
# notebook, so the accuracy gain is not attributable to scaling alone --
# confirm whether both settings were meant to change.
# This rebinds svm_model; the model fitted on unscaled data is no longer
# reachable under that name after this cell runs.
svm_model = svm.LinearSVC(C=1.0, random_state=144)
svm_model.fit(X_train_scaled, y_train)
# For a classifier, score() reports mean accuracy on the given data.
print("Train set accuracy: {:.2f}".format(svm_model.score(X_train_scaled, y_train)))
print("Test set accuracy: {:.2f}".format(svm_model.score(X_test_scaled, y_test)))


Train set accuracy: 0.99
Test set accuracy: 0.97


In [7]:
# Evaluate the model fitted on standardized features; the test inputs must be
# the scaled ones so they match what the model was trained on.
pred_svm = svm_model.predict(X_test_scaled)

# Rows of the confusion matrix are true classes, columns are predictions.
confusion = confusion_matrix(y_true=y_test, y_pred=pred_svm)
print("Confusion matrix:\n{}".format(confusion))

class_names = ["malignant", "benign"]
report = classification_report(y_test, pred_svm, target_names=class_names)
print(report)

Confusion matrix:
[[49  4]
 [ 1 89]]
             precision    recall  f1-score   support

  malignant       0.98      0.92      0.95        53
     benign       0.96      0.99      0.97        90

avg / total       0.97      0.97      0.96       143

