# FINAL PROJECT — SUPPORT VECTOR MACHINE (SVM) MODEL

## Import Packages & Read in the Data

In [12]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)

In [2]:
# Load the raw CSV and drop the 'id' column so it is not treated as a feature later.
# NOTE(review): this is an absolute, machine-specific path; the notebook will not
# run elsewhere until it is replaced with a path relative to the project.
breastCancer = (
    pd.read_csv('/Users/benvanzyll/Desktop/CPSC393/FinalProject/breast-cancer.csv')
    .drop(columns='id')
)
breastCancer.head()

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


## Encode Diagnosis Label (0 for Benign, 1 for Malignant)

In [3]:
diagnosis_num = {'B':0, 'M':1}
breastCancer['diagnosis'] = breastCancer['diagnosis'].map(diagnosis_num)
breastCancer.head()

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,1,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,1,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,1,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,1,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


## Split Data into Training, Validation, and Testing Sets

In [4]:
labels = breastCancer['diagnosis']
features = breastCancer.drop(columns='diagnosis')

# Stage 1: keep 70% for training and hold out the remaining 30%.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    features, labels, test_size=0.3, random_state=42
)
# Stage 2: split the hold-out portion into validation (~67%) and test (~33%),
# i.e. roughly 20% and 10% of the full data set.
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.33, random_state=42
)

# Check to make sure each set is the correct size (0.7, 0.2, 0.1)
for subset_size in map(len, (y_train, y_val, y_test)):
    print(round(subset_size / len(labels), 2))

0.7
0.2
0.1


## Define Function for Reporting Grid Search Results

In [5]:
def print_results(results):
    """Print the best hyperparameters and the CV score of every candidate.

    Parameters
    ----------
    results : object
        A fitted search object exposing ``best_params_`` and a ``cv_results_``
        mapping with the keys ``'mean_test_score'``, ``'std_test_score'`` and
        ``'params'`` (e.g. a fitted ``GridSearchCV``).

    Returns
    -------
    None
        Output is written to stdout: one header line with the best parameters,
        then one line per candidate showing the mean score rounded to three
        decimals and twice the standard deviation as the +/- spread.
    """
    print('BEST PARAMS: {}\n'.format(results.best_params_))

    cv_table = results.cv_results_
    rows = zip(cv_table['mean_test_score'],
               cv_table['std_test_score'],
               cv_table['params'])
    for avg_score, score_std, candidate in rows:
        spread = round(score_std * 2, 3)  # two standard deviations
        print('{} (+/-{}) for {}'.format(round(avg_score, 3), spread, candidate))

## Build Model & Perform 5-Fold Cross Validation

In [6]:
# Candidate grid: 2 kernels x 5 values of C = 10 settings.
# NOTE(review): the features are passed to the SVM unscaled. SVMs are not
# scale invariant, so standardizing the features (fit on the training split
# only) is worth trying and may change which kernel wins.
svc = SVC()
parameters = dict(
    kernel=['linear', 'rbf'],
    C=[0.001, 0.1, 1, 10, 100],
)

# Exhaustive search over the grid, scored with 5-fold cross-validation on the
# training split only.
cv = GridSearchCV(estimator=svc, param_grid=parameters, cv=5)
cv.fit(X_train, y_train.to_numpy().ravel())

print_results(cv)

BEST PARAMS: {'C': 10, 'kernel': 'linear'}

0.93 (+/-0.044) for {'C': 0.001, 'kernel': 'linear'}
0.626 (+/-0.008) for {'C': 0.001, 'kernel': 'rbf'}
0.947 (+/-0.049) for {'C': 0.1, 'kernel': 'linear'}
0.874 (+/-0.056) for {'C': 0.1, 'kernel': 'rbf'}
0.95 (+/-0.045) for {'C': 1, 'kernel': 'linear'}
0.894 (+/-0.053) for {'C': 1, 'kernel': 'rbf'}
0.95 (+/-0.05) for {'C': 10, 'kernel': 'linear'}
0.907 (+/-0.057) for {'C': 10, 'kernel': 'rbf'}
0.942 (+/-0.056) for {'C': 100, 'kernel': 'linear'}
0.922 (+/-0.056) for {'C': 100, 'kernel': 'rbf'}


------------------------------------------
Best parameters to try on validation set:
    
    {'C': 10, 'kernel': 'linear'} = 0.95
    {'C': 1, 'kernel': 'linear'}  = 0.95
    {'C': 0.1, 'kernel': 'linear'}   = 0.947

## Try 3 Best Hyperparameter Settings on Validation Set

In [7]:
# Fit the three shortlisted linear-kernel candidates (C = 10, 1, 0.1) on the
# training split. SVC.fit returns the fitted estimator, so each name is bound
# to a trained model.
svc1, svc2, svc3 = (
    SVC(C=c_value, kernel='linear').fit(X_train, y_train.values.ravel())
    for c_value in (10, 1, 0.1)
)
svc3

SVC(C=0.1, kernel='linear')

In [8]:
# Score every candidate on the validation split and print one summary line each.
for model in (svc1, svc2, svc3):
    y_pred = model.predict(X_val)
    accuracy, precision, recall = (
        round(metric(y_val, y_pred), 3)
        for metric in (accuracy_score, precision_score, recall_score)
    )
    print('C: {} / kernel: {} -- A: {} / P: {} / R: {}'.format(
        model.C, model.kernel, accuracy, precision, recall))

C: 10 / kernel: linear -- A: 0.974 / P: 1.0 / R: 0.936
C: 1 / kernel: linear -- A: 0.974 / P: 1.0 / R: 0.936
C: 0.1 / kernel: linear -- A: 0.965 / P: 1.0 / R: 0.915


------------------------------------------
We will select a C value of 1 and a linear kernel. C = 1 ties
C = 10 for the best validation results, and the smaller C applies
stronger regularization, so it should generalize better.

## Test Final Model on Testing Set and Evaluate Results

In [9]:
# Final evaluation of the selected model (svc2: C=1, linear kernel) on the
# held-out test split.
y_pred = svc2.predict(X_test)
accuracy = round(accuracy_score(y_test, y_pred), 3)
precision = round(precision_score(y_test, y_pred), 3)
recall = round(recall_score(y_test, y_pred), 3)
# Report the hyperparameters of the model that was actually evaluated (svc2);
# the kernel was previously read from svc1, a different estimator.
print('C: {} / kernel: {} -- A: {} / P: {} / R: {}'.format(svc2.C,
                                                           svc2.kernel,
                                                           accuracy,
                                                           precision,
                                                           recall))

C: 1 / kernel: linear -- A: 0.982 / P: 0.941 / R: 1.0


------------------------------------------
Our final model performs very well on unseen data.
Performance on the test set is on par with performance
on the validation set: an accuracy of 0.982, a precision
of 0.941, and a recall of 1.0 are all very solid.

In [13]:
# Rows are the true labels and columns the predicted labels, each ordered
# 0 (benign) then 1 (malignant).
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[40,  1],
       [ 0, 16]])