In [3]:
# Author: Anthony W. Silva
# Date: 11/15/2019

In [4]:
# In this project I will apply the Support Vector Machine method to 
# accurately diagnose patients with either malignant or benign tumors.
# The data was provided to me by UC Davis for educational purposes.

In [5]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

sns.set_style('whitegrid')

In [6]:
# Load csv file containing data into pandas dataframe
Data = pd.read_csv('wdbc.csv')
Data.columns = range(32)
Data.head(20)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
1,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
2,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
3,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678
4,843786,M,12.45,15.7,82.57,477.1,0.1278,0.17,0.1578,0.08089,...,15.47,23.75,103.4,741.6,0.1791,0.5249,0.5355,0.1741,0.3985,0.1244
5,844359,M,18.25,19.98,119.6,1040.0,0.09463,0.109,0.1127,0.074,...,22.88,27.66,153.2,1606.0,0.1442,0.2576,0.3784,0.1932,0.3063,0.08368
6,84458202,M,13.71,20.83,90.2,577.9,0.1189,0.1645,0.09366,0.05985,...,17.06,28.14,110.6,897.0,0.1654,0.3682,0.2678,0.1556,0.3196,0.1151
7,844981,M,13.0,21.82,87.5,519.8,0.1273,0.1932,0.1859,0.09353,...,15.49,30.73,106.2,739.3,0.1703,0.5401,0.539,0.206,0.4378,0.1072
8,84501001,M,12.46,24.04,83.97,475.9,0.1186,0.2396,0.2273,0.08543,...,15.09,40.68,97.65,711.4,0.1853,1.058,1.105,0.221,0.4366,0.2075
9,845636,M,16.02,23.24,102.7,797.8,0.08206,0.06669,0.03299,0.03323,...,19.19,33.88,123.8,1150.0,0.1181,0.1551,0.1459,0.09975,0.2948,0.08452


In [7]:
Data.shape

(568, 32)

In [8]:
Data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 568 entries, 0 to 567
Data columns (total 32 columns):
0     568 non-null int64
1     568 non-null object
2     568 non-null float64
3     568 non-null float64
4     568 non-null float64
5     568 non-null float64
6     568 non-null float64
7     568 non-null float64
8     568 non-null float64
9     568 non-null float64
10    568 non-null float64
11    568 non-null float64
12    568 non-null float64
13    568 non-null float64
14    568 non-null float64
15    568 non-null float64
16    568 non-null float64
17    568 non-null float64
18    568 non-null float64
19    568 non-null float64
20    568 non-null float64
21    568 non-null float64
22    568 non-null float64
23    568 non-null float64
24    568 non-null float64
25    568 non-null float64
26    568 non-null float64
27    568 non-null float64
28    568 non-null float64
29    568 non-null float64
30    568 non-null float64
31    568 non-null float64
dtypes: float64(30), int64(1), obj

In [9]:
# Each row in the Data is a patient
# Each column represents a patient attribute
# The First Column is the primary key
# The second column is the classification label
# M --> Malignant tumor
# B --> Benign tumor

In [10]:
# Relabel data such that
# M --> 1
# B --> -1
Labels=list(map(lambda x: 1 if x=='M' else -1, Data[:][1]))

In [19]:
# Split training and testing data
from sklearn.model_selection import train_test_split
DataArray = np.array(Data)
X_train, X_test, y_train, y_test = train_test_split(DataArray[:,2:32],np.array(Labels),test_size=0.4)

In [20]:
# Fit separating hyperplane onto testing data using SVC
from sklearn.svm import SVC
Fit = SVC(kernel='linear')
Fit.fit(X_train,y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [21]:
# Test model on testing data
Predictions = Fit.predict(X_test)

In [22]:
from sklearn.metrics import classification_report,confusion_matrix

In [23]:
# Output confusion matrix for classification results
# Row 1 --> Malignant tumors
# Row 2 --> Benign tumors
ConfusionMatrix = confusion_matrix(y_test,Predictions)
pd.DataFrame(ConfusionMatrix,columns=['Actual Benign','Actual Malignant'],index=['Predicted Benign','Predicted Malignant'])

Unnamed: 0,Actual Benign,Actual Malignant
Predicted Benign,126,4
Predicted Malignant,6,92


In [24]:
# Accuracy of classification results
data = np.zeros(2)
for i in range(2):
    data[i] = ConfusionMatrix[i,i]/np.sum(ConfusionMatrix[i,:])

# Create and print data frame for borda count method
pd.DataFrame(data,['Benign Tumors','Malignant Tumors'],columns=['Classifier Accuracy'])

Unnamed: 0,Classifier Accuracy
Benign Tumors,0.969231
Malignant Tumors,0.938776


In [25]:
print('Total Accuracy: %{}'.format(100*round(sum(np.diag(ConfusionMatrix))/len(y_test),2)))

Total Accuracy: %96.0


In [26]:
# Our model accuracy using SVM method was %96. 
# Model is highly accurate and ready for deployment