In [1]:
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

Reading dataset

In [2]:
data = pd.read_csv('CellSamples.csv')
data.head()

Unnamed: 0,ID,Clump,UnifSize,UnifShape,MargAdh,SingEpiSize,BareNuc,BlandChrom,NormNucl,Mit,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


Dataset Source: [CellsSample.csv](https://archive.ics.uci.edu/ml/datasets/breast+cancer+wisconsin+(original))

Dataset Description: This dataset contains sample of human cells record with different characteristics.

| Field name  | Description                 |
| ----------- | --------------------------- |
| ID          | Clump thickness             |
| Clump       | Clump thickness             |
| UnifSize    | Uniformity of cell size     |
| UnifShape   | Uniformity of cell shape    |
| MargAdh     | Marginal adhesion           |
| SingEpiSize | Single epithelial cell size |
| BareNuc     | Bare nuclei                 |
| BlandChrom  | Bland chromatin             |
| NormNucl    | Normal nucleoli             |
| Mit         | Mitoses                     |
| Class       | Benign or malignant         |


Dataset pre-processing

In [3]:
data.dtypes

ID              int64
Clump           int64
UnifSize        int64
UnifShape       int64
MargAdh         int64
SingEpiSize     int64
BareNuc        object
BlandChrom      int64
NormNucl        int64
Mit             int64
Class           int64
dtype: object

In [4]:
data = data[pd.to_numeric(data['BareNuc'],errors='coerce').notnull()]
data['BareNuc'] = data['BareNuc'].astype('int')
data['Class'] = data['Class'].astype('int')

In [5]:
x = data[['Clump','UnifSize','MargAdh','SingEpiSize','BareNuc','BlandChrom','NormNucl','Mit']]
y = data[['Class']]
x_train,x_test,y_train,y_test = train_test_split(x,y, test_size = 0.25)

Converting pandas dataframe into numpy array for computation.

In [6]:
x_train = np.asanyarray(x_train)
y_train = np.asanyarray(y_train)
x_test = np.asanyarray(x_test)
y_test = np.asanyarray(y_test)

Model Training

In [7]:
model = svm.SVC(kernel='rbf')
model.fit(x_train, y_train.ravel())

SVC()

In [8]:
predictions = model.predict(x_test)

In [9]:
print('Classification Report:\n')
print(classification_report(y_test, predictions, target_names = ['Benign','Malignant']))

Classification Report:

              precision    recall  f1-score   support

      Benign       1.00      0.99      1.00       111
   Malignant       0.98      1.00      0.99        60

    accuracy                           0.99       171
   macro avg       0.99      1.00      0.99       171
weighted avg       0.99      0.99      0.99       171



In [10]:
confusion_matrix = confusion_matrix(y_test, predictions, labels = [2,4])
print('Confusion Matrix:')
print('True Positives:', confusion_matrix[0,0])
print('True Negatives:', confusion_matrix[0,1])
print('False Positives:', confusion_matrix[1,0])
print('False Negatives:', confusion_matrix[1,1])

Confusion Matrix:
True Positives: 110
True Negatives: 1
False Positives: 0
False Negatives: 60


In [11]:
print('F1 Score:',f1_score(y_test, predictions, average='weighted'))

F1Score: 0.9941629812161756
