# Breast Cancer Classification using SVM

### Import libraries

In [1]:
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
%matplotlib notebook
from sklearn import svm, metrics
from sklearn.utils import Bunch
from sklearn.model_selection import train_test_split
import skimage.io
from skimage.transform import resize

### Function to load images from structured directory

In [2]:
# code reference: https://github.com/whimian/SVM-Image-Classification/blob/master/Image%20Classification%20using%20scikit-learn.ipynb

def load_image_files(container_path, dimension=(150, 150)):
    image_dir = Path(container_path)
    folders = [directory for directory in image_dir.iterdir() if directory.is_dir()]
    categories = [fo.name for fo in folders]

    images = []
    flat_data = []
    target = []
    for i, direc in enumerate(folders):
        for file in direc.iterdir():
            img = skimage.io.imread(file)
            img_resized = resize(img, dimension, anti_aliasing=True, mode='reflect')
            flat_data.append(img_resized.flatten()) 
            images.append(img_resized)
            target.append(i)
    flat_data = np.array(flat_data)
    target = np.array(target)
    images = np.array(images)

    return Bunch(data=flat_data,
                 target=target,
                 target_names=categories,
                 images=images)

## Binary Classification (Benign or Malignant)

### Load data

In [3]:
%%time
image_dataset = load_image_files("C:/Users/Abdullah Abid/Desktop/Project/BreakHis Data Classes/")

Wall time: 7min 33s


### Train-test split

In [4]:
X_train, X_test, y_train, y_test = train_test_split(image_dataset.data, image_dataset.target, test_size=0.3, random_state=100)

### SVM Classifier

In [5]:
clf = svm.SVC(gamma='scale')

In [6]:
%%time
clf.fit(X_train, y_train)

Wall time: 22min 14s


SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [7]:
clf.score(X_test, y_test)

0.8436578171091446

In [8]:
%%time
y_pred = clf.predict(X_test)

Wall time: 9min 14s


### Classification report

In [9]:
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.64      0.72       743
           1       0.85      0.94      0.89      1630

    accuracy                           0.84      2373
   macro avg       0.84      0.79      0.81      2373
weighted avg       0.84      0.84      0.84      2373



In [10]:
image_dataset.target_names

['Benign', 'Malignant']

# Multiclass Classification (A, DC, F, LC, MC, PC, PT or TA)

### Load data

In [3]:
%%time
image_dataset = load_image_files("C:/Users/Abdullah Abid/Desktop/Project/BreakHis Data Subclasses/")

Wall time: 8min 30s


### Train-test split

In [4]:
X_train, X_test, y_train, y_test = train_test_split(image_dataset.data, image_dataset.target, test_size=0.3, random_state=100)

### SVM Classifier

In [5]:
clf = svm.SVC(gamma='scale')

In [6]:
%%time
clf.fit(X_train, y_train)

Wall time: 57min 43s


SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [7]:
clf.score(X_test, y_test)

0.5512010113780025

In [8]:
%%time
y_pred = clf.predict(X_test)

Wall time: 17min 6s


### Classification report

In [9]:
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.44      0.10      0.16       146
           1       0.59      0.97      0.73      1015
           2       0.43      0.71      0.53       298
           3       0.78      0.24      0.36       189
           4       0.56      0.13      0.20       239
           5       0.33      0.01      0.01       185
           6       0.60      0.04      0.08       140
           7       0.32      0.12      0.18       161

    accuracy                           0.55      2373
   macro avg       0.51      0.29      0.28      2373
weighted avg       0.53      0.55      0.46      2373



In [12]:
image_dataset.target_names

['A', 'DC', 'F', 'LC', 'MC', 'PC', 'PT', 'TA']