<a href="https://colab.research.google.com/github/architgore/Breast-Cancer-Classification/blob/main/Breast_Cancer_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Breast Cancer Classification**

Archit Gore

The most prevalent form of breast cancer is Invasive Ductal Carcinoma (IDC). Pathologists usually concentrate on the areas of a whole mount sample that contain IDC to determine its aggressiveness grade. Consequently, a common preliminary step in automatically grading aggressiveness is to identify and outline the specific regions of IDC within a whole mount slide.

The dataset comprises whole slide images (WSIs) from 279 patients. A total of 277,524 patches of 50 × 50 pixels were extracted from these images: 198,738 IDC-negative patches and 78,786 IDC-positive patches.

---



**Importing Libraries**

In [None]:
import warnings
warnings.filterwarnings("ignore")
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import cv2 
import glob
import random
from os import listdir
import os
from tqdm import tqdm

**Dataset Prep**


In [None]:
# Collect the file paths of every IDC-negative ('0') and IDC-positive ('1') patch.

parent_dir = '/kaggle/input/breast-histopathology-images/IDC_regular_ps50_idx5/'
# os.listdir returns entries in arbitrary order; sorting keeps the path lists
# (and any later slice of them) identical from run to run.
dir_list = sorted(os.listdir(parent_dir))


def _list_class_images(patient_dir, class_name):
    """Return the sorted image paths found in ``patient_dir/class_name``.

    Returns an empty list when that sub-directory does not exist, so a patient
    folder lacking one of the two classes (or a stray file inside
    ``parent_dir``) does not abort the whole scan.
    """
    class_dir = os.path.join(patient_dir, class_name)
    if not os.path.isdir(class_dir):
        return []
    return [os.path.join(class_dir, image_name)
            for image_name in sorted(os.listdir(class_dir))]


N_IDC = []
P_IDC = []

for dir_name in tqdm(dir_list):
    patient_dir = os.path.join(parent_dir, dir_name)
    # Sub-directory '0' holds the IDC-negative patches, '1' the IDC-positive ones.
    N_IDC.extend(_list_class_images(patient_dir, '0'))
    P_IDC.extend(_list_class_images(patient_dir, '1'))


print(f'total number of IDC positive images {len(P_IDC)}')
print(f'total number of IDC negative images {len(N_IDC)}')


**Visualizing Images**

In [None]:
import keras.utils as image

# Draw one random patch per class and show the two side by side.
# Each patch is loaded at 100x100 for display only.
panels = ((N_IDC, 'IDC (-)'), (P_IDC, 'IDC (+)'))
for position, (paths, title) in enumerate(panels, start=1):
    pick = np.random.randint(0, len(paths))
    patch = image.img_to_array(image.load_img(paths[pick], target_size=(100, 100)))
    plt.subplot(1, 2, position)
    plt.title(title)
    plt.imshow(patch.astype('uint8'))


In [None]:
# Reducing the dataset due to computational constraints: at most
# `total_images` patches are loaded per class.

total_images = 50000


def _load_patches(paths, limit):
    """Read up to ``limit`` images from ``paths`` into one float32 array.

    Each image is read with OpenCV (channel order is therefore BGR) and
    resized to 50x50. The result has shape ``(k, 50, 50, 3)`` with
    ``k = min(limit, len(paths))``, so it never contains unused all-zero rows
    and always matches the number of labels built from it.

    Raises:
        ValueError: if an image cannot be read (``cv2.imread`` returned None).
    """
    selected = paths[:limit]
    patches = np.zeros(shape=(len(selected), 50, 50, 3), dtype=np.float32)
    # Wrap the list (not the enumerate object) so tqdm knows the total count.
    for index, path in enumerate(tqdm(selected)):
        pixels = cv2.imread(path, cv2.IMREAD_COLOR)
        if pixels is None:
            raise ValueError(f'could not read image: {path}')
        # The resize guards against any patch that is not exactly 50x50.
        patches[index] = cv2.resize(pixels, (50, 50), interpolation=cv2.INTER_LINEAR)
    return patches


n_img_arr = _load_patches(N_IDC, total_images)
p_img_arr = _load_patches(P_IDC, total_images)

# One label per loaded image: 0 = IDC negative, 1 = IDC positive.
label_n = np.zeros(len(n_img_arr), dtype=np.int64)
label_p = np.ones(len(p_img_arr), dtype=np.int64)

print(n_img_arr.shape,p_img_arr.shape)

In [None]:
# Stack both classes into one image array X and one label vector y
# (positives first, then negatives), then shuffle the two in unison.

from sklearn.utils import shuffle

X, y = shuffle(
    np.concatenate([p_img_arr, n_img_arr], axis=0),
    np.concatenate([label_p, label_n], axis=0),
    random_state=0,
)

print('Processed dataset size')
print(X.shape, y.shape)

In [None]:
# Persist the shuffled images and their labels as .npy files in the working directory.
np.save(file='X.npy', arr=X)
np.save(file='y.npy', arr=y)

In [None]:
# The per-class arrays are now duplicated inside X; drop them to free memory.
del p_img_arr, n_img_arr

In [None]:
from tensorflow.keras.utils import to_categorical

# One-hot encode the integer labels into two columns (one per class).
Y = to_categorical(y, num_classes=2)

# Sanity check: first one-hot row next to its integer label.
print(Y[0], y[0])

## **Splitting into training and test sets**

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the data for testing.
# - stratify on the 1-D integer labels so both splits keep the class balance
#   (same grouping as the one-hot rows, without the 2-D workaround);
# - fix random_state so the split, and every metric computed from it, is
#   reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, stratify=y, random_state=0)
print("Training Data Shape:", X_train.shape)
print("Testing Data Shape:", X_test.shape)

In [None]:
# Only the train/test splits are needed from here on; release the full arrays.
del X, y, Y

In [None]:
# Report how many patches of each class ended up in each split.
# Comparing the one-hot rows with a class pattern marks both columns True for
# a matching row and neither for the other class, hence the division by 2.
for heading, one_hot in (('Training set size', Y_train), ('Test set size', Y_test)):
    print(heading)
    print('IDC(-) Images: {}'.format(np.sum(one_hot == [1., 0.]) / 2))
    print('IDC(+) Images: {}'.format(np.sum(one_hot == [0., 1.]) / 2))

**Model Preparation**

In [None]:
#importing libraries
from tensorflow.keras.optimizers import Adam, SGD
from keras.metrics import binary_crossentropy
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import confusion_matrix
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv2D,MaxPooling2D, Flatten, Dropout, BatchNormalization
from tensorflow.keras.optimizers import SGD
import itertools

**Building the Model**

In [None]:
# CNN for 50x50x3 patches: two convolutional stages followed by a dense head
# that ends in a 2-way softmax.
conv_args = dict(kernel_size=(3, 3), activation='relu',
                 kernel_initializer='he_uniform', padding='same')
dense_args = dict(activation='relu', kernel_initializer='he_uniform')

model = Sequential([
    # Stage 1: two 32-filter convolutions, 2x2 max-pooling, then regularisation.
    Conv2D(32, input_shape=(50, 50, 3), **conv_args),
    BatchNormalization(),
    Conv2D(32, **conv_args),
    MaxPooling2D((2, 2)),
    BatchNormalization(),
    Dropout(0.3),

    # Stage 2: two 64-filter convolutions (no pooling after this stage).
    Conv2D(64, **conv_args),
    BatchNormalization(),
    Conv2D(64, **conv_args),

    Flatten(),

    # Classifier head.
    Dense(64, **dense_args),
    BatchNormalization(),
    Dense(64, **dense_args),
    Dropout(0.3),
    Dense(24, **dense_args),
    Dense(2, activation='softmax'),
])

model.summary()

In [None]:
import tensorflow as tf

# Adam optimiser with binary cross-entropy computed over the two softmax outputs.
model.compile(Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Stop training once the validation loss has not improved for 10 epochs.
early_stop = EarlyStopping(monitor='val_loss', patience=10)

# Multiply the learning rate by 0.1 after 3 epochs without a validation-accuracy
# improvement, never going below 1e-4.
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_accuracy', factor=0.1, patience=3, min_lr=0.0001, mode='max')

# TensorBoard log directory. makedirs(exist_ok=True) replaces the
# check-then-create pair, which could race and failed on missing parents.
log_dir = os.path.join(os.getcwd(), 'logs')
os.makedirs(log_dir, exist_ok=True)
# NOTE: this callback only takes effect if it is included in the `callbacks`
# list passed to model.fit.
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

# Save the full model (not just the weights) whenever the validation accuracy
# reaches a new maximum; checked at the end of every epoch.
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=os.path.join(os.getcwd(), 'classifier.h5'),
    save_weights_only=False,
    save_freq='epoch',
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    verbose=1)

In [None]:
# Train for up to 20 epochs in mini-batches of 32. The held-out test split is
# also used as validation data, so early stopping, learning-rate reduction and
# checkpointing are all driven by it.
training_callbacks = [early_stop, reduce_lr, model_checkpoint_callback]

history = model.fit(
    x=X_train,
    y=Y_train,
    batch_size=32,
    epochs=20,
    callbacks=training_callbacks,
    validation_data=(X_test, Y_test),
)

### **Metrics of the trained model**

In [None]:
# Accuracy per epoch: training curve first, then the validation (test split) curve.
for curve in ('accuracy', 'val_accuracy'):
    plt.plot(history.history[curve])
plt.title('Model Accuracy')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

In [None]:
# Loss per epoch: training curve first, then the validation (test split) curve.
for curve in ('loss', 'val_loss'):
    plt.plot(history.history[curve])
plt.title('Model Loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

In [None]:
# Predict class probabilities for the test split and reduce both predictions
# and one-hot targets to class indices (0 = negative, 1 = positive).
Y_pred = model.predict(X_test)
Y_pred_classes = np.argmax(Y_pred, axis=1)
Y_true = np.argmax(Y_test, axis=1)

target_names = ['negative', 'positive']
confusion_mtx = confusion_matrix(Y_true, Y_pred_classes)

fig, ax = plt.subplots(figsize=(8, 5))
# The cells hold integer counts, so annotate them as integers ('d') rather
# than one-decimal floats, and label the ticks with the class names.
sns.heatmap(confusion_mtx, annot=True, linewidths=0.01, cmap="BuPu", linecolor="gray",
            fmt='d', xticklabels=target_names, yticklabels=target_names, ax=ax)
ax.set_xlabel("Predicted Label")
ax.set_ylabel("True Label")
ax.set_title("Confusion Matrix")
plt.show()

In [None]:
from sklearn.metrics import classification_report

# Text summary of the per-class metrics for the test-split predictions.
report = classification_report(Y_true, Y_pred_classes, target_names=target_names)
print(report)