<a href="https://colab.research.google.com/github/aaaadigup/BREAST_PROJECT/blob/main/custom_and_breast_cancer_CNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
ambarish_breakhis_path = kagglehub.dataset_download('ambarish/breakhis')

print('Data source import complete.')


In [None]:
#Importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import os
import shutil
import cv2
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score

**LOAD DATA**

In [None]:
#Loading Folds.csv (the columns used below are fold, mag, grp and filename)
# NOTE(review): this is a hard-coded Kaggle-style path; the `ambarish_breakhis_path` returned by
# kagglehub.dataset_download in the first cell is never used - confirm that "../input/breakhis"
# actually exists in the runtime you are using.
df = pd.read_csv("../input/breakhis/Folds.csv")

In [None]:
df.info()

In [None]:
#Show first 5 entries
df.head()

In [None]:
#Show last 5 entries
df.tail()

In [None]:
print("Number of missing values in each column are: \n", df.isnull().sum())

In [None]:
df['fold'].value_counts()

**Checking for duplicates**

In [None]:
#find duplicate rows across specific columns
duplicateRows = df[df.duplicated(['mag', 'grp', 'filename'])]

In [None]:
duplicateRows

**Drop duplicates**

In [None]:
# Keeps the first 7909 rows by position; drop_duplicates() is not called here.
# NOTE(review): presumably those rows contain every image exactly once - TODO confirm against Folds.csv.
dataset = df.iloc[:7909, :]

In [None]:
dataset

In [None]:
#Renaming the column filename to path
dataset = dataset.rename(columns = {"filename":"path"})
#Show first 5 entries
dataset.head()

**Extracting file name, class and subclass labels from the path column**

In [None]:
#Split every path once, then pick the components needed for the new columns
path_parts = dataset['path'].apply(lambda raw_path: raw_path.split("/"))
#Last component is the image file name
dataset['filename'] = path_parts.apply(lambda parts: parts[-1])
#Components 3 and 5 of the path hold the label and the sublabel
dataset["label"] = path_parts.apply(lambda parts: parts[3])
dataset["sublabel"] = path_parts.apply(lambda parts: parts[5])
#Flat name "<label>_<filename>"
dataset['file_loc'] = dataset['label'] + "_" + dataset['filename']

In [None]:
#Show first 5 entries
dataset.head()

**Encoding the class to integer**

- Benign is encoded as 0
- Malignant is encoded as 1

In [None]:
#Binary target: 'benign' -> 0, every other label -> 1
dataset['class'] = dataset['label'].ne('benign').astype('int64')

In [None]:
#Show first 5 entries
dataset.head()

In [None]:
#Show last 5 entries
dataset.tail()

In [None]:
#Plot illustrating data distribution
sns.set(font_scale = 1.5)
sns.set_style("darkgrid")
fig, ax = plt.subplots(figsize=(10, 6))
#Name the column with x=/data= keywords: a Series passed positionally is interpreted as `data`
#by newer seaborn releases (and triggered a FutureWarning in older ones)
sns.countplot(x="label", data=dataset, ax=ax)
ax.set_xlabel("Class")
ax.set_title("Number of Patients Benign and Malignant");

In [None]:
print('Number of samples for each class: \n', dataset["class"].value_counts())

**Extracting images from the folders**

In [None]:
#Creating new directory (exist_ok=True keeps this cell re-runnable once the folder exists)
os.makedirs("../Image_Dataset/", exist_ok=True)

In [None]:
#Creating new directory for benign (exist_ok=True keeps this cell re-runnable once the folder exists)
os.makedirs("../Image_Dataset/Benign/", exist_ok=True)

In [None]:
#Creating new directory for malignant (exist_ok=True keeps this cell re-runnable once the folder exists)
os.makedirs("../Image_Dataset/Malignant/", exist_ok=True)

In [None]:
#Copying all the images into one folder per class (Image_Dataset/Benign and Image_Dataset/Malignant)
#Iterate over path and class together instead of indexing dataset["class"] with a manual counter:
#the counter only matched the rows while the DataFrame index happened to be 0..n-1.
for rel_path, image_class in zip(dataset['path'], dataset['class']):
    src = "../input/breakhis/BreaKHis_v1/" + rel_path
    if image_class == 0:
        dest_dir = "../Image_Dataset/Benign/"
    else:
        dest_dir = "../Image_Dataset/Malignant/"
    src_parts = src.split("/")
    #src_parts[7] is component 3 of the CSV path (the same component stored in the `label` column),
    #so every file is saved as "<label>_<original file name>"
    dest = os.path.join(dest_dir, src_parts[7] + "_" + src_parts[-1])
    shutil.copyfile(src, dest)

In [None]:
#Checking the len
len(os.listdir("../Image_Dataset/"))

In [None]:
#Check the directories present
os.listdir("../Image_Dataset/")

In [None]:
#Check number of images in each folder
path = "../Image_Dataset/"
#List the parent directory once and count the entries of each sub-folder; the earlier version
#re-listed the parent on every iteration and matched names to paths by position.
size_dict = {name: len(os.listdir(os.path.join(path, name))) for name in os.listdir(path)}
size_dict

In [None]:
#Function to read the dataset from a specified directory
def import_images(folder, target, size=(224, 224)):
    """Load every readable image in `folder` and pair it with `target`.

    Parameters
    ----------
    folder : str
        Directory whose entries are each passed to cv2.imread.
    target
        Label stored next to every loaded image.
    size : tuple of int, optional
        Output size handed to cv2.resize; defaults to (224, 224).

    Returns
    -------
    list
        One [image, target] pair per readable file, in os.listdir order.
        Images are resized to `size` and converted from BGR to RGB.
    """
    images = []
    for item in os.listdir(folder):
        img = cv2.imread(os.path.join(folder, item), cv2.IMREAD_COLOR)
        #cv2.imread returns None for files it cannot decode; skip them *before* resizing,
        #otherwise cv2.resize fails on the None and the check is never reached
        if img is None:
            continue
        img = cv2.resize(img, size)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        images.append([img, target])
    return images

In [None]:
#Importing the dataset from the specified directory using the created function and assign them with labels
#Where 0 is for Benign and 1 is for Malignant
benign = import_images("../Image_Dataset/Benign",0)
malignant = import_images("../Image_Dataset/Malignant",1)

In [None]:
#Plotting the benign sample
plt.title('Benign')
plt.imshow(benign[0][0])

In [None]:
#Here is a sample of an image in the dataset
plt.title('Malignant')
plt.imshow(malignant[0][0])

In [None]:
#Joining all the dataset together
benign.extend(malignant)

In [None]:
#Assigning the dataset to a new variable
full_data = benign

In [None]:
#Performing the dataset splitting into training and test set
#80/20 random split with a fixed seed; no `stratify=` is passed, so the class ratio is not
#forced to be equal in both parts
training_data, test_dataset = train_test_split(full_data, test_size = 0.2, random_state = 30)

In [None]:
#Extracting validation dataset from the training data
#20% of the remaining training data is held out (same fixed seed, again without `stratify=`)
training_dataset, val_dataset = train_test_split(training_data, test_size = 0.2, random_state = 30)

In [None]:
"""Function to separate the dataset into images and label puting the images in array called feature_map and labels into
array called label"""

def split_data(main_data):
    """Separate (image, label) pairs into an image array and a label array.

    Parameters
    ----------
    main_data : iterable
        Iterable of (image, label) pairs.

    Returns
    -------
    tuple of np.ndarray
        (feature_matrix, label): the images and their labels, both in input order.
    """
    #Materialise once so any iterable (including a generator) is consumed a single time
    pairs = list(main_data)
    feature_matrix = [image for image, _ in pairs]
    label = [target for _, target in pairs]
    return np.array(feature_matrix), np.array(label)

# **Extracting Training dataset**

In [None]:
#Extract training images and labels
training_images, training_labels = split_data(training_dataset)

In [None]:
# Normalize images (divide by 255; gives values in [0, 1] assuming 8-bit input)
# float32 instead of "float" (float64): halves the memory of the array
training_images = np.array(training_images, dtype="float32") / 255.0

In [None]:
#The shape of the datasets
print(training_images.shape)
print(training_labels.shape)

In [None]:
#The first image as a picture
plt.imshow(training_images[0])

In [None]:
#The first label in the dataset
training_labels[0]

In [None]:
#Check the data distribution among the classes
print(pd.Series(training_labels).value_counts())

# **Extracting Test dataset**

In [None]:
#Extract test images and labels
test_images, test_labels = split_data(test_dataset)

In [None]:
# Normalize images (divide by 255; gives values in [0, 1] assuming 8-bit input)
# float32 instead of "float" (float64): halves the memory of the array
test_images = np.array(test_images, dtype="float32") / 255.0

In [None]:
#The shape of the datasets
print(test_images.shape)
print(test_labels.shape)

In [None]:
#The first image as a picture
plt.imshow(test_images[0])

In [None]:
#The first label in the dataset
test_labels[0]

In [None]:
#Check the data distribution among the classes
print(pd.Series(test_labels).value_counts())

# **Extracting Validation dataset**

In [None]:
#Extract validation images and labels
val_images, val_labels = split_data(val_dataset)

In [None]:
# Normalize images (divide by 255; gives values in [0, 1] assuming 8-bit input)
# float32 instead of "float" (float64): halves the memory of the array
val_images = np.array(val_images, dtype="float32") / 255.0

In [None]:
#The shape of the datasets
print(val_images.shape)
print(val_labels.shape)

In [None]:
#The first image as a picture
plt.imshow(val_images[0])

In [None]:
#The first label in the dataset
val_labels[0]

In [None]:
#Check the data distribution among the classes
print(pd.Series(val_labels).value_counts())

# Model Training

In [None]:
#Importing libraries needed for the modeling and training
import tensorflow as tf
from tensorflow.keras.layers import Conv2D, Flatten, Dense, Dropout, MaxPooling2D, BatchNormalization
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.callbacks import EarlyStopping, Callback , ModelCheckpoint
from tensorflow.keras.metrics import Accuracy,binary_crossentropy, FalsePositives, FalseNegatives, TruePositives, TrueNegatives
from tensorflow.keras.preprocessing.image import ImageDataGenerator

In [None]:
#Setting the batch size for the cnn model
bs = 32

In [None]:
#Defining Image Data Generator for image augmentation
augmentation_options = dict(
    #geometric distortions
    rotation_range=20,
    shear_range=0.2,
    zoom_range=0.2,
    #random mirroring along both axes
    horizontal_flip=True,
    vertical_flip=True,
    #how pixels exposed by a transform are filled in
    fill_mode='nearest',
)
datagen = ImageDataGenerator(**augmentation_options)

# Custom CNN Model

In [None]:
#Custom CNN: four convolution/pooling stages followed by a dense classifier with a sigmoid output
cancer_model = Sequential([
    #Stage 1 - the only convolution without 'same' padding; it also fixes the input shape
    Conv2D(filters = 32, kernel_size = (3,3), input_shape = (224,224,3), activation = 'relu'),
    MaxPooling2D(pool_size = (2,2)),

    #Stage 2
    Conv2D(filters = 64, kernel_size = (3,3), padding = 'same', activation = 'relu'),
    MaxPooling2D(pool_size = (2,2)),

    #Stage 3 (with dropout)
    Conv2D(filters = 128, kernel_size = (3,3), padding = 'same', activation = 'relu'),
    MaxPooling2D(pool_size = (2,2)),
    Dropout(0.4),

    #Stage 4 (with dropout)
    Conv2D(filters = 256, kernel_size = (3,3), padding = 'same', activation = 'relu'),
    MaxPooling2D(pool_size = (2,2)),
    Dropout(0.2),

    #Classifier head
    Flatten(),
    Dense(256, activation = 'relu'),
    Dense(128, activation = 'relu'),
    Dense(1, activation = 'sigmoid'),
])

cancer_model.summary()

In [None]:
#Learning rate starts at 0.001 and decays with the training step (inverse-time decay, no staircase)
lr_schedule = tf.keras.optimizers.schedules.InverseTimeDecay(
  initial_learning_rate=0.001,
  decay_steps=20*50,
  decay_rate=1,
  staircase=False)

def get_optimizer():
  """Return an Adam optimizer whose learning rate follows the module-level `lr_schedule`."""
  optimizer = tf.keras.optimizers.Adam(lr_schedule)
  return optimizer

In [None]:
#Compiling the model
cancer_model.compile(loss='binary_crossentropy', optimizer = get_optimizer(), metrics = ['accuracy'])
#Stop once val_loss has not improved for 5 consecutive epochs
early_stop = EarlyStopping(monitor='val_loss',patience=5)
#Model-specific checkpoint file: the shared "./Best_model/" path is also used by the VGG16
#checkpoint later in the notebook, so the second training run overwrote this model's best weights.
#The .h5 extension matches the format used by the model.save() calls in this notebook.
checkpoint = ModelCheckpoint("./Best_custom_cnn_model.h5", save_best_only=True)

In [None]:
#Fit the model on augmented training batches for up to 200 epochs (early stopping may end sooner)
#Model.fit accepts generators directly; fit_generator is deprecated.
#Validation uses the un-augmented images: random augmentation of the validation set makes val_loss
#noisy, and val_loss is what early_stop and checkpoint monitor.
history = cancer_model.fit(datagen.flow(training_images, training_labels, batch_size = bs),
                           validation_data=(val_images, val_labels),
                           epochs=200,
                           callbacks=[early_stop,checkpoint],
                           verbose = 1)

In [None]:
#Plotting the model results

#Per-epoch curves recorded by fit()
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']

#Epoch indices that were actually trained
epochs_range = history.epoch

#One panel per metric: (subplot position, training curve, validation curve, labels, legend location, title)
panels = [
    (1, acc, val_acc, 'Training Accuracy', 'Validation Accuracy', 'lower right', 'Training and Validation Accuracy'),
    (2, loss, val_loss, 'Training Loss', 'Validation Loss', 'upper right', 'Training and Validation Loss'),
]

plt.figure(figsize=(16, 6))
for position, train_curve, val_curve, train_label, val_label, legend_loc, panel_title in panels:
    plt.subplot(1, 2, position)
    plt.plot(epochs_range, train_curve, label=train_label)
    plt.plot(epochs_range, val_curve, label=val_label)
    plt.legend(loc=legend_loc)
    plt.title(panel_title)
plt.show()

# Performance Evaluation of custom CNN model

In [None]:
y_pred = cancer_model.predict(test_images)

In [None]:
print(y_pred)

In [None]:
#Round the sigmoid outputs to the nearest class (0 or 1) and flatten to a 1-D integer array
#Vectorised: the per-row float(np.round(x)) converted 1-element arrays to scalars, which
#NumPy >= 1.25 deprecates
rounded_predictions = np.round(y_pred).astype(int).ravel()
print(rounded_predictions)

In [None]:
#Sanity check: one predicted label per test sample, and how the predictions are distributed
assert rounded_predictions.shape == test_labels.shape
print(pd.Series(rounded_predictions).value_counts())

In [None]:
#Classification report
print(classification_report(test_labels, rounded_predictions))

In [None]:
#Confusion matrix
cm = confusion_matrix(test_labels, rounded_predictions)
print(cm)

#Setting the labels
labels = ['Benign', 'Malignant']

#Plot the Confusion matrix graph (seaborn is already imported at the top of the notebook)
fig, ax = plt.subplots(figsize=(8, 5))
sns.heatmap(cm, annot=True, ax = ax, fmt='g')

#Same tick font size on both axes
ax.set_xlabel('Predicted Labels', fontsize=10)
ax.xaxis.set_label_position('bottom')
ax.xaxis.tick_bottom()
ax.xaxis.set_ticklabels(labels, fontsize = 10, rotation=90)

ax.set_ylabel('True Labels', fontsize=10)
ax.yaxis.set_ticklabels(labels, fontsize = 10, rotation=0)

ax.set_title('Confusion Matrix', fontsize=15)

#Model-specific file name: the VGG16 section saves its figure as 'ConMat24.png', which overwrote this one
fig.savefig('ConMat_custom_cnn.png')
plt.show()

In [None]:
#Accuracy
accuracy = accuracy_score(test_labels, rounded_predictions)
print('Accuracy: %f' % accuracy)

In [None]:
#Precision
precision = precision_score(test_labels, rounded_predictions)
print('Precision: %f' % precision)

In [None]:
# Recall
recall = recall_score(test_labels, rounded_predictions, pos_label=1)
print('Recall: %f' % recall)

In [None]:
#Specificity
specificity = recall_score(test_labels, rounded_predictions, pos_label=0)
print('Specificity: %f' % specificity)

In [None]:
#F1-score
F1_score = f1_score(test_labels, rounded_predictions)
print('F1_score: %f' % F1_score)

In [None]:
#Save the model
custom_model_path = "Final_custom_breast_cancer_model.h5"
cancer_model.save(custom_model_path)

#Reload from the file that was just written; the earlier code loaded "custom_breast_cancer_model.h5",
#a name that is never saved anywhere in this notebook
new_cancer_model = load_model(custom_model_path)

#Check that the reloaded model reports the expected validation loss and accuracy
#(evaluate_generator is deprecated; the un-augmented validation set gives a repeatable number)
results = new_cancer_model.evaluate(val_images, val_labels, batch_size = bs)
print("Validation loss and accuracy are ", results)

#Continue to train the Model for up to 50 epochs (early stopping may end sooner)
new_history = new_cancer_model.fit(datagen.flow(training_images, training_labels, batch_size = bs),
                                   validation_data=(val_images, val_labels),
                                   epochs=50,
                                   callbacks=[early_stop,checkpoint],
                                   verbose = 1)

# VGG16 Modelling

In [None]:
from tensorflow.keras.applications.vgg16 import VGG16

In [None]:
#Loading the model
#VGG16 with ImageNet weights and without its classifier head (include_top=False)
# NOTE(review): the images used in this notebook were divided by 255 earlier on; the Keras docs
# describe vgg16.preprocess_input as the expected input preprocessing - confirm the difference is intended.
base_model = VGG16(
    include_top=False,
    weights="imagenet",
    input_shape=(224, 224, 3),
)

#Making sure the layers of the VGG16 model are not retrained
for layer in base_model.layers:
    layer.trainable = False

In [None]:
#VGG16 base followed by a custom dense classifier head
VGG16_model = Sequential([
    base_model,
    Flatten(),
    BatchNormalization(),
    #Two dense layers, each followed by 50% dropout
    Dense(256,activation='relu'),
    Dropout(0.5),
    Dense(128,activation='relu'),
    Dropout(0.5),
    #Single sigmoid unit for the binary output
    Dense(1,activation='sigmoid'),
])
VGG16_model.summary()

In [None]:
#Compiling the model
VGG16_model.compile(optimizer='adam',loss='binary_crossentropy' ,metrics=['accuracy'])
#Stop once val_loss has not improved for 5 consecutive epochs
early_stop = EarlyStopping(monitor='val_loss',patience=5)
#Model-specific checkpoint file: "./Best_model/" is also the custom CNN's checkpoint path, so this
#run overwrote the custom model's best weights.
#The .h5 extension matches the format used by the model.save() calls in this notebook.
checkpoint = ModelCheckpoint("./Best_VGG16_model.h5", save_best_only=True)

In [None]:
#Fit the VGG16 model on augmented training batches for up to 50 epochs (early stopping may end sooner)
#Model.fit accepts generators directly; fit_generator is deprecated.
#Validation uses the un-augmented images: random augmentation of the validation set makes val_loss
#noisy, and val_loss is what early_stop and checkpoint monitor.
VGG16_history = VGG16_model.fit(datagen.flow(training_images, training_labels, batch_size = bs),
                                validation_data=(val_images, val_labels),
                                epochs=50,
                                callbacks=[early_stop,checkpoint],
                                verbose = 1)

In [None]:
#Plotting the model results

#Per-epoch curves recorded by fit()
acc = VGG16_history.history['accuracy']
val_acc = VGG16_history.history['val_accuracy']
loss = VGG16_history.history['loss']
val_loss = VGG16_history.history['val_loss']

#Epoch indices that were actually trained
epochs_range = VGG16_history.epoch

#One panel per metric: (subplot position, training curve, validation curve, labels, legend location, title)
panels = [
    (1, acc, val_acc, 'Training Accuracy', 'Validation Accuracy', 'lower right', 'Training and Validation Accuracy'),
    (2, loss, val_loss, 'Training Loss', 'Validation Loss', 'upper right', 'Training and Validation Loss'),
]

plt.figure(figsize=(16, 6))
for position, train_curve, val_curve, train_label, val_label, legend_loc, panel_title in panels:
    plt.subplot(1, 2, position)
    plt.plot(epochs_range, train_curve, label=train_label)
    plt.plot(epochs_range, val_curve, label=val_label)
    plt.legend(loc=legend_loc)
    plt.title(panel_title)
plt.show()

# Performance Evaluation of VGG16 CNN model

In [None]:
vgg16_y_pred = VGG16_model.predict(test_images)

In [None]:
print(vgg16_y_pred)

In [None]:
#Round the sigmoid outputs to the nearest class (0 or 1) and flatten to a 1-D integer array
#Vectorised: the per-row float(np.round(x)) converted 1-element arrays to scalars, which
#NumPy >= 1.25 deprecates
rounded_vgg16 = np.round(vgg16_y_pred).astype(int).ravel()
print(rounded_vgg16)

In [None]:
#Confusion matrix
cm = confusion_matrix(test_labels, rounded_vgg16)
print(cm)

#Setting the labels
labels = ['Benign', 'Malignant']

#Plot the Confusion matrix graph (seaborn is already imported at the top of the notebook)
fig, ax = plt.subplots(figsize=(8, 5))
sns.heatmap(cm, annot=True, ax = ax, fmt='g')

#Same tick font size on both axes
ax.set_xlabel('Predicted Labels', fontsize=10)
ax.xaxis.set_label_position('bottom')
ax.xaxis.tick_bottom()
ax.xaxis.set_ticklabels(labels, fontsize = 10, rotation=90)

ax.set_ylabel('True Labels', fontsize=10)
ax.yaxis.set_ticklabels(labels, fontsize = 10, rotation=0)

ax.set_title('Confusion Matrix', fontsize=15)

#Model-specific file name: the custom CNN section also saves to 'ConMat24.png', so this figure overwrote it
fig.savefig('ConMat_vgg16.png')
plt.show()

In [None]:
#Classification report
print(classification_report(test_labels, rounded_vgg16))

In [None]:
#Accuracy
accuracy = accuracy_score(test_labels, rounded_vgg16)
print('Accuracy: %f' % accuracy)

In [None]:
#Precision
precision = precision_score(test_labels, rounded_vgg16)
print('Precision: %f' % precision)

In [None]:
# Recall
recall = recall_score(test_labels, rounded_vgg16, pos_label=1)
print('Recall: %f' % recall)

In [None]:
#Specificity
specificity = recall_score(test_labels, rounded_vgg16, pos_label=0)
print('Specificity: %f' % specificity)

In [None]:
#F1-score
F1_score = f1_score(test_labels, rounded_vgg16)
print('F1_score: %f' % F1_score)

In [None]:
#Save the model
VGG16_model.save("VGG16_breast_cancer_model.h5")

Code for loading the saved models:

from tensorflow.keras.models import load_model
#Loading the custom CNN model
custom_cancer_model = load_model("../input/my-trained-models/Final_custom_breast_cancer_model.h5")

#Loading the VGG16 model
vgg16_cancer_model = load_model("../input/my-trained-models/VGG16_breast_cancer_model.h5")