## Introduction

This notebook classifies mammography images to detect breast cancer. It was written for a Kaggle competition; the details of the competition can be found [here](https://www.kaggle.com/competitions/rsna-breast-cancer-detection/overview).

Some of the features of the model are:  
1- Using Inception-V3 as the base model  
2- Cropping the image to remove the black region  
3- Moving/copying files into class folders and making use of TensorFlow's `flow_from_directory`  
4- Saving the model weights

## Imports

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
%matplotlib inline

import glob

import matplotlib.image as mpimg
import matplotlib.pyplot as plt

import pydicom
import random

import tensorflow as tf
print(tf.__version__)
from tensorflow.keras.optimizers import RMSprop

from tqdm import tqdm
import cv2
from joblib import Parallel, delayed
import shutil

2.6.4


## Config

In [2]:
# Image resolution (pixels per side) used throughout the modeling:
# it selects the input PNG folder, the working directory and the network input shape.
target_size = 256


## UDFs

In [3]:
def img2roi(path):
    '''Crop an image to its largest bright region and return it with 3 channels.

    The image at `path` is read, converted to grayscale and binarized with a
    threshold of 20. The bounding box of the largest external contour is cut
    out, resized back to the dimensions of the input image and expanded to
    three identical channels.

    Args:
        path: location of the image file to read.

    Returns:
        Array of shape (height, width, 3) with the same height and width as
        the image stored at `path`. When no pixel exceeds the threshold the
        whole (uncropped) image is returned.

    Raises:
        FileNotFoundError: if OpenCV cannot read an image from `path`.
    '''
    img = cv2.imread(path)
    if img is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise FileNotFoundError(f'could not read an image from {path!r}')
    img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Binarize the image: pixels brighter than 20 become foreground.
    bin_img = cv2.threshold(img, 20, 255, cv2.THRESH_BINARY)[1]

    # Make contours around the binarized image, keep only the largest contour
    contours, _ = cv2.findContours(bin_img, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)

    if contours:
        contour = max(contours, key=cv2.contourArea)
        # boundingRect returns an inclusive box (x, y, width, height), so the
        # last row/column of the contour is kept and the crop is never empty,
        # even for a contour made of a single point.
        x, y, w, h = cv2.boundingRect(contour)
        roi = img[y:y + h, x:x + w]
    else:
        # Entirely dark image: nothing to crop, fall back to the full frame.
        roi = img

    # cv2.resize takes the target size as (width, height).
    roi = cv2.resize(roi, (img.shape[1], img.shape[0]))
    roi = cv2.cvtColor(roi, cv2.COLOR_GRAY2RGB)
    return roi

# function for plotting the accuracy and loss vs epochs
def plot_loss_acc(history):
    '''Plot training and validation accuracy and loss against the epoch index.

    Two figures are drawn: accuracy first, then loss.

    Args:
        history: object whose `history` attribute is a mapping holding the
            per-epoch sequences 'accuracy', 'val_accuracy', 'loss' and
            'val_loss' (e.g. the value returned by `model.fit`).
    '''
    acc = history.history['accuracy']
    val_acc = history.history['val_accuracy']
    loss = history.history['loss']
    val_loss = history.history['val_loss']

    epochs = range(len(acc))

    # Accuracy figure
    plt.plot(epochs, acc, 'bo', label='Training accuracy')
    plt.plot(epochs, val_acc, 'b', label='Validation accuracy')
    plt.title('Training and validation accuracy')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    # The legend must be added before a new figure is opened, otherwise the
    # labels above are never displayed.
    plt.legend()

    # Loss figure
    plt.figure()

    plt.plot(epochs, loss, 'bo', label='Training Loss')
    plt.plot(epochs, val_loss, 'b', label='Validation Loss')
    plt.title('Training and validation loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()

    plt.show()


List the files of the already downsized images:

In [4]:
# Kaggle input folder with the PNG images for the configured resolution
save_folder = f'/kaggle/input/rsna-breast-cancer-{target_size}-pngs'
# Names of all entries found in that folder
f_names = os.listdir(save_folder)
print(len(f_names))

54706


Download the pre-trained weights for the Inception-V3 model:

In [5]:
# Fetch the Inception-V3 weights (without the dense top) into /tmp.
# TLS certificate verification is left enabled so the downloaded weights
# cannot be silently swapped by a man-in-the-middle.
!wget \
    https://storage.googleapis.com/mledu-datasets/inception_v3_weights_tf_dim_ordering_tf_kernels_notop.h5 \
    -O /tmp/inception_v3_weights_tf_dim_ordering_tf_kernels_notop.h5

--2022-12-14 04:16:55--  https://storage.googleapis.com/mledu-datasets/inception_v3_weights_tf_dim_ordering_tf_kernels_notop.h5
Resolving storage.googleapis.com (storage.googleapis.com)... 173.194.214.128, 172.217.193.128, 108.177.13.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|173.194.214.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 87910968 (84M) [application/x-hdf]
Saving to: ‘/tmp/inception_v3_weights_tf_dim_ordering_tf_kernels_notop.h5’


2022-12-14 04:16:56 (154 MB/s) - ‘/tmp/inception_v3_weights_tf_dim_ordering_tf_kernels_notop.h5’ saved [87910968/87910968]



In [6]:
# Working copy of the data set, laid out as
#   <base_dir>/<train|validation>/<cancer|non-cancer>/
base_dir = f"/tmp/cancer_data_{target_size}"

train_dir = os.path.join(base_dir, 'train')
validation_dir = os.path.join(base_dir, 'validation')

# Class folders for the training images
train_cancer_dir = os.path.join(train_dir, 'cancer')
train_non_cancer_dir = os.path.join(train_dir, 'non-cancer')

# Class folders for the validation images
validation_cancer_dir = os.path.join(validation_dir, 'cancer')
validation_non_cancer_dir = os.path.join(validation_dir, 'non-cancer')

# Creating the four leaf folders also creates their missing parents
# (base_dir, train/ and validation/).
for class_dir in (train_cancer_dir,
                  train_non_cancer_dir,
                  validation_cancer_dir,
                  validation_non_cancer_dir):
    os.makedirs(class_dir, exist_ok=True)

In [10]:
# Competition metadata table; the columns read here are patient_id and image_id
df = pd.read_csv('/kaggle/input/rsna-breast-cancer-detection/train.csv')

n_patients = len(df['patient_id'].unique())
n_images = len(df['image_id'].unique().tolist())

print(f'shape of the train data: {df.shape}')
print(f"number of the patients: {n_patients}")
print(f"number of the unique images: {n_images} \n")
df.head()

shape of the train data: (54706, 14)
number of the patients: 11913
number of the unique images: 54706 



Unnamed: 0,site_id,patient_id,image_id,laterality,view,age,cancer,biopsy,invasive,BIRADS,implant,density,machine_id,difficult_negative_case
0,2,10006,462822612,L,CC,61.0,0,0,0,,0,,29,False
1,2,10006,1459541791,L,MLO,61.0,0,0,0,,0,,29,False
2,2,10006,1864590858,R,MLO,61.0,0,0,0,,0,,29,False
3,2,10006,1874946579,R,CC,61.0,0,0,0,,0,,29,False
4,2,10011,220375232,L,CC,55.0,0,0,0,0.0,0,,21,True


Crop each image and write it into the train/validation class folders expected by TensorFlow's `flow_from_directory`:

In [None]:
# Split by patient (not by image) so that all images of one patient end up in
# the same subset.
random.seed(1)
patient_ids = df['patient_id'].unique().tolist()
N = len(patient_ids)  # lower this (e.g. 1000) to experiment on a subset of patients
sample_ids = random.sample(patient_ids, N)
split_ratio = 0.8
n_train = int(split_ratio * len(sample_ids))
train_ids = sample_ids[:n_train]
validation_ids = sample_ids[n_train:]

# Sets and a dict give constant-time lookups inside the loop below, instead of
# scanning a list and the whole DataFrame for every file.
train_id_set = set(train_ids)
validation_id_set = set(validation_ids)
# drop_duplicates keeps the first row of each image_id.
cancer_by_image = (
    df.drop_duplicates('image_id')
      .set_index('image_id')['cancer']
      .to_dict()
)

# Output folder for every (subset, cancer label) combination.
target_dirs = {
    ('train', 1): train_cancer_dir,
    ('train', 0): train_non_cancer_dir,
    ('validation', 1): validation_cancer_dir,
    ('validation', 0): validation_non_cancer_dir,
}

skipped = 0
for f_name in tqdm(f_names):
    # File names are parsed as '<patient_id>_<image_id><extension>'.
    try:
        patient_part, image_part = os.path.splitext(f_name)[0].split('_')
        patient_id, image_id = int(patient_part), int(image_part)
    except ValueError:
        # Name does not follow the expected pattern.
        skipped += 1
        continue

    if patient_id in train_id_set:
        subset = 'train'
    elif patient_id in validation_id_set:
        subset = 'validation'
    else:
        subset = None

    # None when the patient was not sampled, the image is missing from the
    # table, or the label is neither 0 nor 1.
    out_dir = target_dirs.get((subset, cancer_by_image.get(image_id)))
    if out_dir is None:
        skipped += 1
        continue

    # Store the cropped image under its subset/class folder.
    cv2.imwrite(os.path.join(out_dir, f_name), img2roi(os.path.join(save_folder, f_name)))

if skipped:
    print(f'skipped {skipped} files that could not be assigned to a subset/class')


Import Inception-V3, load the pre-trained weights, and freeze its layers since they do not need to be retrained:

In [None]:
from tensorflow.keras import layers
from tensorflow.keras.applications.inception_v3 import InceptionV3

# Location of the weights file fetched by the wget cell
local_weights_file = '/tmp/inception_v3_weights_tf_dim_ordering_tf_kernels_notop.h5'

# Build the bare architecture for square RGB inputs: no dense top and no
# weights yet, because they are loaded from the local file right after.
pre_trained_model = InceptionV3(
    input_shape=(target_size, target_size, 3),
    include_top=False,
    weights=None,
)
pre_trained_model.load_weights(local_weights_file)

# Mark every layer of the base model as non-trainable.
for base_layer in pre_trained_model.layers:
    base_layer.trainable = False

In [None]:
# Take the layer named `mixed7` as the last layer of the base model
last_layer = pre_trained_model.get_layer('mixed7')
last_output = last_layer.output
print('last layer output shape: ', last_layer.output_shape)

In [None]:
from tensorflow.keras.applications.inception_v3 import preprocess_input
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# The pre-trained InceptionV3 weights expect pixels scaled to [-1, 1], which is
# what inception_v3.preprocess_input does; a plain 1/255 rescale would feed the
# frozen layers inputs in [0, 1] instead.
train_datagen = ImageDataGenerator(preprocessing_function=preprocess_input)
test_datagen  = ImageDataGenerator(preprocessing_function=preprocess_input)

# Both generators read one sub-folder per class and yield batches of 50 images
# with binary labels.
train_generator = train_datagen.flow_from_directory(train_dir,
                                                    batch_size=50,
                                                    class_mode='binary',
                                                    target_size=(target_size, target_size))
validation_generator =  test_datagen.flow_from_directory(validation_dir,
                                                         batch_size=50,
                                                         class_mode  = 'binary',
                                                         target_size = (target_size, target_size))

Function to create a model:

In [None]:
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras import Model

def create_model(pre_trained_model, last_layer_name='mixed7', hidden_units=1024, dropout_rate=0.2):
    '''Attach a binary-classification head to an intermediate layer of a base model.

    Args:
        pre_trained_model: base Keras model providing `get_layer` and `input`.
        last_layer_name: name of the base-model layer whose output feeds the
            new head.
        hidden_units: width of the fully connected ReLU layer.
        dropout_rate: dropout rate applied after the fully connected layer.

    Returns:
        A `Model` mapping the base model's input to a single sigmoid output.
    '''
    last_output = pre_trained_model.get_layer(last_layer_name).output

    # Flatten the feature map to 1 dimension
    x = layers.Flatten()(last_output)
    # Fully connected layer with ReLU activation
    x = layers.Dense(hidden_units, activation='relu')(x)
    x = layers.Dropout(dropout_rate)(x)
    # Final sigmoid unit for binary classification
    x = layers.Dense(1, activation='sigmoid')(x)

    # Append the dense network to the base model
    return Model(pre_trained_model.input, x)


In [None]:
model = create_model(pre_trained_model)
model.compile(loss='binary_crossentropy',
              optimizer=RMSprop(learning_rate=1e-4),
              metrics=['accuracy','Recall','Precision'])
# model.summary()

# batch_size is deliberately not passed to fit(): the generators already yield
# batches (batch_size=50 in flow_from_directory), and the Keras documentation
# says not to specify it when the input is a generator.
history = model.fit(
            train_generator,
            epochs=5,
            validation_data=validation_generator,
            verbose=1
            )

Look at the results and model performance

In [None]:
# Plot the per-epoch training/validation accuracy and loss stored in `history`
plot_loss_acc(history)

### Save the model

In [None]:
# Only the weights are written here, not the full model; to reuse them,
# rebuild the same architecture (create_model) and load this file into it.
model.save_weights('/kaggle/working/inceptionv3-.h5')