<a href="https://colab.research.google.com/github/arvindhvasu/GL_Pneumonia_Detection/blob/Dev_Arvindh/code/model_densenet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#**SUMMARY**

Filename    : model.ipynb   
Description : The file contains logic for creating model

Below steps were performed for data preparation,   
      1.   Building a pneumonia detection model starting from basic CNN and then   improving upon it.   
      2.   Train the model     
      3.   To deal with large training time, save the weights so that you can use them when training the model for the second time without starting from scratch.   


**Revision History**  
Date        ||       Description               ||              Author  
07-06-2020   ||   Initial logic for model creation   ||   Arvindh     
10-06-2020   ||   Model with Unet architecture   ||   Arvindh     
13-06-2020   ||   Model Testing - with test images  ||   Arvindh 
  



#**Import Libraries**

In [24]:
%tensorflow_version 2.x

In [25]:
import tensorflow
import keras
from keras import backend as k
print("Backend: " + k.backend() + " || tensorflow version: " + tensorflow.__version__)

Backend: tensorflow || tensorflow version: 2.2.0


In [26]:
import pandas as pd 
import numpy as np
from matplotlib.pyplot import figure, show
import os
import matplotlib
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')


In [27]:
!pip3 install pydicom
import pydicom
from skimage.transform import resize
import pickle
import cv2



In [28]:
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau

#Mount Google Drive

In [29]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [30]:
#Set your project path 
project_path =  'drive/My Drive/Colab Notebooks/Capstone'

In [None]:
#changed the project base path
os.chdir(project_path)

In [32]:
%ls

[0m[01;34mdataset[0m/           model_Unet_1.h5        train_df_pickle
[01;34mdataset_new[0m/       model_Unet.h5          UNetW.h5
model_Densenet.h5  train_class_df_pickle  valid_df_pickle


In [34]:
#Set your data path 
train_img_path = 'dataset/stage_2_train_images'
valid_img_path = 'dataset/stage_2_train_images'
test_img_path = 'dataset/stage_2_test_images'

In [35]:
train_df_pick = "train_df_pickle"
valid_df_pick = "valid_df_pickle"

# we open the file for reading
fileObject = open(train_df_pick,'rb')  
fileObject1 = open(valid_df_pick,'rb')  

# load the object from the file into temp var b
train_df = pickle.load(fileObject)  
valid_df = pickle.load(fileObject1) 

In [36]:
train_patient_Ids = train_df.patientId.unique()
print(train_df.shape, 'training data')
print("The number of unique patient id in the training dataset is "+ str(train_patient_Ids.shape[0]))

(24181, 10) training data
The number of unique patient id in the training dataset is 21908


Splitted the Validation dataset into Validation and Test dataset with the ratio of 95:5 based on the 'Target' column.

In [37]:
from sklearn.model_selection import train_test_split
valid_train_df, valid_test_df = train_test_split(valid_df, test_size=0.05,
                                    stratify=valid_df['Target'])
valid_train_patient_Ids = valid_train_df.patientId.unique()
valid_test_patient_Ids = valid_test_df.patientId.unique()
print(valid_train_df.shape, 'validation data')
print("The number of unique patient id in the validation dataset is "+ str(valid_train_patient_Ids.shape[0]))
print("---------------------------------------------------------------------")
print(valid_test_df.shape, 'test data')
print("The number of unique patient id in the test dataset is "+ str(valid_test_patient_Ids.shape[0]))

(5743, 10) validation data
The number of unique patient id in the validation dataset is 5627
---------------------------------------------------------------------
(303, 10) test data
The number of unique patient id in the test dataset is 302


#Custom Data generator

In [38]:
class Generator(keras.utils.Sequence):
  def __init__(self, unique_patient_ids, folder, dataframe, batch_size=100, shuffle=False, image_size=256, predict=False):
        self.unique_patient_ids = unique_patient_ids
        self.folder = folder
        self.dataframe = dataframe
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.image_size = image_size
        self.predict = predict
        self.on_epoch_end()
   
  def __load__(self, pat_id):
        patIdWithExt = pat_id+'.dcm'
        # load dicom file from pixel array
        img = pydicom.dcmread(os.path.join(self.folder, patIdWithExt)).pixel_array
        # create empty array
        msk = np.zeros(img.shape) 
        target = 0
        if pat_id in self.dataframe["patientId"].values:
            pat_info = self.dataframe[self.dataframe.patientId == pat_id]

            # loop through patient rows in the df
            for info in pat_info.iterrows():
                row = info[1]
                if row.Target == 1:
                    x = int(row.x)
                    y = int(row.y)
                    msk[y: int(row.y) + int(row.height), x: int(row.x) + int(row.width)] = 1 
                    target = 1
      
        img = cv2.resize(img, (self.image_size, self.image_size))
        msk = cv2.resize(msk, (self.image_size, self.image_size))
        img = np.expand_dims(img, -1)
        msk = np.expand_dims(msk, -1)   
        '''
        img = np.dstack([img, img, img])
        img = img.astype(np.float32)/255.
        msk = np.dstack([msk, msk, msk])
        msk = msk.astype(np.float32)/255.
        '''
        return img, msk

  def __loadpredict__(self, pat_id):
        patIdWithExt = pat_id+'.dcm'
        # load dicom file from pixel array
        img = pydicom.dcmread(os.path.join(self.folder, patIdWithExt)).pixel_array
        # resize image
        img = resize(img, (self.image_size, self.image_size), mode='reflect')
        # add trailing channel dimension
        img = np.expand_dims(img, -1)
        return img

  def __getitem__(self, index):
        # select pat_ids by batches
        batch_pat_ids = self.unique_patient_ids[index*self.batch_size : (index+1)*self.batch_size]

        if self.predict:
            # load files
            imgs = [self.__loadpredict__(pat_id_row) for pat_id_row in batch_pat_ids]
            # create numpy batch
            imgs = np.array(imgs)
            return imgs, batch_pat_ids
        else:    
            # load pat ids by loop
            items = [self.__load__(pat_id_row) for pat_id_row in batch_pat_ids]
            imgs, msks = zip(*items)

            # create numpy batch
            imgs = np.array(imgs)
            msks = np.array(msks)
        return imgs, msks

  def __len__(self):
        if self.predict:
            return int(np.ceil(len(self.unique_patient_ids) / self.batch_size))
        else:
            # __len__ provides number of batches per epoch
            return int(np.floor(self.unique_patient_ids.shape[0] / self.batch_size))

  def on_epoch_end(self):
        if self.predict:
            self.indexes = np.arange(len(self.unique_patient_ids))
        else:
            self.indexes = np.arange(self.unique_patient_ids.shape[0])
        if self.shuffle == True:
              np.random.shuffle(self.indexes)

In [39]:
class DataGenerator(keras.utils.Sequence):
  def __init__(self, unique_patient_ids, folder, dataframe, batch_size=100, shuffle=False, image_size=256, predict=False):
        self.unique_patient_ids = unique_patient_ids
        self.folder = folder
        self.dataframe = dataframe
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.image_size = image_size
        self.predict = predict
        self.on_epoch_end()
   
  def __load__(self, pat_id):
        patIdWithExt = pat_id+'.dcm'
        # load dicom file from pixel array
        img = pydicom.dcmread(os.path.join(self.folder, patIdWithExt)).pixel_array
        # create empty array
        msk = np.zeros(img.shape) 
        labels = np.empty((self.batch_size), dtype = int)
        target = 0
        if pat_id in self.dataframe["patientId"].values:
            pat_info = self.dataframe[self.dataframe.patientId == pat_id]

            # loop through patient rows in the df
            for info in pat_info.iterrows():
                row = info[1]
                if row.Target == 1:
                    x = int(row.x)
                    y = int(row.y)
                    msk[y: int(row.y) + int(row.height), x: int(row.x) + int(row.width)] = 1 
                    target = 1
        
        img = cv2.resize(img, (self.image_size, self.image_size))
        msk = cv2.resize(msk, (self.image_size, self.image_size))
        img = np.expand_dims(img, -1)
        msk = np.expand_dims(msk, -1)   
        
        img = np.dstack([img, img, img])
        img = img.astype(np.float32)/255.
        msk = np.dstack([msk, msk, msk])
        msk = msk.astype(np.float32)/255.
        
        return img, target

  def __loadpredict__(self, pat_id):
        patIdWithExt = pat_id+'.dcm'
        # load dicom file from pixel array
        img = pydicom.dcmread(os.path.join(self.folder, patIdWithExt)).pixel_array
        # resize image
        img = resize(img, (self.image_size, self.image_size), mode='reflect')
        # add trailing channel dimension
        img = np.expand_dims(img, -1)
        return img

  def __getitem__(self, index):
        # select pat_ids by batches
        batch_pat_ids = self.unique_patient_ids[index*self.batch_size : (index+1)*self.batch_size]

        if self.predict:
            # load files
            imgs = [self.__loadpredict__(pat_id_row) for pat_id_row in batch_pat_ids]
            # create numpy batch
            imgs = np.array(imgs)
            return imgs, batch_pat_ids
        else:    
            # load pat ids by loop
            items = [self.__load__(pat_id_row) for pat_id_row in batch_pat_ids]
            imgs, msks = zip(*items)

            # create numpy batch
            imgs = np.array(imgs)
            msks = np.array(msks)
        return imgs, msks

  def __len__(self):
        if self.predict:
            return int(np.ceil(len(self.unique_patient_ids) / self.batch_size))
        else:
            # __len__ provides number of batches per epoch
            return int(np.floor(self.unique_patient_ids.shape[0] / self.batch_size))

  def on_epoch_end(self):
        if self.predict:
            self.indexes = np.arange(len(self.unique_patient_ids))
        else:
            self.indexes = np.arange(self.unique_patient_ids.shape[0])
        if self.shuffle == True:
              np.random.shuffle(self.indexes)

In [40]:
BATCH_SIZE = 100

#**Model Building**

#**Densenet121**

In [41]:
from keras.layers import GlobalAveragePooling2D, Dense, Dropout, Flatten, Input, Conv2D, GlobalAvgPool2D, AveragePooling2D, BatchNormalization, UpSampling2D
from keras.models import Sequential, Model
from keras.optimizers import Adam

In [42]:
from keras.applications.densenet import DenseNet121

In [19]:
base_model = DenseNet121(weights=None, include_top=False, input_shape=(224, 224,3))

for layer in base_model.layers[:-8]:
      layer.trainable = False
for layer in base_model.layers[-8:]:
      layer.trainable = True

x = GlobalAvgPool2D()(base_model.output)

x= BatchNormalization()(x)
x= Dropout(0.5)(x)
x= Dense(1024,activation='relu')(x) 
x= Dense(512,activation='relu')(x) 
x= BatchNormalization()(x)
x= Dropout(0.5)(x)

output = Dense(2, activation='softmax', name='predictions')(x)
model = Model(inputs=base_model.input, output=output)

In [20]:
checkpoint = ModelCheckpoint("model_Densenet.h5", monitor="val_loss", verbose=1,
                             mode="auto", period=1)
stop = EarlyStopping(monitor="val_loss", patience=10, mode="auto")
reduce_lr = ReduceLROnPlateau(monitor="val_loss", factor=0.2, patience=5, min_lr=1e-6, verbose=1, mode="auto")

model.compile(optimizer = Adam(lr = 1e-4), loss = 'sparse_categorical_crossentropy',
                           metrics = ['accuracy'])
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 224, 224, 3)  0                                            
__________________________________________________________________________________________________
zero_padding2d_1 (ZeroPadding2D (None, 230, 230, 3)  0           input_1[0][0]                    
__________________________________________________________________________________________________
conv1/conv (Conv2D)             (None, 112, 112, 64) 9408        zero_padding2d_1[0][0]           
__________________________________________________________________________________________________
conv1/bn (BatchNormalization)   (None, 112, 112, 64) 256         conv1/conv[0][0]                 
____________________________________________________________________________________________

In [21]:
# Train and validation generator for DenseNet121.
train_gen_1 = DataGenerator(train_patient_Ids, train_img_path, train_df, batch_size=BATCH_SIZE, image_size=224)
valid_gen_1 = DataGenerator(valid_train_patient_Ids, valid_img_path, valid_train_df, batch_size=BATCH_SIZE, image_size=224)

In [23]:
history1 = model.fit_generator(train_gen_1, 
                              validation_data=valid_gen_1, 
                              callbacks=[checkpoint, reduce_lr, stop],
                              epochs=4, 
                              workers=13, 
                              steps_per_epoch=len(train_gen_1),
                              validation_steps= len(valid_gen_1)//BATCH_SIZE,
                              use_multiprocessing=True)

Epoch 1/4

Epoch 00001: saving model to model_Densenet.h5
Epoch 2/4

Epoch 00002: saving model to model_Densenet.h5
Epoch 3/4

Epoch 00003: saving model to model_Densenet.h5
Epoch 4/4

Epoch 00003: saving model to model_Densenet.h5

Epoch 00004: saving model to model_Densenet.h5


In [45]:
from keras.models import load_model
model = load_model('model_Densenet.h5')
model.compile(optimizer = Adam(lr = 1e-4), loss = 'sparse_categorical_crossentropy',
                           metrics = ['accuracy'])
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 224, 224, 3)  0                                            
__________________________________________________________________________________________________
zero_padding2d_1 (ZeroPadding2D (None, 230, 230, 3)  0           input_1[0][0]                    
__________________________________________________________________________________________________
conv1/conv (Conv2D)             (None, 112, 112, 64) 9408        zero_padding2d_1[0][0]           
__________________________________________________________________________________________________
conv1/bn (BatchNormalization)   (None, 112, 112, 64) 256         conv1/conv[0][0]                 
____________________________________________________________________________________________

In [49]:
test_gen_1 = DataGenerator(valid_test_patient_Ids[:10], valid_img_path, valid_test_df, batch_size=1, image_size=224, predict=False)

In [65]:
## Dataset for prediction
x, y = test_gen_1.__getitem__(3)
result = model.predict(x)

In [67]:
print(result[0])
print(np.argmax(result[0]))

[0.97516805 0.02483191]
0


#**UNET with backbone RESNET34**

In [None]:
!pip install segmentation_models

Collecting segmentation_models
  Downloading https://files.pythonhosted.org/packages/da/b9/4a183518c21689a56b834eaaa45cad242d9ec09a4360b5b10139f23c63f4/segmentation_models-1.0.1-py3-none-any.whl
Collecting image-classifiers==1.0.0
  Downloading https://files.pythonhosted.org/packages/81/98/6f84720e299a4942ab80df5f76ab97b7828b24d1de5e9b2cbbe6073228b7/image_classifiers-1.0.0-py3-none-any.whl
Collecting efficientnet==1.0.0
  Downloading https://files.pythonhosted.org/packages/97/82/f3ae07316f0461417dc54affab6e86ab188a5a22f33176d35271628b96e0/efficientnet-1.0.0-py3-none-any.whl
Installing collected packages: image-classifiers, efficientnet, segmentation-models
Successfully installed efficientnet-1.0.0 image-classifiers-1.0.0 segmentation-models-1.0.1


In [None]:
from segmentation_models import Unet

Segmentation Models: using `keras` framework.


In [None]:
# Train and validation generator for Pretrained Model with UNET.
train_gen = Generator(train_patient_Ids[:300], train_img_path, train_df, batch_size=BATCH_SIZE, image_size=224)
valid_gen = Generator(valid_train_patient_Ids[:150], valid_img_path, valid_train_df, batch_size=BATCH_SIZE, image_size=224)

In [None]:
# prepare model
base_model = Unet(backbone_name='resnet34', encoder_weights='imagenet')

inp = Input(shape=(224, 224, 1))
l1 = Conv2D(1, (1, 1))(inp) # map N channels data to 3 channels
out = base_model(l1)

resnet50_model = Model(inp, out, name=base_model.name)
resnet50_model.compile('Adam', 'binary_crossentropy', ['binary_accuracy'])

In [None]:
resnet50_model.summary()


Model: "model_21"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 224, 224, 1)       0         
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 224, 224, 1)       2         
_________________________________________________________________
model_21 (Model)             multiple                  24456154  
Total params: 24,456,156
Trainable params: 24,438,806
Non-trainable params: 17,350
_________________________________________________________________


In [None]:
res_checkpoint = ModelCheckpoint("model_Resnet.h5", monitor="val_loss", verbose=1,
                             mode="auto", period=1)
res_stop = EarlyStopping(monitor="val_loss", patience=10, mode="auto")
res_reduce_lr = ReduceLROnPlateau(monitor="val_loss", factor=0.2, patience=5, min_lr=1e-6, verbose=1, mode="auto")

res_history = resnet50_model.fit_generator(train_gen, 
                              validation_data=valid_gen, 
                              callbacks=[res_checkpoint, res_reduce_lr, res_stop],
                              epochs=2, 
                              workers=5, 
                              steps_per_epoch=len(train_gen),
                              validation_steps= len(valid_gen)//BATCH_SIZE,
                              use_multiprocessing=True)

Epoch 1/2

Epoch 00001: saving model to model_Resnet.h5
Epoch 2/2

Epoch 00002: saving model to model_Resnet.h5


#**Note** The model with Densenet architecture has given good accuracy in the training and validation dataset.
Training - Accuracy: -%, Loss: -%   
Validation - Accuracy: -%, Loss: -%



In [None]:
plt.figure(figsize=(12,4))
plt.subplot(131)
plt.plot(history.epoch, history.history["loss"], label="Train loss")
plt.plot(history.epoch, history.history["val_loss"], label="Valid loss")
plt.legend()
plt.subplot(132)
plt.plot(history.epoch, history.history["accuracy"], label="Train accuracy")
plt.plot(history.epoch, history.history["val_accuracy"], label="Valid accuracy")
plt.legend()
plt.show()

#**Model Testing**

**Note** The Model has been tested for few test samples for to see the model predictions and compare with the model training.

In [None]:
test_gen = Generator(valid_test_patient_Ids[:10], valid_img_path, valid_test_df, batch_size=1, image_size=256, predict=False)

In [None]:
sample_df=valid_test_patient_Ids[:10]
sample_df[1:2]

array(['becced78-9df7-4e9a-94c8-7cb750d41507'], dtype=object)

In [None]:
## Dataset for prediction
x, y = test_gen.__getitem__(1)
result = model.predict(x)

In [None]:
valid_test_df[valid_test_df.patientId == 'becced78-9df7-4e9a-94c8-7cb750d41507']

Unnamed: 0,patientId,x,y,width,height,Target,class,PatientAge,PatientSex,ViewPosition
26694,becced78-9df7-4e9a-94c8-7cb750d41507,568.0,119.0,190.0,498.0,1,Lung Opacity,52,F,AP


In [None]:
fig = plt.figure()
fig.subplots_adjust(hspace=0.4, wspace=0.4)

ax = fig.add_subplot(1, 3, 1)
ax.imshow(np.reshape(x[0], (256, 256)), cmap="gray")
ax = fig.add_subplot(1, 3, 2)
ax.imshow(np.reshape(y[0], (256, 256)), cmap="gray")
ax = fig.add_subplot(1, 3, 3)
ax.imshow(np.reshape(result[0], (256, 256)), cmap="gray")