<a href="https://colab.research.google.com/github/acg12/corona-disease-detector/blob/master/experiments/Main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup the Environment

In [1]:
!pip install openpyxl
!pip install kaggle



In [6]:
!rm -r ~/.kaggle
!mkdir ~/.kaggle
!mv ./kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [9]:
!kaggle datasets download -d tawsifurrahman/covid19-radiography-database

Downloading covid19-radiography-database.zip to /content
 99% 739M/745M [00:04<00:00, 165MB/s]
100% 745M/745M [00:04<00:00, 172MB/s]


In [10]:
!unzip -q /content/covid19-radiography-database.zip

In [11]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.model_selection import train_test_split

# For visualizing images
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.image as mpimg
import random
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# For augmenting data
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# For modelling
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dense, Flatten, Conv2D, MaxPool2D, Activation
from tensorflow.keras import Sequential, layers
import tensorflow_hub as hub
from sklearn.model_selection import StratifiedKFold

# For evaluation
from tensorflow.keras.models import load_model
from sklearn.metrics import classification_report, confusion_matrix, roc_curve

import os

# Prepare the Data

In [14]:
covid_df = pd.read_excel("/content/COVID-19_Radiography_Dataset/COVID.metadata.xlsx")
normal_df = pd.read_excel("/content/COVID-19_Radiography_Dataset/Normal.metadata.xlsx")

In [15]:
covid_df['TARGET'] = 'Positive'
normal_df['TARGET'] = 'Negative'
normal_df.head()

Unnamed: 0,FILE NAME,FORMAT,SIZE,URL,TARGET
0,NORMAL-1,PNG,256*256,https://www.kaggle.com/c/rsna-pneumonia-detect...,Negative
1,NORMAL-2,PNG,256*256,https://www.kaggle.com/c/rsna-pneumonia-detect...,Negative
2,NORMAL-3,PNG,256*256,https://www.kaggle.com/c/rsna-pneumonia-detect...,Negative
3,NORMAL-4,PNG,256*256,https://www.kaggle.com/c/rsna-pneumonia-detect...,Negative
4,NORMAL-5,PNG,256*256,https://www.kaggle.com/c/rsna-pneumonia-detect...,Negative


In [16]:
covid_df['FILE NAME'] = covid_df.apply(lambda x: "COVID/" + x['FILE NAME'] + "." + str.lower(x['FORMAT']), axis=1)
normal_df['FILE NAME'] = normal_df.apply(lambda x: "Normal/" + str.capitalize(x['FILE NAME']) + "." + str.lower(x['FORMAT']), axis=1)
normal_df.head()

Unnamed: 0,FILE NAME,FORMAT,SIZE,URL,TARGET
0,Normal/Normal-1.png,PNG,256*256,https://www.kaggle.com/c/rsna-pneumonia-detect...,Negative
1,Normal/Normal-2.png,PNG,256*256,https://www.kaggle.com/c/rsna-pneumonia-detect...,Negative
2,Normal/Normal-3.png,PNG,256*256,https://www.kaggle.com/c/rsna-pneumonia-detect...,Negative
3,Normal/Normal-4.png,PNG,256*256,https://www.kaggle.com/c/rsna-pneumonia-detect...,Negative
4,Normal/Normal-5.png,PNG,256*256,https://www.kaggle.com/c/rsna-pneumonia-detect...,Negative


In [17]:
df = pd.concat([normal_df[['FILE NAME', 'TARGET']], 
                covid_df[['FILE NAME','TARGET']]], ignore_index=True)
df.head()

Unnamed: 0,FILE NAME,TARGET
0,Normal/Normal-1.png,Negative
1,Normal/Normal-2.png,Negative
2,Normal/Normal-3.png,Negative
3,Normal/Normal-4.png,Negative
4,Normal/Normal-5.png,Negative


In [18]:
train, test = train_test_split(df, test_size=0.2, stratify=df['TARGET'], random_state=42)
print(train.shape)
print(test.shape)

(11046, 2)
(2762, 2)


# Augmentation

In [19]:
train_df, valid_df = train_test_split(train, test_size=0.2, stratify=train['TARGET'], random_state=42)
print(train_df.shape)
print(valid_df.shape)

(8836, 2)
(2210, 2)


In [59]:
data_list = [
    [
        'Negative',
        train_df['TARGET'].value_counts()[0],
        valid_df['TARGET'].value_counts()[0],
        test['TARGET'].value_counts()[0],
    ],
    [
        'Positive',
        train_df['TARGET'].value_counts()[1],
        valid_df['TARGET'].value_counts()[1],
        test['TARGET'].value_counts()[1], 
    ]
]
data_ov = pd.DataFrame(data_list, columns=['Class', 'Training data', 'Validation data', 'Test data'])
data_ov.loc['Total'] = data_ov.sum(axis=0, numeric_only=True)
data_ov['Total'] = data_ov.sum(axis=1, numeric_only=True)
data_ov

Unnamed: 0,Class,Training data,Validation data,Test data,Total
0,Negative,6522.0,1631.0,2039.0,10192.0
1,Positive,2314.0,579.0,723.0,3616.0
Total,,8836.0,2210.0,2762.0,13808.0


In [None]:
{train_df.head()

Unnamed: 0,FILE NAME,TARGET
13528,COVID/COVID-3337.png,Positive
11039,COVID/COVID-848.png,Positive
4776,Normal/Normal-4777.png,Negative
12061,COVID/COVID-1870.png,Positive
4287,Normal/Normal-4288.png,Negative


In [None]:
DATA_DIR = "/content/COVID-19_Radiography_Dataset"
IMAGE_SIZE = (224, 224)

non_aug_gen = ImageDataGenerator(rescale=1/255.)
aug_gen = ImageDataGenerator(
    rescale=1./255,
    width_shift_range=0.15,
    height_shift_range=0.15,
    zoom_range=[0.9, 1.25],
    brightness_range=[0.5, 1.5]
)

aug_train = aug_gen.flow_from_dataframe(
    dataframe=train_df,
    directory=DATA_DIR,
    x_col='FILE NAME',
    y_col='TARGET',
    target_size=IMAGE_SIZE,
    batch_size=32,
    class_mode='binary', 
    color_mode='rgb',
    shuffle=True
)

non_aug_valid = non_aug_gen.flow_from_dataframe(
    dataframe=valid_df,
    directory=DATA_DIR,
    x_col='FILE NAME',
    y_col='TARGET',
    target_size=IMAGE_SIZE,
    batch_size=32,
    class_mode='binary', 
    color_mode='rgb',
    shuffle=True
)

test_valid_gen = non_aug_gen.flow_from_dataframe(
    dataframe=valid_df,
    directory=DATA_DIR,
    x_col='FILE NAME',
    y_col='TARGET',
    target_size=IMAGE_SIZE,
    batch_size=32,
    class_mode='binary', 
    color_mode='rgb',
    shuffle=False
)

test_test_gen = non_aug_gen.flow_from_dataframe(
    dataframe=test,
    directory=DATA_DIR,
    x_col='FILE NAME',
    y_col='TARGET',
    target_size=IMAGE_SIZE,
    batch_size=32,
    class_mode='binary', 
    color_mode='rgb',
    shuffle=False
)

Found 8836 validated image filenames belonging to 2 classes.
Found 2210 validated image filenames belonging to 2 classes.
Found 2210 validated image filenames belonging to 2 classes.
Found 2762 validated image filenames belonging to 2 classes.


# Modelling

## Baseline

In [None]:
metrics = [
    keras.metrics.TruePositives(name='tp'),
    keras.metrics.FalsePositives(name='fp'),
    keras.metrics.TrueNegatives(name='tn'),
    keras.metrics.FalseNegatives(name='fn'), 
    'accuracy',
    keras.metrics.Precision(name='precision'),
    keras.metrics.Recall(name='recall'),
    keras.metrics.AUC(name='auc', curve='ROC')
]
callbacks = [tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True),
             tf.keras.callbacks.ModelCheckpoint('model_baseline.h5', save_best_only=True, monitor='val_loss')]

In [None]:
tf.random.set_seed(42)
keras.backend.clear_session()

model = Sequential([
    Conv2D(filters=10, 
           kernel_size=3, 
           strides=1,
           padding='valid',
           activation='relu', 
           input_shape=IMAGE_SIZE + (3,)),
    Conv2D(10, 3, activation='relu'),
    Conv2D(10, 3, activation='relu'),
    Flatten(),
    Dense(1, activation='sigmoid')
])

# Compile and fit
model.compile(loss='binary_crossentropy',
                optimizer=Adam(),
                metrics=metrics)

history = model.fit(aug_train,
                    epochs=10,
                    validation_data=non_aug_valid,
                    steps_per_epoch=len(aug_train),
                    validation_steps=len(non_aug_valid),
                    callbacks=callbacks)

np.save('history_baseline.npy', history.history)

Epoch 1/10
Epoch 2/10
Epoch 3/10


## EfficientNetV2-B0

In [None]:
def create_model(model_url, num_classes=1):
    """Takes a TensorFlow Hub URL and creates a Keras Sequential model with it.

    Args:
    model_url (str): A TensorFlow Hub feature extraction URL.
    num_classes (int): Number of output neurons in output layer,
    should be equal to number of target classes, default 10.

    Returns:
    An uncompiled Keras Sequential model with model_url as feature
    extractor layer and Dense output layer with num_classes outputs.
    """
    # Download the pretrained model and save it as a Keras layer
    feature_extractor_layer = hub.KerasLayer(model_url,
                                          trainable=False, # freeze the underlying patterns
                                          name='feature_extraction_layer',
                                          input_shape=(224, 224, 3,)) # define the input image shape

    # Create our own model
    model = tf.keras.Sequential([
      feature_extractor_layer, # use the feature extraction layer as the base
      layers.Dense(num_classes, activation='sigmoid', name='output_layer') # create our own output layer      
    ])

    return model

In [None]:
efficientnetv2_url = 'https://tfhub.dev/google/imagenet/efficientnet_v2_imagenet21k_ft1k_b0/feature_vector/2'

In [None]:
# Add reduced LR to callbacks
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor="val_loss",  
                                                 factor=0.2, # multiply the learning rate by 0.2 (reduce by 5x)
                                                 patience=4,
                                                 verbose=1, # print out when learning rate goes down 
                                                 min_lr=1e-7)
checkpoint_eff = tf.keras.callbacks.ModelCheckpoint('model_efficientnet.h5', save_best_only=True, monitor='val_loss')

In [None]:
tf.random.set_seed(42)

# Build model
model_efficientnet = create_model(efficientnetv2_url)

# Compile model
model_efficientnet.compile(loss='binary_crossentropy',
                           optimizer=Adam(),
                           metrics=metrics)

model_efficientnet.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 feature_extraction_layer (K  (None, 1280)             5919312   
 erasLayer)                                                      
                                                                 
 output_layer (Dense)        (None, 1)                 1281      
                                                                 
Total params: 5,920,593
Trainable params: 1,281
Non-trainable params: 5,919,312
_________________________________________________________________


In [None]:
# Train the model
history_efficientnet = model_efficientnet.fit(aug_train, epochs=50,
                                              validation_data=non_aug_valid,
                                              steps_per_epoch=len(aug_train),
                                              validation_steps=len(non_aug_valid),
                                              callbacks=[reduce_lr, checkpoint_eff])

np.save('history_efficientnet.npy', history_efficientnet.history)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 00042: ReduceLROnPlateau reducing learning rate to 0.00020000000949949026.
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 00046: ReduceLROnPlateau reducing learning rate to 4.0000001899898055e-05.
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Epoch 00050: ReduceLROnPlateau reducing learning rate to 8.000000525498762e-06.


## ResNetV2-50

In [None]:
checkpoint_res = tf.keras.callbacks.ModelCheckpoint('model_resnet.h5', save_best_only=True, monitor='val_loss')

In [None]:
tf.random.set_seed(42)

# Get the model
base_model = tf.keras.applications.ResNet50V2(include_top=False)
base_model.trainable = False

# Build the model
inputs = layers.Input(shape=(224, 224, 3), name='input_layer')
out = base_model(inputs, training=False)
out = layers.GlobalMaxPooling2D()(out)
out = layers.Dense(1, activation='sigmoid', name='output_layer')(out)
model_resnet = keras.Model(inputs, out)

# Compile model
model_resnet.compile(loss='binary_crossentropy',
                    optimizer=Adam(),
                    metrics=metrics)

model_resnet.summary()

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50v2_weights_tf_dim_ordering_tf_kernels_notop.h5
Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_layer (InputLayer)    [(None, 224, 224, 3)]     0         
                                                                 
 resnet50v2 (Functional)     (None, None, None, 2048)  23564800  
                                                                 
 global_max_pooling2d (Globa  (None, 2048)             0         
 lMaxPooling2D)                                                  
                                                                 
 output_layer (Dense)        (None, 1)                 2049      
                                                                 
Total params: 23,566,849
Trainable params: 2,049
Non-trainable params: 23,564,800
________________________________

In [None]:
# Train the model
history_resnet = model_resnet.fit(aug_train, epochs=50,
                                  validation_data=non_aug_valid,
                                  steps_per_epoch=len(aug_train),
                                  validation_steps=len(non_aug_valid),
                                  callbacks=[reduce_lr, checkpoint_res])

np.save('history_resnet.npy', history_resnet.history)

Epoch 1/50


Custom mask layers require a config and must override get_config. When loading, the custom mask layer must be passed to the custom_objects argument.



Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 00011: ReduceLROnPlateau reducing learning rate to 0.00020000000949949026.
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 00032: ReduceLROnPlateau reducing learning rate to 4.0000001899898055e-05.
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 00040: ReduceLROnPlateau reducing learning rate to 8.000000525498762e-06.
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 00044: ReduceLROnPlateau reducing learning rate to 1.6000001778593287e-06.
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 00048: ReduceLROnPlateau reducing learning rate to 3.200000264769187e-07.
Epoch 49/50
Epoch 50/50


# Evaluation

In [None]:
# Plot the validation and training data separately
def plot_learning_curves(*histories):
    """
    Returns separate loss curves for training and validation metrics.
    """ 
    fig = make_subplots(rows=3, cols=1, subplot_titles=('Loss', 'Recall', 'AUC'))
    for history in histories:
        loss = history[1].history.get('loss')
        val_loss = history[1].history.get('val_loss')

        auc = history[1].history.get('auc')
        val_auc = history[1].history.get('val_auc')

        recall = history[1].history.get('recall')
        val_recall = history[1].history.get('val_recall')

        epochs = np.arange(1, len(history[1].history.get('loss')) + 1)

        fig.add_trace(go.Scatter(x=epochs, y=loss, name=history[0] + " train"), row=1, col=1)
        fig.add_trace(go.Scatter(x=epochs, y=val_loss, name=history[0] + " val"), row=1, col=1)
        fig.add_trace(go.Scatter(x=epochs, y=recall, name=history[0] + " train"), row=2, col=1)
        fig.add_trace(go.Scatter(x=epochs, y=val_recall, name=history[0] + " val"), row=2, col=1)
        fig.add_trace(go.Scatter(x=epochs, y=auc, name=history[0] + " train"), row=3, col=1)
        fig.add_trace(go.Scatter(x=epochs, y=val_auc, name=history[0] + "val"), row=3, col=1)

    # Update xaxis properties
    fig.update_xaxes(title_text="Epochs", row=1, col=1)
    fig.update_xaxes(title_text="Epochs", row=2, col=1)
    fig.update_xaxes(title_text="Epochs", row=3, col=1)

    # Update yaxis properties
    fig.update_yaxes(title_text="Loss", row=1, col=1)
    fig.update_yaxes(title_text="Recall", row=2, col=1)
    fig.update_yaxes(title_text="AUC", row=3, col=1)

    fig.update_layout(title_text="Learning Curves", height=1500)

    fig.show()

In [None]:
def plot_roc_curve(*datas):
    fig = go.Figure()
    for data in datas:
        name = data[0]
        y_true = data[1]
        pred = data[2]

        fpr, tpr, _ = roc_curve(y_true, pred)
        fig.add_trace(go.Scatter(x=fpr, y=tpr, name=name))

    fig.update_yaxes(title_text="TPR")
    fig.update_xaxes(title_text="FPR")
    fig.update_layout(title_text="ROC Curve")
    fig.show()

In [None]:
def evaluate_model(y_true, pred, threshold=0.5):
    pred_new = [1 if i > threshold else 0 for i in pred]

    # Print classification report
    print('===== Classification Report ======')
    print(classification_report(y_true, pred_new))
    print()

    # Print confusion matrix
    print('===== Confusion Matrix =====')
    print(confusion_matrix(y_true, pred_new))
    print()

    # Print other metrics
    print('===== Other Metrics =====')
    roc_auc = keras.metrics.AUC(curve='ROC')
    roc_auc.update_state(y_true, pred)
    print(f"ROC_AUC: {roc_auc.result().numpy()}")

In [None]:
history_efficientnet = np.load('history_efficientnet.npy',allow_pickle='TRUE').item()
history_baseline = np.load('history_baseline.npy',allow_pickle='TRUE').item()

model_baseline = load_model('model_baseline.h5', compile=False, custom_objects={'KerasLayer': hub.KerasLayer})
model_efficientnet = load_model('model_efficientnet.h5', compile=False, custom_objects={'KerasLayer': hub.KerasLayer})

In [None]:
plot_learning_curves(['ResnetV2-50', history_resnet])

In [None]:
plot_learning_curves(['EfficientNetV2-B1', history_efficientnet])

In [None]:
test_test_gen.reset()
pred = model.predict(test_test_gen)
test_test_gen.reset()
pred_eff = model_efficientnet.predict(test_test_gen)
test_test_gen.reset()
pred_res = model_resnet.predict(test_test_gen)

y_true = [0 if x == 'Negative' else 1 for x in test['TARGET']]

plot_roc_curve(['Baseline', y_true, pred],['EfficientNetV2-B1', y_true, pred_eff], ['ResNetV2-50', y_true, pred_res])

In [None]:
evaluate_model(y_true, pred_eff, threshold=0.35)

              precision    recall  f1-score   support

           0       0.97      0.97      0.97      2039
           1       0.90      0.91      0.91       723

    accuracy                           0.95      2762
   macro avg       0.94      0.94      0.94      2762
weighted avg       0.95      0.95      0.95      2762


===== Confusion Matrix =====
[[1968   71]
 [  62  661]]

===== Other Metrics =====
ROC_AUC: 0.985183835029602
