## Install package in TWCC (tensorflow-20.02-tf1-py3:latest)

In [None]:
!sudo -i pip install --upgrade pip
!sudo -i pip install fastai
!sudo -i python -m pip install --upgrade nbformat
!sudo pip show fastai
!python -V

<a href="https://colab.research.google.com/github/ajsanjoaquin/COVID-19-Scanner/blob/master/NCOV19_X_ray_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Coronavirus 2019 (COVID-19) Classifier using Posteroanterior (PA) Views of Chest Radiograph (CXR) Images
Accompanying information [here](https://towardsdatascience.com/using-deep-learning-to-detect-ncov-19-from-x-ray-images-1a89701d1acd).

NOTICE: This notebook is provided as-is with no guarantee of accurate diagnosis. The model was trained on heavily skewed data and is not suitable for deployment. It is currently meant only as a proof of concept. All images used were publicly accessible and usable at the time of training.

In [None]:
# Fetch the public chest X-ray dataset; later cells read its metadata.csv and its images/ folder.
!git clone https://github.com/ieee8023/covid-chestxray-dataset.git

In [None]:
import pandas as pd
import numpy as np
import os, shutil
from fastai.vision import *
from fastai.widgets import ClassConfusion

# Preprocessing
## Extracting Images

In [None]:
# Load the dataset's metadata table and pick out the image filenames for the
# two training classes: PA-view COVID-19 cases and PA-view everything else.
metadata_path='covid-chestxray-dataset/metadata.csv'
df=pd.read_csv(metadata_path)

#types we're interested in (boolean row masks over the metadata table)
# Substring match rather than ==: a finding that merely contains 'COVID-19'
# (presumably compound labels such as 'COVID-19, ARDS' -- verify against the
# cloned metadata) would otherwise be filed under "other" by the complement
# mask below. na=False keeps the mask strictly boolean when `finding` is
# missing, which the `~` negation relies on.
covid_patients=df['finding'].str.contains('COVID-19', regex=False, na=False)
CT=df['view']=='CT'
PA=df['view']=='PA'

# Report how many COVID-19 rows exist per view. These were bare expressions in
# the middle of the cell, so their values were computed and thrown away.
print("COVID-19 CT (rows, cols):", df[covid_patients & CT].shape)
print("COVID-19 PA (rows, cols):", df[covid_patients & PA].shape)

PA_covid=df[covid_patients & PA]
Others=df[~covid_patients & PA]
# Plain Python lists of filenames; the copy cell matches files on disk against them.
covid_files=PA_covid['filename'].tolist()
other_files=Others['filename'].tolist()

In [None]:
# Hold-out images live in a local `test` folder that is uploaded by hand.
# Keep their names in sorted order, both as a list and as a one-column table.
test_files=sorted(os.listdir('test'))
df_test=pd.DataFrame({'filename': test_files})

In [None]:
#create data folder and positive & negative cases folder, and test folder
destpath = 'data/covid','data/other', 'data/test'
srcpath = 'covid-chestxray-dataset/images'

# Create the destination folders once, before walking. exist_ok makes the cell
# re-runnable, and the folders exist even if srcpath is missing, so the
# file-count cell that follows does not fail on a missing directory.
for folder in destpath:
    os.makedirs(folder, exist_ok=True)

# Sets give constant-time membership tests instead of scanning a list per file.
covid_names, other_names, test_names = set(covid_files), set(other_files), set(test_files)

# Copy every dataset image whose filename appears in one of the three lists.
# NOTE(review): a file named in test_files that is also in covid_files or
# other_files lands in both a training folder and the test folder -- confirm
# the hold-out images are disjoint from the training images.
for root, dirs, files in os.walk(srcpath):
    for file in files:
        source_file = os.path.join(root, file)
        if file in covid_names:
            shutil.copy(source_file, destpath[0])
        if file in other_names:
            shutil.copy(source_file, destpath[1])
        if file in test_names:
            shutil.copy(source_file, destpath[2])

# test_files was listed from the hand-uploaded `test` folder, and the final
# prediction cell opens data/test/<name> for every one of those names. Images
# that have no namesake inside the dataset were never copied by the walk
# above, so take them straight from the upload folder.
for name in test_files:
    uploaded = os.path.join('test', name)
    if os.path.isfile(uploaded) and not os.path.exists(os.path.join(destpath[2], name)):
        shutil.copy(uploaded, destpath[2])


In [None]:
# Count the files sitting directly inside each prepared folder.
# Only the first (top-level) entry of each walk is needed.
path, dirs, files2 = next(os.walk("data/other"))
path, dirs, files1 = next(os.walk("data/covid"))
path, dirs, files3 = next(os.walk("data/test"))
print(
    f"Number of images in Other: {len(files2)}",
    f"Number of images in Covid: {len(files1)}",
    f"Number of images in Test Set: {len(files3)}",
)

## Loading and Splitting Data
We first declare the labels to be used (corresponding with the folder names). We then wrap it around a dataloader from fastai. 

We allocate 25% of the data for validation (`valid_pct=0.25`), and we reserve a test set from a folder called "test". We resize all images to 512 x 512 pixels. 

In [None]:
# Label names; they match the data/covid and data/other folder names created above.
classes=['covid','other']

In [None]:
#include a test folder named test before running this block
#function assumes test set is located in the path (first arg) by default
# seed fixes the random train/validation split. Without it every kernel restart
# reshuffles the split, so a checkpoint reloaded in a new session would be
# validated on images it was trained on, and metrics from different runs would
# not be comparable.
data = ImageDataBunch.from_folder('data', train=".", valid_pct=0.25, seed=42, test='test',
        ds_tfms=get_transforms(), bs=8, size=512, num_workers=4).normalize(imagenet_stats)

In [None]:
# Display the class labels held by the data bunch.
data.classes

In [None]:
#show size of our datasets: training, validation and test item counts, in that order
print(len(data.train_ds),len(data.valid_ds),len(data.test_ds.x))

In [None]:
#sample of our images with labels (5 rows, drawn with figsize=(7,8))
data.show_batch(rows=5, figsize=(7,8))

# Training
We use a Resnet 50 for transfer learning.
Initially we run the fit-one-cycle policy for a few epochs, and then use fastai's learning rate finder (`lr_find`) to find an optimal range for our learning rate.

We use precision and recall to measure the incidence of false positives and false negatives, as well as AUC to account for performance given the skewed data.

In [None]:
# fastai's Precision/Recall default to average='binary' with pos_label=1, i.e.
# they score the class at index 1. Look up the index of the 'covid' label and
# pass it explicitly, so the reported precision and recall describe COVID-19
# detection rather than whichever class happens to sit at index 1.
covid_idx=data.classes.index('covid')
precision=Precision(pos_label=covid_idx)
recall=Recall(pos_label=covid_idx)
AUC=AUROC()

In [None]:
# Build a CNN learner on a ResNet-50 architecture over `data`, reporting
# accuracy, precision, recall and AUROC.
learn = cnn_learner(data, models.resnet50, metrics=(accuracy,precision,recall,AUC))

In [None]:
# First pass: one epoch with the one-cycle policy.
learn.fit_one_cycle(1)

At this stage, we realize the model is underfitting, so we continue to progressively increase the number of epochs from here on in an effort to reduce training loss while maintaining the low validation loss.

In [None]:
# Two more epochs, continuing from the weights left by the previous cell.
learn.fit_one_cycle(2)

In [None]:
# Run the learning-rate finder and plot what it recorded, to choose the
# learning-rate range used by the following training cells.
learn.lr_find()
learn.recorder.plot()

In [None]:
#@title Defining custom checkpoints
#Customizing where our checkpoints are saved and loaded
# makedirs(..., exist_ok=True) keeps this cell re-runnable: a plain os.mkdir
# raises FileExistsError as soon as the folder exists from an earlier run.
os.makedirs('check', exist_ok=True)
def custom_path_save(self, name:PathOrStr, path='check', return_path:bool=False, with_opt:bool=True):
    """Write the model state (plus optimizer state when `with_opt`) to `<path>/<name>.pth`.

    An empty `path` falls back to `self.path/self.model_dir/<name>.pth`.
    The `.pth` suffix is always appended, so `name` must not include it.
    The destination is returned only when `return_path` is true.
    """
    # Resolve the destination: the custom folder unless the caller passed ''.
    target = self.path/self.model_dir/f'{name}.pth' if path=='' else f'{path}/{name}.pth'
    weights = get_model(self.model).state_dict()
    # With the optimizer, the checkpoint is a dict with 'model' and 'opt' keys;
    # without it, the bare model state dict is stored.
    state = {'model': weights, 'opt':self.opt.state_dict()} if with_opt else weights
    torch.save(state, target)
    return target if return_path else None

def custom_path_load(self, name:PathOrStr, path='check', device:torch.device=None, strict:bool=True, with_opt:bool=None,purge=False):
    """Load model (and optionally optimizer) state from `<path>/<name>.pth`.

    Parameters
    - name: checkpoint name without the `.pth` suffix (it is appended here).
    - path: folder holding the checkpoint; '' falls back to
      `self.path/self.model_dir`.
    - device: target for `map_location`; defaults to `self.data.device`.
    - strict: forwarded to `load_state_dict` for the model weights.
    - with_opt: restore the optimizer state too; `None` is treated as True
      when the checkpoint contains one.
    - purge: accepted for signature compatibility; not used by this function.

    Returns `self` so calls can be chained.
    """
    if device is None: device = self.data.device
    if path=='': path = self.path/self.model_dir/f'{name}.pth'
    else: path = f'{path}/{name}.pth'
    # NOTE: torch.load unpickles the file; only load checkpoints you trust.
    state = torch.load(path, map_location=device)
    if set(state.keys()) == {'model', 'opt'}:
        get_model(self.model).load_state_dict(state['model'], strict=strict)
        if ifnone(with_opt,True):
            # The result used to be bound to an unused local `opt`; the code
            # below reads self.opt, so the call is made for its effect only.
            if not hasattr(self, 'opt'): self.create_opt(defaults.lr, self.wd)
            try:
                self.opt.load_state_dict(state['opt'])
            except Exception as err:
                # Still best-effort (the model weights are already loaded), but
                # report the failure instead of silently training on with a
                # fresh optimizer state.
                warn(f"Could not restore optimizer state from {path}: {err}")
    else:
        if with_opt: warn("Saved filed doesn't contain an optimizer state.")
        get_model(self.model).load_state_dict(state, strict=strict)
    return self

# Bind the two functions to this learner instance so that learn.save(...) and
# learn.load(...) read and write the custom folder by default.
learn.save = custom_path_save.__get__(learn)
learn.load = custom_path_load.__get__(learn)
# Same folder name as the default `path` argument of the functions above.
model_path ='check'

In [None]:
# Checkpoint taken before unfreezing; the custom save appends '.pth' to the name.
learn.save('Corona_model_stage1')

In [None]:
#learn.load('Corona_model_stage1')

In [None]:
# Unfreeze the learner before the longer training runs below.
learn.unfreeze()

In [None]:
# Stage 2: 10 epochs, with max_lr given as the range 9e-07 to 1e-06.
learn.fit_one_cycle(10, max_lr=slice(9e-07,1e-06))

In [None]:
# Checkpoint after stage 2 training.
learn.save('Corona_model_stage2')

In [None]:
#confusion matrix for the first 2 iterations
# Builds an interpretation object from the current learner and plots its confusion matrix.
interp = ClassificationInterpretation.from_learner(learn)
interp.plot_confusion_matrix()

In [None]:
# Open fastai's ClassConfusion widget on the interpretation above for the labels in `classes`.
ClassConfusion(interp, classes, is_ordered=False, figsize=(8,8))

In [None]:
# Re-run the learning-rate finder on the unfrozen model and plot the result.
learn.lr_find()
learn.recorder.plot()

In [None]:
# Stage 3: 30 epochs, with max_lr given as the range 6e-07 to 7e-06.
learn.fit_one_cycle(30, max_lr=slice(6e-07,7e-06))

In [None]:
# The custom save appends '.pth' itself, so the name is passed without the
# suffix; otherwise the file ends up as 'Corona_model_stage3.pth.pth', unlike
# the other stages.
learn.save('Corona_model_stage3')

In [None]:
# Learning-rate finder again, to choose the range for the next stage.
learn.lr_find()
learn.recorder.plot()

In [None]:
# Stage 4: 40 epochs, with max_lr given as the range 8e-06 to 1e-05.
learn.fit_one_cycle(40, max_lr=slice(8e-06,1e-05))

In [None]:
# The custom save appends '.pth' itself, so the name is passed without the
# suffix; otherwise the file ends up as 'Corona_model_stage4.pth.pth', unlike
# the other stages.
learn.save('Corona_model_stage4')

In [None]:
# Learning-rate finder before the final short training run.
learn.lr_find()
learn.recorder.plot()

In [None]:
# Stage 5: 5 epochs with a single max_lr value of 2e-06.
learn.fit_one_cycle(5, max_lr=2e-06)

In [None]:
# Final checkpoint after stage 5 training.
learn.save('Corona_model_stage5')

# Results on Validation Set and Predictions on Test Set

In [None]:
# Rebuild the interpretation from the fully trained learner and plot its confusion matrix.
interp = ClassificationInterpretation.from_learner(learn)
interp.plot_confusion_matrix()

In [None]:
#preds, _ = learn.get_preds(ds_type = DatasetType.Test, ordered=True)
# Get the model's predictions for the test dataset; the second return value is discarded.
preds, _ = learn.get_preds(ds_type = DatasetType.Test)

In [None]:
# Display the table of test filenames built from the `test` folder listing.
df_test

In [None]:
#WARNING: PREDICTIONS ARE NOT SORTED AND DO NOT MATCH THEIR ACTUAL CORRESPONDING IMAGES
# The export below is disabled by wrapping it in a string literal and is kept
# for reference only: it pairs `preds` with df_test['filename'] by position,
# which the warning above says does not line up.
'''
model_classes = learn.data.classes
preds = preds.tolist()
confidences = [{c: p for c,p*100 in zip(model_classes, probs)} for probs in preds]
final_df = pd.DataFrame({'ID_code': df_test['filename'], 'target': confidences})
final_df.to_csv('NCOV_test_results.csv', header=True, index=False)
'''



In [None]:
# Predict each test image on its own and key everything by filename, so a
# prediction can never be attached to the wrong image.
# The per-file results are written to NCOV_test_results.csv, one row per filename.
images=dict((name, open_image('data/test/'+name)) for name in test_files)
results=dict((name, learn.predict(img)) for name, img in images.items())
final_df=pd.DataFrame.from_dict(results,orient='index')
final_df.to_csv('NCOV_test_results.csv', header=True)
