# Loading Dataset:

In [None]:
# !git clone "https://github.com/muhammedtalo/COVID-19.git"
# !mv "COVID-19/X-Ray Image DataSet" "."

In [1]:
# Mount the user's Google Drive into the Colab VM at /content/drive so that the
# project folder referenced by the cells below becomes readable.
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# List the project folder on the mounted drive to check which sub-folders exist.
!ls '/content/drive/MyDrive/COVID Project - Summer 2022/Colab_Notebooks'

AlexNet  GoogleNet  Pneumonia	      ResNet50_dropout	VGG16_v2
covid	 normal     ResNet18_dropout  VGG16


In [2]:
# Folder on the mounted drive that is used below as an additional source of
# images for the 'covid' class.
our_covid_dataset = '/content/drive/MyDrive/COVID Project - Summer 2022/Colab_Notebooks/covid'

Kaggle dataset (COVID-19 Radiography Dataset):
The dataset archive can be downloaded from: `https://drive.google.com/file/d/1bum9Sehb3AzUMHLhBMuowPKyr_PCrB3a/view?usp=sharing`

In [None]:
# Install gdown and use it to download the archive by its Google Drive file id
# (the id that appears in the share link).
!pip install gdown
!gdown 1bum9Sehb3AzUMHLhBMuowPKyr_PCrB3a

In [None]:
# Extract COVID-19_Radiography_Dataset.zip into the current working directory.
!unzip COVID-19_Radiography_Dataset.zip

In [49]:
from pathlib import Path

# Root folder of the extracted archive, relative to the working directory.
kaggle_dataset = Path('COVID-19_Radiography_Dataset')

In [53]:
# Maps each class name to the list of folders whose contents belong to that class.
# Each folder is copied into dataset/<class name> by the next cell.
# Add other sources of images here:
data = {'covid': [our_covid_dataset, kaggle_dataset / "COVID/images"],
        'normal': [kaggle_dataset / "Normal/images"],
        'pneumonia': [kaggle_dataset / "Viral Pneumonia/images"], }

In [70]:
# Create the gathered dataset: one sub-folder per class under DATASET_ROOT,
# filled with the contents of every source folder listed in `data`.
import os
import shutil

DATASET_ROOT = Path('dataset')

# Start from a clean folder so the cell can be re-run. Unlike `!rm -r dataset`,
# this does not report an error when the folder does not exist yet.
shutil.rmtree(DATASET_ROOT, ignore_errors=True)
os.mkdir(DATASET_ROOT)

for image_class, image_sources in data.items():
    class_path = DATASET_ROOT / image_class
    print(str(class_path))
    os.mkdir(class_path)
    # A class may be given as a single folder instead of a list of folders.
    if not isinstance(image_sources, list):
        image_sources = [image_sources]
    for source in image_sources:
        # shutil.copytree replaces distutils' copy_tree (distutils was removed
        # from the standard library in Python 3.12). dirs_exist_ok=True lets
        # several sources be merged into the same class folder; files sharing a
        # name are overwritten by the later source, as copy_tree did.
        shutil.copytree(source, class_path, dirs_exist_ok=True)

dataset/covid
dataset/normal
dataset/pneumonia


Downloading the pretrained CheXNet checkpoint:

In [None]:
# Clone the CheXNet repository and move its checkpoint file (model.pth.tar)
# into the working directory, where PRETRAINED_MODEL points to it.
!git clone "https://github.com/arnoweng/CheXNet.git"
!mv "./CheXNet/model.pth.tar" "."

In [None]:
# DATASET_ROOT = 'dataset'
# Path of the checkpoint file that load_weights() reads.
PRETRAINED_MODEL = 'model.pth.tar'

# Importing from libraries:

In [None]:
import os
from pathlib import Path
from PIL import Image
import matplotlib.pyplot as plt
import re

import numpy as np
import pandas as pd
import time

import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import random_split
from torch.utils.data import Dataset, DataLoader
import torchvision
from torchvision import transforms as T
from torchvision.transforms import functional as TF
from torchvision.transforms.functional import InterpolationMode
from torchvision.datasets import VOCSegmentation
from torchvision import models

Auto-reloading for imports from my code:

In [None]:
# Enable the autoreload extension so that edited local modules are re-imported
# before each cell runs. There must be no space between `%` and the magic name:
# current IPython parses `% load_ext` as an (unknown) magic called `%`.
%load_ext autoreload
%autoreload 2

Setting seed:

In [None]:
# Fix the global PyTorch and NumPy random seeds.
torch.manual_seed(24)
np.random.seed(24)

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Model:

In [None]:
class DenseNet121(nn.Module):
    """DenseNet-121 backbone with a linear head producing ``num_classes`` outputs.

    When ``weights`` is true, the backbone first receives a 14-output linear
    head wrapped in ``nn.Sequential`` (presumably the classifier layout stored
    in the checkpoint), ``load_weights`` is called on the module, and only then
    is the head replaced by a new ``nn.Linear`` with ``num_classes`` outputs.
    When ``weights`` is false, nothing is loaded.
    """

    def __init__(self, num_classes, weights=True):
        super().__init__()
        backbone = torchvision.models.densenet121()
        head_inputs = backbone.classifier.in_features
        self.densenet121 = backbone
        if weights:
            # Temporary 14-way head used only while the checkpoint is loaded.
            backbone.classifier = nn.Sequential(nn.Linear(head_inputs, 14))
            load_weights(self)
        # New head for the target task; it is created after any checkpoint
        # loading, so it never receives checkpoint weights.
        backbone.classifier = nn.Linear(head_inputs, num_classes)

    def forward(self, x):
        """Return the backbone's output for the input batch ``x``."""
        return self.densenet121(x)


def load_weights(model, device='cpu'):
    """Load the checkpoint stored at ``PRETRAINED_MODEL`` into ``model``.

    Code modified from the torchvision densenet source for loading pre-0.4
    densenet weights. Keys of the form ``...denselayerN.norm.1.weight`` are
    renamed to ``...denselayerN.norm1.weight`` (same for ``relu``/``conv`` and
    index ``2``), and a leading ``module.`` prefix is removed.

    Args:
        model: object whose ``load_state_dict`` receives the converted weights.
        device: device name given to ``torch.load`` as ``map_location``.

    Raises:
        Any error raised by ``torch.load`` or ``model.load_state_dict`` (for
        example a missing file, a missing ``'state_dict'`` entry or
        mismatching keys).
    """
    checkpoint = torch.load(PRETRAINED_MODEL, map_location=torch.device(device))
    state_dict = checkpoint['state_dict']
    remove_data_parallel = True  # Change if you don't want to use nn.DataParallel(model)
    data_parallel_prefix = 'module.'

    pattern = re.compile(
        r'^(.*denselayer\d+\.(?:norm|relu|conv))\.((?:[12])\.(?:weight|bias|running_mean|running_var))$')
    # Iterate over a snapshot of the keys because the dict is edited in place.
    for key in list(state_dict.keys()):
        match = pattern.match(key)
        new_key = match.group(1) + match.group(2) if match else key
        # Strip the prefix only when it is really there: cutting the first
        # seven characters unconditionally corrupts keys that do not start
        # with 'module.'.
        if remove_data_parallel and new_key.startswith(data_parallel_prefix):
            new_key = new_key[len(data_parallel_prefix):]
        # Move the entry only if its name changed.
        if new_key != key:
            state_dict[new_key] = state_dict.pop(key)

    model.load_state_dict(state_dict)

Instantiating the model with the pretrained weights:

In [None]:
# Build the 3-class network (the checkpoint is loaded because `weights`
# defaults to True), move it to `device`, and display its architecture.
model = DenseNet121(3).to(device)
model

# Dataset:

Dataset for loading images:

In [None]:
class CovidDataset(Dataset):
    """Image-folder dataset in which every sub-folder of ``root`` is one class.

    Args:
        root: folder (``str`` or ``Path``) containing one sub-folder per class.
        transform: optional callable applied to each loaded RGB PIL image.
        shuffle: shuffle the rows once, at construction time.
        balanced: down-sample every class to the size of the smallest class.

    Indexing returns ``(image, label)`` where ``label`` is the integer position
    of the class name in ``label_names``.
    """

    def __init__(self, root, transform=None, shuffle=True, balanced=True):
        if not isinstance(root, Path):
            root = Path(root)
        self.root = root
        self.transform = transform

        # One record per file. Names are sorted so the row order does not
        # depend on the file system, and stray files directly under `root` are
        # skipped because only folders define classes.
        records = []
        for label_name in sorted(os.listdir(self.root)):
            class_dir = os.path.join(self.root, label_name)
            if not os.path.isdir(class_dir):
                continue
            for image_name in sorted(os.listdir(class_dir)):
                records.append({'image': image_name, 'label': label_name})
        self.dataframe = pd.DataFrame(records, columns=['image', 'label'])

        # Replace class names by integer codes; label_names[code] is the name.
        self.label_names, labels = np.unique(self.dataframe['label'], return_inverse=True)
        self.dataframe['label'] = labels
        # Class sizes before any down-sampling.
        self.original_label_counts = self.dataframe['label'].value_counts()

        if balanced and not self.dataframe.empty:
            # Group by the 'label' column (there is no 'class' column) and draw
            # the same number of rows from every class. The index is reset so
            # that rows stay addressable by position when shuffle is False.
            smallest = int(self.original_label_counts.min())
            self.dataframe = (self.dataframe
                              .groupby('label')
                              .sample(n=smallest)
                              .reset_index(drop=True))
        if shuffle:
            self.dataframe = self.dataframe.sample(frac=1).reset_index(drop=True)

        # Class sizes after down-sampling, and inverse-frequency weights
        # normalised to sum to one.
        self.label_counts = self.dataframe['label'].value_counts()
        self.label_weights = len(self.dataframe) / self.label_counts
        self.label_weights = self.label_weights / self.label_weights.sum()

    def __len__(self):
        """Return the number of images in the (possibly down-sampled) dataset."""
        return len(self.dataframe)

    def __getitem__(self, index):
        """Return ``(image, label)`` for the row at position ``index``."""
        item = self.dataframe.iloc[index]
        image_name, label = item['image'], item['label']
        label_name = self.label_names[label]
        image_path = self.root / label_name / image_name
        # Close the file as soon as the pixel data has been converted.
        with Image.open(image_path) as handle:
            image = handle.convert('RGB')
        if self.transform:
            image = self.transform(image)
        return image, label

In [None]:
# Per-channel mean / std used to normalise the image tensors
# (presumably the usual ImageNet statistics - confirm).
normalize = T.Normalize([0.485, 0.456, 0.406],
                        [0.229, 0.224, 0.225])

# Alternative ten-crop pipeline, kept for reference:
# dataset = CovidDataset(DATASET_ROOT, transform=T.Compose([
#                                     T.Resize(256),
#                                     T.TenCrop(224),
#                                     T.Lambda
#                                     (lambda crops: torch.stack([T.ToTensor()(crop) for crop in crops])),
#                                     T.Lambda
#                                     (lambda crops: torch.stack([normalize(crop) for crop in crops]))
#                                 ]))
# Pipeline in use: resize to 256x256, convert to a tensor, then normalise.
dataset = CovidDataset(DATASET_ROOT,
                       transform=T.Compose([
                           T.Resize((256, 256)),
                           T.ToTensor(),
                           normalize
                       ]))

In [None]:
# Normalised per-class weights computed by the dataset.
dataset.label_weights

In [None]:
# Shape of the first transformed image.
dataset[0][0].shape

In [None]:
# Sanity check: run the first image through the model as a batch of one.
model(dataset[0][0].to(device).unsqueeze(0))

Dataloaders:

In [None]:
# splitting train and test sets
# 80% of the whole dataset goes to training, the remainder to the test set.
voc_len = len(dataset)
train_len = int(0.8 * voc_len)
test_len = voc_len - train_len
train_set, test_set = random_split(dataset, [train_len, test_len])

In [None]:
# splitting train and val sets
# 80% of the training portion stays in train_set, the remainder becomes val_set.
train_len = int(0.8 * len(train_set))
val_len = len(train_set) - train_len
train_set, val_set = random_split(train_set, [train_len, val_len])

In [None]:
# Batches of 64 samples; all three loaders are created with shuffling enabled.
train_loader = DataLoader(train_set, 64, shuffle=True)
val_loader = DataLoader(val_set, 64, shuffle=True)
test_loader = DataLoader(test_set, 64, shuffle=True)

# Training:

Training functions:

In [None]:
import time, tqdm


def train(model, train_loader, criterion, optimizer, epoch):
    """Run one training epoch and return the mean loss per sample.

    Args:
        model: network to optimise; it is switched to training mode.
        train_loader: loader yielding ``(inputs, targets)`` batches.
        criterion: loss called as ``criterion(outputs, targets)``.
        optimizer: optimiser stepped once per batch.
        epoch: epoch number, used only in the progress-bar text.

    Returns:
        The sum of ``batch loss * batch size`` divided by
        ``len(train_loader.dataset)``.
    """
    model.train()
    n_samples = len(train_loader.dataset)
    running_loss = 0

    with tqdm.tqdm(enumerate(train_loader), total=len(train_loader)) as progress:
        for _, (inputs, targets) in progress:
            optimizer.zero_grad()

            # Batches are moved to the module-level `device`.
            inputs = inputs.to(device)
            targets = targets.to(device)
            batch_loss = criterion(model(inputs), targets)
            running_loss += batch_loss.item() * len(inputs)

            progress.set_description(f'Epoch:{epoch}, Train Loss: {running_loss / n_samples:.3e}')

            batch_loss.backward()
            optimizer.step()

    return running_loss / n_samples


def validate(model, val_loader, criterion, epoch=0, metrics=None):
    """Evaluate ``model`` on ``val_loader`` without computing gradients.

    Args:
        model: network to evaluate; it is switched to evaluation mode.
        val_loader: loader yielding ``(inputs, targets)`` batches.
        criterion: loss called as ``criterion(outputs, targets)``.
        epoch: epoch number, used only in the progress-bar text.
        metrics: optional mapping ``name -> f(y_true, y_pred)``; each function
            receives the concatenated targets and argmax predictions.

    Returns:
        A dict with the mean per-sample loss under ``'loss'`` plus one entry
        per metric.
    """
    model.eval()
    n_samples = len(val_loader.dataset)
    running_loss = 0
    seen_targets, seen_predictions = [], []

    with torch.no_grad(), tqdm.tqdm(enumerate(val_loader), total=len(val_loader)) as progress:
        for _, (inputs, targets) in progress:
            # Batches are moved to the module-level `device`.
            inputs = inputs.to(device)
            targets = targets.to(device)
            outputs = model(inputs)
            predicted = outputs.argmax(dim=-1)
            running_loss += criterion(outputs, targets).item() * len(inputs)

            progress.set_description(f'Epoch:{epoch}, Val Loss: {running_loss / n_samples:.3e}')
            seen_targets.append(targets.cpu().numpy())
            seen_predictions.append(predicted.cpu().numpy())

    y_true = np.concatenate(seen_targets)
    y_hat = np.concatenate(seen_predictions)

    result = {'loss': running_loss / n_samples}
    if metrics is not None:
        computed = {}
        for name, metric_func in metrics.items():
            computed[name] = metric_func(y_true, y_hat)
        result.update(computed)
    return result

In [None]:
# Module-level loss histories; train_model() appends one value per epoch.
train_losses, val_losses = list(), list()

In [None]:
def train_model(model, criterion, dataloaders, optimizer, num_epochs, model_name='pytorch_model', validation_metrics=None):
    """Train ``model`` for ``num_epochs`` epochs and plot the loss curves.

    Args:
        model: network to train.
        criterion: loss passed on to ``train`` and ``validate``.
        dataloaders: ``[train_loader]`` or ``[train_loader, val_loader]``;
            validation runs only when two loaders are given.
        optimizer: optimiser passed on to ``train``.
        num_epochs: number of epochs to run.
        model_name: the best checkpoint is written to ``f'{model_name}.pt'``.
        validation_metrics: optional mapping ``name -> f(y_true, y_pred)``
            evaluated by ``validate`` after every epoch.

    Returns:
        ``model``. With validation, the weights of the epoch with the lowest
        validation loss are loaded back into it first.

    Side effects:
        Appends to the module-level ``train_losses`` / ``val_losses`` lists,
        writes the checkpoint file and shows matplotlib figures.
    """
    val = len(dataloaders) == 2
    if val:
        train_loader, val_loader = dataloaders
    else:
        train_loader, = dataloaders

    if validation_metrics is None:
        validation_metrics = dict()

    metrics_history = {metric: [] for metric in validation_metrics}
    val_loss_min = np.inf
    checkpoint_path = f'{model_name}.pt'
    checkpoint_saved = False

    for epoch in range(num_epochs):
        train_loss = train(model, train_loader, criterion, optimizer, epoch)
        train_losses.append(train_loss)
        if val:
            result = validate(model, val_loader, criterion, epoch, metrics=validation_metrics)
            val_loss = result['loss']
            val_losses.append(val_loss)
            # .items() is required: iterating over the dict itself yields only
            # the metric names, which cannot be unpacked into (name, history).
            for metric, metric_history in metrics_history.items():
                metric_history.append(result[metric])

            if val_loss <= val_loss_min:
                torch.save(model.state_dict(), checkpoint_path)
                val_loss_min = val_loss
                checkpoint_saved = True
        print('\n', '---' * 20)
    plt.plot(train_losses, label='train')
    if val:
        # Load the best model seen across the epochs. This is skipped when this
        # call wrote no checkpoint (e.g. num_epochs == 0), so that a missing or
        # stale file from an earlier run is never read.
        if checkpoint_saved:
            model.load_state_dict(torch.load(checkpoint_path))
        plt.plot(val_losses, label='val')
        if len(metrics_history):
            # Finish the loss figure first, then draw the metrics on a new one.
            plt.legend()
            plt.show()
            for metric, metric_history in metrics_history.items():
                plt.plot(metric_history, label=metric)
    plt.legend()
    plt.show()

    return model

Freezing pretrained layers:

In [None]:
# Disable gradients for every parameter, then re-enable them for the
# classifier head so that only the head is updated during training.
model = model.to(device)
for param in model.parameters():
    param.requires_grad = False
for param in model.densenet121.classifier.parameters():
    param.requires_grad = True

Learning Config:

In [None]:
# Adam receives only the parameters that still require gradients. The loss is
# weighted per class with the dataset's label weights, ordered by label index.
lr = 1e-4
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=lr)
criterion = nn.CrossEntropyLoss(weight=torch.tensor(dataset.label_weights.sort_index().tolist())).to(device)

Sanity check: overfitting on a small subset of the training set:

In [None]:
# Randomly pick 10% of the training set as a small subset and wrap it in its
# own loader.
small_len = int(0.1 * len(train_set))
print(small_len)
other_len = len(train_set) - small_len
_, small_set = random_split(train_set, [other_len, small_len])
small_loader = DataLoader(small_set, 64, shuffle=True)

In [None]:
# Train on the small subset only (a single loader, so no validation) for 30 epochs.
lr = 1e-5
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=lr)
train_model(model, criterion, [small_loader], optimizer, 30);

In [None]:
# Loss and accuracy on the same small subset the model was just trained on.
validate(model, small_loader, criterion, metrics={'accuracy': lambda y1, y2: (y1 == y2).mean()})

Tuning hyper-parameters:

In [None]:
# todo!

Training model:

In [None]:
# reinitialize model and losses
# reinitialize model and losses
# NOTE(review): this creates a new model object, so the requires_grad freezing
# applied to the previous model does not carry over - confirm that training
# all layers in the next cell is intended.
train_losses, val_losses = list(), list()
model = DenseNet121(3).to(device)

In [None]:
# Train with validation for 30 epochs; train_model() writes the best checkpoint
# to 'covid-classification.pt' and returns the model with those weights loaded.
lr = 1e-6
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=lr)
model = train_model(model, criterion, [train_loader, val_loader], optimizer, 30, 'covid-classification');

# Evaluation:

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Metric functions in the name -> f(y_true, y_pred) form expected by validate().
# Precision, recall and F1 are averaged over classes with `average_policy`.
average_policy = 'macro'
metrics = {'accuracy': accuracy_score, 'precision': lambda y1, y2: precision_score(y1, y2, average=average_policy),
           'recall': lambda y1, y2: recall_score(y1, y2, average=average_policy),
           'f1': lambda y1, y2: f1_score(y1, y2, average=average_policy)}

In [None]:
# Evaluate the trained model on the held-out test set with the metrics above.
validate(model, test_loader, criterion, metrics=metrics)

In [None]:
from google.colab import files

# Download the checkpoint written during training from the Colab VM.
files.download('covid-classification.pt')