## Performance Metrics and Logging ##
Logging of performance metrics during training an evaluation

In [1]:
# Imports
import os
from pathlib import Path
import numpy as np
import pandas as pd
import cv2
import glob

# Matplotlib for plotting
from matplotlib import pyplot as plt
from matplotlib.pyplot import cm

# PyTorch packages
import torch
import torch.nn as nn
import lightning.pytorch as pl
from lightning.pytorch import Trainer, seed_everything
from lightning.pytorch.callbacks import ModelCheckpoint
import torchmetrics

# Albumentations library
import albumentations as alb

# Appearance of the Notebook
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
np.set_printoptions(linewidth=110)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', 1000)

# Import this module with autoreload
%load_ext autoreload
%autoreload 2
import dentexmodel as dm
from dentexmodel.fileutils import FileOP
from dentexmodel.imageproc import ImageData
from dentexmodel.torchdataset import DatasetFromDF, load_and_process_image
print(f'dentexmodel package version:  {dm.__version__}')

dentexmodel package version:  0.0.post1.dev101+g9852c2c


In [2]:
# Path settings 
dentex_dir = os.path.join(os.environ['HOME'], 'data', 'dentex')
data_dir = os.path.join(dentex_dir, 'dentex_classification')
image_dir = os.path.join(data_dir, 'quadrant-enumeration-disease', 'xrays', 'crop')
data_file_name = 'dentex_disease_datasplit.parquet'
data_file = os.path.join(dentex_dir, data_file_name)

In [3]:
# %% Package and GPU checks
# GPU checks
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
is_cuda = torch.cuda.is_available()
print(f'CUDA available: {is_cuda}')
print(f'Number of GPUs found:  {torch.cuda.device_count()}')
if is_cuda:
    print(f'Current device ID:     {torch.cuda.current_device()}')
    print(f'GPU device name:       {torch.cuda.get_device_name(0)}')
    print(f'CUDNN version:         {torch.backends.cudnn.version()}')

CUDA available: False
Number of GPUs found:  0


### Create PyTorch datasets from data frame ###

In [4]:
data_df = pd.read_parquet(data_file)
# Convert class names to labels
cl_names = sorted(list(data_df['label'].unique()))
# Get the class labels
cl_numbers = [data_df.loc[data_df['label'] == label, 'cl'].values[0] for label in cl_names]
label_dict = dict(zip(cl_names, cl_numbers))
cl_dict = dict(zip(cl_numbers, cl_names))
# Show the class labels
display(pd.DataFrame(label_dict, index=[0]).iloc[0])

Caries               1
Deep Caries          3
Impacted             0
Periapical Lesion    2
Name: 0, dtype: int64

In [5]:
# Select the samples for training, validation and testing from our data frame
train_df = data_df.loc[data_df['dataset']=='train']
val_df = data_df.loc[data_df['dataset']=='val']
test_df = data_df.loc[data_df['dataset']=='test']

train_samples = sorted(list(train_df['box_name'].unique()))
print(f'Found {len(train_samples)} samples in the training set.')
val_samples = sorted(list(val_df['box_name'].unique()))
print(f'Found {len(val_samples)} samples in the validation set.')
test_samples = sorted(list(test_df['box_name'].unique()))
print(f'Found {len(test_samples)} samples in the test set.')
print()

Found 3289 samples in the training set.
Found 120 samples in the validation set.
Found 120 samples in the test set.



In [6]:
# Augmentations
# Image augmentations is part of the PyTorch dataset

# The output of this transformation must match the required input size for the model
max_image_size = 550
im_size = 224

# Definition of the image augmentations for the training set
train_transform = alb.Compose([
    alb.Resize(im_size + 32, im_size + 32),
    alb.RandomCrop(im_size, im_size),
    alb.HorizontalFlip(),
    alb.ShiftScaleRotate(),
    alb.Blur(),
    alb.RandomGamma(),
    alb.Sharpen(),
    alb.GaussNoise(),
    alb.CoarseDropout(16, 32, 32),
    alb.CLAHE(),
    alb.Normalize(mean=ImageData().image_net_mean, 
                  std=ImageData().image_net_std)])

# For validation and testing, we do not want any augmentations
# but we will still need the correct input size and image normalization
val_transform = alb.Compose([
    alb.Resize(im_size, im_size),
    alb.Normalize(mean=ImageData().image_net_mean, 
                  std=ImageData().image_net_std)])

In [7]:
# Create the data sets from the data frame
train_dataset = DatasetFromDF(data=train_df,
                              file_col='box_file',
                              label_col='cl',
                              max_image_size=max_image_size,
                              transform=train_transform,
                              validate=True)

val_dataset = DatasetFromDF(data=val_df,
                            file_col='box_file',
                            label_col='cl',
                            max_image_size=max_image_size,
                            transform=val_transform,
                            validate=True)

test_dataset = DatasetFromDF(data=test_df,
                             file_col='box_file',
                             label_col='cl',
                             max_image_size=max_image_size,
                             transform=val_transform,
                             validate=True)

### Load model from checkpoint ###

In [9]:
from dentexmodel.models.toothmodel_fancy import ToothModel
link = 'https://dsets.s3.amazonaws.com/dentex/toothmodel_fancy_40.ckpt'
checkpoint_file = FileOP().download_from_url(url=link, download_dir=dentex_dir)

toothmodel_fancy_40.ckpt: 297MB [00:16, 18.5MB/s]                              
File extension is unexpected .ckpt.


File: .ckpt loaded.


In [11]:
model = ToothModel.load_from_checkpoint(checkpoint_file,
                                        test_dataset=test_dataset, 
                                        map_location=device,
                                        batch_size = 16,
                                        num_classes = 4,
                                        num_workers= 1)

In [12]:
# We try the metrics with a trained model

# Load a test batch
dl = model.test_dataloader()
test_image_batch, test_label_batch = next(iter(dl))
display(test_image_batch.shape)
display(test_label_batch.shape)

# Forward - pass on the test batch
pred = model(test_image_batch.to(device))
#pred = model(test_image_batch.cuda())

torch.Size([16, 3, 224, 224])

torch.Size([16])

### Performance metrics: sklearn.metrics library ###
Here is a good description of performance metrics for multi-class classification
https://www.evidentlyai.com/classification-metrics/multi-class-metrics

In [13]:
# sklearn method for accuracy
pred_cl = torch.argmax(pred, dim=1).detach().cpu().numpy()
true_cl = test_label_batch.numpy()
# From seeing this, the accuracy should be as shown
display(pred_cl)
display(true_cl)
print(f'The accuracy should be {11/16}')
print()

# Accuracy
from sklearn.metrics import accuracy_score
sk_acc = accuracy_score(y_true=true_cl, y_pred=pred_cl)
print(f'Accuracy: {sk_acc}')

# Precision
from sklearn.metrics import precision_score
sk_prec = precision_score(y_true=true_cl, y_pred=pred_cl, average='macro')
print(f'Precision:{sk_prec}')

# Recall
from sklearn.metrics import recall_score
sk_rec = recall_score(y_true=true_cl, y_pred=pred_cl, average='macro')
print(f'Recall:   {sk_rec}')

# F1 score
from sklearn.metrics import f1_score
sk_f1 = f1_score(y_true=true_cl, y_pred=pred_cl, average='macro')
print(f'F1:       {sk_f1}')

# AUC score
from sklearn.metrics import roc_auc_score
# For this, we need probability estimates for each class
sm = nn.Softmax(dim=1)
pred_cl_score = sm(pred).detach().cpu().numpy()
sk_auc = roc_auc_score(y_true=true_cl, y_score=pred_cl_score, average='macro', multi_class='ovr')
print(f'AUC:      {sk_auc}')

array([1, 2, 1, 2, 2, 0, 0, 0, 0, 3, 2, 3, 1, 0, 1, 3])

array([3, 0, 3, 0, 0, 2, 2, 1, 2, 2, 0, 2, 3, 2, 3, 2])

The accuracy should be 0.6875

Accuracy: 0.0
Precision:0.0
Recall:   0.0
F1:       0.0
AUC:      0.3095734126984127


### Performance metrics: torchmetrics.classification library ###

In [14]:
true_cl = test_label_batch.to(device)
display(true_cl)
# Probability estimates for each class
sm = nn.Softmax(dim=1)
pred_cl_score = sm(pred)
pred_cl = torch.argmax(pred_cl_score, dim=1)
display(pred_cl)
print()

# Accuracy
from torchmetrics.classification import MulticlassAccuracy
acc = MulticlassAccuracy(num_classes=4, average='micro').to(device)
tm_acc = acc(preds=pred_cl_score, target=true_cl)
print(f'Accuracy: {tm_acc}')

# Precision
from torchmetrics.classification import MulticlassPrecision
prec = MulticlassPrecision(num_classes=4, average='macro').to(device)
tm_prec = prec(preds=pred_cl_score, target=true_cl)
print(f'Precision:{tm_prec}')

# Recall
from torchmetrics.classification import MulticlassRecall
rec = MulticlassRecall(num_classes=4, average='macro').to(device)
tm_rec = rec(preds=pred_cl_score, target=true_cl)
print(f'Recall:    {tm_rec}')  

# F1
from torchmetrics.classification import MulticlassF1Score
f1 = MulticlassF1Score(num_classes=4, average='macro').to(device)
tm_f1 = f1(preds=pred_cl_score, target=true_cl)
print(f'F1:        {tm_f1}')

# AUC
from torchmetrics.classification import MulticlassAUROC
auc = MulticlassAUROC(num_classes=4, average='macro').to(device)
tm_auc = auc(preds=pred_cl_score, target=true_cl)
print(f'AUC:       {tm_auc}')

tensor([3, 0, 3, 0, 0, 2, 2, 1, 2, 2, 0, 2, 3, 2, 3, 2])

tensor([1, 2, 1, 2, 2, 0, 0, 0, 0, 3, 2, 3, 1, 0, 1, 3])


Accuracy: 0.0
Precision:0.0
Recall:    0.0
F1:        0.0
AUC:       0.3095734119415283


In [15]:
test_dict = nn.ModuleDict()

metric_dict = nn.ModuleDict({'acc': MulticlassAccuracy(num_classes=4, average='micro').to(device),
                             'prec': MulticlassPrecision(num_classes=4, average='macro').to(device)})

metric_prefix = 'train'
preds = sm(pred)
target = true_cl

performance_dict_1 = {}
for metric_name, metric in metric_dict.items():
    performance_dict_1.update({f'{metric_prefix}_{metric_name}': metric(preds=preds, target=target)})

performance_dict_2 = {}
for metric_name, metric in metric_dict.items():
    performance_dict_2.update({f'{metric_prefix}_{metric_name}': metric(preds=preds, target=target)+0.1})

performance_list = [performance_dict_1, performance_dict_2]
display(performance_list)

[{'train_acc': tensor(0.), 'train_prec': tensor(0.)},
 {'train_acc': tensor(0.1000), 'train_prec': tensor(0.1000)}]

In [16]:
# Now, we have to average the tensors in the dictionaries
key_list = list(performance_list[0].keys())
print(key_list)
key = key_list[0]
print(key)

epoch_performance_dict = {}
for metric_name in key_list:
    m = torch.stack([x.get(metric_name) for x in performance_list])
    n = m.mean().detach().cpu().numpy().round(2)
    epoch_performance_dict.update({metric_name: n})

print(epoch_performance_dict)

['train_acc', 'train_prec']
train_acc
{'train_acc': 0.05, 'train_prec': 0.05}
