## Advanced model with learning rate scheduler and performance metrics ##

In [1]:
# Imports
import os
from pathlib import Path
import numpy as np
import pandas as pd
import cv2
import glob

# Matplotlib for plotting
from matplotlib import pyplot as plt
from matplotlib.pyplot import cm

# PyTorch packages
import torch
import torch.nn as nn
from lightning.pytorch.loggers import TensorBoardLogger
import lightning.pytorch as pl
from lightning.pytorch import Trainer, seed_everything
from lightning.pytorch.callbacks import ModelCheckpoint, LearningRateFinder, LearningRateMonitor
import torchmetrics

# Albumentations library
import albumentations as alb

# Appearance of the Notebook
# Inject a small CSS rule so the notebook container uses the full page width
from IPython.display import display, HTML
full_width_css = "<style>.container { width:100% !important; }</style>"
display(HTML(full_width_css))
# Wider text output for numpy arrays
np.set_printoptions(linewidth=110)

# pandas display limits, applied from a single table of (option, value) pairs
for option_name, option_value in (('display.max_rows', 500),
                                  ('display.max_columns', 100),
                                  ('display.width', 1000)):
    pd.set_option(option_name, option_value)

# Import this module with autoreload
%load_ext autoreload
%autoreload 2
import computervision as dm
from computervision.fileutils import FileOP
from computervision.imageproc import ImageData
from computervision.models.toothmodel_fancy import ToothModel, FineTuneLearningRateFinder
from computervision.torchdataset import DatasetFromDF, load_and_process_image
print(f'dentexmodel package version:  {dm.__version__}')

dentexmodel package version:  0.0.post1.dev48+g56bd7cc.d20240808


In [2]:
# GPU checks: report the CUDA setup and pick the device for training/inference
is_cuda = torch.cuda.is_available()
print(f'CUDA available: {is_cuda}')
print(f'Number of GPUs found:  {torch.cuda.device_count()}')

# Use the first GPU when CUDA is available, otherwise fall back to the CPU
device_str = 'cuda:0' if is_cuda else 'cpu'

if is_cuda:
    # Details are only printed (and the CUDA cache only cleared) when a GPU exists
    print(f'Current device ID:     {torch.cuda.current_device()}')
    print(f'GPU device name:       {torch.cuda.get_device_name(0)}')
    print(f'CUDNN version:         {torch.backends.cudnn.version()}')
    torch.cuda.empty_cache()

device = torch.device(device_str)
print()
print(f'Device for model training/inference: {device}')

CUDA available: True
Number of GPUs found:  1
Current device ID:     0
GPU device name:       NVIDIA GeForce RTX 3070 Laptop GPU
CUDNN version:         90201

Device for model training/inference: cuda:0


In [3]:
# Path settings
# Main data directory (defined as environment variable in docker-compose.yml)
data_root = os.environ.get('DATA_ROOT')
if not data_root:
    # Without this guard an unset variable leaves data_root as None and the
    # first os.path.join() below fails with an unrelated-looking TypeError.
    raise RuntimeError('Environment variable DATA_ROOT is not set. '
                       'Set it to the main data directory before running this notebook.')

# Download directory (change as needed)
dentex_dir = os.path.join(data_root, 'dentex')
model_dir = os.path.join(data_root, 'model')
data_dir = os.path.join(dentex_dir, 'dentex_classification')

# This image directory is where the xrays are in the archive, so should be left as-is
image_dir = os.path.join(data_dir, 'quadrant-enumeration-disease', 'xrays')
cropped_image_dir = os.path.join(image_dir, 'crop')

# Directory for the output
output_dir = os.path.join(data_dir, 'output')

# Parquet file holding the data frame with the train/val/test split
data_file_name = 'dentex_disease_datasplit.parquet'
data_file = os.path.join(data_dir, data_file_name)

### Create PyTorch datasets from data frame ###

In [4]:
data_df = pd.read_parquet(data_file)

# Class names in alphabetical order
cl_names = sorted(data_df['label'].unique())

# Numeric class for each name, taken from the first row that carries the name
first_cl_by_label = data_df.drop_duplicates(subset='label').set_index('label')['cl']
cl_numbers = [first_cl_by_label.loc[name] for name in cl_names]

# Lookups in both directions: name -> number and number -> name
label_dict = dict(zip(cl_names, cl_numbers))
cl_dict = dict(zip(cl_numbers, cl_names))

# Show the class labels
display(pd.Series(label_dict, name=0))

Caries               1
Deep Caries          3
Impacted             0
Periapical Lesion    2
Name: 0, dtype: int64

In [5]:
# Split the data frame into training, validation and test rows
# according to the value in the 'dataset' column
dataset_col = data_df['dataset']
train_df, val_df, test_df = (data_df.loc[dataset_col == split_name]
                             for split_name in ('train', 'val', 'test'))

# Sorted unique sample names for each split
train_samples = sorted(train_df['box_name'].unique())
val_samples = sorted(val_df['box_name'].unique())
test_samples = sorted(test_df['box_name'].unique())

# Report the size of each split
print(f'Found {len(train_samples)} samples in the training set.')
print(f'Found {len(val_samples)} samples in the validation set.')
print(f'Found {len(test_samples)} samples in the test set.')
print()

Found 3289 samples in the training set.
Found 120 samples in the validation set.
Found 120 samples in the test set.



In [6]:
# Augmentations
# The image augmentations are applied as part of the PyTorch dataset

# The output of these transformations must match the input size required by the model
max_image_size = 550
im_size = 224


def _imagenet_normalize():
    """Return a Normalize transform built from ImageData's image_net_mean and image_net_std."""
    return alb.Normalize(mean=ImageData().image_net_mean,
                         std=ImageData().image_net_std)


# Training pipeline: resize to a slightly larger square, take a random crop of
# the target size, then apply the remaining augmentations and the normalization
oversize = im_size + 32
train_steps = [
    alb.Resize(oversize, oversize),
    alb.RandomCrop(im_size, im_size),
    alb.HorizontalFlip(),
    alb.ShiftScaleRotate(),
    alb.Blur(),
    alb.RandomGamma(),
    alb.Sharpen(),
    alb.GaussNoise(),
    alb.CoarseDropout(16, 32, 32),
    alb.CLAHE(),
    _imagenet_normalize(),
]
train_transform = alb.Compose(train_steps)

# Validation and testing use no augmentations, only the correct
# input size and the same image normalization
val_transform = alb.Compose([alb.Resize(im_size, im_size),
                             _imagenet_normalize()])

In [7]:
# Create the data sets from the data frames
# Settings shared by all three data sets
dataset_kwargs = dict(file_col='box_file',
                      label_col='cl',
                      max_image_size=max_image_size,
                      validate=True)

# Only the training set uses the augmenting transform;
# validation and test share the plain resize + normalize transform
train_dataset = DatasetFromDF(data=train_df, transform=train_transform, **dataset_kwargs)
val_dataset = DatasetFromDF(data=val_df, transform=val_transform, **dataset_kwargs)
test_dataset = DatasetFromDF(data=test_df, transform=val_transform, **dataset_kwargs)

INFO:computervision.imageproc:All files validated.
INFO:computervision.imageproc:All files validated.
INFO:computervision.imageproc:All files validated.


### Training the model with learning rate scheduling ###

In [9]:
# Model name and version (used for the log and checkpoint directories)
model_name, model_version = 'FancyLR', 1

# Random seed
seed = 234

# Training length: kept short here, train for 40 epochs to get good results
max_epochs = 5

# Model and data loading
num_classes = 4
batch_size = 16
num_workers = 2
initial_lr = 1e-3

# Validation and checkpoint schedule
check_val_every_n_epoch = 1
checkpoint_every_n_epoch = 2
save_top_k = 3

In [10]:
# Seed the random number generators before the model is created so that
# weight initialization, data shuffling and augmentations are repeatable
# between runs. Without this call the `seed` parameter has no effect.
# workers=True also makes Lightning derive seeds for the dataloader workers.
seed_everything(seed, workers=True)

# Create the model
model = ToothModel(train_dataset=train_dataset,
                   val_dataset=val_dataset,
                   test_dataset=test_dataset,
                   batch_size=batch_size,
                   num_classes=num_classes,
                   num_workers=num_workers,
                   lr=initial_lr)

# Setup logger
logger = TensorBoardLogger(save_dir=model_dir,
                           name=model_name,
                           version=model_version)

# Checkpoint callback
# Checkpoints go to <model_dir>/<model_name>/version_<model_version>/checkpoints
checkpoint_dir = os.path.join(model_dir, 
                              model_name,
                              f'version_{model_version}',
                              'checkpoints')

Path(checkpoint_dir).mkdir(exist_ok=True, parents=True)
# Keep the save_top_k checkpoints with the lowest 'val_loss' plus the last one,
# checking every checkpoint_every_n_epoch epochs at the end of the training epoch
chk_callback = ModelCheckpoint(dirpath=checkpoint_dir,
                               filename='dentexmodel-{epoch}',
                               monitor='val_loss',
                               mode='min',
                               save_last=True,
                               every_n_epochs=checkpoint_every_n_epoch,
                               save_on_train_epoch_end=True,
                               save_top_k=save_top_k)

In [11]:
# Learning rate search settings shared by both finder callbacks
lr_search_kwargs = dict(min_lr=1.0e-8,
                        max_lr=0.01,
                        mode='exponential',
                        early_stop_threshold=None,
                        update_attr=True)

# Finder configured with epoch milestones (100 search steps)
lr_finder = FineTuneLearningRateFinder(milestones=(5, 10),
                                       num_training_steps=100,
                                       **lr_search_kwargs)

# Plain Lightning finder with a longer search (300 search steps)
lr_starter = LearningRateFinder(num_training_steps=300,
                                **lr_search_kwargs)

# Log the learning rate (and momentum) once per epoch
lr_monitor = LearningRateMonitor(logging_interval='epoch',
                                 log_momentum=True)

In [12]:
# Announce the run
print(f'Training the "{model_name}" model for {max_epochs} epochs.')
print()

# Callbacks: checkpointing, learning rate finder and learning rate logging
trainer_callbacks = [chk_callback, lr_finder, lr_monitor]

# Collect the trainer settings, then build the trainer and start the fit
trainer_options = dict(max_epochs=max_epochs,
                       default_root_dir=model_dir,
                       callbacks=trainer_callbacks,
                       logger=logger,
                       check_val_every_n_epoch=check_val_every_n_epoch)
tr = Trainer(**trainer_options)
tr.fit(model)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Training the "FancyLR" model for 5 epochs.




  | Name      | Type             | Params | Mode 
-------------------------------------------------------
0 | model     | ResNet           | 24.6 M | train
1 | criterion | CrossEntropyLoss | 0      | train
2 | metrics   | ModuleDict       | 0      | train
-------------------------------------------------------
24.6 M    Trainable params
0         Non-trainable params
24.6 M    Total params
98.237    Total estimated model params size (MB)
161       Modules in train mode
0         Modules in eval mode


Training: |          | 0/? [00:00<?, ?it/s]                                


Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s][A
Finding best initial lr:   1%|          | 1/100 [00:00<00:16,  5.96it/s][A

Finding best initial lr:   5%|▌         | 5/100 [00:00<00:08, 11.38it/s][A
Finding best initial lr:   7%|▋         | 7/100 [00:00<00:07, 11.70it/s][A
Finding best initial lr:   9%|▉         | 9/100 [00:00<00:07, 11.79it/s][A
Finding best initial lr:  11%|█         | 11/100 [00:00<00:07, 11.90it/s][A
Finding best initial lr:  13%|█▎        | 13/100 [00:01<00:07, 11.90it/s][A
Finding best initial lr:  15%|█▌        | 15/100 [00:01<00:07, 11.95it/s][A
Finding best initial lr:  17%|█▋        | 17/100 [00:01<00:06, 11.94it/s][A
Finding best initial lr:  19%|█▉        | 19/100 [00:01<00:06, 11.90it/s][A
Finding best initial lr:  21%|██        | 21/100 [00:01<00:06, 11.90it/s][A
Finding best initial lr:  23%|██▎       | 23/100 [00:01<00:06, 11.84it/s][A
Finding best initial lr:  25%|██▌       | 25/100 [00:02<00:06, 11.89it/s][A
Finding be

Epoch 0:   1%|          | 2/206 [00:00<00:14, 14.41it/s, v_num=1, val_loss=1.390, val_accuracy=0.211, val_precision=0.106, val_recall=0.205, val_f1=0.138, val_auroc=0.458, val_lr=0.001]



Epoch 1: 100%|██████████| 206/206 [00:19<00:00, 10.55it/s, v_num=1, val_loss=1.390, val_accuracy=0.211, val_precision=0.106, val_recall=0.205, val_f1=0.138, val_auroc=0.458, val_lr=0.001]
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/8 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/8 [00:00<?, ?it/s][A
Validation DataLoader 0:  12%|█▎        | 1/8 [00:00<00:00, 17.03it/s][A
Validation DataLoader 0:  25%|██▌       | 2/8 [00:00<00:00, 17.77it/s][A
Validation DataLoader 0:  38%|███▊      | 3/8 [00:00<00:00, 18.50it/s][A
Validation DataLoader 0:  50%|█████     | 4/8 [00:00<00:00, 18.83it/s][A
Validation DataLoader 0:  62%|██████▎   | 5/8 [00:00<00:00, 19.28it/s][A
Validation DataLoader 0:  75%|███████▌  | 6/8 [00:00<00:00, 19.67it/s][A
Validation DataLoader 0:  88%|████████▊ | 7/8 [00:00<00:00, 19.99it/s][A
Validation DataLoader 0: 100%|██████████| 8/8 [00:00<00:00, 20.80it/s][A
Epoch 2: 100%|██████████| 206/206 [00:20<00:00,  9.8

`Trainer.fit` stopped: `max_epochs=5` reached.


Epoch 4: 100%|██████████| 206/206 [00:25<00:00,  8.20it/s, v_num=1, val_loss=2.070, val_accuracy=0.445, val_precision=0.301, val_recall=0.452, val_f1=0.334, val_auroc=0.688, val_lr=0.000479]
