## Advanced model with learning rate scheduler and performance metrics ##

In [17]:
# Imports
import os
from pathlib import Path
import numpy as np
import pandas as pd
import cv2
import glob

# Matplotlib for plotting
from matplotlib import pyplot as plt
from matplotlib.pyplot import cm

# PyTorch packages
import torch
import torch.nn as nn
from lightning.pytorch.loggers import TensorBoardLogger
import lightning.pytorch as pl
from lightning.pytorch import Trainer, seed_everything
from lightning.pytorch.callbacks import ModelCheckpoint, LearningRateFinder, LearningRateMonitor
import torchmetrics

# Albumentations library
import albumentations as alb

# Appearance of the Notebook
# Inject a CSS rule into the page so that elements with the "container" class
# use the full browser width (presumably the classic-notebook page container;
# the rule has no effect if the front end does not use that class -- TODO confirm).
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
# Maximum line width used when NumPy arrays are printed
np.set_printoptions(linewidth=110)

# pandas display limits, applied one option at a time
for option_name, option_value in (('display.max_rows', 500),
                                  ('display.max_columns', 100),
                                  ('display.width', 1000)):
    pd.set_option(option_name, option_value)

# Import this module with autoreload
# The autoreload extension must be loaded before the package import below.
# Mode 2 reloads all imported modules before each cell is executed, so edits
# to the dentexmodel source are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
import dentexmodel as dm
from dentexmodel.fileutils import FileOP
from dentexmodel.imageproc import ImageData
from dentexmodel.models.toothmodel_fancy import ToothModel, FineTuneLearningRateFinder
from dentexmodel.torchdataset import DatasetFromDF, load_and_process_image
# Record the package version that produced the results in this notebook
print(f'dentexmodel package version:  {dm.__version__}')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
dentexmodel package version:  0.0.post1.dev142+g4a80787.d20240219


In [18]:
# GPU checks: report the CUDA environment and pick the compute device
is_cuda = torch.cuda.is_available()
print(f'CUDA available: {is_cuda}')
print(f'Number of GPUs found:  {torch.cuda.device_count()}')

# Start from the CPU and switch to the first GPU only when CUDA is usable
device_str = 'cpu'
if is_cuda:
    print(f'Current device ID:     {torch.cuda.current_device()}')
    print(f'GPU device name:       {torch.cuda.get_device_name(0)}')
    print(f'CUDNN version:         {torch.backends.cudnn.version()}')
    # Release cached GPU memory left over from earlier runs in this kernel
    torch.cuda.empty_cache()
    device_str = 'cuda:0'

device = torch.device(device_str)
print()
print(f'Device for model training/inference: {device}')

CUDA available: True
Number of GPUs found:  1
Current device ID:     0
GPU device name:       NVIDIA GeForce RTX 3070 Laptop GPU
CUDNN version:         8902

Device for model training/inference: cuda:0


In [19]:
# Path settings
# Resolve the home directory with os.path.expanduser('~') rather than
# os.environ['HOME']: on POSIX it returns the value of HOME when that variable
# is set, but it does not raise a KeyError when HOME is missing (for example
# on Windows, where the user profile directory is used instead).
home_dir = os.path.expanduser('~')

# Root of the dentex data, the classification subset and the model output directory
dentex_dir = os.path.join(home_dir, 'data', 'dentex')
data_dir = os.path.join(dentex_dir, 'dentex_classification')
model_dir = os.path.join(home_dir, 'data', 'model')

# Directory with the cropped images and the parquet file with the data split
image_dir = os.path.join(data_dir, 'quadrant-enumeration-disease', 'xrays', 'crop')
data_file_name = 'dentex_disease_datasplit.parquet'
data_file = os.path.join(data_dir, data_file_name)

### Create PyTorch datasets from data frame ###

In [20]:
# Load the data split table
data_df = pd.read_parquet(data_file)

# Class names in alphabetical order
cl_names = sorted(data_df['label'].unique())

# Map each class name to the numeric class stored in the 'cl' column
# (taken from the first row that carries that label)
label_dict = {
    class_name: data_df.loc[data_df['label'] == class_name, 'cl'].values[0]
    for class_name in cl_names
}
cl_numbers = list(label_dict.values())

# Reverse lookup: numeric class -> class name
cl_dict = {class_number: class_name for class_name, class_number in label_dict.items()}

# Show the class labels
display(pd.DataFrame(label_dict, index=[0]).iloc[0])

Caries               1
Deep Caries          3
Impacted             0
Periapical Lesion    2
Name: 0, dtype: int64

In [21]:
# Select the samples for training, validation and testing from our data frame
# The 'dataset' column holds the split assignment of every row
dataset_col = data_df['dataset']
train_df = data_df.loc[dataset_col == 'train']
val_df = data_df.loc[dataset_col == 'val']
test_df = data_df.loc[dataset_col == 'test']

# Sorted unique box names for each split
train_samples, val_samples, test_samples = (
    sorted(split_df['box_name'].unique())
    for split_df in (train_df, val_df, test_df)
)

# Report the size of each split
for split_title, split_samples in (('training', train_samples),
                                   ('validation', val_samples),
                                   ('test', test_samples)):
    print(f'Found {len(split_samples)} samples in the {split_title} set.')
print()

Found 3289 samples in the training set.
Found 120 samples in the validation set.
Found 120 samples in the test set.



In [22]:
# Augmentations
# The image augmentations are applied inside the PyTorch dataset

# The output of the transformations must match the input size required by the model
max_image_size = 550
im_size = 224

# Training images are first resized with this margin and then randomly cropped
# back to im_size x im_size
resize_margin = 32
padded_size = im_size + resize_margin


def _imagenet_normalize():
    """Return a new Normalize transform built from the image_net_mean and
    image_net_std attributes of ImageData."""
    return alb.Normalize(mean=ImageData().image_net_mean,
                         std=ImageData().image_net_std)


# Definition of the image augmentations for the training set
train_augmentations = [
    alb.Resize(padded_size, padded_size),
    alb.RandomCrop(im_size, im_size),
    alb.HorizontalFlip(),
    alb.ShiftScaleRotate(),
    alb.Blur(),
    alb.RandomGamma(),
    alb.Sharpen(),
    alb.GaussNoise(),
    # NOTE(review): positional arguments; presumably max_holes, max_height and
    # max_width -- confirm against the installed albumentations version
    alb.CoarseDropout(16, 32, 32),
    alb.CLAHE(),
    _imagenet_normalize(),
]
train_transform = alb.Compose(train_augmentations)

# Validation and test images are not augmented, but they still need
# the correct input size and the same normalization
val_transform = alb.Compose([
    alb.Resize(im_size, im_size),
    _imagenet_normalize(),
])

In [23]:
# Create the data sets from the data frame
# Arguments that are identical for all three data sets
dataset_kwargs = dict(file_col='box_file',
                      label_col='cl',
                      max_image_size=max_image_size,
                      validate=True)

# Only the training set uses the augmenting transform
train_dataset = DatasetFromDF(data=train_df, transform=train_transform, **dataset_kwargs)

# Validation and test sets share the augmentation-free transform
val_dataset = DatasetFromDF(data=val_df, transform=val_transform, **dataset_kwargs)
test_dataset = DatasetFromDF(data=test_df, transform=val_transform, **dataset_kwargs)

### Figure out the optimal learning rate ###
Use the automatic learning rate finder in Lightning to determine an initial learning rate.

In [26]:
# Model parameters and name
# Run identification (used for the logger name and version) and random seed
model_name, model_version = 'FancyLR', 1
seed = 234

# Values passed to the model: batch size, loader workers, number of classes
batch_size, num_workers = 16, 1
num_classes = 4

# Training length and starting learning rate
max_epochs = 20
initial_lr = 1e-3

# Validation and checkpoint schedule
check_val_every_n_epoch = 1
checkpoint_every_n_epoch = 2
save_top_k = 3

In [27]:
# Apply the configured seed before any model weights are created. In the
# visible cells `seed` is defined and `seed_everything` is imported, but the
# seed is never applied, so weight initialization and the random augmentations
# were not reproducible between runs. Seeding here also makes this cell
# idempotent: re-running it rebuilds the model from the same random state.
# workers=True additionally lets Lightning seed the dataloader worker processes.
seed_everything(seed, workers=True)

# Create the model
# The data sets and loader settings are handed to the model, which is then
# passed to Trainer.fit without separate dataloaders.
model = ToothModel(train_dataset=train_dataset,
                   val_dataset=val_dataset,
                   test_dataset=test_dataset,
                   batch_size=batch_size,
                   num_classes=num_classes,
                   num_workers=num_workers,
                   lr=initial_lr)

# Setup logger
# TensorBoard logs are written below model_dir/<model_name>/version_<model_version>
logger = TensorBoardLogger(save_dir=model_dir,
                           name=model_name,
                           version=model_version)

# Checkpoint callback
# Keeps the save_top_k checkpoints with the lowest 'val_loss' plus the last one;
# checkpointing is evaluated every checkpoint_every_n_epoch epochs at the end
# of the training epoch.
chk_callback = ModelCheckpoint(dirpath=model_dir,
                               filename='dentexmodel-{epoch}',
                               monitor='val_loss',
                               mode='min',
                               save_last=True,
                               every_n_epochs=checkpoint_every_n_epoch,
                               save_on_train_epoch_end=True,
                               save_top_k=save_top_k)

In [28]:
# Search settings shared by both learning rate finder callbacks
lr_search_kwargs = dict(min_lr=1.0e-8,
                        max_lr=0.01,
                        mode='exponential',
                        early_stop_threshold=None,
                        update_attr=True)

# Learning rate finder configured with epoch milestones (5, 10) and 100 search steps
lr_finder = FineTuneLearningRateFinder(milestones=(5, 10),
                                       num_training_steps=100,
                                       **lr_search_kwargs)

# Standard Lightning learning rate finder with a longer search of 300 steps
# NOTE(review): not among the callbacks passed to the Trainer in the visible cells
lr_starter = LearningRateFinder(num_training_steps=300,
                                **lr_search_kwargs)

# Log the learning rate (and momentum) once per epoch
lr_monitor = LearningRateMonitor(logging_interval='epoch',
                                 log_momentum=True)

In [29]:
# Callbacks active during training: checkpointing, learning rate search
# and learning rate logging
training_callbacks = [chk_callback, lr_finder, lr_monitor]

# Collect the trainer settings first, then build the trainer from them
trainer_kwargs = dict(max_epochs=max_epochs,
                      default_root_dir=model_dir,
                      callbacks=training_callbacks,
                      logger=logger,
                      check_val_every_n_epoch=check_val_every_n_epoch)
tr = Trainer(**trainer_kwargs)

# Run the training loop; the model is passed without separate dataloaders
tr.fit(model)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/usr/local/lib/python3.10/dist-packages/lightning/pytorch/callbacks/model_checkpoint.py:652: Checkpoint directory /app/data/model exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type             | Params
-----------------------------------------------
0 | model     | ResNet           | 24.6 M
1 | criterion | CrossEntropyLoss | 0     
2 | metrics   | ModuleDict       | 0     
-----------------------------------------------
24.6 M    Trainable params
0         Non-trainable params
24.6 M    Total params
98.237    Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_steps=100` reached.
Learning rate set to 0.0016595869074375604
Restoring states from the checkpoint path at /app/data/model/.lr_find_188f015a-7f5e-4612-bb3b-a8b125fcd626.ckpt
Restored all states from the checkpoint at /app/data/model/.lr_find_188f015a-7f5e-4612-bb3b-a8b125fcd626.ckpt
/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

/usr/local/lib/python3.10/dist-packages/lightning/pytorch/loops/training_epoch_loop.py:156: You're resuming from a checkpoint that ended before the epoch ended. This can cause unreliable results if further training is done. Consider using an end-of-epoch checkpoint


Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_steps=1030` reached.
Learning rate set to 0.0033113112148259126
Restoring states from the checkpoint path at /app/data/model/.lr_find_cceb4714-d15a-43d1-887e-1c9c5d66d48a.ckpt
Restored all states from the checkpoint at /app/data/model/.lr_find_cceb4714-d15a-43d1-887e-1c9c5d66d48a.ckpt
/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]



Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

/usr/local/lib/python3.10/dist-packages/lightning/pytorch/loops/training_epoch_loop.py:156: You're resuming from a checkpoint that ended before the epoch ended. This can cause unreliable results if further training is done. Consider using an end-of-epoch checkpoint


Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_steps=2060` reached.
Learning rate set to 0.0014454397707459273
Restoring states from the checkpoint path at /app/data/model/.lr_find_b647a05d-d14c-419c-bd06-9086463ba331.ckpt
Restored all states from the checkpoint at /app/data/model/.lr_find_b647a05d-d14c-419c-bd06-9086463ba331.ckpt
/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Validation: |          | 0/? [00:00<?, ?it/s]



Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/call.py:54: Detected KeyboardInterrupt, attempting graceful shutdown...
