## Notebook 6: The Lightning Model Class ##
This trains a ResNet 50 model with pretrained weights. The model "toothmodel1" contains the bare minimum code to train an image classification model. No metrics or checkpoints are saved in this one.

In [3]:
# Imports
import os
from pathlib import Path
import numpy as np
import pandas as pd
import cv2
import glob

# Matplotlib for plotting
from matplotlib import pyplot as plt
from matplotlib.pyplot import cm

# PyTorch packages
import torch
import torch.nn as nn
import lightning.pytorch as pl
from lightning.pytorch import Trainer

# Albumentations library
import albumentations as alb

# Appearance of the Notebook
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
np.set_printoptions(linewidth=110)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', 1000)

# Import this module with autoreload
%load_ext autoreload
%autoreload 2
import dentexmodel as dm
from dentexmodel.imageproc import ImageData
from dentexmodel.torchdataset import DatasetFromDF, load_and_process_image
print(f'dentexmodel package version:  {dm.__version__}')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
dentexmodel package version:  0.0.post1.dev36+gb44e647.d20240101


In [4]:
# Path settings 
dentex_dir = os.path.join(os.environ['HOME'], 'data', 'dentex')
data_dir = os.path.join(dentex_dir, 'dentex_disease')
image_dir = os.path.join(data_dir, 'quadrant-enumeration-disease', 'xrays', 'crop')
data_file_name = 'dentex_disease_datasplit.parquet'
data_file = os.path.join(dentex_dir, data_file_name)

In [5]:
# %% Package and GPU checks
print(f'PyTorch version:              {torch.__version__}')
print(f'PyTorch Lightning version:    {pl.__version__}')
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
print(torch.cuda.is_available())
print(f'Number of GPUs found:  {torch.cuda.device_count()}')
print(f'Current device ID:     {torch.cuda.current_device()}')
print(f'GPU device name:       {torch.cuda.get_device_name(0)}')
print(f'CUDNN version:         {torch.backends.cudnn.version()}')
torch.set_float32_matmul_precision(precision='high')

PyTorch version:              2.1.2+cu121
PyTorch Lightning version:    2.1.3
True
Number of GPUs found:  1
Current device ID:     0
GPU device name:       NVIDIA GeForce RTX 3060 Laptop GPU
CUDNN version:         8902


### Create PyTorch datasets from data frame ###

In [6]:
data_df = pd.read_parquet(data_file)
# Convert class names to labels
cl_names = sorted(list(data_df['label'].unique()))
# Let's assign number to the classes
label_dict = dict(zip(cl_names, range(len(cl_names))))
cl_dict = dict(zip(label_dict.values(), label_dict.keys()))

# Add the class labels to the data frame
seed = np.random.seed(123)
data_df = data_df.\
                assign(cl=data_df['label'].apply(lambda l: label_dict.get(l))).\
                sample(frac=1, random_state=seed).\
                reset_index(drop=True)

# Show the class labels
display(pd.DataFrame(label_dict, index=[0]).iloc[0])

# Save the data frame with the class labels
data_file_name_cl = 'dentex_disease_datasplit_cl.parquet'
data_file = os.path.join(dentex_dir, data_file_name_cl)
data_df.to_parquet(data_file)

Caries               0
Deep Caries          1
Impacted             2
Periapical Lesion    3
Name: 0, dtype: int64

In [7]:
# Select the samples for training, validation and testing from our data frame
train_df = data_df.loc[data_df['dataset']=='train']
val_df = data_df.loc[data_df['dataset']=='val']
test_df = data_df.loc[data_df['dataset']=='test']

train_samples = sorted(list(train_df['box_name'].unique()))
print(f'Found {len(train_samples)} samples in the training set.')
val_samples = sorted(list(val_df['box_name'].unique()))
print(f'Found {len(val_samples)} samples in the validation set.')
test_samples = sorted(list(test_df['box_name'].unique()))
print(f'Found {len(test_samples)} samples in the test set.')
print()

Found 3349 samples in the training set.
Found 60 samples in the validation set.
Found 120 samples in the test set.



In [8]:
# Augmentations
# Image augmentations is part of the PyTorch dataset

# The output of this transformation must match the required input size for the model
max_image_size = 550
im_size = 224

# Definition of the image augmentations for the training set
train_transform = alb.Compose([
    alb.Resize(im_size + 32, im_size + 32),
    alb.RandomCrop(im_size, im_size),
    alb.HorizontalFlip(),
    alb.ShiftScaleRotate(),
    alb.Blur(),
    alb.RandomGamma(),
    alb.Sharpen(),
    alb.GaussNoise(),
    alb.CoarseDropout(16, 32, 32),
    alb.CLAHE(),
    alb.Normalize(mean=ImageData().image_net_mean, 
                  std=ImageData().image_net_std)])

# Vor validation and testing, we do not want any augmentations
# but we will still need the correct input size and image normalization
val_transform = alb.Compose([
    alb.Resize(im_size, im_size),
    alb.Normalize(mean=ImageData().image_net_mean, 
                  std=ImageData().image_net_std)])

In [9]:
# Create the data sets from the data frame
train_dataset = DatasetFromDF(data=train_df,
                              file_col='box_file',
                              label_col='cl',
                              max_image_size=max_image_size,
                              transform=train_transform,
                              validate=True)

test_dataset = DatasetFromDF(data=test_df,
                             file_col='box_file',
                             label_col='cl',
                             max_image_size=max_image_size,
                             transform=val_transform,
                             validate=True)

### The image model with pre-trained weights ###
The torchvision.models subpackage contains definitions of models for addressing different tasks, including: image classification, pixelwise semantic segmentation, object detection, instance segmentation, person keypoint detection, video classification, and optical flow.

TorchVision offers pre-trained weights for every provided architecture, using the PyTorch torch.hub. Instancing a pre-trained model will download its weights to a cache directory. This directory can be set using the TORCH_HOME environment variable. See torch.hub.load_state_dict_from_url() for details.

A list of all models is here:
https://pytorch.org/vision/stable/models.html#classification

The ResNet50 model:
https://pytorch.org/vision/stable/models/generated/torchvision.models.resnet50.html#torchvision.models.resnet50

In [10]:
from torchvision.models import resnet50, ResNet50_Weights

class ResNet50Model:
    """ This is the ResNet50 model from torchvision.models """
    def __init__(self, n_outputs=4):
        self.n_outputs = n_outputs

    def create_model(self):
        model = resnet50(weights=ResNet50_Weights.DEFAULT)
        model.fc = nn.Sequential(
            nn.Linear(in_features=model.fc.in_features, out_features=512),
            nn.ReLU(),
            nn.Linear(in_features=512, out_features=self.n_outputs)
        )
        return model

In [12]:
# toothmodel1 is a minimal Lightning model to train
# there is no trainig or validation metrics, just the bar minimum
from dentexmodel.models.toothmodel_basic import ToothModel
model = ToothModel(train_dataset=train_dataset,
                   batch_size=40,
                   num_workers=8,
                   model=ResNet50Model(n_outputs=4).create_model())

### Test the model output ###

In [13]:
# Run one batch of images through the model
dl = model.train_dataloader()
image_batch, label_batch = next(iter(dl))
print(image_batch.numpy().shape)
print(label_batch.numpy().shape)

(40, 3, 224, 224)
(40,)


### Train the model ###

In [14]:
# Create a checkpoint directory so that we can save the latest model
checkpoint_dir = os.path.join(dentex_dir, 'checkpoints_test')
Path(checkpoint_dir).mkdir(parents=True, exist_ok=True) 

In [15]:
# Create the trainer object and train the model for 20 epochs
max_epochs = 10
tr = Trainer(max_epochs=max_epochs,
             deterministic=True,
             accelerator='gpu',
             default_root_dir=checkpoint_dir)
# Run the training
tr.fit(model)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Missing logger folder: /home/andreas/data/dentex/checkpoints_test/lightning_logs
2024-01-07 20:41:07.090770: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-01-07 20:41:07.543030: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-07 20:41:07.543119: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-07 20:41:07.613881: E external/local_xla/xla

Training: |                                                                                                   …

`Trainer.fit` stopped: `max_epochs=10` reached.


In [16]:
print()
sm = nn.Softmax(dim=1)
display(sm(model(image_batch)))




tensor([[9.1063e-01, 7.7009e-02, 5.7225e-03, 6.6351e-03],
        [3.9158e-01, 7.8333e-02, 5.2702e-01, 3.0635e-03],
        [6.7680e-01, 1.9486e-01, 7.5903e-04, 1.2758e-01],
        [8.5736e-01, 1.0547e-01, 6.4368e-03, 3.0725e-02],
        [3.3564e-01, 1.4578e-01, 5.1167e-01, 6.9097e-03],
        [1.1596e-01, 7.4793e-01, 1.3124e-01, 4.8678e-03],
        [7.5128e-01, 1.1181e-01, 9.0656e-02, 4.6258e-02],
        [1.3047e-03, 1.1261e-02, 9.8740e-01, 3.5594e-05],
        [8.5907e-01, 1.0631e-01, 2.2981e-04, 3.4396e-02],
        [5.4078e-01, 3.6396e-01, 8.1081e-02, 1.4183e-02],
        [7.6980e-01, 1.9473e-01, 8.4000e-03, 2.7073e-02],
        [8.0755e-01, 1.2225e-01, 5.8281e-02, 1.1922e-02],
        [8.1396e-01, 1.3205e-01, 3.9883e-02, 1.4112e-02],
        [6.2633e-01, 3.3829e-01, 5.3240e-03, 3.0052e-02],
        [9.3687e-01, 5.7429e-02, 3.2127e-03, 2.4885e-03],
        [9.5250e-01, 4.0683e-02, 1.8190e-03, 4.9961e-03],
        [8.9037e-01, 9.8355e-02, 6.0761e-03, 5.1946e-03],
        [3.260

In [18]:
# Load the model from the checkpoint
saved_checkpoint_dir = os.path.join(dentex_dir, 'checkpoints')
model_checkpoint_dir = os.path.join(saved_checkpoint_dir, 'lightning_logs', 
                                    'version_0', 'checkpoints')
checkpoint_file = glob.glob(os.path.join(model_checkpoint_dir, 'tooth*'))[0]
print(model_checkpoint_dir)
print(checkpoint_file)
# Load the model from the latest checkpoint
loaded_model = ToothModel.load_from_checkpoint(checkpoint_file)

/home/andreas/data/dentex/checkpoints/lightning_logs/version_0/checkpoints
/home/andreas/data/dentex/checkpoints/lightning_logs/version_0/checkpoints/toothmodel1_50.ckpt


In [19]:
# Now, we can compare the outputs for the trained model
display(sm(model(image_batch))[:5,:])
print()
# With the outputs for the loaded model
display(sm(loaded_model(image_batch.cuda()))[:5,:])
print()
# OK, and now we compare this with an untrained new model
new_model = ToothModel(train_dataset=train_dataset,
                       batch_size=4,
                       num_workers=1,
                       model=ResNet50Model(n_outputs=4).create_model())
display(sm(new_model(image_batch))[:5,:])

tensor([[9.1063e-01, 7.7009e-02, 5.7225e-03, 6.6351e-03],
        [3.9158e-01, 7.8333e-02, 5.2702e-01, 3.0635e-03],
        [6.7680e-01, 1.9486e-01, 7.5903e-04, 1.2758e-01],
        [8.5736e-01, 1.0547e-01, 6.4368e-03, 3.0725e-02],
        [3.3564e-01, 1.4578e-01, 5.1167e-01, 6.9097e-03]],
       grad_fn=<SliceBackward0>)




tensor([[9.4948e-01, 5.0314e-02, 4.0946e-06, 2.0496e-04],
        [2.6589e-03, 5.4265e-03, 9.9183e-01, 8.7793e-05],
        [3.7348e-01, 6.2275e-01, 1.8148e-06, 3.7767e-03],
        [1.7629e-01, 1.1502e-01, 6.8900e-01, 1.9687e-02],
        [3.7930e-01, 3.9475e-01, 1.9434e-01, 3.1606e-02]], device='cuda:0',
       grad_fn=<SliceBackward0>)




tensor([[0.2391, 0.2594, 0.2550, 0.2465],
        [0.2419, 0.2557, 0.2359, 0.2665],
        [0.2616, 0.2300, 0.2390, 0.2694],
        [0.2520, 0.2506, 0.2421, 0.2553],
        [0.2412, 0.2633, 0.2537, 0.2417]], grad_fn=<SliceBackward0>)

In [26]:
# Link to the ckeckpoint
link = 'https://dsets.s3.amazonaws.com/dentex/toothmodel1_50.ckpt'