### DETR model training ###
Build the model training script

In [1]:
import sys
import os
import json
import numpy as np
import pandas as pd
import datetime
import logging
from pathlib import Path
from matplotlib import pyplot as plt
from matplotlib import patches

# PyTorch
import torch

# Hugging Face Library
from transformers import RTDetrV2ForObjectDetection, RTDetrImageProcessor
from transformers import TrainingArguments, Trainer

%load_ext autoreload
%autoreload 2
import computervision
from computervision.imageproc import is_image
from computervision.datasets import DTRdataset, get_gpu_info
from computervision.transformations import AugmentationTransform
from computervision.mapeval import MAPEvaluator

# Report the project and interpreter versions this notebook was run with
print(f'Project version: {computervision.__version__}',
      f'Python version:  {sys.version}',
      sep='\n')

  return torch._C._cuda_getDeviceCount() > 0


Project version: v0.0.1
Python version:  3.12.3 (main, Jun 18 2025, 17:59:45) [GCC 13.3.0]


In [2]:
# Set training device
device, device_str = get_gpu_info()

# Today's date as a compact YYMMDD stamp, kept as a string
date_str = f'{datetime.date.today():%y%m%d}'
print(f'Date: {date_str}')

CUDA available: False
Number of GPUs found:  1
Date: 250925


In [3]:
# Directories and files
# expanduser('~') honours $HOME when it is set but, unlike os.environ.get('HOME'),
# never returns None (which would make os.path.join raise a TypeError).
data_dir = os.path.join(os.path.expanduser('~'), 'data')
train_image_dir = os.path.join(data_dir, 'dentex', 'cropped')
val_image_dir = os.path.join(train_image_dir, 'test')
model_dir = os.path.join(data_dir, 'model')
train_annotation_file_name = 'train_quadrant_enumeration_dset.parquet'
train_annotation_file = os.path.join(train_image_dir, train_annotation_file_name)
val_annotation_file_name = 'train_quadrant_enumeration_test_set.parquet'
val_annotation_file = os.path.join(val_image_dir, val_annotation_file_name)

# Column names for the annotation files
label_col = 'label'
label_name_col = 'ada'
file_name_col = 'file_name'
bbox_col = 'bbox'
dset_col = 'dset'

# Training and model parameters
model_version = 1
im_size = 640
# Model name pattern: rtdetr_<date_str>_<two-digit model version>
model_name = f'rtdetr_{date_str}_{str(model_version).zfill(2)}'
print(model_name)

# Important information about the model that we want to save
# (each key appears exactly once; a repeated key in a dict literal is silently overwritten)
model_info = {'model_version': model_version,
              'project_version': computervision.__version__,
              'model_name': model_name,
              'model_dir': model_dir,
              'train_image_dir': train_image_dir,
              'val_image_dir': val_image_dir,
              'image_width': im_size,
              'image_height': im_size,
              'hf_checkpoint': 'PekingU/rtdetr_v2_r101vd',
              'train_transform': 'train_1',
              'val_transform': 'val_1'}

# Specific arguments for the Trainer.
# See: https://huggingface.co/docs/transformers/en/main_classes/trainer#trainer
training_args = {'output_dir': os.path.join(model_dir, model_name),
                 'num_train_epochs': 20,
                 'max_grad_norm': 0.1,
                 'learning_rate': 5e-5,
                 'warmup_steps': 300,
                 'per_device_train_batch_size': 4,
                 'dataloader_num_workers': 2,
                 'metric_for_best_model': 'eval_map',
                 'greater_is_better': True,
                 'load_best_model_at_end': True,
                 'eval_strategy': 'epoch',
                 'save_strategy': 'epoch',
                 'save_total_limit': 2,
                 'remove_unused_columns': False,
                 'eval_do_concat_batches': False}

# We want to maintain the aspect ratio of the images
# So, we resize the image first and then pad it
processor_params = {'do_resize': True,
                    'size': {'max_height': im_size,
                             'max_width': im_size},
                    'do_pad': True,
                    'pad_size': {'height': im_size,
                                 'width': im_size}}

# Everything needed to reproduce this run, bundled for serialization
model_dict = {'model_info': model_info,
              'training_args': training_args,
              'processor_params': processor_params}

# Dump the model parameters to a file
# Create the model directory first so the write also works on a fresh machine
os.makedirs(model_dir, exist_ok=True)
json_file = os.path.join(model_dir, f'{model_name}.json')
with open(json_file, 'w') as f:
    json.dump(model_dict, f, indent=4) # indent for pretty-printing

rtdetr_250923_01


### Verify the image data ###
Make sure that the directories and annotation files are correct.

In [4]:
def _count_images_on_disk(image_dir, file_names):
    """Return how many of `file_names` inside `image_dir` are accepted by `is_image`."""
    return np.sum([is_image(os.path.join(image_dir, name)) for name in file_names])


# Training annotations: keep only the rows flagged as 'train'
train_df = pd.read_parquet(train_annotation_file)
train_df = train_df.loc[train_df[dset_col] == 'train']

# Validation annotations: keep rows flagged as 'val', restricted to
# quadrants 14 and 23, and take only the first augmentation (transformation == 0)
val_df = pd.read_parquet(val_annotation_file)
val_df = val_df.loc[
    (val_df[dset_col] == 'val') &
    (val_df['quadrants'].isin([14, 23])) &
    (val_df['transformation'] == 0)]

# Check the images on disk
# The "Images" and "Files checked" counts should match for each split
train_file_list = list(train_df[file_name_col].unique())
train_checked = _count_images_on_disk(train_image_dir, train_file_list)
print(f'Images in training data:         {len(train_file_list)}')
print(f'Files checked in training data:  {train_checked}')
print(f'Annotations in training data:    {train_df.shape[0]}')
print()
val_file_list = list(val_df[file_name_col].unique())
val_checked = _count_images_on_disk(val_image_dir, val_file_list)
print(f'Images in validation data:       {len(val_file_list)}')
print(f'Files checked in val data:       {val_checked}')
print(f'Annotations in validation data:  {val_df.shape[0]}')

Images in training data:         2391
Files checked in training data:  2391
Annotations in training data:    34166

Images in validation data:       32
Files checked in val data:       32
Annotations in validation data:  372


### Set up the logger ###

In [5]:
# Logging goes to a file named after the model, inside the model directory
log_file_name = f'{model_name}.log'
log_file = os.path.join(model_dir, log_file_name)

# Timestamp and record layouts for every log line
dtfmt = '%y%m%d-%H:%M'
logfmt = '%(asctime)s-%(name)s-%(levelname)s-%(message)s'

# filemode='w' starts a fresh log file on every run of this cell;
# force=True replaces handlers already attached to the root logger
logging.basicConfig(
    filename=log_file, filemode='w',
    level=logging.INFO,
    format=logfmt, datefmt=dtfmt,
    force=True,
)

logger = logging.getLogger(__name__)

### Datasets ###

In [14]:
# Create the label ids (tooth position, but starting from 0)
# The model needs label ids, not labels. So we need to add a label id column
label_name_list = sorted(list(train_df[label_name_col].unique()))
# id -> name and name -> id lookups; names are stored as strings, ids as plain ints
id2label = {label_id: str(label_name) for label_id, label_name in enumerate(label_name_list)}
label2id = {label_name: label_id for label_id, label_name in id2label.items()}

# The mapping is built from the training data only. Fail early with a clear message
# if the validation data contains a label the mapping does not know, instead of
# silently writing missing values into the label id column.
unknown_val_labels = sorted({str(name) for name in val_df[label_name_col].unique()} - set(label2id))
if unknown_val_labels:
    raise ValueError(f'Validation labels missing from the training data: {unknown_val_labels}')

# Write the ids into the column named by label_col
train_df = train_df.assign(
    **{label_col: train_df[label_name_col].apply(lambda name: label2id[str(name)])})
val_df = val_df.assign(
    **{label_col: val_df[label_name_col].apply(lambda name: label2id[str(name)])})
display(train_df.head(2))
display(val_df.head(2))

Unnamed: 0,bbox,segmentation,height,width,file_name,file_base_name,quadrants,quadrant,pos,fdi,ada,dset,label
0,"[666, 102, 103, 376]","[[757, 478, 769, 102, 678, 113, 666, 469]]",494,1473,train_0_12.png,train_0,12,1,1,11,8,train,7
1,"[593, 107, 85, 377]","[[666, 484, 678, 110, 607, 107, 604, 299, 619,...",494,1473,train_0_12.png,train_0,12,1,2,12,7,train,6


Unnamed: 0,bbox,quadrant,ada,file_name,file_base_name,quadrants,height,width,transformation,transformation_name,dset,label
125,"[459, 0, 11, 323]",1,8,val_train_143_14_00.png,train_143,14,471,516,0,test_set,val,7
126,"[399, 8, 70, 294]",1,7,val_train_143_14_00.png,train_143,14,471,516,0,test_set,val,6


In [20]:
# Load the augmentation transforms
aug = AugmentationTransform(image_width=model_info.get('image_width'),
                            image_height=model_info.get('image_height'))
train_transform = aug.get_transforms(name=model_info.get('train_transform'))
val_transform = aug.get_transforms(name=model_info.get('val_transform'))
# Print one transform per line so the pipelines are recorded in the cell output
print(*train_transform, sep='\n')
print()
print(*val_transform, sep='\n')

# Load the image processor
model_checkpoint = model_info.get('hf_checkpoint')
image_processor = RTDetrImageProcessor.from_pretrained(model_checkpoint, **processor_params)

# Load model from checkpoint
# The label maps defined here differ in size from the checkpoint's classification
# layers, so ignore_mismatched_sizes=True lets those layers be newly initialized
model = RTDetrV2ForObjectDetection.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
    anchor_image_size=None,
    ignore_mismatched_sizes=True)

# Set the evaluation metrics
eval_compute_metrics_fn = MAPEvaluator(image_processor=image_processor, threshold=0.01, id2label=id2label)

# Build the TrainingArguments from the plain dict kept in model_dict.
# Unpacking `training_args` itself would fail when this cell is re-run, because
# the name is then already bound to the TrainingArguments object created here.
training_args = TrainingArguments(**model_dict['training_args'])

# Create the data sets
train_dataset = DTRdataset(data=train_df,
                           image_processor=image_processor,
                           image_dir=train_image_dir,
                           file_name_col=file_name_col,
                           label_id_col=label_col,
                           bbox_col=bbox_col,
                           transforms=train_transform)

# The validation set must use the validation annotations and image directory;
# building it from the training data would evaluate the model on data it trained on
val_dataset = DTRdataset(data=val_df,
                         image_processor=image_processor,
                         image_dir=val_image_dir,
                         file_name_col=file_name_col,
                         label_id_col=label_col,
                         bbox_col=bbox_col,
                         transforms=val_transform)

RandomCropFromBorders(p=1.0, crop_bottom=0.5, crop_left=0.3, crop_right=0.3, crop_top=0.5)
CenterCrop(p=1.0, border_mode=0, fill=0.0, fill_mask=0.0, height=640, pad_if_needed=True, pad_position='center', width=640)
Affine(p=0.5, balanced_scale=False, border_mode=0, fill=0.0, fill_mask=0.0, fit_output=False, interpolation=1, keep_ratio=False, mask_interpolation=0, rotate=(1.0, 1.0), rotate_method='largest_box', scale={'x': (0.8, 1.2), 'y': (0.8, 1.2)}, shear={'x': (0.0, 0.0), 'y': (0.0, 0.0)}, translate_percent=None, translate_px={'x': (0, 0), 'y': (0, 0)})
RandomBrightnessContrast(p=0.5, brightness_by_max=True, brightness_limit=(-0.2, 0.2), contrast_limit=(-0.2, 0.2), ensure_safe_range=False)
Sharpen(p=0.5, alpha=(0.2, 0.5), kernel_size=5, lightness=(0.5, 1.0), method='kernel', sigma=1.0)
CLAHE(p=0.5, clip_limit=(1.0, 4.0), tile_grid_size=(8, 8))

CenterCrop(p=1.0, border_mode=0, fill=0.0, fill_mask=0.0, height=640, pad_if_needed=True, pad_position='center', width=640)


Some weights of RTDetrV2ForObjectDetection were not initialized from the model checkpoint at PekingU/rtdetr_v2_r101vd and are newly initialized because the shapes did not match:
- model.decoder.class_embed.0.bias: found shape torch.Size([80]) in the checkpoint and torch.Size([32]) in the model instantiated
- model.decoder.class_embed.0.weight: found shape torch.Size([80, 256]) in the checkpoint and torch.Size([32, 256]) in the model instantiated
- model.decoder.class_embed.1.bias: found shape torch.Size([80]) in the checkpoint and torch.Size([32]) in the model instantiated
- model.decoder.class_embed.1.weight: found shape torch.Size([80, 256]) in the checkpoint and torch.Size([32, 256]) in the model instantiated
- model.decoder.class_embed.2.bias: found shape torch.Size([80]) in the checkpoint and torch.Size([32]) in the model instantiated
- model.decoder.class_embed.2.weight: found shape torch.Size([80, 256]) in the checkpoint and torch.Size([32, 256]) in the model instantiated
- mode

NameError: name 'MAPEvaluator' is not defined