# Prep

Setting up some prior functionality

In [1]:
import torch, torchvision
print(torch.__version__, torch.cuda.is_available())

2.0.1 False


In [6]:
import argparse
import json

import numpy as np

import util.misc as utils
from models import build_model
import torch
from torch import nn


# Modified DETR Architecture 


In [2]:
from models import build_model


def get_args_parser():
    parser = argparse.ArgumentParser()
    parser.add_argument('--lr', default=1e-4, type=float)
    parser.add_argument('--lr_backbone', default=1e-5, type=float)
    parser.add_argument('--batch_size', default=2, type=int)
    parser.add_argument('--weight_decay', default=1e-4, type=float)
    parser.add_argument('--epochs', default=300, type=int)
    parser.add_argument('--lr_drop', default=200, type=int)
    parser.add_argument('--clip_max_norm', default=0.1, type=float,
                        help='gradient clipping max norm')

    # Model parameters
    parser.add_argument('--num_classes', type=int, default=None,
                        help="Number of classes in dataset+1")
    parser.add_argument('--frozen_weights', type=str, default=None,
                        help="Path to the pretrained model. If set, only the mask head will be trained")
    # * Backbone
    parser.add_argument('--backbone', default='resnet50', type=str,
                        help="Name of the convolutional backbone to use")
    parser.add_argument('--dilation', action='store_true',
                        help="If true, we replace stride with dilation in the last convolutional block (DC5)")
    parser.add_argument('--position_embedding', default='sine', type=str, choices=('sine', 'learned'),
                        help="Type of positional embedding to use on top of the image features")

    # * Transformer
    parser.add_argument('--enc_layers', default=6, type=int,
                        help="Number of encoding layers in the transformer")
    parser.add_argument('--dec_layers', default=6, type=int,
                        help="Number of decoding layers in the transformer")
    parser.add_argument('--dim_feedforward', default=2048, type=int,
                        help="Intermediate size of the feedforward layers in the transformer blocks")
    parser.add_argument('--hidden_dim', default=256, type=int,
                        help="Size of the embeddings (dimension of the transformer)")
    parser.add_argument('--dropout', default=0.1, type=float,
                        help="Dropout applied in the transformer")
    parser.add_argument('--nheads', default=8, type=int,
                        help="Number of attention heads inside the transformer's attentions")
    parser.add_argument('--num_queries', default=100, type=int,
                        help="Number of query slots")
    parser.add_argument('--pre_norm', action='store_true')

    ####################### @amirhnazerii #######################
    ##### start 03/27/2025
    # * Classification head
    parser.add_argument('--new_layer_dim', default=None, type=int,
                        help="classification head added fc-layer dim")
    ##### end 03/27/2025
    
    
    # * Segmentation
    parser.add_argument('--masks', action='store_true',
                        help="Train segmentation head if the flag is provided")

    # Loss
    parser.add_argument('--no_aux_loss', dest='aux_loss', action='store_false',
                        help="Disables auxiliary decoding losses (loss at each layer)")
    # * Matcher
    parser.add_argument('--set_cost_class', default=1, type=float,
                        help="Class coefficient in the matching cost")
    parser.add_argument('--set_cost_bbox', default=5, type=float,
                        help="L1 box coefficient in the matching cost")
    parser.add_argument('--set_cost_giou', default=2, type=float,
                        help="giou box coefficient in the matching cost")
    # * Loss coefficients
    parser.add_argument('--mask_loss_coef', default=1, type=float)
    parser.add_argument('--dice_loss_coef', default=1, type=float)
    parser.add_argument('--bbox_loss_coef', default=5, type=float)
    parser.add_argument('--giou_loss_coef', default=2, type=float)
    parser.add_argument('--eos_coef', default=0.1, type=float,
                        help="Relative classification weight of the no-object class")

    # dataset parameters
    parser.add_argument('--dataset_file', default='coco')
    parser.add_argument('--coco_path', type=str)
    parser.add_argument('--coco_panoptic_path', type=str)
    parser.add_argument('--remove_difficult', action='store_true')

    parser.add_argument('--output_dir', default='',
                        help='path where to save, empty for no saving')
    parser.add_argument('--device', default='cuda',
                        help='device to use for training / testing')
    parser.add_argument('--seed', default=42, type=int)
    parser.add_argument('--resume', default='', help='resume from checkpoint')
    parser.add_argument('--start_epoch', default=0, type=int, metavar='N',
                        help='start epoch')
    parser.add_argument('--eval', action='store_true')
    parser.add_argument('--num_workers', default=2, type=int)

    # distributed training parameters
    parser.add_argument('--world_size', default=1, type=int,
                        help='number of distributed processes')
    return parser

In [7]:
if __name__ == '__main__':
    parser = get_args_parser()
    args = parser.parse_args(['--new_layer_dim', '128'])
    model, _, _ = build_model(args)



In [8]:
args

Namespace(lr=0.0001, lr_backbone=1e-05, batch_size=2, weight_decay=0.0001, epochs=300, lr_drop=200, clip_max_norm=0.1, num_classes=None, frozen_weights=None, backbone='resnet50', dilation=False, position_embedding='sine', enc_layers=6, dec_layers=6, dim_feedforward=2048, hidden_dim=256, dropout=0.1, nheads=8, num_queries=100, pre_norm=False, new_layer_dim=128, masks=False, aux_loss=True, set_cost_class=1, set_cost_bbox=5, set_cost_giou=2, mask_loss_coef=1, dice_loss_coef=1, bbox_loss_coef=5, giou_loss_coef=2, eos_coef=0.1, dataset_file='coco', coco_path=None, coco_panoptic_path=None, remove_difficult=False, output_dir='', device='cuda', seed=42, resume='', start_epoch=0, eval=False, num_workers=2, world_size=1)

In [6]:
model

Modified_DETR(
  (transformer): Transformer(
    (encoder): TransformerEncoder(
      (layers): ModuleList(
        (0-5): 6 x TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
          )
          (linear1): Linear(in_features=256, out_features=2048, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=2048, out_features=256, bias=True)
          (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.1, inplace=False)
          (dropout2): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (decoder): TransformerDecoder(
      (layers): ModuleList(
        (0-5): 6 x TransformerDecoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_feature

```
(class_embed): Sequential(
    (0): Linear(in_features=256, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=92, bias=True)
)
```

In [8]:
print(model.class_embed[0].weight.shape)
print(model.class_embed[0].bias.shape)
print()

torch.Size([128, 256])
torch.Size([128])


# Load original weights



In [47]:
# Get pretrained weights
checkpoint = torch.hub.load_state_dict_from_url(
            url='https://dl.fbaipublicfiles.com/detr/detr-r50-e632da11.pth',        
            # url='https://dl.fbaipublicfiles.com/detr/detr-r50-e632da11.pth',
            # url='https://dl.fbaipublicfiles.com/detr/detr-r101-2c7b67e5.pth',
            # url='https://dl.fbaipublicfiles.com/detr/detr-r101-dc5-a2e86def.pth',
            map_location='cpu',
            check_hash=True)

In [12]:
# checkpoint["model"]
checkpoint["model"].keys()

odict_keys(['transformer.encoder.layers.0.self_attn.in_proj_weight', 'transformer.encoder.layers.0.self_attn.in_proj_bias', 'transformer.encoder.layers.0.self_attn.out_proj.weight', 'transformer.encoder.layers.0.self_attn.out_proj.bias', 'transformer.encoder.layers.0.linear1.weight', 'transformer.encoder.layers.0.linear1.bias', 'transformer.encoder.layers.0.linear2.weight', 'transformer.encoder.layers.0.linear2.bias', 'transformer.encoder.layers.0.norm1.weight', 'transformer.encoder.layers.0.norm1.bias', 'transformer.encoder.layers.0.norm2.weight', 'transformer.encoder.layers.0.norm2.bias', 'transformer.encoder.layers.1.self_attn.in_proj_weight', 'transformer.encoder.layers.1.self_attn.in_proj_bias', 'transformer.encoder.layers.1.self_attn.out_proj.weight', 'transformer.encoder.layers.1.self_attn.out_proj.bias', 'transformer.encoder.layers.1.linear1.weight', 'transformer.encoder.layers.1.linear1.bias', 'transformer.encoder.layers.1.linear2.weight', 'transformer.encoder.layers.1.linear2

In [86]:
checkpoint["model"]['class_embed.weight'][91].size()

torch.Size([256])

# Modify model architecture 

The original classification head (`model.class_embed`) in DETR is `Linear(256, 92)`, meaning:


`model.class_embed.weight.shape = torch.Size([92, 256])` \
`model.class_embed.bias.shape =torch.Size([92])`


Our newly added classification head has two layers:

- `nn.Linear(256, 128)`: This transforms the 256-dimensional decoder output to 128 dimensions.

- `nn.Linear(128, 92)`: This maps the 128-dimensional intermediate features to 92 class logits.



# Modify weights

In [48]:
## Check OLD classification head size:
# checkpoint["model"]["class_embed.weight"].shape  # torch.Size([92, 256])
# checkpoint["model"]["class_embed.bias"].shape  # torch.Size([92])

In [49]:

"""
(class_embed): Sequential(
    (0): Linear(in_features=256, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=92, bias=True)
)
"""

#COPY bias for layer 2 because it mataches
kept_bias_vector = checkpoint["model"]["class_embed.bias"]

In [50]:
hidden_dim = model.class_embed[0].in_features #256
new_layer_dim =  model.class_embed[0].out_features #128
output_dim = model.class_embed[2].out_features #92

In [51]:
## Remove OLD classification head architecture parameters:
del checkpoint["model"]["class_embed.weight"]
del checkpoint["model"]["class_embed.bias"]   

In [52]:

checkpoint["model"]["class_embed.0.weight"] = nn.init.xavier_uniform_(
    torch.empty(new_layer_dim, hidden_dim)
)
checkpoint["model"]["class_embed.0.bias"] = torch.zeros(new_layer_dim)

checkpoint["model"]["class_embed.2.weight"] = nn.init.xavier_uniform_(
    torch.empty(output_dim, new_layer_dim)
)
checkpoint["model"]["class_embed.2.bias"] = kept_bias_vector

In [61]:
# checkpoint['model']["class_embed.2.weight"]

tensor([[ 0.0984, -0.0830,  0.1219,  ..., -0.0645, -0.1471, -0.0400],
        [-0.0561,  0.1276,  0.0486,  ...,  0.1135, -0.0857,  0.1623],
        [ 0.1393,  0.0384, -0.1296,  ...,  0.1426,  0.0405,  0.0259],
        ...,
        [ 0.0399, -0.1557, -0.0984,  ..., -0.0087, -0.0694,  0.0376],
        [-0.1121, -0.0906,  0.0083,  ..., -0.0690,  0.0472,  0.1281],
        [-0.0671,  0.1335, -0.0384,  ..., -0.1646,  0.0270, -0.1368]])

## Apply changes

In [62]:
# Save the modified checkpoint
torch.save(checkpoint, "detr-r50-modifhead-128fc92fc.pth")

print("Modified checkpoint saved successfully!")

Modified checkpoint saved successfully!


In [71]:
### test:
# checkpoint["model"]
model.load_state_dict(checkpoint['model'], strict=False)  #### Modified by Amir: , strict=False


<All keys matched successfully>

In [1]:
model.class_embed[2].weight

NameError: name 'model' is not defined

In [9]:
### test2:
## load the new checkpoint into the modified detr
checkpoint = torch.load('detr-r50-modifhead-128fc92fc.pth', map_location='cpu')
model.load_state_dict(checkpoint['model'], strict=False)  

<All keys matched successfully>

In [23]:
for param in model.class_embed[2].parameters():
    print(param.requires_grad)

True
True


# KITTI

In [3]:
if __name__ == '__main__':
    parser = get_args_parser()
    args = parser.parse_args(['--new_layer_dim', '128', '--device', 'cpu', '--num_classes', '9'])
    model, _, _ = build_model(args)



In [4]:
model.class_embed

Sequential(
  (0): Linear(in_features=256, out_features=128, bias=True)
  (1): ReLU()
  (2): Linear(in_features=128, out_features=10, bias=True)
)

In [7]:
# Get pretrained weights
checkpoint = torch.hub.load_state_dict_from_url(
            url='https://dl.fbaipublicfiles.com/detr/detr-r50-e632da11.pth',        
            # url='https://dl.fbaipublicfiles.com/detr/detr-r50-e632da11.pth',
            # url='https://dl.fbaipublicfiles.com/detr/detr-r101-2c7b67e5.pth',
            # url='https://dl.fbaipublicfiles.com/detr/detr-r101-dc5-a2e86def.pth',
            map_location='cpu',
            check_hash=True)

In [8]:
## Remove OLD classification head architecture parameters:
del checkpoint["model"]["class_embed.weight"]
del checkpoint["model"]["class_embed.bias"]

In [9]:
hidden_dim = model.class_embed[0].in_features #256
new_layer_dim =  model.class_embed[0].out_features #128
output_dim = model.class_embed[2].out_features #10


checkpoint["model"]["class_embed.0.weight"] = nn.init.xavier_uniform_(
    torch.empty(new_layer_dim, hidden_dim)
)
checkpoint["model"]["class_embed.0.bias"] = torch.zeros(new_layer_dim)

checkpoint["model"]["class_embed.2.weight"] = nn.init.xavier_uniform_(
    torch.empty(output_dim, new_layer_dim)
)
checkpoint["model"]["class_embed.2.bias"] = torch.zeros(output_dim)

In [10]:
# Save the modified checkpoint
torch.save(checkpoint, "detr-r50-KITTI-modifhead-128fc92fc.pth")
print("Modified checkpoint saved successfully!")

Modified checkpoint saved successfully!


In [11]:
model.load_state_dict(checkpoint['model'], strict=False)  #### Modified by Amir: , strict=False

<All keys matched successfully>

# Dataset

Our dataset should be loadable as a COCO format

This allows us to use the pycocotools to load the data dict for the main python script

In [5]:
dataset_file = "coco" # alternatively, implement your own coco-type dataset loader in datasets and add this "key" to datasets/__init__.py

dataDir='/home/anazeri/fiftyone/kitti_coco/' # should lead to a directory with a train2017 and val2017 folder as well as an annotations folder
num_classes = 9+1 # this int should be the actual number of classes + 1 (for no class)

# 

outDir = 'outputs'
resume = "detr-r50_no-class-head.pth" if pretrained else ""

In [6]:
python main.py \
  --dataset_file $"coco" \
  --coco_path $'/home/anazeri/fiftyone/kitti_coco/' \
  --output_dir $'outputs' \
  --resume $"detr-r50_no-class-head.pth" \
  --num_classes $num_classes \
  --lr 1e-5 \
  --lr_backbone 1e-6 \
  --epochs 1

# Training

We use the main.py script to run our training

In [7]:
!python main.py \
  --dataset_file $dataset_file \
  --coco_path $dataDir \
  --output_dir $outDir \
  --resume $resume \
  --num_classes $num_classes \
  --lr 1e-5 \
  --lr_backbone 1e-6 \
  --epochs 1

Traceback (most recent call last):
  File "/home/anazeri/detr_finetune/main.py", line 10, in <module>
    import torch
ModuleNotFoundError: No module named 'torch'


# Results

Quick and easy overview of the training results

In [None]:
from util.plot_utils import plot_logs

from pathlib import Path

log_directory = [Path(outDir)]

In [None]:
fields_of_interest = (
    'loss',
    'mAP',
    )

plot_logs(log_directory,
          fields_of_interest)

In [None]:
fields_of_interest = (
    'loss_ce',
    'loss_bbox',
    'loss_giou',
    )

plot_logs(log_directory,
          fields_of_interest)

In [None]:
fields_of_interest = (
    'class_error',
    'cardinality_error_unscaled',
    )

plot_logs(log_directory,
          fields_of_interest)   