In [3]:
!pip install pycocotools


Collecting pycocotools
  Downloading pycocotools-2.0.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.1 kB)
Downloading pycocotools-2.0.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (426 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m426.2/426.2 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: pycocotools
Successfully installed pycocotools-2.0.7


In [4]:
import torchvision 
import os 
from transformers import DetrImageProcessor, DetrForObjectDetection, DetrConfig


class CocoDetection(torchvision.datasets.CocoDetection):
    def __init__(self, img_folder, processor, train=True):
        ann_file = '/kaggle/input/bc-coco/coco_1k/annotations/instances_train2017.json' if train else '/kaggle/input/bc-coco/coco_1k/annotations/instances_val2017.json'
        super(CocoDetection, self).__init__(img_folder, ann_file)
        self.processor = processor

    def __getitem__(self, idx):
        img, target = super(CocoDetection, self).__getitem__(idx)
        image_id = self.ids[idx] 
        target = {'image_id': image_id, 'annotations': target}
        encoding = self.processor(images=img, annotations=target, return_tensors="pt")
        pixel_values = encoding["pixel_values"].squeeze() 
        target = encoding["labels"][0] 

        return pixel_values, target


    
processor = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50")
config = DetrConfig.from_pretrained(
    "facebook/detr-resnet-50-panoptic",
    num_labels=2,
    class_cost=1
)

train_dataset = CocoDetection(img_folder='/kaggle/input/bc-coco/coco_1k/train2017/', processor=processor,train=True)
val_dataset = CocoDetection(img_folder='/kaggle/input/bc-coco/coco_1k/val2017/', processor=processor, train=False)

2024-04-24 17:28:34.347246: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-24 17:28:34.347384: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-24 17:28:34.510957: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


preprocessor_config.json:   0%|          | 0.00/290 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/11.6k [00:00<?, ?B/s]

loading annotations into memory...
Done (t=0.04s)
creating index...
index created!
loading annotations into memory...
Done (t=0.01s)
creating index...
index created!


In [7]:
from torch.utils.data import DataLoader

In [8]:
print("Number of training examples:", len(train_dataset))
print("Number of validation examples:", len(val_dataset))

Number of training examples: 2240
Number of validation examples: 218


In [9]:
def collate_fn(batch):
      pixel_values = [item[0] for item in batch]
      encoding = processor.pad(pixel_values, return_tensors="pt")
      labels = [item[1] for item in batch]
      batch = {}
      batch['pixel_values'] = encoding['pixel_values']
      batch['pixel_mask'] = encoding['pixel_mask']
      batch['labels'] = labels
      print(batch)
      return batch
        
        
train_dataloader = DataLoader(train_dataset, collate_fn=collate_fn, batch_size=2, shuffle=True)
val_dataloader = DataLoader(val_dataset, collate_fn=collate_fn, batch_size=2)
batch = next(iter(train_dataloader))

The `max_size` parameter is deprecated and will be removed in v4.26. Please specify in `size['longest_edge'] instead`.


{'pixel_values': tensor([[[[ 2.2489,  2.2489,  2.2489,  ..., -0.0801, -0.0801, -0.1314],
          [ 2.2489,  2.2489,  2.2489,  ..., -0.1657, -0.1657, -0.1657],
          [ 2.2489,  2.2489,  2.2489,  ..., -0.2684, -0.2684, -0.2513],
          ...,
          [ 2.2489,  2.2489,  2.2489,  ...,  0.6221,  0.6221,  0.6221],
          [ 2.2489,  2.2489,  2.2489,  ...,  0.6563,  0.6563,  0.6392],
          [ 2.2489,  2.2489,  2.2489,  ...,  0.7077,  0.7077,  0.6906]],

         [[ 2.4286,  2.4286,  2.4286,  ...,  0.0476,  0.0476, -0.0049],
          [ 2.4286,  2.4286,  2.4286,  ..., -0.0399, -0.0399, -0.0399],
          [ 2.4286,  2.4286,  2.4286,  ..., -0.1450, -0.1450, -0.1275],
          ...,
          [ 2.4286,  2.4286,  2.4286,  ...,  0.7654,  0.7654,  0.7654],
          [ 2.4286,  2.4286,  2.4286,  ...,  0.8004,  0.8004,  0.7829],
          [ 2.4286,  2.4286,  2.4286,  ...,  0.8529,  0.8529,  0.8354]],

         [[ 2.6400,  2.6400,  2.6400,  ...,  0.2696,  0.2696,  0.2173],
          [ 2

In [10]:
print(batch)
print(batch['pixel_values'][1].shape)

{'pixel_values': tensor([[[[ 2.2489,  2.2489,  2.2489,  ..., -0.0801, -0.0801, -0.1314],
          [ 2.2489,  2.2489,  2.2489,  ..., -0.1657, -0.1657, -0.1657],
          [ 2.2489,  2.2489,  2.2489,  ..., -0.2684, -0.2684, -0.2513],
          ...,
          [ 2.2489,  2.2489,  2.2489,  ...,  0.6221,  0.6221,  0.6221],
          [ 2.2489,  2.2489,  2.2489,  ...,  0.6563,  0.6563,  0.6392],
          [ 2.2489,  2.2489,  2.2489,  ...,  0.7077,  0.7077,  0.6906]],

         [[ 2.4286,  2.4286,  2.4286,  ...,  0.0476,  0.0476, -0.0049],
          [ 2.4286,  2.4286,  2.4286,  ..., -0.0399, -0.0399, -0.0399],
          [ 2.4286,  2.4286,  2.4286,  ..., -0.1450, -0.1450, -0.1275],
          ...,
          [ 2.4286,  2.4286,  2.4286,  ...,  0.7654,  0.7654,  0.7654],
          [ 2.4286,  2.4286,  2.4286,  ...,  0.8004,  0.8004,  0.7829],
          [ 2.4286,  2.4286,  2.4286,  ...,  0.8529,  0.8529,  0.8354]],

         [[ 2.6400,  2.6400,  2.6400,  ...,  0.2696,  0.2696,  0.2173],
          [ 2

In [11]:
categories = train_dataset.coco.cats
print(categories)

id2label = {k: v['name'] for k,v in categories.items()}

{0: {'id': 0, 'name': 'mal', 'supercategory': None}}


In [12]:
from transformers import DetrForObjectDetection



model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50")
model.config=config

model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50",
                                                             revision="no_timm",
                                                             num_labels=len(id2label),
                                                             ignore_mismatched_sizes=True)

config.json:   0%|          | 0.00/4.59k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/167M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/detr-resnet-50 were not used when initializing DetrForObjectDetection: ['model.backbone.conv_encoder.model.layer1.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing DetrForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DetrForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


config.json:   0%|          | 0.00/6.60k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/167M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()
Some weights of DetrForObjectDetection were not initialized from the model checkpoint at facebook/detr-resnet-50 and are newly initialized because the shapes did not match:
- class_labels_classifier.weight: found shape torch.Size([92, 256]) in the checkpoint and torch.Size([2, 256]) in the model instantiated
- class_labels_classifier.bias: found shape torch.Size([92]) in the checkpoint and torch.Size([2]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
out=model(**batch)

In [21]:
print(out.loss)

tensor(4.7264, grad_fn=<AddBackward0>)


In [15]:
import torch
import torch.optim as optim
loss_dict = out.loss_dict
total_loss=sum(loss.item() for loss in loss_dict.values())
total_loss_tensor = torch.tensor(total_loss, requires_grad=True)
total_loss_tensor.backward()

In [18]:
import torch.optim as optim
optimizer = optim.Adam(model.parameters(), lr=0.0001)

In [36]:
def train_detr_model(model, optimizer, train_data, val_data, num_epochs):
    print("training the model")
    best_val_loss = float('inf')
    for epoch in range(num_epochs):
        model.train()
        avg_train_loss = 0.0
        num_batches = 0.0
        for batch in train_data:
            optimizer.zero_grad()
            outputs = model(**batch)
            total_loss = outputs.loss 
            total_loss.backward()
            optimizer.step()
            avg_train_loss += total_loss.item()
            num_batches += 1
        avg_train_loss /= num_batches
        print(f'Epoch {epoch+1}/{num_epochs}, Training Loss: {avg_train_loss:.4f}')

In [None]:
train_detr_model(model, optimizer, train_data=train_dataloader, val_data=val_dataloader, num_epochs=10)

training the model
{'pixel_values': tensor([[[[-1.4672, -1.4672, -1.4672,  ...,  0.0000,  0.0000,  0.0000],
          [-1.5528, -1.5528, -1.5528,  ...,  0.0000,  0.0000,  0.0000],
          [-1.6213, -1.6384, -1.6384,  ...,  0.0000,  0.0000,  0.0000],
          ...,
          [-0.8678, -0.8678, -0.8849,  ...,  0.0000,  0.0000,  0.0000],
          [-0.7993, -0.8164, -0.8335,  ...,  0.0000,  0.0000,  0.0000],
          [-0.7650, -0.7822, -0.7993,  ...,  0.0000,  0.0000,  0.0000]],

         [[-1.3704, -1.3704, -1.3704,  ...,  0.0000,  0.0000,  0.0000],
          [-1.4580, -1.4580, -1.4580,  ...,  0.0000,  0.0000,  0.0000],
          [-1.5280, -1.5455, -1.5455,  ...,  0.0000,  0.0000,  0.0000],
          ...,
          [-0.7577, -0.7577, -0.7752,  ...,  0.0000,  0.0000,  0.0000],
          [-0.6877, -0.7052, -0.7227,  ...,  0.0000,  0.0000,  0.0000],
          [-0.6527, -0.6702, -0.6877,  ...,  0.0000,  0.0000,  0.0000]],

         [[-1.1421, -1.1421, -1.1421,  ...,  0.0000,  0.0000,  0.0