In [15]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.models as models
import torchvision.datasets as datasets
import matplotlib.pyplot as plt
from tqdm import tqdm
from torchvision.datasets import CocoDetection
from torch.utils.data import DataLoader,Dataset
from torch.nn.functional import relu
from collections import defaultdict
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader
from sklearn.metrics import precision_recall_fscore_support

In [27]:
!wget http://images.cocodataset.org/zips/train2017.zip
!wget http://images.cocodataset.org/zips/val2017.zip
!wget http://images.cocodataset.org/annotations/annotations_trainval2017.zip

--2023-12-23 19:37:06--  http://images.cocodataset.org/zips/train2017.zip
Resolving images.cocodataset.org (images.cocodataset.org)... 16.182.106.113, 3.5.28.254, 3.5.29.82, ...
Connecting to images.cocodataset.org (images.cocodataset.org)|16.182.106.113|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 19336861798 (18G) [application/zip]
Saving to: ‘train2017.zip’


2023-12-23 19:43:56 (45.1 MB/s) - ‘train2017.zip’ saved [19336861798/19336861798]

--2023-12-23 19:43:56--  http://images.cocodataset.org/zips/val2017.zip
Resolving images.cocodataset.org (images.cocodataset.org)... 52.217.96.212, 54.231.133.241, 52.216.44.241, ...
Connecting to images.cocodataset.org (images.cocodataset.org)|52.217.96.212|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 815585330 (778M) [application/zip]
Saving to: ‘val2017.zip’


2023-12-23 19:44:14 (44.2 MB/s) - ‘val2017.zip’ saved [815585330/815585330]

--2023-12-23 19:44:14--  http://images.cocodataset.or

Unzip the dataset

In [28]:
!unzip \*.zip
!rm *.zip

Archive:  train2017.zip
replace train2017/000000147328.jpg? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [29]:
# Define transformations
import torchvision.transforms as transforms
transform = transforms.Compose([
  transforms.Resize((224, 224)),
  transforms.ToTensor(),
  transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

# Load training data
train_data = datasets.CocoDetection(
  root="./train2017",
  annFile="./annotations/instances_train2017.json",
  transform=transform
)

# Take subset for debugging
train_data = torch.utils.data.Subset(train_data, list(range(18000)))

# Load validation data
val_data = datasets.CocoDetection(
  root="./val2017",
  annFile="./annotations/instances_val2017.json",
  transform=transform
)

# Take subset for debugging
val_data = torch.utils.data.Subset(val_data, list(range(2000)))

print(f'Number of training samples: {len(train_data)}')
print(f'Number of validation samples: {len(val_data)}')

loading annotations into memory...
Done (t=22.94s)
creating index...
index created!
loading annotations into memory...
Done (t=0.67s)
creating index...
index created!
Number of training samples: 18000
Number of validation samples: 2000


In [30]:
class CocoDataset(Dataset):
    def __init__(self, num_objects, dataset):
        self.dataset = dataset
        self.num_objects = num_objects

    def __getitem__(self, index):
        image, targets = self.dataset[index]

        # Convert target to one-hot encoding
        target_onehot = torch.zeros(self.num_objects)
        for obj in targets:
            target_onehot[obj['category_id']-1] = 1

        return image, target_onehot

    def __len__(self):
        return len(self.dataset)

In [31]:
# define train dataset
train_dataset = CocoDataset(90,train_data)
# define validation dataset
val_dataset = CocoDataset(90,val_data)

batch_size = 32
# Create a data loader
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)

In [42]:
class DeformableConv2d(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, padding=1, dilation=1, deformable_groups=1):
        super().__init__()

        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding
        self.dilation = dilation
        self.deformable_groups = deformable_groups

        self.offset_conv = nn.Conv2d(
            in_channels,
            deformable_groups * 2,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation
        )

        self.modulator = nn.Conv2d(
            in_channels,
            deformable_groups,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation
        )

        self.feature_conv = nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation
        )

    def forward(self, x):
        # Get deformable offsets
        offsets = self.offset_conv(x)

        # Get modulation masks
        masks = torch.sigmoid(self.modulator(x))

        # Deform the input features
        x = self.deform_features(x, offsets, masks)

        # Convolve deformed features
        x = self.feature_conv(x)
        return x

    def deform_features(self, x, offsets, masks):
        """Deform the input features with offsets and modulation."""
        N, C, H, W = x.shape
        tensor_mesh = self.get_mesh_grid(H, W, x.device)
        tensor_mesh = tensor_mesh.reshape(2, H, W)

        grid_y , grid_x = tensor_mesh[0, :,: ], tensor_mesh[1, :, :]

        # Add offsets to sampling grid
        grid_y = grid_y + offsets[:, 0, :, :]
        grid_x = grid_x + offsets[:, 1, :, :]

        sampling_locs = torch.stack((grid_x, grid_y), dim=-1)

        # Interpolate features
        x = nn.functional.grid_sample(
            x, sampling_locs, mode='bilinear', padding_mode='zeros', align_corners=False
        )

        # Modulate features
        x = x * masks
        return x

    def get_mesh_grid(self, H, W, device):
        """Generate mesh grid for sampling."""
        y, x = torch.meshgrid(torch.arange(H), torch.arange(W))
        y = (2*y/(H-1) - 1).to(device)
        x = (2*x/(W-1) - 1).to(device)
        mesh_grid = torch.stack((x, y), dim=-1)

        return mesh_grid

In [43]:
deformable_conv = DeformableConv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1, padding=1, deformable_groups=1)
test = torch.randn(6, 32, 224, 224)
out = deformable_conv(test)
print(out.shape)

torch.Size([6, 64, 224, 224])


In [44]:
import torch.nn as nn
import torch.nn.functional as F

class Block(nn.Module):

    def __init__(self, in_channels, out_channels, deformable=False):
        super().__init__()
        conv = DeformableConv2d if deformable else nn.Conv2d

        self.conv1 = conv(in_channels, out_channels, kernel_size=3, padding=1)
        self.conv2 = conv(out_channels, out_channels, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(2, 2)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = self.pool(x)
        return x


class ObjectDetector(nn.Module):

    def __init__(self, n_class, deformable=False):
        super().__init__()

        self.block1 = Block(3, 64)
        self.block2 = Block(64, 128, deformable=deformable)
        self.block3 = Block(128, 256, deformable=deformable)
        self.block4 = Block(256, 512, deformable=deformable)

        self.out_conv = nn.Conv2d(512, n_class, 1)

        self.classifier = nn.Sequential(
            nn.Linear(17640, 1024),
            nn.ReLU(),
            nn.Linear(1024, 256),
            nn.ReLU(),
            nn.Linear(256, n_class)
        )

        self.bn = nn.BatchNorm1d(n_class)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.block1(x)
        x = self.block2(x)
        x = self.block3(x)
        x = self.block4(x)

        x = self.out_conv(x)
        x = self.classifier(x.view(x.size(0), -1))
        x = self.bn(x)
        x = self.sigmoid(x)
        return x

In [49]:
# number of classes
num_class = 90
# set device to be cuda
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# define model
model1 = ObjectDetector(num_class).to(device)

# define optimizer
optimizer = torch.optim.Adam(model1.parameters(), lr=0.001)

# define learning rate scheduler
scheduler = lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)

# set criterion to be BCEWithLogitsLoss
criterion = nn.BCEWithLogitsLoss()

In [None]:
import copy
best_loss = float('inf')
best_model_wts = copy.deepcopy(model1.state_dict())

num_epochs = 10
for epoch in range(num_epochs):

  print(f'Epoch {epoch+1}/{num_epochs}')

  # Training loop
  model1.train()
  train_loss = 0.0
  for inputs, targets in train_loader:
    optimizer.zero_grad()
    inputs, targets = inputs.to(device), targets.to(device)
    outputs = model1(inputs)
    loss = criterion(outputs, targets)
    loss.backward()
    optimizer.step()
    train_loss += loss.item() * inputs.size(0)

  train_loss /= len(train_loader.dataset)

  # Validation loop
  model1.eval()
  val_loss = 0.0
  for inputs, targets in val_loader:
    inputs, targets = inputs.to(device), targets.to(device)
    outputs = model1(inputs)
    loss = criterion(outputs, targets)
    val_loss += loss.item() * inputs.size(0)

  val_loss /= len(val_loader.dataset)

  # Print progress
  print(f'Train loss: {train_loss:.4f}\tVal loss: {val_loss:.4f}')

  # Save best model
  if val_loss < best_loss:
    best_loss = val_loss
    best_model_wts = copy.deepcopy(model1.state_dict())

# Load best model weights
model1.load_state_dict(best_model_wts)
torch.save(model1.state_dict(), 'best_model1.pt')

Epoch 1/10
Train loss: 0.9114	Val loss: 0.8687
Epoch 2/10
Train loss: 0.8463	Val loss: 0.8233
Epoch 3/10
Train loss: 0.8007	Val loss: 0.7846
Epoch 4/10
Train loss: 0.7673	Val loss: 0.7594
Epoch 5/10
Train loss: 0.7460	Val loss: 0.7390
Epoch 6/10


It took 68 minutes for 20 epochs so i'm gonna set eppochs to 10 :)

In [None]:
!pip install numba

from numba import cuda
device = cuda.get_current_device()
device.reset()
torch.cuda.empty_cache()

In [None]:
model2 = ObjectDetector(num_class,deformable=True).to(device)

# define optimizer
optimizer = torch.optim.Adam(model2.parameters(), lr=0.001)

num_epochs = 10

In [None]:
best_loss = float('inf')
best_model_wts = copy.deepcopy(model2.state_dict())

for epoch in range(num_epochs):

  print(f'Epoch {epoch+1}/{num_epochs}')

  # Training loop
  model2.train()
  train_loss = 0.0
  for inputs, targets in train_loader:
    optimizer.zero_grad()
    inputs, targets = inputs.to(device), targets.to(device)
    outputs = model2(inputs)
    loss = criterion(outputs, targets)
    loss.backward()
    optimizer.step()
    train_loss += loss.item() * inputs.size(0)

  train_loss /= len(train_loader.dataset)

  # Validation loop
  model2.eval()
  val_loss = 0.0
  for inputs, targets in val_loader:
    inputs, targets = inputs.to(device), targets.to(device)
    outputs = model2(inputs)
    loss = criterion(outputs, targets)
    val_loss += loss.item() * inputs.size(0)

  val_loss /= len(val_loader.dataset)

  # Print progress
  print(f'Train loss: {train_loss:.4f}\tVal loss: {val_loss:.4f}')

  # Save best model
  if val_loss < best_loss:
    best_loss = val_loss
    best_model_wts = copy.deepcopy(model2.state_dict())

# Load best model weights
model2.load_state_dict(best_model_wts)
torch.save(model2.state_dict(), 'best_model2.pt')

In [None]:
# Load best model weights
model1.load_state_dict(torch.load('best_model1.pth'))
model2.load_state_dict(torch.load('best_model2.pth'))

# Set model to evaluation mode
model1.eval()
model2.eval()

# Compute precision, recall, F1 score
y_true = []
y_pred1 = []
y_pred2 = []

for inputs, targets in tqdm(test_loader):
    inputs = inputs.to(device)
    targets = targets.to(device)

    with torch.set_grad_enabled(False):
        outputs1 = model1(inputs)
        outputs2 = model2(inputs)

        y_true.extend(targets.cpu().numpy().tolist())
        y_pred1.extend(outputs1.cpu().numpy().tolist())
        y_pred2.extend(outputs2.cpu().numpy().tolist())

y_true = np.array(y_true)
y_pred1 = np.array(y_pred1)
y_pred2 = np.array(y_pred2)

y_pred1[y_pred1 >= 0.5] = 1
y_pred1[y_pred1 < 0.5] = 0

y_pred2[y_pred2 >= 0.5] = 1
y_pred2[y_pred2 < 0.5] = 0

precision1, recall1, f1_score1, _ = precision_recall_fscore_support(y_true, y_pred1, average='micro')
precision2, recall2, f1_score2, _ = precision_recall_fscore_support(y_true, y_pred2, average='micro')

print(f'Precision without deformable convolutions: {precision1:.4f}')
print(f'Recall without deformable convolutions: {recall1:.4f}')
print(f'F1 score without deformable convolutions: {f1_score1:.4f}')
print()
print(f'Precision with deformable convolutions: {precision2:.4f}')
print(f'Recall with deformable convolutions: {recall2:.4f}')
print(f'F1 score with deformable convolutions: {f1_score2:.4f}')



