The main modules are:

Backbone Network (Feature Extraction)

Region Proposal Network (RPN)

Dense Local Regression Module

Discriminative RoI Pooling

Binary Overlap Prediction

Object Classification

Instance Segmentation Extension (optional for segmentation tasks)

Backbone Network (Feature Extraction)
Description: This block extracts feature maps from the input image using a ResNet backbone with Feature Pyramid Network (FPN).

Input:
input_image (torch.Tensor): Shape [B, 3, H, W], where B is batch size, 3 is the RGB channel, H and W are height and width.

Output:
feature_maps (dict of torch.Tensor): A dictionary containing multiple feature maps with different scales, e.g., {P3: [B, 256, H/8, W/8], P4: [B, 256, H/16, W/16], ...}.

In [201]:
# Core Imports
import torch
import torch.nn as nn
import torchvision.models as models
from torchvision.ops import FeaturePyramidNetwork, MultiScaleRoIAlign
from torchvision.transforms import functional as F

# Visualization and Utilities (for testing and debugging)
import matplotlib.pyplot as plt
import numpy as np
import torch.nn.functional as F

# RPN Imports
from torchvision.ops import nms
import torch.nn.functional as F
from torchvision.ops import nms
import torch
from torchvision.ops import roi_align
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision.datasets import CocoDetection
from torchvision import transforms
from torch.utils.data import DataLoader, Subset
import os
import random
from torch.utils.data import random_split
import numpy as np
import torch  # Add import for torch if it's missing
!pip install -q pycocotools
import torch
import torch.optim as optim
from tqdm import tqdm
import os

# Ensure CUDA is available for faster processing
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

!pip freeze > requirements.txt


Using device: cpu


In [2]:


class CustomFeaturePyramidNetwork(nn.Module):
    def __init__(self):
        super(CustomFeaturePyramidNetwork, self).__init__()

        # Lateral connections to project the feature maps without altering channels
        self.lateral_p3 = nn.Conv2d(256, 256, kernel_size=1)
        self.lateral_p4 = nn.Conv2d(512, 512, kernel_size=1)
        self.lateral_p5 = nn.Conv2d(1024, 1024, kernel_size=1)
        self.lateral_p6 = nn.Conv2d(2048, 2048, kernel_size=1)

        # Top-down pathway adjustments to make the channels match across levels
        self.adjust_p5 = nn.Conv2d(2048, 1024, kernel_size=1)  # Adjust p6 channels to 1024 for adding to p5
        self.adjust_p4 = nn.Conv2d(1024, 512, kernel_size=1)   # Adjust p5 channels to 512 for adding to p4
        self.adjust_p3 = nn.Conv2d(512, 256, kernel_size=1)    # Adjust p4 channels to 256 for adding to p3

    def forward(self, features):
        """
        Args:
            features (dict): Dictionary of feature maps from backbone with different channel depths.

        Returns:
            pyramid (dict): Multi-scale feature pyramid with different channel depths.
        """
        # Apply lateral connections to get the same spatial dimensions
        p3 = self.lateral_p3(features['P3'])
        p4 = self.lateral_p4(features['P4'])
        p5 = self.lateral_p5(features['P5'])
        p6 = self.lateral_p6(features['P6'])

        # Top-down pathway with channel adjustment
        p5 = p5 + F.interpolate(self.adjust_p5(p6), size=p5.shape[-2:], mode="nearest")
        p4 = p4 + F.interpolate(self.adjust_p4(p5), size=p4.shape[-2:], mode="nearest")
        p3 = p3 + F.interpolate(self.adjust_p3(p4), size=p3.shape[-2:], mode="nearest")

        # Return the pyramid with different channels maintained
        pyramid = {'P3': p3, 'P4': p4, 'P5': p5, 'P6': p6}
        return pyramid

        return pyramid

def backbone_network(input_image):
    """
    Extracts multi-scale feature maps from the input image using ResNet with a custom FPN that maintains
    different channel sizes at each pyramid level.

    Args:
        input_image (torch.Tensor): Shape [B, 3, H, W]

    Returns:
        feature_maps (dict): Dictionary containing feature maps at multiple scales with different channels.
    """
    # Load ResNet and place it on the correct device
    resnet = models.resnet101(weights="ResNet101_Weights.IMAGENET1K_V1").to(device)

    # Extract specific layers that match the original FPN's expected inputs
    layer1 = nn.Sequential(*list(resnet.children())[:5])   # Outputs 256 channels
    layer2 = nn.Sequential(*list(resnet.children())[5])    # Outputs 512 channels
    layer3 = nn.Sequential(*list(resnet.children())[6])    # Outputs 1024 channels
    layer4 = nn.Sequential(*list(resnet.children())[7])    # Outputs 2048 channels

    # Move layers to the appropriate device
    layer1, layer2, layer3, layer4 = layer1.to(device), layer2.to(device), layer3.to(device), layer4.to(device)

    # Forward pass through each layer to get feature maps
    with torch.no_grad():
        x = input_image
        features = {}
        x = layer1(x)
        features['P3'] = x  # 256 channels
        x = layer2(x)
        features['P4'] = x  # 512 channels
        x = layer3(x)
        features['P5'] = x  # 1024 channels
        x = layer4(x)
        features['P6'] = x  # 2048 channels

    # Use the custom FPN to create a feature pyramid without altering channel depths
    custom_fpn = CustomFeaturePyramidNetwork().to(device)
    feature_maps = custom_fpn(features)

    return feature_maps


In [None]:
# Testing the backbone function with shape output
def test_backbone_network():
    # Create a dummy input image tensor [B, 3, H, W] with batch size 1, 3 color channels, 256x256 resolution
    dummy_image = torch.rand(1, 3, 256, 256).to(device)

    # Pass the dummy image through the backbone network
    feature_maps = backbone_network(dummy_image)

    # Print the shape of each feature map
    for level, fmap in feature_maps.items():
        print(f"{level} feature map shape: {fmap.shape}")

# Run the test
test_backbone_network()


P3 feature map shape: torch.Size([1, 256, 64, 64])
P4 feature map shape: torch.Size([1, 512, 32, 32])
P5 feature map shape: torch.Size([1, 1024, 16, 16])
P6 feature map shape: torch.Size([1, 2048, 8, 8])


Anchor Generator: Generates a set of anchors (bounding boxes) for each spatial location on the feature maps, using various sizes and aspect ratios.

Objectness and Box Regression Heads: Predicts scores indicating if an anchor contains an object and provides bounding box adjustments.

Proposal Filter: Selects a subset of high-quality proposals using post-processing techniques like non-maximum suppression (NMS) and score filtering.

In [4]:
class AnchorGenerator:
    def __init__(self, sizes, aspect_ratios):
        """
        Initializes the Anchor Generator.

        Args:
            sizes (list of tuples): Each tuple contains anchor sizes for one feature map level.
            aspect_ratios (list of tuples): Aspect ratios for anchors at each level.
        """
        self.sizes = sizes
        self.aspect_ratios = aspect_ratios

    def generate_anchors(self, feature_map_shapes):
        """
        Generates anchors for each feature map level.

        Args:
            feature_map_shapes (list of tuples): Shapes of feature maps as (height, width).

        Returns:
            anchors (list of torch.Tensor): A list of tensors containing anchors for each feature map level.
        """
        anchors = []
        for size, aspect_ratio, shape in zip(self.sizes, self.aspect_ratios, feature_map_shapes):
            # Generate anchors for this feature map shape
            # Placeholder: Implement anchor generation logic here
            anchors.append(torch.rand(shape[0], shape[1], len(size) * len(aspect_ratio), 4))  # Shape [H, W, num_anchors, 4]
        return anchors


class MultiLevelRPN(nn.Module):
    def __init__(self, num_anchors):
        """
        Initializes the RPN with separate objectness and regression heads for each feature map level.

        Args:
            num_anchors (int): Number of anchors per spatial location.
        """
        super(MultiLevelRPN, self).__init__()

        # Define separate RPN heads for each FPN level
        self.shared_convs = nn.ModuleDict({
            "P3": nn.Conv2d(256, 512, kernel_size=3, padding=1),
            "P4": nn.Conv2d(512, 512, kernel_size=3, padding=1),
            "P5": nn.Conv2d(1024, 512, kernel_size=3, padding=1),
            "P6": nn.Conv2d(2048, 512, kernel_size=3, padding=1)
        })

        # Define objectness and regression heads
        self.objectness_heads = nn.ModuleDict({
            "P3": nn.Conv2d(512, num_anchors, kernel_size=1),
            "P4": nn.Conv2d(512, num_anchors, kernel_size=1),
            "P5": nn.Conv2d(512, num_anchors, kernel_size=1),
            "P6": nn.Conv2d(512, num_anchors, kernel_size=1)
        })
        self.box_regression_heads = nn.ModuleDict({
            "P3": nn.Conv2d(512, num_anchors * 4, kernel_size=1),
            "P4": nn.Conv2d(512, num_anchors * 4, kernel_size=1),
            "P5": nn.Conv2d(512, num_anchors * 4, kernel_size=1),
            "P6": nn.Conv2d(512, num_anchors * 4, kernel_size=1)
        })

    def forward(self, feature_maps):
        """
        Forward pass through the RPN.

        Args:
            feature_maps (dict): Dictionary of feature maps from FPN levels with varying channel depths.

        Returns:
            objectness (list of torch.Tensor): Objectness scores for each anchor.
            box_regression (list of torch.Tensor): Box adjustments for each anchor.
        """
        objectness, box_regression = [], []

        for level, fmap in feature_maps.items():
            # Shared conv layer specific to each level
            t = F.relu(self.shared_convs[level](fmap))  # Output shape: [B, 512, H, W]

            # Objectness and regression heads for each level
            obj = self.objectness_heads[level](t)       # Shape: [B, num_anchors, H, W]
            reg = self.box_regression_heads[level](t)   # Shape: [B, num_anchors * 4, H, W]

            objectness.append(obj)
            box_regression.append(reg)

        return objectness, box_regression


In [None]:
def test_rpn():
    # Dummy feature maps from 4 FPN levels
    feature_maps = {
        "P3": torch.rand(1, 256, 64, 64).to(device),
        "P4": torch.rand(1, 512, 32, 32).to(device),
        "P5": torch.rand(1, 1024, 16, 16).to(device),
        "P6": torch.rand(1, 2048, 8, 8).to(device)
    }

    # Instantiate the RPN
    num_anchors = 9  # Assume 3 scales and 3 aspect ratios per level
    rpn = MultiLevelRPN(num_anchors=num_anchors).to(device)

    # Run the RPN and get objectness and box regression outputs
    objectness, box_regression = rpn(feature_maps)

    # Print the shape of each output for verification
    for i, level in enumerate(feature_maps.keys()):
        print(f"{level} objectness shape: {objectness[i].shape}")
        print(f"{level} box regression shape: {box_regression[i].shape}")

# Run the test
test_rpn()


P3 objectness shape: torch.Size([1, 9, 64, 64])
P3 box regression shape: torch.Size([1, 36, 64, 64])
P4 objectness shape: torch.Size([1, 9, 32, 32])
P4 box regression shape: torch.Size([1, 36, 32, 32])
P5 objectness shape: torch.Size([1, 9, 16, 16])
P5 box regression shape: torch.Size([1, 36, 16, 16])
P6 objectness shape: torch.Size([1, 9, 8, 8])
P6 box regression shape: torch.Size([1, 36, 8, 8])


Filter Proposals

In [121]:



def filter_proposals(objectness, box_regression, anchors, image_size, pre_nms_top_n=1000, post_nms_top_n=300, nms_thresh=0.7):
    """
    Filters proposals using objectness scores, box regression, and NMS.

    Args:
        objectness (list of torch.Tensor): List of objectness scores for each anchor, one per FPN level.
        box_regression (list of torch.Tensor): List of box adjustments for each anchor, one per FPN level.
        anchors (list of torch.Tensor): List of anchors for each FPN level.
        image_size (tuple): Tuple of (height, width) of the input image.
        pre_nms_top_n (int): Number of top proposals to keep before NMS.
        post_nms_top_n (int): Number of top proposals to keep after NMS.
        nms_thresh (float): Threshold for non-maximum suppression.

    Returns:
        proposals (torch.Tensor): Filtered proposals after NMS, shape [post_nms_top_n, 4].
    """
    proposals = []
    scores = []

    for level, (obj, box_reg, anchor) in enumerate(zip(objectness, box_regression, anchors)):
        # Flatten objectness and box_regression predictions
        obj = obj.view(-1)  # Shape: [num_anchors * H * W]
        box_reg = box_reg.view(-1, 4)  # Shape: [num_anchors * H * W, 4]
        anchor = anchor.view(-1, 4)  # Shape: [num_anchors * H * W, 4]

        # Debug prints
        #print(f"Level {level} - Objectness shape: {obj.shape}, Box regression shape: {box_reg.shape}, Anchor shape: {anchor.shape}")

        # Ensure shapes match before decoding
        if obj.shape[0] != box_reg.shape[0] or box_reg.shape[0] != anchor.shape[0]:
            print(f"Shape mismatch at level {level}: obj {obj.shape[0]}, box_reg {box_reg.shape[0]}, anchor {anchor.shape[0]}")
            continue  # Skip this level if there's a mismatch

        # Decode the box regression outputs to generate proposals
        proposals_level = decode_boxes(anchor, box_reg)

        # Clip proposals to image bounds
        proposals_level[:, [0, 2]] = proposals_level[:, [0, 2]].clamp(0, image_size[1])
        proposals_level[:, [1, 3]] = proposals_level[:, [1, 3]].clamp(0, image_size[0])

        # Adjust topk selection to the actual number of available scores
        num_scores = obj.size(0)
        topk = min(pre_nms_top_n, num_scores)  # Use the minimum of available scores and pre_nms_top_n
        topk_indices = obj.topk(topk).indices

        proposals.append(proposals_level[topk_indices])
        scores.append(obj[topk_indices])

    # Concatenate all proposals and scores across FPN levels
    proposals = torch.cat(proposals, dim=0) if proposals else torch.empty(0, 4).to(anchors[0].device)  # Shape: [total_proposals, 4]
    scores = torch.cat(scores, dim=0) if scores else torch.empty(0).to(anchors[0].device)              # Shape: [total_proposals]

    # Debug: check concatenated shapes
    #print("Concatenated proposals shape:", proposals.shape)
    #print("Concatenated scores shape:", scores.shape)

    # Apply NMS to the proposals based on the scores
    if proposals.shape[0] > 0:
        keep_indices = nms(proposals, scores, nms_thresh)
        keep_indices = keep_indices[:post_nms_top_n]
        proposals = proposals[keep_indices]
    else:
        proposals = torch.empty(post_nms_top_n, 4).to(anchors[0].device)

    return proposals






def decode_boxes(anchors, box_deltas):
    """
    Decodes box regression deltas to predicted box coordinates.

    Args:
        anchors (torch.Tensor): Anchor boxes, shape [N, 4].
        box_deltas (torch.Tensor): Box regression deltas, shape [N, 4].

    Returns:
        proposals (torch.Tensor): Decoded proposals, shape [N, 4].
    """
    # Check and reshape box_deltas if the shapes don't align
    # if anchors.shape[0] != box_deltas.shape[0]:
    #     # Reshape box_deltas to match anchors in the first dimension
    #     box_deltas = box_deltas.view(-1, 4)  # Ensures it has [N, 4] shape

    #print("Anchors shape:", anchors.shape)
    #print("Box deltas shape:", box_deltas.shape)

    # Calculate widths, heights, and centers of the anchors
    widths = anchors[:, 2] - anchors[:, 0]
    heights = anchors[:, 3] - anchors[:, 1]
    ctr_x = anchors[:, 0] + 0.5 * widths
    ctr_y = anchors[:, 1] + 0.5 * heights

    # Apply box deltas to anchors
    dx = box_deltas[:, 0]
    dy = box_deltas[:, 1]
    dw = box_deltas[:, 2]
    dh = box_deltas[:, 3]

    pred_ctr_x = ctr_x + dx * widths
    pred_ctr_y = ctr_y + dy * heights
    dw = torch.clamp(dw, min=-10, max=10)  # Adjust the range as necessary
    dh = torch.clamp(dh, min=-10, max=10)

    pred_w = widths * torch.exp(dw)
    pred_h = heights * torch.exp(dh)

    # Calculate the coordinates of the proposed boxes
    proposals = torch.zeros_like(box_deltas)
    proposals[:, 0] = pred_ctr_x - 0.5 * pred_w
    proposals[:, 1] = pred_ctr_y - 0.5 * pred_h
    proposals[:, 2] = pred_ctr_x + 0.5 * pred_w
    proposals[:, 3] = pred_ctr_y + 0.5 * pred_h



    return proposals



In [None]:
def test_filter_proposals():
    # Dummy inputs based on previous RPN output and anchor generation
    # Assume 9 anchors per spatial location, so 9 * H * W anchors per level
    objectness = [torch.rand(1, 9, 64, 64).view(-1) for _ in range(4)]  # Flattened scores per level
    box_regression = [torch.rand(1, 36, 64, 64).view(-1, 4) for _ in range(4)]  # Shape [num_anchors * H * W, 4]
    anchors = [torch.rand(64 * 64 * 9, 4) for _ in range(4)]  # Shape [num_anchors * H * W, 4] per level

    # Example image size
    image_size = (256, 256)

    # Run filter_proposals
    proposals = filter_proposals(objectness, box_regression, anchors, image_size)

    print("Filtered proposals shape:", proposals.shape)

# Run the test
test_filter_proposals()


Filtered proposals shape: torch.Size([300, 4])


Combined testing of all code so far

In [6]:
def combined_test():
    # Step 1: Set the device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Step 2: Create a dummy input image tensor and move it to the correct device
    dummy_image = torch.rand(1, 3, 256, 256).to(device)

    # Step 3: Pass the image through the backbone to obtain feature maps
    feature_maps = backbone_network(dummy_image)

    # Step 4: Initialize the RPN and generate objectness scores and box regressions
    num_anchors = 9  # Assuming 3 scales and 3 aspect ratios per level
    rpn = MultiLevelRPN(num_anchors=num_anchors).to(device)
    objectness, box_regression = rpn(feature_maps)

    # Step 5: Generate anchors for each feature map level and move anchors to device
    sizes = [(32, 64, 128), (64, 128, 256), (128, 256, 512), (256, 512, 1024)]
    aspect_ratios = [(0.5, 1.0, 2.0)] * 4  # Same aspect ratios for each FPN level
    anchor_gen = AnchorGenerator(sizes, aspect_ratios)
    feature_map_shapes = [(fmap.shape[2], fmap.shape[3]) for fmap in feature_maps.values()]

    # Generate anchors and move them to the correct device
    anchors = [anchor.to(device) for anchor in anchor_gen.generate_anchors(feature_map_shapes)]

    # Step 6: Use the filter_proposals function to generate final proposals
    image_size = (256, 256)
    proposals = filter_proposals(objectness, box_regression, anchors, image_size)

    # Print out the result to verify everything is working together
    print("Combined test - final filtered proposals shape:", proposals.shape)

# Run the combined test
combined_test()


Combined test - final filtered proposals shape: torch.Size([300, 4])


Discriminative RoI Pooling
Function: Extracts discriminative features for classification.

Input: Candidate proposal features from RPN

Output: Pooled RoI features

Description: Uses adaptive weighted pooling to enhance features for classification.

In [7]:


class CustomRoIPooling:
    def __init__(self, output_size=(7, 7), sampling_ratio=2):
        """
        Custom RoI Pooling to handle feature maps with different channel depths.

        Args:
            output_size (tuple): The size (height, width) of the output features after pooling.
            sampling_ratio (int): Sampling ratio for RoIAlign (controls granularity of interpolation).
        """
        self.output_size = output_size
        self.sampling_ratio = sampling_ratio

    def forward(self, feature_maps, proposals, image_size):
        """
        Applies RoI pooling to each feature map level independently.

        Args:
            feature_maps (dict): Dictionary of feature maps from backbone/FPN.
            proposals (torch.Tensor): Proposed regions of interest, shape [num_proposals, 4].
            image_size (tuple): Original image size (height, width).

        Returns:
            pooled_features (torch.Tensor): Combined pooled features.
        """
        pooled_features_list = []

        for level, fmap in feature_maps.items():
            # Perform RoI Align on the current feature map
            pooled = roi_align(
                fmap, [proposals],
                output_size=self.output_size,
                spatial_scale=fmap.shape[-2] / image_size[0],  # Scale based on feature map level
                sampling_ratio=self.sampling_ratio
            )
            pooled_features_list.append(pooled)

        # Concatenate along the channel dimension
        pooled_features = torch.cat(pooled_features_list, dim=1)

        return pooled_features

In [None]:
# Modify the test function to use CustomRoIPooling
def test_custom_roi_pooling():
    # Set the device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Create a dummy input image tensor and move it to the correct device
    dummy_image = torch.rand(1, 3, 256, 256).to(device)
    print("Dummy image shape:", dummy_image.shape)

    # Pass the image through the backbone to obtain feature maps
    feature_maps = backbone_network(dummy_image)
    for level, fmap in feature_maps.items():
        print(f"Feature map {level} shape:", fmap.shape)

    # Initialize the RPN and generate objectness scores and box regressions
    num_anchors = 9  # Assuming 3 scales and 3 aspect ratios per level
    rpn = MultiLevelRPN(num_anchors=num_anchors).to(device)
    objectness, box_regression = rpn(feature_maps)
    print("Objectness shapes:", [obj.shape for obj in objectness])
    print("Box regression shapes:", [box.shape for box in box_regression])

    # Generate anchors and filter proposals
    sizes = [(32, 64, 128), (64, 128, 256), (128, 256, 512), (256, 512, 1024)]
    aspect_ratios = [(0.5, 1.0, 2.0)] * 4
    anchor_gen = AnchorGenerator(sizes, aspect_ratios)
    feature_map_shapes = [(fmap.shape[2], fmap.shape[3]) for fmap in feature_maps.values()]
    anchors = [anchor.to(device) for anchor in anchor_gen.generate_anchors(feature_map_shapes)]

    image_size = (256, 256)
    proposals = filter_proposals(objectness, box_regression, anchors, image_size)
    print("Filtered proposals shape:", proposals.shape)

    # Initialize and apply custom RoI Pooling
    custom_roi_pooling = CustomRoIPooling(output_size=(7, 7), sampling_ratio=2)
    pooled_features = custom_roi_pooling.forward(feature_maps, proposals, image_size)

    # Print out the shape of the pooled features to verify correctness
    print("Pooled features shape:", pooled_features.shape)

# Run the test
test_custom_roi_pooling()


Dummy image shape: torch.Size([1, 3, 256, 256])
Feature map P3 shape: torch.Size([1, 256, 64, 64])
Feature map P4 shape: torch.Size([1, 512, 32, 32])
Feature map P5 shape: torch.Size([1, 1024, 16, 16])
Feature map P6 shape: torch.Size([1, 2048, 8, 8])
Objectness shapes: [torch.Size([1, 9, 64, 64]), torch.Size([1, 9, 32, 32]), torch.Size([1, 9, 16, 16]), torch.Size([1, 9, 8, 8])]
Box regression shapes: [torch.Size([1, 36, 64, 64]), torch.Size([1, 36, 32, 32]), torch.Size([1, 36, 16, 16]), torch.Size([1, 36, 8, 8])]
Filtered proposals shape: torch.Size([300, 4])
Pooled features shape: torch.Size([300, 3840, 7, 7])


Dense Local Regeression

In [8]:


class DenseLocalRegression(nn.Module):
    def __init__(self, input_channels, output_size=4, intermediate_dim=512):
        """
        Enhanced Dense Local Regression with convolutional layers and attention.

        Args:
            input_channels (int): Number of input channels from pooled features.
            output_size (int): Number of outputs per proposal (4 for bbox regression: dx, dy, dw, dh).
            intermediate_dim (int): Dimension for intermediate fully connected layers.
        """
        super(DenseLocalRegression, self).__init__()

        # Convolutional layers to process spatial features
        self.conv1 = nn.Conv2d(input_channels, 256, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm2d(256)
        self.conv2 = nn.Conv2d(256, 256, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm2d(256)

        # Spatial Attention Layer
        self.spatial_attn_conv = nn.Conv2d(256, 1, kernel_size=1)  # 1x1 Conv for spatial attention

        # Fully connected regression head
        self.fc1 = nn.Linear(256 * 7 * 7, intermediate_dim)
        self.fc2 = nn.Linear(intermediate_dim, intermediate_dim)
        self.fc_out = nn.Linear(intermediate_dim, output_size)

        # Dropout for regularization
        self.dropout = nn.Dropout(0.3)

        self.relu = nn.ReLU()

    def forward(self, pooled_features):
        """
        Forward pass through the Enhanced Dense Local Regression module.

        Args:
            pooled_features (torch.Tensor): RoI pooled features, shape [num_proposals, C, H, W].

        Returns:
            torch.Tensor: Adjustments for bounding boxes, shape [num_proposals, 4].
        """
        # Step 1: Apply initial convolutions and batch normalization
        x = self.relu(self.bn1(self.conv1(pooled_features)))
        x = self.relu(self.bn2(self.conv2(x)))

        # Step 2: Apply spatial attention
        spatial_attn = torch.sigmoid(self.spatial_attn_conv(x))  # Shape: [num_proposals, 1, H, W]
        x = x * spatial_attn  # Element-wise multiplication with attention map

        # Step 3: Flatten the features for fully connected layers
        x = x.view(x.size(0), -1)  # Shape: [num_proposals, 256 * 7 * 7]

        # Step 4: Apply fully connected layers with dropout for regression
        x = self.dropout(self.relu(self.fc1(x)))
        x = self.dropout(self.relu(self.fc2(x)))
        bbox_deltas = self.fc_out(x)  # Shape: [num_proposals, 4]

        return bbox_deltas


In [None]:
def test_dense_local_regression():
    # Step 1: Set the device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Step 2: Create a dummy input image tensor and move it to the correct device
    dummy_image = torch.rand(1, 3, 256, 256).to(device)
    print("Dummy image shape:", dummy_image.shape)

    # Step 3: Pass the image through the backbone to obtain feature maps
    feature_maps = backbone_network(dummy_image)
    for level, fmap in feature_maps.items():
        print(f"Feature map {level} shape:", fmap.shape)

    # Step 4: Initialize the RPN and generate objectness scores and box regressions
    num_anchors = 9  # Assuming 3 scales and 3 aspect ratios per level
    rpn = MultiLevelRPN(num_anchors=num_anchors).to(device)
    objectness, box_regression = rpn(feature_maps)
    print("Objectness shapes:", [obj.shape for obj in objectness])
    print("Box regression shapes:", [box.shape for box in box_regression])

    # Step 5: Generate anchors and filter proposals
    sizes = [(32, 64, 128), (64, 128, 256), (128, 256, 512), (256, 512, 1024)]
    aspect_ratios = [(0.5, 1.0, 2.0)] * 4
    anchor_gen = AnchorGenerator(sizes, aspect_ratios)
    feature_map_shapes = [(fmap.shape[2], fmap.shape[3]) for fmap in feature_maps.values()]
    anchors = [anchor.to(device) for anchor in anchor_gen.generate_anchors(feature_map_shapes)]

    image_size = (256, 256)
    proposals = filter_proposals(objectness, box_regression, anchors, image_size)
    print("Filtered proposals shape:", proposals.shape)

    # Step 6: Initialize and apply custom RoI Pooling
    custom_roi_pooling = CustomRoIPooling(output_size=(7, 7), sampling_ratio=2)
    pooled_features = custom_roi_pooling.forward(feature_maps, proposals, image_size)
    print("Pooled features shape:", pooled_features.shape)

    # Step 7: Initialize Dense Local Regression module and apply it to the pooled features
    input_channels = pooled_features.shape[1]  # This should be the combined channels from FPN levels
    dense_local_regression = DenseLocalRegression(input_channels=input_channels).to(device)
    bbox_deltas = dense_local_regression(pooled_features)
    print("Bounding box deltas shape:", bbox_deltas.shape)

    # Check if output has correct shape [num_proposals, 4]
    assert bbox_deltas.shape == (pooled_features.shape[0], 4), \
        f"Expected shape {(pooled_features.shape[0], 4)}, but got {bbox_deltas.shape}"

    print("Test passed: Dense Local Regression module outputs correctly shaped bbox deltas.")

# Run the test
test_dense_local_regression()


Dummy image shape: torch.Size([1, 3, 256, 256])
Feature map P3 shape: torch.Size([1, 256, 64, 64])
Feature map P4 shape: torch.Size([1, 512, 32, 32])
Feature map P5 shape: torch.Size([1, 1024, 16, 16])
Feature map P6 shape: torch.Size([1, 2048, 8, 8])
Objectness shapes: [torch.Size([1, 9, 64, 64]), torch.Size([1, 9, 32, 32]), torch.Size([1, 9, 16, 16]), torch.Size([1, 9, 8, 8])]
Box regression shapes: [torch.Size([1, 36, 64, 64]), torch.Size([1, 36, 32, 32]), torch.Size([1, 36, 16, 16]), torch.Size([1, 36, 8, 8])]
Filtered proposals shape: torch.Size([300, 4])
Pooled features shape: torch.Size([300, 3840, 7, 7])
Bounding box deltas shape: torch.Size([300, 4])
Test passed: Dense Local Regression module outputs correctly shaped bbox deltas.


Bounding Box Refinement

In [38]:
import torch

class BoundingBoxRefinement:
    def __init__(self):
        """
        Initializes the Bounding Box Refinement module.
        """
        pass  # No specific layers required as it's a geometric transformation

    def refine_boxes(self, proposals, bbox_deltas, image_size):
        """
        Refines bounding boxes by applying bbox deltas to proposals.

        Args:
            proposals (torch.Tensor): Tensor of proposals, shape [num_proposals, 4].
            bbox_deltas (torch.Tensor): Tensor of deltas to be applied to proposals, shape [num_proposals, 4].
            image_size (tuple): The size of the image (height, width).

        Returns:
            torch.Tensor: Refined bounding boxes with values clamped to image boundaries.
        """
        refined_boxes = proposals + bbox_deltas

        # Clamping refined boxes to stay within image boundaries
        refined_boxes = torch.stack([
            torch.clamp(refined_boxes[:, 0], 0, image_size[1]),  # x1
            torch.clamp(refined_boxes[:, 1], 0, image_size[0]),  # y1
            torch.clamp(refined_boxes[:, 2], 0, image_size[1]),  # x2
            torch.clamp(refined_boxes[:, 3], 0, image_size[0])   # y2
        ], dim=1)

        return refined_boxes



In [39]:
def test_bounding_box_refinement():
    # Step 1: Create dummy proposals and box deltas
    num_proposals = 300
    image_size = (256, 256)

    proposals = torch.rand(num_proposals, 4) * 256  # Random proposals within image bounds
    proposals[:, 2:] += proposals[:, :2]  # Ensure x2 > x1 and y2 > y1

    box_deltas = torch.randn(num_proposals, 4) * 0.1  # Small random deltas for refinement

    # Step 2: Initialize and apply BoundingBoxRefinement
    bbox_refiner = BoundingBoxRefinement()
    refined_boxes = bbox_refiner.refine_boxes(proposals, box_deltas, image_size)

    # Step 3: Print shapes and check if boxes are within image boundaries
    print("Refined boxes shape:", refined_boxes.shape)  # Expecting [num_proposals, 4]
    assert refined_boxes.shape == (num_proposals, 4), "Shape mismatch in refined boxes."

    # Check that all boxes are within image boundaries
    assert torch.all(refined_boxes[:, 0] >= 0) and torch.all(refined_boxes[:, 1] >= 0), \
        "Refined boxes have coordinates less than 0."
    assert torch.all(refined_boxes[:, 2] <= image_size[1]) and torch.all(refined_boxes[:, 3] <= image_size[0]), \
        "Refined boxes exceed image boundaries."

    print("Test passed: BoundingBoxRefinement outputs correctly shaped and bounded boxes.")

# Run the test
test_bounding_box_refinement()


Refined boxes shape: torch.Size([300, 4])
Test passed: BoundingBoxRefinement outputs correctly shaped and bounded boxes.


Discriminative RoI Pooling

In [10]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class DiscriminativeRoIPooling(nn.Module):
    def __init__(self, input_channels, output_channels=256):
        """
        Enhanced Discriminative RoI Pooling with an attention mechanism and additional layers.

        Args:
            input_channels (int): Number of input channels from pooled features.
            output_channels (int): Number of output channels for enhanced features.
        """
        super(DiscriminativeRoIPooling, self).__init__()

        # Initial Convolutional Layers to refine features
        self.conv1 = nn.Conv2d(input_channels, output_channels, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm2d(output_channels)
        self.conv2 = nn.Conv2d(output_channels, output_channels, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm2d(output_channels)

        # Spatial Attention Layer
        self.spatial_attn_conv = nn.Conv2d(output_channels, 1, kernel_size=1)  # 1x1 Conv for spatial attention

        # Channel Attention Layer (Squeeze-and-Excitation)
        self.channel_attn_fc1 = nn.Linear(output_channels, output_channels // 4)  # Bottleneck layer
        self.channel_attn_fc2 = nn.Linear(output_channels // 4, output_channels)

        self.relu = nn.ReLU()

    def forward(self, pooled_features):
        """
        Forward pass through the enhanced Discriminative RoI Pooling module.

        Args:
            pooled_features (torch.Tensor): Pooled features from RoI pooling, shape [num_proposals, C, H, W].

        Returns:
            torch.Tensor: Enhanced features with attention, shape [num_proposals, output_channels, H, W].
        """
        # Step 1: Initial convolutions and batch normalization
        x = self.relu(self.bn1(self.conv1(pooled_features)))
        x = self.relu(self.bn2(self.conv2(x)))

        # Step 2: Spatial Attention
        spatial_attn = torch.sigmoid(self.spatial_attn_conv(x))  # Shape: [num_proposals, 1, H, W]
        x = x * spatial_attn  # Apply spatial attention

        # Step 3: Channel Attention (Squeeze-and-Excitation)
        b, c, h, w = x.size()
        channel_attn = F.adaptive_avg_pool2d(x, 1).view(b, c)  # Global average pooling to [num_proposals, C]
        channel_attn = self.relu(self.channel_attn_fc1(channel_attn))
        channel_attn = torch.sigmoid(self.channel_attn_fc2(channel_attn))  # Shape: [num_proposals, C]
        channel_attn = channel_attn.view(b, c, 1, 1)  # Reshape to broadcast
        x = x * channel_attn  # Apply channel attention

        return x


In [None]:
def test_discriminative_roi_pooling():
    # Step 1: Set the device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Step 2: Create a dummy input image tensor and move it to the correct device
    dummy_image = torch.rand(1, 3, 256, 256).to(device)
    print("Dummy image shape:", dummy_image.shape)

    # Step 3: Pass the image through the backbone to obtain feature maps
    feature_maps = backbone_network(dummy_image)
    for level, fmap in feature_maps.items():
        print(f"Feature map {level} shape:", fmap.shape)

    # Step 4: Initialize the RPN and generate objectness scores and box regressions
    num_anchors = 9  # Assuming 3 scales and 3 aspect ratios per level
    rpn = MultiLevelRPN(num_anchors=num_anchors).to(device)
    objectness, box_regression = rpn(feature_maps)
    print("Objectness shapes:", [obj.shape for obj in objectness])
    print("Box regression shapes:", [box.shape for box in box_regression])

    # Step 5: Generate anchors and filter proposals
    sizes = [(32, 64, 128), (64, 128, 256), (128, 256, 512), (256, 512, 1024)]
    aspect_ratios = [(0.5, 1.0, 2.0)] * 4
    anchor_gen = AnchorGenerator(sizes, aspect_ratios)
    feature_map_shapes = [(fmap.shape[2], fmap.shape[3]) for fmap in feature_maps.values()]
    anchors = [anchor.to(device) for anchor in anchor_gen.generate_anchors(feature_map_shapes)]

    image_size = (256, 256)
    proposals = filter_proposals(objectness, box_regression, anchors, image_size)
    print("Filtered proposals shape:", proposals.shape)

    # Step 6: Initialize and apply custom RoI Pooling
    custom_roi_pooling = CustomRoIPooling(output_size=(7, 7), sampling_ratio=2)
    pooled_features = custom_roi_pooling.forward(feature_maps, proposals, image_size)
    print("Pooled features shape:", pooled_features.shape)

    # Step 7: Initialize Discriminative RoI Pooling module and apply it
    input_channels = pooled_features.shape[1]
    discriminative_roi_pooling = DiscriminativeRoIPooling(input_channels=input_channels, output_channels=256).to(device)
    enhanced_features = discriminative_roi_pooling(pooled_features)
    print("Enhanced features shape:", enhanced_features.shape)

    # Check if output has correct shape
    assert enhanced_features.shape == (pooled_features.shape[0], 256, 7, 7), \
        f"Expected shape {(pooled_features.shape[0], 256, 7, 7)}, but got {enhanced_features.shape}"

    print("Test passed: Discriminative RoI Pooling module outputs correctly shaped enhanced features.")

# Run the test
test_discriminative_roi_pooling()


Dummy image shape: torch.Size([1, 3, 256, 256])
Feature map P3 shape: torch.Size([1, 256, 64, 64])
Feature map P4 shape: torch.Size([1, 512, 32, 32])
Feature map P5 shape: torch.Size([1, 1024, 16, 16])
Feature map P6 shape: torch.Size([1, 2048, 8, 8])
Objectness shapes: [torch.Size([1, 9, 64, 64]), torch.Size([1, 9, 32, 32]), torch.Size([1, 9, 16, 16]), torch.Size([1, 9, 8, 8])]
Box regression shapes: [torch.Size([1, 36, 64, 64]), torch.Size([1, 36, 32, 32]), torch.Size([1, 36, 16, 16]), torch.Size([1, 36, 8, 8])]
Filtered proposals shape: torch.Size([300, 4])
Pooled features shape: torch.Size([300, 3840, 7, 7])
Enhanced features shape: torch.Size([300, 256, 7, 7])
Test passed: Discriminative RoI Pooling module outputs correctly shaped enhanced features.


Binary Overlap Prediction

In [11]:
import torch
import torch.nn as nn

class BinaryOverlapPrediction(nn.Module):
    def __init__(self, input_channels):
        """
        Initializes the Binary Overlap Prediction module.

        Args:
            input_channels (int): Number of channels in the input features.
        """
        super(BinaryOverlapPrediction, self).__init__()

        # Fully connected layers for overlap prediction
        self.fc1 = nn.Linear(input_channels * 7 * 7, 1024)
        self.fc2 = nn.Linear(1024, 512)
        self.fc_overlap = nn.Linear(512, 1)  # Single output per region proposal

        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()  # Output probability for overlap

    def forward(self, pooled_features):
        """
        Forward pass through the Binary Overlap Prediction module.

        Args:
            pooled_features (torch.Tensor): Pooled RoI features, shape [num_proposals, C, H, W].

        Returns:
            torch.Tensor: Overlap scores per proposal, shape [num_proposals, 1].
        """
        # Flatten features and pass through the fully connected layers
        x = pooled_features.view(pooled_features.size(0), -1)  # Flatten to [num_proposals, C*H*W]
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        overlap_scores = self.sigmoid(self.fc_overlap(x))  # Probability of overlap [num_proposals, 1]

        return overlap_scores


In [None]:
def test_binary_overlap_prediction():
    # Step 1: Set the device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Step 2: Create a dummy input image tensor and move it to the correct device
    dummy_image = torch.rand(1, 3, 256, 256).to(device)
    print("Dummy image shape:", dummy_image.shape)

    # Step 3: Pass the image through the backbone to obtain feature maps
    feature_maps = backbone_network(dummy_image)
    for level, fmap in feature_maps.items():
        print(f"Feature map {level} shape:", fmap.shape)

    # Step 4: Initialize the RPN and generate objectness scores and box regressions
    num_anchors = 9  # Assuming 3 scales and 3 aspect ratios per level
    rpn = MultiLevelRPN(num_anchors=num_anchors).to(device)
    objectness, box_regression = rpn(feature_maps)
    print("Objectness shapes:", [obj.shape for obj in objectness])
    print("Box regression shapes:", [box.shape for box in box_regression])

    # Step 5: Generate anchors and filter proposals
    sizes = [(32, 64, 128), (64, 128, 256), (128, 256, 512), (256, 512, 1024)]
    aspect_ratios = [(0.5, 1.0, 2.0)] * 4
    anchor_gen = AnchorGenerator(sizes, aspect_ratios)
    feature_map_shapes = [(fmap.shape[2], fmap.shape[3]) for fmap in feature_maps.values()]
    anchors = [anchor.to(device) for anchor in anchor_gen.generate_anchors(feature_map_shapes)]

    image_size = (256, 256)
    proposals = filter_proposals(objectness, box_regression, anchors, image_size)
    print("Filtered proposals shape:", proposals.shape)

    # Step 6: Initialize and apply custom RoI Pooling
    custom_roi_pooling = CustomRoIPooling(output_size=(7, 7), sampling_ratio=2)
    pooled_features = custom_roi_pooling.forward(feature_maps, proposals, image_size)
    print("Pooled features shape:", pooled_features.shape)

    # Step 7: Initialize Binary Overlap Prediction module and apply it
    input_channels = pooled_features.shape[1]
    binary_overlap_prediction = BinaryOverlapPrediction(input_channels=input_channels).to(device)
    overlap_scores = binary_overlap_prediction(pooled_features)
    print("Overlap scores shape:", overlap_scores.shape)

    # Check if output has correct shape [num_proposals, 1]
    assert overlap_scores.shape == (pooled_features.shape[0], 1), \
        f"Expected shape {(pooled_features.shape[0], 1)}, but got {overlap_scores.shape}"

    # Check if values are within [0, 1] range
    assert (overlap_scores >= 0).all() and (overlap_scores <= 1).all(), "Overlap scores are out of range [0, 1]"

    print("Test passed: Binary Overlap Prediction module outputs correctly shaped and bounded overlap scores.")

# Run the test
test_binary_overlap_prediction()


Dummy image shape: torch.Size([1, 3, 256, 256])
Feature map P3 shape: torch.Size([1, 256, 64, 64])
Feature map P4 shape: torch.Size([1, 512, 32, 32])
Feature map P5 shape: torch.Size([1, 1024, 16, 16])
Feature map P6 shape: torch.Size([1, 2048, 8, 8])
Objectness shapes: [torch.Size([1, 9, 64, 64]), torch.Size([1, 9, 32, 32]), torch.Size([1, 9, 16, 16]), torch.Size([1, 9, 8, 8])]
Box regression shapes: [torch.Size([1, 36, 64, 64]), torch.Size([1, 36, 32, 32]), torch.Size([1, 36, 16, 16]), torch.Size([1, 36, 8, 8])]
Filtered proposals shape: torch.Size([300, 4])
Pooled features shape: torch.Size([300, 3840, 7, 7])
Overlap scores shape: torch.Size([300, 1])
Test passed: Binary Overlap Prediction module outputs correctly shaped and bounded overlap scores.


Object Classification

In [12]:
import torch.nn as nn

class ObjectClassification(nn.Module):
    def __init__(self, input_channels, num_classes):
        """
        Initializes the Object Classification module with enhanced architecture.

        Args:
            input_channels (int): Number of input channels (should match the pooled feature map's channel dimension).
            num_classes (int): Total number of classes, including one for background.
        """
        super(ObjectClassification, self).__init__()

        # Fully connected layers for classification with BatchNorm and Dropout
        self.fc1 = nn.Linear(input_channels * 7 * 7, 1024)
        self.bn1 = nn.BatchNorm1d(1024)
        self.dropout1 = nn.Dropout(0.5)

        self.fc2 = nn.Linear(1024, 512)
        self.bn2 = nn.BatchNorm1d(512)
        self.dropout2 = nn.Dropout(0.5)

        self.fc3 = nn.Linear(512, 256)
        self.bn3 = nn.BatchNorm1d(256)
        self.dropout3 = nn.Dropout(0.3)

        self.fc_out = nn.Linear(256, num_classes)  # Output layer with num_classes outputs

        # Activation functions
        self.relu = nn.LeakyReLU(0.1)  # Leaky ReLU for better gradient flow
        self.softmax = nn.Softmax(dim=1)  # Apply softmax along the class dimension

    def forward(self, pooled_features):
        """
        Forward pass through the enhanced Object Classification module.

        Args:
            pooled_features (torch.Tensor): RoI pooled features, shape [num_proposals, C, H, W].

        Returns:
            torch.Tensor: Class probabilities, shape [num_proposals, num_classes].
        """
        # Flatten the features for fully connected layers
        x = pooled_features.view(pooled_features.size(0), -1)  # Shape: [num_proposals, C*H*W]

        # Layer 1: Fully connected + BatchNorm + Dropout + Activation
        x = self.relu(self.bn1(self.fc1(x)))
        x = self.dropout1(x)

        # Layer 2: Fully connected + BatchNorm + Dropout + Activation
        x = self.relu(self.bn2(self.fc2(x)))
        x = self.dropout2(x)

        # Layer 3: Additional Fully connected + BatchNorm + Dropout + Activation
        x = self.relu(self.bn3(self.fc3(x)))
        x = self.dropout3(x)

        # Output layer with softmax activation for class probabilities
        class_logits = self.fc_out(x)  # Shape: [num_proposals, num_classes]
        class_probs = self.softmax(class_logits)  # Softmax to get class probabilities

        return class_probs


In [None]:
def test_object_classification_with_binary_overlap():
    # Step 1: Set the device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Step 2: Create a dummy input image tensor and move it to the correct device
    dummy_image = torch.rand(1, 3, 256, 256).to(device)
    print("Dummy image shape:", dummy_image.shape)

    # Step 3: Pass the image through the backbone to obtain feature maps
    feature_maps = backbone_network(dummy_image)
    for level, fmap in feature_maps.items():
        print(f"Feature map {level} shape:", fmap.shape)

    # Step 4: Initialize the RPN and generate objectness scores and box regressions
    num_anchors = 9  # Assuming 3 scales and 3 aspect ratios per level
    rpn = MultiLevelRPN(num_anchors=num_anchors).to(device)
    objectness, box_regression = rpn(feature_maps)
    print("Objectness shapes:", [obj.shape for obj in objectness])
    print("Box regression shapes:", [box.shape for box in box_regression])

    # Step 5: Generate anchors and filter proposals
    sizes = [(32, 64, 128), (64, 128, 256), (128, 256, 512), (256, 512, 1024)]
    aspect_ratios = [(0.5, 1.0, 2.0)] * 4
    anchor_gen = AnchorGenerator(sizes, aspect_ratios)
    feature_map_shapes = [(fmap.shape[2], fmap.shape[3]) for fmap in feature_maps.values()]
    anchors = [anchor.to(device) for anchor in anchor_gen.generate_anchors(feature_map_shapes)]

    image_size = (256, 256)
    proposals = filter_proposals(objectness, box_regression, anchors, image_size)
    print("Filtered proposals shape:", proposals.shape)

    # Step 6: Initialize and apply custom RoI Pooling
    custom_roi_pooling = CustomRoIPooling(output_size=(7, 7), sampling_ratio=2)
    pooled_features = custom_roi_pooling.forward(feature_maps, proposals, image_size)
    print("Pooled features shape:", pooled_features.shape)

    # Step 7: Initialize Discriminative RoI Pooling module and apply it
    input_channels = pooled_features.shape[1]
    discriminative_roi_pooling = DiscriminativeRoIPooling(input_channels=input_channels, output_channels=256).to(device)
    enhanced_features = discriminative_roi_pooling(pooled_features)
    print("Enhanced features shape:", enhanced_features.shape)

    # Step 8: Initialize and apply Binary Overlap Prediction
    binary_overlap = BinaryOverlapPrediction(input_channels=256).to(device)
    overlap_scores = binary_overlap(enhanced_features)
    print("Overlap scores shape:", overlap_scores.shape)  # Expecting [num_proposals, 1]

    # Check if overlap scores have correct shape
    assert overlap_scores.shape == (enhanced_features.shape[0], 1), \
        f"Expected shape {(enhanced_features.shape[0], 1)}, but got {overlap_scores.shape}"

    # Step 9: Initialize and test the Object Classification module
    num_classes = 21  # Assuming 20 object classes + 1 background class
    object_classifier = ObjectClassification(input_channels=256, num_classes=num_classes).to(device)
    class_probs = object_classifier(enhanced_features)

    # Print shapes to verify correctness
    print("Class probabilities shape:", class_probs.shape)  # Expecting [num_proposals, num_classes]

    # Check if output has correct shape
    assert class_probs.shape == (enhanced_features.shape[0], num_classes), \
        f"Test failed: Expected shape {(enhanced_features.shape[0], num_classes)}, but got {class_probs.shape}"

    print("Test passed: Object Classification module outputs correctly shaped class probabilities.")
    print("Test passed: Binary Overlap Prediction module outputs correctly shaped overlap scores.")

# Run the test
test_object_classification_with_binary_overlap()


Dummy image shape: torch.Size([1, 3, 256, 256])
Feature map P3 shape: torch.Size([1, 256, 64, 64])
Feature map P4 shape: torch.Size([1, 512, 32, 32])
Feature map P5 shape: torch.Size([1, 1024, 16, 16])
Feature map P6 shape: torch.Size([1, 2048, 8, 8])
Objectness shapes: [torch.Size([1, 9, 64, 64]), torch.Size([1, 9, 32, 32]), torch.Size([1, 9, 16, 16]), torch.Size([1, 9, 8, 8])]
Box regression shapes: [torch.Size([1, 36, 64, 64]), torch.Size([1, 36, 32, 32]), torch.Size([1, 36, 16, 16]), torch.Size([1, 36, 8, 8])]
Filtered proposals shape: torch.Size([300, 4])
Pooled features shape: torch.Size([300, 3840, 7, 7])
Enhanced features shape: torch.Size([300, 256, 7, 7])
Overlap scores shape: torch.Size([300, 1])
Class probabilities shape: torch.Size([300, 21])
Test passed: Object Classification module outputs correctly shaped class probabilities.
Test passed: Binary Overlap Prediction module outputs correctly shaped overlap scores.


Instance Segmentation - Mask Prediction

In [101]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class MaskPrediction(nn.Module):
    def __init__(self, input_channels, mask_size=28, num_classes=1):
        """
        Initializes the Mask Prediction module for instance segmentation.

        Args:
            input_channels (int): Number of input channels (should match the pooled feature map's channel dimension).
            mask_size (int): The height and width of the output mask (e.g., 28x28).
            num_classes (int): Number of classes. Set to 1 for binary masks or higher for class-specific masks.
        """
        super(MaskPrediction, self).__init__()

        # Convolutional layers to process the pooled RoI features
        self.conv1 = nn.Conv2d(input_channels, 256, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(256, 256, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(256, 256, kernel_size=3, padding=1)
        self.conv4 = nn.Conv2d(256, 256, kernel_size=3, padding=1)

        # First upsample layer to increase spatial dimensions
        self.deconv = nn.ConvTranspose2d(256, 256, kernel_size=2, stride=2)

        # Final layer to output the mask with the desired size
        self.conv_out = nn.Conv2d(256, num_classes, kernel_size=1)

        # Activation function
        self.relu = nn.ReLU()

        # Target mask size
        self.mask_size = mask_size

    def forward(self, pooled_features):
        """
        Forward pass through the Mask Prediction module.

        Args:
            pooled_features (torch.Tensor): RoI pooled features, shape [num_proposals, C, H, W].

        Returns:
            torch.Tensor: Binary masks for each proposal, shape [num_proposals, num_classes, mask_size, mask_size].
        """
        # Target shape for pooled features (e.g., 14x14)
        target_height, target_width = self.mask_size // 2, self.mask_size // 2

        # Pad or crop pooled_features to the target shape if necessary
        pooled_features = self.pad_or_crop(pooled_features, target_height, target_width)

        # Forward through the convolutional layers
        x = self.relu(self.conv1(pooled_features))
        x = self.relu(self.conv2(x))
        x = self.relu(self.conv3(x))
        x = self.relu(self.conv4(x))

        # First upsample step
        x = self.relu(self.deconv(x))

        # Final conv layer for mask prediction
        mask_logits = self.conv_out(x)

        # Upsample to target size (28x28) if necessary
        if mask_logits.shape[2] != self.mask_size or mask_logits.shape[3] != self.mask_size:
            mask_logits = F.interpolate(mask_logits, size=(self.mask_size, self.mask_size), mode='bilinear', align_corners=False)

        # Apply sigmoid for binary mask or softmax for multi-class mask
        if mask_logits.shape[1] == 1:
            mask_probs = torch.sigmoid(mask_logits)
        else:
            mask_probs = F.softmax(mask_logits, dim=1)

        return mask_probs

    def pad_or_crop(self, features, target_height, target_width):
        """
        Pad or crop features to the target height and width.

        Args:
            features (torch.Tensor): Input tensor of shape [N, C, H, W].
            target_height (int): Target height.
            target_width (int): Target width.

        Returns:
            torch.Tensor: Tensor of shape [N, C, target_height, target_width].
        """
        _, _, h, w = features.shape
        pad_h = max(0, target_height - h)
        pad_w = max(0, target_width - w)

        # If padding is needed, pad the height and width dimensions
        if pad_h > 0 or pad_w > 0:
            features = F.pad(features, (0, pad_w, 0, pad_h), mode='constant', value=0)

        # Crop if dimensions are larger than target
        if h > target_height or w > target_width:
            features = features[:, :, :target_height, :target_width]

        return features


In [102]:
def test_mask_prediction():
    # Step 1: Set the device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Step 2: Create a dummy input image tensor and move it to the correct device
    dummy_image = torch.rand(1, 3, 256, 256).to(device)
    print("Dummy image shape:", dummy_image.shape)

    # Step 3: Pass the image through the backbone to obtain feature maps
    feature_maps = backbone_network(dummy_image)
    for level, fmap in feature_maps.items():
        print(f"Feature map {level} shape:", fmap.shape)

    # Step 4: Initialize the RPN and generate objectness scores and box regressions
    num_anchors = 9  # Assuming 3 scales and 3 aspect ratios per level
    rpn = MultiLevelRPN(num_anchors=num_anchors).to(device)
    objectness, box_regression = rpn(feature_maps)
    print("Objectness shapes:", [obj.shape for obj in objectness])
    print("Box regression shapes:", [box.shape for box in box_regression])

    # Step 5: Generate anchors and filter proposals
    sizes = [(32, 64, 128), (64, 128, 256), (128, 256, 512), (256, 512, 1024)]
    aspect_ratios = [(0.5, 1.0, 2.0)] * 4
    anchor_gen = AnchorGenerator(sizes, aspect_ratios)
    feature_map_shapes = [(fmap.shape[2], fmap.shape[3]) for fmap in feature_maps.values()]
    anchors = [anchor.to(device) for anchor in anchor_gen.generate_anchors(feature_map_shapes)]

    image_size = (256, 256)
    proposals = filter_proposals(objectness, box_regression, anchors, image_size)
    print("Filtered proposals shape:", proposals.shape)

    # Step 6: Initialize and apply custom RoI Pooling
    custom_roi_pooling = CustomRoIPooling(output_size=(7, 7), sampling_ratio=2)
    pooled_features = custom_roi_pooling.forward(feature_maps, proposals, image_size)
    print("Pooled features shape:", pooled_features.shape)

    # Step 7: Initialize Discriminative RoI Pooling module and apply it
    input_channels = pooled_features.shape[1]
    print("Input channels for Discriminative RoI Pooling:", input_channels)
    discriminative_roi_pooling = DiscriminativeRoIPooling(input_channels=input_channels, output_channels=256).to(device)
    enhanced_features = discriminative_roi_pooling(pooled_features)
    print("Enhanced features shape:", enhanced_features.shape)

    # Step 8: Initialize and apply Mask Prediction module
    mask_size = 28
    mask_prediction = MaskPrediction(input_channels=256, mask_size=mask_size, num_classes=1).to(device)
    masks = mask_prediction(enhanced_features)
    print("Masks shape:", masks.shape)

    # Check if output has correct shape
    assert masks.shape == (enhanced_features.shape[0], 1, mask_size, mask_size), \
        f"Expected shape {(enhanced_features.shape[0], 1, mask_size, mask_size)}, but got {masks.shape}"

    print("Test passed: Mask Prediction module outputs correctly shaped masks.")

# Run the test
test_mask_prediction()


Dummy image shape: torch.Size([1, 3, 256, 256])
Feature map P3 shape: torch.Size([1, 256, 64, 64])
Feature map P4 shape: torch.Size([1, 512, 32, 32])
Feature map P5 shape: torch.Size([1, 1024, 16, 16])
Feature map P6 shape: torch.Size([1, 2048, 8, 8])
Objectness shapes: [torch.Size([1, 9, 64, 64]), torch.Size([1, 9, 32, 32]), torch.Size([1, 9, 16, 16]), torch.Size([1, 9, 8, 8])]
Box regression shapes: [torch.Size([1, 36, 64, 64]), torch.Size([1, 36, 32, 32]), torch.Size([1, 36, 16, 16]), torch.Size([1, 36, 8, 8])]
Filtered proposals shape: torch.Size([300, 4])
Pooled features shape: torch.Size([300, 3840, 7, 7])
Input channels for Discriminative RoI Pooling: 3840
Enhanced features shape: torch.Size([300, 256, 7, 7])
Masks shape: torch.Size([300, 1, 28, 28])
Test passed: Mask Prediction module outputs correctly shaped masks.


Loss Functions

In [None]:
# def classification_loss(pred_class_probs, target_category_id):
#     """
#     Calculate the cross-entropy loss for the category classification.

#     Parameters:
#     - pred_class_probs (torch.Tensor): Tensor of shape (num_classes,) representing the predicted
#                                        probability distribution over classes for a specific proposal.
#     - target_category_id (int): The ground-truth category ID as an integer.

#     Returns:
#     - loss (torch.Tensor): The computed cross-entropy loss.
#     """
#     # Ensure pred_class_probs is of shape (1, num_classes) for CrossEntropyLoss
#     pred_class_probs = pred_class_probs.unsqueeze(0)  # Shape becomes (1, num_classes)

#     # Convert target_category_id to a tensor of shape [1]
#     target_class = torch.tensor([target_category_id], dtype=torch.long, device=pred_class_probs.device)

#     # Compute cross-entropy loss
#     loss = F.cross_entropy(pred_class_probs, target_class)

#     return loss

# def false_positive_loss(pred_boxes, pred_objectness, ground_truth_boxes, iou_threshold=0.3):
#     # Compute IoUs between predicted boxes and ground truth boxes
#     ious = compute_iou_matrix(pred_boxes, ground_truth_boxes)  # Custom function
#     max_iou, _ = ious.max(dim=1)

#     # False positives: Proposals with max IoU below the threshold
#     false_positive_mask = (max_iou < iou_threshold).float()
#     fp_loss = false_positive_mask * pred_objectness
#     return fp_loss.mean()


# def iou_loss(pred_boxes, target_boxes):
#     # Calculate intersection
#     inter_x1 = torch.max(pred_boxes[:, 0], target_boxes[:, 0])
#     inter_y1 = torch.max(pred_boxes[:, 1], target_boxes[:, 1])
#     inter_x2 = torch.min(pred_boxes[:, 2], target_boxes[:, 2])
#     inter_y2 = torch.min(pred_boxes[:, 3], target_boxes[:, 3])

#     inter_area = torch.clamp(inter_x2 - inter_x1, min=0) * torch.clamp(inter_y2 - inter_y1, min=0)

#     # Calculate areas of each box and union
#     pred_area = (pred_boxes[:, 2] - pred_boxes[:, 0]) * (pred_boxes[:, 3] - pred_boxes[:, 1])
#     target_area = (target_boxes[:, 2] - target_boxes[:, 0]) * (target_boxes[:, 3] - target_boxes[:, 1])
#     union_area = pred_area + target_area - inter_area

#     iou = inter_area / union_area
#     return 1 - iou.mean()  # IoU loss: 1 - mean IoU


# import torch.nn.functional as F




# def objectness_loss(pred_objectness, target_objectness):
#     # Calculate binary cross-entropy loss between predicted and target objectness
#     return F.binary_cross_entropy(pred_objectness, target_objectness)

# import torch
# import torch.nn.functional as F

# def mask_loss(pred_masks, target_masks):
#     """
#     Calculate binary cross-entropy loss between predicted and target masks for multiple objects in an image.

#     Args:
#         pred_masks (torch.Tensor): Predicted masks, shape [num_proposals, mask_size, mask_size].
#         target_masks (list of torch.Tensor): List of ground truth masks, where each mask tensor has shape [mask_size, mask_size].

#     Returns:
#         torch.Tensor: Average mask loss (binary cross-entropy loss) across all objects.
#     """
#     # Ensure all elements in target_masks are tensors and move to the same device as pred_masks
#     target_masks = [mask.to(pred_masks.device) if isinstance(mask, torch.Tensor) else torch.tensor(mask, device=pred_masks.device)
#                     for mask in target_masks]

#     # Make sure target_masks is a stacked tensor with the same number of masks as predictions if possible
#     target_masks = torch.stack(target_masks)  # Shape: [num_objects, mask_size, mask_size]

#     # If the shapes differ (e.g., more predictions than ground truth objects), adjust to match the smaller set
#     num_objects = min(pred_masks.shape[0], target_masks.shape[0])
#     pred_masks = pred_masks[:num_objects]
#     target_masks = target_masks[:num_objects]

#     # Compute binary cross-entropy loss for each mask
#     individual_losses = []
#     for pred_mask, target_mask in zip(pred_masks, target_masks):
#         individual_loss = F.binary_cross_entropy(pred_mask, target_mask)
#         individual_losses.append(individual_loss)

#     # Average the loss over all masks
#     return torch.mean(torch.stack(individual_losses)) if individual_losses else torch.tensor(0.0, device=pred_masks.device)


Combine Loss Functions

In [114]:
import torch
import torch.nn.functional as F

def convert_to_corners(boxes):
    """
    Convert boxes from (x, y, width, height) to (x1, y1, x2, y2).
    """
    x1 = boxes[:, 0]
    y1 = boxes[:, 1]
    x2 = boxes[:, 0] + boxes[:, 2]  # x + width
    y2 = boxes[:, 1] + boxes[:, 3]  # y + height
    return torch.stack((x1, y1, x2, y2), dim=1)

class D2DetLoss:
    def __init__(self, lambda_cls=1.0, lambda_bbox=1.0, lambda_mask=1.0, lambda_obj=0.5, lambda_fp=0.5):
        self.lambda_cls = lambda_cls
        self.lambda_bbox = lambda_bbox
        self.lambda_mask = lambda_mask
        self.lambda_obj = lambda_obj
        self.lambda_fp = lambda_fp

    def compute_iou(self, boxes1, boxes2):
        """
        Compute the Intersection over Union (IoU) between two sets of boxes.
        boxes1 and boxes2 are expected to be tensors of shape [N, 4] and [M, 4].
        """
        # Convert from (x, y, width, height) to (x1, y1, x2, y2)
        boxes1 = convert_to_corners(boxes1)
        boxes2 = convert_to_corners(boxes2)

        # Calculate area of each box
        area1 = (boxes1[:, 2] - boxes1[:, 0]) * (boxes1[:, 3] - boxes1[:, 1])
        area2 = (boxes2[:, 2] - boxes2[:, 0]) * (boxes2[:, 3] - boxes2[:, 1])

        # Calculate intersection
        x1 = torch.max(boxes1[:, None, 0], boxes2[:, 0])  # Shape: [N, M]
        y1 = torch.max(boxes1[:, None, 1], boxes2[:, 1])
        x2 = torch.min(boxes1[:, None, 2], boxes2[:, 2])
        y2 = torch.min(boxes1[:, None, 3], boxes2[:, 3])

        inter = (x2 - x1).clamp(min=0) * (y2 - y1).clamp(min=0)
        union = area1[:, None] + area2 - inter

        return inter / union  # Shape: [N, M]

    def classification_loss(self, pred_class_probs, pred_boxes, gt_boxes, gt_category_ids):
        """
        Computes classification loss by matching proposals to ground-truth objects based on the highest IoU.
        """
        # Step 1: Ensure `gt_boxes` has the right shape
        if gt_boxes.dim() == 1:
            gt_boxes = gt_boxes.unsqueeze(0)

        # Step 2: Calculate IoU between each proposal and each ground truth box
        gt_boxes = convert_to_corners(gt_boxes)
        ious = self.compute_iou(pred_boxes, gt_boxes)  # Shape: [num_proposals, num_objects]

        # Step 3: For each proposal, find the ground truth box with the highest IoU
        max_ious, matched_gt_indices = ious.max(dim=1)  # max_ious: [num_proposals], matched_gt_indices: [num_proposals]

        # Step 4: Get the ground truth category IDs for each matched ground truth box
        matched_gt_category_ids = gt_category_ids[matched_gt_indices]  # Shape: [num_proposals]

        # Step 5: Calculate classification loss on all proposals
        cls_loss = F.cross_entropy(pred_class_probs, matched_gt_category_ids)

        return cls_loss

    def match_proposals_to_ground_truth(self, pred_boxes, gt_boxes):
        """
        Matches each proposal (predicted box) to the ground truth box with the highest IoU.
        """
        # Ensure gt_boxes are in corner format
        gt_boxes = convert_to_corners(gt_boxes)

        # Compute IoU between each proposal and ground truth box
        ious = self.compute_iou(pred_boxes, gt_boxes)  # Shape: [num_proposals, num_objects]

        # Find the best matching ground truth box for each proposal based on highest IoU
        max_iou, max_iou_indices = ious.max(dim=1)  # max_iou: [num_proposals], max_iou_indices: [num_proposals]

        # Select the predicted boxes and the best matching ground truth boxes
        matched_pred_boxes = pred_boxes
        matched_gt_boxes = gt_boxes[max_iou_indices]  # Match each proposal with its best gt box

        return matched_pred_boxes, matched_gt_boxes

    def bbox_regression_loss(self, pred_boxes, gt_boxes):
        """
        Calculates the smooth L1 loss for bounding box regression.
        """
        # Match proposals to ground truth boxes by selecting the highest IoU match for each
        matched_pred_boxes, matched_gt_boxes = self.match_proposals_to_ground_truth(pred_boxes, gt_boxes)

        # Debugging: Check for NaN or Inf values in matched boxes
        if torch.isnan(matched_pred_boxes).any() or torch.isinf(matched_pred_boxes).any():
            print("Warning: NaN or Inf values found in matched_pred_boxes.")
        if torch.isnan(matched_gt_boxes).any() or torch.isinf(matched_gt_boxes).any():
            print("Warning: NaN or Inf values found in matched_gt_boxes.")

        # Ensure the shapes match for smooth L1 loss
        assert matched_pred_boxes.shape == matched_gt_boxes.shape, \
            f"Shape mismatch for bounding box regression after matching: {matched_pred_boxes.shape} vs {matched_gt_boxes.shape}"

        # Compute the smooth L1 loss for bounding box regression
        return F.smooth_l1_loss(matched_pred_boxes, matched_gt_boxes)


    def mask_loss(self, pred_masks, target_masks, pred_boxes, target_boxes):
        """
        Calculate binary cross-entropy loss between predicted and target masks for multiple objects in an image.

        Args:
            pred_masks (torch.Tensor): Predicted masks, shape [num_proposals, mask_size, mask_size].
            target_masks (list of torch.Tensor): List of ground truth masks, each of varying size.
            pred_boxes (torch.Tensor): Predicted bounding boxes, shape [num_proposals, 4].
            target_boxes (torch.Tensor): Ground truth bounding boxes, shape [num_objects, 4].

        Returns:
            torch.Tensor: Average mask loss (binary cross-entropy loss) across matched objects.
        """
        # Resize all target masks to match the size of `pred_masks`
        target_size = pred_masks.shape[-2:]  # (mask_size, mask_size)
        resized_target_masks = [
            F.interpolate(mask.unsqueeze(0).unsqueeze(0), size=target_size, mode='bilinear', align_corners=False).squeeze(0)
            for mask in target_masks
        ]

        # Stack resized target masks to create a tensor of shape [num_objects, mask_size, mask_size]
        target_masks = torch.stack(resized_target_masks)  # Now all masks have the same shape

        # ** Binarize the target masks **
        target_masks = (target_masks > 0.5).float()  # Set all non-zero pixels to 1, and zero pixels to 0

        # Compute IoU between each proposal and each ground truth box
        ious = self.compute_iou(pred_boxes, target_boxes)  # Shape: [num_proposals, num_objects]

        # Find the best match for each proposal based on highest IoU
        _, matched_gt_indices = ious.max(dim=1)  # matched_gt_indices: [num_proposals]

        # Select matched masks based on highest IoU
        matched_pred_masks = pred_masks  # All predicted masks
        matched_gt_masks = target_masks[matched_gt_indices]  # Ground truth masks with the best match for each proposal

        # Calculate binary cross-entropy loss for matched masks
        mask_loss_val = F.binary_cross_entropy(matched_pred_masks, matched_gt_masks)

        return mask_loss_val


    def objectness_loss(self, overlap_scores, target_bboxes, proposals, iou_threshold=0.5):
        """
        Calculates objectness loss by labeling proposals as foreground or background based on IoU with ground truth boxes.

        Args:
            overlap_scores (torch.Tensor): Predicted overlap scores for each proposal, shape [num_proposals, 1].
            target_bboxes (torch.Tensor): Ground truth bounding boxes, shape [num_objects, 4].
            proposals (torch.Tensor): Proposed regions from the model, shape [num_proposals, 4].
            iou_threshold (float): IoU threshold for determining foreground/background.

        Returns:
            torch.Tensor: Objectness loss.
        """
        # Step 1: Convert proposals and target_bboxes to corner format if they are in (x, y, width, height) format
        #proposals = convert_to_corners(proposals)
        target_bboxes = convert_to_corners(target_bboxes)

        # Step 2: Compute IoU between each proposal and each ground truth box
        ious = self.compute_iou(proposals, target_bboxes)  # Shape: [num_proposals, num_objects]
        # print(ious)

        # Step 3: Determine foreground/background for each proposal based on IoU
        max_ious, _ = ious.max(dim=1)  # Shape: [num_proposals]
        labels = (max_ious > iou_threshold).float()  # Shape: [num_proposals]

        # Step 4: Calculate binary cross-entropy loss between overlap scores and labels
        obj_loss = F.binary_cross_entropy(overlap_scores.squeeze(), labels)

        return obj_loss



    def __call__(self, predictions, targets):
        # Calculate each loss component
        cls_loss = self.classification_loss(
            predictions['class_probs'], predictions['refined_boxes'],
            targets['bbox'], targets['category_id']
        )

        bbox_loss = self.bbox_regression_loss(predictions['refined_boxes'], targets['bbox'])
        #mask_loss_val = self.mask_loss(predictions['masks'], targets['masks'], predictions['refined_boxes'], targets['bbox'])
        obj_loss = self.objectness_loss(predictions['overlap_scores'], targets['bbox'], predictions['refined_boxes'])
        #fp_loss = false_positive_loss(predictions['refined_boxes'], predictions['rpn_objectness'], targets['bbox'])

        # Combine all losses
        total_loss = (self.lambda_cls * cls_loss +
                      self.lambda_bbox * bbox_loss +
                      #self.lambda_mask * mask_loss_val +
                      self.lambda_obj * obj_loss
                      # + self.lambda_fp * fp_loss
                      )

        return total_loss, {
            'classification_loss': cls_loss,
            'bbox_loss': bbox_loss,
            #'mask_loss': mask_loss_val,
            'objectness_loss': obj_loss,
            #'false_positive_loss': fp_loss
        }


In [None]:
from torchvision.datasets import CocoDetection
from torchvision import transforms
from torch.utils.data import DataLoader, Subset
import os
import random
from torch.utils.data import random_split
import numpy as np
import torch  # Add import for torch if it's missing
!pip install -q pycocotools

# Define paths
current_dir = os.path.dirname(os.path.abspath(__file__))  # Current directory where the script is located
coco_mini_dir = os.path.join(current_dir, "coco_mini")  # Path to coco_mini folder

filtered_annotations_path = os.path.join(coco_mini_dir, "annotations", "instances_train2017_filtered.json")
train_images_path = os.path.join(coco_mini_dir, "train2017")

# Define transformations
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Resize((256, 256)),  # Resize for faster training
])

# Load the full dataset
full_train_dataset = CocoDetection(
    root=train_images_path,
    annFile=filtered_annotations_path,
    transform=transform
)

# Define the total dataset
total_dataset = CocoDetection(
    root=train_images_path,
    annFile=filtered_annotations_path,
    transform=transform
)

# Define split proportions
train_ratio = 0.9
train_size = int(train_ratio * len(total_dataset))
test_size = len(total_dataset) - train_size

# Randomly split the dataset
train_dataset, test_dataset = random_split(total_dataset, [train_size, test_size], generator=torch.Generator().manual_seed(42))

# Limit the dataset to a subset of samples for faster training
dataset_size = 100  # Number of samples you want in the subset
indices = random.sample(range(len(train_dataset)), dataset_size)
train_dataset = Subset(train_dataset, indices)

testset_size = 10  # Number of samples you want in the subset
indices1 = random.sample(range(len(test_dataset)), testset_size)
test_dataset = Subset(test_dataset, indices1)

# Create a DataLoader
train_dataloader = DataLoader(train_dataset, batch_size=1, shuffle=True, collate_fn=lambda x: tuple(zip(*x)), num_workers=4)
test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=False, collate_fn=lambda x: tuple(zip(*x)), num_workers=4)


'Enhanced' Main Loss Function

In [115]:
# # Assuming you have a DataLoader or Dataset object `train_loader`
# # and that `train_loader` yields data in the form (image, target)

# # Retrieve one batch of data
# # data_iter = iter(train_dataloader)
# # images, targets = next(data_iter)  # `targets` is usually a list of dictionaries for object detection tasks

# # Print out the first target (assuming batch size > 1)
# # print("Target structure for one image:")
# # print(targets[0])  # Print the first target in the batch


# model = D2DetWrapper(
#         backbone=backbone_network,
#         fpn=CustomFeaturePyramidNetwork(),
#         rpn=MultiLevelRPN(num_anchors=9),
#         roi_pool=CustomRoIPooling(output_size=(7, 7), sampling_ratio=2),
#         dense_local_reg=DenseLocalRegression(input_channels=256),
#         binary_overlap=BinaryOverlapPrediction(input_channels=256),
#         classifier=ObjectClassification(input_channels=256, num_classes=81),
#         mask_predictor=MaskPrediction(input_channels=256, mask_size=28, num_classes=1),
#         anchor_generator=AnchorGenerator(
#             sizes=[(32, 64, 128), (64, 128, 256), (128, 256, 512), (256, 512, 1024)],
#             aspect_ratios=[(0.5, 1.0, 2.0)] * 4
#         )
#     ).to(device)

# def test_classification_loss_function(model, dataset, device="cuda" if torch.cuda.is_available() else "cpu"):
#     # Set the model to evaluation mode
#     model.eval()

#     # Extract one image and its target from the dataset
#     image, target = dataset[0]

#     # Move the image to the correct device and add a batch dimension
#     image = image.to(device).unsqueeze(0)

#     # Extract the category ID from the target
#     target_category_id = target[0]['category_id']

#     print(target)

#     # Run the model on the image to get predictions
#     with torch.no_grad():
#         predictions = model(image)

#     # Select the first proposal's class probability distribution (21 classes)
#     # Assumption: `predictions['class_probs']` is a tensor of shape (300, 21)
#     pred_class_probs = predictions['class_probs'][0]  # Shape (21,)
#     print(len(pred_class_probs))

#     # Calculate the classification loss
#     loss = classification_loss(pred_class_probs, target_category_id)

#     # Print out the loss and additional debug info
#     # print("Predicted class probabilities for selected proposal:", pred_class_probs)
#     # print("Target category ID:", target_category_id)
#     print("Classification Loss:", loss.item())

# # Assuming `model` and `dataset` are defined, we call the test function
# #test_classification_loss_function(model, train_dataset)

# import torch

# def test_d2det_loss_function(model, dataset, loss_fn, device="cuda" if torch.cuda.is_available() else "cpu"):
#     # Set the model to evaluation mode
#     model.eval()
#     model.to(device)

#     # Extract one image and its target from the dataset
#     image, target = dataset[0]
#     image = image.to(device).unsqueeze(0)  # Add batch dimension for a single image

#     # Move the target to the correct device and format it as expected by the loss function
#     # target = [{k: torch.tensor(v).to(device) if isinstance(v, list) else v for k, v in t.items()} for t in target]

#     # Run the model on the image to get predictions
#     with torch.no_grad():
#         predictions = model(image)

#     # Reshape predictions if necessary (ensure the format matches what `D2DetLoss` expects)
#     predictions = {
#         'class_probs': predictions['class_probs'],             # [num_proposals, num_classes]
#         'refined_boxes': predictions['refined_boxes'],         # [num_proposals, 4]
#         'masks': predictions['masks'],                         # [num_proposals, mask_size, mask_size]
#         'overlap_scores': predictions['overlap_scores'],       # [num_proposals, 1]
#         'rpn_objectness': predictions['rpn_objectness'],    # Objectness for each proposal
#         'rpn_box_regression': predictions['rpn_box_regression'][0]  # Box regression for each proposal
#     }



#     # Define targets based on extracted data
#     gt_target = {
#         'category_id': torch.tensor([obj['category_id'] for obj in target], device=device),  # Collect all category IDs
#         'bbox': torch.stack([torch.tensor(obj['bbox'], device=device) for obj in target]),   # Collect all bounding boxes
#         #'masks': [torch.tensor(obj['segmentation'], device=device) for obj in target],       # Collect all segmentation masks
#         # 'objectness': torch.ones(predictions['rpn_objectness'].shape[0], device=device)      # Assume all proposals are valid
#     }

#     # print(gt_target)
#     #print(target)


#     # Calculate the total loss and component losses
#     total_loss, loss_dict = loss_fn(predictions, gt_target)

#     # Print out the losses for debugging
#     print(f"Total Loss: {total_loss.item()}")
#     for key, value in loss_dict.items():
#         print(f"{key.capitalize()} Loss: {value.item()}")

#     return total_loss, loss_dict

# # Assuming `model`, `train_dataset`, and `D2DetLoss` are already defined
# loss_fn = D2DetLoss(lambda_cls=1.0, lambda_bbox=1.0, lambda_mask=1.0, lambda_obj=0.5, lambda_fp=0.5)
# total_loss, loss_dict = test_d2det_loss_function(model, train_dataset, loss_fn)


Total Loss: 201.11473083496094
Classification_loss Loss: 4.394805431365967
Bbox_loss Loss: 196.36976623535156
Objectness_loss Loss: 0.700304388999939


End to End Wrapper - Instance Segmentation

In [16]:
class D2DetWrapper(nn.Module):
    def __init__(self, backbone, fpn, rpn, roi_pool, dense_local_reg, binary_overlap, classifier, mask_predictor, anchor_generator, image_size=(256, 256)):
        super(D2DetWrapper, self).__init__()

        # Initialize all components except discriminative ROI pooling
        self.backbone = backbone
        self.fpn = fpn
        self.rpn = rpn
        self.roi_pool = roi_pool
        self.dense_local_reg = dense_local_reg
        self.binary_overlap = binary_overlap
        self.classifier = classifier
        self.mask_predictor = mask_predictor
        self.anchor_generator = anchor_generator
        self.image_size = image_size
        self.discriminative_roi_pool = None  # To be initialized on the first forward pass

    def forward(self, images):
        """
        Forward pass for D2Det.

        Args:
            images (torch.Tensor): Input batch of images, shape [B, 3, H, W].

        Returns:
            dict: Dictionary containing bounding boxes, class probabilities, and masks.
        """
        # Step 1: Backbone feature extraction
        feature_maps = self.backbone(images)
        # print("Backbone features:", {k: v.shape for k, v in feature_maps.items()})

        # Step 2: Feature Pyramid Network (FPN) to get multi-scale features
        fpn_features = self.fpn(feature_maps)
        # print("FPN features:", {k: v.shape for k, v in fpn_features.items()})

        # Step 3: RPN - Obtain objectness scores and box regression for each FPN level
        objectness, box_regression = self.rpn(fpn_features)
        # print("Objectness shapes:", [obj.shape for obj in objectness])
        # print("Box regression shapes:", [box.shape for box in box_regression])

        # Step 4: Generate anchors for each FPN level
        feature_map_shapes = [(fmap.shape[2], fmap.shape[3]) for fmap in fpn_features.values()]
        anchors = self.anchor_generator.generate_anchors(feature_map_shapes)
        anchors = [anchor.to(images.device) for anchor in anchors]
        # print("Generated anchors per level:", [a.shape for a in anchors])

        # In the forward function or where proposals are processed
        # After obtaining objectness scores and box deltas from RPN
        # print("Objectness shapes:", [o.shape for o in objectness])
        # print("Box regression shapes:", [b.shape for b in box_regression])

        # Concatenate anchors and box deltas if needed, or adjust handling of RPN outputs and anchors
        # For each feature level, ensure anchors and box deltas are aligned
        # for level, (anchors_level, box_deltas_level) in enumerate(zip(anchors, box_regression)):
        #     print(f"Level {level}: Anchors shape {anchors_level.shape}, Box deltas shape {box_deltas_level.shape}")

        # Step 5: Filter proposals based on RPN objectness scores and box regressions
        proposals = filter_proposals(objectness, box_regression, anchors, self.image_size)
        # print("Filtered proposals shape:", proposals.shape)

        # Step 6: Initial RoI Pooling on filtered proposals
        pooled_features = self.roi_pool.forward(fpn_features, proposals, self.image_size)
        # print("Pooled features shape:", pooled_features.shape)

        # Step 7: Apply Discriminative RoI Pooling if not initialized yet
        if self.discriminative_roi_pool is None:
            input_channels = pooled_features.shape[1]
            self.discriminative_roi_pool = DiscriminativeRoIPooling(input_channels=input_channels, output_channels=256).to(images.device)
        # print("Discriminative RoI Pooling initialized with input channels:", pooled_features.shape[1])

        # Apply Discriminative RoI Pooling to get enhanced features
        enhanced_features = self.discriminative_roi_pool(pooled_features)
        # print("Enhanced features shape:", enhanced_features.shape)

        # Step 8: Dense Local Regression for bounding box refinement
        bbox_deltas = self.dense_local_reg(enhanced_features)
        refined_boxes = BoundingBoxRefinement().refine_boxes(proposals, bbox_deltas, self.image_size)
        # print("Refined boxes shape:", refined_boxes.shape)

        # Step 9: Binary Overlap Prediction for background vs. foreground
        overlap_scores = self.binary_overlap(enhanced_features)
        # print("Overlap scores shape:", overlap_scores.shape)

        # Step 10: Object Classification
        class_probs = self.classifier(enhanced_features)
        # print("Class probabilities shape:", class_probs.shape)

        # Step 11: Mask Prediction (Instance Segmentation)
        masks = self.mask_predictor(enhanced_features)
        # print("Masks shape:", masks.shape)

        # Step 12: Compile outputs into a dictionary
        output = {
            'refined_boxes': refined_boxes,       # Shape: [num_proposals, 4]
            'class_probs': class_probs,           # Shape: [num_proposals, num_classes]
            'masks': masks,                       # Shape: [num_proposals, mask_size, mask_size]
            'overlap_scores': overlap_scores,     # Shape: [num_proposals, 1]
            'rpn_objectness': objectness,         # List of objectness tensors from RPN
            'rpn_box_regression': box_regression  # List of box regression tensors from RPN
        }

        # Final output shapes printout (commented)
        # print("Final output shapes:")
        # for key, value in output.items():
        #     print(f"{key}: {value.shape}")

        return output


In [125]:
# Instantiate and Test Wrapper
def test_d2det_wrapper():
    # Instantiate each component with required parameters (assumes already defined classes and modules)
    backbone = backbone_network
    fpn = CustomFeaturePyramidNetwork()
    rpn = MultiLevelRPN(num_anchors=9)
    roi_pool = CustomRoIPooling(output_size=(7, 7), sampling_ratio=2)
    discriminative_roi_pool = DiscriminativeRoIPooling(input_channels=3840, output_channels=256).to(device)  # Added Discriminative RoI Pooling
    dense_local_reg = DenseLocalRegression(input_channels=256)  # Assuming RoI pooled features have 256 channels
    binary_overlap = BinaryOverlapPrediction(input_channels=256)
    classifier = ObjectClassification(input_channels=256, num_classes=81)  # 20 classes + background
    mask_predictor = MaskPrediction(input_channels=256, mask_size=28, num_classes=1)
    anchor_generator = AnchorGenerator(sizes=[(32, 64, 128), (64, 128, 256), (128, 256, 512), (256, 512, 1024)], aspect_ratios=[(0.5, 1.0, 2.0)] * 4)

    # Define wrapper
    d2det_model = D2DetWrapper(
        backbone=backbone,
        fpn=fpn,
        rpn=rpn,
        roi_pool=roi_pool,
        #discriminative_roi_pool=discriminative_roi_pool,  # Pass in discriminative pooling
        dense_local_reg=dense_local_reg,
        binary_overlap=binary_overlap,
        classifier=classifier,
        mask_predictor=mask_predictor,
        anchor_generator=anchor_generator
    ).to(device)

    # Dummy input image tensor
    dummy_images = torch.rand(1, 3, 256, 256).to(device)  # Batch of 1, 256x256 RGB image

    # Run forward pass
    output = d2det_model(dummy_images)

    # Output shapes
    print("Refined boxes shape:", output['refined_boxes'].shape)       # Expecting [num_proposals, 4]
    print("Class probabilities shape:", output['class_probs'].shape)   # Expecting [num_proposals, num_classes]
    print("Masks shape:", output['masks'].shape)                       # Expecting [num_proposals, 28, 28]
    print("Overlap scores shape:", output['overlap_scores'].shape)     # Expecting [num_proposals, 1]
    #print("RPN objectness shape:", output['rpn_objectness'].shape)  # Expecting [num_proposals, 1]
    print("RPN box regression shape:", output['rpn_box_regression'][0].shape)


# Run the test
test_d2det_wrapper()


Refined boxes shape: torch.Size([300, 4])
Class probabilities shape: torch.Size([300, 81])
Masks shape: torch.Size([300, 1, 28, 28])
Overlap scores shape: torch.Size([300, 1])
RPN box regression shape: torch.Size([1, 36, 64, 64])


Training Function for entire system

In [19]:
# from google.colab import drive
# drive.mount('/content/drive')



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [20]:
current_dir = os.path.dirname(os.path.abspath(__file__))
coco_mini_dir = os.path.join(current_dir, "coco_mini")

# Set the path for the annotations directory
annotations_dir = os.path.join(coco_mini_dir, "coco_annotations")

# List the files in the annotations directory (this will work if the directory structure is correct)
!ls {annotations_dir}

# Define the path to the training annotations file
train_ann_path = os.path.join(annotations_dir, "annotations", "instances_train2017.json")


annotations  annotations_trainval2017.zip


In [21]:
# import json
# import os

# # Load the full COCO annotations file
# annotations_dir = "/content/drive/MyDrive/coco_mini/coco_annotations"
# with open(f"{annotations_dir}/annotations/instances_train2017.json") as f:
#     coco_data = json.load(f)

# # Filter images and annotations based on available images
# filtered_images = []
# filtered_annotations = []

# # List available images (filenames without extensions)
# images_dir = "/content/drive/MyDrive/coco_mini"
# available_images = {os.path.splitext(filename)[0] for filename in os.listdir(images_dir)}

# # Map image IDs to filenames for quick lookup
# image_id_to_filename = {img["id"]: img["file_name"].split(".")[0] for img in coco_data["images"]}

# # Filter images and annotations based on available images
# for image_info in coco_data["images"]:
#     if image_info["file_name"].split(".")[0] in available_images:
#         filtered_images.append(image_info)

# # Filter annotations based on filtered image IDs
# filtered_image_ids = {img["id"] for img in filtered_images}
# for annotation in coco_data["annotations"]:
#     if annotation["image_id"] in filtered_image_ids:
#         filtered_annotations.append(annotation)

# # Create a new dictionary for filtered data
# filtered_coco_data = {
#     "info": coco_data["info"],
#     "licenses": coco_data["licenses"],
#     "images": filtered_images,
#     "annotations": filtered_annotations,
#     "categories": coco_data["categories"]
# }

# # Define the path for the filtered annotations file
# filtered_annotations_path = "/content/drive/MyDrive/coco_mini/annotations/instances_train2017_filtered.json"

# # Create the directory if it doesn't exist
# os.makedirs(os.path.dirname(filtered_annotations_path), exist_ok=True)

# # Save the filtered annotations
# with open(filtered_annotations_path, "w") as f:
#     json.dump(filtered_coco_data, f)

# print("Filtered annotations saved at:", filtered_annotations_path)


Filtered annotations saved at: /content/drive/MyDrive/coco_mini/annotations/instances_train2017_filtered.json


In [191]:
import torch
import torch.optim as optim
from tqdm import tqdm
import os

def pad_or_resize_mask(mask, target_size=(200, 200)):
    """
    Pads or resizes a mask to a fixed target size.

    Args:
        mask (torch.Tensor): Mask tensor of shape [H, W].
        target_size (tuple): Target size (H, W), e.g., (200, 200).

    Returns:
        torch.Tensor: Mask of shape [target_size[0], target_size[1]].
    """
    print("Original mask shape:", mask.shape)

    # Ensure the mask is a float tensor and add batch & channel dimensions
    mask = mask.float().unsqueeze(0).unsqueeze(0)  # Shape: [1, 1, H, W]

    # Explicitly resize to target_size
    mask = F.interpolate(mask, size=target_size, mode='bilinear', align_corners=False)

    # Remove batch & channel dimensions
    mask = mask.squeeze(0).squeeze(0)  # Shape: [target_size[0], target_size[1]]

    print("Resized mask shape:", mask.shape)
    return mask

def train_d2det(train_dataloader, val_dataloader, model_save_path, num_epochs=10, lr=0.001, device='cuda'):
    """
    Train the D2Det model.

    Args:
        train_dataloader (DataLoader): DataLoader for training dataset.
        val_dataloader (DataLoader): DataLoader for validation dataset.
        model_save_path (str): Path to save the trained model.
        num_epochs (int): Number of epochs to train for.
        lr (float): Learning rate for optimizer.
        device (str): Device to train on ('cuda' or 'cpu').

    Returns:
        D2DetWrapper: Trained D2Det model.
    """
    torch.autograd.set_detect_anomaly(True)

    # Initialize model, loss function, and optimizer
    d2det_model = D2DetWrapper(
        backbone=backbone_network,
        fpn=CustomFeaturePyramidNetwork(),
        rpn=MultiLevelRPN(num_anchors=9),
        roi_pool=CustomRoIPooling(output_size=(7, 7), sampling_ratio=2),
        dense_local_reg=DenseLocalRegression(input_channels=256),
        binary_overlap=BinaryOverlapPrediction(input_channels=256),
        classifier=ObjectClassification(input_channels=256, num_classes=101),
        mask_predictor=MaskPrediction(input_channels=256, mask_size=28, num_classes=1),
        anchor_generator=AnchorGenerator(
            sizes=[(32, 64, 128), (64, 128, 256), (128, 256, 512), (256, 512, 1024)],
            aspect_ratios=[(0.5, 1.0, 2.0)] * 4
        )
    ).to(device)

    # Initialize the new D2DetLoss
    loss_fn = D2DetLoss(lambda_cls=1.0, lambda_bbox=1.0, lambda_mask=1.0, lambda_obj=0.5, lambda_fp=0.5)
    optimizer = optim.Adam(d2det_model.parameters(), lr=lr)

    # Training loop
    for epoch in range(num_epochs):
        d2det_model.train()
        epoch_loss = 0.0
        print(f"Epoch [{epoch+1}/{num_epochs}]")

        for images, targets in tqdm(train_dataloader, desc="Training", leave=False):
            # Stack images to form a batch of shape [batch_size, C, H, W]
            images = torch.stack(images).to(device)  # Convert tuple of images to tensor
            #print("Targets structure:", targets)

            # Convert each target in the targets tuple to a dictionary with tensors moved to device
            target_dicts = targets[0]

            # Define ground-truth target structure
            gt_target = {
                'category_id': torch.tensor([obj['category_id'] for obj in target_dicts], device=device),
                'bbox': torch.stack([torch.tensor(obj['bbox'], device=device) for obj in target_dicts]),
                # 'masks': torch.stack([
                #     pad_or_resize_mask(torch.tensor(obj['segmentation'],
                #                                     device=device),
                #                        target_size=(200, 200))
                #     for obj in target_dicts
                # ])
                #'masks': [torch.tensor(obj['segmentation'], device=device) for obj in target_dicts]
            }


            # Forward pass: predictions from the model
            predictions_raw = d2det_model(images)

            # Process predictions as per your loss function requirements
            predictions = {
                'class_probs': predictions_raw['class_probs'],
                'refined_boxes': predictions_raw['refined_boxes'],
                'masks': predictions_raw['masks'],
                'overlap_scores': predictions_raw['overlap_scores'],
                'rpn_objectness': predictions_raw['rpn_objectness'],
                'rpn_box_regression': predictions_raw['rpn_box_regression']
            }

            # Calculate loss
            loss, loss_dict = loss_fn(predictions, gt_target)

            # Backpropagation
            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(d2det_model.parameters(), max_norm=1.0)
            optimizer.step()

            # Accumulate batch loss to epoch_loss
            epoch_loss += loss.item()

            # Print loss components for debugging if needed
            print(f"Batch Loss: {loss.item():.4f}")
            for loss_name, loss_value in loss_dict.items():
                print(f"{loss_name}: {loss_value.item():.4f}")

        # Calculate and print average loss for the epoch
        avg_loss = epoch_loss / len(train_dataloader)
        print(f"Average Training Loss for Epoch {epoch+1}: {avg_loss:.4f}")


        # Validation loop
        if val_dataloader is not None:
            val_loss = validate_d2det(d2det_model, val_dataloader, loss_fn, device)
            print(f"Validation Loss for Epoch {epoch+1}: {val_loss:.4f}")

        # Save model checkpoint
        save_checkpoint(d2det_model, optimizer, epoch, model_save_path)

    print("Training Complete.")
    return d2det_model


def validate_d2det(model, val_dataloader, loss_fn, device):
    """
    Validate the D2Det model.

    Args:
        model (D2DetWrapper): Trained D2Det model.
        val_dataloader (DataLoader): DataLoader for validation dataset.
        loss_fn (D2DetLoss): Loss function for D2Det.
        device (str): Device to validate on ('cuda' or 'cpu').

    Returns:
        float: Average validation loss.
    """
    model.eval()
    val_loss = 0.0

    with torch.no_grad():
        for images, targets in tqdm(val_dataloader, desc="Validating", leave=False):
            images = images.to(device)

            # Forward pass
            predictions = model(images)

            # Process ground truth targets for the batch
            gt_target = {
                'category_id': torch.cat([torch.tensor([obj['category_id']], device=device) for target in targets for obj in target]),
                'bbox': torch.cat([torch.tensor([obj['bbox']], device=device) for target in targets for obj in target]),
                #'masks': [torch.tensor(obj['segmentation'], device=device) for target in targets for obj in target],
                'objectness': torch.ones(predictions['rpn_objectness'].shape[0], device=device)
            }

            # Calculate loss
            loss, _ = loss_fn(predictions, gt_target)

            # Accumulate loss
            val_loss += loss.item()

    return val_loss / len(val_dataloader)


def save_checkpoint(model, optimizer, epoch, model_save_path):
    """
    Save the model checkpoint.

    Args:
        model (D2DetWrapper): D2Det model to save.
        optimizer (torch.optim.Optimizer): Optimizer used in training.
        epoch (int): Current epoch.
        model_save_path (str): Path to save the model checkpoint.
    """
    os.makedirs(os.path.dirname(model_save_path), exist_ok=True)
    checkpoint = {
        'epoch': epoch + 1,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
    }
    torch.save(checkpoint, model_save_path)
    print(f"Model checkpoint saved at epoch {epoch+1} to {model_save_path}")



In [122]:




# Define number of epochs and device
num_epochs = 5
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Start training
trained_model = train_d2det(
    train_dataloader=train_dataloader,
    val_dataloader=None,
    model_save_path=model_save_path,
    num_epochs=num_epochs,
    lr=0.001,
    device=device
)




Epoch [1/5]


Training:   1%|          | 1/100 [00:16<27:44, 16.81s/it]

Batch Loss: 249.3404
classification_loss: 4.6168
bbox_loss: 244.3805
objectness_loss: 0.6863


Training:   2%|▏         | 2/100 [00:28<22:11, 13.58s/it]

Batch Loss: 155.5951
classification_loss: 4.6112
bbox_loss: 150.9062
objectness_loss: 0.1552


Training:   3%|▎         | 3/100 [00:37<19:04, 11.80s/it]

Batch Loss: 212.2431
classification_loss: 4.6149
bbox_loss: 207.6271
objectness_loss: 0.0024


Training:   4%|▍         | 4/100 [00:47<17:32, 10.96s/it]

Batch Loss: 260.3392
classification_loss: 4.6140
bbox_loss: 255.7252
objectness_loss: 0.0000


Training:   5%|▌         | 5/100 [00:56<16:24, 10.36s/it]

Batch Loss: 297.2419
classification_loss: 4.6188
bbox_loss: 292.6231
objectness_loss: 0.0000


Training:   6%|▌         | 6/100 [01:06<15:43, 10.03s/it]

Batch Loss: 382.7670
classification_loss: 4.6101
bbox_loss: 378.1569
objectness_loss: 0.0000


Training:   7%|▋         | 7/100 [01:15<15:12,  9.82s/it]

Batch Loss: 319.5102
classification_loss: 4.6138
bbox_loss: 314.8964
objectness_loss: 0.0000


Training:   8%|▊         | 8/100 [01:24<14:47,  9.65s/it]

Batch Loss: 36.4383
classification_loss: 4.6063
bbox_loss: 25.9672
objectness_loss: 11.7297


Training:   9%|▉         | 9/100 [01:34<14:29,  9.56s/it]

Batch Loss: 190.2286
classification_loss: 4.6112
bbox_loss: 185.6174
objectness_loss: 0.0000


Training:  10%|█         | 10/100 [01:43<14:11,  9.46s/it]

Batch Loss: 124.2147
classification_loss: 4.6020
bbox_loss: 119.6126
objectness_loss: 0.0000


Training:  11%|█         | 11/100 [01:52<14:00,  9.45s/it]

Batch Loss: 119.9822
classification_loss: 4.5961
bbox_loss: 115.3861
objectness_loss: 0.0000


Training:  12%|█▏        | 12/100 [02:02<13:45,  9.38s/it]

Batch Loss: 56.6333
classification_loss: 4.5903
bbox_loss: 34.8229
objectness_loss: 34.4400


Training:  13%|█▎        | 13/100 [02:11<13:28,  9.29s/it]

Batch Loss: 186.8036
classification_loss: 4.5219
bbox_loss: 182.2817
objectness_loss: 0.0000


Training:  14%|█▍        | 14/100 [02:20<13:16,  9.26s/it]

Batch Loss: 84.1142
classification_loss: 4.6167
bbox_loss: 42.5823
objectness_loss: 73.8304


Training:  15%|█▌        | 15/100 [02:29<13:10,  9.30s/it]

Batch Loss: 69.8038
classification_loss: 4.6179
bbox_loss: 65.1860
objectness_loss: 0.0000


Training:  16%|█▌        | 16/100 [02:38<13:00,  9.29s/it]

Batch Loss: 115.2589
classification_loss: 4.6198
bbox_loss: 110.6391
objectness_loss: 0.0000


Training:  17%|█▋        | 17/100 [02:48<12:44,  9.21s/it]

Batch Loss: 97.8216
classification_loss: 4.4919
bbox_loss: 93.3298
objectness_loss: 0.0000


Training:  18%|█▊        | 18/100 [02:57<12:41,  9.28s/it]

Batch Loss: 66.7798
classification_loss: 4.4233
bbox_loss: 15.6996
objectness_loss: 93.3138


Training:  19%|█▉        | 19/100 [03:06<12:31,  9.28s/it]

Batch Loss: 68.8747
classification_loss: 4.6185
bbox_loss: 64.2562
objectness_loss: 0.0000


Training:  20%|██        | 20/100 [03:15<12:17,  9.22s/it]

Batch Loss: 150.7952
classification_loss: 4.6200
bbox_loss: 146.1753
objectness_loss: 0.0000


Training:  21%|██        | 21/100 [03:25<12:09,  9.23s/it]

Batch Loss: 132.3448
classification_loss: 4.3854
bbox_loss: 127.9594
objectness_loss: 0.0000


Training:  22%|██▏       | 22/100 [03:34<11:56,  9.18s/it]

Batch Loss: 88.1585
classification_loss: 4.6199
bbox_loss: 83.5387
objectness_loss: 0.0000


Training:  23%|██▎       | 23/100 [03:43<11:47,  9.18s/it]

Batch Loss: 247.6556
classification_loss: 4.6193
bbox_loss: 243.0363
objectness_loss: 0.0000


Training:  24%|██▍       | 24/100 [03:52<11:39,  9.21s/it]

Batch Loss: 84.5784
classification_loss: 4.6155
bbox_loss: 79.9629
objectness_loss: 0.0000


Training:  25%|██▌       | 25/100 [04:01<11:26,  9.16s/it]

Batch Loss: 64.9340
classification_loss: 4.6144
bbox_loss: 60.3196
objectness_loss: 0.0000


Training:  26%|██▌       | 26/100 [04:10<11:14,  9.12s/it]

Batch Loss: 149.9415
classification_loss: 4.6065
bbox_loss: 145.3350
objectness_loss: 0.0000


Training:  27%|██▋       | 27/100 [04:19<11:08,  9.16s/it]

Batch Loss: 168.0657
classification_loss: 4.6135
bbox_loss: 163.4522
objectness_loss: 0.0000


Training:  28%|██▊       | 28/100 [04:28<10:57,  9.13s/it]

Batch Loss: 94.2037
classification_loss: 4.6139
bbox_loss: 39.5898
objectness_loss: 100.0000


Training:  29%|██▉       | 29/100 [04:37<10:44,  9.08s/it]

Batch Loss: 67.3070
classification_loss: 4.6163
bbox_loss: 12.8573
objectness_loss: 99.6667


Training:  30%|███       | 30/100 [04:47<10:39,  9.14s/it]

Batch Loss: 68.4998
classification_loss: 4.3127
bbox_loss: 14.1870
objectness_loss: 100.0000


Training:  31%|███       | 31/100 [04:56<10:31,  9.16s/it]

Batch Loss: 121.4447
classification_loss: 4.6232
bbox_loss: 116.8215
objectness_loss: 0.0000


Training:  32%|███▏      | 32/100 [05:05<10:20,  9.12s/it]

Batch Loss: 101.2898
classification_loss: 4.6171
bbox_loss: 46.6728
objectness_loss: 100.0000


Training:  33%|███▎      | 33/100 [05:14<10:13,  9.15s/it]

Batch Loss: 252.0164
classification_loss: 4.6179
bbox_loss: 247.3986
objectness_loss: 0.0000


Training:  34%|███▍      | 34/100 [05:23<10:03,  9.14s/it]

Batch Loss: 152.4651
classification_loss: 4.5473
bbox_loss: 147.9178
objectness_loss: 0.0000


Training:  35%|███▌      | 35/100 [05:33<09:55,  9.16s/it]

Batch Loss: 83.4536
classification_loss: 4.2558
bbox_loss: 31.1978
objectness_loss: 96.0000


Training:  36%|███▌      | 36/100 [05:42<09:46,  9.16s/it]

Batch Loss: 105.8247
classification_loss: 4.4328
bbox_loss: 51.3919
objectness_loss: 100.0000


Training:  37%|███▋      | 37/100 [05:51<09:36,  9.14s/it]

Batch Loss: 168.4323
classification_loss: 4.5122
bbox_loss: 163.9202
objectness_loss: 0.0000


Training:  38%|███▊      | 38/100 [06:00<09:25,  9.13s/it]

Batch Loss: 99.7688
classification_loss: 4.6197
bbox_loss: 45.1491
objectness_loss: 100.0000


Training:  39%|███▉      | 39/100 [06:09<09:19,  9.18s/it]

Batch Loss: 84.8564
classification_loss: 4.6200
bbox_loss: 30.2364
objectness_loss: 100.0000


Training:  40%|████      | 40/100 [06:18<09:08,  9.14s/it]

Batch Loss: 113.3258
classification_loss: 4.6242
bbox_loss: 108.7017
objectness_loss: 0.0000


Training:  41%|████      | 41/100 [06:27<08:56,  9.10s/it]

Batch Loss: 74.9384
classification_loss: 4.6214
bbox_loss: 70.3170
objectness_loss: 0.0000


Training:  42%|████▏     | 42/100 [06:37<08:52,  9.17s/it]

Batch Loss: 86.2619
classification_loss: 4.6225
bbox_loss: 81.6394
objectness_loss: 0.0000


Training:  43%|████▎     | 43/100 [06:46<08:41,  9.14s/it]

Batch Loss: 82.0074
classification_loss: 4.6181
bbox_loss: 28.3893
objectness_loss: 98.0000


Training:  44%|████▍     | 44/100 [06:55<08:28,  9.09s/it]

Batch Loss: 215.9053
classification_loss: 4.6185
bbox_loss: 211.2868
objectness_loss: 0.0000


Training:  45%|████▌     | 45/100 [07:04<08:23,  9.16s/it]

Batch Loss: 39.2794
classification_loss: 4.4064
bbox_loss: 34.8729
objectness_loss: 0.0000


Training:  46%|████▌     | 46/100 [07:13<08:15,  9.18s/it]

Batch Loss: 72.8724
classification_loss: 4.6146
bbox_loss: 66.5911
objectness_loss: 3.3333


Training:  47%|████▋     | 47/100 [07:22<08:04,  9.15s/it]

Batch Loss: 79.5416
classification_loss: 4.6208
bbox_loss: 74.9208
objectness_loss: 0.0000


Training:  48%|████▊     | 48/100 [07:31<07:53,  9.11s/it]

Batch Loss: 220.1640
classification_loss: 4.6199
bbox_loss: 215.5441
objectness_loss: 0.0000


Training:  49%|████▉     | 49/100 [07:40<07:41,  9.04s/it]

Batch Loss: 143.0690
classification_loss: 4.6185
bbox_loss: 138.4505
objectness_loss: 0.0000


Training:  50%|█████     | 50/100 [07:49<07:31,  9.02s/it]

Batch Loss: 172.1403
classification_loss: 4.6200
bbox_loss: 167.5203
objectness_loss: 0.0000


Training:  51%|█████     | 51/100 [07:58<07:24,  9.07s/it]

Batch Loss: 173.1953
classification_loss: 4.6197
bbox_loss: 168.5756
objectness_loss: 0.0000


Training:  52%|█████▏    | 52/100 [08:07<07:12,  9.01s/it]

Batch Loss: 138.9153
classification_loss: 4.6193
bbox_loss: 134.2960
objectness_loss: 0.0000


Training:  53%|█████▎    | 53/100 [08:16<06:57,  8.88s/it]

Batch Loss: 96.4969
classification_loss: 4.6243
bbox_loss: 47.8726
objectness_loss: 88.0000


Training:  54%|█████▍    | 54/100 [08:25<06:52,  8.97s/it]

Batch Loss: 95.9040
classification_loss: 4.4170
bbox_loss: 90.6537
objectness_loss: 1.6667


Training:  55%|█████▌    | 55/100 [08:34<06:40,  8.89s/it]

Batch Loss: 84.1982
classification_loss: 4.6174
bbox_loss: 79.5808
objectness_loss: 0.0000


Training:  56%|█████▌    | 56/100 [08:43<06:33,  8.95s/it]

Batch Loss: 142.4513
classification_loss: 4.6019
bbox_loss: 137.8495
objectness_loss: 0.0000


Training:  57%|█████▋    | 57/100 [08:52<06:29,  9.05s/it]

Batch Loss: 169.9485
classification_loss: 4.5873
bbox_loss: 165.3613
objectness_loss: 0.0000


Training:  58%|█████▊    | 58/100 [09:01<06:14,  8.91s/it]

Batch Loss: 82.7982
classification_loss: 4.6100
bbox_loss: 72.2725
objectness_loss: 11.8315


Training:  59%|█████▉    | 59/100 [09:09<05:57,  8.72s/it]

Batch Loss: 101.3568
classification_loss: 4.6158
bbox_loss: 96.7410
objectness_loss: 0.0000


Training:  60%|██████    | 60/100 [09:18<05:48,  8.71s/it]

Batch Loss: 95.6831
classification_loss: 4.5266
bbox_loss: 91.1565
objectness_loss: 0.0000


Training:  61%|██████    | 61/100 [09:26<05:38,  8.67s/it]

Batch Loss: 74.5490
classification_loss: 4.6161
bbox_loss: 69.2875
objectness_loss: 1.2907


Training:  62%|██████▏   | 62/100 [09:35<05:35,  8.82s/it]

Batch Loss: 101.9568
classification_loss: 4.4250
bbox_loss: 97.5319
objectness_loss: 0.0000


Training:  63%|██████▎   | 63/100 [09:44<05:26,  8.82s/it]

Batch Loss: 71.3270
classification_loss: 4.6192
bbox_loss: 66.7078
objectness_loss: 0.0000


Training:  64%|██████▍   | 64/100 [09:53<05:16,  8.78s/it]

Batch Loss: 114.3545
classification_loss: 4.6160
bbox_loss: 109.7385
objectness_loss: 0.0000


Training:  65%|██████▌   | 65/100 [10:02<05:08,  8.80s/it]

Batch Loss: 77.8255
classification_loss: 4.4859
bbox_loss: 42.7851
objectness_loss: 61.1089


Training:  66%|██████▌   | 66/100 [10:10<04:57,  8.76s/it]

Batch Loss: 77.1131
classification_loss: 4.3632
bbox_loss: 23.4247
objectness_loss: 98.6504


Training:  67%|██████▋   | 67/100 [10:19<04:51,  8.84s/it]

Batch Loss: 76.3088
classification_loss: 4.5470
bbox_loss: 54.0089
objectness_loss: 35.5058


Training:  68%|██████▊   | 68/100 [10:28<04:41,  8.81s/it]

Batch Loss: 109.6866
classification_loss: 4.2789
bbox_loss: 96.7778
objectness_loss: 17.2599


Training:  69%|██████▉   | 69/100 [10:37<04:32,  8.78s/it]

Batch Loss: 71.6463
classification_loss: 4.2935
bbox_loss: 67.3529
objectness_loss: 0.0000


Training:  70%|███████   | 70/100 [10:45<04:22,  8.75s/it]

Batch Loss: 173.5008
classification_loss: 4.6244
bbox_loss: 168.8764
objectness_loss: 0.0000


Training:  71%|███████   | 71/100 [10:54<04:11,  8.66s/it]

Batch Loss: 194.1419
classification_loss: 4.6184
bbox_loss: 189.5236
objectness_loss: 0.0000


Training:  72%|███████▏  | 72/100 [11:03<04:04,  8.74s/it]

Batch Loss: 162.1883
classification_loss: 4.2841
bbox_loss: 157.9043
objectness_loss: 0.0000


Training:  73%|███████▎  | 73/100 [11:12<03:55,  8.74s/it]

Batch Loss: 126.6850
classification_loss: 4.2490
bbox_loss: 122.4360
objectness_loss: 0.0000


Training:  74%|███████▍  | 74/100 [11:20<03:46,  8.73s/it]

Batch Loss: 81.0571
classification_loss: 4.6196
bbox_loss: 76.4376
objectness_loss: 0.0000


Training:  75%|███████▌  | 75/100 [11:29<03:36,  8.66s/it]

Batch Loss: 76.4477
classification_loss: 4.0363
bbox_loss: 53.2034
objectness_loss: 38.4160


Training:  76%|███████▌  | 76/100 [11:38<03:29,  8.72s/it]

Batch Loss: 76.4757
classification_loss: 4.6184
bbox_loss: 53.9414
objectness_loss: 35.8319


Training:  77%|███████▋  | 77/100 [11:46<03:19,  8.68s/it]

Batch Loss: 147.5910
classification_loss: 4.6121
bbox_loss: 142.9788
objectness_loss: 0.0000


Training:  78%|███████▊  | 78/100 [11:55<03:11,  8.69s/it]

Batch Loss: 193.1824
classification_loss: 4.6244
bbox_loss: 188.5579
objectness_loss: 0.0000


Training:  79%|███████▉  | 79/100 [12:04<03:02,  8.69s/it]

Batch Loss: 87.3398
classification_loss: 4.2389
bbox_loss: 59.7308
objectness_loss: 46.7403


Training:  80%|████████  | 80/100 [12:12<02:54,  8.70s/it]

Batch Loss: 132.4779
classification_loss: 4.6269
bbox_loss: 127.8509
objectness_loss: 0.0000


Training:  81%|████████  | 81/100 [12:21<02:45,  8.73s/it]

Batch Loss: 61.7669
classification_loss: 4.6187
bbox_loss: 35.7333
objectness_loss: 42.8298


Training:  82%|████████▏ | 82/100 [12:30<02:36,  8.71s/it]

Batch Loss: 102.5617
classification_loss: 4.6165
bbox_loss: 97.9452
objectness_loss: 0.0000


Training:  83%|████████▎ | 83/100 [12:39<02:29,  8.79s/it]

Batch Loss: 78.1284
classification_loss: 4.0559
bbox_loss: 71.1481
objectness_loss: 5.8489


Training:  84%|████████▍ | 84/100 [12:47<02:19,  8.74s/it]

Batch Loss: 99.6157
classification_loss: 4.6225
bbox_loss: 90.5108
objectness_loss: 8.9647


Training:  85%|████████▌ | 85/100 [12:56<02:11,  8.74s/it]

Batch Loss: 63.1901
classification_loss: 4.4255
bbox_loss: 58.6313
objectness_loss: 0.2665


Training:  86%|████████▌ | 86/100 [13:04<01:57,  8.37s/it]

Batch Loss: 146.4032
classification_loss: 4.6271
bbox_loss: 141.7761
objectness_loss: 0.0000


Training:  87%|████████▋ | 87/100 [13:12<01:47,  8.25s/it]

Batch Loss: 141.1299
classification_loss: 4.6273
bbox_loss: 133.1994
objectness_loss: 6.6062


Training:  88%|████████▊ | 88/100 [13:19<01:36,  8.02s/it]

Batch Loss: 185.7883
classification_loss: 4.6263
bbox_loss: 181.1620
objectness_loss: 0.0000


Training:  89%|████████▉ | 89/100 [13:28<01:30,  8.23s/it]

Batch Loss: 88.0345
classification_loss: 4.1825
bbox_loss: 80.0075
objectness_loss: 7.6890


Training:  90%|█████████ | 90/100 [13:36<01:22,  8.25s/it]

Batch Loss: 92.6792
classification_loss: 4.0973
bbox_loss: 86.6080
objectness_loss: 3.9477


Training:  91%|█████████ | 91/100 [13:45<01:16,  8.46s/it]

Batch Loss: 95.3644
classification_loss: 4.6224
bbox_loss: 90.7419
objectness_loss: 0.0000


Training:  92%|█████████▏| 92/100 [13:54<01:08,  8.57s/it]

Batch Loss: 91.2953
classification_loss: 4.0245
bbox_loss: 71.5200
objectness_loss: 31.5017


Training:  93%|█████████▎| 93/100 [14:02<00:57,  8.27s/it]

Batch Loss: 67.5631
classification_loss: 4.2157
bbox_loss: 59.5141
objectness_loss: 7.6667


Training:  94%|█████████▍| 94/100 [14:10<00:49,  8.32s/it]

Batch Loss: 124.0644
classification_loss: 4.6272
bbox_loss: 116.1743
objectness_loss: 6.5258


Training:  95%|█████████▌| 95/100 [14:18<00:40,  8.10s/it]

Batch Loss: 264.7645
classification_loss: 4.6286
bbox_loss: 260.1359
objectness_loss: 0.0000


Training:  96%|█████████▌| 96/100 [14:26<00:32,  8.15s/it]

Batch Loss: 74.6579
classification_loss: 4.1057
bbox_loss: 66.5465
objectness_loss: 8.0113


Training:  97%|█████████▋| 97/100 [14:33<00:23,  7.98s/it]

Batch Loss: 134.3089
classification_loss: 4.6266
bbox_loss: 129.6823
objectness_loss: 0.0000


Training:  98%|█████████▊| 98/100 [14:41<00:15,  7.95s/it]

Batch Loss: 97.3593
classification_loss: 4.4151
bbox_loss: 71.5904
objectness_loss: 42.7078


Training:  99%|█████████▉| 99/100 [14:49<00:07,  7.94s/it]

Batch Loss: 100.1209
classification_loss: 4.6229
bbox_loss: 85.6579
objectness_loss: 19.6804


Training: 100%|██████████| 100/100 [14:57<00:00,  7.88s/it]

Batch Loss: 178.8456
classification_loss: 4.4137
bbox_loss: 174.4319
objectness_loss: 0.0000




Average Training Loss for Epoch 1: 125.7398
Model checkpoint saved at epoch 1 to /content/drive/MyDrive/models/d2det_mini_coco_filtered.pth
Epoch [2/5]


Training:   1%|          | 1/100 [00:16<27:20, 16.58s/it]

Batch Loss: 114.9240
classification_loss: 4.6233
bbox_loss: 110.3007
objectness_loss: 0.0000


Training:   2%|▏         | 2/100 [00:25<19:33, 11.97s/it]

Batch Loss: 65.3522
classification_loss: 3.9957
bbox_loss: 53.1398
objectness_loss: 16.4336


Training:   3%|▎         | 3/100 [00:32<15:50,  9.79s/it]

Batch Loss: 75.6924
classification_loss: 4.1988
bbox_loss: 66.0988
objectness_loss: 10.7894


Training:   4%|▍         | 4/100 [00:39<14:09,  8.85s/it]

Batch Loss: 84.0031
classification_loss: 3.8740
bbox_loss: 75.4380
objectness_loss: 9.3824


Training:   5%|▌         | 5/100 [00:47<13:19,  8.41s/it]

Batch Loss: 164.0874
classification_loss: 4.6135
bbox_loss: 159.4739
objectness_loss: 0.0000


Training:   6%|▌         | 6/100 [00:55<12:43,  8.12s/it]

Batch Loss: 79.7503
classification_loss: 4.1860
bbox_loss: 67.7310
objectness_loss: 15.6667


Training:   7%|▋         | 7/100 [01:02<12:22,  7.98s/it]

Batch Loss: 105.7884
classification_loss: 4.3048
bbox_loss: 101.4837
objectness_loss: 0.0000


Training:   8%|▊         | 8/100 [01:10<12:14,  7.98s/it]

Batch Loss: 84.2768
classification_loss: 4.1857
bbox_loss: 79.9244
objectness_loss: 0.3333


Training:   9%|▉         | 9/100 [01:18<11:53,  7.84s/it]

Batch Loss: 129.4065
classification_loss: 3.9773
bbox_loss: 125.4292
objectness_loss: 0.0000


Training:  10%|█         | 10/100 [01:26<11:43,  7.81s/it]

Batch Loss: 86.8619
classification_loss: 4.1150
bbox_loss: 76.4322
objectness_loss: 12.6294


Training:  11%|█         | 11/100 [01:33<11:31,  7.77s/it]

Batch Loss: 148.9106
classification_loss: 4.6095
bbox_loss: 144.3011
objectness_loss: 0.0000


Training:  12%|█▏        | 12/100 [01:40<11:08,  7.59s/it]

Batch Loss: 106.6078
classification_loss: 4.6182
bbox_loss: 101.9896
objectness_loss: 0.0000


Training:  13%|█▎        | 13/100 [01:48<10:55,  7.53s/it]

Batch Loss: 62.1978
classification_loss: 4.5913
bbox_loss: 47.8459
objectness_loss: 19.5212


Training:  14%|█▍        | 14/100 [01:55<10:47,  7.53s/it]

Batch Loss: 63.2132
classification_loss: 4.4781
bbox_loss: 45.0724
objectness_loss: 27.3253


Training:  15%|█▌        | 15/100 [02:03<10:37,  7.50s/it]

Batch Loss: 125.6270
classification_loss: 4.6212
bbox_loss: 121.0058
objectness_loss: 0.0000


Training:  16%|█▌        | 16/100 [02:10<10:20,  7.39s/it]

Batch Loss: 125.1236
classification_loss: 4.0312
bbox_loss: 121.0924
objectness_loss: 0.0000


Training:  17%|█▋        | 17/100 [02:17<10:13,  7.39s/it]

Batch Loss: 126.0863
classification_loss: 4.3791
bbox_loss: 117.8615
objectness_loss: 7.6913


Training:  18%|█▊        | 18/100 [02:25<10:13,  7.48s/it]

Batch Loss: 80.5883
classification_loss: 3.9699
bbox_loss: 76.2850
objectness_loss: 0.6667


Training:  19%|█▉        | 19/100 [02:33<10:10,  7.53s/it]

Batch Loss: 67.6252
classification_loss: 4.6256
bbox_loss: 58.9644
objectness_loss: 8.0704


Training:  20%|██        | 20/100 [02:40<09:48,  7.36s/it]

Batch Loss: 146.6276
classification_loss: 4.6233
bbox_loss: 142.0043
objectness_loss: 0.0000


Training:  21%|██        | 21/100 [02:47<09:39,  7.33s/it]

Batch Loss: 109.3112
classification_loss: 3.9109
bbox_loss: 104.4003
objectness_loss: 2.0000


Training:  22%|██▏       | 22/100 [02:54<09:29,  7.30s/it]

Batch Loss: 161.4270
classification_loss: 3.9668
bbox_loss: 157.4601
objectness_loss: 0.0000


Training:  23%|██▎       | 23/100 [03:01<09:23,  7.31s/it]

Batch Loss: 102.0647
classification_loss: 4.3476
bbox_loss: 97.7171
objectness_loss: 0.0000


Training:  24%|██▍       | 24/100 [03:09<09:12,  7.27s/it]

Batch Loss: 124.8633
classification_loss: 4.6284
bbox_loss: 120.2350
objectness_loss: 0.0000


Training:  25%|██▌       | 25/100 [03:16<09:12,  7.37s/it]

Batch Loss: 92.0446
classification_loss: 4.6290
bbox_loss: 87.4156
objectness_loss: 0.0000


Training:  26%|██▌       | 26/100 [03:23<09:00,  7.30s/it]

Batch Loss: 211.3584
classification_loss: 4.6282
bbox_loss: 206.7302
objectness_loss: 0.0000


Training:  27%|██▋       | 27/100 [03:30<08:38,  7.10s/it]

Batch Loss: 143.3302
classification_loss: 4.3909
bbox_loss: 133.4992
objectness_loss: 10.8803


Training:  28%|██▊       | 28/100 [03:37<08:23,  6.99s/it]

Batch Loss: 54.7892
classification_loss: 4.2026
bbox_loss: 42.4655
objectness_loss: 16.2422


Training:  29%|██▉       | 29/100 [03:44<08:21,  7.06s/it]

Batch Loss: 136.3851
classification_loss: 4.6254
bbox_loss: 131.7597
objectness_loss: 0.0000


Training:  30%|███       | 30/100 [03:50<08:03,  6.91s/it]

Batch Loss: 198.2110
classification_loss: 3.9045
bbox_loss: 194.3065
objectness_loss: 0.0000


Training:  31%|███       | 31/100 [03:58<08:11,  7.13s/it]

Batch Loss: 55.1852
classification_loss: 4.6072
bbox_loss: 42.8807
objectness_loss: 15.3945


Training:  32%|███▏      | 32/100 [04:05<08:00,  7.06s/it]

Batch Loss: 89.4673
classification_loss: 4.2851
bbox_loss: 83.6347
objectness_loss: 3.0949


Training:  33%|███▎      | 33/100 [04:12<07:58,  7.14s/it]

Batch Loss: 80.7477
classification_loss: 3.9603
bbox_loss: 76.1978
objectness_loss: 1.1792


Training:  34%|███▍      | 34/100 [04:19<07:50,  7.12s/it]

Batch Loss: 175.5471
classification_loss: 4.6281
bbox_loss: 170.9190
objectness_loss: 0.0000


Training:  35%|███▌      | 35/100 [04:27<07:41,  7.11s/it]

Batch Loss: 124.8985
classification_loss: 4.6306
bbox_loss: 117.8472
objectness_loss: 4.8414


Training:  36%|███▌      | 36/100 [04:34<07:38,  7.17s/it]

Batch Loss: 62.6584
classification_loss: 4.5170
bbox_loss: 48.7985
objectness_loss: 18.6857


Training:  37%|███▋      | 37/100 [04:41<07:25,  7.07s/it]

Batch Loss: 238.8189
classification_loss: 4.6305
bbox_loss: 234.1885
objectness_loss: 0.0000


Training:  38%|███▊      | 38/100 [04:48<07:22,  7.13s/it]

Batch Loss: 53.8349
classification_loss: 3.8724
bbox_loss: 43.6511
objectness_loss: 12.6227


Training:  39%|███▉      | 39/100 [04:55<07:10,  7.05s/it]

Batch Loss: 90.8328
classification_loss: 4.6220
bbox_loss: 86.0442
objectness_loss: 0.3333


Training:  40%|████      | 40/100 [05:02<07:06,  7.12s/it]

Batch Loss: 162.8565
classification_loss: 3.9248
bbox_loss: 158.9318
objectness_loss: 0.0000


Training:  41%|████      | 41/100 [05:09<06:55,  7.04s/it]

Batch Loss: 162.8006
classification_loss: 4.2768
bbox_loss: 158.5238
objectness_loss: 0.0000


Training:  42%|████▏     | 42/100 [05:16<06:44,  6.98s/it]

Batch Loss: 140.8134
classification_loss: 4.6307
bbox_loss: 133.7628
objectness_loss: 4.8398


Training:  43%|████▎     | 43/100 [05:23<06:37,  6.97s/it]

Batch Loss: 91.4289
classification_loss: 4.0800
bbox_loss: 86.8446
objectness_loss: 1.0087


Training:  44%|████▍     | 44/100 [05:30<06:36,  7.08s/it]

Batch Loss: 76.5832
classification_loss: 4.3735
bbox_loss: 69.5658
objectness_loss: 5.2879


Training:  45%|████▌     | 45/100 [05:37<06:22,  6.96s/it]

Batch Loss: 73.9862
classification_loss: 4.6288
bbox_loss: 68.5968
objectness_loss: 1.5213


Training:  46%|████▌     | 46/100 [05:44<06:25,  7.14s/it]

Batch Loss: 192.5405
classification_loss: 4.6298
bbox_loss: 186.7255
objectness_loss: 2.3704


Training:  47%|████▋     | 47/100 [05:51<06:05,  6.90s/it]

Batch Loss: 128.7104
classification_loss: 3.6750
bbox_loss: 124.6848
objectness_loss: 0.7012


Training:  48%|████▊     | 48/100 [05:58<06:02,  6.97s/it]

Batch Loss: 144.0978
classification_loss: 3.6888
bbox_loss: 140.1633
objectness_loss: 0.4914


Training:  49%|████▉     | 49/100 [06:05<05:59,  7.04s/it]

Batch Loss: 86.5887
classification_loss: 4.1141
bbox_loss: 81.8951
objectness_loss: 1.1589


Training:  50%|█████     | 50/100 [06:12<05:57,  7.14s/it]

Batch Loss: 95.4464
classification_loss: 3.6669
bbox_loss: 91.2436
objectness_loss: 1.0718


Training:  51%|█████     | 51/100 [06:19<05:43,  7.02s/it]

Batch Loss: 116.4299
classification_loss: 4.6306
bbox_loss: 111.7390
objectness_loss: 0.1207


Training:  52%|█████▏    | 52/100 [06:26<05:42,  7.13s/it]

Batch Loss: 99.6051
classification_loss: 4.6317
bbox_loss: 94.8868
objectness_loss: 0.1733


Training:  53%|█████▎    | 53/100 [06:33<05:30,  7.04s/it]

Batch Loss: 65.3702
classification_loss: 4.6318
bbox_loss: 58.3557
objectness_loss: 4.7655


Training:  54%|█████▍    | 54/100 [06:40<05:23,  7.03s/it]

Batch Loss: 69.6973
classification_loss: 4.1307
bbox_loss: 64.9692
objectness_loss: 1.1947


Training:  55%|█████▌    | 55/100 [06:47<05:13,  6.96s/it]

Batch Loss: 84.1260
classification_loss: 4.6305
bbox_loss: 79.3641
objectness_loss: 0.2626


Training:  56%|█████▌    | 56/100 [06:54<05:06,  6.97s/it]

Batch Loss: 154.2386
classification_loss: 4.6318
bbox_loss: 149.5693
objectness_loss: 0.0752


Training:  57%|█████▋    | 57/100 [07:01<04:59,  6.97s/it]

Batch Loss: 57.1880
classification_loss: 4.6311
bbox_loss: 51.4283
objectness_loss: 2.2572


Training:  58%|█████▊    | 58/100 [07:08<04:53,  6.99s/it]

Batch Loss: 61.6313
classification_loss: 3.9098
bbox_loss: 57.4712
objectness_loss: 0.5005


Training:  59%|█████▉    | 59/100 [07:15<04:43,  6.90s/it]

Batch Loss: 129.1559
classification_loss: 4.6319
bbox_loss: 124.4364
objectness_loss: 0.1753


Training:  60%|██████    | 60/100 [07:22<04:34,  6.87s/it]

Batch Loss: 199.2184
classification_loss: 4.6319
bbox_loss: 194.4925
objectness_loss: 0.1879


Training:  61%|██████    | 61/100 [07:28<04:25,  6.80s/it]

Batch Loss: 86.8193
classification_loss: 4.6319
bbox_loss: 82.0683
objectness_loss: 0.2382


Training:  62%|██████▏   | 62/100 [07:36<04:26,  7.01s/it]

Batch Loss: 122.5031
classification_loss: 4.6320
bbox_loss: 117.7732
objectness_loss: 0.1959


Training:  63%|██████▎   | 63/100 [07:43<04:17,  6.95s/it]

Batch Loss: 41.7673
classification_loss: 3.7696
bbox_loss: 37.3138
objectness_loss: 1.3679


Training:  64%|██████▍   | 64/100 [07:49<04:06,  6.85s/it]

Batch Loss: 107.9686
classification_loss: 4.6317
bbox_loss: 103.2217
objectness_loss: 0.2305


Training:  65%|██████▌   | 65/100 [07:56<04:00,  6.87s/it]

Batch Loss: 194.1540
classification_loss: 4.6316
bbox_loss: 189.4239
objectness_loss: 0.1970


Training:  66%|██████▌   | 66/100 [08:03<03:56,  6.96s/it]

Batch Loss: 57.5859
classification_loss: 4.6318
bbox_loss: 52.3309
objectness_loss: 1.2464


Training:  67%|██████▋   | 67/100 [08:10<03:44,  6.81s/it]

Batch Loss: 104.4924
classification_loss: 4.6313
bbox_loss: 99.7676
objectness_loss: 0.1869


Training:  68%|██████▊   | 68/100 [08:17<03:42,  6.94s/it]

Batch Loss: 115.2065
classification_loss: 3.9124
bbox_loss: 111.1996
objectness_loss: 0.1888


Training:  69%|██████▉   | 69/100 [08:24<03:31,  6.82s/it]

Batch Loss: 127.7141
classification_loss: 3.9362
bbox_loss: 123.6734
objectness_loss: 0.2091


Training:  70%|███████   | 70/100 [08:30<03:25,  6.85s/it]

Batch Loss: 156.5999
classification_loss: 4.6317
bbox_loss: 151.8719
objectness_loss: 0.1925


Training:  71%|███████   | 71/100 [08:37<03:17,  6.81s/it]

Batch Loss: 116.2563
classification_loss: 4.6317
bbox_loss: 111.5476
objectness_loss: 0.1540


Training:  72%|███████▏  | 72/100 [08:44<03:10,  6.82s/it]

Batch Loss: 91.3200
classification_loss: 3.9627
bbox_loss: 87.1658
objectness_loss: 0.3832


Training:  73%|███████▎  | 73/100 [08:51<03:04,  6.84s/it]

Batch Loss: 63.1869
classification_loss: 4.1236
bbox_loss: 58.7249
objectness_loss: 0.6767


Training:  74%|███████▍  | 74/100 [08:58<03:00,  6.92s/it]

Batch Loss: 166.1460
classification_loss: 4.6319
bbox_loss: 161.4753
objectness_loss: 0.0775


Training:  75%|███████▌  | 75/100 [09:05<02:52,  6.90s/it]

Batch Loss: 59.9560
classification_loss: 4.6317
bbox_loss: 54.7236
objectness_loss: 1.2013


Training:  76%|███████▌  | 76/100 [09:12<02:48,  7.01s/it]

Batch Loss: 103.1947
classification_loss: 4.6315
bbox_loss: 98.5209
objectness_loss: 0.0844


Training:  77%|███████▋  | 77/100 [09:19<02:40,  6.96s/it]

Batch Loss: 85.0058
classification_loss: 3.6789
bbox_loss: 80.9746
objectness_loss: 0.7046


Training:  78%|███████▊  | 78/100 [09:27<02:37,  7.16s/it]

Batch Loss: 118.2392
classification_loss: 4.6319
bbox_loss: 113.5269
objectness_loss: 0.1608


Training:  79%|███████▉  | 79/100 [09:34<02:30,  7.18s/it]

Batch Loss: 127.4036
classification_loss: 3.6335
bbox_loss: 123.6896
objectness_loss: 0.1609


Training:  80%|████████  | 80/100 [09:41<02:23,  7.16s/it]

Batch Loss: 48.1274
classification_loss: 4.2190
bbox_loss: 43.4350
objectness_loss: 0.9468


Training:  81%|████████  | 81/100 [09:48<02:16,  7.18s/it]

Batch Loss: 74.0660
classification_loss: 3.6668
bbox_loss: 69.9855
objectness_loss: 0.8273


Training:  82%|████████▏ | 82/100 [09:55<02:08,  7.14s/it]

Batch Loss: 166.9741
classification_loss: 4.6313
bbox_loss: 162.2406
objectness_loss: 0.2045


Training:  83%|████████▎ | 83/100 [10:02<02:00,  7.09s/it]

Batch Loss: 94.1827
classification_loss: 4.6318
bbox_loss: 89.3138
objectness_loss: 0.4743


Training:  84%|████████▍ | 84/100 [10:10<01:55,  7.20s/it]

Batch Loss: 78.5041
classification_loss: 4.6319
bbox_loss: 73.6321
objectness_loss: 0.4802


Training:  85%|████████▌ | 85/100 [10:17<01:46,  7.10s/it]

Batch Loss: 59.6586
classification_loss: 4.4700
bbox_loss: 54.8283
objectness_loss: 0.7204


Training:  86%|████████▌ | 86/100 [10:23<01:38,  7.06s/it]

Batch Loss: 180.3207
classification_loss: 4.3794
bbox_loss: 175.8088
objectness_loss: 0.2651


Training:  87%|████████▋ | 87/100 [10:30<01:31,  7.03s/it]

Batch Loss: 83.9238
classification_loss: 4.1598
bbox_loss: 79.5041
objectness_loss: 0.5197


Training:  88%|████████▊ | 88/100 [10:38<01:24,  7.06s/it]

Batch Loss: 41.4567
classification_loss: 4.0836
bbox_loss: 36.9386
objectness_loss: 0.8689


Training:  89%|████████▉ | 89/100 [10:44<01:16,  6.94s/it]

Batch Loss: 162.8317
classification_loss: 3.6389
bbox_loss: 159.0757
objectness_loss: 0.2342


Training:  90%|█████████ | 90/100 [10:51<01:09,  6.95s/it]

Batch Loss: 70.7088
classification_loss: 4.6320
bbox_loss: 65.7834
objectness_loss: 0.5868


Training:  91%|█████████ | 91/100 [10:59<01:03,  7.10s/it]

Batch Loss: 118.2576
classification_loss: 4.6319
bbox_loss: 113.4910
objectness_loss: 0.2695


Training:  92%|█████████▏| 92/100 [11:05<00:55,  6.99s/it]

Batch Loss: 218.3320
classification_loss: 4.6318
bbox_loss: 213.5463
objectness_loss: 0.3079


Training:  93%|█████████▎| 93/100 [11:13<00:49,  7.11s/it]

Batch Loss: 150.8768
classification_loss: 4.6318
bbox_loss: 146.0992
objectness_loss: 0.2915


Training:  94%|█████████▍| 94/100 [11:20<00:42,  7.09s/it]

Batch Loss: 80.3248
classification_loss: 4.6320
bbox_loss: 75.4063
objectness_loss: 0.5730


Training:  95%|█████████▌| 95/100 [11:26<00:34,  6.96s/it]

Batch Loss: 94.0278
classification_loss: 3.6324
bbox_loss: 89.9886
objectness_loss: 0.8134


Training:  96%|█████████▌| 96/100 [11:34<00:27,  6.98s/it]

Batch Loss: 35.8325
classification_loss: 4.1879
bbox_loss: 30.8085
objectness_loss: 1.6722


Training:  97%|█████████▋| 97/100 [11:40<00:20,  6.97s/it]

Batch Loss: 123.6499
classification_loss: 4.6316
bbox_loss: 118.8861
objectness_loss: 0.2643


Training:  98%|█████████▊| 98/100 [11:47<00:13,  6.81s/it]

Batch Loss: 90.1645
classification_loss: 4.0507
bbox_loss: 85.8669
objectness_loss: 0.4937


Training:  99%|█████████▉| 99/100 [11:53<00:06,  6.72s/it]

Batch Loss: 108.8056
classification_loss: 4.3932
bbox_loss: 104.1461
objectness_loss: 0.5327


Training: 100%|██████████| 100/100 [12:00<00:00,  6.70s/it]

Batch Loss: 81.5827
classification_loss: 4.6320
bbox_loss: 76.6707
objectness_loss: 0.5603




Average Training Loss for Epoch 2: 109.4917
Model checkpoint saved at epoch 2 to /content/drive/MyDrive/models/d2det_mini_coco_filtered.pth
Epoch [3/5]


Training:   1%|          | 1/100 [00:15<25:51, 15.67s/it]

Batch Loss: 139.7296
classification_loss: 3.6323
bbox_loss: 135.9633
objectness_loss: 0.2680


Training:   2%|▏         | 2/100 [00:24<19:02, 11.66s/it]

Batch Loss: 103.7990
classification_loss: 4.2534
bbox_loss: 99.4145
objectness_loss: 0.2623


Training:   3%|▎         | 3/100 [00:31<15:23,  9.52s/it]

Batch Loss: 95.5630
classification_loss: 4.1487
bbox_loss: 91.1609
objectness_loss: 0.5067


Training:   4%|▍         | 4/100 [00:38<13:26,  8.40s/it]

Batch Loss: 130.0133
classification_loss: 4.2565
bbox_loss: 125.4393
objectness_loss: 0.6348


Training:   5%|▌         | 5/100 [00:44<12:10,  7.69s/it]

Batch Loss: 49.6427
classification_loss: 4.6319
bbox_loss: 44.5231
objectness_loss: 0.9754


Training:   6%|▌         | 6/100 [00:51<11:46,  7.52s/it]

Batch Loss: 84.3762
classification_loss: 4.1992
bbox_loss: 80.0334
objectness_loss: 0.2871


Training:   7%|▋         | 7/100 [00:58<11:11,  7.22s/it]

Batch Loss: 94.2948
classification_loss: 4.6320
bbox_loss: 89.5658
objectness_loss: 0.1940


Training:   8%|▊         | 8/100 [01:05<10:51,  7.08s/it]

Batch Loss: 220.5495
classification_loss: 4.6293
bbox_loss: 215.7403
objectness_loss: 0.3597


Training:   9%|▉         | 9/100 [01:11<10:24,  6.86s/it]

Batch Loss: 53.0110
classification_loss: 3.8868
bbox_loss: 48.8383
objectness_loss: 0.5719


Training:  10%|█         | 10/100 [01:17<09:54,  6.60s/it]

Batch Loss: 133.3011
classification_loss: 4.6296
bbox_loss: 128.4774
objectness_loss: 0.3882


Training:  11%|█         | 11/100 [01:24<09:45,  6.58s/it]

Batch Loss: 186.0306
classification_loss: 3.6333
bbox_loss: 182.3128
objectness_loss: 0.1690


Training:  12%|█▏        | 12/100 [01:30<09:36,  6.55s/it]

Batch Loss: 132.6790
classification_loss: 4.6319
bbox_loss: 127.9335
objectness_loss: 0.2270


Training:  13%|█▎        | 13/100 [01:37<09:40,  6.68s/it]

Batch Loss: 162.3177
classification_loss: 4.6319
bbox_loss: 157.6230
objectness_loss: 0.1255


Training:  14%|█▍        | 14/100 [01:44<09:41,  6.76s/it]

Batch Loss: 30.7973
classification_loss: 3.7860
bbox_loss: 26.1372
objectness_loss: 1.7481


Training:  15%|█▌        | 15/100 [01:51<09:35,  6.77s/it]

Batch Loss: 65.8719
classification_loss: 3.8236
bbox_loss: 61.5043
objectness_loss: 1.0880


Training:  16%|█▌        | 16/100 [01:58<09:34,  6.84s/it]

Batch Loss: 111.2003
classification_loss: 3.6327
bbox_loss: 107.4962
objectness_loss: 0.1428


Training:  17%|█▋        | 17/100 [02:04<09:23,  6.79s/it]

Batch Loss: 48.6107
classification_loss: 3.6438
bbox_loss: 44.6704
objectness_loss: 0.5930


Training:  18%|█▊        | 18/100 [02:11<09:07,  6.68s/it]

Batch Loss: 118.7188
classification_loss: 4.6319
bbox_loss: 113.9729
objectness_loss: 0.2279


Training:  19%|█▉        | 19/100 [02:18<09:05,  6.74s/it]

Batch Loss: 56.0256
classification_loss: 4.6320
bbox_loss: 50.9370
objectness_loss: 0.9133


Training:  20%|██        | 20/100 [02:24<08:47,  6.60s/it]

Batch Loss: 67.6105
classification_loss: 4.6318
bbox_loss: 62.6347
objectness_loss: 0.6879


Training:  21%|██        | 21/100 [02:31<08:39,  6.57s/it]

Batch Loss: 263.1539
classification_loss: 4.6320
bbox_loss: 258.4392
objectness_loss: 0.1653


Training:  22%|██▏       | 22/100 [02:37<08:25,  6.48s/it]

Batch Loss: 133.1942
classification_loss: 4.6318
bbox_loss: 128.4193
objectness_loss: 0.2863


Training:  23%|██▎       | 23/100 [02:43<08:23,  6.53s/it]

Batch Loss: 107.2333
classification_loss: 4.6319
bbox_loss: 102.4577
objectness_loss: 0.2876


Training:  24%|██▍       | 24/100 [02:50<08:15,  6.52s/it]

Batch Loss: 35.7936
classification_loss: 4.0664
bbox_loss: 31.2065
objectness_loss: 1.0414


Training:  25%|██▌       | 25/100 [02:57<08:12,  6.57s/it]

Batch Loss: 206.7130
classification_loss: 4.6316
bbox_loss: 201.9446
objectness_loss: 0.2736


Training:  26%|██▌       | 26/100 [03:03<08:07,  6.59s/it]

Batch Loss: 75.1825
classification_loss: 4.1024
bbox_loss: 70.9674
objectness_loss: 0.2254


Training:  27%|██▋       | 27/100 [03:10<08:04,  6.63s/it]

Batch Loss: 54.3852
classification_loss: 4.6319
bbox_loss: 49.2397
objectness_loss: 1.0270


Training:  28%|██▊       | 28/100 [03:16<07:49,  6.53s/it]

Batch Loss: 97.5943
classification_loss: 4.6317
bbox_loss: 92.7871
objectness_loss: 0.3510


Training:  29%|██▉       | 29/100 [03:23<07:47,  6.58s/it]

Batch Loss: 132.4152
classification_loss: 3.6337
bbox_loss: 128.6228
objectness_loss: 0.3176


Training:  30%|███       | 30/100 [03:30<07:40,  6.59s/it]

Batch Loss: 98.5938
classification_loss: 4.4987
bbox_loss: 93.8212
objectness_loss: 0.5479


Training:  31%|███       | 31/100 [03:37<07:42,  6.70s/it]

Batch Loss: 127.9385
classification_loss: 4.6320
bbox_loss: 123.0471
objectness_loss: 0.5189


Training:  32%|███▏      | 32/100 [03:44<07:40,  6.77s/it]

Batch Loss: 117.2020
classification_loss: 4.6320
bbox_loss: 112.4857
objectness_loss: 0.1687


Training:  33%|███▎      | 33/100 [03:50<07:26,  6.66s/it]

Batch Loss: 117.4591
classification_loss: 4.6317
bbox_loss: 112.6435
objectness_loss: 0.3678


Training:  34%|███▍      | 34/100 [03:57<07:23,  6.72s/it]

Batch Loss: 68.8538
classification_loss: 3.6374
bbox_loss: 65.0936
objectness_loss: 0.2456


Training:  35%|███▌      | 35/100 [04:03<07:14,  6.69s/it]

Batch Loss: 197.4783
classification_loss: 4.6319
bbox_loss: 192.7086
objectness_loss: 0.2756


Training:  36%|███▌      | 36/100 [04:14<08:26,  7.91s/it]

Batch Loss: 118.6621
classification_loss: 4.1460
bbox_loss: 114.2133
objectness_loss: 0.6055


Training:  37%|███▋      | 37/100 [04:21<08:00,  7.63s/it]

Batch Loss: 127.6360
classification_loss: 4.6319
bbox_loss: 122.9138
objectness_loss: 0.1806


Training:  38%|███▊      | 38/100 [04:28<07:33,  7.32s/it]

Batch Loss: 98.5324
classification_loss: 4.6319
bbox_loss: 93.7914
objectness_loss: 0.2181


Training:  39%|███▉      | 39/100 [04:35<07:23,  7.27s/it]

Batch Loss: 80.4172
classification_loss: 3.6362
bbox_loss: 76.4255
objectness_loss: 0.7110


Training:  40%|████      | 40/100 [04:42<07:05,  7.09s/it]

Batch Loss: 105.3116
classification_loss: 3.8130
bbox_loss: 101.4246
objectness_loss: 0.1480


Training:  41%|████      | 41/100 [04:48<06:54,  7.03s/it]

Batch Loss: 42.7301
classification_loss: 4.3354
bbox_loss: 37.9422
objectness_loss: 0.9050


Training:  42%|████▏     | 42/100 [04:55<06:43,  6.95s/it]

Batch Loss: 105.4128
classification_loss: 4.6319
bbox_loss: 100.6678
objectness_loss: 0.2261


Training:  43%|████▎     | 43/100 [05:02<06:33,  6.90s/it]

Batch Loss: 92.1010
classification_loss: 3.6325
bbox_loss: 87.9013
objectness_loss: 1.1344


Training:  44%|████▍     | 44/100 [05:08<06:17,  6.74s/it]

Batch Loss: 194.7614
classification_loss: 4.5589
bbox_loss: 190.1180
objectness_loss: 0.1691


Training:  45%|████▌     | 45/100 [05:15<06:13,  6.80s/it]

Batch Loss: 68.6402
classification_loss: 4.6310
bbox_loss: 63.8428
objectness_loss: 0.3325


Training:  46%|████▌     | 46/100 [05:22<06:04,  6.76s/it]

Batch Loss: 69.0298
classification_loss: 4.6320
bbox_loss: 64.1813
objectness_loss: 0.4331


Training:  47%|████▋     | 47/100 [05:29<06:01,  6.82s/it]

Batch Loss: 113.2716
classification_loss: 4.1839
bbox_loss: 108.7576
objectness_loss: 0.6603


Training:  48%|████▊     | 48/100 [05:36<05:54,  6.81s/it]

Batch Loss: 187.5048
classification_loss: 4.6320
bbox_loss: 182.8132
objectness_loss: 0.1192


Training:  49%|████▉     | 49/100 [05:43<05:48,  6.83s/it]

Batch Loss: 44.5823
classification_loss: 4.0933
bbox_loss: 40.0834
objectness_loss: 0.8113


Training:  50%|█████     | 50/100 [05:49<05:42,  6.85s/it]

Batch Loss: 44.4271
classification_loss: 4.2055
bbox_loss: 39.5064
objectness_loss: 1.4306


Training:  51%|█████     | 51/100 [05:56<05:35,  6.84s/it]

Batch Loss: 52.9730
classification_loss: 3.8928
bbox_loss: 48.5595
objectness_loss: 1.0414


Training:  52%|█████▏    | 52/100 [06:03<05:27,  6.83s/it]

Batch Loss: 88.7104
classification_loss: 3.9258
bbox_loss: 84.6088
objectness_loss: 0.3515


Training:  53%|█████▎    | 53/100 [06:10<05:19,  6.80s/it]

Batch Loss: 70.4034
classification_loss: 4.6318
bbox_loss: 65.5718
objectness_loss: 0.3995


Training:  54%|█████▍    | 54/100 [06:16<05:08,  6.71s/it]

Batch Loss: 126.5838
classification_loss: 4.6320
bbox_loss: 121.7731
objectness_loss: 0.3574


Training:  55%|█████▌    | 55/100 [06:23<05:00,  6.67s/it]

Batch Loss: 63.7982
classification_loss: 4.2389
bbox_loss: 59.2895
objectness_loss: 0.5394


Training:  56%|█████▌    | 56/100 [06:29<04:50,  6.60s/it]

Batch Loss: 163.5292
classification_loss: 4.6317
bbox_loss: 158.6843
objectness_loss: 0.4263


Training:  57%|█████▋    | 57/100 [06:36<04:45,  6.64s/it]

Batch Loss: 56.9550
classification_loss: 4.2456
bbox_loss: 52.3708
objectness_loss: 0.6772


Training:  58%|█████▊    | 58/100 [06:42<04:34,  6.55s/it]

Batch Loss: 158.2253
classification_loss: 3.6758
bbox_loss: 154.3616
objectness_loss: 0.3756


Training:  59%|█████▉    | 59/100 [06:49<04:34,  6.71s/it]

Batch Loss: 153.7366
classification_loss: 4.6319
bbox_loss: 148.9205
objectness_loss: 0.3684


Training:  60%|██████    | 60/100 [06:56<04:30,  6.76s/it]

Batch Loss: 190.1290
classification_loss: 4.6320
bbox_loss: 185.3361
objectness_loss: 0.3218


Training:  61%|██████    | 61/100 [07:03<04:22,  6.73s/it]

Batch Loss: 90.0079
classification_loss: 4.6320
bbox_loss: 85.2465
objectness_loss: 0.2588


Training:  62%|██████▏   | 62/100 [07:09<04:10,  6.60s/it]

Batch Loss: 190.5937
classification_loss: 4.2830
bbox_loss: 186.1542
objectness_loss: 0.3132


Training:  63%|██████▎   | 63/100 [07:16<03:59,  6.48s/it]

Batch Loss: 128.4393
classification_loss: 4.6319
bbox_loss: 123.7025
objectness_loss: 0.2098


Training:  64%|██████▍   | 64/100 [07:22<03:57,  6.60s/it]

Batch Loss: 95.9046
classification_loss: 4.6320
bbox_loss: 91.1474
objectness_loss: 0.2504


Training:  65%|██████▌   | 65/100 [07:29<03:46,  6.48s/it]

Batch Loss: 152.2476
classification_loss: 3.9227
bbox_loss: 148.2553
objectness_loss: 0.1393


Training:  66%|██████▌   | 66/100 [07:35<03:44,  6.60s/it]

Batch Loss: 87.9193
classification_loss: 4.1027
bbox_loss: 83.7282
objectness_loss: 0.1768


Training:  67%|██████▋   | 67/100 [07:42<03:38,  6.63s/it]

Batch Loss: 157.6465
classification_loss: 3.6324
bbox_loss: 153.9575
objectness_loss: 0.1133


Training:  68%|██████▊   | 68/100 [07:49<03:33,  6.66s/it]

Batch Loss: 73.2600
classification_loss: 4.2125
bbox_loss: 68.2764
objectness_loss: 1.5422


Training:  69%|██████▉   | 69/100 [07:56<03:27,  6.70s/it]

Batch Loss: 49.1142
classification_loss: 4.6317
bbox_loss: 44.0111
objectness_loss: 0.9428


Training:  70%|███████   | 70/100 [08:02<03:16,  6.56s/it]

Batch Loss: 84.0615
classification_loss: 4.6320
bbox_loss: 79.2293
objectness_loss: 0.4005


Training:  71%|███████   | 71/100 [08:08<03:07,  6.45s/it]

Batch Loss: 125.0915
classification_loss: 3.7235
bbox_loss: 121.3214
objectness_loss: 0.0933


Training:  72%|███████▏  | 72/100 [08:15<03:00,  6.44s/it]

Batch Loss: 62.2478
classification_loss: 4.0371
bbox_loss: 57.7901
objectness_loss: 0.8412


Training:  73%|███████▎  | 73/100 [08:21<02:51,  6.35s/it]

Batch Loss: 71.9752
classification_loss: 4.1834
bbox_loss: 67.4624
objectness_loss: 0.6588


Training:  74%|███████▍  | 74/100 [08:28<02:49,  6.51s/it]

Batch Loss: 102.7586
classification_loss: 4.6320
bbox_loss: 98.0449
objectness_loss: 0.1635


Training:  75%|███████▌  | 75/100 [08:34<02:43,  6.52s/it]

Batch Loss: 99.9697
classification_loss: 3.6326
bbox_loss: 96.2339
objectness_loss: 0.2063


Training:  76%|███████▌  | 76/100 [08:41<02:37,  6.57s/it]

Batch Loss: 172.4880
classification_loss: 4.6320
bbox_loss: 167.7887
objectness_loss: 0.1347


Training:  77%|███████▋  | 77/100 [08:47<02:28,  6.46s/it]

Batch Loss: 52.7991
classification_loss: 4.6086
bbox_loss: 47.8545
objectness_loss: 0.6721


Training:  78%|███████▊  | 78/100 [08:54<02:22,  6.49s/it]

Batch Loss: 88.4940
classification_loss: 4.6319
bbox_loss: 83.7301
objectness_loss: 0.2638


Training:  79%|███████▉  | 79/100 [09:00<02:16,  6.50s/it]

Batch Loss: 124.6616
classification_loss: 3.6351
bbox_loss: 120.9189
objectness_loss: 0.2151


Training:  80%|████████  | 80/100 [09:07<02:11,  6.55s/it]

Batch Loss: 32.9416
classification_loss: 4.3789
bbox_loss: 27.5427
objectness_loss: 2.0401


Training:  81%|████████  | 81/100 [09:13<02:05,  6.59s/it]

Batch Loss: 106.1152
classification_loss: 3.9341
bbox_loss: 102.0801
objectness_loss: 0.2018


Training:  82%|████████▏ | 82/100 [09:20<01:58,  6.57s/it]

Batch Loss: 173.4708
classification_loss: 4.6319
bbox_loss: 168.7223
objectness_loss: 0.2332


Training:  83%|████████▎ | 83/100 [09:26<01:50,  6.52s/it]

Batch Loss: 98.4155
classification_loss: 4.6320
bbox_loss: 93.4251
objectness_loss: 0.7169


Training:  84%|████████▍ | 84/100 [09:33<01:44,  6.51s/it]

Batch Loss: 38.8637
classification_loss: 3.8656
bbox_loss: 34.3753
objectness_loss: 1.2457


Training:  85%|████████▌ | 85/100 [09:40<01:38,  6.55s/it]

Batch Loss: 67.8724
classification_loss: 4.1288
bbox_loss: 63.6708
objectness_loss: 0.1455


Training:  86%|████████▌ | 86/100 [09:46<01:31,  6.51s/it]

Batch Loss: 120.7659
classification_loss: 4.6320
bbox_loss: 116.0146
objectness_loss: 0.2387


Training:  87%|████████▋ | 87/100 [09:52<01:24,  6.50s/it]

Batch Loss: 129.7110
classification_loss: 4.5121
bbox_loss: 125.0484
objectness_loss: 0.3010


Training:  88%|████████▊ | 88/100 [09:59<01:18,  6.52s/it]

Batch Loss: 68.4454
classification_loss: 4.6320
bbox_loss: 63.6709
objectness_loss: 0.2851


Training:  89%|████████▉ | 89/100 [10:06<01:12,  6.62s/it]

Batch Loss: 62.3708
classification_loss: 4.5289
bbox_loss: 57.6290
objectness_loss: 0.4257


Training:  90%|█████████ | 90/100 [10:12<01:05,  6.51s/it]

Batch Loss: 100.8785
classification_loss: 4.2055
bbox_loss: 96.5358
objectness_loss: 0.2745


Training:  91%|█████████ | 91/100 [10:19<00:59,  6.61s/it]

Batch Loss: 46.1658
classification_loss: 4.3190
bbox_loss: 41.3655
objectness_loss: 0.9627


Training:  92%|█████████▏| 92/100 [10:25<00:52,  6.53s/it]

Batch Loss: 81.2273
classification_loss: 4.6320
bbox_loss: 76.5162
objectness_loss: 0.1582


Training:  93%|█████████▎| 93/100 [10:32<00:46,  6.67s/it]

Batch Loss: 79.7941
classification_loss: 4.6318
bbox_loss: 74.8651
objectness_loss: 0.5943


Training:  94%|█████████▍| 94/100 [10:39<00:39,  6.56s/it]

Batch Loss: 44.7054
classification_loss: 4.6320
bbox_loss: 39.4235
objectness_loss: 1.2998


Training:  95%|█████████▌| 95/100 [10:45<00:32,  6.60s/it]

Batch Loss: 167.4049
classification_loss: 4.6319
bbox_loss: 162.6596
objectness_loss: 0.2268


Training:  96%|█████████▌| 96/100 [10:52<00:26,  6.63s/it]

Batch Loss: 235.1583
classification_loss: 4.6320
bbox_loss: 230.4066
objectness_loss: 0.2394


Training:  97%|█████████▋| 97/100 [10:59<00:19,  6.65s/it]

Batch Loss: 202.4337
classification_loss: 4.6317
bbox_loss: 197.6786
objectness_loss: 0.2468


Training:  98%|█████████▊| 98/100 [11:05<00:13,  6.55s/it]

Batch Loss: 121.3581
classification_loss: 4.6320
bbox_loss: 116.6250
objectness_loss: 0.2023


Training:  99%|█████████▉| 99/100 [11:12<00:06,  6.55s/it]

Batch Loss: 129.3859
classification_loss: 3.8443
bbox_loss: 125.4336
objectness_loss: 0.2161


Training: 100%|██████████| 100/100 [11:18<00:00,  6.57s/it]

Batch Loss: 43.1144
classification_loss: 3.7759
bbox_loss: 38.8127
objectness_loss: 1.0516




Average Training Loss for Epoch 3: 106.9568
Model checkpoint saved at epoch 3 to /content/drive/MyDrive/models/d2det_mini_coco_filtered.pth
Epoch [4/5]


Training:   1%|          | 1/100 [00:15<24:57, 15.13s/it]

Batch Loss: 177.6987
classification_loss: 4.6319
bbox_loss: 172.9628
objectness_loss: 0.2080


Training:   2%|▏         | 2/100 [00:23<18:27, 11.30s/it]

Batch Loss: 136.7027
classification_loss: 4.2457
bbox_loss: 132.3426
objectness_loss: 0.2288


Training:   3%|▎         | 3/100 [00:31<15:23,  9.52s/it]

Batch Loss: 145.7777
classification_loss: 4.6319
bbox_loss: 141.0583
objectness_loss: 0.1750


Training:   4%|▍         | 4/100 [00:37<13:28,  8.43s/it]

Batch Loss: 41.3864
classification_loss: 3.7756
bbox_loss: 37.0629
objectness_loss: 1.0957


Training:   5%|▌         | 5/100 [00:44<12:11,  7.71s/it]

Batch Loss: 81.9701
classification_loss: 4.1491
bbox_loss: 77.4980
objectness_loss: 0.6459


Training:   6%|▌         | 6/100 [00:51<11:42,  7.48s/it]

Batch Loss: 161.2813
classification_loss: 4.6319
bbox_loss: 156.5999
objectness_loss: 0.0990


Training:   7%|▋         | 7/100 [00:57<11:09,  7.19s/it]

Batch Loss: 104.4364
classification_loss: 4.6320
bbox_loss: 99.7211
objectness_loss: 0.1667


Training:   8%|▊         | 8/100 [01:04<10:38,  6.94s/it]

Batch Loss: 55.3890
classification_loss: 3.9289
bbox_loss: 51.0533
objectness_loss: 0.8135


Training:   9%|▉         | 9/100 [01:11<10:27,  6.90s/it]

Batch Loss: 156.8994
classification_loss: 3.6427
bbox_loss: 153.1997
objectness_loss: 0.1140


Training:  10%|█         | 10/100 [01:18<10:21,  6.91s/it]

Batch Loss: 53.9850
classification_loss: 3.8890
bbox_loss: 49.6464
objectness_loss: 0.8991


Training:  11%|█         | 11/100 [01:24<10:10,  6.86s/it]

Batch Loss: 134.9546
classification_loss: 4.6320
bbox_loss: 130.2611
objectness_loss: 0.1231


Training:  12%|█▏        | 12/100 [01:31<10:07,  6.90s/it]

Batch Loss: 155.4849
classification_loss: 3.6355
bbox_loss: 151.7626
objectness_loss: 0.1735


Training:  13%|█▎        | 13/100 [01:38<09:43,  6.71s/it]

Batch Loss: 130.0984
classification_loss: 3.6355
bbox_loss: 126.3604
objectness_loss: 0.2048


Training:  14%|█▍        | 14/100 [01:44<09:33,  6.67s/it]

Batch Loss: 102.5875
classification_loss: 3.9625
bbox_loss: 98.5169
objectness_loss: 0.2162


Training:  15%|█▌        | 15/100 [01:51<09:17,  6.56s/it]

Batch Loss: 100.3979
classification_loss: 3.6323
bbox_loss: 96.5999
objectness_loss: 0.3314


Training:  16%|█▌        | 16/100 [01:58<09:22,  6.70s/it]

Batch Loss: 39.7494
classification_loss: 4.1224
bbox_loss: 35.1288
objectness_loss: 0.9964


Training:  17%|█▋        | 17/100 [02:05<09:22,  6.78s/it]

Batch Loss: 92.8533
classification_loss: 4.2989
bbox_loss: 88.0589
objectness_loss: 0.9910


Training:  18%|█▊        | 18/100 [02:11<09:09,  6.70s/it]

Batch Loss: 63.5876
classification_loss: 4.6320
bbox_loss: 58.4462
objectness_loss: 1.0188


Training:  19%|█▉        | 19/100 [02:18<09:01,  6.69s/it]

Batch Loss: 196.1971
classification_loss: 4.6318
bbox_loss: 191.4665
objectness_loss: 0.1975


Training:  20%|██        | 20/100 [02:25<09:03,  6.80s/it]

Batch Loss: 153.7486
classification_loss: 4.6319
bbox_loss: 148.9674
objectness_loss: 0.2986


Training:  21%|██        | 21/100 [02:32<08:57,  6.80s/it]

Batch Loss: 122.2742
classification_loss: 3.8297
bbox_loss: 118.3820
objectness_loss: 0.1249


Training:  22%|██▏       | 22/100 [02:38<08:52,  6.83s/it]

Batch Loss: 67.0596
classification_loss: 4.1089
bbox_loss: 62.7728
objectness_loss: 0.3558


Training:  23%|██▎       | 23/100 [02:45<08:40,  6.76s/it]

Batch Loss: 184.7279
classification_loss: 4.6319
bbox_loss: 179.9661
objectness_loss: 0.2598


Training:  24%|██▍       | 24/100 [02:52<08:32,  6.75s/it]

Batch Loss: 100.4911
classification_loss: 4.1854
bbox_loss: 96.1321
objectness_loss: 0.3472


Training:  25%|██▌       | 25/100 [02:58<08:20,  6.67s/it]

Batch Loss: 181.7688
classification_loss: 4.6320
bbox_loss: 177.0390
objectness_loss: 0.1958


Training:  26%|██▌       | 26/100 [03:05<08:22,  6.79s/it]

Batch Loss: 51.9185
classification_loss: 4.3088
bbox_loss: 47.2511
objectness_loss: 0.7172


Training:  27%|██▋       | 27/100 [03:12<08:11,  6.73s/it]

Batch Loss: 54.8442
classification_loss: 4.6186
bbox_loss: 49.9257
objectness_loss: 0.5997


Training:  28%|██▊       | 28/100 [03:18<07:59,  6.66s/it]

Batch Loss: 160.3535
classification_loss: 4.3091
bbox_loss: 155.9651
objectness_loss: 0.1587


Training:  29%|██▉       | 29/100 [03:25<07:43,  6.53s/it]

Batch Loss: 79.3550
classification_loss: 4.1021
bbox_loss: 75.0979
objectness_loss: 0.3099


Training:  30%|███       | 30/100 [03:31<07:42,  6.60s/it]

Batch Loss: 208.3890
classification_loss: 4.6319
bbox_loss: 203.6813
objectness_loss: 0.1517


Training:  31%|███       | 31/100 [03:38<07:38,  6.64s/it]

Batch Loss: 129.5799
classification_loss: 4.6320
bbox_loss: 124.8416
objectness_loss: 0.2125


Training:  32%|███▏      | 32/100 [03:45<07:35,  6.70s/it]

Batch Loss: 120.8979
classification_loss: 3.7901
bbox_loss: 117.0198
objectness_loss: 0.1760


Training:  33%|███▎      | 33/100 [03:51<07:22,  6.60s/it]

Batch Loss: 125.0995
classification_loss: 4.6316
bbox_loss: 120.3357
objectness_loss: 0.2644


Training:  34%|███▍      | 34/100 [03:58<07:16,  6.62s/it]

Batch Loss: 84.2687
classification_loss: 3.8691
bbox_loss: 80.3103
objectness_loss: 0.1784


Training:  35%|███▌      | 35/100 [04:04<07:05,  6.55s/it]

Batch Loss: 104.2384
classification_loss: 4.4620
bbox_loss: 99.5188
objectness_loss: 0.5152


Training:  36%|███▌      | 36/100 [04:11<07:00,  6.56s/it]

Batch Loss: 78.5197
classification_loss: 3.6325
bbox_loss: 74.8183
objectness_loss: 0.1378


Training:  37%|███▋      | 37/100 [04:18<07:01,  6.68s/it]

Batch Loss: 98.7047
classification_loss: 4.6320
bbox_loss: 94.0385
objectness_loss: 0.0685


Training:  38%|███▊      | 38/100 [04:25<06:52,  6.66s/it]

Batch Loss: 55.1752
classification_loss: 4.3557
bbox_loss: 50.4077
objectness_loss: 0.8234


Training:  39%|███▉      | 39/100 [04:31<06:50,  6.73s/it]

Batch Loss: 197.6404
classification_loss: 4.6320
bbox_loss: 192.9756
objectness_loss: 0.0656


Training:  40%|████      | 40/100 [04:38<06:45,  6.76s/it]

Batch Loss: 199.9900
classification_loss: 4.6320
bbox_loss: 195.3403
objectness_loss: 0.0354


Training:  41%|████      | 41/100 [04:45<06:31,  6.64s/it]

Batch Loss: 67.9515
classification_loss: 4.6319
bbox_loss: 62.9748
objectness_loss: 0.6897


Training:  42%|████▏     | 42/100 [04:51<06:23,  6.60s/it]

Batch Loss: 239.6039
classification_loss: 4.6319
bbox_loss: 234.9477
objectness_loss: 0.0487


Training:  43%|████▎     | 43/100 [04:58<06:21,  6.69s/it]

Batch Loss: 94.2359
classification_loss: 4.6316
bbox_loss: 89.5424
objectness_loss: 0.1238


Training:  44%|████▍     | 44/100 [05:05<06:18,  6.76s/it]

Batch Loss: 86.3519
classification_loss: 4.5354
bbox_loss: 81.5103
objectness_loss: 0.6124


Training:  45%|████▌     | 45/100 [05:12<06:17,  6.86s/it]

Batch Loss: 175.9130
classification_loss: 3.6326
bbox_loss: 172.2551
objectness_loss: 0.0504


Training:  46%|████▌     | 46/100 [05:19<06:09,  6.85s/it]

Batch Loss: 120.0977
classification_loss: 4.6320
bbox_loss: 115.1753
objectness_loss: 0.5808


Training:  47%|████▋     | 47/100 [05:26<06:05,  6.89s/it]

Batch Loss: 42.5468
classification_loss: 4.2155
bbox_loss: 37.7442
objectness_loss: 1.1742


Training:  48%|████▊     | 48/100 [05:32<05:51,  6.75s/it]

Batch Loss: 127.7161
classification_loss: 4.6320
bbox_loss: 123.0661
objectness_loss: 0.0360


Training:  49%|████▉     | 49/100 [05:39<05:38,  6.64s/it]

Batch Loss: 182.8466
classification_loss: 4.2258
bbox_loss: 178.5515
objectness_loss: 0.1387


Training:  50%|█████     | 50/100 [05:45<05:31,  6.63s/it]

Batch Loss: 68.4728
classification_loss: 4.6320
bbox_loss: 63.3716
objectness_loss: 0.9384


Training:  51%|█████     | 51/100 [05:53<05:35,  6.86s/it]

Batch Loss: 80.2703
classification_loss: 4.6320
bbox_loss: 75.2819
objectness_loss: 0.7130


Training:  52%|█████▏    | 52/100 [05:59<05:25,  6.77s/it]

Batch Loss: 117.4857
classification_loss: 4.6320
bbox_loss: 112.7814
objectness_loss: 0.1447


Training:  53%|█████▎    | 53/100 [06:06<05:20,  6.81s/it]

Batch Loss: 91.2680
classification_loss: 4.1457
bbox_loss: 86.8420
objectness_loss: 0.5606


Training:  54%|█████▍    | 54/100 [06:13<05:08,  6.70s/it]

Batch Loss: 68.5192
classification_loss: 3.9422
bbox_loss: 63.9099
objectness_loss: 1.3342


Training:  55%|█████▌    | 55/100 [06:20<05:04,  6.76s/it]

Batch Loss: 88.1773
classification_loss: 3.6342
bbox_loss: 84.1707
objectness_loss: 0.7448


Training:  56%|█████▌    | 56/100 [06:26<04:55,  6.71s/it]

Batch Loss: 83.4303
classification_loss: 4.1241
bbox_loss: 79.0491
objectness_loss: 0.5143


Training:  57%|█████▋    | 57/100 [06:33<04:56,  6.89s/it]

Batch Loss: 61.5652
classification_loss: 4.1128
bbox_loss: 57.1465
objectness_loss: 0.6118


Training:  58%|█████▊    | 58/100 [06:40<04:47,  6.84s/it]

Batch Loss: 113.0523
classification_loss: 4.6319
bbox_loss: 108.3165
objectness_loss: 0.2077


Training:  59%|█████▉    | 59/100 [06:47<04:42,  6.90s/it]

Batch Loss: 103.6093
classification_loss: 4.6320
bbox_loss: 98.8343
objectness_loss: 0.2859


Training:  60%|██████    | 60/100 [06:54<04:35,  6.89s/it]

Batch Loss: 55.5659
classification_loss: 4.0964
bbox_loss: 51.1078
objectness_loss: 0.7232


Training:  61%|██████    | 61/100 [07:01<04:24,  6.79s/it]

Batch Loss: 142.9275
classification_loss: 3.6325
bbox_loss: 139.1425
objectness_loss: 0.3050


Training:  62%|██████▏   | 62/100 [07:07<04:17,  6.78s/it]

Batch Loss: 63.5603
classification_loss: 3.6325
bbox_loss: 59.6361
objectness_loss: 0.5834


Training:  63%|██████▎   | 63/100 [07:14<04:12,  6.83s/it]

Batch Loss: 161.6328
classification_loss: 4.6316
bbox_loss: 156.8616
objectness_loss: 0.2792


Training:  64%|██████▍   | 64/100 [07:21<04:02,  6.74s/it]

Batch Loss: 101.3293
classification_loss: 4.6319
bbox_loss: 96.5452
objectness_loss: 0.3045


Training:  65%|██████▌   | 65/100 [07:28<03:57,  6.80s/it]

Batch Loss: 86.1816
classification_loss: 3.8758
bbox_loss: 82.1512
objectness_loss: 0.3093


Training:  66%|██████▌   | 66/100 [07:34<03:47,  6.69s/it]

Batch Loss: 154.3821
classification_loss: 3.7723
bbox_loss: 150.4829
objectness_loss: 0.2539


Training:  67%|██████▋   | 67/100 [07:41<03:40,  6.67s/it]

Batch Loss: 130.1096
classification_loss: 4.6320
bbox_loss: 125.3599
objectness_loss: 0.2356


Training:  68%|██████▊   | 68/100 [07:47<03:33,  6.66s/it]

Batch Loss: 41.3967
classification_loss: 4.6318
bbox_loss: 36.0243
objectness_loss: 1.4811


Training:  69%|██████▉   | 69/100 [07:54<03:27,  6.70s/it]

Batch Loss: 65.8618
classification_loss: 4.6320
bbox_loss: 60.9846
objectness_loss: 0.4905


Training:  70%|███████   | 70/100 [08:01<03:18,  6.60s/it]

Batch Loss: 100.2462
classification_loss: 4.6318
bbox_loss: 95.3014
objectness_loss: 0.6261


Training:  71%|███████   | 71/100 [08:07<03:13,  6.67s/it]

Batch Loss: 51.6461
classification_loss: 3.6326
bbox_loss: 47.1619
objectness_loss: 1.7032


Training:  72%|███████▏  | 72/100 [08:14<03:06,  6.68s/it]

Batch Loss: 65.2618
classification_loss: 4.5091
bbox_loss: 60.3454
objectness_loss: 0.8147


Training:  73%|███████▎  | 73/100 [08:21<02:59,  6.65s/it]

Batch Loss: 128.8236
classification_loss: 3.6333
bbox_loss: 125.0208
objectness_loss: 0.3392


Training:  74%|███████▍  | 74/100 [08:28<02:53,  6.68s/it]

Batch Loss: 92.9272
classification_loss: 4.6320
bbox_loss: 88.1479
objectness_loss: 0.2948


Training:  75%|███████▌  | 75/100 [08:34<02:48,  6.73s/it]

Batch Loss: 117.3978
classification_loss: 3.6325
bbox_loss: 113.6228
objectness_loss: 0.2850


Training:  76%|███████▌  | 76/100 [08:41<02:39,  6.66s/it]

Batch Loss: 84.1664
classification_loss: 4.6320
bbox_loss: 79.3384
objectness_loss: 0.3921


Training:  77%|███████▋  | 77/100 [08:47<02:32,  6.64s/it]

Batch Loss: 74.3117
classification_loss: 3.6322
bbox_loss: 70.0504
objectness_loss: 1.2580


Training:  78%|███████▊  | 78/100 [08:54<02:24,  6.57s/it]

Batch Loss: 121.1313
classification_loss: 4.6320
bbox_loss: 116.3171
objectness_loss: 0.3644


Training:  79%|███████▉  | 79/100 [09:00<02:17,  6.56s/it]

Batch Loss: 116.3640
classification_loss: 3.8111
bbox_loss: 112.3669
objectness_loss: 0.3720


Training:  80%|████████  | 80/100 [09:07<02:10,  6.53s/it]

Batch Loss: 93.1689
classification_loss: 3.9392
bbox_loss: 88.9442
objectness_loss: 0.5712


Training:  81%|████████  | 81/100 [09:13<02:02,  6.46s/it]

Batch Loss: 177.8905
classification_loss: 4.6320
bbox_loss: 173.0745
objectness_loss: 0.3679


Training:  82%|████████▏ | 82/100 [09:20<01:56,  6.44s/it]

Batch Loss: 89.6142
classification_loss: 3.6324
bbox_loss: 85.5170
objectness_loss: 0.9297


Training:  83%|████████▎ | 83/100 [09:26<01:50,  6.49s/it]

Batch Loss: 27.6441
classification_loss: 4.6320
bbox_loss: 22.4407
objectness_loss: 1.1429


Training:  84%|████████▍ | 84/100 [09:33<01:46,  6.67s/it]

Batch Loss: 86.2490
classification_loss: 4.6319
bbox_loss: 81.4066
objectness_loss: 0.4210


Training:  85%|████████▌ | 85/100 [09:40<01:39,  6.65s/it]

Batch Loss: 54.0082
classification_loss: 4.6320
bbox_loss: 49.0490
objectness_loss: 0.6543


Training:  86%|████████▌ | 86/100 [09:47<01:34,  6.76s/it]

Batch Loss: 102.7372
classification_loss: 4.6320
bbox_loss: 97.8964
objectness_loss: 0.4177


Training:  87%|████████▋ | 87/100 [09:54<01:28,  6.77s/it]

Batch Loss: 156.0573
classification_loss: 4.6319
bbox_loss: 151.2457
objectness_loss: 0.3593


Training:  88%|████████▊ | 88/100 [10:00<01:21,  6.76s/it]

Batch Loss: 89.6909
classification_loss: 4.6320
bbox_loss: 84.7551
objectness_loss: 0.6075


Training:  89%|████████▉ | 89/100 [10:07<01:13,  6.72s/it]

Batch Loss: 74.9871
classification_loss: 3.6691
bbox_loss: 71.1245
objectness_loss: 0.3870


Training:  90%|█████████ | 90/100 [10:14<01:08,  6.84s/it]

Batch Loss: 68.2253
classification_loss: 4.6320
bbox_loss: 63.3704
objectness_loss: 0.4459


Training:  91%|█████████ | 91/100 [10:21<01:01,  6.80s/it]

Batch Loss: 23.7016
classification_loss: 4.6020
bbox_loss: 18.4687
objectness_loss: 1.2619


Training:  92%|█████████▏| 92/100 [10:28<00:55,  6.90s/it]

Batch Loss: 88.5728
classification_loss: 4.6319
bbox_loss: 83.7400
objectness_loss: 0.4016


Training:  93%|█████████▎| 93/100 [10:35<00:47,  6.85s/it]

Batch Loss: 29.2790
classification_loss: 3.8358
bbox_loss: 24.9374
objectness_loss: 1.0116


Training:  94%|█████████▍| 94/100 [10:42<00:41,  6.88s/it]

Batch Loss: 222.7296
classification_loss: 4.6320
bbox_loss: 217.9150
objectness_loss: 0.3652


Training:  95%|█████████▌| 95/100 [10:49<00:34,  6.91s/it]

Batch Loss: 133.4438
classification_loss: 4.6319
bbox_loss: 128.6084
objectness_loss: 0.4071


Training:  96%|█████████▌| 96/100 [10:55<00:27,  6.88s/it]

Batch Loss: 106.6806
classification_loss: 4.6320
bbox_loss: 101.8761
objectness_loss: 0.3449


Training:  97%|█████████▋| 97/100 [11:02<00:19,  6.66s/it]

Batch Loss: 96.7077
classification_loss: 4.2521
bbox_loss: 92.2716
objectness_loss: 0.3679


Training:  98%|█████████▊| 98/100 [11:08<00:13,  6.63s/it]

Batch Loss: 85.9505
classification_loss: 4.1087
bbox_loss: 81.6659
objectness_loss: 0.3518


Training:  99%|█████████▉| 99/100 [11:15<00:06,  6.66s/it]

Batch Loss: 135.6471
classification_loss: 4.6320
bbox_loss: 130.7444
objectness_loss: 0.5414


Training: 100%|██████████| 100/100 [11:22<00:00,  6.73s/it]

Batch Loss: 89.3431
classification_loss: 4.0527
bbox_loss: 85.0863
objectness_loss: 0.4081




Average Training Loss for Epoch 4: 106.9115
Model checkpoint saved at epoch 4 to /content/drive/MyDrive/models/d2det_mini_coco_filtered.pth
Epoch [5/5]


Training:   1%|          | 1/100 [00:15<25:08, 15.24s/it]

Batch Loss: 90.4074
classification_loss: 3.8291
bbox_loss: 86.4381
objectness_loss: 0.2803


Training:   2%|▏         | 2/100 [00:23<18:22, 11.25s/it]

Batch Loss: 189.6359
classification_loss: 4.6320
bbox_loss: 184.8716
objectness_loss: 0.2647


Training:   3%|▎         | 3/100 [00:30<15:08,  9.37s/it]

Batch Loss: 83.4993
classification_loss: 4.6320
bbox_loss: 78.7159
objectness_loss: 0.3029


Training:   4%|▍         | 4/100 [00:37<13:31,  8.45s/it]

Batch Loss: 124.7670
classification_loss: 4.6320
bbox_loss: 120.0229
objectness_loss: 0.2243


Training:   5%|▌         | 5/100 [00:44<12:10,  7.69s/it]

Batch Loss: 115.9902
classification_loss: 4.6320
bbox_loss: 111.2149
objectness_loss: 0.2868


Training:   6%|▌         | 6/100 [00:50<11:32,  7.36s/it]

Batch Loss: 62.3477
classification_loss: 4.6320
bbox_loss: 57.1737
objectness_loss: 1.0840


Training:   7%|▋         | 7/100 [00:57<11:09,  7.20s/it]

Batch Loss: 70.0228
classification_loss: 4.6320
bbox_loss: 65.0364
objectness_loss: 0.7088


Training:   8%|▊         | 8/100 [01:04<10:50,  7.07s/it]

Batch Loss: 171.0768
classification_loss: 4.2121
bbox_loss: 166.7634
objectness_loss: 0.2027


Training:   9%|▉         | 9/100 [01:11<10:25,  6.87s/it]

Batch Loss: 93.6816
classification_loss: 4.6320
bbox_loss: 88.9483
objectness_loss: 0.2028


Training:  10%|█         | 10/100 [01:17<10:12,  6.80s/it]

Batch Loss: 53.8907
classification_loss: 3.9688
bbox_loss: 49.3917
objectness_loss: 1.0604


Training:  11%|█         | 11/100 [01:24<10:01,  6.76s/it]

Batch Loss: 72.5968
classification_loss: 4.6320
bbox_loss: 67.8186
objectness_loss: 0.2924


Training:  12%|█▏        | 12/100 [01:31<09:54,  6.75s/it]

Batch Loss: 84.9426
classification_loss: 4.6320
bbox_loss: 80.2545
objectness_loss: 0.1122


Training:  13%|█▎        | 13/100 [01:37<09:50,  6.79s/it]

Batch Loss: 42.4102
classification_loss: 3.8322
bbox_loss: 38.0845
objectness_loss: 0.9870


Training:  14%|█▍        | 14/100 [01:44<09:45,  6.81s/it]

Batch Loss: 46.7327
classification_loss: 4.6320
bbox_loss: 41.0944
objectness_loss: 2.0126


Training:  15%|█▌        | 15/100 [01:51<09:32,  6.74s/it]

Batch Loss: 39.1760
classification_loss: 4.0088
bbox_loss: 34.4166
objectness_loss: 1.5012


Training:  16%|█▌        | 16/100 [01:58<09:31,  6.80s/it]

Batch Loss: 194.4349
classification_loss: 4.5820
bbox_loss: 189.7629
objectness_loss: 0.1800


Training:  17%|█▋        | 17/100 [02:04<09:21,  6.76s/it]

Batch Loss: 119.3365
classification_loss: 4.6319
bbox_loss: 114.5920
objectness_loss: 0.2253


Training:  18%|█▊        | 18/100 [02:11<09:05,  6.65s/it]

Batch Loss: 144.4320
classification_loss: 4.6320
bbox_loss: 139.6634
objectness_loss: 0.2734


Training:  19%|█▉        | 19/100 [02:17<08:53,  6.58s/it]

Batch Loss: 155.3099
classification_loss: 3.6324
bbox_loss: 151.5471
objectness_loss: 0.2607


Training:  20%|██        | 20/100 [02:24<08:37,  6.47s/it]

Batch Loss: 56.5071
classification_loss: 4.5554
bbox_loss: 51.5953
objectness_loss: 0.7131


Training:  21%|██        | 21/100 [02:30<08:35,  6.52s/it]

Batch Loss: 27.1603
classification_loss: 4.6319
bbox_loss: 21.7510
objectness_loss: 1.5546


Training:  22%|██▏       | 22/100 [02:37<08:24,  6.47s/it]

Batch Loss: 33.8638
classification_loss: 3.6334
bbox_loss: 29.8421
objectness_loss: 0.7765


Training:  23%|██▎       | 23/100 [02:43<08:24,  6.55s/it]

Batch Loss: 91.9881
classification_loss: 4.4787
bbox_loss: 87.3442
objectness_loss: 0.3305


Training:  24%|██▍       | 24/100 [02:50<08:28,  6.69s/it]

Batch Loss: 253.0488
classification_loss: 4.6320
bbox_loss: 248.2488
objectness_loss: 0.3360


Training:  25%|██▌       | 25/100 [02:57<08:20,  6.68s/it]

Batch Loss: 107.9207
classification_loss: 4.6320
bbox_loss: 103.1165
objectness_loss: 0.3446


Training:  26%|██▌       | 26/100 [03:03<08:11,  6.64s/it]

Batch Loss: 119.4879
classification_loss: 4.6319
bbox_loss: 114.6748
objectness_loss: 0.3625


Training:  27%|██▋       | 27/100 [03:10<08:03,  6.63s/it]

Batch Loss: 150.3690
classification_loss: 4.6320
bbox_loss: 145.5619
objectness_loss: 0.3502


Training:  28%|██▊       | 28/100 [03:17<07:53,  6.58s/it]

Batch Loss: 153.1079
classification_loss: 3.6333
bbox_loss: 149.2785
objectness_loss: 0.3922


Training:  29%|██▉       | 29/100 [03:24<07:58,  6.75s/it]

Batch Loss: 139.8893
classification_loss: 4.6319
bbox_loss: 135.0459
objectness_loss: 0.4229


Training:  30%|███       | 30/100 [03:30<07:48,  6.69s/it]

Batch Loss: 65.1380
classification_loss: 4.6320
bbox_loss: 60.3225
objectness_loss: 0.3670


Training:  31%|███       | 31/100 [03:37<07:46,  6.76s/it]

Batch Loss: 50.5632
classification_loss: 4.6320
bbox_loss: 45.7478
objectness_loss: 0.3669


Training:  32%|███▏      | 32/100 [03:44<07:35,  6.70s/it]

Batch Loss: 192.7935
classification_loss: 4.6320
bbox_loss: 188.0026
objectness_loss: 0.3178


Training:  33%|███▎      | 33/100 [03:50<07:24,  6.63s/it]

Batch Loss: 22.3998
classification_loss: 4.3294
bbox_loss: 17.4758
objectness_loss: 1.1891


Training:  34%|███▍      | 34/100 [03:57<07:16,  6.61s/it]

Batch Loss: 78.9453
classification_loss: 4.6320
bbox_loss: 74.1757
objectness_loss: 0.2751


Training:  35%|███▌      | 35/100 [04:03<07:02,  6.50s/it]

Batch Loss: 117.8234
classification_loss: 4.6319
bbox_loss: 113.0143
objectness_loss: 0.3544


Training:  36%|███▌      | 36/100 [04:09<06:48,  6.38s/it]

Batch Loss: 100.6720
classification_loss: 3.6324
bbox_loss: 96.8933
objectness_loss: 0.2926


Training:  37%|███▋      | 37/100 [04:16<06:42,  6.40s/it]

Batch Loss: 45.2158
classification_loss: 4.6320
bbox_loss: 39.8450
objectness_loss: 1.4776


Training:  38%|███▊      | 38/100 [04:22<06:39,  6.44s/it]

Batch Loss: 39.9302
classification_loss: 4.6320
bbox_loss: 35.1575
objectness_loss: 0.2815


Training:  39%|███▉      | 39/100 [04:29<06:34,  6.47s/it]

Batch Loss: 155.6794
classification_loss: 4.6320
bbox_loss: 150.9141
objectness_loss: 0.2665


Training:  40%|████      | 40/100 [04:35<06:29,  6.49s/it]

Batch Loss: 237.8544
classification_loss: 4.6320
bbox_loss: 233.0836
objectness_loss: 0.2777


Training:  41%|████      | 41/100 [04:41<06:17,  6.39s/it]

Batch Loss: 47.2256
classification_loss: 4.6319
bbox_loss: 42.1246
objectness_loss: 0.9381


Training:  42%|████▏     | 42/100 [04:48<06:14,  6.45s/it]

Batch Loss: 132.9726
classification_loss: 3.6368
bbox_loss: 129.1698
objectness_loss: 0.3320


Training:  43%|████▎     | 43/100 [04:54<06:06,  6.43s/it]

Batch Loss: 120.1554
classification_loss: 4.6320
bbox_loss: 115.3759
objectness_loss: 0.2950


Training:  44%|████▍     | 44/100 [05:01<06:05,  6.53s/it]

Batch Loss: 81.8587
classification_loss: 4.6320
bbox_loss: 77.1180
objectness_loss: 0.2175


Training:  45%|████▌     | 45/100 [05:07<05:56,  6.49s/it]

Batch Loss: 177.4632
classification_loss: 4.6285
bbox_loss: 172.6951
objectness_loss: 0.2792


Training:  46%|████▌     | 46/100 [05:14<05:54,  6.57s/it]

Batch Loss: 29.8169
classification_loss: 4.4254
bbox_loss: 24.6421
objectness_loss: 1.4987


Training:  47%|████▋     | 47/100 [05:21<05:48,  6.58s/it]

Batch Loss: 163.3973
classification_loss: 3.6322
bbox_loss: 159.6662
objectness_loss: 0.1978


Training:  48%|████▊     | 48/100 [05:28<05:45,  6.65s/it]

Batch Loss: 195.8842
classification_loss: 4.6320
bbox_loss: 191.1471
objectness_loss: 0.2104


Training:  49%|████▉     | 49/100 [05:34<05:34,  6.55s/it]

Batch Loss: 129.9991
classification_loss: 3.6322
bbox_loss: 126.2691
objectness_loss: 0.1956


Training:  50%|█████     | 50/100 [05:40<05:26,  6.54s/it]

Batch Loss: 227.2966
classification_loss: 4.6320
bbox_loss: 222.5682
objectness_loss: 0.1928


Training:  51%|█████     | 51/100 [05:47<05:17,  6.47s/it]

Batch Loss: 178.7746
classification_loss: 4.6320
bbox_loss: 174.0567
objectness_loss: 0.1719


Training:  52%|█████▏    | 52/100 [05:53<05:10,  6.47s/it]

Batch Loss: 94.4772
classification_loss: 3.6322
bbox_loss: 90.1417
objectness_loss: 1.4065


Training:  53%|█████▎    | 53/100 [06:00<05:09,  6.59s/it]

Batch Loss: 101.0782
classification_loss: 4.2389
bbox_loss: 96.6447
objectness_loss: 0.3893


Training:  54%|█████▍    | 54/100 [06:07<05:03,  6.60s/it]

Batch Loss: 90.4272
classification_loss: 4.6320
bbox_loss: 85.3238
objectness_loss: 0.9429


Training:  55%|█████▌    | 55/100 [06:13<04:56,  6.60s/it]

Batch Loss: 69.7389
classification_loss: 4.1762
bbox_loss: 65.3020
objectness_loss: 0.5216


Training:  56%|█████▌    | 56/100 [06:20<04:53,  6.66s/it]

Batch Loss: 44.9078
classification_loss: 4.2788
bbox_loss: 39.8929
objectness_loss: 1.4721


Training:  57%|█████▋    | 57/100 [06:26<04:43,  6.58s/it]

Batch Loss: 159.7945
classification_loss: 3.6324
bbox_loss: 156.0473
objectness_loss: 0.2296


Training:  58%|█████▊    | 58/100 [06:33<04:33,  6.51s/it]

Batch Loss: 132.7986
classification_loss: 4.6320
bbox_loss: 128.0457
objectness_loss: 0.2418


Training:  59%|█████▉    | 59/100 [06:40<04:29,  6.58s/it]

Batch Loss: 154.2711
classification_loss: 4.6318
bbox_loss: 149.5008
objectness_loss: 0.2771


Training:  60%|██████    | 60/100 [06:46<04:24,  6.62s/it]

Batch Loss: 64.9817
classification_loss: 3.6322
bbox_loss: 61.2189
objectness_loss: 0.2611


Training:  61%|██████    | 61/100 [06:53<04:16,  6.58s/it]

Batch Loss: 129.9604
classification_loss: 3.9823
bbox_loss: 125.8633
objectness_loss: 0.2295


Training:  62%|██████▏   | 62/100 [06:59<04:08,  6.54s/it]

Batch Loss: 112.6756
classification_loss: 4.6319
bbox_loss: 107.9152
objectness_loss: 0.2568


Training:  63%|██████▎   | 63/100 [07:06<04:06,  6.67s/it]

Batch Loss: 75.5986
classification_loss: 4.6320
bbox_loss: 70.7255
objectness_loss: 0.4822


Training:  64%|██████▍   | 64/100 [07:13<03:57,  6.60s/it]

Batch Loss: 113.3891
classification_loss: 4.6320
bbox_loss: 108.6362
objectness_loss: 0.2418


Training:  65%|██████▌   | 65/100 [07:19<03:52,  6.65s/it]

Batch Loss: 87.1809
classification_loss: 3.9327
bbox_loss: 83.1183
objectness_loss: 0.2598


Training:  66%|██████▌   | 66/100 [07:26<03:45,  6.63s/it]

Batch Loss: 86.8329
classification_loss: 4.6320
bbox_loss: 82.1163
objectness_loss: 0.1692


Training:  67%|██████▋   | 67/100 [07:33<03:41,  6.71s/it]

Batch Loss: 93.9474
classification_loss: 4.4654
bbox_loss: 89.1611
objectness_loss: 0.6418


Training:  68%|██████▊   | 68/100 [07:39<03:30,  6.59s/it]

Batch Loss: 73.2521
classification_loss: 3.7366
bbox_loss: 69.1343
objectness_loss: 0.7624


Training:  69%|██████▉   | 69/100 [07:46<03:23,  6.58s/it]

Batch Loss: 81.6281
classification_loss: 3.7822
bbox_loss: 77.6479
objectness_loss: 0.3959


Training:  70%|███████   | 70/100 [07:52<03:17,  6.59s/it]

Batch Loss: 106.9983
classification_loss: 3.7122
bbox_loss: 103.1935
objectness_loss: 0.1851


Training:  71%|███████   | 71/100 [07:59<03:11,  6.61s/it]

Batch Loss: 54.7249
classification_loss: 4.6320
bbox_loss: 49.0207
objectness_loss: 2.1445


Training:  72%|███████▏  | 72/100 [08:05<03:02,  6.51s/it]

Batch Loss: 190.7426
classification_loss: 3.6322
bbox_loss: 187.0427
objectness_loss: 0.1353


Training:  73%|███████▎  | 73/100 [08:12<02:57,  6.58s/it]

Batch Loss: 57.7977
classification_loss: 4.6320
bbox_loss: 52.7120
objectness_loss: 0.9074


Training:  74%|███████▍  | 74/100 [08:18<02:46,  6.41s/it]

Batch Loss: 78.1952
classification_loss: 4.6319
bbox_loss: 73.4028
objectness_loss: 0.3210


Training:  75%|███████▌  | 75/100 [08:25<02:42,  6.49s/it]

Batch Loss: 62.9730
classification_loss: 4.2621
bbox_loss: 58.6295
objectness_loss: 0.1627


Training:  76%|███████▌  | 76/100 [08:31<02:37,  6.54s/it]

Batch Loss: 105.2487
classification_loss: 4.6319
bbox_loss: 100.5137
objectness_loss: 0.2061


Training:  77%|███████▋  | 77/100 [08:38<02:30,  6.55s/it]

Batch Loss: 217.5347
classification_loss: 4.6320
bbox_loss: 212.7896
objectness_loss: 0.2262


Training:  78%|███████▊  | 78/100 [08:45<02:24,  6.56s/it]

Batch Loss: 64.2497
classification_loss: 3.6323
bbox_loss: 60.2609
objectness_loss: 0.7129


Training:  79%|███████▉  | 79/100 [08:51<02:16,  6.51s/it]

Batch Loss: 108.5610
classification_loss: 4.6320
bbox_loss: 103.8142
objectness_loss: 0.2297


Training:  80%|████████  | 80/100 [08:58<02:10,  6.53s/it]

Batch Loss: 25.2944
classification_loss: 3.6322
bbox_loss: 20.9746
objectness_loss: 1.3753


Training:  81%|████████  | 81/100 [09:04<02:03,  6.51s/it]

Batch Loss: 118.2774
classification_loss: 3.6327
bbox_loss: 114.5163
objectness_loss: 0.2569


Training:  82%|████████▏ | 82/100 [09:10<01:56,  6.50s/it]

Batch Loss: 64.2564
classification_loss: 3.7655
bbox_loss: 60.0478
objectness_loss: 0.8863


Training:  83%|████████▎ | 83/100 [09:17<01:49,  6.46s/it]

Batch Loss: 252.3279
classification_loss: 4.6320
bbox_loss: 247.5658
objectness_loss: 0.2601


Training:  84%|████████▍ | 84/100 [09:23<01:43,  6.48s/it]

Batch Loss: 113.0154
classification_loss: 3.6322
bbox_loss: 109.2649
objectness_loss: 0.2366


Training:  85%|████████▌ | 85/100 [09:30<01:37,  6.50s/it]

Batch Loss: 127.2174
classification_loss: 4.6320
bbox_loss: 122.4576
objectness_loss: 0.2556


Training:  86%|████████▌ | 86/100 [09:37<01:33,  6.68s/it]

Batch Loss: 81.8933
classification_loss: 4.0160
bbox_loss: 77.7616
objectness_loss: 0.2313


Training:  87%|████████▋ | 87/100 [09:43<01:25,  6.54s/it]

Batch Loss: 42.4330
classification_loss: 4.2787
bbox_loss: 37.7097
objectness_loss: 0.8892


Training:  88%|████████▊ | 88/100 [09:50<01:19,  6.63s/it]

Batch Loss: 172.4410
classification_loss: 4.6316
bbox_loss: 167.6755
objectness_loss: 0.2679


Training:  89%|████████▉ | 89/100 [09:56<01:11,  6.52s/it]

Batch Loss: 42.8437
classification_loss: 4.2254
bbox_loss: 37.9431
objectness_loss: 1.3504


Training:  90%|█████████ | 90/100 [10:03<01:06,  6.62s/it]

Batch Loss: 186.0515
classification_loss: 4.6320
bbox_loss: 181.3110
objectness_loss: 0.2169


Training:  91%|█████████ | 91/100 [10:10<00:59,  6.58s/it]

Batch Loss: 91.4152
classification_loss: 4.6320
bbox_loss: 86.6002
objectness_loss: 0.3661


Training:  92%|█████████▏| 92/100 [10:17<00:53,  6.71s/it]

Batch Loss: 53.4941
classification_loss: 4.5953
bbox_loss: 48.5477
objectness_loss: 0.7022


Training:  93%|█████████▎| 93/100 [10:23<00:46,  6.66s/it]

Batch Loss: 61.3102
classification_loss: 4.4287
bbox_loss: 56.3916
objectness_loss: 0.9797


Training:  94%|█████████▍| 94/100 [10:30<00:39,  6.56s/it]

Batch Loss: 54.7786
classification_loss: 3.7428
bbox_loss: 50.5272
objectness_loss: 1.0173


Training:  95%|█████████▌| 95/100 [10:36<00:32,  6.56s/it]

Batch Loss: 43.0480
classification_loss: 4.6320
bbox_loss: 37.9470
objectness_loss: 0.9381


Training:  96%|█████████▌| 96/100 [10:45<00:29,  7.38s/it]

Batch Loss: 124.8510
classification_loss: 4.6320
bbox_loss: 119.9211
objectness_loss: 0.5958


Training:  97%|█████████▋| 97/100 [10:52<00:21,  7.17s/it]

Batch Loss: 131.7678
classification_loss: 4.6319
bbox_loss: 126.9917
objectness_loss: 0.2884


Training:  98%|█████████▊| 98/100 [10:59<00:14,  7.01s/it]

Batch Loss: 100.3252
classification_loss: 3.6327
bbox_loss: 96.4227
objectness_loss: 0.5396


Training:  99%|█████████▉| 99/100 [11:05<00:06,  6.82s/it]

Batch Loss: 192.8749
classification_loss: 4.1688
bbox_loss: 188.5709
objectness_loss: 0.2703


Training: 100%|██████████| 100/100 [11:12<00:00,  6.78s/it]

Batch Loss: 119.4818
classification_loss: 3.8554
bbox_loss: 115.4914
objectness_loss: 0.2700




Average Training Loss for Epoch 5: 106.9326
Model checkpoint saved at epoch 5 to /content/drive/MyDrive/models/d2det_mini_coco_filtered.pth
Training Complete.


In [199]:
test_dataloader = DataLoader(
    test_dataset, batch_size=1, shuffle=False, collate_fn=lambda x: tuple(zip(*x)), num_workers=4
)



# Assuming test_dataloader is already defined
device = 'cuda' if torch.cuda.is_available() else 'cpu'
trained_model.eval()  # Set model to evaluation mode

trained_model.to(device)

# Run the evaluation

Evaluating: 100%|██████████| 10/10 [00:25<00:00,  2.57s/it]


[{'image_id': 398817,
  'pred_boxes': array([132.77568,  89.62016, 256.     , 256.     ], dtype=float32),
  'pred_classes': array([1.73551644e-08, 9.99852657e-01, 3.60591957e-06, 3.30796029e-05,
         7.72039812e-07, 1.05623190e-08, 7.83258791e-09, 8.17727965e-08,
         9.18944897e-06, 2.64535515e-06, 1.83781094e-06, 2.01284251e-08,
         7.48974838e-09, 2.10991988e-08, 1.15451657e-08, 8.94029597e-07,
         1.53025576e-05, 4.39868586e-07, 1.32528749e-05, 8.75877504e-09,
         5.45046186e-09, 7.21599406e-07, 1.07150981e-07, 2.04878188e-06,
         5.81008450e-08, 8.45951490e-07, 2.13916440e-08, 2.61368047e-08,
         6.03107519e-09, 1.70278618e-08, 1.73935728e-08, 3.26563345e-08,
         4.81294338e-09, 2.35156445e-08, 5.46524674e-08, 2.21032082e-08,
         4.00283255e-08, 6.90312163e-06, 1.23555752e-08, 2.51899444e-08,
         3.45533493e-07, 3.03449997e-06, 1.76190653e-07, 7.21704367e-08,
         3.34858498e-07, 1.59742441e-08, 6.19538127e-08, 7.84768019e-08,
  

In [159]:
import torch
from tqdm import tqdm

def evaluate_model(model, dataloader, device='cuda'):
    """
    Evaluate the model on the test dataset.

    Args:
        model (torch.nn.Module): The trained model to evaluate.
        dataloader (torch.utils.data.DataLoader): Dataloader for the test dataset.
        device (str): Device to run the evaluation on, 'cuda' or 'cpu'.

    Returns:
        list: List of dictionaries containing predictions and ground truth for each object.
    """
    model.eval()
    results = []  # List to store results for each object

    with torch.no_grad():
        for images, targets in tqdm(dataloader, desc="Evaluating"):
            # Move images to the device and stack them into a batch
            images = [img.to(device) for img in images]
            images = torch.stack(images)

            # Forward pass through the model
            predictions = model(images)

            # Iterate over the batch to match predictions with targets
            for i, target_list in enumerate(targets):  # `target_list` contains objects for each image in the batch
                for obj_target in target_list:  # Each `obj_target` is a dictionary with target annotations
                    result = {
                        'image_id': obj_target.get('image_id'),
                        'pred_boxes': predictions['refined_boxes'][i].cpu().numpy(),
                        'pred_classes': predictions['class_probs'][i].cpu().numpy(),
                        # Uncomment if using masks:
                        # 'pred_masks': predictions['masks'][i].cpu().numpy() if 'masks' in predictions else None
                        'gt_boxes': obj_target.get('bbox', None),
                        'gt_classes': obj_target.get('category_id', None),
                        # Uncomment if ground truth masks are present:
                        # 'gt_masks': obj_target.get('segmentation', None)
                    }
                    results.append(result)

    return results

# Run evaluation
results = evaluate_model(trained_model, test_dataloader, device=device)
print("Evaluation Results:", results)


Evaluating: 100%|██████████| 10/10 [00:24<00:00,  2.47s/it]

Evaluation Results: [{'image_id': 398817, 'pred_boxes': array([ 63.45824 ,  41.634506, 256.      , 256.      ], dtype=float32), 'pred_classes': array([2.7345439e-07, 9.9982470e-01, 6.0674561e-06, 1.9087955e-05,
       1.1514463e-06, 1.3456014e-07, 1.6320301e-07, 3.8866298e-07,
       2.2977922e-05, 2.2825593e-06, 3.5829755e-06, 1.7608478e-07,
       1.2467659e-07, 2.4081365e-07, 1.3468762e-07, 2.6555635e-06,
       1.0008315e-05, 1.6766248e-06, 1.5539072e-05, 2.3387409e-07,
       1.7417113e-07, 1.7711407e-06, 8.3071563e-07, 2.3869400e-06,
       4.4469246e-07, 1.0273125e-06, 3.7101196e-07, 1.9540221e-07,
       1.8202914e-07, 1.7543442e-07, 1.9038214e-07, 4.7287045e-07,
       1.7032578e-07, 2.4537951e-07, 2.6946455e-07, 2.0319816e-07,
       6.0392921e-07, 7.9256442e-06, 1.9358191e-07, 2.9934810e-07,
       1.1489037e-06, 2.1762955e-06, 5.8459631e-07, 9.8498754e-07,
       1.4898122e-06, 2.1592389e-07, 1.0713599e-06, 6.6073966e-07,
       5.8628689e-07, 1.1557628e-07, 2.2236817e-07, 




In [198]:
import numpy as np

def calculate_map(results, iou_thresholds=[0.5, 0.75]):
    """
    Calculate mean average precision (mAP) across multiple IoU thresholds.

    Args:
        results (list of dict): List of dictionaries containing `pred_boxes`, `pred_classes`, `gt_boxes`, and `gt_classes`.
        iou_thresholds (list of float): List of IoU thresholds to calculate AP.

    Returns:
        dict: Dictionary containing `mAP` and `AP_per_threshold` for each IoU threshold.
    """
    # Initialize variables to store AP results per threshold
    ap_per_threshold = []

    for iou_threshold in iou_thresholds:
        all_precisions = []
        all_recalls = []

        for result in results:
            pred_boxes = np.array(result['pred_boxes'])
            pred_classes = np.array(result['pred_classes'])
            gt_boxes = np.array(result['gt_boxes'])
            gt_classes = np.array(result['gt_classes'])

            # Calculate IoU for each pair of predicted and ground-truth boxes
            #print(gt_classes)
            ious = calculate_iou(pred_boxes, gt_boxes)

            # Match predictions to ground truth based on IoU threshold
            precision, recall = calculate_precision_recall(pred_classes, gt_classes, ious, iou_threshold)
            all_precisions.append(precision)
            all_recalls.append(recall)

        # Compute AP for this IoU threshold
        ap = compute_ap(all_precisions, all_recalls)
        ap_per_threshold.append(ap)

    # Calculate mean AP
    mAP = np.mean(ap_per_threshold)

    return {
        'mAP': mAP,
        'AP_per_threshold': ap_per_threshold
    }

def calculate_iou(box1, box2):
    """
    Calculate IoU between two bounding boxes in [x1, y1, width, height] format.

    Args:
        box1 (array-like): First box [x1, y1, width, height].
        box2 (array-like): Second box [x1, y1, width, height].

    Returns:
        float: IoU value.
    """
    # Convert [x1, y1, width, height] to [x1, y1, x2, y2]
    x1_1, y1_1, w1, h1 = box1
    x1_2, y1_2, w2, h2 = box2
    x2_1, y2_1 = x1_1 + w1, y1_1 + h1
    x2_2, y2_2 = x1_2 + w2, y1_2 + h2

    # Find intersection coordinates
    x1_inter = max(x1_1, x1_2)
    y1_inter = max(y1_1, y1_2)
    x2_inter = min(x2_1, x2_2)
    y2_inter = min(y2_1, y2_2)

    # Calculate intersection area
    inter_area = max(0, x2_inter - x1_inter) * max(0, y2_inter - y1_inter)
    #print(inter_area)

    # Calculate each box's area
    box1_area = w1 * h1
    box2_area = w2 * h2

    # Calculate IoU
    iou = 1.5 * inter_area / (box1_area + box2_area - inter_area)
    #print(iou)
    return iou


def calculate_precision_recall(pred_classes, gt_classes, ious, iou_threshold):
    """
    Calculate precision and recall by matching predicted classes with ground truth classes using IoU.

    Args:
        pred_classes (list): List of predicted class probabilities (one entry per predicted box).
        gt_classes (int or list): Ground truth class ID (int) or list of class IDs (one per ground truth box).
        ious (array-like): IoU values, can be a scalar, list, or matrix depending on number of predictions/ground truths.
        iou_threshold (float): IoU threshold for a positive match.

    Returns:
        dict: Dictionary with precision and recall metrics.
    """
    # Initialize metrics
    true_positives = 0
    false_positives = 0

    # Check if we have a single ground truth object (gt_classes is an int)
    if 1:
        num_gt = 1
        matched_gt = [False]
        gt_classes = [gt_classes]  # Convert to list to handle uniformly
        if isinstance(ious, (float, int)):  # If IoU is a scalar, make it a 1-element list
            ious = [[ious]]  # Treat it as a 2D matrix with one row and one column


        # Ensure `ious` is a 2D matrix with dimensions [num_pred, num_gt]
        if isinstance(ious, list) and isinstance(ious[0], list) is False:
            ious = [[iou] for iou in ious]  # Convert list to 2D matrix if it's 1D

    num_pred = len(pred_classes)

    #print(ious)

    # Loop over predictions to find matches
    for i, pred_class in enumerate(pred_classes):
        best_match_iou = 0
        best_match_idx = -1

        # Iterate over each ground truth object
        for j in range(num_gt):
            iou_value = ious[i][j] if num_gt > 1 else ious[0][0]  # Handle single vs. multiple GTs

            # Check if this prediction matches the ground truth class and IoU threshold
            if gt_classes[j] == pred_class and not matched_gt[j] and iou_value > iou_threshold:
                if iou_value > best_match_iou:
                    best_match_iou = iou_value
                    best_match_idx = j
            #print(best_match_iou, best_match_idx)
        # If a match is found, update true positives and mark the GT as matched

        #print(best_match_iou, best_match_idx)
        if best_match_idx >= 0:
            matched_gt = True
            true_positives += 1
        else:
            false_positives += 1

    # Calculate false negatives and precision/recall
    false_negatives = num_gt - true_positives
    precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
    recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0

    return {"precision": precision, "recall": recall}

import numpy as np

import numpy as np

def compute_ap(precisions, recalls):
    """
    Compute Average Precision (AP) by integrating the precision-recall curve.

    Args:
        precisions (list or np.array): Precision values at different thresholds.
        recalls (list or np.array): Recall values at different thresholds.

    Returns:
        float: Calculated Average Precision (AP).
    """
    # Debug: Print precisions and recalls before processing
    print("Precisions before filtering:", precisions)
    print("Recalls before filtering:", recalls)

    # Ensure only numerical entries in precisions and recalls
    precisions = [p for p in precisions if isinstance(p, (int, float))]
    recalls = [r for r in recalls if isinstance(r, (int, float))]

    # Convert to float arrays
    precisions = np.array(precisions, dtype=float)
    recalls = np.array(recalls, dtype=float)

    # Debug: Print arrays after filtering
    #print("Precisions after filtering:", precisions)
    #print("Recalls after filtering:", recalls)

    # Calculate AP by integrating the precision-recall curve
    ap = np.sum((recalls[1:] - recalls[:-1]) * precisions[1:])
    return ap




metrics = calculate_map(results)
print("Mean Average Precision:", metrics['mAP'])
print("AP per IoU threshold:", metrics['AP_per_threshold'])



Precisions before filtering: ['precision', 'precision', 'precision', 'precision', 'precision', 'precision', 'precision', 'precision', 'precision', 'precision', 'precision', 'precision', 'precision', 'precision', 'precision', 'precision', 'precision', 'precision', 'precision', 'precision', 'precision', 'precision', 'precision', 'precision', 'precision', 'precision', 'precision', 'precision', 'precision', 'precision', 'precision', 'precision', 'precision', 'precision', 'precision', 'precision', 'precision', 'precision', 'precision', 'precision', 'precision', 'precision', 'precision', 'precision', 'precision', 'precision', 'precision', 'precision', 'precision', 'precision', 'precision', 'precision', 'precision', 'precision', 'precision', 'precision', 'precision', 'precision', 'precision', 'precision', 'precision', 'precision', 'precision', 'precision', 'precision', 'precision', 'precision', 'precision']
Recalls before filtering: ['recall', 'recall', 'recall', 'recall', 'recall', 'recall',

In [200]:
!pip freeze > requirements.txt