In [12]:
from torch import tensor
import torch
import torch.nn as nn 
import torch.functional as F

# Cross Entropy Loss

In [8]:
batch_size = 4
vocab_size = 32
input_size = 16
weights = torch.rand((input_size, vocab_size))
inputs = torch.rand((batch_size, input_size))
bias = torch.rand((1, vocab_size))

In [9]:
outputs = inputs @ weights + bias

In [11]:
inputs.shape, weights.shape, bias.shape, outputs.shape

(torch.Size([4, 16]),
 torch.Size([16, 32]),
 torch.Size([1, 32]),
 torch.Size([4, 32]))

In [18]:
# output cannot be guaranteed to have sum = 1
outputs.sum(dim=1)

tensor([121.7445, 125.9949, 153.5376, 106.9565])

In [23]:
softmax = nn.Softmax(dim=1)
yhat = softmax(outputs)

In [24]:
yhat.sum(dim=1)

tensor([1.0000, 1.0000, 1.0000, 1.0000])

In [25]:
yhat[0]

tensor([0.0134, 0.0360, 0.0161, 0.0542, 0.0118, 0.0264, 0.0199, 0.0169, 0.0150,
        0.0517, 0.0660, 0.0251, 0.0423, 0.1195, 0.0241, 0.0482, 0.0204, 0.0201,
        0.0086, 0.0194, 0.0086, 0.1258, 0.0123, 0.0131, 0.0094, 0.0069, 0.0623,
        0.0238, 0.0212, 0.0139, 0.0371, 0.0103])

In [17]:
outputs.sum(dim=1)

tensor([121.7445, 125.9949, 153.5376, 106.9565])

In [2]:
pred_bbox = tensor([[1,1,1,1,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,0,0]])
test_bbox = tensor([[0,0,1,1,1,1,1,1,0,0,0,0,1,1,0,0,0,0,0,0,0]])

pred_span = tensor([[0, 4], [13, 19]])
test_span = tensor([[2, 8], [12, 14]])

In [1]:
import torch
import torch.nn as nn

# Settings
batch_size = 2
seq_len = 30
vocab_size = 255

# Simulated logits output from a model: [batch_size, seq_len, vocab_size]
logits = torch.randn(batch_size, seq_len, vocab_size, requires_grad=True)

# Simulated target labels: [batch_size, seq_len]
# Each value is an integer in the range [0, vocab_size)
targets = torch.randint(0, vocab_size, (batch_size, seq_len))

# Reshape for CrossEntropyLoss: it expects input of shape [N, C] and target [N]
logits_reshaped = logits.view(-1, vocab_size)
targets_reshaped = targets.view(-1)

# Define loss function
criterion = nn.CrossEntropyLoss()

# Compute loss
loss = criterion(logits_reshaped, targets_reshaped)

# Backward pass to compute gradients
loss.backward()

# Check if gradients exist
print("Gradient shape (should be same as logits):", logits.grad.shape)
print("Gradients computed:", logits.grad is not None)


Gradient shape (should be same as logits): torch.Size([2, 30, 255])
Gradients computed: True


In [5]:
logits.grad.shape

torch.Size([2, 30, 255])

# GIoU implementation from Moment-detr

In [3]:
def print_vars(var_dict: dict):
    for k, v in var_dict.items():
        print(k, v, sep='\n', end='\n\n')

In [10]:
def temporal_iou(spans1, spans2):
    """
    Args:
        spans1: (N, 2) torch.Tensor, each row defines a span [st, ed]
        spans2: (M, 2) torch.Tensor, ...

    Returns:
        iou: (N, M) torch.Tensor
        union: (N, M) torch.Tensor
    >>> test_spans1 = torch.Tensor([[0, 0.2], [0.5, 1.0]])
    >>> test_spans2 = torch.Tensor([[0, 0.3], [0., 1.0]])
    >>> temporal_iou(test_spans1, test_spans2)
    (tensor([[0.6667, 0.2000],
         [0.0000, 0.5000]]),
     tensor([[0.3000, 1.0000],
             [0.8000, 1.0000]]))
    """
    areas1 = spans1[:, 1] - spans1[:, 0]  # (N, )
    areas2 = spans2[:, 1] - spans2[:, 0]  # (M, )
    left = torch.max(spans1[:, None, 0], spans2[:, 0])  # (N, M)
    right = torch.min(spans1[:, None, 1], spans2[:, 1])  # (N, M)
    inter = (right - left).clamp(min=0)  # (N, M)
    union = areas1[:, None] + areas2 - inter  # (N, M)
    iou = inter / union
    # print_vars(dict(spans1=spans1, spans2=spans2,
    #     areas1=areas1, areas2=areas2, left=left, right=right, inter=inter, union=union, iou=iou,
    # ))
    # print_vars(dict(
    #     sorcery1=spans1[:, None, 0], sorcery2=spans2[:, 0],
    #     sorcery3=spans1[:, None, 1], sorcery4=spans2[:, 1]
    # ))
    return iou, union


def generalized_temporal_iou(spans1, spans2):
    """
    Generalized IoU from https://giou.stanford.edu/
    Also reference to DETR implementation of generalized_box_iou
    https://github.com/facebookresearch/detr/blob/master/util/box_ops.py#L40

    Args:
        spans1: (N, 2) torch.Tensor, each row defines a span in xx format [st, ed]
        spans2: (M, 2) torch.Tensor, ...

    Returns:
        giou: (N, M) torch.Tensor

    >>> test_spans1 = torch.Tensor([[0, 0.2], [0.5, 1.0]])
    >>> test_spans2 = torch.Tensor([[0, 0.3], [0., 1.0]])
    >>> generalized_temporal_iou(test_spans1, test_spans2)
    tensor([[ 0.6667,  0.2000],
        [-0.2000,  0.5000]])
    """
    spans1 = spans1.float()
    spans2 = spans2.float()
    assert (spans1[:, 1] >= spans1[:, 0]).all()
    assert (spans2[:, 1] >= spans2[:, 0]).all()
    iou, union = temporal_iou(spans1, spans2)

    left = torch.min(spans1[:, None, 0], spans2[:, 0])  # (N, M)
    right = torch.max(spans1[:, None, 1], spans2[:, 1])  # (N, M)
    enclosing_area = (right - left).clamp(min=0)  # (N, M)
    print(f"{enclosing_area=}")
    return iou - (enclosing_area - union) / enclosing_area


In [11]:
pred_bbox = tensor([[1,1,1,1,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,0,0]])
test_bbox = tensor([[0,0,1,1,1,1,1,1,0,0,0,0,1,1,0,0,0,0,0,0,0]])

In [12]:
pred_span = tensor([[0, 4], [13, 19]], dtype=torch.float32) / 20
test_span = tensor([[2, 8], [12, 14]], dtype=torch.float32) / 20
pred_span.requires_grad_()

tensor([[0.0000, 0.2000],
        [0.6500, 0.9500]], requires_grad=True)

In [14]:
import torch

# call your differentiable IoU/GIoU fn
giou = generalized_temporal_iou(pred_span, test_span)  # shape (1,1)
loss = (1 - torch.diag(giou)).mean()
loss.backward()

print(pred_span.grad)  # if this is all zeros, gradient died in the loop


enclosing_area=tensor([[0.4000, 0.7000],
        [0.8500, 0.3500]], grad_fn=<ClampBackward1>)
tensor([[-0.3125, -1.2500],
        [ 1.4286,  0.2041]])


In [69]:
left = torch.max(
    tensor([[ 0],
            [13]]),
    tensor([2, 12])
)

In [68]:
right = torch.min(
    tensor([[ 4],
            [19]]),
    tensor([8, 14])
)

In [73]:
right, left, (right-left)

(tensor([[ 4,  4],
         [ 8, 14]]),
 tensor([[ 2, 12],
         [13, 13]]),
 tensor([[ 2, -8],
         [-5,  1]]))

In [66]:
temporal_iou(
    pred_span, test_span
)

spans1
tensor([[ 0,  4],
        [13, 19]])

spans2
tensor([[ 2,  8],
        [12, 14]])

areas1
tensor([4, 6])

areas2
tensor([6, 2])

left
tensor([[ 2, 12],
        [13, 13]])

right
tensor([[ 4,  4],
        [ 8, 14]])

inter
tensor([[2, 0],
        [0, 1]])

union
tensor([[ 8,  6],
        [12,  7]])

iou
tensor([[0.2500, 0.0000],
        [0.0000, 0.1429]])

sorcery1
tensor([[ 0],
        [13]])

sorcery2
tensor([ 2, 12])

sorcery3
tensor([[ 4],
        [19]])

sorcery4
tensor([ 8, 14])



(tensor([[0.2500, 0.0000],
         [0.0000, 0.1429]]),
 tensor([[ 8,  6],
         [12,  7]]))

In [75]:
torch.diag(tensor([[0.2500, 0.0000],
         [0.0000, 0.1429]])).mean()

tensor(0.1964)

In [58]:
giou_span = generalized_temporal_iou(pred_span, test_span)
giou_span

areas1
tensor([4., 6.])

areas2
tensor([6., 2.])

left
tensor([[ 2., 12.],
        [13., 13.]])

right
tensor([[ 4.,  4.],
        [ 8., 14.]])

inter
tensor([[2., 0.],
        [0., 1.]])

union
tensor([[ 8.,  6.],
        [12.,  7.]])

iou
tensor([[0.2500, 0.0000],
        [0.0000, 0.1429]])

sorcery1
tensor([[ 0.],
        [13.]])

sorcery2
tensor([ 2., 12.])

enclosing_area=tensor([[ 8., 14.],
        [17.,  7.]])


tensor([[ 0.2500, -0.5714],
        [-0.2941,  0.1429]])

In [20]:
giou_loss_span = 1 - torch.diag(giou_span)
giou_loss_span, giou_loss_span.mean()

(tensor([0.7500, 0.8571]), tensor(0.8036))

# BBox Implementation

In [21]:
def temporal_iou_bbox(masks1, masks2):
    """
    Args:
        masks1: (N, L) torch.Tensor with binary values (0 or 1)
        masks2: (M, L) torch.Tensor with binary values (0 or 1)

    Returns:
        iou: (N, M) torch.Tensor
        union: (N, M) torch.Tensor
    """
    masks1 = masks1.float()
    masks2 = masks2.float()

    # (N, M, L) shape after broadcasting
    intersection = (masks1[:, None, :] * masks2[None, :, :]).sum(dim=2)
    area1 = masks1.sum(dim=1)  # (N,)
    area2 = masks2.sum(dim=1)  # (M,)
    union = area1[:, None] + area2[None, :] - intersection  # (N, M)

    iou = intersection / union.clamp(min=1e-6)
    return iou, union


def generalized_temporal_iou_bbox(masks1, masks2):
    """
    Generalized IoU for 1D segmentation masks.

    Args:
        masks1: (N, L) torch.Tensor with binary values (0 or 1)
        masks2: (M, L) torch.Tensor with binary values (0 or 1)

    Returns:
        giou: (N, M) torch.Tensor
    """
    masks1 = masks1.float()
    masks2 = masks2.float()

    iou, union = temporal_iou_bbox(masks1, masks2)

    # Enclosing box: union of supports (non-zero positions)
    # Find leftmost and rightmost 1s in either mask
    N, L = masks1.shape
    M = masks2.shape[0]

    enclosing_area = torch.zeros((N, M), dtype=torch.float32)

    for i in range(N):
        idx1 = torch.where(masks1[i] > 0)[0]
        if len(idx1) == 0:
            continue
        min1, max1 = idx1[0].item(), idx1[-1].item()
        for j in range(M):
            idx2 = torch.where(masks2[j] > 0)[0]
            if len(idx2) == 0:
                continue
            min2, max2 = idx2[0].item(), idx2[-1].item()
            left = min(min1, min2)
            right = max(max1, max2)
            enclosing_area[i, j] = right - left + 1  # +1 for inclusive span

    giou = iou - (enclosing_area - union) / enclosing_area.clamp(min=1e-6)
    return giou

In [93]:
import torch

# toy continuous “mask” (not hard 0/1) so gradients can flow
pred = torch.tensor([[0,0,0,0,0,0,1,1,1,1,1,0,0,0,0,1,1,0,0,0,0]], requires_grad=True, dtype=torch.float32)
# treat test as fixed float mask
test = torch.tensor([[0,0,0,1,1,1,1,1,1,0,0,0,0,1,1,0,0,0,0,0,0]], dtype=torch.float32)

# call your differentiable IoU/GIoU fn
giou = generalized_temporal_iou_bbox(pred, test)  # shape (1,1)
loss = 1 - giou[0,0]
loss.backward()

print(pred.grad)  # if this is all zeros, gradient died in the loop


tensor([[-0.0506, -0.0506, -0.0506, -0.0833, -0.0833, -0.0833, -0.0833, -0.0833,
         -0.0833, -0.0506, -0.0506, -0.0506, -0.0506, -0.0833, -0.0833, -0.0506,
         -0.0506, -0.0506, -0.0506, -0.0506, -0.0506]])


In [95]:
1- generalized_temporal_iou_bbox(pred, test)

tensor([[0.8929]], grad_fn=<RsubBackward1>)

In [22]:
temporal_iou_bbox(pred_bbox, test_bbox)

(tensor([[0.2000]]), tensor([[15.]]))

In [26]:
generalized_temporal_iou_bbox(pred_bbox, test_bbox)

tensor([[-0.0105]])

In [13]:
1 - torch.diag(generalized_temporal_iou_bbox(pred_bbox, test_bbox))

tensor([1.0105])

In [None]:


intersection = torch.sum(pred_bbox & test_bbox)
area_pred = torch.sum(pred_bbox)
area_test = torch.sum(test_bbox)
union = area_pred + area_test - intersection


In [None]:
import numpy as np

def giou_1d(pred, test):
    pred = np.array(pred)
    test = np.array(test)

    intersection = np.sum(pred & test)
    area_pred = np.sum(pred)
    area_test = np.sum(test)
    union = area_pred + area_test - intersection

    if union == 0:
        return 1.0  # Special case: empty pred and test

    active_indices = np.where((pred + test) > 0)[0]
    x1_c = active_indices[0]
    x2_c = active_indices[-1]
    area_c = x2_c - x1_c + 1

    iou = intersection / union
    giou = iou - (area_c - union) / area_c

    return giou

# Example
pred_bbox = [1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0]
test_bbox = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1]

print(giou_1d(pred_bbox, test_bbox))


In [None]:
span_cxw_to_xx()

In [None]:
preds = torch.zeros((5, 40))
tests = torch.zeros((5, 40))

tests[0][1:8] = 1
tests[0][]

In [None]:
generalized_temporal_iou(
    tensor([1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0]),
    tensor([0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0])
)