In [None]:
%matplotlib inline

import os
import time
# Import Ultralytics and call its `checks()` helper (presumably an environment/status report)
# before building the detector.
import ultralytics
ultralytics.checks()
from ultralytics import YOLO
# Detection/tracking model used by the grouping cells further down, built from "yolo11x.pt".
model = YOLO("yolo11x.pt")

# Print the installed PyTorch / torchvision versions and whether CUDA is available.
import torch
import torch.version
import torchvision
print("Torch version:", torch.__version__)
print("Torchvision version:", torchvision.__version__)
# NOTE(review): the next line prints the same value as the labelled "CUDA version:" line below.
print(torch.version.cuda)
print("CUDA available:", torch.cuda.is_available())
print("CUDA version:", torch.version.cuda)

#from PIL import Image
import depth_pro

# Load depth model and preprocessing transform
# (requested on the "cuda" device with half precision).
depth_model, transform = depth_pro.create_model_and_transforms(device="cuda",precision=torch.half)
# Switch the depth model to evaluation mode before it is used for inference.
depth_model.eval()

# DBSCAN clusters the per-frame 3D points; pyplot renders and saves the per-frame overlays.
from sklearn.cluster import DBSCAN
import matplotlib.pyplot as plt

In [None]:
class Group:
    """Per-frame record of the grouping data.

    Keeps the shape of the grouping array together with the frame it
    was computed for. Group-tracking logic is not implemented yet.
    """

    def __init__(self, group, frameid):
        """Store `group.shape` and `frameid` on the instance.

        Parameters:
        group: Any object exposing a `.shape` attribute.
        frameid: Identifier of the frame the grouping data belongs to.
        """
        self.frameid = frameid
        # Despite the name, this is the full shape of `group`, not a scalar count.
        self.groupcount = group.shape

    def update(self):
        """Placeholder for future group-update logic; currently does nothing."""
        return None

In [None]:
def calculate_l2_norms(centroids, threshold=10.0, device='cuda'):
    """
    Vectorized function to calculate pairwise L2 norms between centroids (3D)
    and return, for every object, the set of objects closer than a threshold.

    Note that the result is one neighbourhood per object, not a partition into
    merged groups: entry ``i`` holds ``i`` itself plus every ``j`` whose distance
    to ``i`` is strictly below ``threshold``. Neighbourhoods are not merged
    transitively.

    Parameters:
    centroids (torch.Tensor): A tensor of shape (N, 3) representing object centroids (x, y, z) for the current frame.
    threshold (float): The distance threshold to consider objects in the same group.
    device (str): Device on which the distances are computed ('cpu' or 'cuda').

    Returns:
    group_memberships (list of sets): List of N sets; set ``i`` contains the indices
        (Python ints) of the objects within ``threshold`` of object ``i``, including ``i``.
        An empty input yields an empty list.
    """
    centroids = centroids.to(device)  # Move centroids tensor to the appropriate device

    # Compute pairwise L2 distances using broadcasting in PyTorch
    diff = centroids[:, None, :] - centroids[None, :, :]  # Shape (N, N, 3)
    dist_matrix = torch.norm(diff, dim=2)  # Shape (N, N)

    # Threshold the whole matrix in one operation instead of comparing element by
    # element in Python: each scalar comparison on a CUDA tensor forces a separate
    # device-to-host synchronisation (N*N of them per call).
    within = dist_matrix < threshold

    # Every object always belongs to its own neighbourhood, even when the
    # threshold is non-positive or the self-distance is NaN.
    num_objects = centroids.shape[0]
    within = within | torch.eye(num_objects, dtype=torch.bool, device=within.device)

    # Single transfer to the host, then build the index sets from plain Python bools.
    return [
        {j for j, is_close in enumerate(row) if is_close}
        for row in within.tolist()
    ]

def track_groups(previous_groups, current_groups):
    """
    Compare group memberships of two consecutive frames.

    A position is reported as "joined" when the group found at that position in
    the current frame did not exist (as an exact set of members) anywhere in the
    previous frame, and as "left" when the group at that position in the previous
    frame no longer exists anywhere in the current frame.

    Parameters:
    previous_groups (list of sets): Group memberships from the previous frame.
    current_groups (list of sets): Group memberships from the current frame.

    Returns:
    joined (list): Positions in `current_groups` whose group is new.
    left (list): Positions in `previous_groups` whose group has disappeared.
    """
    # Hashable snapshots of every distinct group seen in each frame.
    seen_before = {frozenset(members) for members in previous_groups}
    seen_now = {frozenset(members) for members in current_groups}

    joined = [
        idx
        for idx, members in enumerate(current_groups)
        if frozenset(members) not in seen_before
    ]
    left = [
        idx
        for idx, members in enumerate(previous_groups)
        if frozenset(members) not in seen_now
    ]

    return joined, left


## Loop-based grouping with depth-based inter-object distance estimation

In [None]:
# Loop-based baseline: track people, estimate metric depth per frame, back-project each
# bounding-box centroid to 3D one box at a time, cluster with DBSCAN and save an overlay.
results = model.track(source="Homebrew-video/Low-quality/IMG_5347.MP4",stream=True,classes=[0],half=True,imgsz=1280,vid_stride=1)
os.makedirs("runs/dots3", exist_ok=True)  # plt.savefig does not create missing directories.
for j,r in enumerate(results):
    #if j==1: break
    frame = j + 1  # 1-based frame number used in the output file name.

    if len(r.boxes) == 0:
        # No detections: DBSCAN.fit raises on an empty sample array, so skip this frame.
        continue

    # Ultralytics returns the frame as a BGR array (OpenCV convention) while the depth
    # transform and matplotlib expect RGB. copy() makes the reversed view contiguous so
    # it can be converted to a tensor.
    rgb_img = r.orig_img[..., ::-1].copy()

    # Load and preprocess given frame.
    d_img = transform(rgb_img).to("cuda") # Transform and transfer given frame to gpu to be ready for depth inference.
    f_px = depth_pro.utils.fpx_from_f35(d_img.shape[2],d_img.shape[1],26) # Converts focal length from my phone camera to fpx.

    # Run depth inference on given frame.
    prediction = depth_model.infer(d_img,f_px)
    depth = prediction["depth"]  # Depth in [m].
    focallength_px = prediction["focallength_px"]  # Focal length in pixels.

    # Pinhole back-projection needs pixel offsets from the principal point, assumed here
    # to be the image centre: X = (u - cx) * Z / f, Y = (v - cy) * Z / f.
    img_h, img_w = r.orig_img.shape[:2]
    principal_point = torch.tensor([img_w / 2, img_h / 2], device="cuda")

    # Initialise tensors to store bounding box centroids and depth estimates.
    # Two separate tensors: `richard` holds (u_px, v_px, depth_m) rows for plotting,
    # `harry` holds (x_m, y_m, z_m) rows for clustering.
    richard = torch.empty(0,3).to("cuda")
    harry = torch.empty(0,3).to("cuda")
    start = time.time()
    # Loop over bounding boxes detected in given frame.
    for box in r.boxes:
        c = box.xywh[0,:2].to("cuda") ;print("Centroid coordinates (px)",c)
        d = depth[int(c[1].item())][int(c[0].item())]; print("Depth estimate at centroid (m)",d)
        f = focallength_px
        print(box.id)
        cr = (c - principal_point) * d/f # Centroid coordinates in estimated metric distance.

        blobs = torch.hstack((c,d)) ;print(blobs)
        blobs2= torch.hstack((cr,d)) ;print(blobs2)
        richard = torch.vstack((richard,blobs)) ;print("this is richard say hello:", richard)
        harry = torch.vstack((harry,blobs2)) ;print("this is harry say hello:", harry)
    #dist = calculate_l2_norms(richard); print("these are the groups:", dist)

    # Cluster the metric 3D points; label -1 marks points DBSCAN treats as noise.
    clustering = DBSCAN(eps=4, min_samples=2).fit(harry.cpu().numpy())
    labs = clustering.labels_
    print(labs)
    end = time.time()
    print("Time to loop over all boxes",end-start)

    # Save the frame with the centroids coloured by cluster label.
    plt.imshow(rgb_img)
    points_px = richard.cpu().numpy()
    scatter = plt.scatter(points_px[:,0],points_px[:,1],c=labs,label=labs)
    plt.legend(*scatter.legend_elements(),
               title="Classes")
    #plt.xlim(0,1280)
    #plt.ylim(0,720)
    plt.tight_layout()  # Must be called; the bare attribute reference did nothing.
    plt.savefig(f"runs/dots3/frame_{frame}",dpi=200)
    plt.clf()

## Parallelised grouping with depth-based inter-object distance estimation

In [None]:
# Vectorised variant: all bounding-box centroids of a frame are back-projected to 3D in
# one tensor operation, clustered with DBSCAN, and saved as an overlay image.
results = model.track(source="Homebrew-video/Low-quality/IMG_5354.MP4",
                      stream=True,classes=[0],half=True,imgsz=1280,vid_stride=1,
                      conf=0.25,iou=0.9)
os.makedirs("runs/dots4", exist_ok=True)  # plt.savefig does not create missing directories.
for j,r in enumerate(results):
    #if j==60: break
    frame = j + 1  # 1-based frame number used in the output file name.

    if len(r.boxes) == 0 or r.boxes.id is None:
        # Skip frames with nothing to group: DBSCAN.fit raises on an empty sample array,
        # and `boxes.id` is None when the tracker assigned no IDs, which would break the
        # vstack that appends the IDs below.
        continue

    r = r.to("cuda")

    # Ultralytics returns the frame as a BGR array (OpenCV convention) while the depth
    # transform and matplotlib expect RGB. copy() makes the reversed view contiguous so
    # it can be converted to a tensor.
    rgb_img = r.orig_img[..., ::-1].copy()

    # Load and preprocess given frame.
    d_img = transform(rgb_img) # Transform and transfer given frame to gpu to be ready for depth inference.
    f_px = depth_pro.utils.fpx_from_f35(d_img.shape[2],d_img.shape[1],26) # Converts focal length from my phone camera to fpx.

    # Run depth inference on given frame.
    prediction = depth_model.infer(d_img,f_px)
    depth = prediction["depth"]  # Depth in [m].
    focallength_px = prediction["focallength_px"]  # Focal length in pixels.

    # Parallelised centroid extraction and depth assignment
    c = r.boxes.xywh[:,:2] ;print("Centroid coordinates (px):", c)
    # Index rows by y and columns by x; long indices are accepted by every PyTorch version.
    d = depth[c[:,1].long(),c[:,0].long()] ;print("Depth at each centroid:", d)
    f = focallength_px ;print("Focal length (px):", f)

    # Pinhole back-projection needs pixel offsets from the principal point, assumed here
    # to be the image centre: X = (u - cx) * Z / f, Y = (v - cy) * Z / f.
    img_h, img_w = r.orig_img.shape[:2]
    principal_point = torch.tensor([img_w / 2, img_h / 2], device=c.device, dtype=c.dtype)
    cr = (c - principal_point).T * d/f ;print("Metric centroid coordinates (m):",cr)
    groups = torch.vstack((cr,d)) ;print("Bounding box x,y,z coordinates (m):",groups)  # Shape (3, N).

    # L2 Norm
    # norms = calculate_l2_norms(groups.T,2) ;print("L2 norm code grouping:",norms)

    # Clustering 3D points; label -1 marks points DBSCAN treats as noise.
    clustering = DBSCAN(eps=2, min_samples=2).fit(groups.T.cpu().numpy())
    labs = clustering.labels_ ;print("Clustering labels:",labs)

    # Prepare groups for tracking group momentum: rows become x, y, z, track id, cluster label.
    groups = torch.vstack((groups,r.boxes.id,torch.tensor(labs,device="cuda"))) ;print("Groups with id appended:",groups.T)

    # Count frames and modulate group membership
    group_class = Group(groups,frame)

    # Saving the images of centroids and group ownership
    plt.imshow(rgb_img)
    points_px = c.cpu().numpy()
    scatter = plt.scatter(points_px[:,0],points_px[:,1],c=labs,label=labs)
    plt.legend(*scatter.legend_elements(),title="Classes")
    plt.tight_layout()  # Must be called; the bare attribute reference did nothing.
    plt.savefig(f"runs/dots4/frame_{frame}",dpi=100)
    plt.clf()
    