In [None]:
%matplotlib inline

# --- Imports -----------------------------------------------------------------
# Everything the notebook needs is imported here so that Restart & Run All works.
# Later cells use depth_pro, Image, DBSCAN and plt; these were previously
# commented out, which made those cells fail with NameError on a fresh kernel.
import cv2
import matplotlib.pyplot as plt
import torch
import torch.version
import torchvision
import ultralytics
from PIL import Image
from sklearn.cluster import DBSCAN
from ultralytics import YOLO, solutions

import depth_pro

# --- Environment report ------------------------------------------------------
ultralytics.checks()
print("Torch version:", torch.__version__)
print("Torchvision version:", torchvision.__version__)
print("CUDA available:", torch.cuda.is_available())
print("CUDA version:", torch.version.cuda)

# --- Models ------------------------------------------------------------------
# Detector / tracker used by the model.track(...) cells.
# NOTE(review): a later cell rebinds `model` to "yolo11n.pt"; cells after it
# therefore use the nano weights, not these.
model = YOLO("yolo11x.pt")

# Depth model and its matching preprocessing transform. The depth cells call
# transform(...), depth_model.infer(...) and depth_pro.load_rgb(...).
depth_model, transform = depth_pro.create_model_and_transforms(device="cuda", precision=torch.float16)
depth_model.eval()

In [None]:
# Track class 0 only in the pedestrian clip, saving the annotated output,
# and print each frame's original shape as it streams through.
results = model.track(
    source="videos/Pedestrians_1.mp4",
    stream=True,
    classes=[0],
    save=True,
)
for frame_result in results:
    print(frame_result.orig_shape)

In [None]:
# Speed and direction estimation: read a video, run the Ultralytics
# SpeedEstimator on every frame and write the annotated frames to disk.
import os

cap = cv2.VideoCapture("videos/Video.mp4")
assert cap.isOpened(), "Error reading video file"

# Video writer (same geometry / frame rate as the input)
w, h, fps = (int(cap.get(x)) for x in (cv2.CAP_PROP_FRAME_WIDTH, cv2.CAP_PROP_FRAME_HEIGHT, cv2.CAP_PROP_FPS))

# cv2.VideoWriter does not raise when it cannot open its target (for example
# when the directory is missing); it just drops every frame. Create the
# directory and check the writer explicitly.
os.makedirs("Speed_Results", exist_ok=True)
video_writer = cv2.VideoWriter("Speed_Results/Video.avi", cv2.VideoWriter_fourcc(*"mp4v"), fps, (w, h))
if not video_writer.isOpened():
    cap.release()
    raise RuntimeError("Could not open Speed_Results/Video.avi for writing")

# Speed region: the whole frame.
# NOTE(review): the first corner is (1, 1) while the others sit exactly on the
# frame border; (0, 0) may have been intended. Left unchanged.
speed_region = [(1, 1), (w, 0), (w, h), (0, h)]

try:
    # Initialize speed estimation object
    speedestimator = solutions.SpeedEstimator(
        show=False,  # display the output
        model="yolo11x.pt",  # path to the YOLO11 model file.
        region=speed_region,  # pass region points
        classes=[0],  # estimate speed of specific classes.
        #tracker = 'bytetrack.yaml',
    )

    # Process video
    while cap.isOpened():
        success, im0 = cap.read()

        if not success:
            print("Video frame is empty or processing is complete.")
            break

        results = speedestimator(im0)

        video_writer.write(results.plot_im)  # write the processed frame.
finally:
    # Always release the handles, even if model loading or inference raised,
    # so the output file is finalised and the input is not left locked.
    cap.release()
    video_writer.release()
    cv2.destroyAllWindows()  # destroy all opened windows

In [None]:
# Instance segmentation and tracking: segment every frame of the clip, write
# the annotated video, then run the tracker on that annotated video.
import os

cap = cv2.VideoCapture("videos/VID-20250322-WA0007.mp4")
assert cap.isOpened(), "Error reading video file"

# Video writer (same geometry / frame rate as the input)
w, h, fps = (int(cap.get(x)) for x in (cv2.CAP_PROP_FRAME_WIDTH, cv2.CAP_PROP_FRAME_HEIGHT, cv2.CAP_PROP_FPS))

# cv2.VideoWriter fails silently when the target cannot be opened, so make
# sure the directory exists and verify the writer before processing.
os.makedirs("Speed_Results", exist_ok=True)
video_writer = cv2.VideoWriter("Speed_Results/VID-20250322-WA0007.mp4", cv2.VideoWriter_fourcc(*"mp4v"), fps, (w, h))
if not video_writer.isOpened():
    cap.release()
    raise RuntimeError("Could not open Speed_Results/VID-20250322-WA0007.mp4 for writing")

try:
    # Initialize instance segmentation object
    isegment = solutions.InstanceSegmentation(
        show=False,  # display the output
        model="yolo11n-seg.pt",  # model="yolo11n-seg.pt" for object segmentation using YOLO11.
        # classes=[0, 2],  # segment specific classes i.e, person and car with pretrained model.
    )

    # Process video
    while cap.isOpened():
        success, im0 = cap.read()

        if not success:
            print("Video frame is empty or video processing has been successfully completed.")
            break

        results = isegment(im0)

        # print(results)  # access the output

        video_writer.write(results.plot_im)  # write the processed frame.
finally:
    # Release handles on every exit path so the output file is finalised
    # before it is read back below.
    cap.release()
    video_writer.release()
    cv2.destroyAllWindows()  # destroy all opened windows

# NOTE(review): this rebinds the notebook-wide `model`; every later cell that
# calls model.track(...) now runs the nano weights loaded here.
model = YOLO("yolo11n.pt")

# Track class 0 on the segmented video that was just written.
results = model.track(source="Speed_Results/VID-20250322-WA0007.mp4",stream=True,classes=[0],save=True)
for r in results:
    print(r.orig_shape)

In [None]:
# Run the depth model on every tracked frame and report the depth-map shape.
results = model.track(source="videos/VID-20250322-WA0007.mp4",stream=True,classes=[0])
for r in results:
    # Ultralytics returns the original frame in OpenCV channel order (BGR),
    # while the depth model's transform is fed RGB elsewhere in this notebook
    # (depth_pro.load_rgb). Convert first so the colour channels are not swapped.
    frame_rgb = cv2.cvtColor(r.orig_img, cv2.COLOR_BGR2RGB)
    d_img = transform(frame_rgb).to("cuda")
    # Run inference.
    prediction = depth_model.infer(d_img)
    depth = prediction["depth"]  # Depth in [m].
    #focallength_px = prediction["focallength_px"]  # Focal length in pixels.

    print(depth.shape)

In [None]:
# Read the still image (plus the focal length returned with it, in pixels),
# then run it through the depth model's preprocessing transform.
rgb_image, _, f_px = depth_pro.load_rgb("Homebrew-image/WIN_20250308_20_09_29_Pro.jpg")
image = transform(rgb_image)

In [None]:
# Depth inference on the preprocessed still, passing the focal length along.
prediction = depth_model.infer(image, f_px=f_px)
# "depth" is the depth map in metres; "focallength_px" is the focal length in pixels.
depth, focallength_px = prediction["depth"], prediction["focallength_px"]

In [None]:
import numpy as np

# Visualise the metric depth map as an 8-bit grayscale image.
print(depth)

# Min-max normalise to [0, 1]. The divisor must be the value range (max - min);
# dividing by max alone leaves the brightest pixel below 1. The result goes
# into a new name so `depth` keeps its values in metres and re-running this
# cell gives the same image.
depth_min, depth_max = depth.min(), depth.max()
depth_span = (depth_max - depth_min).clamp_min(torch.finfo(depth.dtype).eps)  # avoid 0/0 on a constant map
depth_norm = (depth - depth_min) / depth_span
print(depth_norm)

# Remove the extra channel dimension if present; go through float32 so the
# scaling below is not done in half precision.
img_array = depth_norm.squeeze().float().cpu().numpy()

# Convert the image from [0, 1] to [0, 255] and cast to uint8.
img_array = (img_array*255).astype(np.uint8)

# Create a PIL image in grayscale mode ('L').
pil_img = Image.fromarray(img_array, mode='L')

# Display the image using PIL's built-in viewer.
pil_img.show()

In [None]:
def calculate_l2_norms(centroids, threshold=100.0, device='cuda'):
    """
    Find, for every centroid, the set of centroids closer than a threshold.

    Pairwise L2 distances are computed in one broadcasted operation on
    ``device``. The thresholded result is copied to the host once, rather
    than reading single tensor elements inside a Python double loop (each
    such read forces a device synchronisation when ``device`` is a GPU).

    Parameters:
    centroids (torch.Tensor): A tensor of shape (N, 3) representing object centroids (x, y, z) for the current frame.
    threshold (float): Two objects are neighbours when their distance is strictly below this value.
    device (str): Device on which the distances are computed ('cpu' or 'cuda').

    Returns:
    group_memberships (list of sets): One set per object, in input order. Set ``i`` always
        contains ``i`` itself plus the index of every other object whose distance to object
        ``i`` is below ``threshold``. The sets are direct neighbourhoods, not merged
        (transitive) clusters. An empty input yields an empty list.
    """
    centroids = centroids.to(device)  # Move centroids tensor to the requested device

    # Pairwise differences via broadcasting: (N, 1, 3) - (1, N, 3) -> (N, N, 3)
    diff = centroids[:, None, :] - centroids[None, :, :]
    dist_matrix = torch.norm(diff, dim=2)  # Shape (N, N)

    # Threshold on the device, then pull the whole boolean matrix over in a
    # single transfer.
    is_near = (dist_matrix < threshold).cpu().tolist()

    # `{i} |` keeps the object in its own set even when threshold <= 0, where
    # the zero self-distance on the diagonal would not pass the `<` test.
    return [
        {i} | {j for j, near in enumerate(row) if near}
        for i, row in enumerate(is_near)
    ]

def track_groups(previous_groups, current_groups):
    """
    Compare two frames' group lists and report which entries are new or gone.

    A group counts as "known" in a frame if an identical member set appears
    anywhere in that frame's list; the position within the list is ignored
    for the comparison.

    Parameters:
    previous_groups (list of sets): Group memberships from the previous frame.
    current_groups (list of sets): Group memberships from the current frame.

    Returns:
    joined (list): Positions in ``current_groups`` whose member set did not occur in ``previous_groups``.
    left (list): Positions in ``previous_groups`` whose member set does not occur in ``current_groups``.
    """
    # Hashable snapshots of each frame's groups for O(1) membership checks.
    seen_before = set(map(frozenset, previous_groups))
    seen_now = set(map(frozenset, current_groups))

    joined = [
        position
        for position, members in enumerate(current_groups)
        if frozenset(members) not in seen_before
    ]
    left = [
        position
        for position, members in enumerate(previous_groups)
        if frozenset(members) not in seen_now
    ]
    return joined, left


In [None]:
# For every 10th frame: estimate depth, build an (x, y, depth) point per
# detected box, cluster the points with DBSCAN and scatter-plot them.
results = model.track(source="Homebrew-video/Low-quality/IMG_5347.MP4",stream=True,imgsz=1280,classes=[0],vid_stride=10)
for r in results:
    # Ultralytics frames are in OpenCV channel order (BGR); the depth model's
    # transform is fed RGB elsewhere in this notebook (depth_pro.load_rgb).
    frame_rgb = cv2.cvtColor(r.orig_img, cv2.COLOR_BGR2RGB)
    d_img = transform(frame_rgb).to("cuda")
    # Run inference.
    prediction = depth_model.infer(d_img)
    depth = prediction["depth"]  # Depth in [m].
    #focallength_px = prediction["focallength_px"]  # Focal length in pixels.

    print(depth.shape)

    # DBSCAN.fit raises on an empty sample array, so a frame without any
    # detection would abort the whole run. Skip such frames.
    if len(r.boxes) == 0:
        print("no detections in this frame; skipping clustering")
        continue

    blobs = []
    for box in r.boxes:
        x1, y1, x2, y2 = box.xyxy[0]
        # Box centre in pixel coordinates (x, y), on the same device as the depth map.
        c = torch.stack(((x1 + x2) / 2, (y1 + y2) / 2)).to(depth.device); print("centroids",c)
        # Depth map is indexed [row, col], i.e. [y, x].
        d = depth[int(c[1].item())][int(c[0].item())]; print("depth value at centroid (m)",d)

        blob = torch.hstack((c,d)); print(blob)
        blobs.append(blob)
    # Stack once instead of growing a tensor with vstack on every box.
    richard = torch.vstack(blobs)  # shape (num_boxes, 3): x_px, y_px, depth_m

    # NOTE(review): eps=100 is applied to pixel coordinates and metres jointly,
    # so the depth column barely influences the clustering. Confirm intended.
    points = richard.cpu().numpy()
    clustering = DBSCAN(eps=100, min_samples=2).fit(points)
    print(clustering.labels_)

    plt.scatter(points[:,0],points[:,1],c=clustering.labels_)
    plt.xlim(0,1280)
    plt.ylim(0,720)
plt.show()

In [None]:
# For every frame: estimate depth, cluster the (x, y, depth) box centres with
# DBSCAN and save the frame with the clustered centres drawn on top.
import os

# savefig does not create missing directories.
os.makedirs("runs/dots", exist_ok=True)

results = model.track(source="Homebrew-video/Low-quality/IMG_5347.MP4",stream=True,classes=[0],half=False,imgsz=1280,vid_stride=1)
# `frame` numbers every processed frame from 1, including frames that are
# skipped below, so file names stay aligned with the processed sequence.
for frame, r in enumerate(results, start=1):
    # Ultralytics frames are in OpenCV channel order (BGR). Both the depth
    # model's transform and matplotlib's imshow expect RGB.
    frame_rgb = cv2.cvtColor(r.orig_img, cv2.COLOR_BGR2RGB)
    d_img = transform(frame_rgb).to("cuda")
    # Run inference.
    prediction = depth_model.infer(d_img)
    depth = prediction["depth"]  # Depth in [m].
    #focallength_px = prediction["focallength_px"]  # Focal length in pixels.

    print(depth.shape)

    # DBSCAN.fit raises on an empty sample array, so a frame without any
    # detection would abort the whole run. Skip such frames.
    if len(r.boxes) == 0:
        print("no detections in this frame; skipping clustering")
        continue

    blobs = []
    for box in r.boxes:
        x1, y1, x2, y2 = box.xyxy[0]
        # Box centre in pixel coordinates (x, y), on the same device as the depth map.
        c = torch.stack(((x1 + x2) / 2, (y1 + y2) / 2)).to(depth.device); print("centroids",c)
        # Depth map is indexed [row, col], i.e. [y, x].
        d = depth[int(c[1].item())][int(c[0].item())]; print("depth value at centroid (m)",d)

        blob = torch.hstack((c,d)); print(blob)
        blobs.append(blob)
    # Stack once instead of growing a tensor with vstack on every box.
    richard = torch.vstack(blobs)  # shape (num_boxes, 3): x_px, y_px, depth_m

    # NOTE(review): eps=100 is applied to pixel coordinates and metres jointly,
    # so the depth column barely influences the clustering. Confirm intended.
    points = richard.cpu().numpy()
    clustering = DBSCAN(eps=100, min_samples=2).fit(points)
    labs = clustering.labels_
    print(labs)

    fig, ax = plt.subplots()
    ax.imshow(frame_rgb)
    scatter = ax.scatter(points[:,0],points[:,1],c=labs)
    ax.legend(*scatter.legend_elements(),
              title="Classes")
    fig.tight_layout()  # was `plt.tight_layout` without the call, i.e. a no-op
    fig.savefig(f"runs/dots/frame_{frame}",dpi=200)
    plt.close(fig)  # free the figure; one is created per frame

In [None]:
# Track class 0 over the whole clip and save the annotated video.
# stream=True makes track() yield one result at a time instead of returning a
# list holding every frame's result (kept in RAM and echoed as the cell
# output). The generator must be consumed for frames to be processed and
# saved; the loop drains it and discards the per-frame results.
for _result in model.track(source="Homebrew-video/Low-quality/IMG_5347.MP4",save=True,classes=[0],half=False,imgsz=1280,vid_stride=1,stream=True):
    pass

In [None]:
# Same tracking run with stricter thresholds (iou=0.9, conf=0.5), saving the
# annotated video. stream=True makes track() yield one result at a time
# instead of returning a list holding every frame's result (kept in RAM and
# echoed as the cell output). The generator must be consumed for frames to be
# processed and saved; the loop drains it and discards the per-frame results.
for _result in model.track(source="Homebrew-video/Low-quality/IMG_5347.MP4",save=True,classes=[0],half=False,imgsz=1280,vid_stride=1,iou=0.9,conf=0.5,stream=True):
    pass