<a href="https://colab.research.google.com/github/ayush6233/reidentification_of_moving_object/blob/main/moving_object_reidentification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import cv2
import numpy as np
import torch
import torch.nn as nn
from torchvision import transforms, models
from ultralytics import YOLO
## One thing is certain: the model will work best at some particular threshold. We need to
## determine it empirically by running the model several times and inspecting the results.

In [None]:
# Embedding network: ResNet50 → 128‑D
class EmbeddingNet(nn.Module):
    """Turn a batch of images into unit-length 128-D embedding vectors.

    A ResNet50 loaded with ``pretrained=True`` and stripped of its last
    child module serves as the feature extractor; a linear layer then
    projects those features down to 128 dimensions.

    NOTE(review): the projection layer is freshly initialised here and no
    weights for it are loaded in the visible code.
    """

    def __init__(self):
        super().__init__()
        resnet = models.resnet50(pretrained=True)
        # Keep every child module except the final one.
        trunk_layers = list(resnet.children())[:-1]
        self.features = nn.Sequential(*trunk_layers)
        # Input width is taken from the fc layer that was just dropped.
        self.embed = nn.Linear(resnet.fc.in_features, 128)

    def forward(self, x):
        """Return one L2-normalised 128-D row per sample in ``x``."""
        batch_size = x.size(0)
        feats = self.features(x)
        # Collapse everything after the batch axis into a single vector.
        flat = feats.view(batch_size, -1)
        projected = self.embed(flat)
        return nn.functional.normalize(projected, dim=1)

In [None]:
# Tracker using 128‑D vector
class ReIDTracker:
    """Assign persistent integer IDs to detections by appearance similarity.

    Every known ID owns one template embedding. New embeddings are matched
    greedily against the templates by cosine similarity; anything that
    cannot be matched above ``sim_thresh`` is registered as a new ID.
    Templates are never removed, so an ID can be re-used whenever a
    sufficiently similar embedding shows up again.
    """

    def __init__(self, sim_thresh=0.3):
        """Create an empty tracker.

        Args:
            sim_thresh: minimum cosine similarity for an embedding to be
                matched to an existing template.
        """
        self.next_id = 1              # next unused ID; IDs start at 1 and only grow
        self.templates = {}           # id -> template embedding vector
        self.sim_thresh = sim_thresh

    def _register(self, emb):
        """Store ``emb`` as the template of a brand-new ID and return that ID."""
        oid = self.next_id
        self.next_id += 1
        self.templates[oid] = emb
        return oid

    def assign_ids(self, embeddings):
        """Return one ID per embedding, matched greedily against the templates.

        Args:
            embeddings: sequence of N embedding vectors (NumPy arrays or
                plain numeric sequences) of the same length as the
                stored templates.

        Returns:
            A list of N integer IDs, in the same order as ``embeddings``.
            Within one call each existing ID is handed out at most once.
        """
        # Plain lists used to crash at the template blend below (0.5 * list);
        # np.asarray leaves ndarrays untouched, so dtype and values are unchanged.
        embeddings = [np.asarray(emb) for emb in embeddings]

        # All (new_idx, old_id, cosine similarity) candidates. With no
        # templates yet this stays empty and every embedding gets a new ID.
        sims = []
        for i, emb in enumerate(embeddings):
            emb_norm = np.linalg.norm(emb)
            for oid, tmpl in self.templates.items():
                # 1e-6 guards against division by zero for all-zero vectors.
                sim = float(np.dot(emb, tmpl) / (emb_norm * np.linalg.norm(tmpl) + 1e-6))
                sims.append((i, oid, sim))

        # Best candidates first; the sort is stable, so ties keep build order.
        sims.sort(key=lambda cand: cand[2], reverse=True)

        ids = [None] * len(embeddings)
        taken_old = set()
        for i, oid, sim in sims:
            if sim < self.sim_thresh:
                break  # sorted descending: nothing further can qualify
            if ids[i] is not None or oid in taken_old:
                continue
            ids[i] = oid
            taken_old.add(oid)
            # Blend instead of overwrite: the template becomes the 50/50 mean
            # of the old template and the new observation.
            self.templates[oid] = 0.5 * self.templates[oid] + 0.5 * embeddings[i]

        # Whatever is still unmatched becomes a new identity.
        for i, emb in enumerate(embeddings):
            if ids[i] is None:
                ids[i] = self._register(emb)
        return ids

In [None]:
# Load the YOLO detection checkpoint and inspect it.
model = YOLO("/content/drive/MyDrive/best.pt", task="detect")
print(model.names)  # class-index -> label mapping (index 2 is 'player' in the recorded output)
# info() reports the layer/parameter summary; print() then shows its return value (a tuple in the recorded output).
print(model.info())
# Uncomment to inspect the network further (layers, activation functions, ...):
# print(model.type)

{0: 'ball', 1: 'goalkeeper', 2: 'player', 3: 'referee'}
YOLOv5x summary: 285 layers, 97,203,260 parameters, 0 gradients, 246.9 GFLOPs
(285, 97203260, 0, 246.91051520000002)


In [None]:
# 2. Load embedding network
device = "cpu" ## Switch to "cuda" if a GPU is available
embed_net = EmbeddingNet().to(device).eval()  # eval(): used for inference only in this notebook
# Crop preprocessing: ndarray -> PIL image -> 224x224 resize -> tensor -> per-channel normalisation.
# NOTE(review): the Normalize constants are the usual ImageNet RGB mean/std, so crops should be
# in RGB channel order — OpenCV frames are BGR; confirm the caller converts.
preprocess = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((224,224)),
    transforms.ToTensor(),
    transforms.Normalize([0.485,0.456,0.406],[0.229,0.224,0.225]), ## normalising the inputs makes it easier to pick a similarity cut-off manually later
])




In [None]:
# One tracker instance for the whole video; 0.6 overrides the class default sim_thresh of 0.3.
tracker = ReIDTracker(sim_thresh=0.6) ## track re-ids

In [None]:
# Open the input clip and create a writer that mirrors its frame rate and frame size.
input_path = "/content/drive/MyDrive/15sec_input_720p.mp4"
cap = cv2.VideoCapture(input_path)
if not cap.isOpened():
    # VideoCapture does not raise on a bad path; without this check the
    # writer below would be created as 0x0 and the output file would be empty.
    raise IOError(f"Could not open input video: {input_path}")
fps = cap.get(cv2.CAP_PROP_FPS) or 30  # fall back to 30 when the reported FPS is 0
w   = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
h   = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fourcc = cv2.VideoWriter_fourcc(*"mp4v")
# Frames written later must have exactly this (w, h) size.
out    = cv2.VideoWriter("tracked_output.mp4", fourcc, fps, (w, h))

In [None]:
# Per-frame pipeline: detect players -> embed their crops -> assign re-ID labels -> draw -> write.
i = 0
try:
    while True:
        flag, frame = cap.read()
        i += 1
        # Early stop to keep CPU runs short: with this cap 9 frames are processed.
        # The author measured ~4 s per frame on CPU (~30 min for the full 450-frame clip).
        # For a full render raise the limit (e.g. i >= 500) or drop the check on i.
        if not flag or i >= 10:
            break

        res = model(frame)[0]  # one image in -> take the single result
        xyxy_list = res.boxes.xyxy.tolist()  # top-left / bottom-right corners per box
        confs = res.boxes.conf.tolist()      # detection confidence per box
        classes = res.boxes.cls.tolist()     # class index per box

        bboxes, crops = [], []
        for xyxy, conf, cls in zip(xyxy_list, confs, classes):
            # Keep only class 2 ('player') detections with confidence above 30%.
            if int(cls) == 2 and conf > 0.3:
                x1, y1, x2, y2 = map(int, xyxy)
                crop = frame[y1:y2, x1:x2]
                if crop.size == 0:
                    # Integer truncation can collapse a sliver-thin box to
                    # zero area, which the preprocessing cannot handle.
                    continue
                bboxes.append((x1, y1, x2, y2))
                # OpenCV frames are BGR, while the preprocessing pipeline
                # (PIL image + ImageNet normalisation) expects RGB.
                crops.append(cv2.cvtColor(crop, cv2.COLOR_BGR2RGB))

        embeddings = []
        if crops:  # at least one player in this frame
            batch = torch.stack([preprocess(c) for c in crops]).to(device)
            with torch.no_grad():
                embs = embed_net(batch).cpu().numpy()
            embeddings = list(embs)  # one row per crop, same order as bboxes
        ids = tracker.assign_ids(embeddings)

        # Draw after embedding so the overlay never leaks into the crops.
        for (x1, y1, x2, y2), oid in zip(bboxes, ids):
            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
            cv2.putText(frame, f"P{oid}", (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 0), 2)
        out.write(frame)
finally:
    # Release even if a frame fails, so the output file is finalised.
    cap.release()
    out.release()



0: 384x640 1 ball, 16 players, 2 referees, 62.8ms
Speed: 20.7ms preprocess, 62.8ms inference, 286.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 18 players, 2 referees, 67.4ms
Speed: 3.3ms preprocess, 67.4ms inference, 1.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 ball, 16 players, 2 referees, 67.4ms
Speed: 2.4ms preprocess, 67.4ms inference, 2.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 ball, 14 players, 2 referees, 67.4ms
Speed: 2.4ms preprocess, 67.4ms inference, 2.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 ball, 14 players, 2 referees, 67.4ms
Speed: 9.2ms preprocess, 67.4ms inference, 1.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 ball, 16 players, 2 referees, 67.4ms
Speed: 2.2ms preprocess, 67.4ms inference, 1.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 15 players, 2 referees, 67.5ms
Speed: 2.3ms preprocess, 67.5ms inference, 1.7ms postprocess per image at 