# Assignment Solution – Player Re-Identification


In [2]:
import cv2
import numpy as np
import torch
import torchvision.models as models
import torchvision.transforms as T
from ultralytics import YOLO

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# ResNet18 with ImageNet weights, used as a feature extractor: swapping the
# classification head for Identity makes the forward pass return the features
# that would otherwise have been fed to that head.
resnet = models.resnet18(weights='IMAGENET1K_V1')
resnet.fc = torch.nn.Identity()
resnet.to(device)  # Module.to moves the parameters in place
resnet.eval()      # inference mode for layers such as batch norm

# Preprocessing applied to every crop before it reaches the network:
# array -> PIL image -> 224x224 resize -> float tensor -> per-channel normalisation.
# The mean/std values are presumably the usual ImageNet channel statistics,
# matching the IMAGENET1K_V1 weights loaded for the backbone.
transform = T.Compose([
    T.ToPILImage(),
    T.Resize(size=(224, 224)),
    T.ToTensor(),
    T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
])

# Extract feature vector from a cropped image using ResNet18
def extract_embedding(crop):
    """Return the ResNet18 feature vector for an image crop, or None on failure.

    Args:
        crop: Image array accepted by the module-level ``transform`` pipeline
            (the caller in this file passes a slice of a video frame).

    Returns:
        A NumPy array holding the network output with singleton dimensions
        squeezed out, or None when preprocessing or the forward pass raises.
    """
    try:
        # Add a batch dimension and move the tensor to the model's device
        img_tensor = transform(crop).unsqueeze(0).to(device)
        with torch.no_grad():  # inference only: no autograd bookkeeping
            features = resnet(img_tensor)
        return features.squeeze().cpu().numpy()
    except Exception:
        # Best effort: a crop that cannot be embedded is skipped by the caller.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate so the script can still be stopped.
        return None

# Check spatial proximity between two bounding boxes
def is_spatially_near(box1, box2, max_dist=100):
    """Tell whether the centres of two boxes are closer than ``max_dist``.

    Args:
        box1: First box as (x1, y1, x2, y2).
        box2: Second box as (x1, y1, x2, y2).
        max_dist: Exclusive upper bound on the centre-to-centre distance.

    Returns:
        True when the Euclidean distance between the box centres is strictly
        less than ``max_dist``.
    """
    centre_a = ((box1[0] + box1[2]) / 2, (box1[1] + box1[3]) / 2)
    centre_b = ((box2[0] + box2[2]) / 2, (box2[1] + box2[3]) / 2)
    dx = centre_a[0] - centre_b[0]
    dy = centre_a[1] - centre_b[1]
    return np.hypot(dx, dy) < max_dist

# Try to match player based on both appearance and spatial closeness
def match_player_id(embedding, box, player_db, threshold=0.75):
    """Return the ID of the stored player that best matches a detection.

    A stored player qualifies only when its last known box is spatially near
    ``box`` (as judged by ``is_spatially_near``) and the cosine similarity
    between its embedding and ``embedding`` is strictly greater than
    ``threshold``. Among qualifying players the highest similarity wins.

    Args:
        embedding: Feature vector of the new detection.
        box: Bounding box of the new detection as (x1, y1, x2, y2).
        player_db: Mapping of player ID -> (embedding, last known box).
        threshold: Exclusive lower bound on the cosine similarity.

    Returns:
        The best-matching player ID, or None when no stored player qualifies.
    """
    # The query norm does not change inside the loop, so compute it once.
    # A zero vector has no direction: cosine similarity is undefined (0/0),
    # so it can never match and we avoid NumPy's divide warning.
    emb_norm = np.linalg.norm(embedding)
    if emb_norm == 0:
        return None

    best_id = None
    best_score = -1
    for pid, (saved_emb, saved_box) in player_db.items():
        if not is_spatially_near(box, saved_box):
            continue
        saved_norm = np.linalg.norm(saved_emb)
        if saved_norm == 0:
            continue  # same reasoning: undefined similarity, cannot match
        sim = np.dot(embedding, saved_emb) / (emb_norm * saved_norm)
        if sim > threshold and sim > best_score:
            best_score = sim
            best_id = pid
    return best_id

# Load YOLOv8/YOLOv11 model
model = YOLO("best.pt")  # Make sure to use correct weights

# Open input video and prepare output
cap = cv2.VideoCapture("15sec_input_720p.mp4")
if not cap.isOpened():
    # Fail loudly: otherwise the frame loop never runs and an empty output
    # file is written without any indication of what went wrong.
    raise IOError("Could not open input video: 15sec_input_720p.mp4")

# Named property constants instead of the raw indices 3 and 4
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

# Keep the frame rate as a float: truncating e.g. 29.97 to 29 makes the
# output play at a different speed than the input.
fps = cap.get(cv2.CAP_PROP_FPS)
if fps <= 0:
    # Some containers report no frame rate; fall back to a common default
    # so the writer still produces a playable file.
    fps = 30.0
out = cv2.VideoWriter("output.avi", cv2.VideoWriter_fourcc(*'XVID'), fps, (width, height))

# Track player ID assignments
player_db = {}   # id -> (embedding, last known box)
next_id = 0      # next unused player ID

# Frame processing loop
# Wrapped in try/finally so the capture, the writer and the preview window
# are released even when detection or embedding raises part-way through.
try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        result = model(frame)[0]

        # Draw on a copy: crops for later detections must come from the
        # untouched frame, otherwise rectangles and labels of earlier
        # detections leak into their embeddings.
        annotated = frame.copy()

        # IDs already handed out in this frame; one player cannot appear twice
        ids_in_frame = set()

        for det in result.boxes:
            cls_id = int(det.cls.item())
            class_name = result.names[cls_id]

            # Only process 'player' class
            if class_name.lower() != "player":
                continue

            x1, y1, x2, y2 = map(int, det.xyxy[0].tolist())
            if x2 <= x1 or y2 <= y1:
                continue

            crop = frame[y1:y2, x1:x2]
            if crop.size == 0:
                continue

            # OpenCV decodes frames as BGR, while the preprocessing pipeline
            # and the ImageNet-pretrained backbone expect RGB channel order.
            embedding = extract_embedding(cv2.cvtColor(crop, cv2.COLOR_BGR2RGB))
            if embedding is None:
                continue

            box = (x1, y1, x2, y2)

            # Match or assign a new ID, considering only players that have
            # not been matched to another detection in this frame
            candidates = {pid: entry for pid, entry in player_db.items()
                          if pid not in ids_in_frame}
            matched_id = match_player_id(embedding, box, candidates)
            if matched_id is None:
                matched_id = next_id
                next_id += 1

            ids_in_frame.add(matched_id)
            player_db[matched_id] = (embedding, box)

            # Annotate frame with ID
            cv2.rectangle(annotated, (x1, y1), (x2, y2), (0, 255, 0), 2)
            cv2.putText(annotated, f"ID: {matched_id}", (x1, y1 - 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 255), 2)

        out.write(annotated)
        cv2.imshow("Tracking", annotated)
        if cv2.waitKey(1) == 27:  # ESC to stop
            break
finally:
    cap.release()
    out.release()
    cv2.destroyAllWindows()


0: 384x640 1 ball, 16 players, 2 referees, 773.7ms
Speed: 3.9ms preprocess, 773.7ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 18 players, 2 referees, 805.4ms
Speed: 2.8ms preprocess, 805.4ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 ball, 16 players, 2 referees, 783.5ms
Speed: 3.5ms preprocess, 783.5ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 ball, 14 players, 2 referees, 738.3ms
Speed: 2.0ms preprocess, 738.3ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 ball, 14 players, 2 referees, 749.2ms
Speed: 2.2ms preprocess, 749.2ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 ball, 16 players, 2 referees, 759.7ms
Speed: 1.8ms preprocess, 759.7ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 15 players, 2 referees, 770.8ms
Speed: 2.3ms preprocess, 770.8ms inference, 1.2ms postprocess pe

Speed: 3.6ms preprocess, 926.6ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 13 players, 2 referees, 1027.4ms
Speed: 2.3ms preprocess, 1027.4ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 15 players, 2 referees, 939.8ms
Speed: 2.1ms preprocess, 939.8ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 15 players, 2 referees, 954.0ms
Speed: 2.1ms preprocess, 954.0ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 ball, 14 players, 2 referees, 944.1ms
Speed: 3.2ms preprocess, 944.1ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 ball, 15 players, 2 referees, 1013.5ms
Speed: 2.7ms preprocess, 1013.5ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 ball, 15 players, 2 referees, 927.2ms
Speed: 2.1ms preprocess, 927.2ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 14 players


0: 384x640 19 players, 1 referee, 923.7ms
Speed: 3.0ms preprocess, 923.7ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 ball, 15 players, 2 referees, 928.4ms
Speed: 2.6ms preprocess, 928.4ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 16 players, 1 referee, 912.2ms
Speed: 2.3ms preprocess, 912.2ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 15 players, 1 referee, 935.0ms
Speed: 2.3ms preprocess, 935.0ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 ball, 16 players, 1 referee, 922.3ms
Speed: 2.3ms preprocess, 922.3ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 16 players, 1 referee, 912.6ms
Speed: 2.4ms preprocess, 912.6ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 ball, 17 players, 1 referee, 907.9ms
Speed: 3.8ms preprocess, 907.9ms inference, 1.5ms postprocess per image at shape (1, 3


0: 384x640 1 ball, 13 players, 982.6ms
Speed: 2.3ms preprocess, 982.6ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 16 players, 1 referee, 986.4ms
Speed: 2.4ms preprocess, 986.4ms inference, 1.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 13 players, 1 referee, 1016.3ms
Speed: 2.5ms preprocess, 1016.3ms inference, 1.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 13 players, 1 referee, 963.3ms
Speed: 3.2ms preprocess, 963.3ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 13 players, 1 referee, 943.9ms
Speed: 2.7ms preprocess, 943.9ms inference, 1.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 15 players, 1004.6ms
Speed: 3.2ms preprocess, 1004.6ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 14 players, 1061.3ms
Speed: 3.8ms preprocess, 1061.3ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 ball, 14 players, 


0: 384x640 2 balls, 1 goalkeeper, 11 players, 1026.4ms
Speed: 2.5ms preprocess, 1026.4ms inference, 1.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 ball, 1 goalkeeper, 13 players, 1126.9ms
Speed: 2.8ms preprocess, 1126.9ms inference, 1.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 ball, 1 goalkeeper, 12 players, 969.7ms
Speed: 2.5ms preprocess, 969.7ms inference, 1.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 ball, 1 goalkeeper, 10 players, 1054.4ms
Speed: 2.5ms preprocess, 1054.4ms inference, 1.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 3 balls, 8 players, 1122.9ms
Speed: 4.3ms preprocess, 1122.9ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 goalkeeper, 8 players, 1024.5ms
Speed: 3.1ms preprocess, 1024.5ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 11 players, 1078.8ms
Speed: 2.5ms preprocess, 1078.8ms inference, 1.6ms postprocess per 


0: 384x640 1 goalkeeper, 15 players, 945.2ms
Speed: 2.6ms preprocess, 945.2ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 goalkeeper, 13 players, 945.9ms
Speed: 2.6ms preprocess, 945.9ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 goalkeeper, 12 players, 996.4ms
Speed: 2.8ms preprocess, 996.4ms inference, 1.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 goalkeeper, 9 players, 974.6ms
Speed: 2.5ms preprocess, 974.6ms inference, 2.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 15 players, 966.0ms
Speed: 2.6ms preprocess, 966.0ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 14 players, 973.5ms
Speed: 2.4ms preprocess, 973.5ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 13 players, 939.8ms
Speed: 2.2ms preprocess, 939.8ms inference, 1.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 14 players, 998.9ms
Spe


0: 384x640 13 players, 1327.1ms
Speed: 2.5ms preprocess, 1327.1ms inference, 1.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 15 players, 1165.3ms
Speed: 2.9ms preprocess, 1165.3ms inference, 1.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 13 players, 1124.1ms
Speed: 2.6ms preprocess, 1124.1ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 13 players, 1062.8ms
Speed: 3.2ms preprocess, 1062.8ms inference, 1.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 14 players, 1013.3ms
Speed: 3.6ms preprocess, 1013.3ms inference, 2.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 16 players, 1028.6ms
Speed: 2.6ms preprocess, 1028.6ms inference, 1.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 16 players, 964.1ms
Speed: 2.3ms preprocess, 964.1ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 16 players, 969.2ms
Speed: 2.6ms preprocess, 969.2ms inference, 1.