# Assignment 5 — Task 2: Video Analysing

Abhinav Kumar
12/14/2025

In [1]:
import os
import cv2
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from ultralytics import YOLO

Creating new Ultralytics Settings v0.0.6 file ✅ 
View Ultralytics Settings with 'yolo settings' or at '/home/vscode/.config/Ultralytics/settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.


In [2]:
VIDEO_URL = "https://www.youtube.com/watch?v=EoVTttvKfRs"
VIDEO_PATH = "game7.mp4"

if not os.path.exists(VIDEO_PATH):
    import subprocess
    subprocess.run([
        "yt-dlp",
        "-f", "bv*+ba/b",
        "-o", VIDEO_PATH,
        VIDEO_URL
    ], check=True)

VIDEO_PATH


[youtube] Extracting URL: https://www.youtube.com/watch?v=EoVTttvKfRs
[youtube] EoVTttvKfRs: Downloading webpage




[youtube] EoVTttvKfRs: Downloading android sdkless player API JSON
[youtube] EoVTttvKfRs: Downloading web safari player API JSON




[youtube] EoVTttvKfRs: Downloading m3u8 information




[info] EoVTttvKfRs: Downloading 1 format(s): 398+251
[download] Destination: game7.mp4.f398.mp4
[download] 100% of  819.48MiB in 00:02:22 at 5.76MiB/s      
[download] Destination: game7.mp4.f251.webm
[download] 100% of   93.14MiB in 00:00:15 at 5.97MiB/s     
[Merger] Merging formats into "game7.mp4.webm"
Deleting original file game7.mp4.f398.mp4 (pass -k to keep)
Deleting original file game7.mp4.f251.webm (pass -k to keep)


'game7.mp4'

In [10]:
import os, subprocess

VIDEO_WEBM = "game7.mp4.webm"
CLIP_MP4   = "clip_h264.mp4"

START_TIME = "00:10:00"
END_TIME   = "00:14:30"


if not os.path.exists(CLIP_MP4):
    subprocess.run([
        "ffmpeg", "-y",
        "-ss", START_TIME,
        "-to", END_TIME,
        "-i", VIDEO_WEBM,
        "-c:v", "libx264",
        "-preset", "veryfast",
        "-crf", "23",
        "-c:a", "aac",
        "-b:a", "128k",
        CLIP_MP4
    ], check=True)

WORK_VIDEO = CLIP_MP4
WORK_VIDEO



ffmpeg version 5.1.7-0+deb12u1 Copyright (c) 2000-2025 the FFmpeg developers
  built with gcc 12 (Debian 12.2.0-14+deb12u1)
  configuration: --prefix=/usr --extra-version=0+deb12u1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libglslang --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librist --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libsvtav1 --enable-libtheora --enable-libtwolame --enable-libvidsta

'clip_h264.mp4'

In [11]:
cap = cv2.VideoCapture(WORK_VIDEO)
ok, frame = cap.read()
cap.release()

ok, None if frame is None else frame.shape

(True, (720, 1280, 3))

In [12]:
cap = cv2.VideoCapture(WORK_VIDEO)
fps = cap.get(cv2.CAP_PROP_FPS)
frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
duration = frame_count / fps
fps, frame_count, duration


(29.97002997002997, 8092, 270.00306666666665)

In [13]:
model = YOLO("yolov8n.pt")

In [19]:
PERSON_ID = 0
BALL_ID = 32

def run_detection(video_path, sample_fps=6, min_conf=0.08):
    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS)

    if not fps or fps <= 0:
        cap.release()
        raise RuntimeError("FPS is 0. Video decode failed. Convert to H.264 MP4 first.")

    step = max(1, int(fps / sample_fps))
    out = []
    idx = 0

    while True:
        ret, frame = cap.read()
        if not ret:
            break

        if idx % step == 0:
            t = idx / fps
            res = model.predict(frame, verbose=False)[0]
            boxes = res.boxes

            if boxes is not None and len(boxes) > 0:
                for b in boxes:
                    cls = int(b.cls.item())
                    conf = float(b.conf.item())
                    x1, y1, x2, y2 = map(float, b.xyxy[0].tolist())
                    if cls in [PERSON_ID, BALL_ID] and conf >= min_conf:
                        out.append({
                            "t": t,
                            "cls": cls,
                            "conf": conf,
                            "x1": x1, "y1": y1, "x2": x2, "y2": y2
                        })

        idx += 1

    cap.release()

    cols = ["t","cls","conf","x1","y1","x2","y2"]
    df = pd.DataFrame(out, columns=cols)
    return df

det_df = run_detection(WORK_VIDEO, sample_fps=6)
det_df.head(), det_df["cls"].value_counts(dropna=False)

(     t  cls      conf          x1          y1          x2          y2
 0  0.0    0  0.548220  720.552124  364.043640  831.543091  498.536682
 1  0.0    0  0.491348  395.638977  295.477997  469.079041  453.936676
 2  0.0    0  0.415781  295.825378  232.333374  344.471252  364.916260
 3  0.0    0  0.358772  861.969604  234.385498  915.050415  379.375244
 4  0.0    0  0.338407  336.437561  284.592834  385.239075  407.035400,
 cls
 0     19702
 32       38
 Name: count, dtype: int64)

In [20]:
def centroid(row):
    return ((row.x1 + row.x2) / 2.0, (row.y1 + row.y2) / 2.0)

def dist(a, b):
    return math.hypot(a[0]-b[0], a[1]-b[1])

def simple_tracker(det_df, cls_id, max_jump=80):
    df = det_df[det_df["cls"] == cls_id].copy()
    df = df.sort_values("t")

    tracks = []
    next_id = 1
    last_pos = {}  # track_id -> (t, (cx,cy))

    for _, r in df.iterrows():
        c = centroid(r)
        t = r.t

        best_id = None
        best_d = 1e9
        for tid, (lt, lc) in last_pos.items():
            if t - lt <= 1.0:
                d = dist(c, lc)
                if d < best_d:
                    best_d = d
                    best_id = tid

        if best_id is not None and best_d <= max_jump:
            tid = best_id
        else:
            tid = next_id
            next_id += 1

        last_pos[tid] = (t, c)
        tracks.append({**r.to_dict(), "track_id": tid, "cx": c[0], "cy": c[1]})

    return pd.DataFrame(tracks)

players_tr = simple_tracker(det_df, PERSON_ID, max_jump=120)
ball_tr = simple_tracker(det_df, BALL_ID, max_jump=220)

players_tr.head(), ball_tr.head()


(     t  cls      conf          x1          y1          x2          y2  \
 0  0.0  0.0  0.548220  720.552124  364.043640  831.543091  498.536682   
 1  0.0  0.0  0.491348  395.638977  295.477997  469.079041  453.936676   
 2  0.0  0.0  0.358772  861.969604  234.385498  915.050415  379.375244   
 3  0.0  0.0  0.415781  295.825378  232.333374  344.471252  364.916260   
 4  0.0  0.0  0.338407  336.437561  284.592834  385.239075  407.035400   
 
    track_id          cx          cy  
 0         1  776.047607  431.290161  
 1         2  432.359009  374.707336  
 2         3  888.510010  306.880371  
 3         4  320.148315  298.624817  
 4         4  360.838318  345.814117  ,
            t   cls      conf          x1          y1          x2          y2  \
 0  22.956267  32.0  0.369108  773.240112  530.264526  797.533081  554.408081   
 1  23.089733  32.0  0.327798  764.593140  534.672729  791.467407  563.434204   
 2  28.561867  32.0  0.391434  770.300659  513.988647  802.249634  548.92590

#Task 2.1

In [22]:
def chunk_plays(ball_tr, gap_sec=5.0):
    if ball_tr.empty:
        return []

    df = ball_tr.sort_values("t").copy()
    ts = df["t"].to_numpy()

    plays = []
    start = ts[0]
    last = ts[0]

    for t in ts[1:]:
        if t - last > gap_sec:
            plays.append((start, last))
            start = t
        last = t

    plays.append((start, last))
    return plays

plays = chunk_plays(ball_tr, gap_sec=3.0)
plays[:10], len(plays)


([(np.float64(22.956266666666668), np.float64(23.089733333333335)),
  (np.float64(28.561866666666667), np.float64(28.695333333333334)),
  (np.float64(35.2352), np.float64(35.2352)),
  (np.float64(65.79906666666666), np.float64(65.79906666666666)),
  (np.float64(69.80306666666667), np.float64(69.93653333333333)),
  (np.float64(85.41866666666667), np.float64(85.41866666666667)),
  (np.float64(105.43866666666666), np.float64(105.43866666666666)),
  (np.float64(124.9248), np.float64(124.9248)),
  (np.float64(130.39693333333332), np.float64(130.39693333333332)),
  (np.float64(136.26946666666666), np.float64(136.26946666666666))],
 19)

#Task 2.2

In [26]:
def get_frame_at(video_path, t_sec):
    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS)
    cap.set(cv2.CAP_PROP_POS_FRAMES, int(t_sec * fps))
    ret, frame = cap.read()
    cap.release()
    if not ret:
        raise RuntimeError("Failed to read frame")
    return frame

mid_t = duration / 2
frame0 = get_frame_at(WORK_VIDEO, mid_t)

clicked_points = []

def on_mouse(event, x, y, flags, param):
    global clicked_points
    if event == cv2.EVENT_LBUTTONDOWN:
        clicked_points.append((x, y))
        print(f"Point {len(clicked_points)}: ({x}, {y})")

        cv2.circle(param, (x, y), 6, (0, 0, 255), -1)

clone = frame0.copy()
window_name = "Click 4 court corners in order: TL, TR, BR, BL (press Q to quit)"
cv2.namedWindow(window_name, cv2.WINDOW_NORMAL)
cv2.setMouseCallback(window_name, on_mouse, clone)

while True:
    cv2.imshow(window_name, clone)
    key = cv2.waitKey(20) & 0xFF
    if len(clicked_points) >= 4:
        break
    if key == ord("q"):
        break

cv2.destroyAllWindows()

if len(clicked_points) != 4:
    raise RuntimeError(f"Expected 4 points, got {len(clicked_points)}")

src_pts = np.array(clicked_points, dtype=np.float32)
src_pts

Point 1: (3, 211)
Point 2: (1273, 211)
Point 3: (1276, 502)
Point 4: (6, 502)


array([[          3,         211],
       [       1273,         211],
       [       1276,         502],
       [          6,         502]], dtype=float32)

In [27]:
COURT_W, COURT_H = 940, 500
dst_pts = np.array([
    [0, 0],
    [COURT_W, 0],
    [COURT_W, COURT_H],
    [0, COURT_H]
], dtype=np.float32)

H, _ = cv2.findHomography(src_pts, dst_pts)
H


array([[    0.74016,  -0.0076305,    -0.61044],
       [ 8.6187e-18,      1.7182,     -362.54],
       [          0,           0,           1]])

In [28]:
def warp_point(x, y, H):
    p = np.array([x, y, 1.0], dtype=np.float32).reshape(3,1)
    q = H @ p
    q = q / q[2,0]
    return float(q[0,0]), float(q[1,0])

def render_birdseye(t_sec, players_tr, ball_tr, H, out_path):
    canvas = np.ones((COURT_H, COURT_W, 3), dtype=np.uint8) * 255

    # players near time
    p = players_tr[(players_tr["t"] >= t_sec-0.5) & (players_tr["t"] <= t_sec+0.5)]
    for _, r in p.iterrows():
        x, y = warp_point(r.cx, r.cy, H)
        xi, yi = int(x), int(y)
        if 0 <= xi < COURT_W and 0 <= yi < COURT_H:
            cv2.circle(canvas, (xi, yi), 6, (0,0,0), -1)  # black dots
            cv2.putText(canvas, str(int(r.track_id)), (xi+6, yi-6),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.4, (0,0,0), 1)

    # ball near time
    b = ball_tr[(ball_tr["t"] >= t_sec-0.5) & (ball_tr["t"] <= t_sec+0.5)]
    if not b.empty:
        r = b.iloc[-1]
        x, y = warp_point(r.cx, r.cy, H)
        xi, yi = int(x), int(y)
        if 0 <= xi < COURT_W and 0 <= yi < COURT_H:
            cv2.circle(canvas, (xi, yi), 5, (0,0,255), -1)  # red ball

    cv2.imwrite(out_path, canvas)
    return out_path


In [None]:
def nearest_player_to_ball(t_sec, players_tr, ball_tr, max_dist=80):
    b = ball_tr[(ball_tr["t"] >= t_sec-0.5) & (ball_tr["t"] <= t_sec+0.5)]
    if b.empty:
        return None
    br = b.iloc[-1]
    bx, by = br.cx, br.cy

    p = players_tr[(players_tr["t"] >= t_sec-0.5) & (players_tr["t"] <= t_sec+0.5)]
    if p.empty:
        return None

    best = None
    best_d = 1e9
    for _, r in p.iterrows():
        d = math.hypot(r.cx - bx, r.cy - by)
        if d < best_d:
            best_d = d
            best = r

    if best is None or best_d > max_dist:
        return None

    return int(best.track_id), best_d

def detect_actions_for_play(play_start, play_end, players_tr, ball_tr):
    rows = []
    t = play_start
    step = 1.0

    last_owner = None
    owner_change_count = 0

    while t <= play_end:
        owner = nearest_player_to_ball(t, players_tr, ball_tr)
        owner_id = owner[0] if owner else None

        if last_owner is not None and owner_id is not None and owner_id != last_owner:
            owner_change_count += 1
            rows.append({"t": t, "player_track": last_owner, "action": "Pass (inferred)"})

        if owner_id is not None:
            rows.append({"t": t, "player_track": owner_id, "action": "Dribble/Control (inferred)"})

        last_owner = owner_id
        t += step

    if owner_change_count >= 2 and last_owner is not None:
        rows.append({"t": play_end, "player_track": last_owner, "action": "Assist/Playmaking (inferred)"})

    return rows


In [31]:
def fmt_time(sec: int) -> str:
    h = sec // 3600
    m = (sec % 3600) // 60
    s = sec % 60
    return f"{h}:{m:02d}:{s:02d}" if h else f"{m}:{s:02d}"

In [32]:
os.makedirs("birdseye", exist_ok=True)

output_rows = []
play_id = 0

for (ps, pe) in plays[:8]:
    play_id += 1
    actions = detect_actions_for_play(ps, pe, players_tr, ball_tr)

    for a in actions[:4]:
        t = a["t"]
        img_path = f"birdseye/play{play_id}_t{int(t)}.png"
        render_birdseye(t, players_tr, ball_tr, H, img_path)

        output_rows.append({
            "Timestamp": fmt_time(int(t)),
            "Player": f"Track {a['player_track']}",
            "Action": a["action"],
            "BirdsEyeView": img_path
        })

df_out = pd.DataFrame(output_rows)
df_out.head(20)


Unnamed: 0,Timestamp,Player,Action,BirdsEyeView
0,1:05,Track 237,Dribble/Control (inferred),birdseye/play4_t65.png
1,1:09,Track 259,Dribble/Control (inferred),birdseye/play5_t69.png


In [33]:
df_out.to_csv("task2_actions_with_birdseye.csv", index=False)
"task2_actions_with_birdseye.csv"


'task2_actions_with_birdseye.csv'

In [34]:
counts = []
for i, (ps, pe) in enumerate(plays[:20], start=1):
    actions = detect_actions_for_play(ps, pe, players_tr, ball_tr)
    counts.append((i, ps, pe, pe-ps, len(actions)))

df_counts = pd.DataFrame(counts, columns=["play_idx","start_s","end_s","dur_s","num_actions"])
df_counts


Unnamed: 0,play_idx,start_s,end_s,dur_s,num_actions
0,1,22.956267,23.089733,0.133467,0
1,2,28.561867,28.695333,0.133467,0
2,3,35.2352,35.2352,0.0,0
3,4,65.799067,65.799067,0.0,1
4,5,69.803067,69.936533,0.133467,1
5,6,85.418667,85.418667,0.0,0
6,7,105.438667,105.438667,0.0,0
7,8,124.9248,124.9248,0.0,0
8,9,130.396933,130.396933,0.0,1
9,10,136.269467,136.269467,0.0,1


In [35]:
def consolidate_plays(plays, min_len=2.5, merge_gap=2.0):
    """
    Merge nearby short plays and ensure each play has a usable duration.
    """
    if not plays:
        return []

    merged = [list(plays[0])]

    for s, e in plays[1:]:
        last_s, last_e = merged[-1]

        # merge if close in time
        if s - last_e <= merge_gap:
            merged[-1][1] = e
        else:
            merged.append([s, e])

    final = []
    for s, e in merged:
        if e - s < min_len:
            e = s + min_len
        final.append((s, e))

    return final

plays_fixed = consolidate_plays(plays, min_len=2.5, merge_gap=2.0)


In [38]:
def show_plays(plays, limit=15):
    for i, (s, e) in enumerate(plays[:limit], start=1):
        dur = e - s
        print(f"Play {i}: {fmt_time(int(s))} ({s:.2f}s)  →  {fmt_time(int(e))} ({e:.2f}s)   | duration ≈ {dur:.2f}s")


In [39]:
show_plays(plays_fixed, limit=15)


Play 1: 0:22 (22.96s)  →  0:25 (25.46s)   | duration ≈ 2.50s
Play 2: 0:28 (28.56s)  →  0:31 (31.06s)   | duration ≈ 2.50s
Play 3: 0:35 (35.24s)  →  0:37 (37.74s)   | duration ≈ 2.50s
Play 4: 1:05 (65.80s)  →  1:08 (68.30s)   | duration ≈ 2.50s
Play 5: 1:09 (69.80s)  →  1:12 (72.30s)   | duration ≈ 2.50s
Play 6: 1:25 (85.42s)  →  1:27 (87.92s)   | duration ≈ 2.50s
Play 7: 1:45 (105.44s)  →  1:47 (107.94s)   | duration ≈ 2.50s
Play 8: 2:04 (124.92s)  →  2:07 (127.42s)   | duration ≈ 2.50s
Play 9: 2:10 (130.40s)  →  2:12 (132.90s)   | duration ≈ 2.50s
Play 10: 2:16 (136.27s)  →  2:18 (138.77s)   | duration ≈ 2.50s
Play 11: 2:24 (144.94s)  →  2:27 (147.44s)   | duration ≈ 2.50s
Play 12: 2:38 (158.96s)  →  2:42 (162.43s)   | duration ≈ 3.47s
Play 13: 2:48 (168.57s)  →  2:51 (171.07s)   | duration ≈ 2.50s
Play 14: 3:09 (189.39s)  →  3:11 (191.89s)   | duration ≈ 2.50s
Play 15: 3:41 (221.15s)  →  3:43 (223.65s)   | duration ≈ 2.50s


In [40]:
output_rows = []
play_id = 0

for (ps, pe) in plays_fixed[:10]:
    play_id += 1
    actions = detect_actions_for_play(ps, pe, players_tr, ball_tr)

    for a in actions[:6]:
        t = a["t"]
        img_path = f"birdseye/play{play_id}_t{int(t)}.png"
        render_birdseye(t, players_tr, ball_tr, H, img_path)

        output_rows.append({
            "Timestamp": fmt_time(int(t)),
            "Player": f"Track {a['player_track']}",
            "Action": a["action"],
            "BirdsEyeView": img_path
        })

df_out = pd.DataFrame(output_rows)
len(df_out), df_out.head(20)


(4,
   Timestamp     Player                      Action              BirdsEyeView
 0      1:05  Track 237  Dribble/Control (inferred)    birdseye/play4_t65.png
 1      1:09  Track 259  Dribble/Control (inferred)    birdseye/play5_t69.png
 2      2:10  Track 470  Dribble/Control (inferred)   birdseye/play9_t130.png
 3      2:16  Track 502  Dribble/Control (inferred)  birdseye/play10_t136.png)