In [1]:

!pip install -q ultralytics opencv-python-headless scenedetect ffmpeg-python deep_sort_realtime --upgrade

import cv2
import numpy as np
import torch
import os
import time
from ultralytics import YOLO
from deep_sort_realtime.deepsort_tracker import DeepSort
from scenedetect import VideoManager, SceneManager
from scenedetect.detectors import ContentDetector


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.0 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━[0m [32m0.9/1.0 MB[0m [31m27.9 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m18.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m131.6/131.6 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m89.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m102.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m81.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:

# ========== CONFIG ==========
VIDEO_PATH = "/content/drive/MyDrive/Colab Notebooks/input.mp4"  # Update path as needed
OUTPUT_PATH = "test_output_1.mp4"

# ========== PARAMETERS ==========
ZOOM_MIN = 2.2
ZOOM_MAX = 3.4
CONFIDENCE_THRESHOLD = 0.35
SCENE_THRESHOLD = 30.0
TARGET_CLASS = 0  # 'person'


In [3]:
# from ultralytics import YOLO
# ========== INIT ==========
model = YOLO("yolo12x.pt")
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
deepsort = DeepSort(max_age=30)


Downloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yolo12x.pt to 'yolo12x.pt': 100%|██████████| 114M/114M [00:01<00:00, 65.4MB/s]


In [4]:

def detect_scenes(video_path, threshold=SCENE_THRESHOLD):
    video_manager = VideoManager([video_path])
    scene_manager = SceneManager()
    scene_manager.add_detector(ContentDetector(threshold=threshold))
    video_manager.set_duration()
    video_manager.start()
    scene_manager.detect_scenes(frame_source=video_manager)
    scene_list = scene_manager.get_scene_list()
    scene_frames = [(int(start.get_frames()), int(end.get_frames())) for start, end in scene_list]
    video_manager.release()
    return scene_frames


In [5]:

def crop_zoom(frame, center_x, center_y, zoom):
    h, w = frame.shape[:2]
    new_w = int(w / zoom)
    new_h = int(h / zoom)
    x1 = max(0, center_x - new_w // 2)
    y1 = max(0, center_y - new_h // 2)
    x2 = min(w, x1 + new_w)
    y2 = min(h, y1 + new_h)
    if x2 - x1 <= 0 or y2 - y1 <= 0:
        return frame
    cropped = frame[y1:y2, x1:x2]
    return cv2.resize(cropped, (w, h))


In [6]:

def process_video(video_path, output_path):
    cap = cv2.VideoCapture(video_path)
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    original_fps = cap.get(cv2.CAP_PROP_FPS)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    writer = cv2.VideoWriter(output_path, cv2.VideoWriter_fourcc(*"mp4v"), original_fps, (width, height))

    scene_frames = detect_scenes(video_path)
    print(f"Detected {len(scene_frames)} scenes.")

    frame_idx = 0
    start_time = time.time()

    while True:
        ret, frame = cap.read()
        if not ret:
            break

        scene_fps = original_fps
        for s_start, s_end in scene_frames:
            if s_start <= frame_idx <= s_end:
                scene_duration = (s_end - s_start) / original_fps
                scene_fps = 30 if scene_duration > 5 else original_fps
                break

        results = model.predict(frame, conf=CONFIDENCE_THRESHOLD, device=device, verbose=False)
        detections = results[0].boxes.data.cpu().numpy()
        track_inputs = []

        for *xyxy, conf, cls in detections:
            if int(cls) == TARGET_CLASS:
                x1, y1, x2, y2 = map(int, xyxy)
                track_inputs.append(([x1, y1, x2 - x1, y2 - y1], conf, 'person'))

        tracks = deepsort.update_tracks(track_inputs, frame=frame)

        subjects = []
        for track in tracks:
            if not track.is_confirmed():
                continue
            x1, y1, x2, y2 = map(int, track.to_ltrb())
            center_x, center_y = (x1 + x2) // 2, (y1 + y2) // 2
            area_ratio = ((x2 - x1) * (y2 - y1)) / (width * height)
            if area_ratio == 0:
                continue
            zoom = np.clip(3.0 / area_ratio, ZOOM_MIN, ZOOM_MAX)
            subjects.append((center_x, center_y, zoom))

        if subjects:
            avg_center_x = int(np.mean([s[0] for s in subjects]))
            avg_center_y = int(np.mean([s[1] for s in subjects]))
            avg_zoom = np.mean([s[2] for s in subjects])
            frame = crop_zoom(frame, avg_center_x, avg_center_y, avg_zoom)

        writer.write(frame)
        frame_idx += 1

        elapsed = time.time() - start_time
        fps = frame_idx / elapsed if elapsed > 0 else 0
        eta = (total_frames - frame_idx) / fps if fps > 0 else 0
        pct = (frame_idx / total_frames) * 100
        print(f"[{frame_idx}/{total_frames}] {pct:.1f}% - {fps:.2f} FPS - ETA: {eta:.1f}s")

    cap.release()
    writer.release()
    print(f"✅ Done in {time.time() - start_time:.2f} seconds")


In [None]:
process_video(VIDEO_PATH, OUTPUT_PATH)


ERROR:pyscenedetect:VideoManager is deprecated and will be removed.
INFO:pyscenedetect:Loaded 1 video, framerate: 23.976 FPS, resolution: 1920 x 1080
INFO:pyscenedetect:Duration set, start: None, duration: None, end: None.
INFO:pyscenedetect:Detecting scenes...


Detected 257 scenes.
[1/31151] 0.0% - 0.19 FPS - ETA: 160968.5s
[2/31151] 0.0% - 0.25 FPS - ETA: 127025.7s
[3/31151] 0.0% - 0.28 FPS - ETA: 109291.4s
[4/31151] 0.0% - 0.31 FPS - ETA: 100210.0s
[5/31151] 0.0% - 0.33 FPS - ETA: 94966.4s
[6/31151] 0.0% - 0.33 FPS - ETA: 93852.7s
[7/31151] 0.0% - 0.33 FPS - ETA: 95133.1s
[8/31151] 0.0% - 0.34 FPS - ETA: 92497.6s
[9/31151] 0.0% - 0.34 FPS - ETA: 90436.9s
[10/31151] 0.0% - 0.35 FPS - ETA: 88749.7s
[11/31151] 0.0% - 0.35 FPS - ETA: 88082.1s
[12/31151] 0.0% - 0.35 FPS - ETA: 89971.7s
[13/31151] 0.0% - 0.35 FPS - ETA: 88805.8s
[14/31151] 0.0% - 0.35 FPS - ETA: 87733.6s
[15/31151] 0.0% - 0.36 FPS - ETA: 86696.9s
[16/31151] 0.1% - 0.36 FPS - ETA: 85775.3s
[17/31151] 0.1% - 0.36 FPS - ETA: 87084.9s
[18/31151] 0.1% - 0.36 FPS - ETA: 86843.0s
[19/31151] 0.1% - 0.36 FPS - ETA: 86169.4s
[20/31151] 0.1% - 0.36 FPS - ETA: 85566.5s
[21/31151] 0.1% - 0.37 FPS - ETA: 84989.7s
[22/31151] 0.1% - 0.36 FPS - ETA: 85626.0s
[23/31151] 0.1% - 0.36 FPS - ETA: 8592


## QA Notes & Validation

### Libraries Used & Rationale
| Library | Use | Rationale |
|--------|-----|-----------|
| YOLOv8 | Detection | High-speed, high-accuracy object detection |
| Deep SORT | Tracking | Real-time re-identification with long-term memory |
| OpenCV | Video I/O | Fast GPU-compatible frame processing |
| SceneDetect | Scene splitting | High-confidence video segmentation |
| Torch | Inference backend | Optimized for YOLO inference with CUDA |

###  QA Parameters to Validate
- Smooth zooming around detected subject(s)
- No jitter between frames while zooming/panning
- Subjects tracked even through partial occlusions
- Scene changes respect logical breaks (SceneDetect verified)
- Frame rate and resolution are preserved
- GPU load and CUDA utilization (`nvidia-smi` check)

---

Ready to extend with:
-  Social media cropping presets (IG, TikTok, etc.)
-  Push outputs to cloud
-  CI/CD automation via GitHub Actions
