In [1]:
#Libraries and Dependencies installations..

!pip install -U ultralytics
!pip install ultralytics==8.0.196 \
            deep-sort-realtime==1.3.2 \
            scenedetect==0.6.1 \
            opencv-python \
            numpy \
            psutil \
            GPUtil \
            torch \
            torchvision \
            torchaudio

Collecting ultralytics
  Downloading ultralytics-8.3.179-py3-none-any.whl.metadata (37 kB)
Collecting matplotlib>=3.3.0 (from ultralytics)
  Downloading matplotlib-3.10.5-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (11 kB)
Collecting opencv-python>=4.6.0 (from ultralytics)
  Downloading opencv_python-4.12.0.88-cp37-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (19 kB)
Collecting scipy>=1.4.1 (from ultralytics)
  Downloading scipy-1.16.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (61 kB)
Collecting tqdm>=4.64.0 (from ultralytics)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting py-cpuinfo (from ultralytics)
  Downloading py_cpuinfo-9.0.0-py3-none-any.whl.metadata (794 bytes)
Collecting pandas>=1.1.4 (from ultralytics)
  Downloading pandas-2.3.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (91 kB)
Collecting ultralytics-thop>=2.0.0 (from ultralytics)
  Downloading ultralytics_thop

YOLO model loaded successfully with weights_only=False.


In [24]:
!pip install -q gdown
!gdown 1-It9RlqOYAEIDbKB5sHFOlaYRRKUmiCp -O input_file.mp4


[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Downloading...
From (original): https://drive.google.com/uc?id=1-It9RlqOYAEIDbKB5sHFOlaYRRKUmiCp
From (redirected): https://drive.google.com/uc?id=1-It9RlqOYAEIDbKB5sHFOlaYRRKUmiCp&confirm=t&uuid=d2ca2a68-55f4-455e-bcb4-16dc356bd19f
To: /workspace/input_file.mp4
100%|████████████████████████████████████████| 285M/285M [00:04<00:00, 58.1MB/s]


In [5]:
!pip install imageio-ffmpeg

Collecting imageio-ffmpeg
  Downloading imageio_ffmpeg-0.6.0-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Downloading imageio_ffmpeg-0.6.0-py3-none-manylinux2014_x86_64.whl (29.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.5/29.5 MB[0m [31m62.4 MB/s[0m eta [36m0:00:00[0m00:01[0m:00:01[0m
[?25hInstalling collected packages: imageio-ffmpeg
Successfully installed imageio-ffmpeg-0.6.0
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [38]:
import torch

# Original torch.load ko safe rakhna
_torch_load_orig = torch.load

def torch_load_safe(f, *args, **kwargs):
    # Sirf tab patch kare jab weights_only explicitly set na ho
    if "weights_only" not in kwargs:
        kwargs["weights_only"] = False
    return _torch_load_orig(f, *args, **kwargs)

# Ek baar hi patch kare
if torch.load is not torch_load_safe:
    torch.load = torch_load_safe


In [8]:
# === Single cell: Safe YOLO load (no recursion) + full processing ===
import os, time, psutil, platform, subprocess
import imageio_ffmpeg as ffmpeg
import numpy as np
import cv2
import GPUtil
import torch
from functools import partial
from ultralytics import YOLO
from deep_sort_realtime.deepsort_tracker import DeepSort
from scenedetect import VideoManager, SceneManager
from scenedetect.detectors import ContentDetector
from scipy.interpolate import make_interp_spline

# ---------------- CONFIG ----------------
CONFIG = {
    "video_path": "workspace/input_file.mp4",           # change to your input
    "output_path": "output_intermediate.mp4",
    "final_output_path": "output_final.mp4",
    "target_classes": ["person", "car"],
    "yolo_model": "yolov8x.pt",
    "conf_threshold": 0.35,
    "zoom_min": 2.2,
    "zoom_max": 3.4,
    "scene_threshold": 30.0,
    "fps": 24,
    "deep_sort": {"max_age": 30, "n_init": 3, "max_iou_distance": 0.7},
    "yolo_img_size": 640,
    "smoothing_factor": 0.2,
    "zoom_floor_frames": 10
}

device = "cuda" if torch.cuda.is_available() else "cpu"

# ---- Safe loader: temporarily force weights_only=False while loading YOLO ----
def safe_load_yolo(yolo_path, device=None):
    """
    Temporarily call torch.load with weights_only=False while constructing the model,
    then restore the original torch.load implementation. This avoids persistent
    monkey-patching and prevents recursion problems.
    """
    _orig_load = torch.load
    try:
        # Bind weights_only=False to the original loader via partial
        torch.load = partial(_orig_load, weights_only=False)
        print("Loading YOLO model with temporary safe loader (weights_only=False)...")
        model = YOLO(yolo_path)
        if device:
            model.to(device)
        print("YOLO loaded successfully (temporary loader restored afterwards).")
        return model
    finally:
        # Restore original torch.load no matter what
        torch.load = _orig_load

# ----- Load models safely -----
model = safe_load_yolo(CONFIG["yolo_model"], device=device)
deepsort = DeepSort(**CONFIG["deep_sort"])

# ---------------- UTILS ----------------
def detect_scenes(video_path, threshold):
    video_manager = VideoManager([video_path])
    scene_manager = SceneManager()
    scene_manager.add_detector(ContentDetector(threshold=threshold))
    video_manager.set_duration()
    video_manager.start()
    scene_manager.detect_scenes(frame_source=video_manager)
    scene_list = scene_manager.get_scene_list()
    video_manager.release()
    return [(int(start.get_frames()), int(end.get_frames())) for start, end in scene_list]

def crop_zoom(frame, cx, cy, zoom):
    h, w = frame.shape[:2]
    nw, nh = int(w / zoom), int(h / zoom)
    x1, y1 = max(0, cx - nw // 2), max(0, cy - nh // 2)
    x2, y2 = min(w, x1 + nw), min(h, y1 + nh)
    if x2 <= x1 or y2 <= y1:
        return frame
    return cv2.resize(frame[y1:y2, x1:x2], (w, h))

def ema_smooth(prev, current, alpha):
    return current if prev is None else alpha * current + (1 - alpha) * prev

def optical_flow_center(prev_frame, curr_frame, prev_center):
    prev_gray = cv2.cvtColor(prev_frame, cv2.COLOR_BGR2GRAY)
    curr_gray = cv2.cvtColor(curr_frame, cv2.COLOR_BGR2GRAY)
    p0 = np.array([[prev_center]], dtype=np.float32)
    p1, st, _ = cv2.calcOpticalFlowPyrLK(prev_gray, curr_gray, p0, None)
    if p1 is not None and st[0][0] == 1:
        return int(p1[0][0][0]), int(p1[0][0][1])
    return prev_center

def smooth_with_bezier(points):
    if len(points) < 3:
        return points
    x = np.arange(len(points))
    spline = make_interp_spline(x, np.array(points), k=3)
    return spline(np.linspace(0, len(points)-1, len(points)))

# ---------------- MAIN PROCESS ----------------
def process_video(video_path):
    print("Starting processing:", video_path)
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        raise RuntimeError(f"Cannot open video: {video_path}")
    width, height = int(cap.get(3)), int(cap.get(4))
    writer = cv2.VideoWriter(CONFIG["output_path"], cv2.VideoWriter_fourcc(*"mp4v"), CONFIG["fps"], (width, height))
    scene_frames = detect_scenes(video_path, CONFIG["scene_threshold"])
    prev_center, prev_zoom, prev_frame = None, None, None
    zoom_buffer = []

    class_map = {name: idx for idx, name in model.names.items()}
    target_ids = [class_map[c] for c in CONFIG["target_classes"] if c in class_map]

    for (start, end) in scene_frames:
        cap.set(cv2.CAP_PROP_POS_FRAMES, start)
        centers_history = []
        for _f in range(start, end + 1):
            ret, frame = cap.read()
            if frame is None or frame.size == 0:
                continue
            if not ret:
                break

            img = cv2.resize(frame, (CONFIG["yolo_img_size"], CONFIG["yolo_img_size"]))
            results = model.predict(img, conf=CONFIG["conf_threshold"], device=device, verbose=False)
            detections = results[0].boxes.data.cpu().numpy()

            track_inputs = []
            for *xyxy, conf_, cls in detections:
                cls_id = int(cls)
                if cls_id in target_ids:
                    sx, sy = width / CONFIG["yolo_img_size"], height / CONFIG["yolo_img_size"]
                    x1, y1, x2, y2 = map(int, [xyxy[0]*sx, xyxy[1]*sy, xyxy[2]*sx, xyxy[3]*sy])
                    track_inputs.append(([x1, y1, x2-x1, y2-y1], float(conf_), model.names[cls_id]))

            tracks = deepsort.update_tracks(track_inputs, frame=frame)
            if tracks:
                centers = [
                    (int((t.to_ltrb()[0] + t.to_ltrb()[2]) // 2),
                     int((t.to_ltrb()[1] + t.to_ltrb()[3]) // 2))
                    for t in tracks if t.is_confirmed()
                ]
                zooms = []
                for t in tracks:
                    if t.is_confirmed():
                        x1, y1, x2, y2 = map(int, t.to_ltrb())
                        area_ratio = ((x2 - x1) * (y2 - y1)) / (width * height)
                        if area_ratio == 0:
                            continue  # skip this track
                        # area_ratio = ((x2-x1)*(y2-y1)) / (width * height)
                        zooms.append(np.clip(3.0 / area_ratio, CONFIG["zoom_min"], CONFIG["zoom_max"]))
            
                if centers:  # Only process if we have at least one center
                    avg_center = np.mean(centers, axis=0).astype(int)
                    if prev_frame is not None:
                        avg_center = optical_flow_center(prev_frame, frame, tuple(avg_center))
                    avg_zoom = float(np.mean(zooms))
            
                    avg_center = (
                        int(ema_smooth(prev_center[0] if prev_center else avg_center[0],
                                       avg_center[0], CONFIG["smoothing_factor"])),
                        int(ema_smooth(prev_center[1] if prev_center else avg_center[1],
                                       avg_center[1], CONFIG["smoothing_factor"]))
                    )
                    avg_zoom = float(ema_smooth(prev_zoom if prev_zoom else avg_zoom,
                                                avg_zoom, CONFIG["smoothing_factor"]))
            
                    zoom_buffer.append(avg_zoom)
                    if len(zoom_buffer) > CONFIG["zoom_floor_frames"]:
                        zoom_buffer.pop(0)
                    avg_zoom = max(avg_zoom, min(zoom_buffer))
            
                    frame = crop_zoom(frame, avg_center[0], avg_center[1], avg_zoom)
                    prev_center, prev_zoom, prev_frame = avg_center, avg_zoom, frame.copy()


            writer.write(frame)

        # optional: you could reprocess scene using smoothed centers if desired
        if centers_history:
            smoothed = smooth_with_bezier(centers_history)  # unused but available

    cap.release()
    writer.release()

    # Finalize with ffmpeg re-encode (optional)
    ffmpeg_path = ffmpeg.get_ffmpeg_exe()  # already done
    subprocess.run([
            ffmpeg_path, "-y", "-i", CONFIG["output_path"],
            "-c:v", "libx264", "-crf", "23", "-preset", "slow",
            "-r", str(CONFIG["fps"]),
            CONFIG["final_output_path"]
        ], check=True)

    # subprocess.run([
    #     "ffmpeg", "-y", "-i", CONFIG["output_path"],
    #     "-c:v", "libx264", "-crf", "23", "-preset", "slow",
    #     "-r", str(CONFIG["fps"]),
    #     CONFIG["final_output_path"]
    # ], check=True)
    print("Finished:", CONFIG["final_output_path"])

# --------------- RUN ----------------
process_video(CONFIG["video_path"])


Loading YOLO model with temporary safe loader (weights_only=False)...
YOLO loaded successfully (temporary loader restored afterwards).
Starting processing: workspace/input_file.mp4


VideoManager is deprecated and will be removed.
[NULL @ 0x1a5342c0] Invalid NAL unit size (22294813 > 73540).
[NULL @ 0x1a5342c0] missing picture in access unit with size 73544
[h264 @ 0xa6d4cc0] Invalid NAL unit size (22294813 > 73540).
[h264 @ 0xa6d4cc0] Error splitting the input into NAL units.
[NULL @ 0x1a5342c0] Invalid NAL unit size (196608 > 43610).
[NULL @ 0x1a5342c0] missing picture in access unit with size 43614
[h264 @ 0x12e4e1c0] Invalid NAL unit size (196608 > 43610).
[h264 @ 0x12e4e1c0] Error splitting the input into NAL units.
[NULL @ 0x1a5342c0] Invalid NAL unit size (-547593495 > 20550).
[NULL @ 0x1a5342c0] missing picture in access unit with size 20554
[h264 @ 0x50d99c80] Invalid NAL unit size (-547593495 > 20550).
[h264 @ 0x50d99c80] Error splitting the input into NAL units.
[NULL @ 0x1a5342c0] Invalid NAL unit size (8029744 > 51887).
[NULL @ 0x1a5342c0] missing picture in access unit with size 51891
[h264 @ 0x54fbfe80] Invalid NAL unit size (8029744 > 51887).
[h264 

Finished: output_final.mp4


[out#0/mp4 @ 0x7d08a40] video:359315KiB audio:0KiB subtitle:0KiB other streams:0KiB global headers:0KiB muxing overhead: 0.022331%
frame=12058 fps=6.3 q=-1.0 Lsize=  359396KiB time=00:08:22.33 bitrate=5861.0kbits/s speed=0.264x    
[libx264 @ 0x7d287c0] frame I:857   Avg QP:19.42  size: 46383
[libx264 @ 0x7d287c0] frame P:9504  Avg QP:22.26  size: 29586
[libx264 @ 0x7d287c0] frame B:1697  Avg QP:23.64  size: 27695
[libx264 @ 0x7d287c0] consecutive B-frames: 76.8% 11.7%  4.7%  6.8%
[libx264 @ 0x7d287c0] mb I  I16..4: 25.4% 73.4%  1.2%
[libx264 @ 0x7d287c0] mb P  I16..4: 13.4% 40.3%  0.5%  P16..4: 21.8%  3.6%  1.2%  0.0%  0.0%    skip:19.3%
[libx264 @ 0x7d287c0] mb B  I16..4:  3.5% 13.6%  0.7%  B16..8: 28.6%  7.2%  1.2%  direct: 3.6%  skip:41.6%  L0:57.7% L1:34.0% BI: 8.3%
[libx264 @ 0x7d287c0] 8x8 transform intra:74.4% inter:87.7%
[libx264 @ 0x7d287c0] direct mvs  spatial:99.8% temporal:0.2%
[libx264 @ 0x7d287c0] coded y,uvDC,uvAC intra: 38.9% 47.0% 4.0% inter: 14.9% 18.9% 0.2%
[libx264

In [None]:
#Initial Code from last work..

import json, os, psutil, time, torch, GPUtil, platform
from datetime import datetime
import numpy as np
import cv2
from ultralytics import YOLO
from deep_sort_realtime.deepsort_tracker import DeepSort
from scenedetect import VideoManager, SceneManager
from scenedetect.detectors import ContentDetector
import subprocess

# -------- CONFIG --------
CONFIG = {
    "video_path": "/content/drive/MyDrive/Colab Notebooks/input_2.mp4",
    "output_path": "/content/drive/MyDrive/Colab Notebooks/test_output_2.mp4",
    "final_output_path": "/content/drive/MyDrive/Colab Notebooks/with ffmpeg work/test_output_2_final.mp4",  # after ffmpeg
    "target_classes_default": ["person", "car"],
    "yolo_model": "yolov8x.pt",
    "conf_threshold": 0.35,
    "nms_threshold": 0.5,
    "zoom_min": 2.2,
    "zoom_max": 3.4,
    "scene_threshold": 30.0,
    "output_resolution": (3840, 2160),  # 4K UHD
    "fps": 24,
    "deep_sort": {
        "max_age": 30,
        "n_init": 3,
        "max_iou_distance": 0.7
    },
    "yolo_batch_size": 4,
    "yolo_img_size": 640,
    "smoothing_factor": 0.2  # for EMA smoothing of pan/zoom
}

# -------- INIT MODELS --------
device = "cuda" if torch.cuda.is_available() else "cpu"
model = YOLO(CONFIG["yolo_model"])
model.to(device)
deepsort = DeepSort(
    max_age=CONFIG["deep_sort"]["max_age"],
    n_init=CONFIG["deep_sort"]["n_init"],
    max_iou_distance=CONFIG["deep_sort"]["max_iou_distance"]
)

# -------- UTILS --------
def detect_scenes(video_path, threshold):
    video_manager = VideoManager([video_path])
    scene_manager = SceneManager()
    scene_manager.add_detector(ContentDetector(threshold=threshold))
    video_manager.set_duration()
    video_manager.start()
    scene_manager.detect_scenes(frame_source=video_manager)
    scene_list = scene_manager.get_scene_list()
    video_manager.release()
    return [(int(start.get_frames()), int(end.get_frames())) for start, end in scene_list]

def crop_zoom(frame, center_x, center_y, zoom):
    h, w = frame.shape[:2]
    new_w, new_h = int(w / zoom), int(h / zoom)
    x1, y1 = max(0, center_x - new_w // 2), max(0, center_y - new_h // 2)
    x2, y2 = min(w, x1 + new_w), min(h, y1 + new_h)
    if x2 <= x1 or y2 <= y1:
        return frame
    return cv2.resize(frame[y1:y2, x1:x2], (w, h))

def get_hardware_info():
    gpu_info = GPUtil.getGPUs()[0] if torch.cuda.is_available() else None
    return {
        "cpu_physical_cores": psutil.cpu_count(logical=False),
        "cpu_logical_cores": psutil.cpu_count(logical=True),
        "gpu_name": gpu_info.name if gpu_info else "CPU",
        "gpu_memory_total_mb": gpu_info.memoryTotal if gpu_info else None,
        "ram_total_gb": round(psutil.virtual_memory().total / (1024**3), 2),
        "system": platform.system(),
        "python_version": platform.python_version()
    }

# Exponential Moving Average smoothing helper
def ema_smooth(prev, current, alpha):
    if prev is None:
        return current
    return alpha * current + (1 - alpha) * prev

# Placeholder: get per-scene target classes dynamically
def get_classes_for_scene(scene_idx):
    # Example: alternate scenes track only 'person', others track 'person' + 'car'
    if scene_idx % 2 == 0:
        return ["person"]
    else:
        return ["person", "car"]

# -------- MAIN PROCESS --------
def process_video():
    hw_info = get_hardware_info()
    cap = cv2.VideoCapture(CONFIG["video_path"])
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    fps_input = cap.get(cv2.CAP_PROP_FPS)

    writer = cv2.VideoWriter(CONFIG["output_path"], cv2.VideoWriter_fourcc(*"mp4v"), CONFIG["fps"], (width, height))
    scene_frames = detect_scenes(CONFIG["video_path"], CONFIG["scene_threshold"])
    print(f"Detected {len(scene_frames)} scenes.")

    print("Hardware info:", hw_info)
    print(f"Processing video {CONFIG['video_path']} at {width}x{height} px, input FPS: {fps_input}")

    # Tracking stats init
    frame_idx = 0
    start_time = time.time()
    prev_center = None
    prev_zoom = None
    zoom_values = []
    jitter_values = []
    missed_subjects = 0
    total_tracks = 0
    current_scene_idx = 0

    for scene_idx, (scene_start, scene_end) in enumerate(scene_frames):
        # Update target classes dynamically per scene
        target_classes = get_classes_for_scene(scene_idx)
        class_name_to_id = {name: idx for idx, name in model.names.items()}
        target_class_ids = [class_name_to_id.get(c) for c in target_classes if c in class_name_to_id]

        print(f"\nScene {scene_idx+1}/{len(scene_frames)} frames {scene_start}-{scene_end}: tracking classes {target_classes}")

        for f in range(scene_start, scene_end + 1):
            ret, frame = cap.read()
            if not ret:
                break

            # YOLO inference with batch size and img size (resize)
            img = cv2.resize(frame, (CONFIG["yolo_img_size"], CONFIG["yolo_img_size"]))
            results = model.predict(img, conf=CONFIG["conf_threshold"], device=device, verbose=False, batch=CONFIG["yolo_batch_size"])
            detections = results[0].boxes.data.cpu().numpy()

            track_inputs = []
            for *xyxy, conf, cls in detections:
                cls_id = int(cls)
                if cls_id in target_class_ids:
                    # Scale bbox back to original frame size
                    scale_x = width / CONFIG["yolo_img_size"]
                    scale_y = height / CONFIG["yolo_img_size"]
                    x1, y1, x2, y2 = map(int, [xyxy[0]*scale_x, xyxy[1]*scale_y, xyxy[2]*scale_x, xyxy[3]*scale_y])
                    track_inputs.append(([x1, y1, x2 - x1, y2 - y1], conf, model.names[cls_id]))

            tracks = deepsort.update_tracks(track_inputs, frame=frame)

            subjects = []
            for track in tracks:
                if not track.is_confirmed():
                    continue
                total_tracks += 1
                x1, y1, x2, y2 = map(int, track.to_ltrb())
                center_x, center_y = (x1 + x2) // 2, (y1 + y2) // 2
                area_ratio = ((x2 - x1) * (y2 - y1)) / (width * height)
                if area_ratio > 0:
                    zoom = np.clip(3.0 / area_ratio, CONFIG["zoom_min"], CONFIG["zoom_max"])
                    subjects.append((center_x, center_y, zoom))

            if subjects:
                avg_center_x = int(np.mean([s[0] for s in subjects]))
                avg_center_y = int(np.mean([s[1] for s in subjects]))
                avg_zoom_raw = np.mean([s[2] for s in subjects])
                # Smooth center and zoom
                avg_center_x = int(ema_smooth(prev_center[0] if prev_center else None, avg_center_x, CONFIG["smoothing_factor"]))
                avg_center_y = int(ema_smooth(prev_center[1] if prev_center else None, avg_center_y, CONFIG["smoothing_factor"]))
                avg_zoom = ema_smooth(prev_zoom, avg_zoom_raw, CONFIG["smoothing_factor"])
                prev_center = (avg_center_x, avg_center_y)
                prev_zoom = avg_zoom

                frame = crop_zoom(frame, avg_center_x, avg_center_y, avg_zoom)
                zoom_values.append(avg_zoom)

                # Jitter calc: distance between prev and current center
                if prev_center is not None and frame_idx > 0:
                    jitter = np.sqrt((avg_center_x - prev_center[0])**2 + (avg_center_y - prev_center[1])**2)
                    jitter_values.append(jitter)

                print(f"[Frame {frame_idx+1}/{total_frames}] Zoom: {avg_zoom:.2f} Center: ({avg_center_x},{avg_center_y})")
            else:
                missed_subjects += 1
                zoom_values.append(None)
                print(f"[Frame {frame_idx+1}/{total_frames}] No subjects detected")

            writer.write(frame)
            frame_idx += 1

    cap.release()
    writer.release()

    elapsed = time.time() - start_time
    avg_zoom = np.mean([z for z in zoom_values if z is not None]) if zoom_values else None
    avg_jitter = np.mean(jitter_values) if jitter_values else 0
    avg_fps = total_frames / elapsed if elapsed > 0 else 0

    print("\n=== Processing Summary ===")
    print(f"Total frames processed: {total_frames}")
    print(f"Total scenes detected: {len(scene_frames)}")
    print(f"Total processing time (seconds): {elapsed:.2f}")
    print(f"Average FPS: {avg_fps:.2f}")
    print(f"Average zoom level: {avg_zoom:.2f}" if avg_zoom else "No zoom data")
    print(f"Average jitter (pixels/frame): {avg_jitter:.2f}")
    print(f"Missed subjects frames: {missed_subjects}")
    print(f"Total confirmed tracks: {total_tracks}")

    # Call ffmpeg to enforce final format and audio encoding
    ffmpeg_cmd = [
        "ffmpeg", "-y", "-i", CONFIG["output_path"],
        "-c:v", "libx264",
        "-crf", "23",  # adjust as needed 18-28 range
        "-preset", "slow",
        "-r", str(CONFIG["fps"]),
        "-c:a", "aac",
        "-b:a", "320k",
        "-ac", "6",  # 5.1 or 7.1 audio layout can be tuned here
        CONFIG["final_output_path"]
    ]
    print("\nRunning ffmpeg to finalize output with codec and audio settings...")
    subprocess.run(ffmpeg_cmd, check=True)
    print(f"Final video saved at {CONFIG['final_output_path']}")

# ---- RUN THE PROCESS ----
process_video()
