In [None]:
#Libraries and Dependencies installations..

!pip install -U ultralytics
!pip install ultralytics==8.0.196 \
            deep-sort-realtime==1.3.2 \
            scenedetect==0.6.1 \
            opencv-python \
            numpy \
            psutil \
            GPUtil \
            torch \
            torchvision \
            torchaudio

In [None]:
import json, os, psutil, time, torch, GPUtil, platform
from datetime import datetime
import numpy as np
import cv2
from ultralytics import YOLO
from deep_sort_realtime.deepsort_tracker import DeepSort
from scenedetect import VideoManager, SceneManager
from scenedetect.detectors import ContentDetector
import subprocess

# -------- CONFIG --------
CONFIG = {
    "video_path": "/content/drive/MyDrive/Colab Notebooks/input_2.mp4",
    "output_path": "/content/drive/MyDrive/Colab Notebooks/test_output_2.mp4",
    "final_output_path": "/content/drive/MyDrive/Colab Notebooks/with ffmpeg work/test_output_2_final.mp4",  # after ffmpeg
    "target_classes_default": ["person", "car"],
    "yolo_model": "yolov8x.pt",
    "conf_threshold": 0.35,
    "nms_threshold": 0.5,
    "zoom_min": 2.2,
    "zoom_max": 3.4,
    "scene_threshold": 30.0,
    "output_resolution": (3840, 2160),  # 4K UHD
    "fps": 24,
    "deep_sort": {
        "max_age": 30,
        "n_init": 3,
        "max_iou_distance": 0.7
    },
    "yolo_batch_size": 4,
    "yolo_img_size": 640,
    "smoothing_factor": 0.2  # for EMA smoothing of pan/zoom
}

# -------- INIT MODELS --------
device = "cuda" if torch.cuda.is_available() else "cpu"
model = YOLO(CONFIG["yolo_model"])
model.to(device)
deepsort = DeepSort(
    max_age=CONFIG["deep_sort"]["max_age"],
    n_init=CONFIG["deep_sort"]["n_init"],
    max_iou_distance=CONFIG["deep_sort"]["max_iou_distance"]
)

# -------- UTILS --------
def detect_scenes(video_path, threshold):
    video_manager = VideoManager([video_path])
    scene_manager = SceneManager()
    scene_manager.add_detector(ContentDetector(threshold=threshold))
    video_manager.set_duration()
    video_manager.start()
    scene_manager.detect_scenes(frame_source=video_manager)
    scene_list = scene_manager.get_scene_list()
    video_manager.release()
    return [(int(start.get_frames()), int(end.get_frames())) for start, end in scene_list]

def crop_zoom(frame, center_x, center_y, zoom):
    h, w = frame.shape[:2]
    new_w, new_h = int(w / zoom), int(h / zoom)
    x1, y1 = max(0, center_x - new_w // 2), max(0, center_y - new_h // 2)
    x2, y2 = min(w, x1 + new_w), min(h, y1 + new_h)
    if x2 <= x1 or y2 <= y1:
        return frame
    return cv2.resize(frame[y1:y2, x1:x2], (w, h))

def get_hardware_info():
    gpu_info = GPUtil.getGPUs()[0] if torch.cuda.is_available() else None
    return {
        "cpu_physical_cores": psutil.cpu_count(logical=False),
        "cpu_logical_cores": psutil.cpu_count(logical=True),
        "gpu_name": gpu_info.name if gpu_info else "CPU",
        "gpu_memory_total_mb": gpu_info.memoryTotal if gpu_info else None,
        "ram_total_gb": round(psutil.virtual_memory().total / (1024**3), 2),
        "system": platform.system(),
        "python_version": platform.python_version()
    }

# Exponential Moving Average smoothing helper
def ema_smooth(prev, current, alpha):
    if prev is None:
        return current
    return alpha * current + (1 - alpha) * prev

# Placeholder: get per-scene target classes dynamically
def get_classes_for_scene(scene_idx):
    # Example: alternate scenes track only 'person', others track 'person' + 'car'
    if scene_idx % 2 == 0:
        return ["person"]
    else:
        return ["person", "car"]

# -------- MAIN PROCESS --------
def process_video():
    hw_info = get_hardware_info()
    cap = cv2.VideoCapture(CONFIG["video_path"])
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    fps_input = cap.get(cv2.CAP_PROP_FPS)

    writer = cv2.VideoWriter(CONFIG["output_path"], cv2.VideoWriter_fourcc(*"mp4v"), CONFIG["fps"], (width, height))
    scene_frames = detect_scenes(CONFIG["video_path"], CONFIG["scene_threshold"])
    print(f"Detected {len(scene_frames)} scenes.")

    print("Hardware info:", hw_info)
    print(f"Processing video {CONFIG['video_path']} at {width}x{height} px, input FPS: {fps_input}")

    # Tracking stats init
    frame_idx = 0
    start_time = time.time()
    prev_center = None
    prev_zoom = None
    zoom_values = []
    jitter_values = []
    missed_subjects = 0
    total_tracks = 0
    current_scene_idx = 0

    for scene_idx, (scene_start, scene_end) in enumerate(scene_frames):
        # Update target classes dynamically per scene
        target_classes = get_classes_for_scene(scene_idx)
        class_name_to_id = {name: idx for idx, name in model.names.items()}
        target_class_ids = [class_name_to_id.get(c) for c in target_classes if c in class_name_to_id]

        print(f"\nScene {scene_idx+1}/{len(scene_frames)} frames {scene_start}-{scene_end}: tracking classes {target_classes}")

        for f in range(scene_start, scene_end + 1):
            ret, frame = cap.read()
            if not ret:
                break

            # YOLO inference with batch size and img size (resize)
            img = cv2.resize(frame, (CONFIG["yolo_img_size"], CONFIG["yolo_img_size"]))
            results = model.predict(img, conf=CONFIG["conf_threshold"], device=device, verbose=False, batch=CONFIG["yolo_batch_size"])
            detections = results[0].boxes.data.cpu().numpy()

            track_inputs = []
            for *xyxy, conf, cls in detections:
                cls_id = int(cls)
                if cls_id in target_class_ids:
                    # Scale bbox back to original frame size
                    scale_x = width / CONFIG["yolo_img_size"]
                    scale_y = height / CONFIG["yolo_img_size"]
                    x1, y1, x2, y2 = map(int, [xyxy[0]*scale_x, xyxy[1]*scale_y, xyxy[2]*scale_x, xyxy[3]*scale_y])
                    track_inputs.append(([x1, y1, x2 - x1, y2 - y1], conf, model.names[cls_id]))

            tracks = deepsort.update_tracks(track_inputs, frame=frame)

            subjects = []
            for track in tracks:
                if not track.is_confirmed():
                    continue
                total_tracks += 1
                x1, y1, x2, y2 = map(int, track.to_ltrb())
                center_x, center_y = (x1 + x2) // 2, (y1 + y2) // 2
                area_ratio = ((x2 - x1) * (y2 - y1)) / (width * height)
                if area_ratio > 0:
                    zoom = np.clip(3.0 / area_ratio, CONFIG["zoom_min"], CONFIG["zoom_max"])
                    subjects.append((center_x, center_y, zoom))

            if subjects:
                avg_center_x = int(np.mean([s[0] for s in subjects]))
                avg_center_y = int(np.mean([s[1] for s in subjects]))
                avg_zoom_raw = np.mean([s[2] for s in subjects])
                # Smooth center and zoom
                avg_center_x = int(ema_smooth(prev_center[0] if prev_center else None, avg_center_x, CONFIG["smoothing_factor"]))
                avg_center_y = int(ema_smooth(prev_center[1] if prev_center else None, avg_center_y, CONFIG["smoothing_factor"]))
                avg_zoom = ema_smooth(prev_zoom, avg_zoom_raw, CONFIG["smoothing_factor"])
                prev_center = (avg_center_x, avg_center_y)
                prev_zoom = avg_zoom

                frame = crop_zoom(frame, avg_center_x, avg_center_y, avg_zoom)
                zoom_values.append(avg_zoom)

                # Jitter calc: distance between prev and current center
                if prev_center is not None and frame_idx > 0:
                    jitter = np.sqrt((avg_center_x - prev_center[0])**2 + (avg_center_y - prev_center[1])**2)
                    jitter_values.append(jitter)

                print(f"[Frame {frame_idx+1}/{total_frames}] Zoom: {avg_zoom:.2f} Center: ({avg_center_x},{avg_center_y})")
            else:
                missed_subjects += 1
                zoom_values.append(None)
                print(f"[Frame {frame_idx+1}/{total_frames}] No subjects detected")

            writer.write(frame)
            frame_idx += 1

    cap.release()
    writer.release()

    elapsed = time.time() - start_time
    avg_zoom = np.mean([z for z in zoom_values if z is not None]) if zoom_values else None
    avg_jitter = np.mean(jitter_values) if jitter_values else 0
    avg_fps = total_frames / elapsed if elapsed > 0 else 0

    print("\n=== Processing Summary ===")
    print(f"Total frames processed: {total_frames}")
    print(f"Total scenes detected: {len(scene_frames)}")
    print(f"Total processing time (seconds): {elapsed:.2f}")
    print(f"Average FPS: {avg_fps:.2f}")
    print(f"Average zoom level: {avg_zoom:.2f}" if avg_zoom else "No zoom data")
    print(f"Average jitter (pixels/frame): {avg_jitter:.2f}")
    print(f"Missed subjects frames: {missed_subjects}")
    print(f"Total confirmed tracks: {total_tracks}")

    # Call ffmpeg to enforce final format and audio encoding
    ffmpeg_cmd = [
        "ffmpeg", "-y", "-i", CONFIG["output_path"],
        "-c:v", "libx264",
        "-crf", "23",  # adjust as needed 18-28 range
        "-preset", "slow",
        "-r", str(CONFIG["fps"]),
        "-c:a", "aac",
        "-b:a", "320k",
        "-ac", "6",  # 5.1 or 7.1 audio layout can be tuned here
        CONFIG["final_output_path"]
    ]
    print("\nRunning ffmpeg to finalize output with codec and audio settings...")
    subprocess.run(ffmpeg_cmd, check=True)
    print(f"Final video saved at {CONFIG['final_output_path']}")

# ---- RUN THE PROCESS ----
process_video()


ERROR:pyscenedetect:VideoManager is deprecated and will be removed.
INFO:pyscenedetect:Loaded 1 video, framerate: 23.976 FPS, resolution: 1920 x 1080
INFO:pyscenedetect:Duration set, start: None, duration: None, end: None.
INFO:pyscenedetect:Downscale factor set to 7, effective resolution: 274 x 154
INFO:pyscenedetect:Detecting scenes...


Detected 224 scenes.
Hardware info: {'cpu_physical_cores': 1, 'cpu_logical_cores': 2, 'gpu_name': 'CPU', 'gpu_memory_total_mb': None, 'ram_total_gb': 12.67, 'system': 'Linux', 'python_version': '3.11.13'}
Processing video /content/drive/MyDrive/Colab Notebooks/input_2.mp4 at 1920x1080 px, input FPS: 23.976023976023978

Scene 1/224 frames 0-86: tracking classes ['person']
[Frame 1/43149] No subjects detected
[Frame 2/43149] No subjects detected
[Frame 3/43149] Zoom: 3.40 Center: (1387,473)
[Frame 4/43149] Zoom: 3.40 Center: (1391,473)
[Frame 5/43149] Zoom: 3.40 Center: (1397,474)
[Frame 6/43149] Zoom: 3.40 Center: (1405,475)
[Frame 7/43149] Zoom: 3.40 Center: (1413,477)
[Frame 8/43149] Zoom: 3.40 Center: (1423,480)
[Frame 9/43149] Zoom: 3.40 Center: (1432,485)
[Frame 10/43149] Zoom: 3.40 Center: (1441,491)
[Frame 11/43149] Zoom: 3.40 Center: (1451,498)
[Frame 12/43149] Zoom: 3.40 Center: (1461,507)
[Frame 13/43149] Zoom: 3.40 Center: (1470,518)
[Frame 14/43149] Zoom: 3.40 Center: (1481,

In [9]:
# ======= Full minimal notebook snippet with safe YOLO loading =======

import json
import os
import psutil
import time
import torch
import GPUtil
import platform
from datetime import datetime
import numpy as np
import cv2

# Import Ultralytics YOLO
from ultralytics import YOLO
from ultralytics.nn.tasks import DetectionModel

# Import torch Sequential container and any other needed
from torch.nn.modules.container import Sequential

# Add these classes to PyTorch safe globals for loading weights (PyTorch 2.6+)
torch.serialization.add_safe_globals([DetectionModel, Sequential])

# Import DeepSort tracker
from deep_sort_realtime.deepsort_tracker import DeepSort

# Import PySceneDetect
from scenedetect import VideoManager, SceneManager
from scenedetect.detectors import ContentDetector


# ==== CONFIG ====
CONFIG = {
    "video_path": "/content/drive/MyDrive/Colab Notebooks/input_2.mp4",
    "output_path": "/content/drive/MyDrive/Colab Notebooks/test_output_2.mp4",
    "target_classes": ["person", "car"],
    "yolo_model": "yolov8x.pt",
    "conf_threshold": 0.35,
    "zoom_min": 2.2,
    "zoom_max": 3.4,
    "scene_threshold": 30.0,
    "fps": 24,
    "deep_sort": {
        "max_age": 30,
        "n_init": 3,
        "max_iou_distance": 0.7
    }
}

# ==== INIT MODELS ====
device = "cuda" if torch.cuda.is_available() else "cpu"
model = YOLO(CONFIG["yolo_model"])  # Loads with safe globals patch
model.to(device)

deepsort = DeepSort(
    max_age=CONFIG["deep_sort"]["max_age"],
    n_init=CONFIG["deep_sort"]["n_init"],
    max_iou_distance=CONFIG["deep_sort"]["max_iou_distance"]
)

CLASS_NAME_TO_ID = {name: idx for idx, name in model.names.items()}
TARGET_CLASS_IDS = [CLASS_NAME_TO_ID.get(name) for name in CONFIG["target_classes"] if name in CLASS_NAME_TO_ID]


# ==== UTILS ====
def detect_scenes(video_path, threshold):
    video_manager = VideoManager([video_path])
    scene_manager = SceneManager()
    scene_manager.add_detector(ContentDetector(threshold=threshold))
    video_manager.set_duration()
    video_manager.start()
    scene_manager.detect_scenes(frame_source=video_manager)
    scene_list = scene_manager.get_scene_list()
    video_manager.release()
    return [(int(start.get_frames()), int(end.get_frames())) for start, end in scene_list]


def crop_zoom(frame, center_x, center_y, zoom):
    h, w = frame.shape[:2]
    new_w, new_h = int(w / zoom), int(h / zoom)
    x1, y1 = max(0, center_x - new_w // 2), max(0, center_y - new_h // 2)
    x2, y2 = min(w, x1 + new_w), min(h, y1 + new_h)
    if x2 <= x1 or y2 <= y1:
        return frame
    return cv2.resize(frame[y1:y2, x1:x2], (w, h))


def get_hardware_info():
    gpu_info = GPUtil.getGPUs()[0] if torch.cuda.is_available() else None
    return {
        "cpu_physical_cores": psutil.cpu_count(logical=False),
        "cpu_logical_cores": psutil.cpu_count(logical=True),
        "gpu_name": gpu_info.name if gpu_info else "CPU",
        "gpu_memory_total_mb": gpu_info.memoryTotal if gpu_info else None,
        "ram_total_gb": round(psutil.virtual_memory().total / (1024**3), 2),
        "system": platform.system(),
        "python_version": platform.python_version()
    }


# ==== MAIN PROCESS ====
def process_video():
    hw_info = get_hardware_info()
    cap = cv2.VideoCapture(CONFIG["video_path"])
    width, height = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    writer = cv2.VideoWriter(CONFIG["output_path"], cv2.VideoWriter_fourcc(*"mp4v"), CONFIG["fps"], (width, height))
    scene_frames = detect_scenes(CONFIG["video_path"], CONFIG["scene_threshold"])

    log_data = {
        "hardware": hw_info,
        "parameters": CONFIG,
        "total_scenes": len(scene_frames),
        "total_frames": total_frames,
        "processing_start": datetime.now().isoformat()
    }

    frame_idx = 0
    start_time = time.time()

    while True:
        ret, frame = cap.read()
        if not ret:
            break

        results = model.predict(frame, conf=CONFIG["conf_threshold"], device=device, verbose=False)
        detections = results[0].boxes.data.cpu().numpy()

        track_inputs = []
        for *xyxy, conf, cls in detections:
            if int(cls) in TARGET_CLASS_IDS:
                x1, y1, x2, y2 = map(int, xyxy)
                track_inputs.append(([x1, y1, x2 - x1, y2 - y1], conf, model.names[int(cls)]))

        tracks = deepsort.update_tracks(track_inputs, frame=frame)

        subjects = []
        for track in tracks:
            if not track.is_confirmed():
                continue
            x1, y1, x2, y2 = map(int, track.to_ltrb())
            center_x, center_y = (x1 + x2) // 2, (y1 + y2) // 2
            area_ratio = ((x2 - x1) * (y2 - y1)) / (width * height)
            if area_ratio > 0:
                zoom = np.clip(3.0 / area_ratio, CONFIG["zoom_min"], CONFIG["zoom_max"])
                subjects.append((center_x, center_y, zoom))

        if subjects:
            avg_center_x = int(np.mean([s[0] for s in subjects]))
            avg_center_y = int(np.mean([s[1] for s in subjects]))
            avg_zoom = np.mean([s[2] for s in subjects])
            frame = crop_zoom(frame, avg_center_x, avg_center_y, avg_zoom)

        writer.write(frame)
        frame_idx += 1

    cap.release()
    writer.release()

    elapsed = time.time() - start_time
    log_data["processing_end"] = datetime.now().isoformat()
    log_data["processing_time_sec"] = elapsed
    log_data["avg_fps"] = total_frames / elapsed
    log_data["output_file"] = CONFIG["output_path"]

    with open(f"log_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json", "w") as f:
        json.dump(log_data, f, indent=4)

    print(f"✅ Done in {elapsed:.2f}s, avg FPS: {log_data['avg_fps']:.2f}")


# ==== RUN ====
process_video()


UnpicklingError: Weights only load failed. This file can still be loaded, to do so you have two options, [1mdo those steps only if you trust the source of the checkpoint[0m. 
	(1) In PyTorch 2.6, we changed the default value of the `weights_only` argument in `torch.load` from `False` to `True`. Re-running `torch.load` with `weights_only` set to `False` will likely succeed, but it can result in arbitrary code execution. Do it only if you got the file from a trusted source.
	(2) Alternatively, to load with `weights_only=True` please check the recommended steps in the following error message.
	WeightsUnpickler error: Unsupported global: GLOBAL ultralytics.nn.modules.Conv was not an allowed global by default. Please use `torch.serialization.add_safe_globals([Conv])` or the `torch.serialization.safe_globals([Conv])` context manager to allowlist this global if you trust this class/function.

Check the documentation of torch.load to learn more about types accepted by default with weights_only https://pytorch.org/docs/stable/generated/torch.load.html.