In [1]:
# Install dependencies
!pip install -q torch torchvision transformers accelerate
!pip install -q "qwen-vl-utils[decord]==0.0.8"

import torch
print(f"Torch: {torch.__version__}")
print(f"CUDA: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.5/40.5 MB[0m [31m63.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.6/13.6 MB[0m [31m146.8 MB/s[0m eta [36m0:00:00[0m
[?25hTorch: 2.9.0+cu126
CUDA: True
GPU: NVIDIA A100-SXM4-40GB


In [2]:
import os

# Working directories for this run: a root folder containing a videos folder
# and an outputs folder.
ROOT = "/content/camerabench"
VIDS, OUTS = f"{ROOT}/videos", f"{ROOT}/outputs"

# Create everything up front; exist_ok makes re-running this cell harmless.
for directory in [ROOT, VIDS, OUTS]:
    os.makedirs(directory, exist_ok=True)

print(f"Videos dir: {VIDS}")
print(f"Outputs dir: {OUTS}")

Videos dir: /content/camerabench/videos
Outputs dir: /content/camerabench/outputs


In [3]:
# multi threading download
import os
import hashlib
import requests
from datasets import load_dataset
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

# Load the "test" split of the syCen/CameraBench dataset; its rows are iterated
# below to collect the video URL of each example.
ds = load_dataset("syCen/CameraBench", split="test")

def safe_filename(url):
    """Return a local filename for ``url`` of the form ``<hash>_<basename>``.

    ``<hash>`` is the first 16 hex characters of the MD5 of the full URL (query
    string included), so different URLs sharing a basename do not collide.
    ``<basename>`` is the last segment of the URL *path*.

    The basename is taken from the parsed path rather than from the raw URL
    string, so a ``/`` inside the query string (e.g. ``?next=/a/b``) or a
    ``#fragment`` can no longer leak into the filename.

    Args:
        url: The remote URL as a string.

    Returns:
        The filename (no directory component) as a string.
    """
    # Local import keeps this helper usable without touching the cell's import block.
    from urllib.parse import urlsplit

    digest = hashlib.md5(url.encode()).hexdigest()[:16]
    base = os.path.basename(urlsplit(url).path)
    return f"{digest}_{base}"

def download_file(item):
    """Download one video to disk, reusing a previously downloaded copy if present.

    The body is streamed into ``<path>.part`` and only renamed to ``path`` once
    the whole response has been written. An interrupted transfer therefore never
    leaves a truncated file at ``path`` that the "already downloaded" check
    would accept on a later run.

    Args:
        item: ``(idx, url, path)`` tuple - dataset row index, remote URL and
            destination file path.

    Returns:
        ``{"idx": idx, "url": url, "path": path}`` when the file is available
        locally (fresh download or existing non-empty file); ``None`` when the
        server did not answer with HTTP 200 or the download/write failed.
    """
    idx, url, path = item
    record = {"idx": idx, "url": url, "path": path}

    # Cache hit: a non-empty file is already in place.
    if os.path.exists(path) and os.path.getsize(path) > 0:
        return record

    tmp_path = f"{path}.part"
    try:
        # The context manager releases the connection even on early return/errors.
        with requests.get(url, stream=True, timeout=30) as r:
            if r.status_code != 200:
                return None
            with open(tmp_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=1024 * 1024):  # 1MB chunks
                    if chunk:
                        f.write(chunk)
        # Publish the finished file in a single rename.
        os.replace(tmp_path, path)
        return record
    except Exception:
        # Best-effort download: any failure is reported to the caller as None.
        # (Unlike a bare `except:`, KeyboardInterrupt/SystemExit still propagate.)
        return None
    finally:
        # Remove the leftover partial file after a failed or aborted transfer.
        if os.path.exists(tmp_path):
            try:
                os.remove(tmp_path)
            except OSError:
                pass

# Prepare download tasks: one (row index, url, local path) tuple per dataset row
# that carries a video URL.
tasks = []
for idx, row in enumerate(ds):
    url = row.get("video") or row.get("Video")
    if not url:
        continue
    fname = safe_filename(url)
    # Names without any extension are stored as .gif.
    local_path = os.path.join(VIDS, fname if "." in fname else fname + ".gif")
    tasks.append((idx, url, local_path))

# Parallel download with 16 threads
manifest = []
failed_downloads = []
with ThreadPoolExecutor(max_workers=16) as executor:
    futures = {executor.submit(download_file, t): t for t in tasks}
    for future in tqdm(as_completed(futures), total=len(futures), desc="Downloading"):
        result = future.result()
        if result:
            manifest.append(result)
        else:
            failed_downloads.append(futures[future])

# as_completed yields in completion order, which varies between runs; restore
# dataset order so every downstream step (and the exported sheet) is deterministic.
manifest.sort(key=lambda entry: entry["idx"])

if failed_downloads:
    # Surface failures instead of dropping them silently.
    print(f"Failed downloads: {len(failed_downloads)}")
    for failed_idx, failed_url, _ in sorted(failed_downloads)[:10]:
        print(f"  row {failed_idx}: {failed_url}")

print(f"Downloaded: {len(manifest)} videos")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

test.jsonl: 0.00B [00:00, ?B/s]

Generating test split:   0%|          | 0/1071 [00:00<?, ? examples/s]

Downloading: 100%|██████████| 1071/1071 [00:38<00:00, 27.53it/s]

Downloaded: 1071 videos





In [4]:
# multi threading convert to mp4
import imageio
import numpy as np
from concurrent.futures import ProcessPoolExecutor, as_completed
from tqdm import tqdm

def gif_to_mp4(args):
    """Convert a GIF to an H.264 MP4 with even width and height.

    Frames are read with imageio, reduced/expanded to 3 channels, padded with
    black on the bottom/right edge when a dimension is odd, and encoded at a
    fixed 8 fps with ``yuv420p`` pixel format.

    The video is encoded into a temporary sibling file and renamed to
    ``mp4_path`` only after encoding succeeded, so a crashed or failed encode
    never leaves a truncated file that the "already converted" check would
    accept on a later run.

    Args:
        args: ``(gif_path, mp4_path)`` tuple (a single argument so the function
            can be submitted directly to an executor).

    Returns:
        ``mp4_path`` if the MP4 exists after the call (fresh or previously
        converted); ``None`` if the GIF has no frames or conversion failed.
    """
    gif_path, mp4_path = args

    if os.path.exists(mp4_path) and os.path.getsize(mp4_path) > 0:
        return mp4_path  # Already converted

    # Keep the original extension at the end so the writer still picks the
    # container format from the filename.
    stem, ext = os.path.splitext(mp4_path)
    tmp_path = f"{stem}.tmp{ext}"

    try:
        frames = []
        with imageio.get_reader(gif_path) as reader:
            for frame in reader:
                if isinstance(frame, np.ndarray):
                    if frame.ndim == 2:
                        # Grayscale -> replicate into three channels.
                        frame = np.stack([frame] * 3, axis=-1)
                    elif frame.shape[-1] == 4:
                        # Drop the fourth (alpha) channel.
                        frame = frame[..., :3]
                frames.append(frame.astype(np.uint8))

        if not frames:
            return None

        # Round odd dimensions up to the next even number (needed for yuv420p).
        h, w = frames[0].shape[:2]
        H, W = h + (h % 2), w + (w % 2)
        if H != h or W != w:
            padded = []
            for f in frames:
                canvas = np.zeros((H, W, 3), dtype=np.uint8)
                canvas[:h, :w] = f
                padded.append(canvas)
            frames = padded

        imageio.mimsave(tmp_path, frames, fps=8.0, codec="libx264",
                        ffmpeg_params=["-pix_fmt", "yuv420p"])
        # Publish the finished file in a single rename.
        os.replace(tmp_path, mp4_path)
        return mp4_path if os.path.exists(mp4_path) else None
    except Exception:
        # Best-effort conversion: failures are reported to the caller as None.
        # (Unlike a bare `except:`, KeyboardInterrupt/SystemExit still propagate.)
        return None
    finally:
        # Remove the partial output of a failed encode.
        if os.path.exists(tmp_path):
            try:
                os.remove(tmp_path)
            except OSError:
                pass

# Prepare conversion tasks: GIFs get a sibling .mp4 target, everything else is
# used as-is.
convert_tasks = []
for m in manifest:
    if m["path"].lower().endswith(".gif"):
        mp4_path = m["path"].rsplit(".", 1)[0] + ".mp4"
        convert_tasks.append((m, m["path"], mp4_path))
    else:
        m["mp4_path"] = m["path"]

# Parallel conversion with 8 workers
failed_conversions = []
with ProcessPoolExecutor(max_workers=8) as executor:
    futures = {executor.submit(gif_to_mp4, (t[1], t[2])): t[0] for t in convert_tasks}
    for future in tqdm(as_completed(futures), total=len(futures), desc="Converting"):
        m = futures[future]
        try:
            result = future.result()
        except Exception:
            # A crashed worker should fail this one video, not the whole cell.
            result = None
        if result:
            m["mp4_path"] = result
        else:
            # Fall back to the original file, but remember that we did.
            m["mp4_path"] = m["path"]
            failed_conversions.append(m["path"])

# Defensive: make sure every manifest entry carries an "mp4_path".
for m in manifest:
    m.setdefault("mp4_path", m["path"])

if failed_conversions:
    # Surface failures instead of silently handing GIFs to the captioning step.
    print(f"Conversion failed for {len(failed_conversions)} file(s); using the original file instead:")
    for failed_path in failed_conversions[:10]:
        print(f"  {failed_path}")

print(f"Conversion done! {len(manifest)} videos ready")

Converting: 100%|██████████| 1071/1071 [03:40<00:00,  4.85it/s]

Conversion done! 1071 videos ready





In [None]:
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info

MODEL_ID = "Qwen/Qwen2.5-VL-7B-Instruct"

# `dtype` replaces the deprecated `torch_dtype` keyword (the installed
# transformers release warns about the old spelling). "auto" for both options
# leaves the precision and device placement to the library.
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    dtype="auto",
    device_map="auto",
)

# The processor is called later to build the chat prompt, prepare the
# text/video inputs and decode the generated ids.
processor = AutoProcessor.from_pretrained(MODEL_ID)

print(f"Model loaded: {MODEL_ID}")

`torch_dtype` is deprecated! Use `dtype` instead!


config.json: 0.00B [00:00, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

model-00005-of-00005.safetensors:   0%|          | 0.00/1.09G [00:00<?, ?B/s]

model-00003-of-00005.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00002-of-00005.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00004-of-00005.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00001-of-00005.safetensors:   0%|          | 0.00/3.90G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/216 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

chat_template.json: 0.00B [00:00, ?B/s]

Model loaded: Qwen/Qwen2.5-VL-7B-Instruct


In [None]:
import gc

def caption_video(video_path, prompt, max_frames=32, fps=6.0, max_new_tokens=256):
    """Generate a caption for a video with the given prompt.

    Builds a single-turn chat message containing the video and the text prompt,
    runs it through the module-level ``processor`` and ``model``, and returns
    only the newly generated text (the prompt tokens are cut off).

    Args:
        video_path: Path of the video, passed through in the message's
            ``"video"`` field.
        prompt: Instruction text sent alongside the video.
        max_frames: Value of the ``"max_frames"`` field of the video entry.
        fps: Value of the ``"fps"`` field of the video entry (previously
            hard-coded to 6.0).
        max_new_tokens: Generation budget forwarded to ``model.generate``
            (previously hard-coded to 256).

    Returns:
        The decoded caption with surrounding whitespace stripped.
    """
    messages = [{
        "role": "user",
        "content": [
            {"type": "video", "video": video_path, "fps": fps, "max_frames": max_frames},
            {"type": "text", "text": prompt},
        ],
    }]

    # Render the chat template to text; tokenization happens in processor(...) below.
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)

    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    ).to(model.device)

    generated_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)

    # generate() returns prompt + continuation; keep only the continuation.
    generated_ids_trimmed = [
        out[len(inp):] for inp, out in zip(inputs.input_ids, generated_ids)
    ]

    output = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )[0]

    return output.strip()

In [None]:
import json

PROMPT_CAMERA = "Describe the camera motions."
PROMPT_SCENE = "Describe scenes in detail."

# Every finished video is appended to this JSON-lines file right away, so an
# interrupted multi-hour run resumes where it stopped instead of starting over.
# Delete the file to force regeneration (e.g. after changing prompts or model).
CHECKPOINT_PATH = os.path.join(OUTS, "qwen25_captions.jsonl")

# Reload whatever a previous run already produced.
results = []
if os.path.exists(CHECKPOINT_PATH):
    with open(CHECKPOINT_PATH, "r", encoding="utf-8") as ckpt_in:
        for line in ckpt_in:
            try:
                results.append(json.loads(line))
            except json.JSONDecodeError:
                # Blank line or a record truncated by an interrupted write.
                continue
done_rows = {r["row_idx"] for r in results}
if done_rows:
    print(f"Resuming: {len(done_rows)} videos already captioned")

# If the previous run died mid-write the file may not end with a newline;
# start on a fresh line so the next record is not glued to a truncated one.
needs_newline = os.path.exists(CHECKPOINT_PATH) and os.path.getsize(CHECKPOINT_PATH) > 0

skipped_missing = 0

with open(CHECKPOINT_PATH, "a", encoding="utf-8") as ckpt_out:
    if needs_newline:
        ckpt_out.write("\n")

    for m in tqdm(manifest, desc="Captioning"):
        video_path = m["mp4_path"]
        idx = m["idx"]

        if idx in done_rows:
            continue

        if not os.path.exists(video_path):
            skipped_missing += 1
            continue

        try:
            # Clear memory: collect garbage first so the freed tensors' blocks
            # can actually be released from the CUDA cache.
            gc.collect()
            torch.cuda.empty_cache()

            # Generate camera motion caption
            cap_camera = caption_video(video_path, PROMPT_CAMERA)

            gc.collect()
            torch.cuda.empty_cache()

            # Generate scene description
            cap_scene = caption_video(video_path, PROMPT_SCENE)

            row = ds[idx]
            record = {
                "row_idx": idx,
                "video_url": m["url"],
                "camera_motion": cap_camera,
                "scene_description": cap_scene,
                "labels": row.get("labels"),
                "human_caption": row.get("caption"),
            }
        except Exception as e:
            print(f"Error on {idx}: {e}")
            continue

        results.append(record)
        done_rows.add(idx)
        # Persist immediately; default=str keeps the checkpoint writable even if
        # a dataset field is not a plain JSON type.
        ckpt_out.write(json.dumps(record, ensure_ascii=False, default=str) + "\n")
        ckpt_out.flush()

if skipped_missing:
    print(f"Skipped (video file missing): {skipped_missing}")
print(f"Completed: {len(results)} videos")

Captioning: 100%|██████████| 1071/1071 [4:07:56<00:00, 13.89s/it]

Completed: 1071 videos





In [None]:
!pip install -q xlsxwriter

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/175.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m175.3/175.3 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import pandas as pd
from google.colab import drive

# Mount Google Drive at /content/drive
drive.mount('/content/drive')

# One table row per entry in `results`
df = pd.DataFrame(results)

# Destination workbook (adjust as needed); create its folder if it is missing
EXCEL_PATH = "/content/drive/MyDrive/qwen25_captions.xlsx"
os.makedirs(os.path.dirname(EXCEL_PATH), exist_ok=True)

# Export
with pd.ExcelWriter(EXCEL_PATH, engine="xlsxwriter") as xlsx:
    df.to_excel(xlsx, index=False, sheet_name="captions")
    sheet = xlsx.sheets["captions"]

    # Fit each column to its longest cell (or its header) plus padding,
    # capped at 60 so long captions do not produce huge columns.
    for position, header in enumerate(df.columns):
        longest_cell = df[header].astype(str).map(len).max()
        width = max(longest_cell, len(header)) + 2
        sheet.set_column(position, position, min(width, 60))

print(f"✅ Saved to: {EXCEL_PATH}")
print(f"Total rows: {len(df)}")
df.head(3)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ Saved to: /content/drive/MyDrive/qwen25_captions.xlsx
Total rows: 1071


Unnamed: 0,row_idx,video_url,camera_motion,scene_description,labels,human_caption
0,0,https://huggingface.co/datasets/syCen/CameraBe...,The video begins with a close-up shot of a per...,The video depicts a tense and dramatic scene w...,"[minimal-shaking, complex-motion, regular-spee...",The camera smoothly trucks slightly to the lef...
1,1,https://huggingface.co/datasets/syCen/CameraBe...,The video features a woman standing on a woode...,The video depicts a woman standing on a wooden...,"[minimal-shaking, complex-motion, regular-spee...",The camera smoothly arcs clockwise around the ...
2,2,https://huggingface.co/datasets/syCen/CameraBe...,The video frames depict a dramatic scene of an...,The image depicts a dramatic and chaotic scene...,"[minimal-shaking, complex-motion, regular-spee...",The camera arcs counterclockwise with a smooth...
