# Configuration

In [None]:
# Install dependencies
!pip install -q torch torchvision transformers accelerate
!pip install -q "qwen-vl-utils[decord]==0.0.8"
!pip install -q xlsxwriter imageio[ffmpeg]

# Mount Google Drive early
from google.colab import drive
drive.mount('/content/drive')

import os
import torch

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [25]:
# Configuration - Edit these paths as needed
CONFIG = {
    "drive_root": "/content/drive/MyDrive/CameraBench",
    "local_root": "/content/camerabench",
    "model_id": "Qwen/Qwen2.5-VL-7B-Instruct", # change here if you want to change the model
    # "model_id": "chancharikm/qwen2.5-vl-7b-cam-motion",
    "max_frames": 32,
    "fps": 8.0,
    "download_workers": 16,
    "convert_workers": 8,
    "batch_size": 1,  # For parallel inference
}

# Create directories (both local and persistent on Drive)
PATHS = {
    "videos_local": f"{CONFIG['local_root']}/videos",
    "videos_drive": f"{CONFIG['drive_root']}/videos_mp4",  # Persistent storage
    "outputs": f"{CONFIG['drive_root']}/outputs",
}

for p in PATHS.values():
    os.makedirs(p, exist_ok=True)

# System info
print(f"‚úì Torch: {torch.__version__}")
print(f"‚úì CUDA: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"‚úì GPU: {torch.cuda.get_device_name(0)}")
print(f"\nüìÅ Persistent videos: {PATHS['videos_drive']}")
print(f"üìÅ Outputs: {PATHS['outputs']}")

‚úì Torch: 2.9.0+cu126
‚úì CUDA: True
‚úì GPU: NVIDIA A100-SXM4-40GB

üìÅ Persistent videos: /content/drive/MyDrive/CameraBench/videos_mp4
üìÅ Outputs: /content/drive/MyDrive/CameraBench/outputs


# Utility Functions

In [27]:
import hashlib
import requests
import numpy as np
import imageio
import shutil
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, as_completed
from tqdm import tqdm
from typing import List, Dict, Optional, Tuple
import random


def safe_filename(url: str) -> str:
    """Generate safe filename from URL."""
    h = hashlib.md5(url.encode()).hexdigest()[:16]
    base = os.path.basename(url).split("?")[0]
    return f"{h}_{base}"


def download_single_file(args: Tuple[int, str, str]) -> Optional[Dict]:
    """Download a single file. Returns dict with metadata or None on failure."""
    idx, url, path = args
    if os.path.exists(path) and os.path.getsize(path) > 0:
        return {"idx": idx, "url": url, "path": path}
    try:
        r = requests.get(url, stream=True, timeout=30)
        if r.status_code == 200:
            with open(path, "wb") as f:
                for chunk in r.iter_content(chunk_size=1024 * 1024):
                    f.write(chunk)
            return {"idx": idx, "url": url, "path": path}
    except Exception as e:
        pass
    return None


def download_videos(dataset, output_dir: str, max_workers: int = 16) -> List[Dict]:
    """Download all videos from dataset in parallel."""
    tasks = []
    for idx, row in enumerate(dataset):
        url = row.get("video") or row.get("Video")
        if not url:
            continue
        fname = safe_filename(url)
        local_path = os.path.join(output_dir, fname if "." in fname else fname + ".gif")
        tasks.append((idx, url, local_path))

    manifest = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(download_single_file, t): t for t in tasks}
        for future in tqdm(as_completed(futures), total=len(futures), desc="Downloading"):
            result = future.result()
            if result:
                manifest.append(result)

    print(f"‚úì Downloaded: {len(manifest)} videos")
    return manifest


def convert_gif_to_mp4(args: Tuple[str, str]) -> Optional[str]:
    """Convert GIF to MP4 with even dimensions."""
    gif_path, mp4_path = args

    if os.path.exists(mp4_path) and os.path.getsize(mp4_path) > 0:
        return mp4_path

    try:
        frames = []
        with imageio.get_reader(gif_path) as reader:
            for frame in reader:
                if isinstance(frame, np.ndarray):
                    if frame.ndim == 2:
                        frame = np.stack([frame] * 3, axis=-1)
                    elif frame.shape[-1] == 4:
                        frame = frame[..., :3]
                frames.append(frame.astype(np.uint8))

        if not frames:
            return None

        # Ensure even dimensions for codec
        h, w = frames[0].shape[:2]
        H, W = h + (h % 2), w + (w % 2)
        if H != h or W != w:
            frames = [
                np.pad(f, ((0, H - h), (0, W - w), (0, 0)), mode='constant')
                for f in frames
            ]

        imageio.mimsave(mp4_path, frames, fps=8.0, codec="libx264",
                        ffmpeg_params=["-pix_fmt", "yuv420p"])
        return mp4_path if os.path.exists(mp4_path) else None
    except Exception:
        return None


def convert_videos(manifest: List[Dict], output_dir: str, max_workers: int = 8) -> List[Dict]:
    """Convert all GIFs to MP4 and store in output directory."""
    convert_tasks = []

    for m in manifest:
        base_name = os.path.basename(m["path"]).rsplit(".", 1)[0] + ".mp4"
        mp4_path = os.path.join(output_dir, base_name)

        if m["path"].lower().endswith(".gif"):
            convert_tasks.append((m, m["path"], mp4_path))
        else:
            # Non-GIF: copy to output dir if not already there
            if not os.path.exists(mp4_path):
                shutil.copy2(m["path"], mp4_path)
            m["mp4_path"] = mp4_path

    # Parallel conversion
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(convert_gif_to_mp4, (t[1], t[2])): t[0] for t in convert_tasks}
        for future in tqdm(as_completed(futures), total=len(futures), desc="Converting"):
            m = futures[future]
            result = future.result()
            m["mp4_path"] = result if result else m["path"]

    # Ensure all entries have mp4_path
    for m in manifest:
        if "mp4_path" not in m:
            base_name = os.path.basename(m["path"]).rsplit(".", 1)[0] + ".mp4"
            m["mp4_path"] = os.path.join(output_dir, base_name)

    valid = [m for m in manifest if os.path.exists(m.get("mp4_path", ""))]
    print(f"‚úì Converted: {len(valid)} videos ready")
    return valid


def load_existing_manifest(video_dir: str, dataset) -> List[Dict]:
    """Load manifest from existing converted videos on Drive."""
    manifest = []

    # Build URL to index mapping
    url_to_idx = {}
    for idx, row in enumerate(dataset):
        url = row.get("video") or row.get("Video")
        if url:
            url_to_idx[safe_filename(url).rsplit(".", 1)[0]] = (idx, url)

    # Scan existing MP4 files
    if os.path.exists(video_dir):
        for fname in os.listdir(video_dir):
            if fname.endswith(".mp4"):
                base = fname.rsplit(".", 1)[0]
                if base in url_to_idx:
                    idx, url = url_to_idx[base]
                    manifest.append({
                        "idx": idx,
                        "url": url,
                        "mp4_path": os.path.join(video_dir, fname)
                    })

    print(f"‚úì Found {len(manifest)} existing converted videos")
    return manifest


def select_samples(manifest: List[Dict], n: Optional[int] = None, seed: int = 42) -> List[Dict]:
    """Randomly select n samples from manifest. If n is None, return all."""
    if n is None or n >= len(manifest):
        return manifest
    random.seed(seed)
    selected = random.sample(manifest, n)
    print(f"‚úì Selected {len(selected)} random samples (seed={seed})")
    return selected


def caption_single(video_path: str, prompt: str, max_frames: int = 32, fps: float = 6.0) -> str:
    """Generate caption for a single video."""
    messages = [{
        "role": "user",
        "content": [
            {"type": "video", "video": video_path, "fps": fps, "max_frames": max_frames},
            {"type": "text", "text": prompt},
        ],
    }]

    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)

    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    ).to(model.device)

    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_new_tokens=256)

    generated_ids_trimmed = [
        out[len(inp):] for inp, out in zip(inputs.input_ids, generated_ids)
    ]

    output = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )[0]

    return output.strip()


def caption_batch(video_paths: List[str], prompt: str, max_frames: int = 32, fps: float = 6.0) -> List[str]:
    """Generate captions for a batch of videos (parallel on GPU)."""
    if not video_paths:
        return []

    messages_batch = []
    for video_path in video_paths:
        messages_batch.append([{
            "role": "user",
            "content": [
                {"type": "video", "video": video_path, "fps": fps, "max_frames": max_frames},
                {"type": "text", "text": prompt},
            ],
        }])

    texts = [
        processor.apply_chat_template(m, tokenize=False, add_generation_prompt=True)
        for m in messages_batch
    ]

    # Process vision info for each
    all_image_inputs = []
    all_video_inputs = []
    for m in messages_batch:
        img_inp, vid_inp = process_vision_info(m)
        all_image_inputs.append(img_inp)
        all_video_inputs.append(vid_inp)

    # Flatten video inputs
    flat_videos = []
    for v in all_video_inputs:
        if v:
            flat_videos.extend(v)

    inputs = processor(
        text=texts,
        images=None,
        videos=flat_videos if flat_videos else None,
        padding=True,
        return_tensors="pt",
    ).to(model.device)

    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_new_tokens=256)

    generated_ids_trimmed = [
        out[len(inp):] for inp, out in zip(inputs.input_ids, generated_ids)
    ]

    outputs = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )

    return [o.strip() for o in outputs]


def run_inference(
    manifest: List[Dict],
    dataset,
    prompts: Dict[str, str],
    batch_size: int = 1,
    max_frames: int = 32,
    fps: float = 6.0
) -> List[Dict]:
    """
    Run inference on manifest with optional batching.

    Args:
        manifest: List of video metadata dicts
        dataset: Original dataset for labels
        prompts: Dict of {caption_name: prompt_text}
        batch_size: Number of videos to process together (1 = sequential)
        max_frames: Max frames per video
        fps: Frames per second to sample

    Returns:
        List of result dicts
    """
    results = []

    if batch_size == 1:
        # Sequential processing (more stable)
        for m in tqdm(manifest, desc="Captioning"):
            video_path = m["mp4_path"]
            idx = m["idx"]

            if not os.path.exists(video_path):
                continue

            try:
                torch.cuda.empty_cache()
                gc.collect()

                result = {
                    "row_idx": idx,
                    "video_url": m["url"],
                    "labels": dataset[idx].get("labels"),
                    "human_caption": dataset[idx].get("caption"),
                }

                for name, prompt in prompts.items():
                    result[name] = caption_single(video_path, prompt, max_frames, fps)

                results.append(result)

            except Exception as e:
                print(f"Error on idx {idx}: {e}")
                continue
    else:
        # Batch processing
        for i in tqdm(range(0, len(manifest), batch_size), desc="Batch Captioning"):
            batch = manifest[i:i + batch_size]
            valid_batch = [(m, m["mp4_path"]) for m in batch if os.path.exists(m["mp4_path"])]

            if not valid_batch:
                continue

            try:
                torch.cuda.empty_cache()
                gc.collect()

                batch_results = [{
                    "row_idx": m["idx"],
                    "video_url": m["url"],
                    "labels": dataset[m["idx"]].get("labels"),
                    "human_caption": dataset[m["idx"]].get("caption"),
                } for m, _ in valid_batch]

                video_paths = [vp for _, vp in valid_batch]

                for name, prompt in prompts.items():
                    try:
                        captions = caption_batch(video_paths, prompt, max_frames, fps)
                        for j, cap in enumerate(captions):
                            batch_results[j][name] = cap
                    except Exception as e:
                        print(f"Batch error for {name}, falling back to sequential: {e}")
                        # Fallback to sequential
                        for j, (_, vp) in enumerate(valid_batch):
                            batch_results[j][name] = caption_single(vp, prompt, max_frames, fps)

                results.extend(batch_results)

            except Exception as e:
                print(f"Batch error at {i}: {e}")
                continue

    print(f"‚úì Completed: {len(results)} videos")
    return results

# Optional: Quick Test on Single Video
def test_single_video(idx: int = 0):
    """Quick test on a single video."""
    if idx >= len(manifest):
        print(f"Index {idx} out of range. Max: {len(manifest)-1}")
        return

    m = manifest[idx]
    print(f"Testing video {m['idx']}: {m['url'][:50]}...")

    for name, prompt in PROMPTS.items():
        result = caption_single(m["mp4_path"], prompt)
        print(f"\n{name}:\n{result}")

# (Optional) Download & Convert Videos

You only need to run this once. All the transformed data will be stored in your google drive.

In [23]:
from datasets import load_dataset

# Load dataset
ds = load_dataset("syCen/CameraBench", split="test")
print(f"‚úì Dataset loaded: {len(ds)} samples")

# Check for existing converted videos first
manifest = load_existing_manifest(PATHS["videos_drive"], ds)

# If no existing videos, download and convert
if len(manifest) == 0:
    print("\nüì• No existing videos found. Downloading...")
    manifest = download_videos(
        ds,
        PATHS["videos_local"],
        max_workers=CONFIG["download_workers"]
    )

    print("\nüîÑ Converting to MP4...")
    manifest = convert_videos(
        manifest,
        PATHS["videos_drive"],  # Store on Drive for persistence
        max_workers=CONFIG["convert_workers"]
    )
else:
    print("‚úì Using existing converted videos from Drive")

print(f"\nüìä Total videos available: {len(manifest)}")

‚úì Dataset loaded: 1071 samples
‚úì Found 1071 existing converted videos
‚úì Using existing converted videos from Drive

üìä Total videos available: 1071


# Load Model

In [24]:
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
import gc

# Load model
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    CONFIG["model_id"],
    torch_dtype="auto",
    device_map="auto"
)

processor = AutoProcessor.from_pretrained(CONFIG["model_id"])
processor.tokenizer.padding_side = "left" # for batch inference

print(f"‚úì Model loaded: {CONFIG['model_id']}")

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

‚úì Model loaded: Qwen/Qwen2.5-VL-7B-Instruct


# Run Inference

In [29]:
CURRICULUM_ICL_PROMPT = """
You are an expert video analyst. You must describe BOTH:

1. How the CAMERA moves (camera motion).
2. What the SCENE contains (objects, people, setting, actions).

You will ALWAYS output EXACTLY TWO lines in this format:

CAMERA_MOTION: <one short sentence only about camera movement>
SCENE_DESCRIPTION: <one or two short sentences only about scene content>

Important rules:
- CAMERA_MOTION must ONLY describe camera motion (pan, tilt, zoom, dolly, truck/slide, roll, orbit, handheld shake, static).
- SCENE_DESCRIPTION must ONLY describe visible scene content (subjects, environment, actions); never mention camera movement.
- Do NOT mention ‚Äúvideo‚Äù, ‚Äúframes‚Äù, ‚Äúshot types‚Äù, or technical terms like ‚ÄúFOV‚Äù.
- Keep both lines concise and natural.
- Never output anything except the two required lines.

ICL Strategy (Curriculum Learning):
You will be shown SIX EXAMPLES ordered from simple ‚Üí medium ‚Üí complex in terms of camera motion.
These examples highlight progressively more difficult motion patterns.
Use the style, clarity, and separation demonstrated in the examples to guide your final output.

EXAMPLES
========
Example 1 (simple: static)
CAMERA_MOTION: The camera remains almost completely still on a tripod with only a slight natural sway.
SCENE_DESCRIPTION: A person stands on a small indoor stage speaking to an audience seated in rows.

Example 2 (simple-medium: slow pan)
CAMERA_MOTION: The camera slowly pans from left to right in a smooth, steady motion.
SCENE_DESCRIPTION: A city skyline with tall buildings and a river appears under a warm sunset glow.

Example 3 (medium: handheld forward)
CAMERA_MOTION: A handheld camera walks forward with small side-to-side shakes.
SCENE_DESCRIPTION: Someone moves through a crowded outdoor market lined with colorful food stalls and pedestrians.

Example 4 (medium-hard: zoom-in)
CAMERA_MOTION: The camera gently zooms in toward the subject while maintaining center alignment.
SCENE_DESCRIPTION: A person sits at a desk in a dim room illuminated by a computer monitor and soft ambient light.

Example 5 (hard: orbit)
CAMERA_MOTION: The camera slowly orbits counterclockwise around the main subject.
SCENE_DESCRIPTION: A skateboarder performs tricks in a lively skatepark surrounded by graffiti-covered walls.

Example 6 (hard: tracking)
CAMERA_MOTION: The camera quickly tracks alongside a moving subject, then eases to a stop.
SCENE_DESCRIPTION: A runner moves along a forest trail lined with tall trees before slowing near a bright clearing.

Now describe the current video using ONLY the two-line format:

CAMERA_MOTION:
SCENE_DESCRIPTION:
"""


CONTRASTIVE_ICL_PROMPT = """
You are an expert video analyst. You must describe BOTH:

1. How the CAMERA moves (camera motion).
2. What the SCENE contains (objects, people, setting, actions).

You will ALWAYS output EXACTLY TWO lines in this format:

CAMERA_MOTION: <one short sentence only about camera movement>
SCENE_DESCRIPTION: <one or two short sentences only about scene content>

Important rules:
- CAMERA_MOTION must ONLY describe how the camera moves (pan, tilt, zoom, dolly, truck/slide, roll, orbit, handheld, static).
- SCENE_DESCRIPTION must ONLY describe visible scene content.
- Strictly avoid mixing the two categories.
- Do NOT mention ‚Äúvideo‚Äù, ‚Äúframes‚Äù, or technical shot terminology.
- Output must be concise, natural, and exactly two lines.

ICL Strategy (Contrastive):
You will see SIX EXAMPLES: four GOOD examples that correctly separate camera and scene, and two BAD examples that incorrectly mix them.
Learn from the GOOD examples only; the BAD examples show mistakes to avoid.

EXAMPLES
========

GOOD Example 1 (static)
CAMERA_MOTION: The camera remains fixed with only a slight natural sway.
SCENE_DESCRIPTION: A speaker stands on a small stage addressing an audience seated in rows.

GOOD Example 2 (pan)
CAMERA_MOTION: The camera slowly pans from left to right in a smooth motion.
SCENE_DESCRIPTION: A city skyline with tall buildings and a river appears under the evening sky.

GOOD Example 3 (handheld)
CAMERA_MOTION: A handheld camera advances forward with gentle side-to-side shakes.
SCENE_DESCRIPTION: A crowded market street is filled with food stalls, signs, and pedestrians.

GOOD Example 4 (orbit)
CAMERA_MOTION: The camera gently orbits around the subject in a half circle.
SCENE_DESCRIPTION: A skateboarder practices tricks in a bright skatepark surrounded by graffiti.

BAD Example 5 (incorrect mixing: scene in camera)
CAMERA_MOTION: A person walks through a narrow alley lined with shops.   <-- WRONG
SCENE_DESCRIPTION: The camera slowly pans across the scene.              <-- WRONG

BAD Example 6 (incorrect mixing: camera in scene)
CAMERA_MOTION: The camera shakes as someone moves through the trail.     <-- WRONG
SCENE_DESCRIPTION: A shaky handheld shot follows the subject closely.    <-- WRONG

Now ignore the BAD examples and describe the current video using ONLY the two-line format:

CAMERA_MOTION:
SCENE_DESCRIPTION:
"""


ROLE_BASED_ICL_PROMPT = """
You are an expert video analyst. You must describe BOTH:

1. How the CAMERA moves (camera motion).
2. What the SCENE contains (objects, people, setting, actions).

You will ALWAYS output EXACTLY TWO lines in this format:

CAMERA_MOTION: <one short sentence only about camera movement>
SCENE_DESCRIPTION: <one or two short sentences only about scene content>

Important rules:
- CAMERA_MOTION must ONLY describe motion types (pan, tilt, zoom, dolly, orbit, handheld, static).
- SCENE_DESCRIPTION must ONLY describe scene subjects, environments, and actions.
- You may not reference ‚Äúvideo‚Äù, ‚Äúframes‚Äù, or technical camera terminology beyond motion.
- Output must be exactly two lines.

ICL Strategy (Role-Based Decomposition):
Imagine TWO coordinated experts analyzing the video:
- The CAMERA OPERATOR describes only how the camera moves.
- The SCENE OBSERVER describes only what appears in the scene.
Their combined perspectives should guide your final output, while still using the required tags.

EXAMPLES
========

Example 1 (static)
CAMERA_MOTION: The camera remains almost completely still with a slight natural sway.
SCENE_DESCRIPTION: A person stands on a small indoor stage speaking to an audience.

Example 2 (slow pan)
CAMERA_MOTION: The camera slowly pans from left to right in a smooth arc.
SCENE_DESCRIPTION: A skyline of tall buildings and a river comes into view at sunset.

Example 3 (handheld)
CAMERA_MOTION: A handheld camera moves forward with small side-to-side shakes.
SCENE_DESCRIPTION: A bustling market street is lined with food stalls and pedestrians walking by.

Example 4 (zoom)
CAMERA_MOTION: The camera gently zooms in on the subject while remaining centered.
SCENE_DESCRIPTION: A person works at a desk lit by a computer monitor in a dim room.

Example 5 (orbit)
CAMERA_MOTION: The camera slowly circles around the main subject in a controlled motion.
SCENE_DESCRIPTION: A skateboarder practices tricks in a bright skatepark surrounded by graffiti.

Example 6 (tracking)
CAMERA_MOTION: The camera quickly tracks alongside a moving subject, then slows down.
SCENE_DESCRIPTION: A runner moves along a forest path lined with tall trees before slowing at a clearing.

Now describe the current video using ONLY the two-line format:

CAMERA_MOTION:
SCENE_DESCRIPTION:
"""


In [30]:
# Define prompts --> put your all your prompt here
PROMPTS = {
    "curriculum_icl": CURRICULUM_ICL_PROMPT,          # Stephen ICL 1
    "contrastive_icl": CONTRASTIVE_ICL_PROMPT,        # Stephen ICL 2
    "role_based_icl": ROLE_BASED_ICL_PROMPT,          # Stephen ICL 3
}


# Optional: Select random subset (set to None for all videos)
NUM_SAMPLES = 200
RANDOM_SEED = 42

# Select samples
selected_manifest = select_samples(manifest, n=NUM_SAMPLES, seed=RANDOM_SEED)

# Run inference
# Use batch_size=1 for stability, increase for speed (may cause OOM)
results = run_inference(
    manifest=selected_manifest,
    dataset=ds,
    prompts=PROMPTS,
    batch_size=CONFIG["batch_size"],
    max_frames=CONFIG["max_frames"],
    fps=CONFIG["fps"],
)

‚úì Selected 200 random samples (seed=42)


Captioning:   0%|          | 0/200 [01:34<?, ?it/s]


KeyboardInterrupt: 

In [None]:
failed_global = [780, 364, 851, 741, 617]

failed_manifest = [m for m in selected_manifest if m["idx"] in failed_global]

len(failed_manifest), failed_manifest

recovered_results = run_inference(
    manifest=failed_manifest,
    dataset=ds,
    prompts=PROMPTS,
    batch_size=1,
    max_frames=14,
    fps=CONFIG["fps"],
)



Captioning: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5/5 [00:39<00:00,  7.91s/it]

‚úì Completed: 5 videos





In [None]:
# Build lookup for recovered rows
fix_map = {r["row_idx"]: r for r in recovered_results}

# Update existing rows
results = [fix_map.get(r["row_idx"], r) for r in results]

# Add missing recovered rows that never appeared in results
existing_ids = {r["row_idx"] for r in results}

missing_to_add = [
    r for r in recovered_results
    if r["row_idx"] not in existing_ids
]

# Append them
results.extend(missing_to_add)

print("Final count:", len(results))


Final count: 200


In [None]:
len(results)

200

# Export Result

In [None]:
import pandas as pd
from datetime import datetime

# Create DataFrame
df = pd.DataFrame(results)

# Generate filename with timestamp
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
n_samples = len(results)
excel_filename = f"qwen25_redone_captions_{n_samples}samples_{timestamp}.xlsx"
EXCEL_PATH = os.path.join(PATHS["outputs"], excel_filename)

# Export to Excel
with pd.ExcelWriter(EXCEL_PATH, engine="xlsxwriter") as writer:
    df.to_excel(writer, index=False, sheet_name="captions")

    # Auto-adjust column widths
    worksheet = writer.sheets["captions"]
    for i, col in enumerate(df.columns):
        max_len = max(df[col].astype(str).map(len).max(), len(col)) + 2
        worksheet.set_column(i, i, min(max_len, 60))

print(f"‚úÖ Saved to: {EXCEL_PATH}")
print(f"üìä Total rows: {len(df)}")
df.head(3)

‚úÖ Saved to: /content/drive/MyDrive/CameraBench/outputs/qwen25_ft_captions_200samples_20251128_072632.xlsx
üìä Total rows: 200


Unnamed: 0,row_idx,video_url,labels,human_caption,curriculum_icl,contrastive_icl,role_based_icl
0,297,https://huggingface.co/datasets/syCen/CameraBe...,"[no-shaking, complex-motion, regular-speed, pe...",The camera ascends smoothly while tilting down...,CAMERA_MOTION: The camera smoothly descends wh...,CAMERA_MOTION: The camera smoothly descends wh...,CAMERA_MOTION: The camera smoothly descends wh...
1,130,https://huggingface.co/datasets/syCen/CameraBe...,"[minimal-shaking, complex-motion, regular-spee...",The camera tilts upward smoothly with minimal ...,CAMERA_MOTION: The camera tilts upward smoothl...,CAMERA_MOTION: The camera tilts upward smoothl...,CAMERA_MOTION: The camera tilts upward smoothl...
2,626,https://huggingface.co/datasets/syCen/CameraBe...,"[minimal-shaking, no-motion, regular-speed]",The camera remains fixed but slightly unsteady...,CAMERA_MOTION: The camera arcs slowly and smoo...,CAMERA_MOTION: The camera remains fixed but sl...,CAMERA_MOTION: The camera remains fixed but sl...


### Sanity Check

In [3]:
# Load pre-trained base model
model_base = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2.5-VL-7B-Instruct",
    torch_dtype="auto",
    device_map="auto"
)
processor_base = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")

# Load fine-tuned CameraBench model
model_ft = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    "chancharikm/qwen2.5-vl-7b-cam-motion",
    torch_dtype="auto",
    device_map="auto"
)
processor_ft = AutoProcessor.from_pretrained("chancharikm/qwen2.5-vl-7b-cam-motion")

print("Loaded both base + FT models ‚úì")


NameError: name 'Qwen2_5_VLForConditionalGeneration' is not defined