In [None]:
pip install opencv-python requests datasets braintrust autoevals openai


In [1]:
import os
import base64
from typing import List, Dict, Any, Optional
import asyncio

import cv2
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

from datasets import load_dataset

import braintrust
import autoevals
from openai import OpenAI


# Frame-extraction settings used by extract_frames_b64.
NUM_FRAMES = 32  # upper bound on the number of frames read from each video
TARGET_DIMENSIONS = (512, 512)  # size handed to cv2.resize for every frame
JPEG_QUALITY = 80  # cv2.IMWRITE_JPEG_QUALITY value used when encoding frames

# HTTP retry policy mounted on the requests session in load_data_subset.
RETRY_TOTAL = 3
RETRY_BACKOFF = 0.5
STATUS_FORCELIST = [502, 503, 504]

# Only fall back to the placeholder when no key is configured yet, so a real
# BRAINTRUST_API_KEY already exported in the environment is never overwritten.
os.environ.setdefault("BRAINTRUST_API_KEY", "YOUR_API_KEY_HERE")

# OpenAI client pointed at the Braintrust proxy endpoint and wrapped with
# braintrust.wrap_openai.
client = braintrust.wrap_openai(
    OpenAI(
        api_key=os.environ["BRAINTRUST_API_KEY"],
        base_url="https://api.braintrust.dev/v1/proxy",
    )
)


def extract_frames_b64(video_path: str) -> List[str]:
    """Return the leading frames of a video as base64-encoded JPEG strings.

    At most NUM_FRAMES frames are read, starting from the beginning of the
    video. Each frame is resized to TARGET_DIMENSIONS and JPEG-encoded with
    JPEG_QUALITY. A frame whose encoding fails is dropped but still counts
    toward the NUM_FRAMES limit.

    Args:
        video_path: Location passed straight to ``cv2.VideoCapture``.

    Returns:
        The base64 strings of the successfully encoded frames, in read order.
        The list is empty when the capture cannot be opened or read.
    """
    encoded_frames = []
    capture = cv2.VideoCapture(video_path)
    jpeg_params = [int(cv2.IMWRITE_JPEG_QUALITY), JPEG_QUALITY]

    try:
        for _ in range(NUM_FRAMES):
            if not capture.isOpened():
                break

            grabbed, image = capture.read()
            if not grabbed:
                # End of stream (or a read error): nothing more to collect.
                break

            resized = cv2.resize(image, TARGET_DIMENSIONS)
            encoded_ok, jpeg_buffer = cv2.imencode(".jpg", resized, jpeg_params)
            if not encoded_ok:
                continue

            encoded_frames.append(base64.b64encode(jpeg_buffer).decode("utf-8"))
    finally:
        # Release the capture handle no matter how the loop ended.
        capture.release()

    return encoded_frames


def get_video_data(video_path: str, session: requests.Session) -> Optional[bytes]:
    """Fetch the raw bytes of a video from a URL or from the local filesystem.

    Args:
        video_path: Either a path starting with ``"http"`` (fetched through
            ``session`` with a 10 second timeout) or a local file path.
        session: The requests session used for remote downloads.

    Returns:
        The video content as bytes, or ``None`` when retrieval fails for any
        reason. The failure is reported with ``print`` rather than raised.
    """
    try:
        if not video_path.startswith("http"):
            with open(video_path, "rb") as video_file:
                return video_file.read()

        reply = session.get(video_path, timeout=10)
        reply.raise_for_status()
        return reply.content
    except Exception as err:
        # Best effort: callers receive None instead of an exception.
        print(f"Error retrieving video data from {video_path}: {err}")
        return None


def load_data_subset() -> List[Dict[str, Any]]:
    """Build evaluation cases from the first 20 rows of the MMVU validation split.

    For every row the video is decoded into base64 frames with
    ``extract_frames_b64`` and its raw bytes are fetched with
    ``get_video_data`` so they can be attached to the case.

    Returns:
        A list of dicts, each with three keys:
        ``"input"`` (frames, question, question type, choices and the video
        attachment), ``"expected"`` (the reference answer) and ``"metadata"``
        (subject, textbook and question type).
    """
    ds = load_dataset("yale-nlp/MMVU", split="validation[:20]")

    retry = Retry(
        total=RETRY_TOTAL,
        backoff_factor=RETRY_BACKOFF,
        status_forcelist=STATUS_FORCELIST,
    )
    adapter = HTTPAdapter(max_retries=retry)

    data_list = []
    # The context manager closes the session even when processing a row
    # raises, so the connection pool is not leaked on failure.
    with requests.Session() as session:
        session.mount("http://", adapter)
        session.mount("https://", adapter)

        for row in ds:
            question_type = row["question_type"]
            video_path = row["video"]

            frames_b64 = extract_frames_b64(video_path)
            # NOTE(review): get_video_data returns None on failure, in which
            # case the attachment below is built with data=None - confirm
            # that braintrust.Attachment tolerates this.
            raw_video = get_video_data(video_path, session)

            # Choices are only kept for multiple-choice questions.
            choices_data = (
                row.get("choices") if question_type == "multiple-choice" else None
            )

            data_list.append(
                {
                    "input": {
                        "frames_b64": frames_b64,
                        "question": row["question"],
                        "question_type": question_type,
                        "choices": choices_data,
                        "video_attachment": braintrust.Attachment(
                            filename=os.path.basename(video_path),
                            content_type="video/mp4",
                            data=raw_video,
                        ),
                    },
                    "expected": {"answer": row["answer"]},
                    "metadata": {
                        "subject": row["metadata"]["subject"],
                        "textbook": row["metadata"]["textbook"],
                        "question_type": question_type,
                    },
                }
            )

    return data_list


def video_qa(input_dict: Dict[str, Any]) -> str:
    """Ask gpt-4o a question about a video, given frames extracted from it.

    Args:
        input_dict: Mapping with the keys
            ``"frames_b64"`` (list of base64-encoded JPEG frames),
            ``"question"`` (the question text),
            ``"question_type"`` (optional, defaults to ``"open-ended"``) and
            ``"choices"`` (optional; a dict of label -> option or a sequence
            of options, used only for ``"multiple-choice"`` questions).

    Returns:
        The text content of the first completion choice, or an empty string
        when the response carries no text content.
    """
    frames_b64 = input_dict["frames_b64"]
    question = input_dict["question"]
    question_type = input_dict.get("question_type", "open-ended")
    choices_data = input_dict.get("choices")

    # Report the number of frames actually sent: extraction can yield fewer
    # than the configured maximum (short video, failed decode), and the prompt
    # should not claim frames the model never received.
    frame_count = len(frames_b64)

    content_blocks = [
        {
            "type": "image_url",
            "image_url": {"url": f"data:image/jpeg;base64,{b64}", "detail": "low"},
        }
        for b64 in frames_b64
    ]

    if question_type == "multiple-choice" and choices_data:
        if isinstance(choices_data, dict):
            options_text = "\n".join(
                f"{key}: {value}" for key, value in choices_data.items()
            )
        else:
            # Unlabelled options get letters A, B, C, ... by position.
            options_text = "\n".join(
                f"{chr(65 + i)}: {option}" for i, option in enumerate(choices_data)
            )
        prompt_text = (
            f"You just saw {frame_count} frames from a video. Based on what you see, "
            f"answer the following question: {question}.\n\n"
            f"Here are your options:\n{options_text}\n"
            "Choose the correct option in the format 'answer: X'. If uncertain, guess. You MUST pick something."
        )
    else:
        prompt_text = (
            f"You just saw {frame_count} frames from a video. "
            f"Answer the following question: {question}.\n"
            "If uncertain, guess. Provide the best possible answer. You MUST answer to the best of your ability."
        )

    # The text prompt goes after the frames in the user message.
    content_blocks.append({"type": "text", "text": prompt_text})

    messages = [
        {
            "role": "system",
            "content": [
                {
                    "type": "text",
                    "text": (
                        "You are a helpful assistant. Provide an answer even if you are uncertain."
                    ),
                }
            ],
        },
        {"role": "user", "content": content_blocks},
    ]

    response = client.chat.completions.create(model="gpt-4o", messages=messages)
    answer = response.choices[0].message.content
    # The message content may be None; honour the declared ``str`` return type.
    return answer if answer is not None else ""


# LLM-based scorer: the judge is shown the model output and the expected
# answer and must reply Y (scored 1) or N (scored 0).
judge_scorer = autoevals.LLMClassifier(
    name="LLMJudge",
    use_cot=True,
    choice_scores={"Y": 1, "N": 0},
    # Sections are separated by blank lines; the {{...}} placeholders are
    # left verbatim in the template string.
    prompt_template="\n\n".join(
        [
            "You are a judge evaluating a model's ability to answer a question "
            f"based on {NUM_FRAMES} frames in a video.",
            "Model's answer:\n{{output}}",
            "Expected answer:\n{{expected.answer}}",
            "Is the model's answer correct? (Y/N)? Only Y or N.",
        ]
    ),
)


await braintrust.EvalAsync(
    "mmvu_eval_32images",
    data=load_data_subset,
    task=video_qa,
    scores=[judge_scorer],
    metadata={"model": "gpt-4o"},
    experiment_name="mmvu_eval_32images",
)

  from .autonotebook import tqdm as notebook_tqdm
Experiment mmvu_eval_32images is running at https://www.braintrust.dev/app/braintrustdata.com/p/mmvu_eval_32images/experiments/mmvu_eval_32images
mmvu_eval_32images [experiment_name=mmvu_eval_32images] (data): 20it [00:00, 71943.46it/s]
mmvu_eval_32images [experiment_name=mmvu_eval_32images] (tasks): 100%|██████████| 20/20 [00:23<00:00,  1.19s/it]



mmvu_eval_32images compared to mmvu_eval_32images-fb15e25a:
35.00% 'LLMJudge' score

1739843539.26s start
1739843556.42s end
14.50s duration
7.71s llm_duration
2472.25tok prompt_tokens
26.55tok completion_tokens
2498.80tok total_tokens
0.01$ estimated_cost

See results for mmvu_eval_32images at https://www.braintrust.dev/app/braintrustdata.com/p/mmvu_eval_32images/experiments/mmvu_eval_32images


EvalResultWithSummary(summary="...", results=[...])