A notebook to run test of time with synthetic data with `before/after` relations.

In [1]:
import warnings
warnings.filterwarnings("ignore")

import argparse
import os
import random

import numpy as np
import torch
import torch.backends.cudnn as cudnn
import gradio as gr

from video_llama.common.config import Config
from video_llama.common.dist_utils import get_rank
from video_llama.common.registry import registry
from video_llama.conversation.conversation_video import (
    Chat, Conversation, default_conversation,SeparatorStyle,conv_llava_llama_2
)
import decord
decord.bridge.set_bridge('torch')

from video_llama.datasets.builders import *
from video_llama.models import *
from video_llama.processors import *
from video_llama.runners import *
from video_llama.tasks import *

In [2]:
def setup_seeds(seed):
    seed = seed + get_rank()

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    cudnn.benchmark = False
    cudnn.deterministic = True


class AttrDict(dict):
    def __init__(self, *args, **kwargs):
        super(AttrDict, self).__init__(*args, **kwargs)
        self.__dict__ = self

In [3]:
setup_seeds(0)

args = dict(
    cfg_path="./eval_configs/video_llama_eval_only_vl_edited.yaml",
    model_type="llama_v2",
    gpu_id=0,
    options=[],
)
args = AttrDict(args)
args

{'cfg_path': './eval_configs/video_llama_eval_only_vl_edited.yaml',
 'model_type': 'llama_v2',
 'gpu_id': 0,
 'options': []}

In [6]:
cfg = Config(args)
model_config = cfg.model_cfg
model_config.device_8bit = args.gpu_id
model_cls = registry.get_model_class(model_config.arch)

In [8]:
model_config.equip_audio_branch = False

In [9]:
device_str = 'cuda:{}'.format(args.gpu_id)
model = model_cls.from_config(model_config).to(device_str)
model = model.eval()

vis_processor_cfg = cfg.datasets_cfg.webvid.vis_processor.train
vis_processor = registry.get_processor_class(
    vis_processor_cfg.name
).from_config(vis_processor_cfg)

Loading VIT
Loading VIT Done
Loading Q-Former


Using pad_token, but it is not set yet.
Loading checkpoint shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [02:32<00:00, 76.24s/it]


Load first Checkpoint: /work/piyush/pretrained_checkpoints/LargeModels/VideoLLAMA/Video-LLaMA-2-7B-Pretrained/VL_LLaMA_2_7B_Pretrained.pth


In [10]:
n_params = np.sum([p.numel() for p in model.parameters()])
n_params = n_params / 1e9
print(f"Number of parameters: {np.round(n_params, 2)}B")

Number of parameters: 7.85B


In [11]:
# Initialise chat
chat = Chat(model, vis_processor, device='cuda:{}'.format(args.gpu_id))

In [12]:
def ask_about_video(chat, video_path, question, num_beams=1, temperature=1.0):
    """
    A wrapper function to ask anything about video at given path.
    """
    chat_state = conv_llava_llama_2.copy()
    chat_state.system =  "You are able to understand the visual content that the user provides."\
        "Follow the instructions carefully and explain your answers in detail."
    img_list = []
    llm_message = chat.upload_video_without_audio(
        video_path, chat_state, img_list, video_loader="load_video",
    )

    chat.ask(question, chat_state)
    llm_message = chat.answer(
        conv=chat_state,
        img_list=img_list,
        num_beams=num_beams,
        temperature=temperature,
        max_new_tokens=300,
        max_length=2000,
    )[0]

    return llm_message

**Load data**

In [15]:
import pandas as pd

In [23]:
data_dir = "/scratch/shared/nfs2/piyush/datasets/ToT-syn-v2.0"

video_dir = os.path.join(data_dir, "videos")
metad_dir = os.path.join(data_dir, "metadata")

filenames = sorted([x.split(".mp4")[0] for x in os.listdir(video_dir)])

video_files = [os.path.join(video_dir, x + ".mp4") for x in filenames]
metad_files = [os.path.join(metad_dir, x + ".pt") for x in filenames]

df = pd.DataFrame(
    {
        "name": filenames,
        "video_path": video_files,
        "metad_path": metad_files,
    }
)
df = df[df.video_path.apply(os.path.exists)]
df = df[df.metad_path.apply(os.path.exists)]

df.shape

(180, 3)

In [27]:
# df["metad_path"].apply(lambda x: torch.load(x)["caption"])
captions = []
distractors = []
for i in range(len(df)):
    path = df.iloc[i].to_dict()["metad_path"]
    meta = torch.load(path)
    captions.append(meta["caption"])
    distractors.append(meta["distractor"])
df["caption"] = captions
df["distractor"] = distractors
df.shape

(180, 5)

In [34]:
# Create question

def create_question_answer(caption, distractor):
    coin_toss = np.random.uniform(0, 1)

    if coin_toss:
        options = [caption, distractor]
        correct_answer_string = f"(a)"
    else:
        options = [distractor, caption]
        correct_answer_string = f"(b)"

    question = f"Which of the following accurately described the video? "\
        f"You are given two options. (a) {options[0]} and (b) {options[1]}. "\
        f"You only need to output either (a) or (b)."
    correct_answer_verbose = f"{correct_answer_string} {caption}"
    return question, correct_answer_string, correct_answer_verbose
    

# df["question"] = df[["caption", "distractor"]].apply(lambda x: create_question(*x), axis=1)
for i in range(len(df)):
    row = df.iloc[i].to_dict()
    caption = row["caption"]
    distractor = row["distractor"]
    q, a_short, a_long = create_question_answer(caption, distractor)
    df.at[i, "question"] = q
    df.at[i, "ans_short"] = a_short
    df.at[i, "ans_long"] = a_long
df.shape

(180, 8)

**Debug**

In [35]:
i = 0
row = df.iloc[i].to_dict()

question = row["question"]
video_path = row["video_path"]
video_path, question

('/scratch/shared/nfs2/piyush/datasets/ToT-syn-v2.0/videos/text_pair_after_1.mp4',
 'Which of the following accurately described the video? You are given two options. (a) A yellow circle appears gradually after a red circle and (b) A red circle appears gradually after a yellow circle. You only need to output either (a) or (b).')

In [38]:
llm_output = ask_about_video(chat, video_path, question)
llm_output

"Based on the provided video frames, the accurate description of the video is (a) A yellow circle appears gradually after a red circle.\nHere's a detailed explanation of each frame:\n\n0.0 seconds: The video starts with a red circle in the center of the screen.\n\n0.1 seconds: The red circle starts to fade away, and a small yellow circle appears in the center of the screen, gradually increasing in size.\n\n0.2 seconds: The yellow circle is now larger than the red circle, and it continues to grow in size.\n\n0.4 seconds: The yellow circle has filled the entire screen, and the red circle has completely faded away.\n\n0.5 seconds: The yellow circle remains in the center of the screen, and it starts to shrink in size.\n\n0.6 seconds: The yellow circle has shrunk to about half its original size.\n\n0.7 seconds: The yellow circle has shrunk even further, and it is now just a small circle in the top-left corner of the screen.\n\n0.9 seconds: The video ends with the small yellow circle in the 

In [37]:
def check_answer(llm_output, correct_answer):
    return int(correct_answer.lower() in llm_output.lower())

In [39]:
check_answer(llm_output, row["ans_long"])

1

**Run on entire dataset**

In [40]:
from tqdm import tqdm

In [41]:
iterator = tqdm(range(len(df)), desc="Running on the whole set:")
flags = []
for i in iterator:
    row = df.iloc[i].to_dict()
    question = row["question"]
    video_path = row["video_path"]
    correct_answer = row["ans_long"]

    # Predict
    llm_output = ask_about_video(chat, video_path, question)
    flag = check_answer(llm_output, correct_answer)
    flags.append(flag)

Running on the whole set:: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 180/180 [36:05<00:00, 12.03s/it]


In [42]:
print("Accuracy on test of time: ", np.mean(flags))

Accuracy on test of time:  0.8833333333333333
