A notebook to evaluate on the state-change subset of [`ViLMA`](https://arxiv.org/pdf/2311.07022)

In [1]:
import sys
sys.path.append("../")

In [2]:
import warnings
warnings.filterwarnings("ignore")

import os
import sys
from glob import glob
from tqdm import tqdm
import argparse
import os
import random
from termcolor import colored

import numpy as np
import torch
import torch.backends.cudnn as cudnn
import pandas as pd

In [3]:
from model_api import (
    load_config, load_model, setup_seeds, ask_about_video,
)

**Load model**

In [4]:
# Get config
args, cfg = load_config()

# Load model
chat, model, vis_processor = load_model(args, cfg, low_resource=False)

[32m[:::] Loading model.[0m
Loading VIT
Loading VIT Done
Loading Q-Former


Using pad_token, but it is not set yet.


load checkpoint from https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xxl.pth
Loading Q-Former Done
Loading LLAMA Tokenizer
Loading LLAMA Model


Loading checkpoint shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:19<00:00,  9.65s/it]


Loading LLAMA Done
Loading LLAMA proj
Loading llama_proj Done
Load first Checkpoint: /work/piyush/pretrained_checkpoints/LargeModels/VideoLLAMA/Video-LLaMA-2-7B-Pretrained/VL_LLaMA_2_7B_Pretrained.pth
[:::] Model has 7.852B parameters.
[32m[:::] Model loaded.[0m


**Load data**

In [5]:
# Directory containing all videos
video_dir = "/scratch/shared/nfs2/piyush/datasets/ViLMA/videos"

# Directory containing metadata
metad_dir = "/users/piyush/projects/ViLMA/data"

In [6]:
def load_json(path: str) -> dict:
    """Helper to load json file"""
    import json
    with open(path, 'rb') as f:
        data = json.load(f)
    return data

In [28]:
main_task = "change-state"
sub_task = "prestate"
task_file = os.path.join(
    metad_dir, f"{main_task}-{sub_task}.json"
)
assert os.path.join(task_file)
task_data = load_json(task_file)
len(task_data)

624

In [29]:
!ls $metad_dir

change-state-action.json	plot.md
change-state-inverse.json	plotter_helper.py
change-state-poststate.json	plotter.py
change-state-prestate.json	quva-proficiency-foils.json
counting-easy-digits.json	quva-templates.json
counting-easy-digits-sec.json	quva-templates-processed.json
counting-easy-spelled-pts.json	rare-actions-noun-foils.json
counting-easy-spelled-sec.json	rare-actions-verb-foils.json
counting-hard-digits.json	relations.json
counting-hard-digits-sec.json	Semantic_Role_Labelling_Data_annotated.json
counting-hard-spelled-pts.json	Semantic_Role_Labelling_Data.json
counting-hard-spelled-sec.json	SRL_Action_Replacement_Top_1000.json
dummy.json			SRL_Actor_Swapping.json


In [30]:
df = pd.DataFrame(task_data).T
df.shape

(624, 18)

In [31]:
df["change_of_state"].apply(lambda x: x["verb"]).value_counts()

change_of_state
uncover     20
unfold      20
reveal      20
unroll      19
fold        19
            ..
clean        1
filtrate     1
cut away     1
leave        1
use          1
Name: count, Length: 93, dtype: int64

In [32]:
df["caption"]

change-state-prestate-0001    Initially, the athlete is in a lower position.
change-state-prestate-0002    Initially, the athlete is in a lower position.
change-state-prestate-0003    Initially, the athlete is in a lower position.
change-state-prestate-0004    Initially, the athlete is in a lower position.
change-state-prestate-0007    Initially, the athlete is in a lower position.
                                                   ...                      
change-state-prestate-0815           Initially, the pipe band are unwrapped.
change-state-prestate-0817                  Initially, the box is unwrapped.
change-state-prestate-0818          Initially, the spring roll is unwrapped.
change-state-prestate-0820           Initially, the bamboo mat is unwrapped.
change-state-prestate-0821                Initially, the dough is unwrapped.
Name: caption, Length: 624, dtype: object

In [33]:
df["foils"].apply(lambda x: len(x)).value_counts()

foils
1    624
Name: count, dtype: int64

In [34]:
df["foils"]

change-state-prestate-0001    [Initially, the athlete is in a higher position.]
change-state-prestate-0002    [Initially, the athlete is in a higher position.]
change-state-prestate-0003    [Initially, the athlete is in a higher position.]
change-state-prestate-0004    [Initially, the athlete is in a higher position.]
change-state-prestate-0007    [Initially, the athlete is in a higher position.]
                                                    ...                        
change-state-prestate-0815              [Initially, the pipe band are wrapped.]
change-state-prestate-0817                     [Initially, the box is wrapped.]
change-state-prestate-0818             [Initially, the spring roll is wrapped.]
change-state-prestate-0820              [Initially, the bamboo mat is wrapped.]
change-state-prestate-0821                   [Initially, the dough is wrapped.]
Name: foils, Length: 624, dtype: object

In [35]:
df.dataset.value_counts()

dataset
coin                      235
something-something-v2    195
youcook2                  147
star                       33
RareAct                    14
Name: count, dtype: int64

In [36]:
df.dataset_idx.isnull().sum()

0

In [37]:
# Add video ID to each row

def get_video_id(item):
    from_yt = ['RareAct', 'VidSitu', 'youcook2', 'coin']
    # find the full path
    dataset = item['dataset']
    video_file = item['video_file']
    # video_path = None
    if dataset == 'QUVA':
        normalized = item.get('normalized')
        assert normalized
        # video_dir = osp.join(self.quva_dir, 'normalized_videos')
        # video_path = osp.join(video_dir, video_file)
        video_id = video_file
    elif dataset == 'something-something-v2':
        # video_dir = self.something_something_dir
        # video_path = osp.join(video_dir, f'{item["dataset_idx"]}.webm')
        video_id = item["dataset_idx"]
    elif dataset == 'star':
        # video_dir = self.star_dir
        # video_path = osp.join(video_dir, f"{video_file}.mp4")
        video_id = video_file
    elif dataset in from_yt:
        # video_dir = self.youtube_dir
        # video_path = osp.join(video_dir, f'{item["youtube_id"]}.mp4')
        video_id = item["youtube_id"]
    else:
        raise NotImplementedError('Not implemented yet.')
    return video_id

video_ids = []
for i in range(len(df)):
    row = df.iloc[i].to_dict()
    video_id = get_video_id(row)
    video_ids.append(video_id)
df["video_id"] = video_ids

df.shape, df.video_id.isnull().sum()

((624, 19), 0)

In [38]:
def search_video_path(video_dir, video_id):
    paths = glob(os.path.join(video_dir, f"{video_id}.*"))
    assert len(paths) in [0, 1]
    if len(paths) == 0:
        return None
    else:
        path = paths[0]
        return path


df["video_path"] = df["video_id"].apply(
    lambda x: search_video_path(video_dir, x)
)
df.shape, df.video_path.isnull().sum()

((624, 20), 6)

In [39]:
subdf = df[df.video_path.apply(lambda x: os.path.exists(x) if x is not None else False)].copy()
subdf.shape

(618, 20)

In [40]:
# Test on a sample row
i = 0
row = subdf.iloc[i].to_dict()
video_path = row["video_path"]

caption = row["caption"].lower()
foil = row["foils"][0].lower()

randomise_options = True
enum_options = ["(a)", "(b)"]
if randomise_options:
    if np.random.uniform() < 0.5:
        text_options = [caption, foil]
        correct_answer = f"{enum_options[0]} {caption}"
    else:
        text_options = [foil, caption]
        correct_answer = f"{enum_options[1]} {caption}"
user_message = "Given this video, you have to select which is the option "\
    "that correctly describes the video: "\
    f"{enum_options[0]} {text_options[0]} "\
    f"{enum_options[1]} {text_options[1]} "\
    f"You have to only answer {enum_options[0]} or {enum_options[0]}."

model_answer = ask_about_video(chat, video_path, user_message)
print(model_answer)

Based on the frames provided in the video, it appears that the athlete starts at a higher position and then performs a jump, landing on the ground. Therefore, the correct answer is (a) initially, the athlete is in a higher position.


In [41]:
correct_answer in model_answer

False

**Evaluate on entire dataset**

In [42]:
def check_row(row, verbose=False, randomise_options=True, enum_options=["(a)", "(b)"]):
    """Checks a single row."""

    video_path = row["video_path"]    
    caption = row["caption"].lower()
    foil = row["foils"][0].lower()
    
    if randomise_options:
        if np.random.uniform() < 0.5:
            text_options = [caption, foil]
            correct_answer = f"{enum_options[0]} {caption}"
        else:
            text_options = [foil, caption]
            correct_answer = f"{enum_options[1]} {caption}"
    else:
        text_options = [caption, foil]
        correct_answer = f"{enum_options[0]} {caption}"

    user_message = "Given this video, you have to select which is the option "\
        "that correctly describes the video: "\
        f"{enum_options[0]} {text_options[0]} "\
        f"{enum_options[1]} {text_options[1]} "\
        f"You have to only answer {enum_options[0]} or {enum_options[0]}."
    
    model_answer = ask_about_video(chat, video_path, user_message)
    flag = correct_answer in model_answer

    if verbose:
        print("QUESTION: ", user_message)
        print("VIDEO: ", video_path)
        print("MODEL ANSWER: ", model_answer)
        print("IDEAL ANSWER: ", correct_answer)

    return flag

In [43]:
from tqdm import tqdm

def tqdm_iterator(items, desc=None, bar_format=None, **kwargs):
    tqdm._instances.clear()
    iterator = tqdm(
        items,
        desc=desc,
        bar_format='{l_bar}{bar:10}{r_bar}{bar:-10b}',
        **kwargs,
    )
    tqdm._instances.clear()
    return iterator

In [None]:

iterator = tqdm_iterator(range(len(subdf)), desc="Evaluating on entire dataset")
flags = []
failed = []
for i in iterator:
    row = subdf.iloc[i].to_dict()
    try:
        flag = check_row(row, verbose=False)
    except:
        # Failed on this video
        failed.append(i)
    flags.append(flag)
flags = np.array(flags).astype(int)
print("Accuracy: ", np.mean(flags))

Evaluating on entire dataset:   0%|          | 0/618 [00:00<?, ?it/s]                                                                                                             

In [None]:
np.mean(flags)

In [None]:
np.mean(flags)

In [46]:
np.mean(flags)

0.47572815533980584

**Run without shuffling**

In [None]:
iterator = tqdm_iterator(range(len(subdf)), desc="Evaluating on entire dataset")
flags = []
failed = []
for i in iterator:
    row = subdf.iloc[i].to_dict()
    try:
        flag = check_row(row, verbose=False, randomise_options=False)
    except:
        # Failed on this video
        failed.append(i)
    flags.append(flag)
flags = np.array(flags).astype(int)
print("Accuracy: ", np.mean(flags))

Evaluating on entire dataset:   3%|▎         | 18/618 [03:16<1:37:58,  9.80s/it]                                                                                                  [h264 @ 0x94390500] mmco: unref short failure
Evaluating on entire dataset:  23%|██▎       | 144/618 [24:29<1:16:11,  9.64s/it]                                                                                                 [h264 @ 0x943663c0] mmco: unref short failure
Evaluating on entire dataset:  36%|███▌      | 222/618 [38:22<1:06:25, 10.07s/it]                                                                                                 [h264 @ 0x94330c40] mmco: unref short failure
Evaluating on entire dataset:  46%|████▌     | 284/618 [48:37<56:58, 10.23s/it]                                                                                                   [mov,mp4,m4a,3gp,3g2,mj2 @ 0x94369d00] moov atom not found
[mov,mp4,m4a,3gp,3g2,mj2 @ 0x94369d00] moov atom not found
Evaluating on entire dataset:  4

In [25]:
flags

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,

In [26]:
print("Accuracy: ", np.mean(flags))

Accuracy:  0.9352750809061489
