In [1]:
import os
import json
from glob import glob
import random

import torch
import einops
import pandas as pd
import numpy as np
from natsort import natsorted
import matplotlib.pyplot as plt
plt.rcParams["font.family"] = "serif"

import shared.utils as su
from notebooks.eval_care_retrieval import load_model, load_data

## Visualize results

## Compute and store results

In [2]:
# Checkpoints to process. Results for each one are cached at
# `<model_id>/metadata_results/<dataset>_examples.pt`.
models = [
    '/work/piyush/pretrained_checkpoints/CaRe-7B/',
    # '/work/piyush/experiments/CaRe/Tarsier-7b/final-10112025/nli_9000+ego_1000+subj_replaced-seed_42/merged_checkpoint',
]
# Dataset names, passed as-is to `load_data`.
datasets = [
    # 'ssv2',
    # 'epic',
    'charades',
]

for model_id in models:
    # Loaded lazily: a checkpoint is only loaded if some dataset still needs
    # computing, and then once per checkpoint instead of once per dataset.
    model_parts = None

    for dataset in datasets:
        print("Model ID: ", model_id)
        print("Dataset: ", dataset)

        save_dir = f"{model_id}/metadata_results"
        os.makedirs(save_dir, exist_ok=True)
        save_path = f"{save_dir}/{dataset}_examples.pt"
        if os.path.exists(save_path):
            # Cached results are never recomputed; delete the file to force a rerun.
            print(f"Results already exist for {model_id}/{dataset}. Skipping.")
            print('-' * 100)
            continue

        # Load model. Used below as: vp(path) -> video tensor, vfc(video tensor)
        # -> video feature, tfc(text) -> text feature.
        if model_parts is None:
            model_parts = load_model(_id=model_id, device_map='auto', n_frames=12)
        vfc, tfc, vp = model_parts

        # Load data
        df = load_data(dataset)
        df = df.drop_duplicates(subset=['id', 'text_id']).reset_index(drop=True)

        # For each text query, keep at most `n` videos (for faster inference)
        np.random.seed(42)
        random.seed(42)
        df_subset = []
        n = 2
        for text_id in df.text_id.unique():
            subdf = df[df.text_id == text_id]
            subdf = subdf.sample(n=min(len(subdf), n), random_state=42)
            df_subset.append(subdf)
        df = pd.concat(df_subset)

        # Compute text features
        text_ids = df['text_id'].unique()
        texts_feat = {}
        for text_id in su.log.tqdm_iterator(text_ids, desc='Computing text features'):
            # The first template listed for this text_id is used as the query text.
            text = df[df.text_id == text_id].template.unique()[0]
            with torch.no_grad():
                zt = tfc(text)
                zt = torch.nn.functional.normalize(zt, dim=-1)
            texts_feat[text_id] = zt.cpu().float()

        # Compute video features
        # Take (id, video_path) pairs from the same rows so that every feature
        # is stored under the id that actually belongs to its video file.
        # Pairing two separate `.unique()` arrays by position silently
        # mislabels features whenever ids and paths are not one-to-one.
        video_pairs = df[['id', 'video_path']].drop_duplicates()
        video_ids = video_pairs['id'].to_numpy()
        video_paths = video_pairs['video_path'].to_numpy()
        video_feat = {}
        is_qwen25vl = False
        for video_id, video_path in su.log.tqdm_iterator(
            list(zip(video_ids, video_paths)), desc='Computing video features',
        ):
            # Inference only: keep both encoder branches out of autograd.
            with torch.no_grad():
                if not is_qwen25vl:
                    video_tensor = vp(video_path)
                    zv = vfc(video_tensor)
                else:
                    zv = vfc.encoder.encode_vision([video_path])[0]
                zv = torch.nn.functional.normalize(zv, dim=-1)
            video_feat[video_id] = zv.cpu().float()

        # Bundle the embeddings with the exact rows they were computed from.
        data = {
            'video_embeddings': video_feat,
            'text_embeddings': texts_feat,
            'dataframe': df.copy(),
        }
        torch.save(data, save_path)
        print(f"Saved to {save_path}.")
        print('-' * 100)

Model ID:  /work/piyush/pretrained_checkpoints/CaRe-7B/
Dataset:  charades
[33mLoading CaRe model (/work/piyush/pretrained_checkpoints/CaRe-7B/)...............  [0m


`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46


Loading EncoderForCaRe from /work/piyush/pretrained_checkpoints/CaRe-7B/


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

::: Number of total parameters in Qwen2VLForConditionalGeneration: 8291.376M
Number of rows:  5498
Sample row: 
{
    "id_base": "YSKX3",
    "subject": "CP6Y",
    "scene": "Bedroom",
    "quality": 5.0,
    "relevance": 6.0,
    "verified": "Yes",
    "length": 16.62,
    "cls_id": "c077",
    "template": "putting a pillow somewhere",
    "start_time": 12.1,
    "end_time": 16.62,
    "object_id": "o027",
    "noun": "na",
    "verb_id": "v016",
    "verb": "put",
    "label": "putting a pillow somewhere",
    "id": "YSKX3_12.1_16.6",
    "chiral_label": 0.0,
    "chiral_triplet_id": "a9be73ec",
    "text_id": "a9be73ec_0.0",
    "video_path": "/scratch/shared/beegfs/piyush/datasets/Charades/Charades_v1_480_cut_clips/YSKX3_12.1_16.6.mp4"
}


Computing text features:   0%|          | 0/56 [00:00<?, ?it/s]

From v4.47 onwards, when a model cache is to be returned, `generate` will return a `Cache` instance instead by default (as opposed to the legacy tuple of tuples format). If you want to keep returning the legacy format, please set `return_legacy_cache=True`.


Computing video features:   0%|          | 0/112 [00:00<?, ?it/s]

Saved to /work/piyush/pretrained_checkpoints/CaRe-7B//metadata_results/charades_examples.pt.
----------------------------------------------------------------------------------------------------


## Dev

In [2]:
# Load model
# Checkpoint used for the dev run below; this is the path reported by the
# recorded outputs of this cell and of the save cell.
model_id = '/work/piyush/experiments/CaRe/Tarsier-7b/final-10112025/nli_9000+ego_1000+subj_replaced-seed_42/merged_checkpoint'
# Used below as: vp(path) -> video tensor, vfc(video tensor) -> video feature,
# tfc(text) -> text feature.
vfc, tfc, vp = load_model(_id=model_id, device_map='auto')

[33mLoading CaRe model (/work/piyush/experiments/CaRe/Tarsier-7b/final-10112025/nli_9000+ego_1000+subj_replaced-seed_42/merged_checkpoint).  [0m


The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


Loading EncoderForTarsier from /work/piyush/experiments/CaRe/Tarsier-7b/final-10112025/nli_9000+ego_1000+subj_replaced-seed_42/merged_checkpoint
### do_image_padding is set as False, images will be resized directly!


The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
TarsierForConditionalGeneration has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From ðŸ‘‰v4.50ðŸ‘ˆ onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
You are attempting to use Flash Attention 2.0 without specifying a torch dtype. This might lead to unexpected behaviour
The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

::: Number of total parameters in TarsierForConditionalGeneration: 7063.427M


In [None]:
# Dataset name used by the dev cells below (passed to `load_data` and used in the output file name).
dataset = "ssv2"

In [3]:
# Load the dataset and keep a single row per (video id, text id) pair,
# renumbering the index afterwards.
df = (
    load_data(dataset)
    .drop_duplicates(subset=['id', 'text_id'])
    .reset_index(drop=True)
)
df.shape

Number of rows:  1430
Sample row: 
{
    "id": 69703,
    "label": "moving pen up",
    "template": "Moving [something] up",
    "placeholders": "['pen']",
    "target": 114,
    "chiral_label": 0.0,
    "chiral_triplet_id": "3f20f09b",
    "noun": "['something']",
    "text_id": "3f20f09b_0.0",
    "video_path": "/scratch/shared/beegfs/piyush/datasets/SSv2/20bn-something-something-v2/69703.webm"
}


(1430, 10)

In [14]:
# Cap the number of videos per text query at `n` to speed up inference.
# The global RNGs are seeded as well, although the sampling below is already
# pinned through `random_state`.
np.random.seed(42)
random.seed(42)

n = 5
df_subset = []
for text_id in df.text_id.unique():
    is_current_query = df.text_id == text_id
    subdf = df[is_current_query]
    # Queries with fewer than `n` videos keep all of them.
    n_keep = min(len(subdf), n)
    subdf = subdf.sample(n=n_keep, random_state=42)
    df_subset.append(subdf)
df = pd.concat(df_subset)
df.shape

(159, 10)

In [17]:
# Compute text features
text_ids = df['text_id'].unique()
texts_feat = {}
for text_id in su.log.tqdm_iterator(text_ids, desc='Computing text features'):
    # The first template listed for this text_id is used as the query text.
    text = df[df.text_id == text_id].template.unique()[0]
    # Inference only: run the encoder outside autograd so the stored
    # embeddings do not carry gradient state (same as the batch cell above).
    with torch.no_grad():
        zt = tfc(text)
        zt = torch.nn.functional.normalize(zt, dim=-1)
    texts_feat[text_id] = zt.cpu().float()

Computing text features:   0%|          | 0/32 [00:00<?, ?it/s]

In [19]:
# Compute video features
# Take (id, video_path) pairs from the same rows so that every feature is
# stored under the id that actually belongs to its video file. Pairing two
# separate `.unique()` arrays by position silently mislabels features whenever
# ids and paths are not one-to-one.
video_pairs = df[['id', 'video_path']].drop_duplicates()
video_ids = video_pairs['id'].to_numpy()
video_paths = video_pairs['video_path'].to_numpy()
video_feat = {}
is_qwen25vl = False
for video_id, video_path in su.log.tqdm_iterator(
    list(zip(video_ids, video_paths)), desc='Computing video features',
):
    # Inference only: keep both encoder branches out of autograd.
    with torch.no_grad():
        if not is_qwen25vl:
            video_tensor = vp(video_path)
            zv = vfc(video_tensor)
        else:
            zv = vfc.encoder.encode_vision([video_path])[0]
        zv = torch.nn.functional.normalize(zv, dim=-1)
    video_feat[video_id] = zv.cpu().float()
len(video_feat)

Computing video features:   0%|          | 0/159 [00:00<?, ?it/s]

Expanding inputs for image tokens in LLaVa should be done in processing. Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. Using processors without these attributes in the config is deprecated and will throw an error in v4.47.


159

In [20]:
# Persist the embeddings together with the rows they were computed from, at
# `<model_id>/metadata_results/<dataset>_examples.pt`.
save_dir = f"{model_id}/metadata_results"
save_path = f"{save_dir}/{dataset}_examples.pt"
os.makedirs(save_dir, exist_ok=True)

data = dict(
    video_embeddings=video_feat,
    text_embeddings=texts_feat,
    dataframe=df.copy(),
)
torch.save(data, save_path)
print(f"Saved to {save_path}.")
print('-' * 100)

Saved to /work/piyush/experiments/CaRe/Tarsier-7b/final-10112025/nli_9000+ego_1000+subj_replaced-seed_42/merged_checkpoint/metadata_results/ssv2_examples.pt.
----------------------------------------------------------------------------------------------------
512	/work/piyush/experiments/CaRe/Tarsier-7b/final-10112025/nli_9000+ego_1000+subj_replaced-seed_42/merged_checkpoint/metadata_results/ssv2_examples.pt


In [22]:
# Report the on-disk size of the results file written above.
!du -sh $save_path

2.6M	/work/piyush/experiments/CaRe/Tarsier-7b/final-10112025/nli_9000+ego_1000+subj_replaced-seed_42/merged_checkpoint/metadata_results/ssv2_examples.pt


In [24]:
# Sanity check: reload the file and confirm its keys and the number of video embeddings.
# The archive holds a pandas DataFrame next to the tensors, so it cannot be
# read by the restricted weights-only unpickler. Full unpickling is requested
# explicitly; this is acceptable only because the file was written by this
# notebook. Do not use it on files from untrusted sources.
data = torch.load(save_path, weights_only=False)
data.keys(), len(data['video_embeddings'])

(dict_keys(['video_embeddings', 'text_embeddings', 'dataframe']), 159)