In [1]:
import os
import json
from glob import glob

import torch
import pandas as pd
import numpy as np
from natsort import natsorted
import matplotlib.pyplot as plt
plt.rcParams["font.family"] = "serif"

import shared.utils as su

In [4]:
# Root of the ReversedInTime dataset; the metadata CSV lists every clip together
# with the split it belongs to.
data_dir = "/scratch/shared/beegfs/piyush/datasets/ReversedInTime"
csv_path = f"{data_dir}/splits/all_meta.csv"

# Read the full metadata table and keep only the rows of the test split.
df = pd.read_csv(csv_path).loc[lambda frame: frame["split"] == "test"]
df.shape

(1000, 5)

In [10]:
# Load the test-split annotations. Later cells index `data` by the video id as a
# string and read the 'forward_captions' / 'reverse_captions' lists of each entry.
data = su.io.load_json(f"{data_dir}/splits/test.json")
# Number of annotated test items.
len(data)

1000

In [6]:
# Distribution of the `temporal` flag among the test rows.
df.temporal.value_counts()

temporal
True    1000
Name: count, dtype: int64

In [12]:
# Peek at the first test row to see which metadata columns are available.
df.iloc[0]

video_id                                             33176965
url         https://www.shutterstock.com//zh/video/clip-33...
reverse                                                  True
temporal                                                 True
split                                                    test
Name: 12271, dtype: object

**Load model**

In [14]:
from notebooks.eval_care_retrieval import load_model

# Path of the merged CaRe checkpoint to evaluate.
model_path = "/work/piyush/experiments/CaRe/Tarsier-7b/nli-9k+ego4d-1k/merged_checkpoint"
# `load_model` returns three callables, used below as follows:
#   vfc: video tensor  -> video embedding
#   tfc: caption string -> text embedding
#   vp : video file path -> video tensor fed to `vfc`
vfc, tfc, vp  = load_model(_id=model_path)

[33mLoading CaRe model (/work/piyush/experiments/CaRe/Tarsier-7b/nli-9k+ego4d-1k/merged_checkpoint).  [0m


The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


Loading EncoderForTarsier from /work/piyush/experiments/CaRe/Tarsier-7b/nli-9k+ego4d-1k/merged_checkpoint
### do_image_padding is set as False, images will be resized directly!


The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
TarsierForConditionalGeneration has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
You are attempting to use Flash Attention 2.0 without specifying a torch dtype. This might lead to unexpected behaviour
The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

::: Number of total parameters in TarsierForConditionalGeneration: 7063.427M


**Compute video embeddings**

In [22]:
# Embed every test clip twice: once as stored on disk and once flipped along
# dim 0 of the tensor returned by `vp` (assumed to be the frame/time axis --
# TODO confirm). Each embedding is L2-normalised, moved to the CPU as float32
# and stored under the video id as a *string* key.
vid_fwd_emb, vid_rev_emb = {}, {}
for i in su.log.tqdm_iterator(range(len(df))):
    row = df.iloc[i].to_dict()
    video_id = str(row['video_id'])
    vid_fwd = f"{data_dir}/videos/{row['video_id']}.mp4"

    # The reversed clip is produced in memory rather than read from a
    # separate "-reverse.mp4" file.
    vid_fwd_tensor = vp(vid_fwd)
    vid_rev_tensor = torch.flip(vid_fwd_tensor, dims=(0,))

    # Forward clip first, reversed clip second.
    for clip, bank in ((vid_fwd_tensor, vid_fwd_emb), (vid_rev_tensor, vid_rev_emb)):
        zv = torch.nn.functional.normalize(vfc(clip), dim=-1).cpu().float()
        bank[video_id] = zv


len(vid_fwd_emb), len(vid_rev_emb)

  0%|          | 0/1000 [00:00<?, ?it/s]

Expanding inputs for image tokens in LLaVa should be done in processing. Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. Using processors without these attributes in the config is deprecated and will throw an error in v4.47.
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


KeyboardInterrupt: 

In [25]:
# Embed the first forward caption and the first reverse caption of every test
# video. `data` is indexed by the video id as a string, so the id from the
# dataframe is converted with str() before the lookup. Each embedding is
# L2-normalised, moved to the CPU as float32 and stored under that string id.
cap_fwd_emb, cap_rev_emb = {}, {}
for i in su.log.tqdm_iterator(range(len(df)), desc='Computing text features'):
    row = df.iloc[i].to_dict()
    video_id = str(row['video_id'])
    x = data[video_id]

    # Forward caption first, reverse caption second.
    for caption_key, bank in (('forward_captions', cap_fwd_emb), ('reverse_captions', cap_rev_emb)):
        zt = torch.nn.functional.normalize(tfc(x[caption_key][0]), dim=-1).cpu().float()
        bank[video_id] = zt

len(cap_fwd_emb), len(cap_rev_emb)

Computing text features:   0%|          | 0/1000 [00:00<?, ?it/s]

(1000, 1000)

In [30]:
# Shape of one cached video embedding. The embedding cell stores its results
# under str(video_id), so the lookup must use the string id; the integer key
# raises KeyError when the notebook is run top to bottom.
vid_fwd_emb["33176965"].shape

torch.Size([4096])

In [31]:
# Number of videos embedded so far; this is smaller than len(df) when the
# video embedding loop was interrupted before finishing.
len(vid_fwd_emb), len(vid_rev_emb)

(25, 25)

In [49]:
from utils.general_retrieval_metrics import itm_eval

# Forward-only retrieval: every embedded video against its forward caption.
# Rows of ZV and ZT are aligned because both are built by iterating over the
# keys of `vid_fwd_emb` (only videos that actually have an embedding are used).
ZV = torch.stack([vid_fwd_emb[k] for k in vid_fwd_emb])
ZT = torch.stack([cap_fwd_emb[str(k)] for k in vid_fwd_emb])

# Embeddings are L2-normalised, so the dot products are cosine similarities.
scores_i2t = (ZV @ ZT.T).numpy()
scores_t2i = (ZT @ ZV.T).numpy()
# Guard against embeddings that are not 1-D vectors (stacking those would give
# score arrays of the wrong rank).
assert scores_i2t.shape == (len(ZV), len(ZT))
assert scores_t2i.shape == (len(ZT), len(ZV))

# Ground truth is the identity: video i matches caption i and vice versa.
txt2img = {i: i for i in range(len(ZT))}
img2txt = {i: i for i in range(len(ZV))}

metrics = itm_eval(scores_i2t, scores_t2i, txt2img, img2txt)
# Convert the returned scalars to plain Python numbers for display.
metrics = {k: v.item() for k, v in metrics.items()}
metrics

{'txt_r1': 96.0,
 'txt_r5': 100.0,
 'txt_r10': 100.0,
 'txt_r_mean': 98.66666666666667,
 'img_r1': 96.0,
 'img_r5': 100.0,
 'img_r10': 100.0,
 'img_r_mean': 98.66666666666667,
 'r_mean': 98.66666666666667}

In [47]:
# Joint retrieval over forward AND reversed items: the gallery holds every
# forward video followed by every reversed video, and likewise for captions,
# so each forward item has its reversed counterpart as a distractor.
ZV_fwd, ZV_rev = (
    torch.stack([bank[k] for k in vid_fwd_emb]) for bank in (vid_fwd_emb, vid_rev_emb)
)
ZT_fwd, ZT_rev = (
    torch.stack([bank[str(k)] for k in vid_fwd_emb]) for bank in (cap_fwd_emb, cap_rev_emb)
)
ZV = torch.cat((ZV_fwd, ZV_rev), dim=0)
ZT = torch.cat((ZT_fwd, ZT_rev), dim=0)

# Similarity matrices in both directions.
scores_i2t = torch.matmul(ZV, ZT.T).numpy()
scores_t2i = torch.matmul(ZT, ZV.T).numpy()

# Identity ground truth: row i of ZV is paired with row i of ZT.
txt2img = {idx: idx for idx in range(ZT.shape[0])}
img2txt = {idx: idx for idx in range(ZV.shape[0])}

metrics = {
    name: value.item()
    for name, value in itm_eval(scores_i2t, scores_t2i, txt2img, img2txt).items()
}
metrics

{'txt_r1': 56.0,
 'txt_r5': 98.0,
 'txt_r10': 100.0,
 'txt_r_mean': 84.66666666666667,
 'img_r1': 60.0,
 'img_r5': 98.0,
 'img_r10': 100.0,
 'img_r_mean': 86.0,
 'r_mean': 85.33333333333334}

In [43]:
# Two-way forced choice per video, using only the forward item as the query:
#   t2v: given the forward caption, is the forward video scored above the reversed video?
#   v2t: given the forward video, is the forward caption scored above the reverse caption?
t2v_acc, v2t_acc = [], []
for k in vid_fwd_emb:
    videos = torch.stack([vid_fwd_emb[k], vid_rev_emb[k]])
    captions = torch.stack([cap_fwd_emb[str(k)], cap_rev_emb[str(k)]])
    # sim[r, c]: row r is the forward (0) / reversed (1) video,
    # column c is the forward (0) / reverse (1) caption.
    sim = videos @ captions.T
    t2v_acc.append(bool(sim[0, 0] > sim[1, 0]))
    v2t_acc.append(bool(sim[0, 0] > sim[0, 1]))

# Fraction of videos for which the forward item wins.
t2v_acc = np.mean(t2v_acc).item()
v2t_acc = np.mean(v2t_acc).item()
v2t_acc, t2v_acc

(0.52, 0.68)

In [40]:
# 2x2 similarity matrix left over from the last iteration of the accuracy loop
# (rows: forward / reversed video, columns: forward / reverse caption).
sim

tensor([[0.5337, 0.5471],
        [0.5658, 0.5690]])