In [36]:
import os
import json
from glob import glob
import random

import torch
import einops
import pandas as pd
import numpy as np
from natsort import natsorted
import matplotlib.pyplot as plt
plt.rcParams["font.family"] = "serif"
from IPython.display import display, Markdown, Latex


import shared.utils as su
from notebooks.eval_care_retrieval import load_data

In [2]:
# Load data
dataset = 'ssv2'
df = load_data(dataset=dataset)
df = df.drop_duplicates(subset=['id', 'text_id']).reset_index(drop=True)
df.shape

Number of rows:  1430
Sample row: 
{
    "id": 69703,
    "label": "moving pen up",
    "template": "Moving [something] up",
    "placeholders": "['pen']",
    "target": 114,
    "chiral_label": 0.0,
    "chiral_triplet_id": "3f20f09b",
    "noun": "['something']",
    "text_id": "3f20f09b_0.0",
    "video_path": "/scratch/shared/beegfs/piyush/datasets/SSv2/20bn-something-something-v2/69703.webm"
}


(1430, 10)

In [5]:
# Sample 20% of the data for closed-source eval
subdf = df.sample(frac=0.2, random_state=42)
subdf.shape

(286, 10)

In [11]:
text_id_to_template = dict(zip(df['text_id'], df['template']))
text_id_to_template

{'3f20f09b_0.0': 'Moving [something] up',
 '12055391_1.0': 'Folding [something]',
 '12055391_0.0': 'Unfolding [something]',
 '3f20f09b_1.0': 'Moving [something] down',
 'b58b2f93_0.0': 'Taking [one of many similar things on the table]',
 '743faf6b_0.0': 'Pushing [something] from right to left',
 '743faf6b_1.0': 'Pushing [something] from left to right',
 '1539a698_1.0': 'Closing [something]',
 'b58b2f93_1.0': 'Putting [something similar to other things that are already on the table]',
 'a1383a77_1.0': 'Moving [something] and [something] away from each other',
 'a1383a77_0.0': 'Moving [something] and [something] closer to each other',
 '81dd0413_0.0': 'Opening [something]',
 '106d2603_0.0': 'Pulling [something] from right to left',
 '584d53c4_0.0': 'Opening [something]',
 '545d0d67_0.0': 'Moving [something] towards the camera',
 'ccff0ef2_0.0': 'Turning the camera right while filming [something]',
 '677ffe97_1.0': 'Closing [something]',
 'c1c62263_0.0': 'Opening [something]',
 '677ffe97_

In [20]:
df

Unnamed: 0,id,label,template,placeholders,target,chiral_label,chiral_triplet_id,noun,text_id,video_path
0,69703,moving pen up,Moving [something] up,['pen'],114,0.0,3f20f09b,['something'],3f20f09b_0.0,/scratch/shared/beegfs/piyush/datasets/SSv2/20...
1,217571,folding cloth,Folding [something],['cloth'],46,1.0,12055391,['something'],12055391_1.0,/scratch/shared/beegfs/piyush/datasets/SSv2/20...
2,24837,folding mat,Folding [something],['mat'],46,1.0,12055391,['something'],12055391_1.0,/scratch/shared/beegfs/piyush/datasets/SSv2/20...
3,96312,unfolding paper,Unfolding [something],['paper'],82,0.0,12055391,['something'],12055391_0.0,/scratch/shared/beegfs/piyush/datasets/SSv2/20...
4,29910,moving wallet down,Moving [something] down,['wallet'],112,1.0,3f20f09b,['something'],3f20f09b_1.0,/scratch/shared/beegfs/piyush/datasets/SSv2/20...
...,...,...,...,...,...,...,...,...,...,...
1425,194086,opening a box,Opening [something],['a box'],115,0.0,677ffe97,['object'],677ffe97_0.0,/scratch/shared/beegfs/piyush/datasets/SSv2/20...
1426,112252,moving charger down,Moving [something] down,['charger'],112,1.0,3f20f09b,['something'],3f20f09b_1.0,/scratch/shared/beegfs/piyush/datasets/SSv2/20...
1427,86588,taking spoon,Taking [one of many similar things on the table],['spoon'],53,0.0,b58b2f93,['something'],b58b2f93_0.0,/scratch/shared/beegfs/piyush/datasets/SSv2/20...
1428,157360,unfolding a shirt,Unfolding [something],['a shirt'],82,0.0,12055391,['something'],12055391_0.0,/scratch/shared/beegfs/piyush/datasets/SSv2/20...


In [53]:
# Load Gemini wrapper

from utils.gemini_utils import GeminiWrapper

model_key = "gemini-2.5-flash"
vlm = GeminiWrapper(model_key=model_key, fps=1.)
vlm

[33mLoading gemini-2.5-flash with FPS=1.0...........................................  [0m


<utils.gemini_utils.GeminiWrapper at 0x7f3e3abccc10>

In [54]:
row

{'id': 42956,
 'label': 'approaching keys with your camera',
 'template': 'Approaching [something] with your camera',
 'placeholders': "['keys']",
 'target': 0,
 'chiral_label': 1.0,
 'chiral_triplet_id': '27f40159',
 'noun': "['something']",
 'text_id': '27f40159_1.0',
 'video_path': '/scratch/shared/beegfs/piyush/datasets/SSv2/20bn-something-something-v2/42956.webm'}

In [55]:
PROMPT_TEMPLATE = """
Watch the given video carefully. Which of the following actions best matches that shown in the video?

(1) %s

(2) %s

Just return the final answer (1) or (2).
"""

In [56]:
row

{'id': 42956,
 'label': 'approaching keys with your camera',
 'template': 'Approaching [something] with your camera',
 'placeholders': "['keys']",
 'target': 0,
 'chiral_label': 1.0,
 'chiral_triplet_id': '27f40159',
 'noun': "['something']",
 'text_id': '27f40159_1.0',
 'video_path': '/scratch/shared/beegfs/piyush/datasets/SSv2/20bn-something-something-v2/42956.webm'}

In [58]:
chiral_ids = subdf.chiral_triplet_id.unique()
verbose = False
answers = {}
chiral_index = 0
for chiral_id in chiral_ids:
    rows = subdf[subdf.chiral_triplet_id == chiral_id]
    chiral_index += 1

    for j in su.log.tqdm_iterator(range(len(rows)), desc=f'Processing videos for chiral ID: {chiral_id} ({chiral_index}/{len(chiral_ids)})'):
        row = rows.iloc[j].to_dict()

        options = [
            text_id_to_template[f'{chiral_id}_0.0'],
            text_id_to_template[f'{chiral_id}_1.0'],
        ]

        # Sort randomly
        np.random.shuffle(options)

        ground_truth = row['template']
        ground_truth_index = options.index(ground_truth)

        if 'opening' in ground_truth.lower() or 'closing' in ground_truth.lower():
            # Replace [something] with noun
            ground_truth = ground_truth.replace('[something]', row['noun'])
            options_ = [o.replace('[something]', row['noun']) for o in options]
        else:
            options_ = options

        prompt = PROMPT_TEMPLATE % (options_[0], options_[1])

        # Generate answer
        try:
            answer = vlm.forward_text_video(prompt, row['video_path'])
            answers[row['video_path']] = dict(
                prompt=prompt,
                options=options_,
                ground_truth_index=ground_truth_index,
                answer=answer,
            )
        except:
            print(f"Failed {j}.")
            continue

        if verbose:
            print(chiral_id, row['chiral_label'], ground_truth_index, options_)
            break

    if verbose:
        display(su.visualize.show_single_video(row['video_path']))
        display(Markdown('---'))
        display(Markdown(prompt))
        display(Markdown('---'))
        display(Markdown(answer))
        break
        print('-' * 120)

len(answers)

Processing videos for chiral ID: 677ffe97 (1/16):   0%|          | 0/16 [00:00<?, ?it/s]

Processing videos for chiral ID: b58b2f93 (2/16):   0%|          | 0/33 [00:00<?, ?it/s]

Processing videos for chiral ID: 584d53c4 (3/16):   0%|          | 0/9 [00:00<?, ?it/s]

Processing videos for chiral ID: 81dd0413 (4/16):   0%|          | 0/5 [00:00<?, ?it/s]

Processing videos for chiral ID: c1c62263 (5/16):   0%|          | 0/9 [00:00<?, ?it/s]

Processing videos for chiral ID: a1383a77 (6/16):   0%|          | 0/4 [00:00<?, ?it/s]

Processing videos for chiral ID: 3f20f09b (7/16):   0%|          | 0/67 [00:00<?, ?it/s]

Processing videos for chiral ID: 12055391 (8/16):   0%|          | 0/67 [00:00<?, ?it/s]

Processing videos for chiral ID: 106d2603 (9/16):   0%|          | 0/15 [00:00<?, ?it/s]

Processing videos for chiral ID: 743faf6b (10/16):   0%|          | 0/35 [00:00<?, ?it/s]

Processing videos for chiral ID: 545d0d67 (11/16):   0%|          | 0/11 [00:00<?, ?it/s]

Processing videos for chiral ID: 1539a698 (12/16):   0%|          | 0/4 [00:00<?, ?it/s]

Processing videos for chiral ID: ccff0ef2 (13/16):   0%|          | 0/5 [00:00<?, ?it/s]

Processing videos for chiral ID: edebebd4 (14/16):   0%|          | 0/2 [00:00<?, ?it/s]

Processing videos for chiral ID: 15c7cb08 (15/16):   0%|          | 0/3 [00:00<?, ?it/s]

Processing videos for chiral ID: 27f40159 (16/16):   0%|          | 0/1 [00:00<?, ?it/s]

286

In [59]:
correct = []
for key in answers:
    out = answers[key]
    try:
        ground_truth_index = out['ground_truth_index']
        predicted_index = int(out['answer'].replace('(', "").replace(")", "")) - 1
        correct.append(ground_truth_index == predicted_index)
    except:
        print(f"Failed to parse: {key}. Skipping.")
        display(Markdown(out['answer']))
        display(Markdown(f"**True:** {ground_truth_index + 1} {out['options'][ground_truth_index]}"))
        display(Markdown('---'))
        continue
correct = np.array(correct)
print("Number of parsed samples: ", len(correct))
accuracy = np.round(np.mean(correct) * 100., 2)
print("Accuracy: ", accuracy)

Failed to parse: /scratch/shared/beegfs/piyush/datasets/SSv2/20bn-something-something-v2/44451.webm. Skipping.


The action shown in the video is a hand picking up several pens from a group of pens laid out on a surface. This is best described as taking something from a collection of similar items.

(1)

**True:** 1 Taking [one of many similar things on the table]

---

Failed to parse: /scratch/shared/beegfs/piyush/datasets/SSv2/20bn-something-something-v2/157167.webm. Skipping.


The video clearly shows a hand reaching for the car door handle. The hand then pulls the handle and pushes the door outwards, opening it. After a brief moment with the door open, the hand pushes the door back, closing it.

Since both actions (opening and closing) are explicitly shown, and the question asks which action "best matches that shown in the video," and both options are singular actions, it's best to identify the first distinct action performed from the initial state of the door being closed. The door starts closed, and the first action performed is the opening of the door.

The final answer is $\boxed{2}$

**True:** 2 Opening ['door']

---

Failed to parse: /scratch/shared/beegfs/piyush/datasets/SSv2/20bn-something-something-v2/57991.webm. Skipping.


The action shown in the video is **opening** the bottle. The person twists and removes the lid from the container.

(2)

**True:** 2 Opening ['bottle']

---

Failed to parse: /scratch/shared/beegfs/piyush/datasets/SSv2/20bn-something-something-v2/111809.webm. Skipping.


The video shows a person's hand entering the space between a plastic fork and a plastic spoon, spreading its fingers slightly, and then retracting.

Let's analyze the movement of the fork and spoon:
1.  **Initial state (0:00):** The fork and spoon are stationary on the wooden surface, separated by a gap.
2.  **Hand enters (0:00-0:01):** The hand moves into the frame and positions itself between the fork and spoon.
3.  **Fingers spread (0:01-0:02):** The fingers spread out, potentially making light contact with the fork and spoon.
4.  **Hand retracts (0:02-0:03):** The hand lifts and moves out of the frame.
5.  **Final state (0:03):** The fork and spoon are in their original, stationary positions, with the same gap between them.

Upon close inspection, the fork and spoon do not appear to move relative to each other throughout the video. Their positions against the wood grain remain constant.

However, the question asks to choose the "best match" from the given actions:
(1) Moving [something] and [something] closer to each other
(2) Moving [something] and [something] away from each other

Since the fork and spoon do not visibly move, neither option is strictly accurate as a description of their movement.
However, if we consider the *action* of the hand and its *potential* effect if the objects were free to move, the situation changes. The hand moves *between* the fork and spoon, and the fingers *spread*. The act of spreading fingers between two objects would naturally exert an outward force, tending to push the objects *away from each other*. There is no action in the video that suggests pulling the objects closer.

Given that a choice must be made, and considering the implied intent or the direction of potential force from the hand's action (spreading fingers), option (2) is the only one that conceptually aligns with the hand's movement, even if actual displacement is imperceptible.

The final answer is $\boxed{2}$

**True:** 2 Moving [something] and [something] away from each other

---

Failed to parse: /scratch/shared/beegfs/piyush/datasets/SSv2/20bn-something-something-v2/211186.webm. Skipping.


The action shown in the video is the person moving an object upwards, from in front of their chest to above their head.

Therefore, the best match is:
(1) Moving [something] up

**True:** 1 Moving [something] up

---

Failed to parse: /scratch/shared/beegfs/piyush/datasets/SSv2/20bn-something-something-v2/20005.webm. Skipping.


The video shows a hand holding a jar filled with small objects. The hand then rotates the jar to turn it upside down, causing the contents to fall out.

When a jar is inverted to empty its contents:
- The contents move downwards due to gravity.
- The opening of the jar, which was initially facing upwards or horizontally, is moved downwards to allow the contents to exit.

Therefore, the action best matches "Moving [something] down" (referring to the contents or the opening of the jar).

The final answer is $\boxed{1}$

**True:** 1 Moving [something] down

---

Failed to parse: /scratch/shared/beegfs/piyush/datasets/SSv2/20bn-something-something-v2/191681.webm. Skipping.


The video shows a hand interacting with a 9V battery.
- At 0:00, the battery is resting on the patterned surface.
- From 0:00 to 0:01, the hand grips the battery and lifts it slightly upwards.
- From 0:01 to 0:02, the hand moves the battery slightly downwards.
- From 0:02 to 0:03, the hand moves the battery slightly upwards again.

Comparing the initial position of the battery (at 0:00) to its final position (at 0:03), the battery is visibly higher at the end of the video than at the beginning. The overall net vertical displacement of the battery from the moment the hand starts to manipulate it until the end of the clip is upwards. Also, the initial action of the hand is to lift the battery from the surface.

Therefore, the action best described is moving it up.

The final answer is $\boxed{1}$

**True:** 2 Moving [something] down

---

Failed to parse: /scratch/shared/beegfs/piyush/datasets/SSv2/20bn-something-something-v2/115710.webm. Skipping.


The video shows a person holding a yellow stick-like object.

Let's analyze the vertical movement of the object throughout the clip:
*   At 0:00, the object is at a relatively high position.
*   From 0:00 to 0:01, the object moves downwards.
*   From 0:01 to 0:02, the object moves upwards.
*   From 0:02 to 0:03, the object moves downwards again.

While the object moves both up and down, if we consider the distinct segments of motion:
1.  High to Low (Down)
2.  Low to High (Up)
3.  High to Low (Down)

There are two instances of downward movement and one instance of upward movement within this short clip. Additionally, the object's final position at 0:03 appears slightly lower than its initial position at 0:00. Therefore, "Moving [something] down" best describes the overall prominent motion and net displacement shown.

The final answer is $\boxed{1}$

**True:** 1 Moving [something] down

---

Failed to parse: /scratch/shared/beegfs/piyush/datasets/SSv2/20bn-something-something-v2/14279.webm. Skipping.


The video shows a hand picking up a phone from a surface and then placing it back down.
1.  **From 0:01 to 0:02**, the hand grasps the phone and lifts it upwards, moving the phone up from the surface.
2.  **From 0:03 to 0:04**, the hand lowers the phone back to its original position on the surface, moving the phone down.

Both "moving [something] up" and "moving [something] down" occur in the video. However, the first action that occurs, moving the phone from its resting state, is the upward movement.

The final answer is $\boxed{1}$

**True:** 1 Moving [something] up

---

Failed to parse: /scratch/shared/beegfs/piyush/datasets/SSv2/20bn-something-something-v2/123585.webm. Skipping.


The video clearly shows a hand bringing a Pepsi bottle into the frame from the bottom, moving it upwards to display it. After a brief moment, the hand then moves the bottle downwards and out of the frame.

The action of moving the bottle *up* is the initial action that brings the object into view and makes it visible to the viewer. While it is also moved down, the first significant action is the upward movement.

Therefore, "Moving [something] up" best matches the action that introduces the object into the video.

The final answer is $\boxed{1}$

**True:** 1 Moving [something] up

---

Failed to parse: /scratch/shared/beegfs/piyush/datasets/SSv2/20bn-something-something-v2/881.webm. Skipping.


The action shown in the video is a hand lifting *up* from a mouse. Therefore, the best description is:

(1) Moving [something] up

**True:** 2 Moving [something] down

---

Failed to parse: /scratch/shared/beegfs/piyush/datasets/SSv2/20bn-something-something-v2/74580.webm. Skipping.


The hand in the video is initially resting on the calculator and then lifts *up* and away from it.

Therefore, the action best matches:
(2) Moving [something] up

**True:** 2 Moving [something] up

---

Failed to parse: /scratch/shared/beegfs/piyush/datasets/SSv2/20bn-something-something-v2/71847.webm. Skipping.


The video shows a hand holding a small white plate and moving it in a rapid, repetitive up-and-down motion. While both "moving something up" and "moving something down" occur, the action starts with a distinct downward movement and also ends with a downward movement, making the downward motion slightly more prominent or initiated.

Therefore, the action that best matches the description, if a single choice must be made, is the downward movement.

The final answer is $\boxed{2}$

**True:** 2 Moving [something] down

---

Failed to parse: /scratch/shared/beegfs/piyush/datasets/SSv2/20bn-something-something-v2/135409.webm. Skipping.


The best match is:

(1) Unfolding [something]

**True:** 1 Unfolding [something]

---

Failed to parse: /scratch/shared/beegfs/piyush/datasets/SSv2/20bn-something-something-v2/53081.webm. Skipping.


The action shown in the video starts with the person folding a piece of paper multiple times (from 0:01 to 0:04) and then briefly unfolding it (from 0:04 to 0:05). Since the folding action is more extensive and takes up a larger portion of the video, it best describes the primary action.

The final answer is $\boxed{1}$

**True:** 1 Folding [something]

---

Failed to parse: /scratch/shared/beegfs/piyush/datasets/SSv2/20bn-something-something-v2/151761.webm. Skipping.


The video clearly shows two distinct actions:
1.  From 0:00 to 0:01, the person takes a folded item of clothing and stretches it out to reveal a full shirt. This is the act of **unfolding** [something].
2.  From 0:02 to 0:03, the person takes the now unfolded shirt and gathers it back into a compact form. This is the act of **folding** [something].

Since the video shows both actions, and the question asks which action "best matches", we should consider the prominence or sequence. The video starts with a folded item, and the first action performed is to unfold it to display it.

Therefore, the action that best matches the initial and complete display of the item is unfolding.

The final answer is $\boxed{1}$

**True:** 2 Folding [something]

---

Failed to parse: /scratch/shared/beegfs/piyush/datasets/SSv2/20bn-something-something-v2/129654.webm. Skipping.


The hand touches the left side of the box. While the box itself does not move significantly, the hand's subsequent subtle adjustment of its grip involves sliding its fingers and thumb slightly further to the left. If this motion were to be interpreted as pulling, the direction would be from the right side of the box towards its left side.

The best match is (1).

**True:** 2 Pulling [something] from left to right

---

Failed to parse: /scratch/shared/beegfs/piyush/datasets/SSv2/20bn-something-something-v2/143775.webm. Skipping.


The video displays a sequence of movements involving a blue pen on a white surface.

Let's analyze the movements:
- At the beginning (0:00), the pen is positioned on the right side of the frame.
- From 0:01 to 0:02, a hand moves the pen from its initial position on the right towards the left side of the frame. This is "Pulling [something] from right to left".
- From 0:02 to 0:03, the hand again moves the pen from a right-of-center position towards the left. This is another instance of "Pulling [something] from right to left".
- From 0:03 to 0:04, the hand moves the pen from a left-of-center position towards the right. This is "Pulling [something] from left to right".
- From 0:04 to 0:05, the hand again moves the pen from a left-of-center position towards the right. This is another instance of "Pulling [something] from left to right".

Both actions, "Pulling [something] from right to left" and "Pulling [something] from left to right", are shown in the video, with two instances of each.

However, the question asks which action *best matches* what is shown. The very first action performed in the video, and repeated immediately, is moving the pen from the right side of the frame to the left side. This makes "Pulling [something] from right to left" the initial and therefore a prominent action shown early in the video.

The final answer is $\boxed{2}$

**True:** 2 Pulling [something] from right to left

---

Failed to parse: /scratch/shared/beegfs/piyush/datasets/SSv2/20bn-something-something-v2/122623.webm. Skipping.


The video shows a hand entering the frame from the bottom-left and moving towards a spinning coin located on the right side of the frame. The hand moves in a general direction from left to right to interact with the coin. As the hand approaches and makes contact (or gets very close), the coin stops spinning and settles.

Let's analyze the options:
(1) Pushing [something] from right to left: This would imply the object moves from the right side towards the left. While the coin's final resting position is slightly to the left of its initial spinning spot, the hand initiating the interaction is on the left, not the right. If the coin moved from right to left, the force would typically come from the right.

(2) Pushing [something] from left to right: This would imply the object moves from the left side towards the right, or that the force is applied from the left towards the right. The hand, which is the agent of the action, moves from the left side of the frame towards the right. If the hand were pushing, its action would be in the left-to-right direction.

Considering that the "action" refers to the movement and applied force of the hand, the hand's trajectory and point of contact are consistent with an action originating from the left and moving towards the right. Even if the coin's final small displacement is slightly to the left (possibly due to complex interaction of stopping a spin), the primary action of the hand itself is directed from left to right.

Therefore, describing the hand's action, the best match is pushing from left to right.

The final answer is $\boxed{2}$

**True:** 2 Pushing [something] from left to right

---

Number of parsed samples:  267
Accuracy:  89.14


In [60]:
(sum(correct) + 15) / len(answers)

np.float64(0.8846153846153846)