In [1]:
import os
import sys
import random

import numpy as np
import pandas as pd
from tqdm import tqdm

import shared.utils as su

In [2]:
def display_row(df):
    import json
    print(json.dumps(df.iloc[np.random.randint(len(df))].to_dict(), indent=2))
    print('-' * 100)

### Create ablation splits

In [3]:
data_dir = "/scratch/shared/beegfs/piyush/datasets/SimCSE-NLI"

df_ego = pd.read_csv(f"{data_dir}/ego4d-422K.csv")
df_nli = pd.read_csv(f"{data_dir}/nli-275k.csv")

def filter_csv(df):
    """We do not want any repetition within a triplet for safety."""
    df = df[
        ~((df.sent0 == df.sent1) | (df.sent1 == df.hard_neg) | (df.sent0 == df.hard_neg))
    ].reset_index(drop=True)
    return df

df_ego = filter_csv(df_ego)
df_nli = filter_csv(df_nli)
df_ego['source'] = 'ego4d'
df_nli['source'] = 'nli'

df_ego.shape, df_nli.shape

((422360, 4), (275064, 4))

### Random seeds

In [14]:
seeds = [1, 21, 63, 84]
nnli = 18000
nego = 2000
save_dir = f"{data_dir}/final-10112025"

for seed in seeds:
    print("Seed: ", seed)
    save_path = f"{save_dir}/nli_{nnli}+ego_{nego}-seed_{seed}.csv"
    print("Path: ", save_path)
    if os.path.exists(save_path):
        print(f"File exists at {save_path}. Skipping.")

    df_split = pd.concat([
        df_nli.sample(n=nnli, random_state=seed),
        df_ego.sample(n=nego, random_state=seed),
    ]).reset_index(drop=True)
    print(df_split.shape)
    # print(df_split.iloc[0])
    df_split.to_csv(save_path, index=False)
    
    print('-' * 120)

Seed:  1
Path:  /scratch/shared/beegfs/piyush/datasets/SimCSE-NLI/final-10112025/nli_18000+ego_2000-seed_1.csv
(20000, 4)
------------------------------------------------------------------------------------------------------------------------
Seed:  21
Path:  /scratch/shared/beegfs/piyush/datasets/SimCSE-NLI/final-10112025/nli_18000+ego_2000-seed_21.csv
(20000, 4)
------------------------------------------------------------------------------------------------------------------------
Seed:  63
Path:  /scratch/shared/beegfs/piyush/datasets/SimCSE-NLI/final-10112025/nli_18000+ego_2000-seed_63.csv
(20000, 4)
------------------------------------------------------------------------------------------------------------------------
Seed:  84
Path:  /scratch/shared/beegfs/piyush/datasets/SimCSE-NLI/final-10112025/nli_18000+ego_2000-seed_84.csv
(20000, 4)
------------------------------------------------------------------------------------------------------------------------


### New optimal split

In [12]:
nnli = 18000
nego = 2000
seed = 42
df_split = pd.concat([
    df_nli.sample(n=nnli, random_state=seed),
    df_ego.sample(n=nego, random_state=seed),
]).reset_index(drop=True)
print(df_split.shape)
save_path = f"{save_dir}/nli_{nnli}+ego_{nego}-seed_{seed}.csv"
print(save_path)

(20000, 4)
/scratch/shared/beegfs/piyush/datasets/SimCSE-NLI/final-10112025/nli_18000+ego_2000-seed_42.csv


### Increasing data size

We will vary size of the data: 5K, 10K (done), 20K, 50K, 100K with the same composition of 0.9:0.1. We will make sure the 10K samples in our base split are used in the larger splits.

Then, for 10K samples, we can vary the ratio $\alpha:1-\alpha$.

- For all these experiments, we do not use subject replacement since it is expensive.
- We will run all these experiments and evaluate for temporal chiral actions on SSv2 (avg. of `t->v` and `v2t`)

In [5]:
# Load base split
save_dir = f"{data_dir}/final-10112025"
base_csv = f"{save_dir}/nli_9000+ego_1000-seed_42.csv"
df_base = pd.read_csv(base_csv)
df_base.shape

(10000, 7)

In [10]:
# 5K
df_split = pd.concat([
    df_base[df_base.source == 'nli'].sample(n=4500, random_state=42),
    df_base[df_base.source == 'nli'].sample(n=500, random_state=42),
]).reset_index(drop=True)
df_split.to_csv(f"{save_dir}/nli_4500+ego_500-seed_42.csv", index=False)

In [14]:
# 10K..100K
n = [20000, 50000, 100000]
for num in n:

    nnli = int(num * 0.9)
    nego = int(num * 0.1)
    save_path = f"{save_dir}/nli_{nnli}+ego_{nego}-seed_42.csv"

    # NOTE we use 10K samples already from the original split
    num = num - 10000

    nnli = int(num * 0.9)
    nego = int(num * 0.1)
    df_split = pd.concat([
        df_nli.sample(n=nnli, random_state=42),
        df_ego.sample(n=nego, random_state=42),
    ]).reset_index(drop=True)
    df_split = pd.concat([df_base, df_split]).reset_index(drop=True)

    print(df_split.shape)
    print(save_path)
    df_split.to_csv(save_path, index=False)
    print('-' * 100)

(20000, 7)
/scratch/shared/beegfs/piyush/datasets/SimCSE-NLI/final-10112025/nli_18000+ego_2000-seed_42.csv
----------------------------------------------------------------------------------------------------
(50000, 7)
/scratch/shared/beegfs/piyush/datasets/SimCSE-NLI/final-10112025/nli_45000+ego_5000-seed_42.csv
----------------------------------------------------------------------------------------------------
(100000, 7)
/scratch/shared/beegfs/piyush/datasets/SimCSE-NLI/final-10112025/nli_90000+ego_10000-seed_42.csv
----------------------------------------------------------------------------------------------------


In [21]:
num_nli = [0, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 10000]
for nnli in num_nli:
    nego = 10000 - nnli
    save_path = f"{save_dir}/nli_{nnli}+ego_{nego}-seed_42.csv"
    print(save_path)

    if nnli < 9000:
        # Sample all from df_base
        df_nli_ = df_base[df_base.source == 'nli'].sample(n=nnli, random_state=42)
        df_ego_ = pd.concat(
            [df_base[df_base.source == 'ego4d'], df_ego.sample(n=9000-nnli)]
        )
    else:
        df_nli_ = pd.concat(
            [df_base[df_base.source == 'nli'], df_nli.sample(n=nnli-9000, random_state=42)]
        )
        df_ego_ = pd.DataFrame([])
    df_ = pd.concat([df_nli_, df_ego_]).reset_index(drop=True)
    print(df_.source.value_counts())
    print(nnli, nego)

    df_.to_csv(save_path, index=False)
    
    print('-' * 100)

/scratch/shared/beegfs/piyush/datasets/SimCSE-NLI/final-10112025/nli_0+ego_10000-seed_42.csv
source
ego4d    10000
Name: count, dtype: int64
0 10000
----------------------------------------------------------------------------------------------------
/scratch/shared/beegfs/piyush/datasets/SimCSE-NLI/final-10112025/nli_1000+ego_9000-seed_42.csv
source
ego4d    9000
nli      1000
Name: count, dtype: int64
1000 9000
----------------------------------------------------------------------------------------------------
/scratch/shared/beegfs/piyush/datasets/SimCSE-NLI/final-10112025/nli_2000+ego_8000-seed_42.csv
source
ego4d    8000
nli      2000
Name: count, dtype: int64
2000 8000
----------------------------------------------------------------------------------------------------
/scratch/shared/beegfs/piyush/datasets/SimCSE-NLI/final-10112025/nli_3000+ego_7000-seed_42.csv
source
ego4d    7000
nli      3000
Name: count, dtype: int64
3000 7000
--------------------------------------------------

In [None]:
n_nli = 150000
n_ego = 50000
split_path = f'{data_dir}/final-10112025/nli_{n_nli}+ego_{n_ego}-seed_42.csv'


In [None]:
np.random.seed(42)
random.seed(42)

df_nli_chosen = df_nli.sample(n=n_nli, random_state=42)
df_nli_chosen = df_nli_chosen.reset_index(drop=True)
print(df_nli_chosen.shape)

# Sample 5K rows from Ego
df_ego_chosen = df_ego.sample(n=n_ego, random_state=42)
df_ego_chosen = df_ego_chosen.reset_index(drop=True)
print(df_ego_chosen.shape)

# display_row(df_nli_chosen)
# display_row(df_ego_chosen)

df = pd.concat([df_nli_chosen, df_ego_chosen])
df.shape

display_row(df)

save_dir = f'{data_dir}/final-10112025'
os.makedirs(save_dir, exist_ok=True)

save_path = f"{save_dir}/nli_{n_nli}+ego_{n_ego}-seed_42.csv"
df.to_csv(save_path, index=False)

In [3]:
n_nli = 9000
n_ego = 1000
split_path = f'{data_dir}/final-10112025/nli_{n_nli}+ego_{n_ego}-seed_42.csv'
assert os.path.exists(split_path)
df = pd.read_csv(split_path)
df.shape

(10000, 7)

**Replace all the anonymized subject entires in Ego4D with realistic subjects**

In [4]:
df_ego = df[df.source == 'ego4d']
df_ego.shape

(1000, 7)

In [34]:
llm_server = 'gemini'
if llm_server == 'gemini':
    from utils.gemini_utils import GeminiWrapper
    llm = GeminiWrapper(model_key='gemini-flash-lite-latest')
elif llm_server == 'qwen3':
    llm = QwenWrapper(model_name='/work/piyush/pretrained_checkpoints/Qwen3-4B-Instruct-2507')
else:
    raise ValueError
llm.generate_answer('what is the name of a Tier 3 Indian city? Given one example')

[33mLoading gemini-flash-lite-latest ...............................................  [0m


'A Tier 3 Indian city is generally defined as a city with a population between **50,000 and 100,000** (though definitions can vary slightly based on the source, such as census data or specific government classifications).\n\nHere is one example of a Tier 3 Indian city:\n\n**Example:** **Bhilai, Chhattisgarh**'

In [40]:
prompt_base = """
You are an expert in English comprehension and writing.
Given three sentences where the subjects may be anonymized,
your task is to fill the placeholders for subjects with realistic
subjects.

For example, given these sentences,

S1: #C C Puts down a serving spoon and chop sticks on a cooking pot
S2: #C C puts a spoon in a bowl.
S3: #C C Picks up a serving spoon and chop sticks from a cooking pot

a valid response could be something like:

S1: The chef puts down a serving spoon and chop sticks on a cooking pot
S2: The chef puts a spoon in a bowl.
S3: The chef picks up a serving spoon and chop sticks from a cooking pot

This is only an example, think logically what subject would best fit
the given description and situation. For example, you will not find a cook
doing carpentary. In case you are not sure, you can use generic subject 
pronouns like 'The man' or 'The person' or 'The lady', or use proper nouns
like name of a person etc. Do not just use the template examples, you can
be slightly creative. Make sure it is the same subject in all three sentences.

Test input:

"""


def construct_prompt(row):
    prompt_input = f"S1: {row['sent0']}\nS2: {row['sent1']}\nS3: {row['hard_neg']}"
    prompt = prompt_base + prompt_input
    return prompt

In [47]:
debug = False
iterator = su.log.tqdm_iterator(range(len(df_ego))) if not debug else [np.random.randint(len(df_ego))]
answers = []
for i in iterator:
    row = df_ego.iloc[i].to_dict()
    prompt = construct_prompt(row)

    if debug:
        print(i)
        print('-' * 100)
        print(prompt)
        print('-' * 100)

        answer = llm.generate_answer(prompt)
        print(answer)
        print('-' * 100)
        break
    else:
        answer = llm.generate_answer(prompt)
        answers.append(answer)
len(answers)

  0%|          | 0/1000 [00:00<?, ?it/s]

1000

In [55]:
df_ego_subj = []
for i in su.log.tqdm_iterator(range(len(df_ego))):
    row = df_ego.iloc[i].to_dict()
    answer = answers[i]
    try:
        sentences = answer.split('\n')
        sentences = [s.split(": ")[1] for s in sentences]
    except:
        print(f"Failed for {i}. Skip.")
        sentences = [row['sent0'], row['sent1'], row['hard_neg']]
    row['sent0'], row['sent1'], row['hard_neg'] = sentences
    df_ego_subj.append(row)
df_ego_subj = pd.DataFrame(df_ego_subj)
df_ego_subj.shape

  0%|          | 0/1000 [00:00<?, ?it/s]

Failed for 30. Skip.
Failed for 123. Skip.
Failed for 221. Skip.
Failed for 300. Skip.
Failed for 352. Skip.
Failed for 384. Skip.
Failed for 394. Skip.
Failed for 398. Skip.
Failed for 985. Skip.


(1000, 7)

In [75]:
df_subj = pd.concat([df[df.source == 'nli'], df_ego_subj]).reset_index(drop=True)
df_subj.shape

(10000, 7)

In [76]:
display_row(df_subj)

{
  "sent0": "Many people are resting on park benches situated under beautiful pink trees that are in full bloom.",
  "sent1": "People are sitting in the park.",
  "hard_neg": "People are running past the flowers.",
  "source": "nli",
  "sent0-verbobj": NaN,
  "sent1-verbobj": NaN,
  "hard_neg-verbobj": NaN
}
----------------------------------------------------------------------------------------------------


In [77]:
df_subj = df_subj.reset_index(drop=True)

save_path = f'{data_dir}/final-10112025/nli_{n_nli}+ego_{n_ego}+subj_replaced-seed_42.csv'
df_subj.to_csv(save_path, index=False)

**Fix existing split**

In [68]:
csv_path = f"{data_dir}/final-10112025/nli_25000+ego_5000-seed_42.csv"
df_ = pd.read_csv(csv_path)
df_.sent0.isnull().mean(), \
df_.sent1.isnull().mean(), \
df_.hard_neg.isnull().mean()

(np.float64(0.0), np.float64(0.0), np.float64(0.0))

**Replace all the rows with Ego4D with those from NLI**

In [7]:
# Replace all the rows with Ego4D with those from NLI
df_noego = df.copy()
df_noego = df_noego[df_noego.source == 'nli'].reset_index(drop=True)

df_toadd = df_nli.sample(n=n_ego, random_state=42)
df_noego = pd.concat([df_noego, df_toadd])

df_noego.shape

(10000, 7)

In [8]:
df_noego = df_noego.reset_index(drop=True)

save_path = f'{data_dir}/final-10112025/nli_{n_nli+n_ego}-seed_42.csv'
df_noego.to_csv(save_path, index=False)

### Base: create fresh splits

**Load and process Ego4D metadata**

In [4]:
ego_dir = "/scratch/shared/beegfs/piyush/datasets/Ego4D-HCap/metadata"
df_vo = pd.read_csv(
    f"{ego_dir}/ego4d_5.3M_clips_verb_object_en_core_web_sm.csv"
)
df_vo.shape

(5273287, 3)

In [5]:
tqdm.pandas(desc='Merging verb object')
df_vo['verbobj'] = df_vo[['verb', 'object']].progress_apply(lambda x: f"{x[0]}/{x[1]}", axis=1)
df_vo.shape

Merging verb object: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 5273287/5273287 [01:54<00:00, 46169.29it/s]


(5273287, 4)

In [6]:
df_vo.iloc[10]

verb                          close
object                     cupboard
caption    #C C closes the cupboard
verbobj              close/cupboard
Name: 10, dtype: object

In [7]:
sent_to_verbobj = dict(zip(df_vo['caption'], df_vo['verbobj']))
len(sent_to_verbobj)

1640869

**Load text triplet datasets**

In [8]:
data_dir = "/scratch/shared/beegfs/piyush/datasets/SimCSE-NLI"

df_ego = pd.read_csv(f"{data_dir}/ego4d-422K.csv")
df_nli = pd.read_csv(f"{data_dir}/nli-275k.csv")

def filter_csv(df):
    """We do not want any repetition within a triplet for safety."""
    df = df[
        ~((df.sent0 == df.sent1) | (df.sent1 == df.hard_neg) | (df.sent0 == df.hard_neg))
    ].reset_index(drop=True)
    return df

df_ego = filter_csv(df_ego)
df_nli = filter_csv(df_nli)
df_ego['source'] = 'ego4d'
df_nli['source'] = 'nli'

df_ego.shape, df_nli.shape

((422360, 4), (275064, 4))

In [9]:
# Add verb/obj annotations to Ego4D
for col in su.log.tqdm_iterator(df_ego.columns):
    df_ego[f"{col}-verbobj"] = df_ego[col].map(sent_to_verbobj)
df_ego.shape

  0%|          | 0/4 [00:00<?, ?it/s]

(422360, 8)

In [10]:
# Only consider those triplets where verb/obj match for sent0 & sent1
df_ego = df_ego[df_ego['sent0-verbobj'] == df_ego['sent1-verbobj']].reset_index(drop=True)
del df_ego['source-verbobj']
df_ego.shape

(180006, 7)

In [11]:
j = np.random.randint(len(df_ego))
df_ego.iloc[j]

sent0                 #O person Y puts the glass on the table
sent1                             #C C puts glass on TV stand
hard_neg            #O person Y takes the glass off the table
source                                                  ego4d
sent0-verbobj                                       put/glass
sent1-verbobj                                       put/glass
hard_neg-verbobj                                          NaN
Name: 157783, dtype: object

#### Save 

In [12]:
np.random.seed(42)
random.seed(42)

# Sample 25K rows from NLI
n_nli = 150000
df_nli_chosen = df_nli.sample(n=n_nli, random_state=42)
df_nli_chosen = df_nli_chosen.reset_index(drop=True)
print(df_nli_chosen.shape)

# Sample 5K rows from Ego
n_ego = 50000
df_ego_chosen = df_ego.sample(n=n_ego, random_state=42)
df_ego_chosen = df_ego_chosen.reset_index(drop=True)
print(df_ego_chosen.shape)



display_row(df_nli_chosen)
display_row(df_ego_chosen)

df = pd.concat([df_nli_chosen, df_ego_chosen])
df.shape

display_row(df)

save_dir = f'{data_dir}/final-10112025'
os.makedirs(save_dir, exist_ok=True)

save_path = f"{save_dir}/nli_{n_nli}+ego_{n_ego}-seed_42.csv"
df.to_csv(save_path, index=False)

(150000, 4)
(50000, 7)
{
  "sent0": "A child, roughly ten years old, playing a video game on an old style computer.",
  "sent1": "The child is playing a video game.",
  "hard_neg": "The child is watching a movie.",
  "source": "nli"
}
----------------------------------------------------------------------------------------------------
{
  "sent0": "#C C lifts the cloth on the table with her left hand.",
  "sent1": "#C C lifts the cloth",
  "hard_neg": "#C C lowers the cloth on the table with her left hand.",
  "source": "ego4d",
  "sent0-verbobj": "lift/cloth",
  "sent1-verbobj": "lift/cloth",
  "hard_neg-verbobj": NaN
}
----------------------------------------------------------------------------------------------------
{
  "sent0": "Morgan died in 1588 before his task was complete, but nature finished what he had  Jamaica suffered a powerful earthquake in 1692, and Port Royal sank into the sea, taking with it many of the treasures stolen from the Spanish.",
  "sent1": "He died before c

**Review samples**

In [14]:
i = np.random.randint(len(df_nli))
df_nli.iloc[i].to_dict()

{'sent0': 'Two young blond children are inspecting the lobster tank in a store.',
 'sent1': 'Two young children are inspecting the lobster tank in a store.',
 'hard_neg': 'Three young blond children are inspecting the lobster tank in a store.',
 'source': 'nli'}

(25000, 4)

(5000, 4)

In [27]:
j = np.random.randint(len(df_ego))
df_ego.iloc[j].to_dict()

{'sent0': '#C C closes the cupboard with her left hand',
 'sent1': '#C C opens the kitchen cabinet',
 'hard_neg': '#C C opens the cupboard with her left hand',
 'source': 'ego4d'}

In [None]:
def find_verb_object(sent):
    df_vo[]


x = '#C C picks the bowl from the table'
find_verb_object(x)

In [30]:



df_ego['sent0_verb_obj'] = df_ego.sent0.apply(
    lambda x: find_verb_object(x)
)

Unnamed: 0,sent0,sent1,hard_neg,source
0,#C C places u type cross scissors on the sewin...,#C C puts scissors down,#C C removes u type cross scissors from the se...,ego4d
1,#C C puts the food on the dish,#C C puts a snack in the pan,#C C takes the food off the dish,ego4d
2,#C C puts cup on table,#C C puts the cup on the table with his right ...,#C C takes cup off table,ego4d
3,#C C opens the tin lid,#C C puts the lid in the rack,#C C closes the tin lid,ego4d
4,#C C folds the measuring tape in his left hand.,#C C folds a shirt,#C C unfolds the measuring tape in his left hand.,ego4d
...,...,...,...,...
422355,#C C drops the paint can with his left hand,#C C lowers her left hand.,#C C picks up the paint can with his left hand,ego4d
422356,#C C puts shopping bag on table,#C C puts the pack of apples in the box with h...,#C C takes shopping bag off table,ego4d
422357,#C C closes the container with its cover.,#C C closes the container on the table,#C C opens the container with its cover.,ego4d
422358,#C C lifts her left hand,#C C lifts his right hand from the mouse,#C C lowers her left hand,ego4d


In [29]:
df_vo

Unnamed: 0,verb,object,caption
0,carry,pot,#C C carries a pot from the cooker
1,drop,pot,#C C drops the pot on the cooker
2,carry,pan,#C C carries a sauce pan from the counter
3,drop,pan,#C C drops the sauce pan on the cooker
4,turn,light,#C C turns on a light
...,...,...,...
5273282,pick,bowl,#C C picks the bowl from the table
5273283,walk,sink,#C C walks towards the sink
5273284,put,bowls,#C C puts the bowls on sink
5273285,stare,,#C C stares around
