In [1]:
import os
import sys
import random

import numpy as np
import pandas as pd
from tqdm import tqdm

import shared.utils as su

In [57]:
def display_row(df):
    """Pretty-print one randomly chosen row of `df` as indented JSON.

    The row position is drawn from numpy's global RNG; a 100-character
    dashed rule is printed after the row.
    """
    import json
    position = np.random.randint(len(df))
    record = df.iloc[position].to_dict()
    print(json.dumps(record, indent=2))
    print('-' * 100)

### Create ablation splits

In [2]:
# Directory holding the triplet CSVs and the generated splits.
data_dir = "/scratch/shared/beegfs/piyush/datasets/SimCSE-NLI"

# Full NLI triplet table (columns used below: sent0, sent1, hard_neg).
df_nli = pd.read_csv(f"{data_dir}/nli-275k.csv")

def filter_csv(df):
    """Drop every triplet in which any two of sent0 / sent1 / hard_neg coincide.

    Returns the surviving rows as a new frame with a fresh 0..n-1 index.
    """
    s0_eq_s1 = df.sent0 == df.sent1
    s1_eq_neg = df.sent1 == df.hard_neg
    s0_eq_neg = df.sent0 == df.hard_neg
    has_repeat = s0_eq_s1 | s1_eq_neg | s0_eq_neg
    return df[~has_repeat].reset_index(drop=True)

# Remove triplets with repeated sentences, then tag every remaining row with its origin.
df_nli = filter_csv(df_nli)
df_nli['source'] = 'nli'

df_nli.shape

(275064, 4)

In [3]:
# Sizes of the NLI / Ego4D parts of the split; they select which saved file is loaded.
n_nli = 9000
n_ego = 1000
split_path = f'{data_dir}/final-10112025/nli_{n_nli}+ego_{n_ego}-seed_42.csv'
# Stop early if the split file has not been generated yet.
assert os.path.exists(split_path)
df = pd.read_csv(split_path)
df.shape

(10000, 7)

**Replace all the anonymized subject entries in Ego4D with realistic subjects**

In [4]:
# Keep only the rows of the split that come from Ego4D.
df_ego = df[df.source == 'ego4d']
df_ego.shape

(1000, 7)

In [34]:
# Select the LLM backend used to rewrite the Ego4D captions.
llm_server = 'gemini'
if llm_server == 'gemini':
    from utils.gemini_utils import GeminiWrapper
    llm = GeminiWrapper(model_key='gemini-flash-lite-latest')
elif llm_server == 'qwen3':
    # NOTE(review): `QwenWrapper` is not imported anywhere in the visible notebook,
    # so this branch raises NameError as written. Import it from its module before
    # switching `llm_server` to 'qwen3'.
    llm = QwenWrapper(model_name='/work/piyush/pretrained_checkpoints/Qwen3-4B-Instruct-2507')
else:
    # Name the offending value instead of raising a message-less ValueError.
    raise ValueError(f"Unknown llm_server {llm_server!r}; expected 'gemini' or 'qwen3'.")
# Smoke test: one throwaway question to confirm that the backend answers.
llm.generate_answer('what is the name of a Tier 3 Indian city? Given one example')

[33mLoading gemini-flash-lite-latest ...............................................  [0m


'A Tier 3 Indian city is generally defined as a city with a population between **50,000 and 100,000** (though definitions can vary slightly based on the source, such as census data or specific government classifications).\n\nHere is one example of a Tier 3 Indian city:\n\n**Example:** **Bhilai, Chhattisgarh**'

In [40]:
# Instruction prefix sent to the LLM. `construct_prompt` appends the three
# sentences of one row (as S1/S2/S3 lines) right after the trailing "Test input:".
prompt_base = """
You are an expert in English comprehension and writing.
Given three sentences where the subjects may be anonymized,
your task is to fill the placeholders for subjects with realistic
subjects.

For example, given these sentences,

S1: #C C Puts down a serving spoon and chop sticks on a cooking pot
S2: #C C puts a spoon in a bowl.
S3: #C C Picks up a serving spoon and chop sticks from a cooking pot

a valid response could be something like:

S1: The chef puts down a serving spoon and chop sticks on a cooking pot
S2: The chef puts a spoon in a bowl.
S3: The chef picks up a serving spoon and chop sticks from a cooking pot

This is only an example, think logically what subject would best fit
the given description and situation. For example, you will not find a cook
doing carpentary. In case you are not sure, you can use generic subject 
pronouns like 'The man' or 'The person' or 'The lady', or use proper nouns
like name of a person etc. Do not just use the template examples, you can
be slightly creative. Make sure it is the same subject in all three sentences.

Test input:

"""


def construct_prompt(row):
    """Return the full LLM prompt for one triplet.

    `row` is a mapping with keys 'sent0', 'sent1' and 'hard_neg'; the three
    sentences are appended to `prompt_base` as newline-separated S1/S2/S3 lines.
    """
    labelled = [
        f"S1: {row['sent0']}",
        f"S2: {row['sent1']}",
        f"S3: {row['hard_neg']}",
    ]
    return prompt_base + "\n".join(labelled)

In [47]:
# Query the LLM once per Ego4D row and collect the raw answers.
# With `debug = True`, a single random row is processed and printed instead,
# and nothing is appended to `answers`.
debug = False
if debug:
    iterator = [np.random.randint(len(df_ego))]
else:
    iterator = su.log.tqdm_iterator(range(len(df_ego)))

answers = []
for i in iterator:
    row = df_ego.iloc[i].to_dict()
    prompt = construct_prompt(row)

    if not debug:
        answer = llm.generate_answer(prompt)
        answers.append(answer)
        continue

    # Debug path: show the index and the prompt before calling the model.
    print(i, '-' * 100, prompt, '-' * 100, sep='\n')
    answer = llm.generate_answer(prompt)
    print(answer, '-' * 100, sep='\n')
    break
len(answers)

  0%|          | 0/1000 [00:00<?, ?it/s]

1000

In [55]:
def _parse_rewritten_triplet(answer):
    """Extract the three rewritten sentences from one LLM answer.

    The answer is expected to consist of exactly three non-blank lines of the
    form "<label>: <sentence>". Returns `[s1, s2, s3]`, or `None` when the
    answer does not have that shape (not a string, wrong number of lines,
    a line without ": ", or an empty sentence).
    """
    if not isinstance(answer, str):
        return None
    # Blank lines (e.g. a trailing newline) carry no sentence; ignore them.
    lines = [line for line in answer.split('\n') if line.strip()]
    if len(lines) != 3:
        return None
    sentences = []
    for line in lines:
        # Split on the first ": " only, so a sentence that itself contains ": "
        # is kept whole instead of being truncated.
        _label, separator, text = line.partition(': ')
        if not separator or not text.strip():
            return None
        sentences.append(text)
    return sentences


# Replace the sentences of every Ego4D row with the LLM rewrite; rows whose
# answer cannot be parsed keep their original sentences and are reported.
records = []
for i in su.log.tqdm_iterator(range(len(df_ego))):
    row = df_ego.iloc[i].to_dict()
    sentences = _parse_rewritten_triplet(answers[i])
    if sentences is None:
        print(f"Failed for {i}. Skip.")
    else:
        row['sent0'], row['sent1'], row['hard_neg'] = sentences
    records.append(row)
df_ego_subj = pd.DataFrame(records)
df_ego_subj.shape

  0%|          | 0/1000 [00:00<?, ?it/s]

Failed for 30. Skip.
Failed for 123. Skip.
Failed for 221. Skip.
Failed for 300. Skip.
Failed for 352. Skip.
Failed for 384. Skip.
Failed for 394. Skip.
Failed for 398. Skip.
Failed for 985. Skip.


(1000, 7)

In [75]:
# Recombine the untouched NLI rows of the split with the rewritten Ego4D rows.
df_subj = pd.concat([df[df.source == 'nli'], df_ego_subj]).reset_index(drop=True)
df_subj.shape

(10000, 7)

In [76]:
# Spot-check one random row of the combined split.
display_row(df_subj)

{
  "sent0": "Many people are resting on park benches situated under beautiful pink trees that are in full bloom.",
  "sent1": "People are sitting in the park.",
  "hard_neg": "People are running past the flowers.",
  "source": "nli",
  "sent0-verbobj": NaN,
  "sent1-verbobj": NaN,
  "hard_neg-verbobj": NaN
}
----------------------------------------------------------------------------------------------------


In [77]:
# Re-number the rows before saving (the index itself is not written to the CSV).
df_subj = df_subj.reset_index(drop=True)

# Same naming scheme as the source split, with a `+subj_replaced` marker.
save_path = f'{data_dir}/final-10112025/nli_{n_nli}+ego_{n_ego}+subj_replaced-seed_42.csv'
df_subj.to_csv(save_path, index=False)

**Fix existing split**

In [68]:
# Load an existing split and report the fraction of missing values
# in each of the three sentence columns.
csv_path = f"{data_dir}/final-10112025/nli_25000+ego_5000-seed_42.csv"
df_ = pd.read_csv(csv_path)
df_.sent0.isnull().mean(), \
df_.sent1.isnull().mean(), \
df_.hard_neg.isnull().mean()

(np.float64(0.0), np.float64(0.0), np.float64(0.0))

**Replace all the rows with Ego4D with those from NLI**

In [7]:
# Replace all the rows with Ego4D with those from NLI:
# keep the NLI rows of `df` and top up with `n_ego` further NLI triplets.
triplet_cols = ['sent0', 'sent1', 'hard_neg']
df_noego = df[df.source == 'nli'].reset_index(drop=True)

# The NLI rows already in `df` were themselves drawn from `df_nli` with
# `random_state=42`. Sampling `df_nli` again with that seed returns rows that
# are already in the split (a seeded draw of fewer rows is a prefix of the
# larger draw), so restrict the pool to triplets not used yet before sampling.
pool = df_nli.merge(
    df_noego[triplet_cols].drop_duplicates(),
    on=triplet_cols,
    how='left',
    indicator=True,
)
pool = pool[pool['_merge'] == 'left_only'].drop(columns='_merge')

df_toadd = pool.sample(n=n_ego, random_state=42)
df_noego = pd.concat([df_noego, df_toadd])

df_noego.shape

(10000, 7)

In [8]:
# The concatenation above keeps the original row labels; re-number before saving.
df_noego = df_noego.reset_index(drop=True)

# NLI-only split with the same total number of rows as the mixed split.
save_path = f'{data_dir}/final-10112025/nli_{n_nli+n_ego}-seed_42.csv'
df_noego.to_csv(save_path, index=False)

### Base: create fresh splits

**Load and process Ego4D metadata**

In [2]:
# Per-clip table with `verb`, `object` and `caption` columns.
ego_dir = "/scratch/shared/beegfs/piyush/datasets/Ego4D-HCap/metadata"
df_vo = pd.read_csv(
    f"{ego_dir}/ego4d_5.3M_clips_verb_object_en_core_web_sm.csv"
)
df_vo.shape

(5273287, 3)

In [3]:
# Kept so that `progress_apply` stays registered for any other cell relying on it.
tqdm.pandas(desc='Merging verb object')
# Build a "verb/object" key per caption. Zipping the two columns formats every
# pair exactly like the former row-wise `f"{x[0]}/{x[1]}"` (missing values still
# render as "nan"), but avoids integer-key access on a label-indexed row, which
# recent pandas deprecates, and the cost of building one Series per row.
df_vo['verbobj'] = [
    f"{verb}/{obj}"
    for verb, obj in tqdm(
        zip(df_vo['verb'], df_vo['object']),
        total=len(df_vo),
        desc='Merging verb object',
    )
]
df_vo.shape

Merging verb object: 100%|████████████████████████████████████████████████████████████████████████████████████████████| 5273287/5273287 [01:53<00:00, 46544.26it/s]


(5273287, 4)

In [4]:
# Inspect one row to check the new `verbobj` column.
df_vo.iloc[10]

verb                          close
object                     cupboard
caption    #C C closes the cupboard
verbobj              close/cupboard
Name: 10, dtype: object

In [5]:
# Caption -> "verb/object" lookup. Captions repeat in `df_vo` (the dict ends up
# with fewer keys than the table has rows); for a repeated caption the entry
# from its last row wins.
sent_to_verbobj = dict(zip(df_vo['caption'], df_vo['verbobj']))
len(sent_to_verbobj)

1640869

**Load text triplet datasets**

In [6]:
# Directory holding the triplet CSVs and the generated splits.
data_dir = "/scratch/shared/beegfs/piyush/datasets/SimCSE-NLI"

# Raw triplet tables (columns used below: sent0, sent1, hard_neg).
df_ego = pd.read_csv(f"{data_dir}/ego4d-422K.csv")
df_nli = pd.read_csv(f"{data_dir}/nli-275k.csv")

def filter_csv(df):
    """Drop every triplet in which any two of sent0 / sent1 / hard_neg coincide.

    Returns the surviving rows as a new frame with a fresh 0..n-1 index.
    """
    s0_eq_s1 = df.sent0 == df.sent1
    s1_eq_neg = df.sent1 == df.hard_neg
    s0_eq_neg = df.sent0 == df.hard_neg
    has_repeat = s0_eq_s1 | s1_eq_neg | s0_eq_neg
    return df[~has_repeat].reset_index(drop=True)

# Remove triplets with repeated sentences from both tables,
# then tag every row with the dataset it comes from.
df_ego = filter_csv(df_ego)
df_nli = filter_csv(df_nli)
df_ego['source'] = 'ego4d'
df_nli['source'] = 'nli'

df_ego.shape, df_nli.shape

((422360, 4), (275064, 4))

In [8]:
# Add verb/obj annotations to Ego4D
# NOTE: the loop runs over every column present when it starts, including
# `source`, so it also creates a `source-verbobj` column (removed in a later
# cell). Values with no entry in `sent_to_verbobj` become NaN.
for col in su.log.tqdm_iterator(df_ego.columns):
    df_ego[f"{col}-verbobj"] = df_ego[col].map(sent_to_verbobj)
df_ego.shape

  0%|          | 0/4 [00:00<?, ?it/s]

(422360, 8)

In [12]:
# Only consider those triplets where verb/obj match for sent0 & sent1.
# Rows where either key is NaN never compare equal, so they are dropped as well.
same_verbobj = df_ego['sent0-verbobj'] == df_ego['sent1-verbobj']
df_ego = df_ego[same_verbobj].reset_index(drop=True)
# `source-verbobj` is a by-product of annotating every column. Drop it with
# errors='ignore' so that re-running this cell does not raise KeyError.
df_ego = df_ego.drop(columns=['source-verbobj'], errors='ignore')
df_ego.shape

(180006, 7)

In [18]:
# Spot-check one random Ego4D triplet together with its verb/object keys.
j = np.random.randint(len(df_ego))
df_ego.iloc[j]

sent0                    #C C puts some sauce in the third flat bread
sent1                          #C C puts some sauce in the flat bread
hard_neg            #C C takes some sauce out of the third flat bread
source                                                          ego4d
sent0-verbobj                                               put/sauce
sent1-verbobj                                               put/sauce
hard_neg-verbobj                                                  NaN
Name: 21900, dtype: object

#### Save 

In [28]:
# Seed the global RNGs; `display_row` below draws its row from numpy's global RNG.
np.random.seed(42)
random.seed(42)

# Sample n_nli rows from NLI
n_nli = 9000
df_nli_chosen = df_nli.sample(n=n_nli, random_state=42)
df_nli_chosen = df_nli_chosen.reset_index(drop=True)
print(df_nli_chosen.shape)

# Sample n_ego rows from Ego
n_ego = 1000
df_ego_chosen = df_ego.sample(n=n_ego, random_state=42)
df_ego_chosen = df_ego_chosen.reset_index(drop=True)
print(df_ego_chosen.shape)



display_row(df_nli_chosen)
display_row(df_ego_chosen)

(9000, 4)
(1000, 7)
{
  "sent0": "A black and white dog running on the beach while a man stands behind it.",
  "sent1": "There is a dog running.",
  "hard_neg": "The dog is trying to run away from it's owner.",
  "source": "nli"
}
----------------------------------------------------------------------------------------------------
{
  "sent0": "#C C opens the box on the floor with his left hand.",
  "sent1": "#C C opens the box",
  "hard_neg": "#C C closes the box on the floor with his left hand.",
  "source": "ego4d",
  "sent0-verbobj": "open/box",
  "sent1-verbobj": "open/box",
  "hard_neg-verbobj": NaN
}
----------------------------------------------------------------------------------------------------


In [29]:
# Stack the two samples. The NLI rows have no `*-verbobj` columns, so those are NaN for them.
df = pd.concat([df_nli_chosen, df_ego_chosen])
df.shape

(10000, 7)

In [31]:
# Spot-check one random row of the combined split.
display_row(df)

{
  "sent0": "Two motorcyclists racing neck and neck around a corner.",
  "sent1": "Motorcyclists are in a close race around a corner.",
  "hard_neg": "The motorcyclists are indoors.",
  "source": "nli",
  "sent0-verbobj": NaN,
  "sent1-verbobj": NaN,
  "hard_neg-verbobj": NaN
}
----------------------------------------------------------------------------------------------------


In [32]:
# Output folder for the generated splits; created on first use.
save_dir = f'{data_dir}/final-10112025'
os.makedirs(save_dir, exist_ok=True)

# The file name records both sample sizes and the seed; the row index is not written.
save_path = f"{save_dir}/nli_{n_nli}+ego_{n_ego}-seed_42.csv"
df.to_csv(save_path, index=False)

**Review samples**

In [14]:
# Show one random NLI triplet as a dict.
i = np.random.randint(len(df_nli))
df_nli.iloc[i].to_dict()

{'sent0': 'Two young blond children are inspecting the lobster tank in a store.',
 'sent1': 'Two young children are inspecting the lobster tank in a store.',
 'hard_neg': 'Three young blond children are inspecting the lobster tank in a store.',
 'source': 'nli'}

(25000, 4)

(5000, 4)

In [27]:
# Show one random Ego4D triplet as a dict.
j = np.random.randint(len(df_ego))
df_ego.iloc[j].to_dict()

{'sent0': '#C C closes the cupboard with her left hand',
 'sent1': '#C C opens the kitchen cabinet',
 'hard_neg': '#C C opens the cupboard with her left hand',
 'source': 'ego4d'}

In [None]:
def find_verb_object(sent, lookup=None):
    """Return the "verb/object" key recorded for caption `sent`, or None.

    Args:
        sent: caption text, matched exactly against the lookup keys.
        lookup: optional caption -> "verb/object" mapping. Defaults to the
            notebook-level `sent_to_verbobj` dict built from `df_vo`.

    Returns:
        The mapped "verb/object" string, or None when the caption is unknown.
    """
    if lookup is None:
        # A dict lookup avoids scanning the multi-million-row `df_vo` on every call.
        lookup = sent_to_verbobj
    return lookup.get(sent)


# Try the lookup on a single caption.
x = '#C C picks the bowl from the table'
find_verb_object(x)

In [30]:



# Look up the verb/object of every `sent0` caption.
df_ego['sent0_verb_obj'] = df_ego['sent0'].apply(find_verb_object)

Unnamed: 0,sent0,sent1,hard_neg,source
0,#C C places u type cross scissors on the sewin...,#C C puts scissors down,#C C removes u type cross scissors from the se...,ego4d
1,#C C puts the food on the dish,#C C puts a snack in the pan,#C C takes the food off the dish,ego4d
2,#C C puts cup on table,#C C puts the cup on the table with his right ...,#C C takes cup off table,ego4d
3,#C C opens the tin lid,#C C puts the lid in the rack,#C C closes the tin lid,ego4d
4,#C C folds the measuring tape in his left hand.,#C C folds a shirt,#C C unfolds the measuring tape in his left hand.,ego4d
...,...,...,...,...
422355,#C C drops the paint can with his left hand,#C C lowers her left hand.,#C C picks up the paint can with his left hand,ego4d
422356,#C C puts shopping bag on table,#C C puts the pack of apples in the box with h...,#C C takes shopping bag off table,ego4d
422357,#C C closes the container with its cover.,#C C closes the container on the table,#C C opens the container with its cover.,ego4d
422358,#C C lifts her left hand,#C C lifts his right hand from the mouse,#C C lowers her left hand,ego4d


In [29]:
# Display the verb/object table.
df_vo

Unnamed: 0,verb,object,caption
0,carry,pot,#C C carries a pot from the cooker
1,drop,pot,#C C drops the pot on the cooker
2,carry,pan,#C C carries a sauce pan from the counter
3,drop,pan,#C C drops the sauce pan on the cooker
4,turn,light,#C C turns on a light
...,...,...,...
5273282,pick,bowl,#C C picks the bowl from the table
5273283,walk,sink,#C C walks towards the sink
5273284,put,bowls,#C C puts the bowls on sink
5273285,stare,,#C C stares around
