In [None]:
import clip

In [1]:
import os
import sys

import shared.utils as su

import pandas as pd
import numpy as np


In [None]:
import torch


# Load CLIP model
# Use the GPU when torch reports one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# `model` is used by encode_sentences below; `preprocess` is not referenced
# again in the visible cells.
model, preprocess = clip.load("ViT-B/16", device=device)

def encode_sentences(sentences):
    """
    Encode a list of sentences using CLIP.

    The sentences are tokenized, moved to the module-level `device`, encoded
    with the module-level `model`, and L2-normalised along the last dimension.

    Args:
        sentences: List of strings

    Returns:
        torch.Tensor of shape [B, D], on the CPU, where B is batch size and D
        is embedding dimension (512 for ViT-B/16)
    """
    # Tokenize sentences
    text_tokens = clip.tokenize(sentences).to(device)

    # Encode text without tracking gradients (inference only)
    with torch.no_grad():
        text_embeddings = model.encode_text(text_tokens)
        # Normalize embeddings (CLIP uses normalized features).
        # The division happens on `device`; the whole result is moved to the
        # CPU afterwards. Calling .cpu() on the norm alone would mix a CUDA
        # tensor with a CPU tensor and fail when running on a GPU.
        text_embeddings = text_embeddings / text_embeddings.norm(dim=-1, keepdim=True)

    return text_embeddings.cpu()  # Shape: [B, 512]

# Example usage: encode three short captions and check the resulting shape.
sentences = ["a photo of a cat", "a photo of a dog", "a photo of a bird"]
embeddings = encode_sentences(sentences)
print(f"Embeddings shape: {embeddings.shape}")  # [3, 512]

In [2]:
# NOTE(review): absolute, machine-specific path; consider deriving it from a
# configurable data directory so the notebook runs elsewhere.
csv_path = "/scratch/shared/beegfs/piyush/datasets/SimCSE-NLI/final-10112025/nli_9000+ego_1000+subj_replaced-seed_42.csv"
df = pd.read_csv(csv_path)
# Quick sanity check on the number of rows and columns.
df.shape

(10000, 7)

In [3]:
# Show the loaded frame (the notebook renders a truncated preview).
df

Unnamed: 0,sent0,sent1,hard_neg,source,sent0-verbobj,sent1-verbobj,hard_neg-verbobj
0,A guy in a black shirt and tan pants leaning a...,man leans out window,cat chases other cats,nli,,,
1,"Man with a lit cigarette in mouth, yellow base...",The man is smoking a cigarette.,The man is wearing a sombrero.,nli,,,
2,I hope you can help.,I anticipate that you can lend a hand.,I would appreciate if you would leave this alone.,nli,,,
3,Keeping her eyes fixed steadily on the other's...,Tuppence did not lose sight of her.,Tuppence could not look her in the eye and kep...,nli,,,
4,Man blows bubbles in a bathtub.,A man blowing bubbles.,A man in the ocean.,nli,,,
...,...,...,...,...,...,...,...
9995,The cleaner puts away the spray bottle,The cleaner puts the spray bottle on the paper,The cleaner takes out the spray bottle,ego4d,put/bottle,put/bottle,
9996,The cook picks up a knife from the kitchen boa...,The farmer picks the sickle knife from the gro...,The cook puts down a knife on the kitchen boar...,ego4d,pick/knife,pick/knife,
9997,The woman takes food from the plate with her r...,The person takes a food from a plate on the ki...,The woman puts food onto the plate with her ri...,ego4d,take/food,take/food,
9998,The old man B picks cup,The old man B picks a cup of juice on the tabl...,The old man B puts cup,ego4d,pick/cup,pick/cup,


In [8]:
# NOTE(review): wildcard import. `spacy` (used below) and `clean_sentence`
# (used by extract_verb_object_customised) are not imported anywhere else in
# the visible cells, so they presumably arrive through this star import;
# confirm and prefer explicit imports.
from tasks.extract_verb_object import *

# Name of the spaCy pipeline handed to spacy.load below.
model_id = "en_core_web_sm"
# NOTE(review): not referenced again in the visible cells.
use_gpu = False
nlp = spacy.load(model_id)

In [12]:
def extract_verb_object_customised(sentence):
    """
    Return a ``(verb_lemma, object_text)`` pair for a sentence.

    The sentence is passed through ``clean_sentence`` and parsed with the
    module-level ``nlp`` pipeline. The token whose dependency label is
    ``ROOT`` is taken as the main verb regardless of its POS tag, and its
    lemma is reported as the verb.

    Returns:
        ``(None, None)`` when the cleaned sentence is empty or no ROOT token
        is found; ``(lemma, None)`` when no object can be located; otherwise
        ``(lemma, object_text)``.
    """
    text = clean_sentence(sentence)
    if not text:
        return (None, None)

    parsed = nlp(text)

    # First token labelled ROOT, if any.
    root = next((tok for tok in parsed if tok.dep_ == "ROOT"), None)
    if not root:
        return (None, None)

    # The lemma gives the base form even when the POS tag is wrong.
    lemma = root.lemma_

    # Strategy 1: a direct object or appositive hanging off the root
    # (like "fixes wires").
    direct = next(
        (kid for kid in root.children if kid.dep_ in ("dobj", "appos")),
        None,
    )
    if direct is not None:
        return (lemma, direct.text)

    # Strategy 2: the first object of a preposition attached to the root.
    prep_object = next(
        (
            target
            for prep in root.children
            if prep.dep_ == "prep"
            for target in prep.children
            if target.dep_ == "pobj"
        ),
        None,
    )
    if prep_object is not None:
        return (lemma, prep_object.text)

    return (lemma, None)

In [14]:
def extract_verb_object(captions):
    """
    Run verb/object extraction over every caption and tabulate the results.

    Args:
        captions: Iterable of caption strings.

    Returns:
        pd.DataFrame with columns "verb", "object" and "caption", one row per
        caption in iteration order.
    """
    # NOTE(review): the commented-out code this replaces called
    # extract_verb_object(c, nlp), so this name may shadow a function brought
    # in by the wildcard import; confirm.
    progress = su.log.tqdm_iterator(captions, desc="Running inference on sample")

    verbs, objects, seen = [], [], []
    for caption in progress:
        verb, obj = extract_verb_object_customised(caption)
        verbs.append(verb)
        objects.append(obj)
        seen.append(caption)

    return pd.DataFrame({"verb": verbs, "object": objects, "caption": seen})


# Extract a verb/object pair for every anchor sentence (sent0 column).
outputs_sent0 = extract_verb_object(df.sent0.tolist())
outputs_sent0

Running inference on sample:   0%|          | 0/10000 [00:00<?, ?it/s]

Unnamed: 0,verb,object,caption
0,look,something,A guy in a black shirt and tan pants leaning a...
1,turn,,"Man with a lit cigarette in mouth, yellow base..."
2,hope,,I hope you can help.
3,start,,Keeping her eyes fixed steadily on the other's...
4,blow,bubbles,Man blows bubbles in a bathtub.
...,...,...,...
9995,put,bottle,The cleaner puts away the spray bottle
9996,pick,knife,The cook picks up a knife from the kitchen boa...
9997,take,food,The woman takes food from the plate with her r...
9998,pick,cup,The old man B picks cup


In [16]:
# Same extraction for the sent1 and hard_neg columns.
outputs_sent1 = extract_verb_object(df.sent1.tolist())
outputs_hard_neg = extract_verb_object(df.hard_neg.tolist())

Running inference on sample:   0%|          | 0/10000 [00:00<?, ?it/s]

Running inference on sample:   0%|          | 0/10000 [00:00<?, ?it/s]

In [17]:
# Inspect the verb/object pairs extracted from sent1.
outputs_sent1

Unnamed: 0,verb,object,caption
0,lean,window,man leans out window
1,smoke,cigarette,The man is smoking a cigarette.
2,anticipate,,I anticipate that you can lend a hand.
3,lose,sight,Tuppence did not lose sight of her.
4,man,,A man blowing bubbles.
...,...,...,...
9995,put,bottle,The cleaner puts the spray bottle on the paper
9996,pick,knife,The farmer picks the sickle knife from the gro...
9997,take,food,The person takes a food from a plate on the ki...
9998,pick,cup,The old man B picks a cup of juice on the tabl...


In [18]:
# Inspect the verb/object pairs extracted from hard_neg.
outputs_hard_neg

Unnamed: 0,verb,object,caption
0,chase,cats,cat chases other cats
1,wear,sombrero,The man is wearing a sombrero.
2,appreciate,,I would appreciate if you would leave this alone.
3,look,her,Tuppence could not look her in the eye and kep...
4,man,ocean,A man in the ocean.
...,...,...,...
9995,take,bottle,The cleaner takes out the spray bottle
9996,put,knife,The cook puts down a knife on the kitchen boar...
9997,put,food,The woman puts food onto the plate with her ri...
9998,put,cup,The old man B puts cup


In [21]:
def _verb_object_labels(extracted):
    """
    Build a Series of "verb/object" strings from an extraction frame.

    Args:
        extracted: DataFrame with "verb" and "object" columns, as returned by
            extract_verb_object.

    Returns:
        Series of f"{verb}/{object}" strings, one per row. A missing object
        is rendered as the literal text "None" (e.g. "hope/None").
    """
    # Access the row by label: integer keys on a label-indexed Series
    # (row[0], row[1]) depend on the deprecated positional fallback of
    # Series.__getitem__.
    return extracted[['verb', 'object']].apply(
        lambda row: f"{row['verb']}/{row['object']}", axis=1
    )


# Attach the extracted annotations to df; assignment aligns on the index.
df['anno_sent0'] = _verb_object_labels(outputs_sent0)
df['anno_sent1'] = _verb_object_labels(outputs_sent1)
df['anno_hard_neg'] = _verb_object_labels(outputs_hard_neg)
# Spot-check the first row.
df.iloc[0].to_dict()

{'sent0': 'A guy in a black shirt and tan pants leaning a little out of a large window looking at something.',
 'sent1': 'man leans out window',
 'hard_neg': 'cat chases other cats',
 'source': 'nli',
 'sent0-verbobj': nan,
 'sent1-verbobj': nan,
 'hard_neg-verbobj': nan,
 'anno_sent0': 'look/something',
 'anno_sent1': 'lean/window',
 'anno_hard_neg': 'chase/cats'}

In [28]:
# Peek at one randomly chosen row.
# NOTE(review): the draw is unseeded, so the row shown changes on every run.
j = np.random.randint(len(df))
df.iloc[j].to_dict()

{'sent0': 'A young boy wearing blue swim trunks and goggles is jumping into a swimming pool.',
 'sent1': 'A boy in a swimsuit jumps into a pool.',
 'hard_neg': 'A boy in a formal wear jumps into a pool.',
 'source': 'nli',
 'sent0-verbobj': nan,
 'sent1-verbobj': nan,
 'hard_neg-verbobj': nan,
 'anno_sent0': 'jump/pool',
 'anno_sent1': 'jump/pool',
 'anno_hard_neg': 'jump/pool'}

In [33]:
# Existing hard_neg-verbobj annotations for the ego4d rows only.
df.loc[df.source == 'ego4d', 'hard_neg-verbobj']

9000             NaN
9001             NaN
9002             NaN
9003       close/tap
9004    close/drawer
            ...     
9995             NaN
9996             NaN
9997             NaN
9998             NaN
9999             NaN
Name: hard_neg-verbobj, Length: 1000, dtype: object

In [34]:
# Re-display df, which now also carries the anno_sent0 / anno_sent1 / anno_hard_neg columns.
df

Unnamed: 0,sent0,sent1,hard_neg,source,sent0-verbobj,sent1-verbobj,hard_neg-verbobj,anno_sent0,anno_sent1,anno_hard_neg
0,A guy in a black shirt and tan pants leaning a...,man leans out window,cat chases other cats,nli,,,,look/something,lean/window,chase/cats
1,"Man with a lit cigarette in mouth, yellow base...",The man is smoking a cigarette.,The man is wearing a sombrero.,nli,,,,turn/None,smoke/cigarette,wear/sombrero
2,I hope you can help.,I anticipate that you can lend a hand.,I would appreciate if you would leave this alone.,nli,,,,hope/None,anticipate/None,appreciate/None
3,Keeping her eyes fixed steadily on the other's...,Tuppence did not lose sight of her.,Tuppence could not look her in the eye and kep...,nli,,,,start/None,lose/sight,look/her
4,Man blows bubbles in a bathtub.,A man blowing bubbles.,A man in the ocean.,nli,,,,blow/bubbles,man/None,man/ocean
...,...,...,...,...,...,...,...,...,...,...
9995,The cleaner puts away the spray bottle,The cleaner puts the spray bottle on the paper,The cleaner takes out the spray bottle,ego4d,put/bottle,put/bottle,,put/bottle,put/bottle,take/bottle
9996,The cook picks up a knife from the kitchen boa...,The farmer picks the sickle knife from the gro...,The cook puts down a knife on the kitchen boar...,ego4d,pick/knife,pick/knife,,pick/knife,pick/knife,put/knife
9997,The woman takes food from the plate with her r...,The person takes a food from a plate on the ki...,The woman puts food onto the plate with her ri...,ego4d,take/food,take/food,,take/food,take/food,put/food
9998,The old man B picks cup,The old man B picks a cup of juice on the tabl...,The old man B puts cup,ego4d,pick/cup,pick/cup,,pick/cup,pick/cup,put/cup


In [36]:
# Verb part (text before the first "/") of the extracted annotations for the
# ego4d rows: anchor sentences vs. hard negatives.
verbs_fwd = [
    anno.split('/', 1)[0]
    for anno in df.loc[df.source == 'ego4d', 'anno_sent0']
]
verbs_rev = [
    anno.split('/', 1)[0]
    for anno in df.loc[df.source == 'ego4d', 'anno_hard_neg']
]
(len(verbs_fwd), len(verbs_rev))

(1000, 1000)

In [38]:
# Number of distinct verbs on each side.
tuple(len(set(verbs)) for verbs in (verbs_fwd, verbs_rev))

(33, 46)

In [45]:
# Reload edited project modules automatically before each cell runs.
# NOTE(review): this sits after cells that already import project modules;
# consider moving it to the configuration cell at the top of the notebook.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
verb_pairs = []
for i in range(len(df)):
    row = df.iloc[i].to_dict()
    verb_a = 