# Import libraries

In [16]:
import numpy as np
import pandas as pd
import os
import torch
import re
import string
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from collections import Counter
import nltk
from PIL import Image
from nltk.stem import WordNetLemmatizer
from transformers import CLIPProcessor, CLIPModel
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
pd.set_option("max_colwidth", None)
from tqdm import tqdm

# Load image metadata into a DataFrame

In [2]:
# NOTE(review): absolute, machine-specific path; adjust it to wherever the images live locally.
image_dir = "/Users/akanshagautam/Documents/MTech/Thesis/Dataset/ConClip/ccneg_images/cc3m_subset_images_extracted_final"

# File suffixes (lower-case, leading dot included) that are treated as images.
image_extensions = {'.jpg', '.jpeg', '.png', '.bmp', '.gif', '.tiff', '.webp'}

# Walk the whole tree under image_dir and keep one record per image file.
# The suffix is lower-cased before the lookup, so "X.JPG" is accepted too.
image_data = [
    {'image_name': file_name, 'image_path': os.path.join(root, file_name)}
    for root, _, files in os.walk(image_dir)
    for file_name in files
    if os.path.splitext(file_name)[1].lower() in image_extensions
]

# Naming the columns explicitly keeps the schema stable when the scan finds
# nothing (missing or empty directory). Without it the frame has no columns and
# the later merge on 'image_name' fails with a KeyError instead of an empty result.
images_metadata_df = pd.DataFrame(image_data, columns=['image_name', 'image_path'])
print(images_metadata_df.shape)
images_metadata_df.head()

(302123, 2)


Unnamed: 0,image_name,image_path
0,000318123.jpg,/Users/akanshagautam/Documents/MTech/Thesis/Dataset/ConClip/ccneg_images/cc3m_subset_images_extracted_final/000318123.jpg
1,000275481.jpg,/Users/akanshagautam/Documents/MTech/Thesis/Dataset/ConClip/ccneg_images/cc3m_subset_images_extracted_final/000275481.jpg
2,000204278.jpg,/Users/akanshagautam/Documents/MTech/Thesis/Dataset/ConClip/ccneg_images/cc3m_subset_images_extracted_final/000204278.jpg
3,000139141.jpg,/Users/akanshagautam/Documents/MTech/Thesis/Dataset/ConClip/ccneg_images/cc3m_subset_images_extracted_final/000139141.jpg
4,000136272.jpg,/Users/akanshagautam/Documents/MTech/Thesis/Dataset/ConClip/ccneg_images/cc3m_subset_images_extracted_final/000136272.jpg


# Load captions into a DataFrame

In [3]:
# torch.load unpickles the file, so it should only be pointed at a trusted local artefact.
ccneg_preprocessed_path = "/Users/akanshagautam/Documents/MTech/Thesis/Dataset/ConClip/ccneg_preprocessed.pt"
ccneg_preprocessed_data = torch.load(ccneg_preprocessed_path)

# Show the top-level keys of the loaded mapping.
top_level_keys = list(ccneg_preprocessed_data.keys())
print(f"Keys: {top_level_keys}")

Keys: ['kept', 'dropped', 'image_paths', 'annotations', 'num_ops']


In [4]:
annotations = ccneg_preprocessed_data['annotations']


def _annotation_to_row(ann):
    """Flatten one annotation mapping into a single-level dict of column values.

    Keys that are absent come back as None, except 'object_predicate_pairs',
    which falls back to an empty dict.
    """
    sop_data = ann.get('sop_data', {})
    sop_decomposition = sop_data.get('sop_decomposition', {})
    return {
        'image_number': ann.get('image_number'),
        'file_extension': ann.get('file_extension'),
        'true_caption': ann.get('caption'),
        'labels': ann.get('labels'),
        'negated_caption': sop_data.get('negative-prompt'),
        'url': ann.get('json', {}).get('url'),
        'subject': sop_decomposition.get('subject'),
        'object_predicate_pairs': sop_decomposition.get('object-predicate-pairs', {}),
        'predicate': sop_decomposition.get('predicate'),
        'negate_word_present': ann.get('negate_word_present'),
        'num_ops': ann.get('num_ops'),
    }


# The container is addressed by position 0..len-1, exactly as before.
rows = [_annotation_to_row(annotations[i]) for i in range(len(annotations))]

annotiation_df = pd.DataFrame(rows)
print(annotiation_df.shape)
annotiation_df.head(1)


(228246, 11)


Unnamed: 0,image_number,file_extension,true_caption,labels,negated_caption,url,subject,object_predicate_pairs,predicate,negate_word_present,num_ops
0,3,jpg,actor attends the season premiere,"musician,premiere,event,singer,suit,performance","actor, not attending the season premiere",https://media.gettyimages.com/photos/aidan-gillen-attends-the-season-7-premiere-of-hbos-game-of-thrones-at-picture-id817717986?s=612x612,actor,"[{'object': 'season premiere', 'predicate': 'attends the'}]",,True,1


In [5]:
# Build the join key "<image_number>.<file_extension>" on the annotation frame.
image_stem = annotiation_df['image_number']
image_suffix = annotiation_df['file_extension']
annotiation_df['image_name'] = image_stem + '.' + image_suffix

# Inner join: only annotations whose image_name also appears in the image metadata survive.
data_df = annotiation_df.merge(images_metadata_df, how='inner', on='image_name')
print(data_df.shape)
data_df.head(10)

(228246, 13)


Unnamed: 0,image_number,file_extension,true_caption,labels,negated_caption,url,subject,object_predicate_pairs,predicate,negate_word_present,num_ops,image_name,image_path
0,3,jpg,actor attends the season premiere,"musician,premiere,event,singer,suit,performance","actor, not attending the season premiere",https://media.gettyimages.com/photos/aidan-gillen-attends-the-season-7-premiere-of-hbos-game-of-thrones-at-picture-id817717986?s=612x612,actor,"[{'object': 'season premiere', 'predicate': 'attends the'}]",,True,1,000000003.jpg,/Users/akanshagautam/Documents/MTech/Thesis/Dataset/ConClip/ccneg_images/cc3m_subset_images_extracted_final/000000003.jpg
1,5,jpg,a woman walks her dog on the beach .,"water,beach,sea,shore,ocean,canidae,dog,sky,wave,coast,mudflat,dog walking,human,sand,photography",a woman walks on the beach without her dog,https://media.gettyimages.com/photos/woman-walks-her-dog-on-the-beach-on-october-21-2014-in-saltcoats-picture-id457587968,woman,"[{'object': 'dog', 'predicate': 'walks her on the beach'}]",,True,1,000000005.jpg,/Users/akanshagautam/Documents/MTech/Thesis/Dataset/ConClip/ccneg_images/cc3m_subset_images_extracted_final/000000005.jpg
2,9,jpg,close up portrait of a smiling middle aged woman sitting against white wall,"hair,sitting,facial expression,nose,arm,cheek,smile,chin,lip,hand,close-up,brown hair,finger,wool,fur",close up portrait of a smiling middle aged woman without white wall,http://l7.alamy.com/zooms/29164f933d7340be90af1ab4c91f3644/close-up-portrait-of-a-smiling-middle-aged-woman-sitting-against-white-hxeygn.jpg,portrait,"[{'object': 'woman', 'predicate': 'of a'}, {'object': 'wall', 'predicate': 'against white'}, {'object': 'age', 'predicate': 'middle'}, {'object': 'smile', 'predicate': 'smiling'}]",,True,4,000000009.jpg,/Users/akanshagautam/Documents/MTech/Thesis/Dataset/ConClip/ccneg_images/cc3m_subset_images_extracted_final/000000009.jpg
3,18,jpg,man sitting on floor beside a pool using laptop,"sitting,tablet computer,table,technology,grass,drinking,swimming pool,leisure,furniture,gadget,electronic device,vacation,laptop,stock photography,computer",man sitting on floor without a pool using laptop,https://media.gettyimages.com/photos/man-sitting-on-floor-beside-a-pool-using-laptop-picture-id588490995?s=612x612,man,"[{'object': 'floor', 'predicate': 'on'}, {'object': 'pool', 'predicate': 'beside a'}, {'object': 'laptop', 'predicate': 'using'}]",,True,3,000000018.jpg,/Users/akanshagautam/Documents/MTech/Thesis/Dataset/ConClip/ccneg_images/cc3m_subset_images_extracted_final/000000018.jpg
4,7,jpg,a beautiful day with some buildings and plants .,"residential area,property,home,building,house,real estate,neighbourhood,town,architecture,estate,suburb,driveway,tree,road surface,road",a beautiful day without plants,https://d1tq208oegmb9e.cloudfront.net/site_photos_image/dbx%3A/urban+project/orange+county/fullerton/chapman+villas/Photos/3.jpg,day,"[{'object': 'buildings', 'predicate': 'with some'}, {'object': 'plants', 'predicate': 'with some'}]",,True,2,000000007.jpg,/Users/akanshagautam/Documents/MTech/Thesis/Dataset/ConClip/ccneg_images/cc3m_subset_images_extracted_final/000000007.jpg
5,0,jpg,christmas tree on a black background .,"christmas tree,christmas decoration,font,text,graphic design,illustration,interior design,tree,christmas eve,ornament,fir,plant,pine,pine family,graphics","christmas tree, not on a black background",https://thumb1.shutterstock.com/display_pic_with_logo/261388/223876810/stock-vector-christmas-tree-on-a-black-background-vector-223876810.jpg,tree,"[{'object': 'background', 'predicate': 'on a'}]",,True,1,000000000.jpg,/Users/akanshagautam/Documents/MTech/Thesis/Dataset/ConClip/ccneg_images/cc3m_subset_images_extracted_final/000000000.jpg
6,11,jpg,festive banner with flags and an inscription .,"logo,flag,illustration,red,text,font,emblem,graphic design,graphics,label,symbol,art","festive banner with an inscription, but not with flags",https://thumb1.shutterstock.com/display_pic_with_logo/161878175/475029928/stock-vector-vector-festive-banner-with-flags-of-the-vietnam-and-an-inscription-socialist-republic-of-vietnam-475029928.jpg,banner,"[{'object': 'flags', 'predicate': 'with'}, {'object': 'inscription', 'predicate': 'with'}]",,True,2,000000011.jpg,/Users/akanshagautam/Documents/MTech/Thesis/Dataset/ConClip/ccneg_images/cc3m_subset_images_extracted_final/000000011.jpg
7,1,jpg,item : drawing of a figure surrounded by person,"drawing,modern art,line,visual arts,art,sketch,artwork,photographic paper,painting,illustration,black-and-white",item : drawing without a figure surrounded by person,https://i.pinimg.com/736x/f9/fd/48/f9fd48780900641ded7ab53d74fe86fe--figure-painting-figure-drawing.jpg,drawing,"[{'object': 'figure', 'predicate': 'of a'}, {'object': 'person', 'predicate': 'surrounded by'}]",,True,2,000000001.jpg,/Users/akanshagautam/Documents/MTech/Thesis/Dataset/ConClip/ccneg_images/cc3m_subset_images_extracted_final/000000001.jpg
8,21,jpg,pop artist performs during the media call,"performance,stage,dancer,entertainment,music artist,event,performing arts,singer,performance art,talent show,public event,dance,music venue,concert dance,musician",pop artist performs without the media call,https://media.gettyimages.com/photos/singer-deni-hines-performs-during-the-media-call-for-dusty-the-pop-picture-id57161252?s=612x612,pop artist,"[{'object': 'media call', 'predicate': 'during the'}]",,True,1,000000021.jpg,/Users/akanshagautam/Documents/MTech/Thesis/Dataset/ConClip/ccneg_images/cc3m_subset_images_extracted_final/000000021.jpg
9,24,jpg,actor attends the opening night after party,"premiere,carpet,red carpet,bow tie,hat,singer,fashion accessory,performance,tuxedo,formal wear,suit,musician",actor not attending the opening night after party,https://media.gettyimages.com/photos/actor-henry-kelemen-attends-the-our-new-girl-opening-night-after-at-picture-id450417500?s=612x612,actor,"[{'object': 'opening night after party', 'predicate': 'attends the'}]",,True,1,000000024.jpg,/Users/akanshagautam/Documents/MTech/Thesis/Dataset/ConClip/ccneg_images/cc3m_subset_images_extracted_final/000000024.jpg


# Split this dataset into different negation types

In [10]:
def clean_caption(text):
    """Normalise a caption for keyword matching.

    Lower-cases the text, replaces every ASCII punctuation character with a
    space, then collapses each run of whitespace into a single space and trims
    both ends. Collapsing matters because a removed comma or full stop would
    otherwise leave a double space inside the caption
    ("actor, not" -> "actor  not").

    Args:
        text: Caption to clean. Values that are not ``str`` (e.g. None or NaN)
            are returned unchanged.

    Returns:
        The cleaned string, or ``text`` itself when it is not a string.
    """
    if not isinstance(text, str):
        return text
    without_punct = re.sub(f"[{re.escape(string.punctuation)}]", " ", text.lower())
    # split() with no argument drops leading/trailing whitespace and splits on
    # whitespace runs, so re-joining yields single-space separated tokens.
    return " ".join(without_punct.split())

# Add a cleaned copy of each caption column; clean_caption is applied element by element.
data_df['true_caption_clean'] = data_df['true_caption'].map(clean_caption)
data_df['negated_caption_clean'] = data_df['negated_caption'].map(clean_caption)

In [11]:
# Stand-alone negation words ("not", "no", ...).
syntactic_negators = {"no", "not", "never", "neither", "nobody", "nothing"}

# Words that express absence without being a grammatical negator.
lexical_negators = {"without", "lack", "absent", "avoid", "missing"}


def _caption_tokens(caption):
    """Return the lower-cased whitespace tokens of ``caption``.

    Non-string input (None, NaN) yields an empty list, so the classifiers below
    report "no negation found" instead of raising AttributeError on ``.lower()``.
    """
    if not isinstance(caption, str):
        return []
    return caption.lower().split()


def is_syntactic_negation(caption):
    """True when any token of ``caption`` is in ``syntactic_negators``."""
    return not syntactic_negators.isdisjoint(_caption_tokens(caption))


def is_morphological_negation(caption, min_stem_len=3):
    """True when some token looks like ``<negating prefix> + <stem>``.

    This is a spelling heuristic only; words such as "under" or "image" still
    match. Requiring at least ``min_stem_len`` characters after the prefix stops
    the prefix itself from matching: without it the preposition "in" (and
    "into", "miss", "dish", ...) made almost every caption count as a
    morphological negation.

    Args:
        caption: Caption text; non-string input gives False.
        min_stem_len: Minimum number of characters that must follow the prefix.
    """
    # Basic prefix patterns for morphological negation
    prefixes = ("un", "dis", "in", "im", "ir", "il", "non", "mis")
    for word in _caption_tokens(caption):
        for prefix in prefixes:
            if word.startswith(prefix) and len(word) - len(prefix) >= min_stem_len:
                return True
    return False


def is_lexical_negation(caption):
    """True when any token of ``caption`` is in ``lexical_negators``."""
    return not lexical_negators.isdisjoint(_caption_tokens(caption))


def classify_negation(caption):
    """Assign ``caption`` to a negation bucket.

    The checks run in a fixed priority order (syntactic, then lexical, then
    morphological); the first one that matches decides the label. Captions
    that match none of them, including non-string input, are labelled
    "unknown/pragmatic".
    """
    if is_syntactic_negation(caption):
        return "syntactic"
    if is_lexical_negation(caption):
        return "lexical/semantic"
    if is_morphological_negation(caption):
        return "morphological"
    return "unknown/pragmatic"
    
# Label every cleaned negated caption with its negation bucket.
data_df["negation_bucket"] = data_df["negated_caption_clean"].map(classify_negation)

data_df.head()

Unnamed: 0,image_number,file_extension,true_caption,labels,negated_caption,url,subject,object_predicate_pairs,predicate,negate_word_present,num_ops,image_name,image_path,true_caption_clean,negated_caption_clean,negation_bucket
0,3,jpg,actor attends the season premiere,"musician,premiere,event,singer,suit,performance","actor, not attending the season premiere",https://media.gettyimages.com/photos/aidan-gillen-attends-the-season-7-premiere-of-hbos-game-of-thrones-at-picture-id817717986?s=612x612,actor,"[{'object': 'season premiere', 'predicate': 'attends the'}]",,True,1,000000003.jpg,/Users/akanshagautam/Documents/MTech/Thesis/Dataset/ConClip/ccneg_images/cc3m_subset_images_extracted_final/000000003.jpg,actor attends the season premiere,actor not attending the season premiere,syntactic
1,5,jpg,a woman walks her dog on the beach .,"water,beach,sea,shore,ocean,canidae,dog,sky,wave,coast,mudflat,dog walking,human,sand,photography",a woman walks on the beach without her dog,https://media.gettyimages.com/photos/woman-walks-her-dog-on-the-beach-on-october-21-2014-in-saltcoats-picture-id457587968,woman,"[{'object': 'dog', 'predicate': 'walks her on the beach'}]",,True,1,000000005.jpg,/Users/akanshagautam/Documents/MTech/Thesis/Dataset/ConClip/ccneg_images/cc3m_subset_images_extracted_final/000000005.jpg,a woman walks her dog on the beach,a woman walks on the beach without her dog,lexical/semantic
2,9,jpg,close up portrait of a smiling middle aged woman sitting against white wall,"hair,sitting,facial expression,nose,arm,cheek,smile,chin,lip,hand,close-up,brown hair,finger,wool,fur",close up portrait of a smiling middle aged woman without white wall,http://l7.alamy.com/zooms/29164f933d7340be90af1ab4c91f3644/close-up-portrait-of-a-smiling-middle-aged-woman-sitting-against-white-hxeygn.jpg,portrait,"[{'object': 'woman', 'predicate': 'of a'}, {'object': 'wall', 'predicate': 'against white'}, {'object': 'age', 'predicate': 'middle'}, {'object': 'smile', 'predicate': 'smiling'}]",,True,4,000000009.jpg,/Users/akanshagautam/Documents/MTech/Thesis/Dataset/ConClip/ccneg_images/cc3m_subset_images_extracted_final/000000009.jpg,close up portrait of a smiling middle aged woman sitting against white wall,close up portrait of a smiling middle aged woman without white wall,lexical/semantic
3,18,jpg,man sitting on floor beside a pool using laptop,"sitting,tablet computer,table,technology,grass,drinking,swimming pool,leisure,furniture,gadget,electronic device,vacation,laptop,stock photography,computer",man sitting on floor without a pool using laptop,https://media.gettyimages.com/photos/man-sitting-on-floor-beside-a-pool-using-laptop-picture-id588490995?s=612x612,man,"[{'object': 'floor', 'predicate': 'on'}, {'object': 'pool', 'predicate': 'beside a'}, {'object': 'laptop', 'predicate': 'using'}]",,True,3,000000018.jpg,/Users/akanshagautam/Documents/MTech/Thesis/Dataset/ConClip/ccneg_images/cc3m_subset_images_extracted_final/000000018.jpg,man sitting on floor beside a pool using laptop,man sitting on floor without a pool using laptop,lexical/semantic
4,7,jpg,a beautiful day with some buildings and plants .,"residential area,property,home,building,house,real estate,neighbourhood,town,architecture,estate,suburb,driveway,tree,road surface,road",a beautiful day without plants,https://d1tq208oegmb9e.cloudfront.net/site_photos_image/dbx%3A/urban+project/orange+county/fullerton/chapman+villas/Photos/3.jpg,day,"[{'object': 'buildings', 'predicate': 'with some'}, {'object': 'plants', 'predicate': 'with some'}]",,True,2,000000007.jpg,/Users/akanshagautam/Documents/MTech/Thesis/Dataset/ConClip/ccneg_images/cc3m_subset_images_extracted_final/000000007.jpg,a beautiful day with some buildings and plants,a beautiful day without plants,lexical/semantic


In [12]:
# Distinct bucket labels, in order of first appearance.
buckets = data_df["negation_bucket"].unique()
print(f"Buckets: {buckets}")

# One sub-frame per bucket label, keyed by that label.
bucket_datasets = {
    label: data_df.loc[data_df["negation_bucket"] == label]
    for label in buckets
}

# .get() yields None for a label that never occurred in the data.
syntactic_df = bucket_datasets.get("syntactic")
print(f"Syntactic DF: {syntactic_df.shape}")

lexical_df = bucket_datasets.get("lexical/semantic")
print(f"Lexical DF: {lexical_df.shape}")


Buckets: ['syntactic' 'lexical/semantic']
Syntactic DF: (150735, 16)
Lexical DF: (77511, 16)


In [13]:
def evaluate_negation_sensitivity(df, model, processor):
    """
    Evaluate the negation sensitivity score for a given dataframe.

    For every row the image is compared with its true caption and with its
    negated caption via cosine similarity of the model's image/text embeddings.
    The score is the fraction of rows where the true caption is strictly more
    similar to the image than the negated one.

    Both captions are sent through the model in a single batch, so the image is
    encoded once per row instead of twice. Similarities may differ from a
    caption-by-caption evaluation by floating-point noise only.

    Args:
        df (pd.DataFrame): DataFrame containing 'image_path', 'true_caption', and 'negated_caption'.
        model: The vision-language model for embedding generation. Its output
            must expose ``image_embeds`` and ``text_embeds``.
        processor: The corresponding processor to preprocess data.

    Returns:
        results_df (pd.DataFrame): Copy of ``df`` with added columns
            'similarity_true', 'similarity_neg' and 'true_greater_than_negated'.
        negation_sensitivity_score (float): Final computed score; NaN when
            ``df`` has no rows (previously a ZeroDivisionError).
    """
    # Per-row results, appended in iteration order
    similarity_true_list = []
    similarity_neg_list = []
    comparison_list = []

    # Inputs are moved to the model's device when it exposes one, so the
    # function also works for a model placed on an accelerator.
    device = getattr(model, "device", None)

    # Iterate through each row in the dataframe with tqdm
    for _, row in tqdm(df.iterrows(), total=len(df), desc="Evaluating rows"):
        # The context manager closes the file handle as soon as the pixels are
        # decoded; otherwise handles pile up over hundreds of thousands of rows.
        with Image.open(row["image_path"]) as image_file:
            image = image_file.convert("RGB")

        # Index 0 = true caption, index 1 = negated caption
        inputs = processor(
            text=[row["true_caption"], row["negated_caption"]],
            images=image,
            return_tensors="pt",
            padding=True,
        )
        if device is not None and hasattr(inputs, "to"):
            inputs = inputs.to(device)

        with torch.no_grad():
            outputs = model(**inputs)
            # The single image embedding is broadcast against the two text
            # embeddings, giving one similarity per caption.
            similarities = torch.nn.functional.cosine_similarity(
                outputs.image_embeds, outputs.text_embeds
            )
        similarity_true, similarity_neg = similarities.tolist()

        # Store results
        similarity_true_list.append(similarity_true)
        similarity_neg_list.append(similarity_neg)
        comparison_list.append(int(similarity_true > similarity_neg))

    # Create a new dataframe with results
    results_df = df.copy()
    results_df["similarity_true"] = similarity_true_list
    results_df["similarity_neg"] = similarity_neg_list
    results_df["true_greater_than_negated"] = comparison_list

    # Compute the final evaluation metric; undefined (NaN) for an empty frame
    if comparison_list:
        negation_sensitivity_score = sum(comparison_list) / len(comparison_list)
    else:
        negation_sensitivity_score = float("nan")

    return results_df, negation_sensitivity_score

# Run CLIP ViT-Large Patch14 @ 336px

In [14]:
# One checkpoint id for both the model weights and the matching processor,
# so the two can never point at different checkpoints.
clip_checkpoint = "openai/clip-vit-large-patch14-336"
model = CLIPModel.from_pretrained(clip_checkpoint)
processor = CLIPProcessor.from_pretrained(clip_checkpoint)

`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.
`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["bos_token_id"]` will be overriden.
`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["eos_token_id"]` will be overriden.


In [17]:
# Evaluate the syntactic-negation subset (one model call sequence per row, so this is slow on CPU).
syntactic_eval = evaluate_negation_sensitivity(df=syntactic_df, model=model, processor=processor)
results_df, score = syntactic_eval
print(f"\nNegation Sensitivity Score: {score:.4f}")
results_df.head()

Evaluating rows:   0%|          | 50/150735 [01:37<81:21:00,  1.94s/it]


KeyboardInterrupt: 

In [None]:
# Evaluate the lexical/semantic-negation subset; results_df and score are rebound to this run.
lexical_eval = evaluate_negation_sensitivity(df=lexical_df, model=model, processor=processor)
results_df, score = lexical_eval
print(f"\nNegation Sensitivity Score: {score:.4f}")
results_df.head()