# Import libraries

In [132]:
import numpy as np
import pandas as pd
import os
import torch
import re
import string
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from collections import defaultdict, Counter
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch
import tqdm
from tqdm import tqdm
from itertools import chain
import spacy
# Show frames in full in notebook output: lift pandas' limits on displayed
# rows, displayed columns and per-cell text width.
for _display_option in ('display.max_rows', 'display.max_columns', 'display.max_colwidth'):
    pd.set_option(_display_option, None)

# Load MCQ Dataset - Caption

In [2]:
# Locations of the multiple-choice caption CSVs, one per source dataset.
coco_mcq = '/Users/akanshagautam/Documents/MTech/Thesis/Dataset/NegClip/NegBench/evaluation data/images/COCO_val_mcq_llama3.1_rephrased.csv'
voc_mcq = '/Users/akanshagautam/Documents/MTech/Thesis/Dataset/NegClip/NegBench/evaluation data/images/VOC2007_mcq_llama3.1_rephrased.csv'
# synthetic_mcq = '/Users/akanshagautam/Documents/MTech/Thesis/Dataset/NegClip/NegBench/evaluation data/images/synthetic_mcq_llama3.1_rephrased.csv'
msr_vtt_mcq = '/Users/akanshagautam/Documents/MTech/Thesis/Dataset/NegClip/NegBench/evaluation data/videos/msr_vtt_mcq_rephrased_llama.csv'

# Read the three active splits in order; the synthetic split is disabled
# together with its path above.
coco_df, voc_df, msr_vtt_df = (
    pd.read_csv(csv_path) for csv_path in (coco_mcq, voc_mcq, msr_vtt_mcq)
)
print(f"COCO shape: {coco_df.shape}")
print(f"VOC 2007 shape: {voc_df.shape}")
print(f"MSR VTT shape: {msr_vtt_df.shape}")

# Stack the splits row-wise under a fresh 0..n-1 index.
metadata_caption_df = pd.concat(
    [coco_df, voc_df, msr_vtt_df],
    axis=0,
    ignore_index=True,
)
print(f"Final Metadata shape: {metadata_caption_df.shape}")

# From each relative path take (1) the folder right after "data/" as the
# dataset name and (2) the last path component, which must end in a
# lower-case alphanumeric extension, as the file name.  Rows whose path does
# not match the pattern get NaN in both new columns.
path_parts = metadata_caption_df['image_path'].str.extract(r'data/([^/]+)/.*?/([^/]+\.[a-z0-9]+)$')
metadata_caption_df[['dataset_name', 'file_name']] = path_parts

metadata_caption_df.head(3)

COCO shape: (5914, 7)
VOC 2007 shape: (5031, 7)
MSR VTT shape: (1000, 7)
Final Metadata shape: (11945, 7)


Unnamed: 0,image_path,correct_answer,caption_0,caption_1,caption_2,caption_3,correct_answer_template,dataset_name,file_name
0,data/coco/images/val2017/000000397133.jpg,0,"This image features a knife, but no car is present.","A car is present in this image, but there is no knife.",This image features a car,This image does not feature a knife.,hybrid,coco,000000397133.jpg
1,data/coco/images/val2017/000000397133.jpg,0,A chair is not present in this image.,"This image shows a chair, but no spoon is present.",A chair is present in this image.,A spoon is not included in this image.,negative,coco,000000397133.jpg
2,data/coco/images/val2017/000000397133.jpg,0,"A person is present in this image, but there's no fork.","This image shows a fork, with no person in sight.",A fork is shown in this image.,No person is present in this image.,hybrid,coco,000000397133.jpg


In [130]:
# Number of caption rows per dataset name extracted from `image_path`.
metadata_caption_df['dataset_name'].value_counts()

dataset_name
coco       5914
voc2007    5031
video      1000
Name: count, dtype: int64

# Load MCQ Dataset - Images / Videos

In [3]:
# Local folders that hold the evaluation media for each dataset.
coco_val_dir = '/Users/akanshagautam/Documents/MTech/Thesis/Dataset/NegClip/coco_val2017'
mediafire_val_dir = '/Users/akanshagautam/Documents/MTech/Thesis/Dataset/NegClip/mediafire_val_videos'
voc_val_dir = '/Users/akanshagautam/Documents/MTech/Thesis/Dataset/NegClip/VOCtest_06-Nov-2007/VOC2007/JPEGImages'
# synthetic_mcq = '/Users/akanshagautam/Documents/MTech/Thesis/Dataset/NegClip/NegBench/evaluation data/images/synthetic_mcq_llama3.1_rephrased.csv'

_IMAGE_SUFFIXES = ('.jpg', '.jpeg', '.png')


def _media_names(folder, suffixes):
    """Return the entries of *folder* whose lower-cased name ends with *suffixes*."""
    return [entry for entry in os.listdir(folder) if entry.lower().endswith(suffixes)]


def _media_frame(folder, names, dataset):
    """Build a frame with the file name, full path and dataset label of each name."""
    return pd.DataFrame({
        'file_name': names,
        'file_path': [os.path.join(folder, entry) for entry in names],
        'dataset_name': dataset,
    })


coco_image_files = _media_names(coco_val_dir, _IMAGE_SUFFIXES)
mediafire_video_files = _media_names(mediafire_val_dir, '.mp4')
voc_image_files = _media_names(voc_val_dir, _IMAGE_SUFFIXES)

coco_val_df = _media_frame(coco_val_dir, coco_image_files, 'coco')
mediafire_val_df = _media_frame(mediafire_val_dir, mediafire_video_files, 'video')
voc_val_df = _media_frame(voc_val_dir, voc_image_files, 'voc2007')

print(f"COCO shape: {coco_val_df.shape}")
print(f"Mediafire shape: {mediafire_val_df.shape}")
print(f"VOC shape: {voc_val_df.shape}")

# One index of every media file found on disk, across all three datasets.
metadata_images_videos_df = pd.concat(
    [coco_val_df, voc_val_df, mediafire_val_df],
    axis=0,
    ignore_index=True,
)
print(f"Final Metadata shape: {metadata_images_videos_df.shape}")

metadata_images_videos_df.head()

COCO shape: (5000, 3)
Mediafire shape: (7010, 3)
VOC shape: (4952, 3)
Final Metadata shape: (16962, 3)


Unnamed: 0,file_name,file_path,dataset_name
0,000000182611.jpg,/Users/akanshagautam/Documents/MTech/Thesis/Dataset/NegClip/coco_val2017/000000182611.jpg,coco
1,000000335177.jpg,/Users/akanshagautam/Documents/MTech/Thesis/Dataset/NegClip/coco_val2017/000000335177.jpg,coco
2,000000278705.jpg,/Users/akanshagautam/Documents/MTech/Thesis/Dataset/NegClip/coco_val2017/000000278705.jpg,coco
3,000000463618.jpg,/Users/akanshagautam/Documents/MTech/Thesis/Dataset/NegClip/coco_val2017/000000463618.jpg,coco
4,000000568981.jpg,/Users/akanshagautam/Documents/MTech/Thesis/Dataset/NegClip/coco_val2017/000000568981.jpg,coco


# Merge the captions with the image/video file paths

In [7]:
# Attach the local media path to every caption row.
#
# A plain inner join silently discards caption rows whose
# (file_name, dataset_name) pair has no file on disk.  To make such losses
# visible, join with how='left' plus an indicator column first, report the
# unmatched rows per dataset, and only then keep the matched rows.  Because
# each key identifies at most one media file (enforced by `validate`), the
# kept rows are exactly the rows an inner join would return.
merge_keys = ['file_name', 'dataset_name']
merge_audit_df = pd.merge(
    metadata_caption_df,
    metadata_images_videos_df,
    on=merge_keys,
    how='left',
    indicator=True,
    validate='many_to_one',  # raises MergeError if a key maps to several files
)

unmatched_df = merge_audit_df[merge_audit_df['_merge'] == 'left_only']
if not unmatched_df.empty:
    # dropna=False also counts rows whose path could not be parsed (NaN keys).
    print("Caption rows with no matching media file, by dataset:")
    print(unmatched_df['dataset_name'].value_counts(dropna=False).to_string())

merged_df = (
    merge_audit_df[merge_audit_df['_merge'] == 'both']
    .drop(columns='_merge')
    .reset_index(drop=True)
)

print(f"Merged shape: {merged_df.shape}")
merged_df.head()

Merged shape: (10945, 10)


Unnamed: 0,image_path,correct_answer,caption_0,caption_1,caption_2,caption_3,correct_answer_template,dataset_name,file_name,file_path
0,data/coco/images/val2017/000000397133.jpg,0,"This image features a knife, but no car is present.","A car is present in this image, but there is no knife.",This image features a car,This image does not feature a knife.,hybrid,coco,000000397133.jpg,/Users/akanshagautam/Documents/MTech/Thesis/Dataset/NegClip/coco_val2017/000000397133.jpg
1,data/coco/images/val2017/000000397133.jpg,0,A chair is not present in this image.,"This image shows a chair, but no spoon is present.",A chair is present in this image.,A spoon is not included in this image.,negative,coco,000000397133.jpg,/Users/akanshagautam/Documents/MTech/Thesis/Dataset/NegClip/coco_val2017/000000397133.jpg
2,data/coco/images/val2017/000000397133.jpg,0,"A person is present in this image, but there's no fork.","This image shows a fork, with no person in sight.",A fork is shown in this image.,No person is present in this image.,hybrid,coco,000000397133.jpg,/Users/akanshagautam/Documents/MTech/Thesis/Dataset/NegClip/coco_val2017/000000397133.jpg
3,data/coco/images/val2017/000000397133.jpg,0,No car is present in this image.,"This image features a car, but there is no spoon.",A car is present in this image.,No spoon is visible in this image.,negative,coco,000000397133.jpg,/Users/akanshagautam/Documents/MTech/Thesis/Dataset/NegClip/coco_val2017/000000397133.jpg
4,data/coco/images/val2017/000000397133.jpg,0,A sink and a spoon are included in this image.,"A car is present in this image, but a sink is not.",This image features a car.,A sink is not present in this image.,positive,coco,000000397133.jpg,/Users/akanshagautam/Documents/MTech/Thesis/Dataset/NegClip/coco_val2017/000000397133.jpg


In [31]:
# Caption rows per dataset that remain after the merge with the media index.
merged_df['dataset_name'].value_counts()

dataset_name
coco       5914
voc2007    5031
Name: count, dtype: int64

# Split dataset into different negation types

In [113]:
# Negative contractions and their expansions.  The negator is kept as a
# separate word ("is not", "can not") so that whitespace tokenisation exposes
# "not" to token-based negation checks; a fused form such as "isnot" or
# "cannot" would hide it.
_CONTRACTION_EXPANSIONS = {
    "isn't": "is not", "aren't": "are not", "wasn't": "was not",
    "weren't": "were not", "don't": "do not", "doesn't": "does not",
    "didn't": "did not", "can't": "can not", "couldn't": "could not",
    "won't": "will not", "wouldn't": "would not", "shouldn't": "should not",
    "mustn't": "must not", "hadn't": "had not", "hasn't": "has not",
    "haven't": "have not", "mightn't": "might not", "needn't": "need not",
    "shan't": "shall not",
}

# Matches any single ASCII punctuation character.
_PUNCTUATION_RE = re.compile(f"[{re.escape(string.punctuation)}]")


def clean_caption(text):
    """Normalise a caption for word-level negation matching.

    Steps, in order: lower-case; turn the typographic apostrophe (U+2019)
    into a plain one; expand negative contractions into two words
    ("isn't" -> "is not"); replace every ASCII punctuation character with a
    space; strip leading/trailing whitespace.  Inner runs of spaces are not
    collapsed, and other apostrophes become spaces ("there's" -> "there s").

    Args:
        text: The raw caption.  Non-string values (e.g. NaN) are accepted.

    Returns:
        The cleaned string, or ``text`` unchanged when it is not a string.
    """
    if not isinstance(text, str):
        return text

    # Normalise the apostrophe first so "isn’t" is expanded like "isn't".
    text = text.lower().replace("\u2019", "'")

    # Must run before punctuation removal, which would destroy the apostrophe.
    for contraction, expansion in _CONTRACTION_EXPANSIONS.items():
        text = text.replace(contraction, expansion)

    return _PUNCTUATION_RE.sub(" ", text).strip()

# Add a cleaned counterpart (`caption_<i>_clean`) for each of the four
# answer options.
for _option in range(4):
    merged_df[f'caption_{_option}_clean'] = merged_df[f'caption_{_option}'].apply(clean_caption)

# merged_df.head(3)

In [114]:
# Stand-alone words that mark explicit (syntactic) negation.
syntactic_negators = {
    "no",
    "not",
    "never",
    "neither",
    "nobody",
    "nothing",
}

# Words that begin with one of `prefixes` but are exempt from the
# morphological-negation check.
ignore_words = {
    "image", "images", "image”",
    "included", "include", "includes",
    "displays", "introduces", "information",
    "individual", "individuals",
    "insert", "improve",
}

# Word-initial strings treated as negating prefixes (kept as a tuple because
# it is passed around as a fixed sequence of prefixes).
prefixes = (
    "un",
    "dis",
    "in",
    "im",
    "ir",
    "il",
    "non",
    "mis",
)

# Stand-alone words whose meaning conveys absence (lexical negation).
lexical_negators = {
    "without",
    "lack",
    "absent",
    "avoid",
    "missing",
}

def is_syntactic_negation(caption):
    """Return True if any whitespace-separated word of the lower-cased
    caption is one of `syntactic_negators`."""
    words = caption.lower().split()
    # "Not disjoint" means at least one word is a negator.
    return not syntactic_negators.isdisjoint(words)

def is_morphological_negation(caption):
    """Return True if the caption contains a word that starts with one of
    `prefixes` and is longer than that prefix.

    Words listed in `ignore_words` are skipped.  The check is purely
    prefix-based, so any other word with such a beginning (e.g. "inside")
    also counts.
    """
    candidates = [tok for tok in caption.lower().split() if tok not in ignore_words]
    return any(
        len(tok) > len(pre) and tok.startswith(pre)
        for tok in candidates
        for pre in prefixes
    )

def is_lexical_negation(caption):
    """Return True if any whitespace-separated word of the lower-cased
    caption is one of `lexical_negators`."""
    for token in caption.lower().split():
        if token in lexical_negators:
            return True
    return False

def classify_negation(caption):
    """Assign a caption to a negation bucket.

    The checks run in priority order and the first match wins:
    syntactic, then morphological, then lexical/semantic.

    Args:
        caption: The (cleaned) caption text.  Non-string values, such as the
            NaN of a missing caption, are accepted.

    Returns:
        One of "syntactic", "morphological", "lexical/semantic" or
        "unknown/pragmatic".  Non-string input has no words to inspect and
        therefore gets the fallback label "unknown/pragmatic".
    """
    # Missing captions arrive as non-strings; calling .lower() on them in the
    # helper checks would raise AttributeError.
    if not isinstance(caption, str):
        return "unknown/pragmatic"

    if is_syntactic_negation(caption):
        return "syntactic"
    if is_morphological_negation(caption):
        return "morphological"
    if is_lexical_negation(caption):
        return "lexical/semantic"
    return "unknown/pragmatic"
    
# Label each cleaned answer option with its negation bucket
# (`caption_<i>_negation_bucket`).
for _option in range(4):
    merged_df[f"caption_{_option}_negation_bucket"] = merged_df[f"caption_{_option}_clean"].apply(classify_negation)

merged_df.head(3)

Unnamed: 0,image_path,correct_answer,caption_0,caption_1,caption_2,caption_3,correct_answer_template,dataset_name,file_name,file_path,caption_0_clean,caption_1_clean,caption_2_clean,caption_3_clean,caption_0_negation_bucket,caption_1_negation_bucket,caption_2_negation_bucket,caption_3_negation_bucket
0,data/coco/images/val2017/000000397133.jpg,0,"This image features a knife, but no car is present.","A car is present in this image, but there is no knife.",This image features a car,This image does not feature a knife.,hybrid,coco,000000397133.jpg,/Users/akanshagautam/Documents/MTech/Thesis/Dataset/NegClip/coco_val2017/000000397133.jpg,this image features a knife but no car is present,a car is present in this image but there is no knife,this image features a car,this image does not feature a knife,syntactic,syntactic,unknown/pragmatic,syntactic
1,data/coco/images/val2017/000000397133.jpg,0,A chair is not present in this image.,"This image shows a chair, but no spoon is present.",A chair is present in this image.,A spoon is not included in this image.,negative,coco,000000397133.jpg,/Users/akanshagautam/Documents/MTech/Thesis/Dataset/NegClip/coco_val2017/000000397133.jpg,a chair is not present in this image,this image shows a chair but no spoon is present,a chair is present in this image,a spoon is not included in this image,syntactic,syntactic,unknown/pragmatic,syntactic
2,data/coco/images/val2017/000000397133.jpg,0,"A person is present in this image, but there's no fork.","This image shows a fork, with no person in sight.",A fork is shown in this image.,No person is present in this image.,hybrid,coco,000000397133.jpg,/Users/akanshagautam/Documents/MTech/Thesis/Dataset/NegClip/coco_val2017/000000397133.jpg,a person is present in this image but there s no fork,this image shows a fork with no person in sight,a fork is shown in this image,no person is present in this image,syntactic,syntactic,unknown/pragmatic,syntactic


In [115]:
# Split dataset
negation_cols = [
    'caption_0_negation_bucket',
    'caption_1_negation_bucket',
    'caption_2_negation_bucket',
    'caption_3_negation_bucket',
]


def _rows_with_bucket(label):
    """Return a boolean mask over `merged_df` rows: True where at least one
    bucket column contains *label* (case-insensitive, literal match).

    The mask is built column by column with vectorised string matching
    rather than by calling a Python function once per row.
    """
    mask = pd.Series(False, index=merged_df.index)
    for column in negation_cols:
        # astype(str) turns missing values into the text "nan", which never
        # matches a bucket label.
        mask |= merged_df[column].astype(str).str.contains(label, case=False, regex=False)
    return mask


# A row joins a subset when ANY of its four captions falls in that bucket,
# so the three subsets may overlap.
syntactic_df = merged_df[_rows_with_bucket("syntactic")]
morphological_df = merged_df[_rows_with_bucket("morphological")]
lexical_semantic_df = merged_df[_rows_with_bucket("lexical/semantic")]

# Optional: Print counts
print(f"Syntactic: {syntactic_df.shape}")
print(f"Morphological: {morphological_df.shape}")
print(f"Lexical/Semantic: {lexical_semantic_df.shape}")


Syntactic: (10931, 18)
Morphological: (43, 18)
Lexical/Semantic: (814, 18)


# Run Models

In [126]:
from tqdm import tqdm

def evaluate_mcq_accuracy(df, model, processor, answer_col=None):
    """
    Evaluate multiple-choice question accuracy for negation understanding.

    For every row, the image at 'file_path' is compared with the four
    candidate captions; the caption whose text embedding has the highest
    cosine similarity to the image embedding is the prediction.

    Args:
        df (pd.DataFrame): DataFrame with columns: 'file_path', 'caption_0', ..., 'caption_3',
            and an answer column holding the index of the correct caption.
        model: Vision-language model (e.g., CLIP) whose output exposes
            `image_embeds` and `text_embeds`.
        processor: Preprocessor for the model.
        answer_col (str, optional): Name of the answer column. When omitted,
            'correct_caption_index' is used if present, otherwise 'correct_answer'.

    Returns:
        results_df (pd.DataFrame): Copy of `df` with the added columns
            'predicted_index' and 'correct_prediction' (1 if correct, else 0).
        accuracy (float): Fraction of rows predicted correctly; NaN when
            `df` has no rows.

    Raises:
        KeyError: If the answer column cannot be found in `df`.
    """
    # Resolve the answer column up front so a schema mismatch fails before
    # any image is processed.
    if answer_col is None:
        answer_col = next(
            (name for name in ("correct_caption_index", "correct_answer") if name in df.columns),
            None,
        )
    if answer_col is None or answer_col not in df.columns:
        raise KeyError(
            "No answer column found: expected 'correct_caption_index' or "
            "'correct_answer', or pass answer_col explicitly."
        )

    all_predictions = []
    all_correct = []
    results_df = df.copy()

    # Nothing to score: return an undefined accuracy instead of dividing by zero.
    if len(df) == 0:
        results_df["predicted_index"] = all_predictions
        results_df["correct_prediction"] = all_correct
        return results_df, float("nan")

    for _, row in tqdm(df.iterrows(), total=len(df), desc="Evaluating MCQ"):
        # The context manager closes the file handle; convert() returns a
        # separate in-memory image that stays usable afterwards.
        with Image.open(row["file_path"]) as raw_image:
            image = raw_image.convert("RGB")
        captions = [row[f"caption_{i}"] for i in range(4)]
        correct_index = row[answer_col]

        # Encode the image once; its single embedding is broadcast against
        # the four caption embeddings below.
        inputs = processor(text=captions, images=image, return_tensors="pt", padding=True)

        with torch.no_grad():
            outputs = model(**inputs)
            similarities = torch.nn.functional.cosine_similarity(
                outputs.image_embeds, outputs.text_embeds
            )

        predicted_index = similarities.argmax().item()
        all_predictions.append(predicted_index)
        all_correct.append(int(predicted_index == correct_index))

    # Prepare results
    results_df["predicted_index"] = all_predictions
    results_df["correct_prediction"] = all_correct

    # Compute accuracy
    accuracy = sum(all_correct) / len(all_correct)

    return results_df, accuracy


# Load CLIP ViT Base Patch32 model

In [124]:
# Load the model and its preprocessor from the same pretrained checkpoint.
clip_checkpoint = "openai/clip-vit-base-patch32"
model = CLIPModel.from_pretrained(clip_checkpoint)
processor = CLIPProcessor.from_pretrained(clip_checkpoint)

In [129]:
# Evaluate the syntactic-negation subset and preview the per-row results.
# NOTE(review): `evaluate_negation_sensitivity` is not defined in the cells
# shown here; the evaluation helper defined above is `evaluate_mcq_accuracy`.
# Confirm which function this cell is meant to call.
results_df, score = evaluate_negation_sensitivity(syntactic_df, model, processor)
print(f"\nNegation Sensitivity Score: {score:.4f}")
results_df.head()

TypeError: 'module' object is not callable

# Run model