# Import libraries

In [37]:
import numpy as np
import pandas as pd
import os
import torch
import re
import string
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from collections import defaultdict, Counter
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
from itertools import chain
import clip
import open_clip
import tqdm
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

# Load Dataset

## Load MCQ Dataset - Caption

In [3]:
coco_mcq = '/workspace/Dataset/neg_bench/image_dataset_csv/COCO_val_mcq_llama3.1_rephrased.csv'
voc_mcq = '/workspace/Dataset/neg_bench/image_dataset_csv/VOC2007_mcq_llama3.1_rephrased.csv'
# synthetic_mcq = '/workspace/Dataset/neg_bench/image_dataset_csv/synthetic_mcq_llama3.1_rephrased.csv'
# msr_vtt_mcq = '/Users/akanshagautam/Documents/MTech/Thesis/Dataset/NegClip/NegBench/evaluation data/videos/msr_vtt_mcq_rephrased_llama.csv'

coco_df = pd.read_csv(coco_mcq)
voc_df = pd.read_csv(voc_mcq)
# synthetic_df = pd.read_csv(synthetic_mcq)
# msr_vtt_df = pd.read_csv(msr_vtt_mcq)
print(f"COCO shape: {coco_df.shape}")
print(f"VOC 2007 shape: {voc_df.shape}")
# print(f"Synthetic shape: {synthetic_df.shape}")
# print(f"MSR VTT shape: {msr_vtt_df.shape}")

metadata_caption_df = pd.concat([coco_df, voc_df], axis=0, ignore_index=True)
print(f"Final Metadata shape: {metadata_caption_df.shape}")

metadata_caption_df[['dataset_name', 'file_name']] = metadata_caption_df['image_path'].str.extract(r'data/([^/]+)/.*?/([^/]+\.[a-z0-9]+)$')

metadata_caption_df.head(3)

COCO shape: (5914, 7)
VOC 2007 shape: (5031, 7)
Final Metadata shape: (10945, 7)


Unnamed: 0,image_path,correct_answer,caption_0,caption_1,caption_2,caption_3,correct_answer_template,dataset_name,file_name
0,data/coco/images/val2017/000000397133.jpg,0,"This image features a knife, but no car is pre...","A car is present in this image, but there is n...",This image features a car,This image does not feature a knife.,hybrid,coco,000000397133.jpg
1,data/coco/images/val2017/000000397133.jpg,0,A chair is not present in this image.,"This image shows a chair, but no spoon is pres...",A chair is present in this image.,A spoon is not included in this image.,negative,coco,000000397133.jpg
2,data/coco/images/val2017/000000397133.jpg,0,"A person is present in this image, but there's...","This image shows a fork, with no person in sight.",A fork is shown in this image.,No person is present in this image.,hybrid,coco,000000397133.jpg


In [4]:
metadata_caption_df['dataset_name'].value_counts()

dataset_name
coco       5914
voc2007    5031
Name: count, dtype: int64

## Load MCQ Dataset - Images

In [6]:
coco_val_dir = '/workspace/Dataset/neg_bench/coco_images'
# mediafire_val_dir = '/Users/akanshagautam/Documents/MTech/Thesis/Dataset/NegClip/mediafire_val_videos'
voc_val_dir = '/workspace/Dataset/neg_bench/voc2007_images/VOC2007/JPEGImages'
# synthetic_mcq = '/Users/akanshagautam/Documents/MTech/Thesis/Dataset/NegClip/NegBench/evaluation data/images/synthetic_mcq_llama3.1_rephrased.csv'

coco_image_files = [f for f in os.listdir(coco_val_dir) if f.lower().endswith(('.jpg', '.jpeg', '.png'))]
# mediafire_video_files = [f for f in os.listdir(mediafire_val_dir) if f.lower().endswith('.mp4')]
voc_image_files = [f for f in os.listdir(voc_val_dir) if f.lower().endswith(('.jpg', '.jpeg', '.png'))]

coco_val_df = pd.DataFrame({'file_name': coco_image_files, 'file_path': [os.path.join(coco_val_dir, f) for f in coco_image_files],'dataset_name': 'coco'})
# mediafire_val_df = pd.DataFrame({'file_name': mediafire_video_files, 'file_path': [os.path.join(mediafire_val_dir, f) for f in mediafire_video_files], 'dataset_name': 'video'})
voc_val_df = pd.DataFrame({'file_name': voc_image_files, 'file_path': [os.path.join(voc_val_dir, f) for f in voc_image_files], 'dataset_name': 'voc2007'})

print(f"COCO shape: {coco_val_df.shape}")
print(f"VOC shape: {voc_val_df.shape}")

metadata_images_videos_df = pd.concat([coco_val_df, voc_val_df], axis=0, ignore_index=True)
print(f"Final Metadata shape: {metadata_images_videos_df.shape}")

metadata_images_videos_df.head()

COCO shape: (5000, 3)
VOC shape: (4952, 3)
Final Metadata shape: (9952, 3)


Unnamed: 0,file_name,file_path,dataset_name
0,000000500826.jpg,/workspace/Dataset/neg_bench/coco_images/00000...,coco
1,000000126137.jpg,/workspace/Dataset/neg_bench/coco_images/00000...,coco
2,000000388258.jpg,/workspace/Dataset/neg_bench/coco_images/00000...,coco
3,000000329219.jpg,/workspace/Dataset/neg_bench/coco_images/00000...,coco
4,000000446005.jpg,/workspace/Dataset/neg_bench/coco_images/00000...,coco


## Merge the captions and images

In [10]:
merged_df = pd.merge(metadata_caption_df,metadata_images_videos_df,on=['file_name', 'dataset_name'],how='inner')

print(f"Merged shape: {merged_df.shape}")
merged_df.head(2)

Merged shape: (10945, 10)


Unnamed: 0,image_path,correct_answer,caption_0,caption_1,caption_2,caption_3,correct_answer_template,dataset_name,file_name,file_path
0,data/coco/images/val2017/000000397133.jpg,0,"This image features a knife, but no car is present.","A car is present in this image, but there is no knife.",This image features a car,This image does not feature a knife.,hybrid,coco,000000397133.jpg,/workspace/Dataset/neg_bench/coco_images/000000397133.jpg
1,data/coco/images/val2017/000000397133.jpg,0,A chair is not present in this image.,"This image shows a chair, but no spoon is present.",A chair is present in this image.,A spoon is not included in this image.,negative,coco,000000397133.jpg,/workspace/Dataset/neg_bench/coco_images/000000397133.jpg


# Split dataset into different negation types

In [11]:
def clean_caption(text):
    if isinstance(text, str):
        text = text.lower()
        
        contractions = {
            "isn't": "isnot", "aren't": "arenot","wasn't": "wasnot","weren't": "werenot","don't": "donot","doesn't": "doesnot",
            "didn't": "didnot","can't": "cannot","couldn't": "couldnot","won't": "willnot","wouldn't": "wouldnot","shouldn't": "shouldnot",
            "mustn't": "mustnot","hadn't": "hadnot","hasn't": "hasnot","haven't": "havenot","mightn't": "mightnot",
            "needn't": "neednot","shan't": "shallnot",
        }

        for contraction, replacement in contractions.items():
            text = text.replace(contraction, replacement)

        text = re.sub(f"[{re.escape(string.punctuation)}]", " ", text)
        text = text.strip()
        return text
    return text

merged_df['caption_0_clean'] = merged_df['caption_0'].apply(clean_caption)
merged_df['caption_1_clean'] = merged_df['caption_1'].apply(clean_caption)
merged_df['caption_2_clean'] = merged_df['caption_2'].apply(clean_caption)
merged_df['caption_3_clean'] = merged_df['caption_3'].apply(clean_caption)

In [12]:
syntactic_negators = {"no", "not", "never", "neither", "nobody", "nothing"}
ignore_words = {"image", "images", "included", "include", "includes", "displays", 
"introduces", "information", "individual", "individuals", "insert", "image”", "improve"}
prefixes = ("un", "dis", "in", "im", "ir", "il", "non", "mis")
lexical_negators = {"without", "lack", "absent", "avoid", "missing"}

def is_syntactic_negation(caption):
    tokens = caption.lower().split()
    return any(word in syntactic_negators for word in tokens)

def is_morphological_negation(caption):
    words = caption.lower().split()
    for word in words:
        if word in ignore_words:
            continue
        if any(word.startswith(prefix) and len(word) > len(prefix) for prefix in prefixes):
            return True
    return False

def is_lexical_negation(caption):
    tokens = caption.lower().split()
    return any(word in lexical_negators for word in tokens)

def classify_negation(caption):
    if is_syntactic_negation(caption):
        return "syntactic"
    elif is_morphological_negation(caption):
        return "morphological"
    elif is_lexical_negation(caption):
        return "lexical/semantic"
    else:
        return "unknown/pragmatic"
    
merged_df["caption_0_negation_bucket"] = merged_df["caption_0_clean"].apply(classify_negation)
merged_df["caption_1_negation_bucket"] = merged_df["caption_1_clean"].apply(classify_negation)
merged_df["caption_2_negation_bucket"] = merged_df["caption_2_clean"].apply(classify_negation)
merged_df["caption_3_negation_bucket"] = merged_df["caption_3_clean"].apply(classify_negation)

merged_df.head(2)

Unnamed: 0,image_path,correct_answer,caption_0,caption_1,caption_2,caption_3,correct_answer_template,dataset_name,file_name,file_path,caption_0_clean,caption_1_clean,caption_2_clean,caption_3_clean,caption_0_negation_bucket,caption_1_negation_bucket,caption_2_negation_bucket,caption_3_negation_bucket
0,data/coco/images/val2017/000000397133.jpg,0,"This image features a knife, but no car is present.","A car is present in this image, but there is no knife.",This image features a car,This image does not feature a knife.,hybrid,coco,000000397133.jpg,/workspace/Dataset/neg_bench/coco_images/000000397133.jpg,this image features a knife but no car is present,a car is present in this image but there is no knife,this image features a car,this image does not feature a knife,syntactic,syntactic,unknown/pragmatic,syntactic
1,data/coco/images/val2017/000000397133.jpg,0,A chair is not present in this image.,"This image shows a chair, but no spoon is present.",A chair is present in this image.,A spoon is not included in this image.,negative,coco,000000397133.jpg,/workspace/Dataset/neg_bench/coco_images/000000397133.jpg,a chair is not present in this image,this image shows a chair but no spoon is present,a chair is present in this image,a spoon is not included in this image,syntactic,syntactic,unknown/pragmatic,syntactic


In [13]:
# Split dataset
negation_cols = [ 'caption_0_negation_bucket', 'caption_1_negation_bucket', 'caption_2_negation_bucket', 'caption_3_negation_bucket']

syntactic_df = merged_df[merged_df[negation_cols].apply(lambda x: x.astype(str).str.contains("syntactic", case=False).any(), axis=1)]
morphological_df = merged_df[merged_df[negation_cols].apply(lambda x: x.astype(str).str.contains("morphological", case=False).any(), axis=1)]
lexical_semantic_df = merged_df[merged_df[negation_cols].apply(lambda x: x.astype(str).str.contains("lexical/semantic", case=False, regex=True).any(), axis=1)]

print(f"Syntactic: {syntactic_df.shape}")
print(f"Morphological: {morphological_df.shape}")
print(f"Lexical/Semantic: {lexical_semantic_df.shape}")


Syntactic: (10931, 18)
Morphological: (43, 18)
Lexical/Semantic: (814, 18)


# Run models on all the split dataset

In [40]:
from tqdm import tqdm

def evaluate_mcq_accuracy(df, model, processor, device):
    """
    Evaluate MCQ accuracy on negation understanding using CLIP with GPU.

    Args:
        df (pd.DataFrame): DataFrame with 'file_path', 'caption_0' to 'caption_3', and 'correct_caption_index'.
        model: CLIPModel instance on GPU.
        processor: CLIPProcessor for input preprocessing.
        device: torch.device ('cuda' or 'cpu').

    Returns:
        results_df (pd.DataFrame): Results with predicted indices.
        accuracy (float): Accuracy score over the dataset.
    """
    predictions = []
    correct_flags = []

    for idx, row in tqdm(df.iterrows(), total=len(df), desc="Evaluating MCQ"):
        image = Image.open(row["file_path"]).convert("RGB")
        captions = [row[f"caption_{i}"] for i in range(4)]
        correct_index = row["correct_answer"]

        # Preprocess and move to device
        inputs = processor(text=captions, images=[image]*4, return_tensors="pt", padding=True).to(device)

        with torch.no_grad():
            outputs = model(**inputs)
            similarities = torch.nn.functional.cosine_similarity(
                outputs.image_embeds, outputs.text_embeds
            )

        pred_index = similarities.argmax().item()
        predictions.append(pred_index)
        correct_flags.append(int(pred_index == correct_index))

    # Results and accuracy
    results_df = df.copy()
    results_df["predicted_index"] = predictions
    results_df["correct_prediction"] = correct_flags
    accuracy = sum(correct_flags) / len(correct_flags)

    return results_df, accuracy


In [43]:
def evaluate_mcq_accuracy_v2(df, model, preprocess, tokenizer, device):
    """
    Evaluate MCQ accuracy for OpenCLIP-based model on negation-sensitive MCQ dataset.

    Args:
        df (pd.DataFrame): DataFrame with 'file_path', 'caption_0' to 'caption_3', and 'correct_answer'.
        model: OpenCLIP model.
        preprocess: Image preprocessing function.
        tokenizer: Tokenizer for text.
        device: torch.device.

    Returns:
        results_df (pd.DataFrame): DataFrame with predictions and correctness.
        accuracy (float): Proportion of correct predictions.
    """
    predictions = []
    correct_flags = []

    model.eval()

    for _, row in tqdm(df.iterrows(), total=len(df), desc="Evaluating MCQ"):
        image = preprocess(Image.open(row["file_path"]).convert("RGB")).unsqueeze(0).to(device)
        captions = [row[f"caption_{i}"] for i in range(4)]
        correct_index = row["correct_answer"]

        # Repeat image 4 times for each caption
        image_batch = image.repeat(4, 1, 1, 1)

        # Tokenize captions
        text_tokens = tokenizer(captions).to(device)

        with torch.no_grad():
            image_features = model.encode_image(image_batch)
            text_features = model.encode_text(text_tokens)

            image_features /= image_features.norm(dim=-1, keepdim=True)
            text_features /= text_features.norm(dim=-1, keepdim=True)

            similarity = (image_features * text_features).sum(dim=-1)  # Dot product

        pred_index = similarity.argmax().item()
        predictions.append(pred_index)
        correct_flags.append(int(pred_index == correct_index))

    results_df = df.copy()
    results_df["predicted_index"] = predictions
    results_df["correct_prediction"] = correct_flags
    accuracy = sum(correct_flags) / len(correct_flags)

    return results_df, accuracy

In [55]:
def evaluate_mcq_accuracy_v3(df, model, preprocess, device):
    """
    Evaluate MCQ accuracy using OpenAI-style CLIP with a custom checkpoint.

    Args:
        df (pd.DataFrame): Must contain 'file_path', 'caption_0' to 'caption_3', and 'correct_answer'.
        model: CLIP model (from clip.load or ConCLIP).
        preprocess: Image preprocessing transform (from clip.load).
        device: 'cuda' or 'cpu'.

    Returns:
        results_df (pd.DataFrame), accuracy (float)
    """
    import clip  # Make sure the clip package is imported

    predictions = []
    correct_flags = []

    model.eval()

    for _, row in tqdm(df.iterrows(), total=len(df), desc="Evaluating MCQ"):
        image = preprocess(Image.open(row["file_path"]).convert("RGB")).unsqueeze(0).to(device)
        captions = [row[f"caption_{i}"] for i in range(4)]
        correct_index = row["correct_answer"]

        # Repeat image to match number of captions
        image_input = image.repeat(len(captions), 1, 1, 1)  # Shape: [4, 3, H, W]
        text_tokens = clip.tokenize(captions).to(device)   # Shape: [4, 77]

        with torch.no_grad():
            image_features = model.encode_image(image_input)
            text_features = model.encode_text(text_tokens)

            image_features /= image_features.norm(dim=-1, keepdim=True)
            text_features /= text_features.norm(dim=-1, keepdim=True)

            similarities = (image_features * text_features).sum(dim=-1)  # Dot product = cosine similarity

        pred_index = similarities.argmax().item()
        predictions.append(pred_index)
        correct_flags.append(int(pred_index == correct_index))

    results_df = df.copy()
    results_df["predicted_index"] = predictions
    results_df["correct_prediction"] = correct_flags
    accuracy = sum(correct_flags) / len(correct_flags)

    return results_df, accuracy

In [49]:
def load_clip_with_custom_checkpoint(model_name: str, checkpoint_path: str, device: str = "cuda"):
    """
    Load CLIP model with a custom checkpoint.
    """
    model, preprocess = clip.load(model_name, device=device)
    ckpt = torch.load(checkpoint_path, weights_only=False)
    model = model.float()
    model.load_state_dict(ckpt["model"])
    model = model.to(device)
    return model.eval(), preprocess

In [63]:
def load_negclip_checkpoint(model_name: str, checkpoint_path: str, device: str = "cuda"):
    model, preprocess = clip.load(model_name, device=device)

    # Load checkpoint
    ckpt = torch.load(checkpoint_path, map_location=device, weights_only=False)
    state_dict = ckpt["state_dict"]  # extract the actual model weights

    # Clean up keys (remove any "model." or "module." prefixes)
    cleaned_state_dict = {}
    for k, v in state_dict.items():
        new_key = k
        if k.startswith("model."):
            new_key = k[len("model."):]
        elif k.startswith("module."):
            new_key = k[len("module."):]
        cleaned_state_dict[new_key] = v

    # Load weights
    model.load_state_dict(cleaned_state_dict, strict=False)  # use strict=False to skip harmless mismatches
    model = model.to(device)
    return model.eval(), preprocess

## Clip ViT Base Patch32 

In [28]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

In [29]:
results_df, score = evaluate_mcq_accuracy(syntactic_df, model, processor, device)
print(f"Negation Score on syntactic dataset: {score:.4f}")

results_df, score = evaluate_mcq_accuracy(lexical_semantic_df, model, processor, device)
print(f"Negation Score on lexical/semantic dataset: {score:.4f}")

results_df, score = evaluate_mcq_accuracy(morphological_df, model, processor, device)
print(f"Negation Score on morphological dataset: {score:.4f}")

Evaluating MCQ: 100%|██████████| 10931/10931 [07:23<00:00, 24.66it/s]



Negation Score on syntactic dataset: 0.3900


Evaluating MCQ: 100%|██████████| 814/814 [00:33<00:00, 24.60it/s]



Negation Score on lexical/semantic dataset: 0.3378


Evaluating MCQ: 100%|██████████| 43/43 [00:01<00:00, 26.34it/s]


Negation Score on morphological dataset: 0.4186





## Clip ViT Base Patch16

In [30]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch16").to(device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16")

In [31]:
results_df, score = evaluate_mcq_accuracy(syntactic_df, model, processor, device)
print(f"\nNegation Score on syntactic dataset: {score:.4f}")

results_df, score = evaluate_mcq_accuracy(lexical_semantic_df, model, processor, device)
print(f"\nNegation Score on lexical/semantic dataset: {score:.4f}")

results_df, score = evaluate_mcq_accuracy(morphological_df, model, processor, device)
print(f"\nNegation Score on morphological dataset: {score:.4f}")

Evaluating MCQ: 100%|██████████| 10931/10931 [07:51<00:00, 23.16it/s]



Negation Score on syntactic dataset: 0.3990


Evaluating MCQ: 100%|██████████| 814/814 [00:35<00:00, 23.16it/s]



Negation Score on lexical/semantic dataset: 0.3575


Evaluating MCQ: 100%|██████████| 43/43 [00:01<00:00, 24.22it/s]


Negation Score on morphological dataset: 0.4651





## Clip ViT Large Patch14

In [32]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14").to(device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")

In [33]:
results_df, score = evaluate_mcq_accuracy(syntactic_df, model, processor, device)
print(f"\nNegation Score on syntactic dataset: {score:.4f}")

results_df, score = evaluate_mcq_accuracy(lexical_semantic_df, model, processor, device)
print(f"\nNegation Score on lexical/semantic dataset: {score:.4f}")

results_df, score = evaluate_mcq_accuracy(morphological_df, model, processor, device)
print(f"\nNegation Score on morphological dataset: {score:.4f}")

Evaluating MCQ: 100%|██████████| 10931/10931 [18:06<00:00, 10.06it/s]



Negation Score on syntactic dataset: 0.3852


Evaluating MCQ: 100%|██████████| 814/814 [01:21<00:00, 10.05it/s]



Negation Score on lexical/semantic dataset: 0.3305


Evaluating MCQ: 100%|██████████| 43/43 [00:04<00:00, 10.07it/s]


Negation Score on morphological dataset: 0.4651





## LAION CLIP-ViT-H-14-laion2B-s32B-b79K model

In [45]:
model_name = "ViT-H-14"
pretrained = "laion2B-s32B-b79K"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model, _, preprocess = open_clip.create_model_and_transforms(model_name=model_name,pretrained=pretrained,device=device)

tokenizer = open_clip.get_tokenizer(model_name)

model = model.to(device)
model.eval()

CLIP(
  (visual): VisionTransformer(
    (conv1): Conv2d(3, 1280, kernel_size=(14, 14), stride=(14, 14), bias=False)
    (patch_dropout): Identity()
    (ln_pre): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
    (transformer): Transformer(
      (resblocks): ModuleList(
        (0-31): 32 x ResidualAttentionBlock(
          (ln_1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1280, out_features=1280, bias=True)
          )
          (ls_1): Identity()
          (ln_2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1280, out_features=5120, bias=True)
            (gelu): GELU(approximate='none')
            (c_proj): Linear(in_features=5120, out_features=1280, bias=True)
          )
          (ls_2): Identity()
        )
      )
    )
    (ln_post): LayerNorm((1280,), eps=1e-05, elementwi

In [47]:
results_df, score = evaluate_mcq_accuracy_v2(syntactic_df, model, preprocess, tokenizer, device)
print(f"\nNegation Score on syntactic dataset: {score:.4f}")

results_df, score = evaluate_mcq_accuracy_v2(lexical_semantic_df, model, preprocess, tokenizer, device)
print(f"\nNegation Score on lexical/semantic dataset: {score:.4f}")

results_df, score = evaluate_mcq_accuracy_v2(morphological_df, model, preprocess, tokenizer, device)
print(f"\nNegation Score on morphological dataset: {score:.4f}")

Evaluating MCQ: 100%|██████████| 10931/10931 [31:03<00:00,  5.87it/s]



Negation Score on syntactic dataset: 0.3110


Evaluating MCQ: 100%|██████████| 814/814 [02:19<00:00,  5.82it/s]



Negation Score on lexical/semantic dataset: 0.2322


Evaluating MCQ: 100%|██████████| 43/43 [00:07<00:00,  5.86it/s]


Negation Score on morphological dataset: 0.4884





## CoN-CLIP ViT-B/32

In [53]:
checkpoint_path = "/workspace/Models/conclip/conclip_vit_b32.pt"
device = "cuda" if torch.cuda.is_available() else "cpu"
model_name = "ViT-B/32"

model, preprocess = load_clip_with_custom_checkpoint(model_name, checkpoint_path, device)

In [56]:
results_df, score = evaluate_mcq_accuracy_v3(syntactic_df, model, preprocess, device)
print(f"\nNegation Score on syntactic dataset: {score:.4f}")

results_df, score = evaluate_mcq_accuracy_v3(lexical_semantic_df, model, preprocess, device)
print(f"\nNegation Score on lexical/semantic dataset: {score:.4f}")

results_df, score = evaluate_mcq_accuracy_v3(morphological_df, model, preprocess, device)
print(f"\nNegation Score on morphological dataset: {score:.4f}")

Evaluating MCQ: 100%|██████████| 10931/10931 [06:47<00:00, 26.84it/s]



Negation Score on syntactic dataset: 0.3051


Evaluating MCQ: 100%|██████████| 814/814 [00:30<00:00, 26.97it/s]



Negation Score on lexical/semantic dataset: 0.2236


Evaluating MCQ: 100%|██████████| 43/43 [00:01<00:00, 28.35it/s]


Negation Score on morphological dataset: 0.2558





## CON-CLIP ViT-B/16

In [57]:
checkpoint_path = "/workspace/Models/conclip/conclip_vit_b16.pt"
device = "cuda" if torch.cuda.is_available() else "cpu"
model_name = "ViT-B/16"

model, preprocess = load_clip_with_custom_checkpoint(model_name, checkpoint_path, device)

100%|███████████████████████████████████████| 335M/335M [00:04<00:00, 77.3MiB/s]


In [59]:
results_df, score = evaluate_mcq_accuracy_v3(syntactic_df, model, preprocess, device)
print(f"\nNegation Score on syntactic dataset: {score:.4f}")

results_df, score = evaluate_mcq_accuracy_v3(lexical_semantic_df, model, preprocess, device)
print(f"\nNegation Score on lexical/semantic dataset: {score:.4f}")

results_df, score = evaluate_mcq_accuracy_v3(morphological_df, model, preprocess, device)
print(f"\nNegation Score on morphological dataset: {score:.4f}")

Evaluating MCQ: 100%|██████████| 10931/10931 [07:05<00:00, 25.69it/s]



Negation Score on syntactic dataset: 0.2991


Evaluating MCQ: 100%|██████████| 814/814 [00:30<00:00, 26.42it/s]



Negation Score on lexical/semantic dataset: 0.2555


Evaluating MCQ: 100%|██████████| 43/43 [00:01<00:00, 24.53it/s]


Negation Score on morphological dataset: 0.2791





## CON-CLIP ViT-L/14

In [60]:
checkpoint_path = "/workspace/Models/conclip/conclip_vit_l14.pt"
device = "cuda" if torch.cuda.is_available() else "cpu"
model_name = "ViT-L/14"

model, preprocess = load_clip_with_custom_checkpoint(model_name, checkpoint_path, device)

100%|███████████████████████████████████████| 890M/890M [00:31<00:00, 29.4MiB/s]


In [61]:
results_df, score = evaluate_mcq_accuracy_v3(syntactic_df, model, preprocess, device)
print(f"\nNegation Score on syntactic dataset: {score:.4f}")

results_df, score = evaluate_mcq_accuracy_v3(lexical_semantic_df, model, preprocess, device)
print(f"\nNegation Score on lexical/semantic dataset: {score:.4f}")

results_df, score = evaluate_mcq_accuracy_v3(morphological_df, model, preprocess, device)
print(f"\nNegation Score on morphological dataset: {score:.4f}")

Evaluating MCQ: 100%|██████████| 10931/10931 [16:23<00:00, 11.12it/s]



Negation Score on syntactic dataset: 0.3365


Evaluating MCQ: 100%|██████████| 814/814 [01:13<00:00, 11.10it/s]



Negation Score on lexical/semantic dataset: 0.2531


Evaluating MCQ: 100%|██████████| 43/43 [00:03<00:00, 11.04it/s]


Negation Score on morphological dataset: 0.3023





## NegClip ICLR 2023

In [64]:
checkpoint_path = "/workspace/Models/negclip/negclip_iclr_2023.pth"
device = "cuda" if torch.cuda.is_available() else "cpu"
model_name = "ViT-B/32"

model, preprocess = load_negclip_checkpoint(model_name, checkpoint_path, device)

In [65]:
results_df, score = evaluate_mcq_accuracy_v3(syntactic_df, model, preprocess, device)
print(f"\nNegation Score on syntactic dataset: {score:.4f}")

results_df, score = evaluate_mcq_accuracy_v3(lexical_semantic_df, model, preprocess, device)
print(f"\nNegation Score on lexical/semantic dataset: {score:.4f}")

results_df, score = evaluate_mcq_accuracy_v3(morphological_df, model, preprocess, device)
print(f"\nNegation Score on morphological dataset: {score:.4f}")

Evaluating MCQ: 100%|██████████| 10931/10931 [07:01<00:00, 25.91it/s]



Negation Score on syntactic dataset: 0.2876


Evaluating MCQ: 100%|██████████| 814/814 [00:30<00:00, 27.04it/s]



Negation Score on lexical/semantic dataset: 0.2187


Evaluating MCQ: 100%|██████████| 43/43 [00:01<00:00, 24.09it/s]


Negation Score on morphological dataset: 0.3488





# Clip CC12M Negfull ViT-B/32 LR-1e-8

In [66]:
checkpoint_path = "/workspace/Models/negclip/clip_cc12m_negfull_vit_b_32_lr1e-8_clw_0_99_mlw_0_01.pt"
device = "cuda" if torch.cuda.is_available() else "cpu"
model_name = "ViT-B/32"

model, preprocess = load_negclip_checkpoint(model_name, checkpoint_path, device)

In [67]:
results_df, score = evaluate_mcq_accuracy_v3(syntactic_df, model, preprocess, device)
print(f"\nNegation Score on syntactic dataset: {score:.4f}")

results_df, score = evaluate_mcq_accuracy_v3(lexical_semantic_df, model, preprocess, device)
print(f"\nNegation Score on lexical/semantic dataset: {score:.4f}")

results_df, score = evaluate_mcq_accuracy_v3(morphological_df, model, preprocess, device)
print(f"\nNegation Score on morphological dataset: {score:.4f}")

Evaluating MCQ: 100%|██████████| 10931/10931 [06:53<00:00, 26.46it/s]



Negation Score on syntactic dataset: 0.5786


Evaluating MCQ: 100%|██████████| 814/814 [00:30<00:00, 26.85it/s]



Negation Score on lexical/semantic dataset: 0.5283


Evaluating MCQ: 100%|██████████| 43/43 [00:01<00:00, 26.41it/s]


Negation Score on morphological dataset: 0.5814





## Negclip CC12M Negfull ViT-B/32 LR-1e-8

In [68]:
checkpoint_path = "/workspace/Models/negclip/negclip_cc12m_negfull_vit_b_32_lr1e-8_clw_0_99_mlw_0_01.pt"
device = "cuda" if torch.cuda.is_available() else "cpu"
model_name = "ViT-B/32"

model, preprocess = load_negclip_checkpoint(model_name, checkpoint_path, device)

In [69]:
results_df, score = evaluate_mcq_accuracy_v3(syntactic_df, model, preprocess, device)
print(f"\nNegation Score on syntactic dataset: {score:.4f}")

results_df, score = evaluate_mcq_accuracy_v3(lexical_semantic_df, model, preprocess, device)
print(f"\nNegation Score on lexical/semantic dataset: {score:.4f}")

results_df, score = evaluate_mcq_accuracy_v3(morphological_df, model, preprocess, device)
print(f"\nNegation Score on morphological dataset: {score:.4f}")

Evaluating MCQ: 100%|██████████| 10931/10931 [06:52<00:00, 26.49it/s]



Negation Score on syntactic dataset: 0.5795


Evaluating MCQ: 100%|██████████| 814/814 [00:30<00:00, 26.72it/s]



Negation Score on lexical/semantic dataset: 0.5246


Evaluating MCQ: 100%|██████████| 43/43 [00:01<00:00, 33.36it/s]


Negation Score on morphological dataset: 0.6744





## CLIP-ViT-B-32-DataComp.S-s13M-b4K


In [70]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess_train, preprocess_val = open_clip.create_model_and_transforms('hf-hub:laion/CLIP-ViT-B-32-DataComp.S-s13M-b4K')
tokenizer = open_clip.get_tokenizer('hf-hub:laion/CLIP-ViT-B-32-DataComp.S-s13M-b4K')
model = model.to(device)
model.eval()

CLIP(
  (visual): VisionTransformer(
    (conv1): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32), bias=False)
    (patch_dropout): Identity()
    (ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (transformer): Transformer(
      (resblocks): ModuleList(
        (0-11): 12 x ResidualAttentionBlock(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ls_1): Identity()
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): GELU(approximate='none')
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ls_2): Identity()
        )
      )
    )
    (ln_post): LayerNorm((768,), eps=1e-05, elementwise_affine

In [71]:
results_df, score = evaluate_mcq_accuracy_v3(syntactic_df, model, preprocess, device)
print(f"\nNegation Score on syntactic dataset: {score:.4f}")

results_df, score = evaluate_mcq_accuracy_v3(lexical_semantic_df, model, preprocess, device)
print(f"\nNegation Score on lexical/semantic dataset: {score:.4f}")

results_df, score = evaluate_mcq_accuracy_v3(morphological_df, model, preprocess, device)
print(f"\nNegation Score on morphological dataset: {score:.4f}")

Evaluating MCQ: 100%|██████████| 10931/10931 [06:37<00:00, 27.47it/s]



Negation Score on syntactic dataset: 0.2658


Evaluating MCQ: 100%|██████████| 814/814 [00:30<00:00, 26.88it/s]



Negation Score on lexical/semantic dataset: 0.2432


Evaluating MCQ: 100%|██████████| 43/43 [00:01<00:00, 28.63it/s]


Negation Score on morphological dataset: 0.3256



