# Import libraries

In [3]:
import numpy as np
import pandas as pd
import os
import torch
import re
import string
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from collections import defaultdict, Counter
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
from datasets import load_dataset
from itertools import chain
import clip
import open_clip
import tqdm
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

# Load Dataset

In [4]:
# Load the NegVQA benchmark through `datasets.load_dataset`.
negvqa = load_dataset("yuhuizhang/NegVQA")

# Convert the "dev" and "test" splits into pandas DataFrames.
dev_df, test_df = (pd.DataFrame(negvqa[split_name]) for split_name in ("dev", "test"))

# Report (rows, columns) for each split.
print("Dev Shape:", dev_df.shape)
print("Test Shape:", test_df.shape)

# merged_df = pd.concat([dev_df, test_df], ignore_index=True)

# print("Merged Shape:", merged_df.shape)
# merged_df.head(3)

KeyboardInterrupt: 

# Split dataset into different negation types

In [3]:
def clean_caption(text):
    """Normalise a caption/question string for token-based negation matching.

    Steps, in order:
      1. lower-case the text;
      2. turn typographic apostrophes that sit between two word characters
         (e.g. "isn’t") into ASCII "'" so the contraction table can match them;
      3. replace negative contractions with a fused form ("isn't" -> "isnot")
         so the negation survives punctuation stripping;
      4. replace every ASCII punctuation character with a space;
      5. strip leading/trailing whitespace.

    Inner whitespace is NOT collapsed, so runs of spaces may remain; callers
    that tokenise with ``str.split()`` are unaffected.

    Args:
        text: Value to clean. Non-string values (e.g. None, NaN) are returned
            unchanged.

    Returns:
        The cleaned string, or ``text`` itself when it is not a ``str``.
    """
    if not isinstance(text, str):
        return text

    text = text.lower()

    # Curly apostrophes are not part of string.punctuation, so without this
    # step "isn’t" would neither match the table below nor be stripped.
    # Only intra-word apostrophes are touched; quotation marks are left alone.
    text = re.sub(r"(?<=\w)[\u2019\u02bc](?=\w)", "'", text)

    contractions = {
        "isn't": "isnot", "aren't": "arenot", "wasn't": "wasnot", "weren't": "werenot", "don't": "donot", "doesn't": "doesnot",
        "didn't": "didnot", "can't": "cannot", "couldn't": "couldnot", "won't": "willnot", "wouldn't": "wouldnot", "shouldn't": "shouldnot",
        "mustn't": "mustnot", "hadn't": "hadnot", "hasn't": "hasnot", "haven't": "havenot", "mightn't": "mightnot",
        "needn't": "neednot", "shan't": "shallnot",
    }

    for contraction, replacement in contractions.items():
        text = text.replace(contraction, replacement)

    # Every ASCII punctuation character becomes a single space.
    text = re.sub(f"[{re.escape(string.punctuation)}]", " ", text)
    return text.strip()

# Store a cleaned copy of every negated question alongside the original text.
dev_df["negated_question_clean"] = dev_df["negated_question"].map(clean_caption)

In [4]:
# Stand-alone negation words.
syntactic_negators = {"no", "not", "never", "neither", "nobody", "nothing"}
# Fused contraction tokens emitted by clean_caption ("doesn't" -> "doesnot").
# They carry an explicit "not", so they count as syntactic negation too.
contracted_negators = {
    "isnot", "arenot", "wasnot", "werenot", "donot", "doesnot", "didnot", "cannot",
    "couldnot", "willnot", "wouldnot", "shouldnot", "mustnot", "hadnot", "hasnot",
    "havenot", "mightnot", "neednot", "shallnot",
}
# Words that start with a negative-looking prefix but are not negations.
ignore_words = {"image", "images", "included", "include", "includes", "displays",
"introduces", "information", "individual", "individuals", "insert", "image”", "improve"}
# Prefixes treated as markers of morphological negation (a crude heuristic:
# any non-ignored word that starts with one of these is flagged).
prefixes = ("un", "dis", "in", "im", "ir", "il", "non", "mis")
# Words whose meaning itself expresses absence.
lexical_negators = {"without", "lack", "absent", "avoid", "missing"}


def is_syntactic_negation(caption):
    """Return True if any whitespace token is an explicit negator.

    Both plain negators ("not", "never", ...) and the fused contraction
    tokens listed in ``contracted_negators`` are recognised.
    """
    tokens = caption.lower().split()
    return any(
        word in syntactic_negators or word in contracted_negators
        for word in tokens
    )


def is_morphological_negation(caption):
    """Return True if any token starts with (and is longer than) a negative prefix.

    Tokens in ``ignore_words`` are skipped. Tokens in ``lexical_negators`` are
    skipped as well: otherwise "missing" would always match the "mis" prefix
    and could never reach the lexical/semantic bucket.
    """
    words = caption.lower().split()
    for word in words:
        if word in ignore_words or word in lexical_negators:
            continue
        if any(word.startswith(prefix) and len(word) > len(prefix) for prefix in prefixes):
            return True
    return False


def is_lexical_negation(caption):
    """Return True if any whitespace token is listed in ``lexical_negators``."""
    tokens = caption.lower().split()
    return any(word in lexical_negators for word in tokens)


def classify_negation(caption):
    """Assign a caption to one negation bucket.

    The checks run in priority order (syntactic, then morphological, then
    lexical), so a caption matching several categories gets the first one.

    Returns:
        One of "syntactic", "morphological", "lexical/semantic" or
        "unknown/pragmatic".
    """
    if is_syntactic_negation(caption):
        return "syntactic"
    elif is_morphological_negation(caption):
        return "morphological"
    elif is_lexical_negation(caption):
        return "lexical/semantic"
    else:
        return "unknown/pragmatic"
    
# Label every cleaned negated question with its negation bucket.
dev_df["negated_question_bucket"] = dev_df["negated_question_clean"].map(classify_negation)

# Preview the first five rows.
dev_df.head()

Unnamed: 0,index,category,image,original_question,negated_question,A,B,original_answer,negated_answer,negated_question_clean,negated_question_bucket
0,33,MMMU,<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=355x174 at 0x7C165942E990>,"In the phylogenetic tree shown, which organism is the most recent common ancestor of organisms 1 and 3? <image 1>","In the phylogenetic tree shown, which organism is not the most recent common ancestor of organisms 1 and 3?",Organism B,Organism A,A,B,in the phylogenetic tree shown which organism is not the most recent common ancestor of organisms 1 and 3,syntactic
1,36,MMMU,<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=819x687 at 0x7C1659465F10>,Works like the one (<image 1>) shown from Micronesia utilized abstract geometric lines and shapes because they functioned to record,Works like the one (<image 1>) shown from Micronesia did not utilize abstract geometric lines and shapes because they functioned to record,weather patterns,ocean currents and landforms,B,A,works like the one image 1 shown from micronesia did not utilize abstract geometric lines and shapes because they functioned to record,syntactic
2,48,MMMU,<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=500x426 at 0x7C1659466390>,"Refer to the figure <image 1>, which term refers to traditional folktales with a contemporary twist or a tale told from a new perspective, as seen in works like 'There's a Wolf at the Door' and 'The Adventures of the Dish and the Spoon'?","Refer to the figure <image 1>, which term does not refer to traditional folktales with a contemporary twist or a tale told from a new perspective, as seen in works like 'There's a Wolf at the Door' and 'The Adventures of the Dish and the Spoon'?",Classic Folktales,Fractured Folktales,B,A,refer to the figure image 1 which term does not refer to traditional folktales with a contemporary twist or a tale told from a new perspective as seen in works like there s a wolf at the door and the adventures of the dish and the spoon,syntactic
3,57,MMMU,<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1188x136 at 0x7C1659466990>,"In 2/4 time signature, which of the following is the correct notation? <image 1>","In 2/4 time signature, which of the following is not the correct notation? <image 1>",A measure with a half note and a quarter note,B,B,A,in 2 4 time signature which of the following is not the correct notation image 1,syntactic
4,99,MMMU,<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=810x628 at 0x7C1659466F90>,<image 1> Which of the following was another common criticism of Andrew Carnegie?,Which of the following was not another common criticism of Andrew Carnegie?,Carnegie systematically increased wages above industry standards to prevent labor unrest.,Carnegie did nothing to prevent the use of violence against his workers when they asked for better wages and working conditions.,B,A,which of the following was not another common criticism of andrew carnegie,syntactic


In [5]:
# Split dataset
negation_cols = [ 'negated_question_bucket']

syntactic_df = dev_df[dev_df[negation_cols].apply(lambda x: x.astype(str).str.contains("syntactic", case=False).any(), axis=1)]
morphological_df = dev_df[dev_df[negation_cols].apply(lambda x: x.astype(str).str.contains("morphological", case=False).any(), axis=1)]
lexical_semantic_df = dev_df[dev_df[negation_cols].apply(lambda x: x.astype(str).str.contains("lexical/semantic", case=False, regex=True).any(), axis=1)]

print(f"Syntactic: {syntactic_df.shape}")
print(f"Morphological: {morphological_df.shape}")
print(f"Lexical/Semantic: {lexical_semantic_df.shape}")


Syntactic: (970, 11)
Morphological: (5, 11)
Lexical/Semantic: (0, 11)


# Run models on each negation-type split

In [14]:
from tqdm import tqdm

from tqdm import tqdm
import torch

def evaluate_mcq_accuracy(df, model, processor, device):
    """
    Evaluate two-choice accuracy on NegVQA negated questions with a Hugging Face CLIP model.

    For every row the image is embedded once and compared, by cosine
    similarity, with the embeddings of answer texts 'A' and 'B'; the more
    similar choice is the prediction. Note that only the answer texts are
    scored -- the question text itself is not passed to the model.

    Metric: accuracy, i.e. the fraction of rows whose prediction equals
    'negated_answer'. (The original note says this mirrors the evaluation in
    "NegVQA: Can Vision Language Models Understand Negation".)

    Args:
        df (pd.DataFrame): Contains 'image', 'A', 'B', and 'negated_answer'.
            Any 'negated_answer' other than "A" is treated as "B".
        model: CLIPModel. It is switched to eval mode.
        processor: CLIPProcessor.
        device: torch.device.

    Returns:
        results_df (pd.DataFrame): Copy of `df` with added 'predicted_answer'
            and 'correct_prediction' columns.
        accuracy (float): Overall accuracy (0-1), or NaN when `df` has no rows.
    """
    predictions = []
    correct_flags = []

    # Same as the OpenCLIP / OpenAI-CLIP evaluators: make sure dropout etc. is off.
    model.eval()

    for _, row in tqdm(df.iterrows(), total=len(df), desc="Evaluating NegVQA MCQ"):
        choices = [row["A"], row["B"]]
        correct_index = 0 if row["negated_answer"] == "A" else 1

        # One image, two candidate texts: the image is encoded once rather
        # than once per choice.
        inputs = processor(
            text=choices,
            images=row["image"],
            return_tensors="pt",
            padding=True,
            truncation=True,
        ).to(device)

        with torch.no_grad():
            outputs = model(**inputs)
            # The single image embedding broadcasts against both text embeddings.
            sims = torch.nn.functional.cosine_similarity(outputs.image_embeds, outputs.text_embeds)

        pred_index = sims.argmax().item()
        predictions.append("A" if pred_index == 0 else "B")
        correct_flags.append(int(pred_index == correct_index))

    results_df = df.copy()
    results_df["predicted_answer"] = predictions
    results_df["correct_prediction"] = correct_flags

    # An empty split (e.g. a negation bucket with no rows) has no defined accuracy.
    accuracy = sum(correct_flags) / len(correct_flags) if correct_flags else float("nan")

    print(f"📊 NegVQA Accuracy: {accuracy*100:.2f}%")
    return results_df, accuracy



In [23]:
def evaluate_mcq_accuracy_v2(df, model, preprocess, tokenizer, device):
    """
    Evaluate two-choice accuracy for an OpenCLIP-style model on NegVQA.

    For every row the image is embedded once and compared, by cosine
    similarity, with the embeddings of answer texts 'A' and 'B'; the more
    similar choice is the prediction.

    Args:
        df (pd.DataFrame): Must include 'image', 'A', 'B', and 'negated_answer'.
            Any 'negated_answer' other than "A" is treated as "B".
        model: Object exposing `encode_image`, `encode_text` and `eval`
            (e.g. an OpenCLIP model). It is switched to eval mode.
        preprocess: Image preprocessing callable (e.g. from open_clip) that
            returns a tensor for one image.
        tokenizer: Callable mapping a list of strings to a token tensor
            (e.g. from `open_clip.get_tokenizer`).
        device: torch.device or device string.

    Returns:
        results_df (pd.DataFrame): Copy of `df` with added 'predicted_answer'
            and 'correct_prediction' columns.
        accuracy (float): Overall accuracy (0-1), or NaN when `df` has no rows.
    """
    predictions = []
    correct_flags = []

    model.eval()

    for _, row in tqdm(df.iterrows(), total=len(df), desc="Evaluating OpenCLIP MCQ"):
        # unsqueeze(0) adds a leading batch dimension of size 1.
        image_tensor = preprocess(row["image"]).unsqueeze(0).to(device)
        choices = [row["A"], row["B"]]
        correct_index = 0 if row["negated_answer"] == "A" else 1

        text_tokens = tokenizer(choices).to(device)

        with torch.no_grad():
            # Encode the image once; its single embedding broadcasts against
            # the embeddings of both choices below.
            image_features = model.encode_image(image_tensor)
            text_features = model.encode_text(text_tokens)

            image_features = image_features / image_features.norm(dim=-1, keepdim=True)
            text_features = text_features / text_features.norm(dim=-1, keepdim=True)

            # Dot product of unit vectors == cosine similarity, one value per choice.
            similarity = (image_features * text_features).sum(dim=-1)

        pred_index = similarity.argmax().item()
        predictions.append("A" if pred_index == 0 else "B")
        correct_flags.append(int(pred_index == correct_index))

    results_df = df.copy()
    results_df["predicted_answer"] = predictions
    results_df["correct_prediction"] = correct_flags

    # An empty split has no defined accuracy; avoid dividing by zero.
    accuracy = sum(correct_flags) / len(correct_flags) if correct_flags else float("nan")

    print(f"📊 OpenCLIP MCQ Accuracy: {accuracy*100:.2f}%")
    return results_df, accuracy


In [29]:
def evaluate_mcq_accuracy_v3(df, model, preprocess, device):
    """
    Evaluate two-choice accuracy using an OpenAI-style CLIP model (e.g. with a custom checkpoint).

    For every row the image is embedded once and compared, by cosine
    similarity, with the embeddings of answer texts 'A' and 'B'; the more
    similar choice is the prediction. Text is always tokenised with
    `clip.tokenize(..., truncate=True)`.

    Args:
        df (pd.DataFrame): Contains 'image' (PIL.Image), 'A', 'B', and
            'negated_answer'. Any 'negated_answer' other than "A" is treated as "B".
        model: Object exposing `encode_image`, `encode_text` and `eval`
            (e.g. from `clip.load`). It is switched to eval mode.
        preprocess: Image transform (from `clip.load`) returning a tensor for one image.
        device: 'cuda' or 'cpu'.

    Returns:
        results_df (pd.DataFrame): Copy of `df` with added 'predicted_answer'
            and 'correct_prediction' columns.
        accuracy (float): Overall accuracy (0-1), or NaN when `df` has no rows.
    """
    predictions = []
    correct_flags = []

    model.eval()

    for _, row in tqdm(df.iterrows(), total=len(df), desc="Evaluating CLIP MCQ"):
        # unsqueeze(0) adds a leading batch dimension of size 1.
        image_tensor = preprocess(row["image"]).unsqueeze(0).to(device)
        choices = [row["A"], row["B"]]
        correct_index = 0 if row["negated_answer"] == "A" else 1

        text_tokens = clip.tokenize(choices, truncate=True).to(device)

        with torch.no_grad():
            # Encode the image once; its single embedding broadcasts against
            # the embeddings of both choices below.
            image_features = model.encode_image(image_tensor)
            text_features = model.encode_text(text_tokens)

            image_features = image_features / image_features.norm(dim=-1, keepdim=True)
            text_features = text_features / text_features.norm(dim=-1, keepdim=True)

            # Dot product of unit vectors == cosine similarity, one value per choice.
            similarities = (image_features * text_features).sum(dim=-1)

        pred_index = similarities.argmax().item()
        predictions.append("A" if pred_index == 0 else "B")
        correct_flags.append(int(pred_index == correct_index))

    results_df = df.copy()
    results_df["predicted_answer"] = predictions
    results_df["correct_prediction"] = correct_flags

    # An empty split has no defined accuracy; avoid dividing by zero.
    accuracy = sum(correct_flags) / len(correct_flags) if correct_flags else float("nan")

    print(f"📊 CLIP MCQ Accuracy: {accuracy * 100:.2f}%")
    return results_df, accuracy


In [10]:
def load_clip_with_custom_checkpoint(model_name: str, checkpoint_path: str, device: str = "cuda"):
    """
    Load a CLIP model via `clip.load` and overwrite its weights from a custom checkpoint.

    The checkpoint is expected to be a mapping whose "model" entry is a state
    dict that matches the architecture exactly (strict loading).

    Args:
        model_name: Architecture name passed to `clip.load` (e.g. "ViT-B/32").
        checkpoint_path: Path to the checkpoint file.
        device: Device for both the checkpoint tensors and the final model.

    Returns:
        (model, preprocess): the model in eval mode, cast to float32, and the
        image transform returned by `clip.load`.
    """
    model, preprocess = clip.load(model_name, device=device)
    # map_location keeps a checkpoint saved on GPU loadable on a CPU-only host
    # (and matches load_negclip_checkpoint).
    # NOTE: weights_only=False unpickles arbitrary objects -- only use with
    # checkpoints from a trusted source.
    ckpt = torch.load(checkpoint_path, map_location=device, weights_only=False)
    model = model.float()
    model.load_state_dict(ckpt["model"])
    model = model.to(device)
    return model.eval(), preprocess

In [11]:
def load_negclip_checkpoint(model_name: str, checkpoint_path: str, device: str = "cuda"):
    """
    Load a CLIP model via `clip.load` and overlay weights from a NegCLIP-style checkpoint.

    The checkpoint is expected to be a mapping whose "state_dict" entry holds
    the weights. A single leading "model." or "module." prefix is removed from
    each key before loading. Loading is non-strict, so keys that do not line
    up are skipped; a warning is emitted when that happens so a checkpoint
    that matches nothing does not pass unnoticed.

    Args:
        model_name: Architecture name passed to `clip.load` (e.g. "ViT-B/32").
        checkpoint_path: Path to the checkpoint file.
        device: Device for both the checkpoint tensors and the final model.

    Returns:
        (model, preprocess): the model in eval mode and the image transform
        returned by `clip.load`.
    """
    import warnings

    model, preprocess = clip.load(model_name, device=device)

    # NOTE: weights_only=False unpickles arbitrary objects -- only use with
    # checkpoints from a trusted source.
    ckpt = torch.load(checkpoint_path, map_location=device, weights_only=False)
    state_dict = ckpt["state_dict"]  # extract the actual model weights

    # Clean up keys (remove one leading "model." or "module." prefix)
    cleaned_state_dict = {}
    for k, v in state_dict.items():
        new_key = k
        if k.startswith("model."):
            new_key = k[len("model."):]
        elif k.startswith("module."):
            new_key = k[len("module."):]
        cleaned_state_dict[new_key] = v

    # Non-strict load: mismatched keys are tolerated but reported.
    load_result = model.load_state_dict(cleaned_state_dict, strict=False)
    if load_result.missing_keys or load_result.unexpected_keys:
        warnings.warn(
            f"load_negclip_checkpoint: {len(load_result.missing_keys)} missing and "
            f"{len(load_result.unexpected_keys)} unexpected keys when loading {checkpoint_path!r}; "
            "parameters missing from the checkpoint keep the values from clip.load.",
            stacklevel=2,
        )

    model = model.to(device)
    return model.eval(), preprocess

## Clip ViT Base Patch32 

In [12]:
# CLIP ViT-B/32: model and matching processor loaded with `from_pretrained`.
hf_model_id = "openai/clip-vit-base-patch32"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
processor = CLIPProcessor.from_pretrained(hf_model_id)
model = CLIPModel.from_pretrained(hf_model_id).to(device)

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [15]:
# Score the current model on each evaluated negation bucket, in order.
for subset_label, subset_df in (
    ("syntactic", syntactic_df),
    # ("lexical/semantic", lexical_semantic_df),  # evaluation left disabled
    ("morphological", morphological_df),
):
    results_df, score = evaluate_mcq_accuracy(subset_df, model, processor, device)
    print(f"Negation Score on {subset_label} dataset: {score:.4f}")

Evaluating NegVQA MCQ: 100%|██████████| 970/970 [00:38<00:00, 24.98it/s]


📊 NegVQA Accuracy: 42.99%
Negation Score on syntactic dataset: 0.4299


Evaluating NegVQA MCQ: 100%|██████████| 5/5 [00:00<00:00, 32.17it/s]

📊 NegVQA Accuracy: 60.00%
Negation Score on morphological dataset: 0.6000





## Clip ViT Base Patch16

In [17]:
# CLIP ViT-B/16: model and matching processor loaded with `from_pretrained`.
hf_model_id = "openai/clip-vit-base-patch16"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
processor = CLIPProcessor.from_pretrained(hf_model_id)
model = CLIPModel.from_pretrained(hf_model_id).to(device)

In [18]:
# Score the current model on the syntactic and morphological buckets, in order.
for subset_label, subset_df in (("syntactic", syntactic_df), ("morphological", morphological_df)):
    results_df, score = evaluate_mcq_accuracy(subset_df, model, processor, device)
    print(f"\nNegation Score on {subset_label} dataset: {score:.4f}")

Evaluating NegVQA MCQ: 100%|██████████| 970/970 [00:49<00:00, 19.70it/s]


📊 NegVQA Accuracy: 40.00%

Negation Score on syntactic dataset: 0.4000


Evaluating NegVQA MCQ: 100%|██████████| 5/5 [00:00<00:00, 32.80it/s]

📊 NegVQA Accuracy: 80.00%

Negation Score on morphological dataset: 0.8000





## Clip ViT Large Patch14

In [19]:
# CLIP ViT-L/14: model and matching processor loaded with `from_pretrained`.
hf_model_id = "openai/clip-vit-large-patch14"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
processor = CLIPProcessor.from_pretrained(hf_model_id)
model = CLIPModel.from_pretrained(hf_model_id).to(device)

In [20]:
# Score the current model on the syntactic and morphological buckets, in order.
for subset_label, subset_df in (("syntactic", syntactic_df), ("morphological", morphological_df)):
    results_df, score = evaluate_mcq_accuracy(subset_df, model, processor, device)
    print(f"\nNegation Score on {subset_label} dataset: {score:.4f}")

Evaluating NegVQA MCQ: 100%|██████████| 970/970 [01:19<00:00, 12.28it/s]


📊 NegVQA Accuracy: 37.73%

Negation Score on syntactic dataset: 0.3773


Evaluating NegVQA MCQ: 100%|██████████| 5/5 [00:00<00:00, 18.37it/s]

📊 NegVQA Accuracy: 80.00%

Negation Score on morphological dataset: 0.8000





## LAION CLIP-ViT-H-14-laion2B-s32B-b79K model

In [21]:
# OpenCLIP ViT-H-14 with the laion2B-s32B-b79K pretrained tag.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_name = "ViT-H-14"
pretrained = "laion2B-s32B-b79K"

# The second returned transform is discarded; the third is kept as `preprocess`.
model, _, preprocess = open_clip.create_model_and_transforms(
    model_name=model_name,
    pretrained=pretrained,
    device=device,
)
model = model.to(device)

tokenizer = open_clip.get_tokenizer(model_name)

# Last expression: switches to eval mode and displays the module structure.
model.eval()

CLIP(
  (visual): VisionTransformer(
    (conv1): Conv2d(3, 1280, kernel_size=(14, 14), stride=(14, 14), bias=False)
    (patch_dropout): Identity()
    (ln_pre): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
    (transformer): Transformer(
      (resblocks): ModuleList(
        (0-31): 32 x ResidualAttentionBlock(
          (ln_1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1280, out_features=1280, bias=True)
          )
          (ls_1): Identity()
          (ln_2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1280, out_features=5120, bias=True)
            (gelu): GELU(approximate='none')
            (c_proj): Linear(in_features=5120, out_features=1280, bias=True)
          )
          (ls_2): Identity()
        )
      )
    )
    (ln_post): LayerNorm((1280,), eps=1e-05, elementwi

In [24]:
# Score the OpenCLIP model on the syntactic and morphological buckets, in order.
for subset_label, subset_df in (("syntactic", syntactic_df), ("morphological", morphological_df)):
    results_df, score = evaluate_mcq_accuracy_v2(subset_df, model, preprocess, tokenizer, device)
    print(f"\nNegation Score on {subset_label} dataset: {score:.4f}")

Evaluating OpenCLIP MCQ: 100%|██████████| 970/970 [01:32<00:00, 10.51it/s]


📊 OpenCLIP MCQ Accuracy: 34.64%

Negation Score on syntactic dataset: 0.3464


Evaluating OpenCLIP MCQ: 100%|██████████| 5/5 [00:00<00:00, 10.25it/s]

📊 OpenCLIP MCQ Accuracy: 20.00%

Negation Score on morphological dataset: 0.2000





## CoN-CLIP ViT-B/32

In [30]:
# CoN-CLIP ViT-B/32: base architecture name plus the checkpoint loaded into it.
model_name = "ViT-B/32"
checkpoint_path = "/workspace/Models/conclip/conclip_vit_b32.pt"
device = "cuda" if torch.cuda.is_available() else "cpu"

model, preprocess = load_clip_with_custom_checkpoint(
    model_name=model_name,
    checkpoint_path=checkpoint_path,
    device=device,
)

In [31]:
# Score the current model on the syntactic and morphological buckets, in order.
for subset_label, subset_df in (("syntactic", syntactic_df), ("morphological", morphological_df)):
    results_df, score = evaluate_mcq_accuracy_v3(subset_df, model, preprocess, device)
    print(f"\nNegation Score on {subset_label} dataset: {score:.4f}")

Evaluating CLIP MCQ: 100%|██████████| 970/970 [00:39<00:00, 24.66it/s]


📊 CLIP MCQ Accuracy: 39.90%

Negation Score on syntactic dataset: 0.3990


Evaluating CLIP MCQ: 100%|██████████| 5/5 [00:00<00:00, 28.03it/s]

📊 CLIP MCQ Accuracy: 80.00%

Negation Score on morphological dataset: 0.8000





## CoN-CLIP ViT-B/16

In [32]:
# CoN-CLIP ViT-B/16: base architecture name plus the checkpoint loaded into it.
model_name = "ViT-B/16"
checkpoint_path = "/workspace/Models/conclip/conclip_vit_b16.pt"
device = "cuda" if torch.cuda.is_available() else "cpu"

model, preprocess = load_clip_with_custom_checkpoint(
    model_name=model_name,
    checkpoint_path=checkpoint_path,
    device=device,
)

100%|███████████████████████████████████████| 335M/335M [00:05<00:00, 68.2MiB/s]


In [34]:
# Score the current model on the syntactic and morphological buckets, in order.
for subset_label, subset_df in (("syntactic", syntactic_df), ("morphological", morphological_df)):
    results_df, score = evaluate_mcq_accuracy_v3(subset_df, model, preprocess, device)
    print(f"\nNegation Score on {subset_label} dataset: {score:.4f}")

Evaluating CLIP MCQ: 100%|██████████| 970/970 [00:38<00:00, 24.89it/s]


📊 CLIP MCQ Accuracy: 38.25%

Negation Score on syntactic dataset: 0.3825


Evaluating CLIP MCQ: 100%|██████████| 5/5 [00:00<00:00, 25.80it/s]

📊 CLIP MCQ Accuracy: 40.00%

Negation Score on morphological dataset: 0.4000





## CoN-CLIP ViT-L/14

In [40]:
# CoN-CLIP ViT-L/14: base architecture name plus the checkpoint loaded into it.
model_name = "ViT-L/14"
checkpoint_path = "/workspace/Models/conclip/conclip_vit_l14.pt"
device = "cuda" if torch.cuda.is_available() else "cpu"

model, preprocess = load_clip_with_custom_checkpoint(
    model_name=model_name,
    checkpoint_path=checkpoint_path,
    device=device,
)

In [36]:
# Score the current model on the syntactic and morphological buckets, in order.
for subset_label, subset_df in (("syntactic", syntactic_df), ("morphological", morphological_df)):
    results_df, score = evaluate_mcq_accuracy_v3(subset_df, model, preprocess, device)
    print(f"\nNegation Score on {subset_label} dataset: {score:.4f}")

Evaluating CLIP MCQ: 100%|██████████| 970/970 [00:49<00:00, 19.47it/s]


📊 CLIP MCQ Accuracy: 37.73%

Negation Score on syntactic dataset: 0.3773


Evaluating CLIP MCQ: 100%|██████████| 5/5 [00:00<00:00, 21.05it/s]

📊 CLIP MCQ Accuracy: 60.00%

Negation Score on morphological dataset: 0.6000





## NegCLIP ICLR 2023

In [37]:
# NegCLIP (ICLR 2023 checkpoint) loaded on top of the ViT-B/32 architecture.
model_name = "ViT-B/32"
checkpoint_path = "/workspace/Models/negclip/negclip_iclr_2023.pth"
device = "cuda" if torch.cuda.is_available() else "cpu"

model, preprocess = load_negclip_checkpoint(
    model_name=model_name,
    checkpoint_path=checkpoint_path,
    device=device,
)

In [38]:
# Score the current model on the syntactic and morphological buckets, in order.
for subset_label, subset_df in (("syntactic", syntactic_df), ("morphological", morphological_df)):
    results_df, score = evaluate_mcq_accuracy_v3(subset_df, model, preprocess, device)
    print(f"\nNegation Score on {subset_label} dataset: {score:.4f}")

Evaluating CLIP MCQ: 100%|██████████| 970/970 [00:40<00:00, 23.89it/s]


📊 CLIP MCQ Accuracy: 42.06%

Negation Score on syntactic dataset: 0.4206


Evaluating CLIP MCQ: 100%|██████████| 5/5 [00:00<00:00, 26.01it/s]

📊 CLIP MCQ Accuracy: 80.00%

Negation Score on morphological dataset: 0.8000





## CLIP CC12M Negfull ViT-B/32 LR-1e-8

In [41]:
# CLIP CC12M "negfull" checkpoint loaded on top of the ViT-B/32 architecture.
model_name = "ViT-B/32"
checkpoint_path = "/workspace/Models/negclip/clip_cc12m_negfull_vit_b_32_lr1e-8_clw_0_99_mlw_0_01.pt"
device = "cuda" if torch.cuda.is_available() else "cpu"

model, preprocess = load_negclip_checkpoint(
    model_name=model_name,
    checkpoint_path=checkpoint_path,
    device=device,
)

In [42]:
# Score the current model on the syntactic and morphological buckets, in order.
for subset_label, subset_df in (("syntactic", syntactic_df), ("morphological", morphological_df)):
    results_df, score = evaluate_mcq_accuracy_v3(subset_df, model, preprocess, device)
    print(f"\nNegation Score on {subset_label} dataset: {score:.4f}")

Evaluating CLIP MCQ: 100%|██████████| 970/970 [00:40<00:00, 23.76it/s]


📊 CLIP MCQ Accuracy: 43.09%

Negation Score on syntactic dataset: 0.4309


Evaluating CLIP MCQ: 100%|██████████| 5/5 [00:00<00:00, 27.53it/s]

📊 CLIP MCQ Accuracy: 60.00%

Negation Score on morphological dataset: 0.6000





## NegCLIP CC12M Negfull ViT-B/32 LR-1e-8

In [43]:
# NegCLIP CC12M "negfull" checkpoint loaded on top of the ViT-B/32 architecture.
model_name = "ViT-B/32"
checkpoint_path = "/workspace/Models/negclip/negclip_cc12m_negfull_vit_b_32_lr1e-8_clw_0_99_mlw_0_01.pt"
device = "cuda" if torch.cuda.is_available() else "cpu"

model, preprocess = load_negclip_checkpoint(
    model_name=model_name,
    checkpoint_path=checkpoint_path,
    device=device,
)

In [44]:
# Score the current model on the syntactic and morphological buckets, in order.
for subset_label, subset_df in (("syntactic", syntactic_df), ("morphological", morphological_df)):
    results_df, score = evaluate_mcq_accuracy_v3(subset_df, model, preprocess, device)
    print(f"\nNegation Score on {subset_label} dataset: {score:.4f}")

Evaluating CLIP MCQ: 100%|██████████| 970/970 [00:40<00:00, 24.11it/s]


📊 CLIP MCQ Accuracy: 43.92%

Negation Score on syntactic dataset: 0.4392


Evaluating CLIP MCQ: 100%|██████████| 5/5 [00:00<00:00, 19.61it/s]

📊 CLIP MCQ Accuracy: 60.00%

Negation Score on morphological dataset: 0.6000





## CLIP-ViT-B-32-DataComp.S-s13M-b4K


In [45]:
# OpenCLIP DataComp ViT-B/32 model, its train/val transforms and its tokenizer,
# all resolved from the same hub identifier.
hub_model_id = 'hf-hub:laion/CLIP-ViT-B-32-DataComp.S-s13M-b4K'
device = "cuda" if torch.cuda.is_available() else "cpu"

model, preprocess_train, preprocess_val = open_clip.create_model_and_transforms(hub_model_id)
tokenizer = open_clip.get_tokenizer(hub_model_id)

model = model.to(device)
# Last expression: switches to eval mode and displays the module structure.
model.eval()

CLIP(
  (visual): VisionTransformer(
    (conv1): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32), bias=False)
    (patch_dropout): Identity()
    (ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (transformer): Transformer(
      (resblocks): ModuleList(
        (0-11): 12 x ResidualAttentionBlock(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ls_1): Identity()
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): GELU(approximate='none')
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ls_2): Identity()
        )
      )
    )
    (ln_post): LayerNorm((768,), eps=1e-05, elementwise_affine

In [46]:
# This model was created with open_clip, so evaluate it with the OpenCLIP
# evaluator and the transform/tokenizer that belong to it. The earlier version
# passed `preprocess`, a variable left over from a previously loaded model,
# and tokenised with `clip.tokenize` via evaluate_mcq_accuracy_v3.
results_df, score = evaluate_mcq_accuracy_v2(syntactic_df, model, preprocess_val, tokenizer, device)
print(f"\nNegation Score on syntactic dataset: {score:.4f}")

results_df, score = evaluate_mcq_accuracy_v2(morphological_df, model, preprocess_val, tokenizer, device)
print(f"\nNegation Score on morphological dataset: {score:.4f}")

Evaluating CLIP MCQ: 100%|██████████| 970/970 [00:37<00:00, 25.93it/s]


📊 CLIP MCQ Accuracy: 47.32%

Negation Score on syntactic dataset: 0.4732


Evaluating CLIP MCQ: 100%|██████████| 5/5 [00:00<00:00, 39.38it/s]

📊 CLIP MCQ Accuracy: 60.00%

Negation Score on morphological dataset: 0.6000



