In [None]:
###### FOR PACE ICE - replace GT username below ######
# Change the notebook's working directory to the user's scratch directory,
# then print the current directory so the change is visible in the cell output.
%cd /home/hice1/nbalakrishna3/scratch
!pwd

In [None]:
import os
import json
import base64
from openai import OpenAI
from anthropic import Anthropic, HUMAN_PROMPT, AI_PROMPT
from dotenv import load_dotenv
from tqdm import tqdm

In [None]:
# ---- API credentials ----
# Keys are read from the environment so real secrets never have to be pasted
# into (and accidentally committed with) the notebook. When a variable is not
# set, the original "ENTER-KEY-HERE" placeholder is kept as the fallback value.
# To load the variables from a local .env file, uncomment the next line:
# load_dotenv()
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "ENTER-KEY-HERE")
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "ENTER-KEY-HERE")

In [None]:
# Fail fast when a key is missing or still holds the template placeholder.
# The placeholder string is truthy, so a plain `if not KEY` test never fires;
# both providers are checked because both clients are used by the pipeline.
_PLACEHOLDER_KEY = "ENTER-KEY-HERE"
for _key_name, _key_value in (
    ("OPENAI_API_KEY", OPENAI_API_KEY),
    ("ANTHROPIC_API_KEY", ANTHROPIC_API_KEY),
):
    if not _key_value or _key_value == _PLACEHOLDER_KEY:
        raise ValueError(
            f"{_key_name} is not set: provide a real API key before running the pipeline"
        )

In [None]:
# ---- Pipeline configuration ----

# Folder scanned for input images, and the JSONL file the results are written to.
IMAGE_FOLDER = "pipeline_images"
OUTPUT_PATH = "captions.jsonl"

# Model identifiers. GPT_MODEL is the model used to generate captions/questions.
# NOTE(review): CLAUDE_MODEL is not referenced by the visible judge call, which
# names its model directly - confirm whether it is used elsewhere.
GPT_MODEL = "gpt-4.1-mini"
CLAUDE_MODEL = "claude-3-5-sonnet-20241022"

# Passed as max_output_tokens to the OpenAI request in generate_questions.
MAX_OUTPUT = 200

In [None]:
# Create one API client per provider, authenticated with the configured keys.
openai_client = OpenAI(api_key=OPENAI_API_KEY)
anthropic_client = Anthropic(api_key=ANTHROPIC_API_KEY)
# Prints the result of listing models through the Anthropic client.
# NOTE(review): presumably a quick credential/connectivity check - confirm it
# is still wanted, since it issues a request every time this cell runs.
print(anthropic_client.models.list())

In [None]:
def encode_image(image_path):
    """Read the file at ``image_path`` and return its bytes as a base64 string.

    Args:
        image_path: Path of the file to read in binary mode.

    Returns:
        The base64 encoding of the file contents, decoded to a UTF-8 ``str``.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")

In [None]:
def _image_mime_from_base64(base64_image):
    """Return "image/png" when the payload starts with the PNG signature, else "image/jpeg"."""
    # The first bytes of the 8-byte PNG signature (\x89PNG\r\n\x1a\n) always
    # base64-encode to this prefix. Anything else keeps the original JPEG label.
    if base64_image.startswith("iVBORw0KGg"):
        return "image/png"
    return "image/jpeg"


def _parse_json_object(raw_text):
    """Parse the outermost ``{...}`` span of ``raw_text`` as JSON, ignoring text around it."""
    start = raw_text.find("{")
    end = raw_text.rfind("}")
    if start == -1 or end < start:
        raise ValueError(f"No JSON object found in model output: {raw_text!r}")
    return json.loads(raw_text[start:end + 1])


def generate_questions(base64_image):
    """Ask the OpenAI model for captions and ladder questions for one image.

    Sends the instruction prompt together with the image (as a base64 data URL)
    through ``openai_client.responses.create`` using ``GPT_MODEL`` and
    ``MAX_OUTPUT``, then parses the reply as JSON.

    Args:
        base64_image: Base64-encoded image bytes (``str``).

    Returns:
        The parsed JSON object. The prompt requests the keys
        ``correct_caption``, ``incorrect_caption``, ``L0``, ``L1``, ``L2``, ``L3``;
        the reply is not validated against that list here.

    Raises:
        ValueError: If the reply contains no JSON object or the JSON is
            malformed (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    prompt = """
You are preparing controlled experimental materials for multimodal evaluation.

Given the IMAGE (provided separately), generate the following:

----------------------------------------------------------------------
1. A correct caption
----------------------------------------------------------------------
• Must accurately describe the visible scene.
• 7–15 words, objective, simple, and factual.
• Must mention the main subject(s) and one key attribute
  (e.g., species, color, object type, action, or spatial relation).
• Should be worded well and clearly. 

----------------------------------------------------------------------
2. A deliberately incorrect caption
----------------------------------------------------------------------
• Must keep the same length and sentence structure style as the correct caption.
• MUST change EXACTLY TWO meaningful visual attributes from the correct caption.
  Allowed attribute types:
     – species/category of the main object
     – color of a main object
     – pattern/texture of a main object
     – object type that a person is holding/using
     – action the main subject is performing
     – spatial relation (e.g., “in front of” → “behind”)
     
• The incorrect caption MUST be **factually wrong for THIS image**.
  It should contradict TWO concrete visual facts visible in the picture, not merely
  describe an alternative plausible real-world scenario -- VERY IMPORTANT!
  (Example: If the scene shows a lake, “ocean” is *not* allowed because both can
   coexist conceptually; the changed attributes must be unambiguously false.)

• The incorrect caption must remain syntactically valid and plausible for the 
  kind of world the image depicts, but factually wrong.

• The two changed attributes MUST be *the most visually important attributes*
  from the correct caption.

----------------------------------------------------------------------
3. Four Visual Necessity Ladder (VNL) questions (L0–L3)
----------------------------------------------------------------------

L0 — Pure language prior  
• Must be answerable with NO access to the image.  
• General world knowledge only; do NOT reference animals, people,
  objects, nature, or environments.  
• 6–14 words.

L1 — Probe changed attribute #1 
• MUST directly probe the FIRST changed attribute from the incorrect caption.  
• Example: If species changed, ask “What type of animal…?”  
           If color changed, ask “What color is…?”  
           If object type changed, ask “What object is… holding?”  
• No attributes other than the first changed one.  
• 6–14 words.

L2 — Probe changed attribute #2
• MUST directly probe the SECOND changed attribute from the incorrect caption.  
• Same rules as L1 but targeting the second changed detail.  
• Should not be the same question as L1. 
• 6–14 words.

L3 — High-level reasoning
• Ask a reasoning question that is loosely related to the scene shown in the image.
• The question MUST NOT depend on the two changed attributes.
• The question MUST NOT target the same object/attribute as L1 or L2.
• The question SHOULD require general common-sense or contextual reasoning.
• The question SHOULD still be answerable using the image (but only its general context, not the altered details).
• 6–14 words.

----------------------------------------------------------------------
GENERAL RULES
----------------------------------------------------------------------
• Do NOT provide answers.
• Do NOT describe the image outside captions.
• All questions must be 6–14 words.
• Output MUST be a single JSON object in the exact format below.
• Output raw JSON only: no Markdown code fences and no comments inside the JSON.

----------------------------------------------------------------------
Return EXACTLY this JSON structure
(L1 targets changed attribute #1; L2 targets changed attribute #2):
----------------------------------------------------------------------
{
  "correct_caption": "<string>",
  "incorrect_caption": "<string>",
  "L0": "<string>",
  "L1": "<string>",
  "L2": "<string>",
  "L3": "<string>"
}


"""
    # Label the data URL with the type the bytes actually have; the pipeline
    # feeds both JPEG and PNG files through this function.
    mime_type = _image_mime_from_base64(base64_image)

    response = openai_client.responses.create(
        model=GPT_MODEL,
        max_output_tokens=MAX_OUTPUT,
        input=[
            {
                "role": "user",
                "content": [
                    {"type": "input_text", "text": prompt},
                    {
                        "type": "input_image",
                        "image_url": f"data:{mime_type};base64,{base64_image}"
                    }
                ]
            }
        ]
    )

    # Models sometimes wrap JSON in ```json fences or add a lead-in sentence;
    # parse only the outermost {...} span so such replies are still accepted.
    return _parse_json_object(response.output_text)

In [None]:
def ask_blip2(image_path, caption, question, max_new_tokens=50):
    """Answer ``question`` about an image with the globally loaded BLIP-2 model.

    The text prompt has the form ``"<caption>\\n\\nQuestion: <question>\\nAnswer:"``
    and is passed to ``processor`` together with the image; ``model.generate``
    then produces the answer with greedy decoding.

    Args:
        image_path: Path of the image file to open.
        caption: Caption text placed before the question in the prompt.
        question: Question text.
        max_new_tokens: Upper bound on generated tokens, forwarded to ``generate``.

    Returns:
        The decoded answer with surrounding whitespace removed. If the decoded
        text contains ``"Answer:"``, only the part after its first occurrence
        is returned.
    """
    # ---- Load image ----
    # Open through a context manager so the file handle is released right
    # away; convert() returns a separate image object that stays usable.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert("RGB")

    # ---- Build prompt for BLIP-2 ----
    prompt = f"{caption}\n\nQuestion: {question}\nAnswer:"

    # ---- Preprocess ----
    inputs = processor(
        image,
        prompt,
        return_tensors="pt"
    ).to(model.device, dtype=torch.float16)

    # ---- Generate answer ----
    # do_sample=False selects greedy decoding, which is already deterministic.
    # temperature only matters when sampling, so it is no longer passed here
    # (passing 0.0 alongside do_sample=False merely produced a config warning).
    output_ids = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=False
    )

    # ---- Decode ----
    answer = processor.tokenizer.decode(
        output_ids[0],
        skip_special_tokens=True
    )

    # ---- Strip the prompt part in case the decoded text echoes the input ----
    if "Answer:" in answer:
        answer = answer.split("Answer:", 1)[-1]

    return answer.strip()

In [None]:
###### FOR PACE ICE ONLY - replace GT username below ######

# Redirect the Hugging Face caches (models + datasets) to the scratch
# directory instead of the home directory.
os.environ.update({
    "HF_HOME": "/home/hice1/nbalakrishna3/scratch/huggingface",
    "HF_DATASETS_CACHE": "/home/hice1/nbalakrishna3/scratch/hf_cache",
    "TRANSFORMERS_CACHE": "/home/hice1/nbalakrishna3/scratch/hf_cache",
})

In [None]:
from transformers import Blip2Processor, Blip2ForConditionalGeneration
import torch
from PIL import Image

In [None]:
# Report whether torch can see a CUDA device.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("✅ Using device:", device)

# "Salesforce/blip2-flan-t5-xxl" -> bigger, might need 2x memory
MODEL_NAME = "Salesforce/blip2-flan-t5-xl"

# Load the processor and the half-precision model from the same checkpoint.
# Placement of the weights is delegated to device_map="auto".
processor = Blip2Processor.from_pretrained(MODEL_NAME)
model = Blip2ForConditionalGeneration.from_pretrained(
    MODEL_NAME, torch_dtype=torch.float16, device_map="auto"
)
model.eval()  # switch to evaluation mode
print("✅ Model loaded!")

In [None]:
def create_eval_prompt(caption, condition, question, model_answer):
    """Build the judge prompt for scoring one model answer.

    Args:
        caption: Caption that was given to the answering model (context only).
        condition: Label of the caption condition being scored.
        question: Question that was asked.
        model_answer: Answer produced by the model under evaluation.

    Returns:
        The instruction text with the four values filled in; it asks the judge
        to reply with a single character, "0" or "1".
    """
    template = """
You are an automated scoring module. You MUST output ONLY a single character: "0" or "1".
NEVER output words, sentences, explanations, punctuation, or reasoning.

YOUR TASK:
Judge whether the MODEL_ANSWER correctly matches what is visually true in the IMAGE.

IMPORTANT:
• The CAPTION was only input context for another model. Do NOT treat it as truth.
• Ignore any misleading or incorrect information in the CAPTION.
• Use ONLY the IMAGE + the QUESTION + world knowledge to judge correctness.
• If the MODEL_ANSWER matches the visible truth in the image, output "1".
• If the MODEL_ANSWER is wrong, unsupported, or contradicts the image, output "0".
• Your ENTIRE reply MUST be exactly one character: "0" or "1".

Now evaluate:

CAPTION (context only): {caption}
CONDITION: {condition}
QUESTION: {question}
MODEL_ANSWER: {model_answer}

Reply with ONLY "0" or "1".
"""
    # Substituted values are inserted verbatim; braces inside them are not re-parsed.
    return template.format(
        caption=caption,
        condition=condition,
        question=question,
        model_answer=model_answer,
    )

In [None]:
def _detect_image_media_type(img_bytes):
    """Return the media type matching the image's leading magic bytes ("image/jpeg" by default)."""
    if img_bytes.startswith(b"\x89PNG\r\n\x1a\n"):
        return "image/png"
    if img_bytes.startswith((b"GIF87a", b"GIF89a")):
        return "image/gif"
    if img_bytes[:4] == b"RIFF" and img_bytes[8:12] == b"WEBP":
        return "image/webp"
    return "image/jpeg"


def eval_answer(image_path, caption, condition, question, model_answer,
                judge_model="claude-sonnet-4-5-20250929"):
    """Score one model answer against the image using Claude as the judge.

    Builds the judge prompt with ``create_eval_prompt``, sends it together with
    the base64-encoded image through ``anthropic_client.messages.create`` and
    converts the reply to an integer.

    Args:
        image_path: Path of the image file shown to the judge.
        caption: Caption that was given to the answering model.
        condition: Label of the caption condition being scored.
        question: Question that was asked.
        model_answer: Answer to be judged.
        judge_model: Anthropic model identifier used as the judge.

    Returns:
        ``1`` if the judge replied "1", ``0`` if it replied "0".

    Raises:
        ValueError: If the reply is empty or is anything other than "0"/"1"
            (surrounding whitespace, quotes, backticks and a trailing period
            are tolerated).
    """
    # ---- Build prompt ----
    prompt = create_eval_prompt(caption, condition, question, model_answer)

    # ---- Encode image ----
    with open(image_path, "rb") as f:
        img_bytes = f.read()
    b64img = base64.b64encode(img_bytes).decode("utf-8")

    # The declared media type must match the actual bytes. The pipeline feeds
    # .png files as well as JPEGs, so detect the type instead of assuming JPEG.
    media_type = _detect_image_media_type(img_bytes)

    # ---- Call Claude ----
    response = anthropic_client.messages.create(
        model=judge_model,
        max_tokens=5,
        temperature=0,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "source": {
                            "type": "base64",
                            "media_type": media_type,
                            "data": b64img
                        }
                    },
                    {
                        "type": "text",
                        "text": prompt
                    }
                ]
            }
        ]
    )

    # ---- Parse output ----
    if not response.content:
        raise ValueError("Unexpected Claude judge output: <empty response>")

    raw_output = response.content[0].text.strip()
    # The prompt itself writes the digits in quotes, so accept a reply that
    # mirrors that decoration (e.g. '"1"' or '1.') rather than discarding it.
    output = raw_output.strip("\"'`.").strip()

    if output not in ("0", "1"):
        raise ValueError(f"Unexpected Claude judge output: {raw_output}")

    return int(output)

In [None]:
# Used to set up eval metric calculation

def pair_stats_by_level(jsonl_path):
    """Tally (correct-caption score, incorrect-caption score) pairs per level.

    Args:
        jsonl_path: Path of a JSONL file in which every record has an
            ``eval_scores`` mapping with, for each of ``L0``..``L3``, the keys
            ``correct_caption_score`` and ``incorrect_caption_score``.

    Returns:
        ``{level: {(1, 1): n, (1, 0): n, (0, 1): n, (0, 0): n}}`` where each
        key is ``(correct_caption_score, incorrect_caption_score)``.
    """
    levels = ["L0", "L1", "L2", "L3"]

    # Tallies per level
    pair_stats = {
        lvl: {(1, 1): 0, (1, 0): 0, (0, 1): 0, (0, 0): 0}
        for lvl in levels
    }

    # ---- Single JSONL pass ----
    # The file is written as UTF-8 with ensure_ascii=False, so it must be read
    # back as UTF-8 rather than with the platform's default encoding.
    with open(jsonl_path, "r", encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline or manual edits).
            if not line.strip():
                continue

            eval_scores = json.loads(line)["eval_scores"]

            for lvl in levels:
                s_c = eval_scores[lvl]["correct_caption_score"]
                s_i = eval_scores[lvl]["incorrect_caption_score"]
                pair_stats[lvl][(s_c, s_i)] += 1

    return pair_stats

def conf_pairs_by_level(pair_stats):
    """Return ``pair_stats`` unchanged (the same object, not a copy).

    The per-level tallies already have the shape of a confusion matrix, so no
    transformation is applied.
    """
    confusion_by_level = pair_stats
    return confusion_by_level

In [None]:
# Eval Metric - fooling rate by level
# A pair counts as "fooled" when the answer is scored correct under the correct
# caption but incorrect under the incorrect caption, i.e. the (1, 0) cell.

def fooling_rate_by_level(pair_stats):
    """Compute the share of (1, 0) pairs among all pairs for every level.

    Args:
        pair_stats: ``{level: {(score_correct, score_incorrect): count}}``.

    Returns:
        ``{level: {"fooled": count of (1, 0), "total": sum of all counts,
        "rate": fooled / total}}``; the rate is ``0`` when total is ``0``.
    """
    def _summarize(tally):
        n_fooled = tally[(1, 0)]
        n_pairs = sum(tally.values())
        if n_pairs > 0:
            fraction = n_fooled / n_pairs
        else:
            fraction = 0
        return {"fooled": n_fooled, "total": n_pairs, "rate": fraction}

    return {level: _summarize(tally) for level, tally in pair_stats.items()}

In [None]:
# Eval Metrics - per-level answer accuracy under each caption condition, plus
# MDI (accuracy with the correct caption minus accuracy with the incorrect one).

def acc_mdi_by_level(pair_stats):
    """Derive per-level accuracies and MDI from the pair tallies.

    Args:
        pair_stats: ``{level: {(score_correct, score_incorrect): count}}`` with
            the four keys ``(1, 1)``, ``(1, 0)``, ``(0, 1)``, ``(0, 0)``.

    Returns:
        ``{level: summary}`` where ``summary`` is ``None`` for a level whose
        four counts sum to zero, otherwise a dict with
        ``accuracy_correct_caption``, ``accuracy_incorrect_caption`` and ``MDI``.
    """
    summary_by_level = {}

    for level, tally in pair_stats.items():
        both_right = tally[(1, 1)]
        only_correct_cond = tally[(1, 0)]
        only_incorrect_cond = tally[(0, 1)]
        both_wrong = tally[(0, 0)]
        n_pairs = both_right + only_correct_cond + only_incorrect_cond + both_wrong

        if n_pairs == 0:
            summary_by_level[level] = None
        else:
            # First score is 1: answer judged correct with the correct caption.
            acc_with_correct = (both_right + only_correct_cond) / n_pairs
            # Second score is 1: answer judged correct with the incorrect caption.
            acc_with_incorrect = (both_right + only_incorrect_cond) / n_pairs

            summary_by_level[level] = {
                "accuracy_correct_caption": acc_with_correct,
                "accuracy_incorrect_caption": acc_with_incorrect,
                "MDI": acc_with_correct - acc_with_incorrect,
            }

    return summary_by_level

In [None]:
from concurrent.futures import ThreadPoolExecutor, as_completed

def generate_BLIP_outputs():
    """Run the full BLIP-2 pipeline over ``IMAGE_FOLDER`` and write ``OUTPUT_PATH``.

    For every ``.jpg``/``.jpeg``/``.png`` file (processed in sorted order):
      1. ``generate_questions`` produces the two captions and the L0-L3 questions.
      2. ``ask_blip2`` answers each question under the correct caption, then
         under the incorrect caption.
      3. ``eval_answer`` scores the eight answers in a thread pool.
      4. One JSON line is written with the keys ``image_id``, ``captions``,
         ``questions``, ``answers`` and ``eval_scores``.

    An image whose processing raises is reported and skipped (best effort); no
    line is written for it. ``OUTPUT_PATH`` is overwritten on every call.

    Returns:
        None.
    """
    levels = ("L0", "L1", "L2", "L3")
    # (key in "captions", key in "answers", condition label passed to the judge)
    conditions = (
        ("correct", "correct_caption", "correct caption condition"),
        ("incorrect", "incorrect_caption", "incorrect caption condition"),
    )

    # os.listdir returns entries in arbitrary order; sort so repeated runs
    # process images (and write records) in the same order.
    image_files = sorted(
        f for f in os.listdir(IMAGE_FOLDER)
        if f.lower().endswith((".jpg", ".jpeg", ".png"))
    )

    print(f"Found {len(image_files)} images.\n")

    failed_ids = []

    with open(OUTPUT_PATH, "w", encoding="utf-8") as out:
        for img_file in tqdm(image_files, desc="Processing"):
            image_id = os.path.splitext(img_file)[0]
            path = os.path.join(IMAGE_FOLDER, img_file)

            try:
                # ---- 1) GPT captions + questions ----
                q = generate_questions(encode_image(path))

                captions = {
                    "correct": q["correct_caption"],
                    "incorrect": q["incorrect_caption"],
                }
                questions = {lvl: q[lvl] for lvl in levels}

                # ---- 2) BLIP-2 answers (all levels per caption condition) ----
                answers = {
                    answer_key: {
                        lvl: ask_blip2(path, captions[caption_key], questions[lvl])
                        for lvl in levels
                    }
                    for caption_key, answer_key, _label in conditions
                }

                # ---- 3) Parallel Claude evaluation ----
                # Futures are keyed by (level, condition) so each score is
                # matched to its slot by name rather than by list position.
                with ThreadPoolExecutor(max_workers=8) as ex:
                    futures = {
                        (lvl, answer_key): ex.submit(
                            eval_answer,
                            path,
                            captions[caption_key],
                            label,
                            questions[lvl],
                            answers[answer_key][lvl],
                        )
                        for lvl in levels
                        for caption_key, answer_key, label in conditions
                    }
                    scores = {key: fut.result() for key, fut in futures.items()}

                # ---- 4) Assemble the record ----
                output = {
                    "image_id": image_id,
                    "captions": captions,
                    "questions": questions,
                    "answers": answers,
                    "eval_scores": {
                        lvl: {
                            "correct_caption_score": scores[(lvl, "correct_caption")],
                            "incorrect_caption_score": scores[(lvl, "incorrect_caption")],
                        }
                        for lvl in levels
                    },
                }

                # ---- 5) Write one JSON line ----
                out.write(json.dumps(output, ensure_ascii=False) + "\n")
                # Flush per record so finished work survives an interrupted run.
                out.flush()

            except Exception as e:
                # Best effort: report the failure and continue with the next image.
                failed_ids.append(image_id)
                print(f"\nError with {image_id}: {e}")

    if failed_ids:
        print(f"\n{len(failed_ids)} image(s) failed and were skipped: {', '.join(failed_ids)}")

    print(f"\nDone. JSONL saved to: {OUTPUT_PATH}\n")

In [None]:
if __name__ == "__main__":

    ######### BLIP #########

    # Generates dataset used (correct/incorrect captions, L0-L3 questions)
    # Evaluates BLIP-2 responses via Claude Sonnet 4.5 (0 - incorrect; 1 - correct)
    generate_BLIP_outputs()

    # Compute metrics for BLIP responses
    BLIP_pair_stats = pair_stats_by_level(OUTPUT_PATH)
    BLIP_fooling_rate_per_level = fooling_rate_by_level(BLIP_pair_stats)
    BLIP_acc_mdi_per_level = acc_mdi_by_level(BLIP_pair_stats)

    print("\n========================")
    print("FOOLING RATE PER LEVEL")
    print("========================\n")
    for lvl, stats in BLIP_fooling_rate_per_level.items():
        print(f"{lvl}: Fooling Rate = {stats['fooled']}/{stats['total']} "
              f"({stats['rate']:.2f})")

    print("\n========================")
    print("ACCURACY + MDI PER LEVEL")
    print("========================\n")
    for lvl, stats in BLIP_acc_mdi_per_level.items():
        # acc_mdi_by_level yields None for a level with no scored pairs (e.g.
        # every image failed); report that instead of subscripting None.
        if stats is None:
            print(f"{lvl}:  no scored examples")
            continue
        print(f"{lvl}:  "
            f"Acc(correct caption) = {stats['accuracy_correct_caption']:.2f},  "
            f"Acc(incorrect caption) = {stats['accuracy_incorrect_caption']:.2f},  "
            f"MDI = {stats['MDI']:.2f}")
         
    ######## LLAVA ########