In [None]:
###### FOR PACE ICE - replace GT username below ######
# Switch the kernel's working directory to the scratch space and print it.
# The relative paths defined later in this notebook (image folder, output files)
# are resolved against this directory.
%cd /home/hice1/nbalakrishna3/scratch
!pwd

In [None]:
import os
import json
import base64
from openai import OpenAI
from anthropic import Anthropic, HUMAN_PROMPT, AI_PROMPT
import google.generativeai as genai
from dotenv import load_dotenv
from tqdm import tqdm

In [None]:
# Read the API keys from the environment (optionally populated from a local .env
# file) instead of pasting secrets into the notebook, where they would be saved
# and shared together with it. A missing key falls back to "" so the validation
# cell below can report it.
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY", "")
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "")

In [None]:
# Fail fast when no OpenAI key is configured.
# Only the OpenAI key is validated here; the Anthropic and Gemini keys are not checked.
if not OPENAI_API_KEY:
    raise ValueError(" OPENAI_API_KEY not found in .env file")

In [None]:
# Folder holding the input images, relative to the current working directory.
IMAGE_FOLDER = "datasets/coco/images/train2017"

# Output files (JSON Lines, judging by the extension) for the Qwen runs of
# experiment 1 and experiment 2 in the "single" setting.
# NOTE(review): "single"/"multi" presumably mean single-turn vs multi-turn — confirm
# against the code that writes these files.
QWEN_EXP1_SINGLE_OUTPUT_PATH = "qwen_exp1_single_responses.jsonl"
QWEN_EXP2_SINGLE_OUTPUT_PATH = "qwen_exp2_single_responses.jsonl"

# Same as above for the "multi" setting.
QWEN_EXP1_MULTI_OUTPUT_PATH = "qwen_exp1_multi_responses.jsonl"
QWEN_EXP2_MULTI_OUTPUT_PATH = "qwen_exp2_multi_responses.jsonl"

# Model used by generate_questions_exp1 / generate_questions_exp2.
GPT_MODEL = "gpt-4.1-mini"
# Claude model name; not referenced by the code visible in this part of the notebook.
CLAUDE_MODEL = "claude-3-5-sonnet-20241022"
# Unlike the two names above, this is a GenerativeModel instance rather than a string.
# It is constructed before genai.configure(...) runs in a later cell.
GOOGLE_MODEL = genai.GenerativeModel('models/gemini-2.0-flash')
# Passed as max_output_tokens to the OpenAI question-generation calls.
MAX_OUTPUT = 200

In [None]:
# Build the API clients from the keys defined above.
openai_client = OpenAI(api_key=OPENAI_API_KEY)
anthropic_client = Anthropic(api_key=ANTHROPIC_API_KEY)
# The Gemini SDK is configured globally instead of through a client object.
genai.configure(api_key=GEMINI_API_KEY)

# Prints the models reported by the Anthropic client.
# NOTE(review): presumably a quick credential/connectivity check — confirm it is still wanted.
print(anthropic_client.models.list())

In [None]:
def encode_image(image_path):
    """Read the file at `image_path` and return its bytes as a base64 text string."""
    with open(image_path, mode="rb") as image_file:
        raw_bytes = image_file.read()
    encoded_bytes = base64.b64encode(raw_bytes)
    return str(encoded_bytes, "utf-8")

In [None]:
def generate_questions_exp1(base64_image):
    """
    Ask the OpenAI model for the experiment-1 materials of one image.

    base64_image : base64-encoded image data; it is sent as a JPEG data URL.

    Returns:
        dict parsed from the model's JSON reply. The prompt requests the keys
        "correct_caption", "incorrect_caption", "L0", "L1", "L2" and "L3";
        the reply is not validated against that list.

    Raises:
        json.JSONDecodeError if the reply is not valid JSON (for example when it
        was cut off by the max_output_tokens limit).
    """
    prompt = """
You are preparing controlled experimental materials for multimodal evaluation.

Given the IMAGE (provided separately), generate the following:

----------------------------------------------------------------------
1. A correct caption
----------------------------------------------------------------------
• Must accurately describe the visible scene.
• 7–15 words, objective, simple, and factual.
• Must mention the main subject(s) and one key attribute
  (e.g., species, color, object type, action, or spatial relation).
• Should be worded well and clearly. 

----------------------------------------------------------------------
2. A deliberately incorrect caption
----------------------------------------------------------------------
• Must keep the same length and sentence structure style as the correct caption.
• MUST change EXACTLY TWO meaningful visual attributes from the correct caption.
  Allowed attribute types:
     – species/category of the main object
     – color of a main object
     – pattern/texture of a main object
     – object type that a person is holding/using
     – action the main subject is performing
     – spatial relation (e.g., “in front of” → “behind”)
     
• The incorrect caption MUST be **factually wrong for THIS image**.
  It should contradict TWO concrete visual facts visible in the picture, not merely
  describe an alternative plausible real-world scenario -- VERY IMPORTANT!
  (Example: If the scene shows a lake, “ocean” is *not* allowed because both can
   coexist conceptually; the changed attributes must be unambiguously false.)

• The incorrect caption must remain syntactically valid and plausible for the 
  kind of world the image depicts, but factually wrong.

• The two changed attributes MUST be *the most visually important attributes*
  from the correct caption.

----------------------------------------------------------------------
3. Four Visual Necessity Ladder (VNL) questions (L0–L3)
----------------------------------------------------------------------

L0 — Pure language prior  
• Must be answerable with NO access to the image.  
• General world knowledge only; do NOT reference animals, people,
  objects, nature, or environments.  
• 6–14 words.

L1 — Probe changed attribute #1 
• MUST directly probe the FIRST changed attribute from the incorrect caption.  
• Do NOT explicitly mention the changed attribute in the question (may reference attribute category though). 
• Example:If species changed, ask “What type of animal…?”  
          If color changed, ask “What color is…?”  
          If object type changed, ask “What object is… holding?”  
• No attributes other than the first changed one.  
• 6–14 words.

L2 — Probe changed attribute #2
• MUST directly probe the SECOND changed attribute from the incorrect caption.  
• Do NOT explicitly mention the changed attribute in the question (may reference attribute category though). 
• Same rules as L1 but targeting the second changed detail.  
• Should not be the same question as L1. 
• 6–14 words.

L3 — High-level reasoning
• Ask a reasoning question that is loosely related to the scene shown in the image.
• The question MUST NOT depend on the two changed attributes.
• The question MUST NOT target the same object/attribute as L1 or L2.
• The question SHOULD require general common-sense or contextual reasoning.
• The question SHOULD still be answerable using the image (but only its general context, not the altered details).
• 6–14 words.

----------------------------------------------------------------------
GENERAL RULES
----------------------------------------------------------------------
• Do NOT provide answers.
• Do NOT describe the image outside captions.
• All questions must be 6–14 words.
• Output MUST be a single JSON object in the exact format below.

----------------------------------------------------------------------
Return EXACTLY this JSON structure:
----------------------------------------------------------------------
{
  "correct_caption": "<string>",
  "incorrect_caption": "<string>",
  "L0": "<string>",
  "L1": "<string>",    // targets changed attribute #1
  "L2": "<string>",    // targets changed attribute #2
  "L3": "<string>"   
}


"""
    response = openai_client.responses.create(
        model=GPT_MODEL,
        max_output_tokens=MAX_OUTPUT,
        input=[
            {
                "role": "user",
                "content": [
                    {"type": "input_text", "text": prompt},
                    {
                        "type": "input_image",
                        "image_url": f"data:image/jpeg;base64,{base64_image}"
                    }
                ]
            }
        ]
    )

    # Models often wrap JSON in a Markdown code fence (``` or ```json); strip it
    # so an otherwise valid reply is not rejected by json.loads.
    reply = response.output_text.strip()
    if reply.startswith("```"):
        reply = reply.split("\n", 1)[1] if "\n" in reply else reply[3:]
        reply = reply.rstrip()
        if reply.endswith("```"):
            reply = reply[:-3]

    return json.loads(reply)

In [None]:
def generate_questions_exp2(base64_image):
    """
    Ask the OpenAI model for the experiment-2 materials of one image.

    base64_image : base64-encoded image data; it is sent as a JPEG data URL.

    Returns:
        dict parsed from the model's JSON reply. The prompt requests the keys
        "correct_caption", "L0", "L1", "L2", "L3" and "L4"; the reply is not
        validated against that list.

    Raises:
        json.JSONDecodeError if the reply is not valid JSON (for example when it
        was cut off by the max_output_tokens limit).
    """
    prompt = """
You are preparing controlled experimental materials for multimodal evaluation.

Given the IMAGE (provided separately), generate the following:

============================================================
1. Correct Caption
============================================================
• Accurately describe the visible scene.
• 9–15 words, objective, simple, and factual.
• Should mention main objects; avoid inference beyond evidence.

============================================================
2. Visual Necessity Question Ladder (VNL): Levels L0 → L4
============================================================

GENERAL RULES:
• L1–L4 MUST require looking at the image to answer.
• All questions MUST be answerable using only the given image.
• Do NOT include the answers.
• No question should exceed 14 words.
• Return concise, natural wording.

------------------------------------------------------------
L0 – Baseline Question (Language-prior only)
------------------------------------------------------------
• A question humans can answer **without seeing the image**.
• May refer to the world generally (NOT the specific image).
• Purpose: control for language-only biases.
• 6–12 words.
Examples:
– “What season often has the coldest weather?”  
– “Which animal is larger, a dog or an elephant?”  
– “What do people usually use to take photographs?”

------------------------------------------------------------
L1 – Basic Visual Recognition
------------------------------------------------------------
• Requires the image.
• Ask about a **primary object** or its basic property.
• No reasoning, no inference.
Examples:
– “What object is the person holding?”  
– “What color is the animal?”  
– “How many people are visible?”

------------------------------------------------------------
L2 – Intermediate Visual Detail
------------------------------------------------------------
• Also requires the image.
• Ask about a **secondary property** of a main object.
• Slightly more specific than L1.
Examples:
– “What pattern is on the person’s shirt?”  
– “What type of hat is the man wearing?”  
– “What material is the table made of?”

------------------------------------------------------------
L3 – Relational / Spatial Reasoning
------------------------------------------------------------
• Requires image + spatial relations + relational understanding.
Examples:
– “Where is the dog positioned relative to the child?”  
– “What object is behind the bicycle?”  
– “Which person is closest to the camera?”

------------------------------------------------------------
L4 – High-Level Visual Reasoning
------------------------------------------------------------
• Hardest level; requires the entire scene.
• Ask about interactions, goals, implied roles, or multi-object context.
• Still must be answerable from the image alone (no external inference).
Examples:
– “What activity are the people engaged in?”  
– “Why is the man extending his arm?”  
– “What is the group collectively doing?”

============================================================
Return EXACTLY this JSON structure:
{
  "correct_caption": "<string>",
  "L0": "<string>",
  "L1": "<string>",
  "L2": "<string>",
  "L3": "<string>",
  "L4": "<string>"
}
============================================================


"""
    response = openai_client.responses.create(
        model=GPT_MODEL,
        max_output_tokens=MAX_OUTPUT,
        input=[
            {
                "role": "user",
                "content": [
                    {"type": "input_text", "text": prompt},
                    {
                        "type": "input_image",
                        "image_url": f"data:image/jpeg;base64,{base64_image}"
                    }
                ]
            }
        ]
    )

    # Models often wrap JSON in a Markdown code fence (``` or ```json); strip it
    # so an otherwise valid reply is not rejected by json.loads.
    reply = response.output_text.strip()
    if reply.startswith("```"):
        reply = reply.split("\n", 1)[1] if "\n" in reply else reply[3:]
        reply = reply.rstrip()
        if reply.endswith("```"):
            reply = reply[:-3]

    return json.loads(reply)

In [None]:
# Superseded draft of compute_qwen_mdi: it runs, but its result may be wrong. Kept commented out for reference only.

# def compute_qwen_mdi(attns, inputs, image_token_id=151655):
#     """
#     MDI for a single-image Qwen2.5-VL call.

#     attns  : list of attention tensors captured from decoder layers
#              each with shape (batch, heads, q_len, k_len)
#              (we'll gracefully skip anything that isn't this)
#     inputs : batch dict that contains "input_ids"
#     """

#     import torch

#     if attns is None or len(attns) == 0:
#         print("MDI: no attention tensors")
#         return None

#     if "input_ids" not in inputs:
#         print("MDI: inputs missing input_ids")
#         return None

#     input_ids = inputs["input_ids"][0]          # (seq_len,)
#     img_positions = (input_ids == image_token_id).nonzero().flatten()

#     if img_positions.numel() == 0:
#         print("MDI: no image tokens found in input_ids")
#         return None

#     img_start = img_positions[0].item()
#     img_end   = img_positions[-1].item() + 1    # non-inclusive

#     vision_scores = []
#     text_scores = []

#     for layer_attn in attns:
#         # Some layers may return None or have wrong shape; skip them
#         if layer_attn is None or not torch.is_tensor(layer_attn):
#             continue
#         if layer_attn.dim() != 4:
#             # e.g. (heads, q, k) -> add batch dim
#             if layer_attn.dim() == 3:
#                 layer_attn = layer_attn.unsqueeze(0)
#             else:
#                 continue

#         # layer_attn: (batch, heads, q_len, k_len)
#         attn = layer_attn.mean(dim=1)[0]  # -> (q_len, k_len)

#         # safety in case sequence length changed
#         q_len, k_len = attn.shape
#         if img_end > k_len:
#             continue

#         vis = attn[:, img_start:img_end].sum().item()
#         txt = (attn[:, :img_start].sum() + attn[:, img_end:].sum()).item()

#         vision_scores.append(vis)
#         text_scores.append(txt)

#     if not vision_scores or not text_scores:
#         print("MDI: no valid layers after filtering")
#         return None

#     vis_avg = sum(vision_scores) / len(vision_scores)
#     txt_avg = sum(text_scores) / len(text_scores)

#     mdi = vis_avg / (vis_avg + txt_avg + 1e-9)
#     return float(mdi)

In [None]:
# Second superseded draft of compute_qwen_mdi: it runs, but its result may be wrong. Kept commented out for reference only.

# def compute_qwen_mdi(attns, inputs):
#     """
#     Guaranteed-working MDI for Qwen2.5-VL.
#     Automatically detects visual tokens using the known Qwen ranges:
#       - 151552–151654 : image codebook tokens
#       - 151655        : image separator/end token

#     MDI = attention_to_visual_tokens / (attention_to_all_other_tokens)
#     """

#     import torch

#     if attns is None or len(attns) == 0:
#         print("MDI: no attn tensors")
#         return None

#     if "input_ids" not in inputs:
#         print("MDI: missing input_ids")
#         return None

#     input_ids = inputs["input_ids"][0]           # (seq_len,)

#     # --- 1. Detect vision tokens robustly ---
#     vision_mask = ((input_ids >= 151552) & (input_ids <= 151655))

#     visual_positions = vision_mask.nonzero().flatten()
#     if visual_positions.numel() == 0:
#         print("MDI: no visual tokens detected")
#         return None

#     v_start = visual_positions[0].item()
#     v_end   = visual_positions[-1].item() + 1    # non-inclusive

#     vision_scores = []
#     text_scores   = []

#     # --- 2. Iterate over captured attention ---
#     for layer_attn in attns:

#         if layer_attn is None or not torch.is_tensor(layer_attn):
#             continue

#         # Accept shapes:
#         #   (batch, heads, q_len, k_len)
#         #   (heads, q_len, k_len)
#         if layer_attn.dim() == 3:
#             # add batch dim
#             layer_attn = layer_attn.unsqueeze(0)
#         elif layer_attn.dim() != 4:
#             continue

#         # Mean over heads -> (q_len, k_len)
#         attn = layer_attn.mean(dim=1)[0]

#         q_len, k_len = attn.shape

#         # clip if KV cache truncated
#         if v_end > k_len:
#             continue

#         # total attention paid to image tokens
#         vis = attn[:, v_start:v_end].sum().item()

#         # attention paid to everything else
# #         txt = (attn[:, :v_start].sum() + attn[:, v_end:].sum()).item()

#         txt_before = attn[:, :v_start].sum().item() if v_start > 0 else 0
#         txt_after = attn[:, v_end:].sum().item() if v_end < k_len else 0
#         txt = txt_before + txt_after

#         vision_scores.append(vis)
#         text_scores.append(txt)

#     if len(vision_scores) == 0 or len(text_scores) == 0:
#         print("MDI: no valid layers")
#         return None

#     vis_avg = sum(vision_scores) / len(vision_scores)
#     txt_avg = sum(text_scores) / len(text_scores)

#     mdi = vis_avg / (vis_avg + txt_avg + 1e-9)
#     return float(mdi)

In [None]:
# Working - most likely correct!

def compute_qwen_mdi(attns, inputs):
    """
    MDI for a Qwen2.5-VL call: the share of attention mass that lands on image tokens.

    Image tokens are located in input_ids via the <image> placeholder id 151655
    (per the original author's notes, Qwen inserts only repeated placeholder
    tokens into input_ids and keeps the actual patches in the vision encoder).

    attns  : list of attention tensors, each (batch, heads, q_len, k_len) or
             (heads, q_len, k_len); entries that are None, not tensors, or of
             another rank are skipped
    inputs : mapping that contains "input_ids" with shape (batch, seq_len);
             only the first batch row is used

    Returns:
        float vis / (vis + txt + 1e-9), where vis and txt are the per-tensor
        attention sums over image-token keys and over all other keys, averaged
        over the usable tensors; or None when nothing could be computed.
    """

    import torch

    if attns is None or len(attns) == 0:
        return None

    if "input_ids" not in inputs:
        return None

    input_ids = inputs["input_ids"][0]          # (seq_len,)

    # ---- 1. Locate image placeholder tokens ----
    IMAGE_TOKEN_ID = 151655

    visual_positions = (input_ids == IMAGE_TOKEN_ID).nonzero().flatten()
    if visual_positions.numel() == 0:
        print("No 151655 tokens found → Qwen image token missing?")
        return None

    last_visual = visual_positions[-1].item()

    # ---- 2. Accumulate attention ----
    vision_scores = []
    text_scores = []

    for layer_attn in attns:

        if layer_attn is None or not torch.is_tensor(layer_attn):
            continue

        # expected (batch, heads, q_len, k_len)
        if layer_attn.dim() == 3:
            layer_attn = layer_attn.unsqueeze(0)
        elif layer_attn.dim() != 4:
            continue

        # Reduce in float32: captured tensors may be bf16/fp16, whose sums lose precision.
        attn = layer_attn.float().mean(dim=1)[0]    # (q_len, k_len)

        k_len = attn.shape[1]
        if last_visual >= k_len:
            continue

        # Index the exact image-token columns rather than a first-to-last span,
        # so text sitting between two images is not counted as vision.
        # The captured attention and input_ids may live on different devices.
        columns = visual_positions.to(attn.device)
        vis = attn[:, columns].sum().item()

        # Everything else (including keys beyond the prompt) counts as text.
        txt = attn.sum().item() - vis

        vision_scores.append(vis)
        text_scores.append(txt)

    if len(vision_scores) == 0:
        return None

    vis_avg = sum(vision_scores) / len(vision_scores)
    txt_avg = sum(text_scores) / len(text_scores)

    mdi = vis_avg / (vis_avg + txt_avg + 1e-9)
    return float(mdi)


In [None]:
import torch
import math

# Version without normalization (raw entropy, not scaled to the 0–1 range)

def compute_attention_entropy(attns):
    """
    Compute the average raw (un-normalized) attention entropy across captured tensors.

    attns : list of attention tensors captured by hooks
            each element is (batch, heads, q_len, k_len) or (heads, q_len, k_len);
            entries that are None, not tensors, or of another rank are skipped

    Rows that already form probability distributions (non-negative, summing to ~1)
    are used as they are. Softmax is applied only when the input looks like raw
    scores, because re-applying softmax to probabilities flattens them and
    inflates the entropy.

    NOTE: a later cell defines another function with this same name, which
    replaces this one once that cell has run.

    Returns:
        float entropy_score  (lower = more focused, higher = more diffuse)
        or None if not computable
    """

    if attns is None or len(attns) == 0:
        print("Entropy: no attention tensors")
        return None

    entropies = []

    for layer_attn in attns:

        # Skip invalid entries
        if layer_attn is None or not torch.is_tensor(layer_attn):
            continue

        # Ensure shape is (batch, heads, q, k)
        if layer_attn.dim() == 3:
            layer_attn = layer_attn.unsqueeze(0)   # (1, heads, q, k)
        elif layer_attn.dim() != 4:
            continue

        attn = layer_attn.float()

        # Only normalize when the rows are not already probabilities.
        # The tolerance is loose because low-precision (bf16/fp16) rows do not sum to exactly 1.
        row_sums = attn.sum(dim=-1, keepdim=True)
        already_normalized = bool((attn >= 0).all()) and torch.allclose(
            row_sums, torch.ones_like(row_sums), atol=1e-2
        )
        if not already_normalized:
            attn = torch.softmax(attn, dim=-1)

        # Entropy per (batch, head, q): H = -sum(p * log(p)), summed over k_len
        entropy = -(attn * (attn + 1e-12).log()).sum(dim=-1)

        # Mean over batch, heads, and q positions
        entropies.append(entropy.mean().item())

    if len(entropies) == 0:
        print("Entropy: no valid layers")
        return None

    # Average entropy across layers
    final_entropy = float(sum(entropies) / len(entropies))
    return final_entropy


In [None]:
import torch
import math


# Version w/ normalization (scaled from 0 to 1)

def compute_attention_entropy(attn_maps, mask=None, vision_span=None, normalized=True):
    """
    Average attention entropy over a list of attention tensors.

    attn_maps : list of attention tensors
        each tensor has shape (batch, heads, q_len, k_len)
        or (heads, q_len, k_len)
        or (q_len, k_len)
        Entries that are None, not tensors, or of another rank are skipped.

    mask : optional boolean mask of shape (k_len,)
        True = include that key token
        If None, full sequence is used.
        When combined with vision_span, a mask as long as the full key axis is
        sliced to the same span before it is applied.

    vision_span : optional (start, end)
        If provided, computes entropy only over this token region.

    normalized : bool
        If True -> returns H / log(k), k being the number of keys actually kept
        If False -> returns raw entropy.

    Returns:
        average entropy across layers + heads as float,
        or None if no tensor could be used
    """
    if attn_maps is None or len(attn_maps) == 0:
        return None

    entropies = []

    for layer_attn in attn_maps:

        # Skip invalid entries instead of failing on .dim()
        if layer_attn is None or not torch.is_tensor(layer_attn):
            continue

        # ---- 1. Fix shapes ----
        if layer_attn.dim() == 3:       # (heads, q, k)
            layer_attn = layer_attn.unsqueeze(0)  # -> (1, heads, q, k)
        elif layer_attn.dim() == 2:     # (q, k)
            layer_attn = layer_attn.unsqueeze(0).unsqueeze(0)

        if layer_attn.dim() != 4:
            continue  # bad layer

        # Work in float32: captured tensors may be bf16/fp16, where log/sum are coarse.
        attn = layer_attn.float()  # (b, heads, q, k)
        full_k_len = attn.shape[-1]

        # ---- 2. Select region (vision / mask) ----
        layer_mask = mask

        if vision_span is not None:
            start, end = vision_span
            attn = attn[:, :, :, start:end]
            # Keep a full-length mask aligned with the sliced key axis.
            if layer_mask is not None and layer_mask.shape[0] == full_k_len:
                layer_mask = layer_mask[start:end]

        if layer_mask is not None:
            attn = attn[:, :, :, layer_mask.to(attn.device)]

        # Use the width actually selected; a span reaching past the key axis is
        # truncated by slicing, so (end - start) would overstate it.
        k_len = attn.shape[-1]
        if k_len <= 1:
            continue

        # ---- 3. Normalize probabilities (safety) ----
        attn = attn.clamp(min=1e-9)
        attn = attn / attn.sum(dim=-1, keepdim=True)

        # ---- 4. Entropy ----
        H = -(attn * attn.log()).sum(dim=-1)  # (b, heads, q)
        H = H.mean().item()  # average everything

        # ---- 5. Normalize 0–1 if requested ----
        if normalized:
            H = H / math.log(k_len)

        entropies.append(H)

    if len(entropies) == 0:
        return None

    return float(sum(entropies) / len(entropies))

In [None]:
# Known-wrong earlier version of compute_attention_shift; kept commented out for reference only.

# def compute_attention_shift(attns_A, attns_B, vision_span=None, normalized=True):
#     """
#     Computes attention shift between two model runs (A and B).
    
#     Each attns_X is a list of attention tensors:
#        (batch, heads, q_len, k_len) or (heads, q_len, k_len) or (q_len, k_len).

#     vision_span: (start, end)  → compute shift only over visual tokens
#     normalized: normalize final shift to [0, 1]

#     Returns:
#         scalar attention-shift score
#     """

#     if attns_A is None or attns_B is None:
#         print("Shift: missing attn maps")
#         return None

#     if len(attns_A) == 0 or len(attns_B) == 0:
#         print("Shift: empty attn lists")
#         return None

#     shifts = []

#     # Iterate layer-by-layer (stop at min length)
#     L = min(len(attns_A), len(attns_B))

#     for i in range(L):
#         A = attns_A[i]
#         B = attns_B[i]

#         # Ensure both are valid tensors
#         if A is None or B is None:
#             continue
#         if not (torch.is_tensor(A) and torch.is_tensor(B)):
#             continue

#         # ---- 1. Normalize shapes to (1, heads, q, k) ----
#         def fix_shape(X):
#             if X.dim() == 2:      # (q, k)
#                 return X.unsqueeze(0).unsqueeze(0)
#             if X.dim() == 3:      # (heads, q, k)
#                 return X.unsqueeze(0)
#             return X  # assume (batch, heads, q, k)
        
#         A = fix_shape(A)
#         B = fix_shape(B)

#         # ---- 2. Align sequence lengths ----
#         _, hA, qA, kA = A.shape
#         _, hB, qB, kB = B.shape

#         q = min(qA, qB)
#         k = min(kA, kB)

#         A = A[:, :, :q, :k]
#         B = B[:, :, :q, :k]

#         # ---- 3. Optional: restrict to vision tokens ----
#         if vision_span is not None:
#             v_start, v_end = vision_span
#             v_end = min(v_end, k)
#             A = A[:, :, :, v_start:v_end]
#             B = B[:, :, :, v_start:v_end]

#         # ---- 4. Normalize to probability distributions ----
#         A = A.clamp(min=1e-9)
#         B = B.clamp(min=1e-9)

#         A = A / A.sum(dim=-1, keepdim=True)
#         B = B / B.sum(dim=-1, keepdim=True)

#         # ---- 5. L1 distance (attention shift) ----
#         # average across batch, heads, and queries
#         shift_val = torch.abs(A - B).sum(dim=-1).mean().item()

#         shifts.append(shift_val)

#     if len(shifts) == 0:
#         print("Shift: no valid layers after processing")
#         return None

#     shift_raw = sum(shifts) / len(shifts)

#     # ---- 6. Normalize shift to 0–1 scale ----
#     # Maximum L1 between two distributions is 2
#     if normalized:
#         shift_raw = shift_raw / 2.0  

#     return float(shift_raw)

In [None]:
import torch
import torch.nn.functional as F

def compute_attention_shift(prev_attns, curr_attns, inputs, prev_inputs=None):
    """
    Computes attention shift between two levels for Qwen2.5-VL.

    For every pair of captured tensors, takes the attention row of the last
    query index both runs have, restricts it to the image-token keys,
    renormalizes it, and measures the cosine distance (1 - cosine similarity)
    between the two runs. Uses only that LAST COMMON QUERY to avoid shape
    mismatch.

    prev_attns, curr_attns : lists of attention tensors, each
        (batch, heads, q_len, k_len) or (heads, q_len, k_len); tensors are
        paired in order, and invalid pairs are skipped
    inputs : mapping with "input_ids" (batch, seq_len) for the current run,
        used to locate the image tokens (placeholder id 151655)
    prev_inputs : optional mapping with "input_ids" for the previous run.
        Pass it when the two prompts place the image at different positions;
        when omitted, the span found in `inputs` is used for both runs.

    Returns:
        mean cosine distance over the usable tensor pairs, or None when
        nothing could be computed
    """
    # Only the <image> placeholder id marks image positions. A wider id range
    # also matches chat-template special tokens and would stretch the "visual"
    # span over most of the prompt.
    IMAGE_TOKEN_ID = 151655

    if prev_attns is None or curr_attns is None:
        return None
    if len(prev_attns) == 0 or len(curr_attns) == 0:
        return None

    def _visual_span(batch):
        # (start, end) of the image-token region in the first batch row, end exclusive.
        token_ids = batch["input_ids"][0]
        positions = (token_ids == IMAGE_TOKEN_ID).nonzero().flatten()
        if positions.numel() == 0:
            return None
        return positions[0].item(), positions[-1].item() + 1

    # ===== 1. Find visual region using Qwen input_ids =====
    curr_span = _visual_span(inputs)
    if curr_span is None:
        print("Shift: no visual tokens detected in input_ids")
        return None

    prev_span = curr_span if prev_inputs is None else _visual_span(prev_inputs)
    if prev_span is None:
        print("Shift: no visual tokens detected in input_ids")
        return None

    c_start, c_end = curr_span
    p_start, p_end = prev_span
    if (p_end - p_start) != (c_end - c_start):
        # The two slices must have equal length to be compared element-wise.
        print("Shift: visual spans of the two runs differ in length")
        return None

    layer_shifts = []

    # ===== 2. Layer-by-layer shift =====
    for A_prev, A_curr in zip(prev_attns, curr_attns):

        if A_prev is None or A_curr is None:
            continue
        if not (torch.is_tensor(A_prev) and torch.is_tensor(A_curr)):
            continue

        # unify shapes
        if A_prev.dim() == 3:
            A_prev = A_prev.unsqueeze(0)
        if A_curr.dim() == 3:
            A_curr = A_curr.unsqueeze(0)
        if A_prev.dim() != 4 or A_curr.dim() != 4:
            continue

        # mean over heads → (q_len, k_len); float32 because captures may be bf16/fp16
        A_prev = A_prev.float().mean(dim=1)[0]
        A_curr = A_curr.float().mean(dim=1)[0]

        q_len_prev, k_len_prev = A_prev.shape
        q_len_curr, k_len_curr = A_curr.shape

        # visual slice must be valid in both runs
        if p_end > k_len_prev or c_end > k_len_curr:
            continue

        # align QUERY dimension: pick the last query index both runs have
        q_len = min(q_len_prev, q_len_curr)
        A_prev_last = A_prev[q_len - 1, p_start:p_end]
        A_curr_last = A_curr[q_len - 1, c_start:c_end]

        # normalize to distributions over the image tokens
        A_prev_last = A_prev_last.clamp(min=1e-9)
        A_curr_last = A_curr_last.clamp(min=1e-9)
        A_prev_last = A_prev_last / A_prev_last.sum()
        A_curr_last = A_curr_last / A_curr_last.sum()

        # cosine distance
        cos_sim = F.cosine_similarity(A_prev_last, A_curr_last, dim=0)
        layer_shifts.append(float(1 - cos_sim.clamp(-1, 1)))

    if len(layer_shifts) == 0:
        return None

    return sum(layer_shifts) / len(layer_shifts)


In [None]:
def ask_qwen(
    image_path,
    caption,
    question,
    history=None,
    max_new_tokens=50,
    return_metrics=True,
    last_turn_only=False
):
    """
    Runs Qwen-VL with image + (caption + question) text prompt and captures the
    attention weights of the language-model self-attention layers via forward hooks.

    image_path     : path of the image to show the model
    caption        : caption text, inserted as "Caption: <caption>"
    question       : question for the current turn
    history        : optional list of (question, answer) pairs from earlier turns;
                     they are replayed as text-only messages before the current turn
    max_new_tokens : generation length limit
    return_metrics : when True, also compute MDI and return the captured tensors
    last_turn_only : accepted for interface compatibility; not used in this function

    Returns:
        answer                                          if return_metrics is False
        (answer, final_mdi, collected_attns, inputs)    otherwise, where final_mdi
        is None when the MDI computation failed

    Side effect: sets model.config.use_cache = False on the shared model.
    """

    # ---------- 0. Initialize ----------
    if history is None:
        history = []

    # ---------- 1. Load image ----------
    # Only consumed by the fallback preprocessing path below, but loading it here
    # also fails early on an unreadable file.
    image = Image.open(image_path).convert("RGB")

    # ---------- 2. Build fixed-format prompt ----------
    answer_rules = (
        "Please answer in plain text only.\n"
        "Do NOT use markdown formatting.\n"
        "Keep the answer short (1–2 sentences).\n"
        "Provide only the direct answer without any explanation."
    )

    # ---------- 3. Build message list: history first, current turn last ----------
    messages = []

    # (A) Add history FIRST (chronological)
    for q_prev, a_prev in history:
        messages.append({
            "role": "user",
            "content": [
                {"type": "text", "text": q_prev}
            ]
        })
        messages.append({
            "role": "assistant",
            "content": [
                {"type": "text", "text": a_prev}
            ]
        })

    # (B) Add CURRENT TURN LAST
    messages.append({
        "role": "user",
        "content": [
            # 1. QUESTION FIRST — prevents L0 contamination
            {"type": "text", "text": question},

            # 2. IMAGE SECOND
            {"type": "image", "image": image_path},

            # 3. CAPTION THIRD — labeled to avoid mixing with instructions
            {"type": "text", "text": f"Caption: {caption}"},

            # 4. ANSWER RULES LAST — separate block
            {"type": "text", "text": answer_rules}
        ]
    })

    # ---------- 4. Preprocess ----------
    text = processor.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    # ---------- 5. Vision preprocess ----------
    try:
        from qwen_vl_utils import process_vision_info
        image_inputs, video_inputs = process_vision_info(messages)

        inputs = processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt"
        ).to(model.device)

    except ImportError:
        # fallback when qwen_vl_utils is not installed
        inputs = processor(
            text=[text],
            images=[image],
            padding=True,
            return_tensors="pt"
        ).to(model.device)

    # IMPORTANT for attention extraction
    model.config.use_cache = False

    # ---------- 6. Generate with attention extraction ----------

    def find_decoder_self_attn_layers(model):
        # Self-attention modules that belong to the language model.
        layers = []
        for name, module in model.named_modules():
            if name.endswith("self_attn") and "language_model" in name:
                layers.append(module)
        return layers

    collected_attns = []

    def save_attn(module, inp, out):
        attn = out[1]  # (batch, heads, q_len, k_len)
        # A layer may hand back no attention weights; skip it rather than
        # crash the whole generation from inside the hook.
        if attn is None or not torch.is_tensor(attn):
            return
        collected_attns.append(attn.detach().cpu())

    hooks = [
        layer.register_forward_hook(save_attn)
        for layer in find_decoder_self_attn_layers(model)
    ]

    try:
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=False,
                output_attentions=True,
                return_dict_in_generate=True
            )
    finally:
        # Always detach the hooks, even when generation fails; otherwise they
        # stay on the shared model and every later call records duplicates.
        for h in hooks:
            h.remove()

    # ---------- 7. Decode answer ----------
    generated_ids = outputs.sequences[0][inputs.input_ids.shape[1]:]  # remove prompt tokens
    answer = processor.decode(generated_ids, skip_special_tokens=True).strip()

    # Remove possible "ASSISTANT:" prefixes
    if "ASSISTANT:" in answer:
        answer = answer.split("ASSISTANT:")[-1].strip()

    # ---------- 8. Metrics ----------
    if return_metrics:
        try:
            final_mdi = compute_qwen_mdi(collected_attns, inputs)
        except Exception as exc:
            # Best effort: the answer is still returned, but say why MDI is missing.
            print(f"MDI computation failed: {exc!r}")
            final_mdi = None

        return answer, final_mdi, collected_attns, inputs # INPUTS ADDED FOR ATTENTION SHIFT CALC

    return answer

In [None]:
###### FOR PACE ICE ONLY - replace GT username below ######

# Point the HuggingFace caches (downloaded models + datasets) at the scratch directory rather than the home directory
os.environ.update({
    "HF_HOME": "/home/hice1/nbalakrishna3/scratch/huggingface",
    "HF_DATASETS_CACHE": "/home/hice1/nbalakrishna3/scratch/hf_cache",
    "TRANSFORMERS_CACHE": "/home/hice1/nbalakrishna3/scratch/hf_cache",
})

In [None]:
# QWEN MODEL

from transformers import AutoModelForVision2Seq, AutoProcessor
import torch
from PIL import Image

# Report which device is available. `device` is informational here: placement of
# the model is delegated to device_map="auto" below.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("✅ Using device:", device)

MODEL_NAME = "Qwen/Qwen2.5-VL-7B-Instruct"

print("Loading Qwen model...")
# Processor used by ask_qwen for the chat template, input preprocessing and decoding.
processor = AutoProcessor.from_pretrained(
    MODEL_NAME, 
    trust_remote_code=True
)

# Module-level `model` used by ask_qwen. Weights are loaded in bfloat16, so the
# attention tensors captured by its hooks presumably share that dtype.
# NOTE(review): whether attention weights are actually returned can depend on the
# attention implementation in use — confirm with the installed transformers version.
model = AutoModelForVision2Seq.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
    output_attentions=True
)

print("✅ Qwen Model loaded!")

In [None]:
def create_eval_prompt(caption, condition, question, model_answer):
    """Build the text prompt handed to the judge model for one answer.

    The prompt tells the judge to reply with a single character: "1" when
    MODEL_ANSWER matches what is visibly true in the image and "0" otherwise,
    and to treat the caption as context only, not as ground truth.

    Args:
        caption: Caption that was given to the evaluated model.
        condition: Label of the caption condition (inserted verbatim).
        question: Question that was asked about the image.
        model_answer: Answer produced by the evaluated model.

    Returns:
        str: The complete judge prompt.
    """
    judge_prompt = f"""
You are an automated scoring module. You MUST output ONLY a single character: "0" or "1".
NEVER output words, sentences, explanations, punctuation, or reasoning.

YOUR TASK:
Judge whether the MODEL_ANSWER correctly matches what is visually true in the IMAGE.

IMPORTANT:
• The CAPTION was only input context for another model. Do NOT treat it as truth.
• Ignore any misleading or incorrect information in the CAPTION.
• Use ONLY the IMAGE + the QUESTION + world knowledge to judge correctness.
• If the MODEL_ANSWER matches the visible truth in the image, output "1".
• If the MODEL_ANSWER is wrong, unsupported, or contradicts the image, output "0".
• Your ENTIRE reply MUST be exactly one character: "0" or "1".

Now evaluate:

CAPTION (context only): {caption}
CONDITION: {condition}
QUESTION: {question}
MODEL_ANSWER: {model_answer}

Reply with ONLY "0" or "1".
"""
    return judge_prompt

In [None]:
def eval_answer(image_path, caption, condition, question, model_answer):
    """Score one model answer against the image, using Claude as the judge.

    Builds the judge prompt with ``create_eval_prompt``, base64-encodes the
    image file, sends both to Claude and parses the reply.

    Args:
        image_path: Path of the image file the question is about.
        caption: Caption that was shown to the evaluated model.
        condition: Label of the caption condition (forwarded to the prompt).
        question: Question that was asked.
        model_answer: Answer produced by the evaluated model.

    Returns:
        int: 1 if the judge replied "1", 0 if it replied "0".

    Raises:
        ValueError: If the judge reply is empty or is neither "0" nor "1".
    """
    # Media types for the extensions the image listing code accepts
    # (plus gif/webp). Anything else falls back to the previous hard-coded
    # "image/jpeg" so existing behaviour is preserved.
    media_types = {
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".png": "image/png",
        ".gif": "image/gif",
        ".webp": "image/webp",
    }

    # ---- Build prompt ----
    prompt = create_eval_prompt(caption, condition, question, model_answer)

    # ---- Encode image ----
    with open(image_path, "rb") as f:
        b64img = base64.b64encode(f.read()).decode("utf-8")

    # Declare the real image format: PNG files are also picked up by the
    # callers, and labelling them as JPEG misdescribes the payload.
    extension = os.path.splitext(str(image_path))[1].lower()
    media_type = media_types.get(extension, "image/jpeg")

    # ---- Call Claude ----
    response = anthropic_client.messages.create(
        model="claude-sonnet-4-5-20250929",
        max_tokens=5,
        temperature=0,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "source": {
                            "type": "base64",
                            "media_type": media_type,
                            "data": b64img
                        }
                    },
                    {
                        "type": "text",
                        "text": prompt
                    }
                ]
            }
        ]
    )

    # ---- Parse output ----
    if not response.content:
        # Raise the documented error type instead of an opaque IndexError.
        raise ValueError("Unexpected Claude judge output: <empty response>")

    output = response.content[0].text.strip()

    # The prompt shows the digits wrapped in double quotes, so accept a reply
    # that echoes the quotes (e.g. '"1"') instead of rejecting it.
    verdict = output.strip("\"'").strip()

    if verdict not in ("0", "1"):
        raise ValueError(f"Unexpected Claude judge output: {output}")

    return int(verdict)

In [None]:
# def eval_answer(image_path, caption, condition, question, model_answer):
#     """
#     Uses Google Gemini to judge the answer.
#     Returns 0 or 1.
#     """
    
#     # 1. Load the image using PIL (Gemini likes this format)
#     img = Image.open(image_path)
    
#     # 2. Get your prompt text
#     # (Keep your create_eval_prompt function exactly the same as it is now)
#     prompt_text = create_eval_prompt(caption, condition, question, model_answer)
    
#     try:
#         # 3. Call Gemini
#         # We pass the text prompt AND the image object in a list
#         response = GOOGLE_MODEL.generate_content([prompt_text, img])
        
#         # 4. Clean the output
#         output = response.text.strip()
        
#         # Gemini might be chatty, ensure we just get the number
#         if "1" in output: return 1
#         if "0" in output: return 0
        
#         # Fallback if it returns something else
#         return 0
        
#     except Exception as e:
#         print(f"Gemini Error: {e}")
#         return 0

In [None]:
# Used to set up the eval metric calculation

def pair_stats_by_level(jsonl_path, levels=("L0", "L1", "L2", "L3")):
    """Tally (correct-caption score, incorrect-caption score) pairs per level.

    Reads a JSONL results file in which every record has
    ``eval_scores[level]["correct_caption_score"]`` and
    ``eval_scores[level]["incorrect_caption_score"]``, and counts how often
    each of the four (score_correct, score_incorrect) combinations occurs.

    Args:
        jsonl_path: Path of the JSONL file to read.
        levels: Level keys to tally. Defaults to the four exp1 levels.

    Returns:
        dict: ``{level: {(1, 1): n, (1, 0): n, (0, 1): n, (0, 0): n}}``.

    Raises:
        KeyError: If a record lacks a requested level/score key, or a score
            pair is not a combination of 0 and 1.
    """
    levels = list(levels)

    # Tallies per level
    pair_stats = {
        lvl: {(1, 1): 0, (1, 0): 0, (0, 1): 0, (0, 0): 0}
        for lvl in levels
    }

    # ---- Single JSONL pass ----
    # The result files are written as UTF-8 with ensure_ascii=False, so read
    # them back with the same encoding instead of the platform default.
    with open(jsonl_path, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline or hand edits).
                continue

            eval_scores = json.loads(line)["eval_scores"]

            for lvl in levels:
                s_c = eval_scores[lvl]["correct_caption_score"]
                s_i = eval_scores[lvl]["incorrect_caption_score"]
                pair_stats[lvl][(s_c, s_i)] += 1

    return pair_stats

def conf_pairs_by_level(pair_stats):
    """Return the per-level confusion counts.

    ``pair_stats`` is already laid out as the confusion matrix, so the very
    same object is handed back (no copy is made).
    """
    confusion = pair_stats
    return confusion

In [None]:
def fooling_rate_by_level(pair_stats):
    """Compute, per level, how often the (1, 0) score pair occurs.

    A (1, 0) pair is an item scored 1 in the first position and 0 in the
    second position of the pair key.

    Args:
        pair_stats: ``{level: {(s_c, s_i): count}}`` mapping.

    Returns:
        dict: ``{level: {"fooled": int, "total": int, "rate": number}}``,
        where ``rate`` is ``fooled / total`` and 0 when the level is empty.
    """
    def _summarise(counts):
        fooled = counts[(1, 0)]
        n_pairs = sum(counts.values())
        if n_pairs > 0:
            fraction = fooled / n_pairs
        else:
            fraction = 0
        return {"fooled": fooled, "total": n_pairs, "rate": fraction}

    return {lvl: _summarise(counts) for lvl, counts in pair_stats.items()}

In [None]:
# Eval Metrics - per-level answer accuracy, computed separately for the correct-caption and incorrect-caption conditions.

def acc_by_level(pair_stats):
    """Compute per-level accuracy under each caption condition.

    Args:
        pair_stats: ``{level: {(s_c, s_i): count}}`` with all four 0/1 pairs
            present for every level.

    Returns:
        dict: ``{level: None}`` for a level with no items, otherwise
        ``{level: {"accuracy_correct_caption": float,
        "accuracy_incorrect_caption": float}}``.
    """
    summary = {}

    for lvl in pair_stats:
        tally = pair_stats[lvl]
        both, only_correct, only_incorrect, neither = (
            tally[(1, 1)],
            tally[(1, 0)],
            tally[(0, 1)],
            tally[(0, 0)],
        )
        n_pairs = both + only_correct + only_incorrect + neither

        if n_pairs == 0:
            summary[lvl] = None
        else:
            summary[lvl] = {
                # first score is 1, whatever the second score is
                "accuracy_correct_caption": (both + only_correct) / n_pairs,
                # second score is 1, whatever the first score is
                "accuracy_incorrect_caption": (both + only_incorrect) / n_pairs,
            }

    return summary

In [None]:
import random

from concurrent.futures import ThreadPoolExecutor, as_completed

def generate_qwen_outputs_exp1_single(subset_size=None):
    """Run experiment 1 (single-turn) and write one JSON line per image.

    For every selected image in ``IMAGE_FOLDER`` this:
      1. gets a correct caption, an incorrect caption and the L0-L3 questions
         from ``generate_questions_exp1``;
      2. asks each question once per caption via ``ask_qwen`` and records the
         answer, the MDI value, the attention entropy and the attention shift
         (shift is computed against the previous level, so it is ``None`` for
         L0);
      3. scores every answer with ``eval_answer`` (calls run in a thread pool);
      4. writes the record to ``QWEN_EXP1_SINGLE_OUTPUT_PATH``, which is
         overwritten at the start of the run.

    An image whose processing raises is reported on stdout and skipped.

    Args:
        subset_size: If given, process a random sample of this many images;
            otherwise process every image found in the folder.
    """
    all_image_files = [
        f for f in os.listdir(IMAGE_FOLDER)
        if f.lower().endswith((".jpg", ".jpeg", ".png"))
    ]

    if subset_size is not None:
        image_files = random.sample(all_image_files, subset_size)
    else:
        image_files = all_image_files

    print(f"Found {len(image_files)} images.\n")

    levels = ["L0", "L1", "L2", "L3"]

    def _round3(value):
        """Round to 3 decimals; pass None through (a metric may be missing)."""
        return round(value, 3) if value is not None else None

    def _run_condition(path, caption, questions):
        """Ask every level's question under one caption; return per-level dicts."""
        answers, mdis, entropies, shifts = {}, {}, {}, {}
        prev_attn = None

        for lvl in levels:
            ans, mdi, attn, inputs = ask_qwen(
                path, caption, questions[lvl], return_metrics=True
            )

            answers[lvl] = ans
            # ask_qwen hands back None for the MDI when its metric computation
            # fails; rounding None used to raise and drop the whole image.
            mdis[lvl] = _round3(mdi)
            entropies[lvl] = _round3(compute_attention_entropy(attn))

            # Attention shift needs a previous level to compare against.
            if prev_attn is None:
                shifts[lvl] = None
            else:
                shifts[lvl] = _round3(
                    compute_attention_shift(prev_attn, attn, inputs)
                )

            prev_attn = attn

        return answers, mdis, entropies, shifts

    with open(QWEN_EXP1_SINGLE_OUTPUT_PATH, "w", encoding="utf-8") as out:
        for img_file in tqdm(image_files, desc="Processing"):
            image_id = os.path.splitext(img_file)[0]
            path = os.path.join(IMAGE_FOLDER, img_file)

            try:
                # ---- 1) GPT captions + questions ----
                b64 = encode_image(path)
                q = generate_questions_exp1(b64)

                correct_caption = q["correct_caption"]
                incorrect_caption = q["incorrect_caption"]
                questions = {lvl: q[lvl] for lvl in levels}

                # ---- 2) Qwen answers + metrics, one pass per caption ----
                (answers_correct, mdi_correct,
                 entropy_correct, shift_correct) = _run_condition(
                    path, correct_caption, questions
                )
                (answers_incorrect, mdi_incorrect,
                 entropy_incorrect, shift_incorrect) = _run_condition(
                    path, incorrect_caption, questions
                )

                # ---- 3) Base JSON structure ----
                output = {
                    "image_id": image_id,

                    "captions": {
                        "correct": correct_caption,
                        "incorrect": incorrect_caption
                    },

                    "questions": questions,

                    "answers": {
                        "correct_caption": answers_correct,
                        "incorrect_caption": answers_incorrect
                    },

                    "mdi_scores": {
                        "correct_caption": mdi_correct,
                        "incorrect_caption": mdi_incorrect
                    },

                    "entropy_scores": {
                        "correct_caption": entropy_correct,
                        "incorrect_caption": entropy_incorrect
                    },

                    "shift_scores": {
                        "correct_caption": shift_correct,
                        "incorrect_caption": shift_incorrect
                    },

                    "eval_scores": {}
                }

                # ---- 4) Parallel Claude evaluation ----
                with ThreadPoolExecutor(max_workers=8) as ex:
                    # Keep the futures keyed by level so scores cannot be
                    # attached to the wrong level.
                    jobs = {}
                    for lvl in levels:
                        jobs[lvl] = (
                            ex.submit(
                                eval_answer,
                                path,
                                correct_caption,
                                "correct caption condition",
                                questions[lvl],
                                answers_correct[lvl]
                            ),
                            ex.submit(
                                eval_answer,
                                path,
                                incorrect_caption,
                                "incorrect caption condition",
                                questions[lvl],
                                answers_incorrect[lvl]
                            ),
                        )

                    # ---- 5) Attach scores to JSON in correct structure ----
                    for lvl in levels:
                        job_c, job_i = jobs[lvl]
                        output["eval_scores"][lvl] = {
                            "correct_caption_score": job_c.result(),
                            "incorrect_caption_score": job_i.result()
                        }

                # ---- 6) Write one JSON line ----
                out.write(json.dumps(output, ensure_ascii=False) + "\n")
                # Flush per record so finished images survive an interrupted run.
                out.flush()

            except Exception as e:
                print(f"\nError with {image_id}: {e}")

    print(f"\nDone. JSONL saved to: {QWEN_EXP1_SINGLE_OUTPUT_PATH}\n")

In [None]:
# import shutil
# from pathlib import Path

# def create_random_subset(subset_size, subset_name=None):
#     """
#     Creates a sibling folder next to IMAGE_FOLDER that contains subset_size
#     randomly selected images from IMAGE_FOLDER.
    
#     Example:
#     IMAGE_FOLDER = ".../train2017"
#     Creates: ".../train2017_subset"
#     """
#     global IMAGE_FOLDER
#     image_folder = Path(IMAGE_FOLDER)

#     # Infer parent directory and base name
#     parent_dir = image_folder.parent
#     base_name = image_folder.name  # e.g., "train2017"

#     # Default subset name = train2017_subset
#     if subset_name is None:
#         subset_name = f"{base_name}_subset"

#     subset_path = parent_dir / subset_name
#     subset_path.mkdir(exist_ok=True)

#     # Collect valid images
#     all_images = [
#         f for f in image_folder.iterdir()
#         if f.suffix.lower() in [".jpg", ".jpeg", ".png"]
#     ]

#     if subset_size > len(all_images):
#         raise ValueError(
#             f"Requested {subset_size} images but only {len(all_images)} available."
#         )

#     # Randomly choose subset
#     selected = random.sample(all_images, subset_size)

#     # Copy selected images into sibling folder
#     for img in selected:
#         shutil.copy(img, subset_path / img.name)

#     print(f"Created subset of {len(selected)} images at: {subset_path}")
    
#     return subset_path


In [None]:
def generate_qwen_outputs_exp1_multi(subset_size=None, last_turn_only=False):
    """Run experiment 1 (multi-turn variant) and write one JSON line per image.

    For every selected image in ``IMAGE_FOLDER`` this:
      1. gets a correct caption, an incorrect caption and the L0-L3 questions
         from ``generate_questions_exp1``;
      2. asks each question once per caption via ``ask_qwen`` (forwarding
         ``last_turn_only``) and records the answer, the MDI value, the
         attention entropy and the attention shift (shift is computed against
         the previous level, so it is ``None`` for L0);
      3. scores every answer with ``eval_answer`` (calls run in a thread pool);
      4. writes the record to ``QWEN_EXP1_MULTI_OUTPUT_PATH``, which is
         overwritten at the start of the run.

    An image whose processing raises is reported on stdout and skipped.

    Args:
        subset_size: If given, process a random sample of this many images;
            otherwise process every image found in the folder.
        last_turn_only: Forwarded to ``ask_qwen``; also controls whether the
            locally tracked history keeps only the latest (question, answer).
    """
    all_image_files = [
        f for f in os.listdir(IMAGE_FOLDER)
        if f.lower().endswith((".jpg", ".jpeg", ".png"))
    ]

    if subset_size is not None:
        image_files = random.sample(all_image_files, subset_size)
    else:
        image_files = all_image_files

    print(f"Found {len(image_files)} images.\n")

    levels = ["L0", "L1", "L2", "L3"]

    def _round3(value):
        """Round to 3 decimals; pass None through (a metric may be missing)."""
        return round(value, 3) if value is not None else None

    def _run_condition(path, caption, questions):
        """Ask every level's question under one caption; return per-level dicts."""
        answers, mdis, entropies, shifts = {}, {}, {}, {}
        # Each caption condition keeps its own history. Previously the
        # incorrect-caption pass wrote into the correct-caption history.
        # NOTE(review): this history is not passed to ask_qwen here, so any
        # conversation carry-over must happen inside ask_qwen - confirm.
        history = []
        prev_attn = None

        for lvl in levels:
            question = questions[lvl]
            ans, mdi, attn, inputs = ask_qwen(
                path, caption, question,
                return_metrics=True, last_turn_only=last_turn_only
            )

            if last_turn_only:
                history = [(question, ans)]
            else:
                history.append((question, ans))

            answers[lvl] = ans
            # ask_qwen hands back None for the MDI when its metric computation
            # fails; rounding None used to raise and drop the whole image.
            mdis[lvl] = _round3(mdi)
            entropies[lvl] = _round3(compute_attention_entropy(attn))

            # Attention shift needs a previous level to compare against.
            if prev_attn is None:
                shifts[lvl] = None
            else:
                shifts[lvl] = _round3(
                    compute_attention_shift(prev_attn, attn, inputs)
                )

            prev_attn = attn

        return answers, mdis, entropies, shifts

    with open(QWEN_EXP1_MULTI_OUTPUT_PATH, "w", encoding="utf-8") as out:
        for img_file in tqdm(image_files, desc="Processing"):
            image_id = os.path.splitext(img_file)[0]
            path = os.path.join(IMAGE_FOLDER, img_file)

            try:
                # ---- 1) GPT captions + questions ----
                b64 = encode_image(path)
                q = generate_questions_exp1(b64)

                correct_caption = q["correct_caption"]
                incorrect_caption = q["incorrect_caption"]
                questions = {lvl: q[lvl] for lvl in levels}

                # ---- 2) Qwen answers + metrics, one pass per caption ----
                (answers_correct, mdi_correct,
                 entropy_correct, shift_correct) = _run_condition(
                    path, correct_caption, questions
                )
                (answers_incorrect, mdi_incorrect,
                 entropy_incorrect, shift_incorrect) = _run_condition(
                    path, incorrect_caption, questions
                )

                # ---- 3) Base JSON structure ----
                output = {
                    "image_id": image_id,

                    "captions": {
                        "correct": correct_caption,
                        "incorrect": incorrect_caption
                    },

                    "questions": questions,

                    "answers": {
                        "correct_caption": answers_correct,
                        "incorrect_caption": answers_incorrect
                    },

                    "mdi_scores": {
                        "correct_caption": mdi_correct,
                        "incorrect_caption": mdi_incorrect
                    },

                    "entropy_scores": {
                        "correct_caption": entropy_correct,
                        "incorrect_caption": entropy_incorrect
                    },

                    "shift_scores": {
                        "correct_caption": shift_correct,
                        "incorrect_caption": shift_incorrect
                    },

                    "eval_scores": {}
                }

                # ---- 4) Parallel Claude evaluation ----
                with ThreadPoolExecutor(max_workers=8) as ex:
                    # Keep the futures keyed by level so scores cannot be
                    # attached to the wrong level.
                    jobs = {}
                    for lvl in levels:
                        jobs[lvl] = (
                            ex.submit(
                                eval_answer,
                                path,
                                correct_caption,
                                "correct caption condition",
                                questions[lvl],
                                answers_correct[lvl]
                            ),
                            ex.submit(
                                eval_answer,
                                path,
                                incorrect_caption,
                                "incorrect caption condition",
                                questions[lvl],
                                answers_incorrect[lvl]
                            ),
                        )

                    # ---- 5) Attach scores to JSON in correct structure ----
                    for lvl in levels:
                        job_c, job_i = jobs[lvl]
                        output["eval_scores"][lvl] = {
                            "correct_caption_score": job_c.result(),
                            "incorrect_caption_score": job_i.result()
                        }

                # ---- 6) Write one JSON line ----
                out.write(json.dumps(output, ensure_ascii=False) + "\n")
                # Flush per record so finished images survive an interrupted run.
                out.flush()

            except Exception as e:
                print(f"\nError with {image_id}: {e}")

    print(f"\nDone. JSONL saved to: {QWEN_EXP1_MULTI_OUTPUT_PATH}\n")

In [None]:
def generate_qwen_outputs_exp2_single(subset_size=None):
    """Run experiment 2 (single-turn) and write one JSON line per image.

    For every selected image in ``IMAGE_FOLDER`` this:
      1. gets a caption and the L0-L4 questions from
         ``generate_questions_exp2``;
      2. asks each question via ``ask_qwen`` and records the answer, the MDI
         value, the attention entropy and the attention shift (shift is
         computed against the previous level, so it is ``None`` for L0);
      3. scores every answer with ``eval_answer`` (calls run in a thread pool);
      4. writes the record to ``QWEN_EXP2_SINGLE_OUTPUT_PATH``, which is
         overwritten at the start of the run.

    An image whose processing raises is reported on stdout and skipped.

    Args:
        subset_size: If given, process a random sample of this many images;
            otherwise process every image found in the folder.
    """
    all_image_files = [
        f for f in os.listdir(IMAGE_FOLDER)
        if f.lower().endswith((".jpg", ".jpeg", ".png"))
    ]

    if subset_size is not None:
        image_files = random.sample(all_image_files, subset_size)
    else:
        image_files = all_image_files

    print(f"Found {len(image_files)} images.\n")

    levels = ["L0", "L1", "L2", "L3", "L4"]

    def _round3(value):
        """Round to 3 decimals; pass None through (a metric may be missing)."""
        return round(value, 3) if value is not None else None

    with open(QWEN_EXP2_SINGLE_OUTPUT_PATH, "w", encoding="utf-8") as out:
        for img_file in tqdm(image_files, desc="Processing"):
            image_id = os.path.splitext(img_file)[0]
            path = os.path.join(IMAGE_FOLDER, img_file)

            try:
                # ---- 1) GPT captions + questions ----
                b64 = encode_image(path)
                q = generate_questions_exp2(b64)

                correct_caption = q["correct_caption"]
                questions = {lvl: q[lvl] for lvl in levels}

                # ---- 2) Qwen answers + metrics ----
                answers_correct = {}
                metrics = {}
                prev_attn = None

                for lvl in levels:
                    ans, mdi, attn, inputs = ask_qwen(
                        path, correct_caption, questions[lvl], return_metrics=True
                    )

                    answers_correct[lvl] = ans

                    # ask_qwen hands back None for the MDI when its metric
                    # computation fails; rounding None used to raise and drop
                    # the whole image.
                    mdi_value = _round3(mdi)
                    entropy_value = _round3(compute_attention_entropy(attn))

                    # Attention shift needs a previous level to compare against.
                    if prev_attn is None:
                        shift_value = None
                    else:
                        shift_value = _round3(
                            compute_attention_shift(prev_attn, attn, inputs)
                        )

                    metrics[lvl] = {
                        "mdi": mdi_value,
                        "entropy": entropy_value,
                        "shift": shift_value
                    }

                    prev_attn = attn

                # ---- 3) Base JSON structure ----
                output = {
                    "image_id": image_id,

                    "caption": correct_caption,

                    "questions": questions,

                    "answers": answers_correct,

                    "metrics": metrics,

                    "eval_scores": {}
                }

                # ---- 4) Parallel Claude evaluation ----
                with ThreadPoolExecutor(max_workers=8) as ex:
                    # Keep the futures keyed by level so scores cannot be
                    # attached to the wrong level.
                    jobs = {
                        lvl: ex.submit(
                            eval_answer,
                            path,
                            correct_caption,
                            "correct caption condition",
                            questions[lvl],
                            answers_correct[lvl]
                        )
                        for lvl in levels
                    }

                    # ---- 5) Attach scores to JSON in correct structure ----
                    for lvl in levels:
                        output["eval_scores"][lvl] = jobs[lvl].result()

                # ---- 6) Write one JSON line ----
                out.write(json.dumps(output, ensure_ascii=False) + "\n")
                # Flush per record so finished images survive an interrupted run.
                out.flush()

            except Exception as e:
                print(f"\nError with {image_id}: {e}")

    print(f"\nDone. JSONL saved to: {QWEN_EXP2_SINGLE_OUTPUT_PATH}\n")

In [None]:
import random

from concurrent.futures import ThreadPoolExecutor, as_completed

def _round_metric(value, ndigits=3):
    """Round a metric for logging; ``None`` (metric unavailable) passes through."""
    if value is None:
        return None
    return round(value, ndigits)


def generate_qwen_outputs_exp2_multi(subset_size=None, last_turn_only=False):
    """
    Run the experiment-2 multi-turn pipeline and write one JSON line per image.

    For every selected image in IMAGE_FOLDER this:
      1. encodes the image and calls generate_questions_exp2 to obtain a
         "correct_caption" plus questions keyed "L0".."L4";
      2. asks each question in level order through ask_qwen, recording the
         answer, the rounded MDI, the attention entropy, and the attention
         shift relative to the previous level (None for the first level);
      3. scores every answer concurrently with eval_answer;
      4. appends the record to QWEN_EXP2_MULTI_OUTPUT_PATH (file is truncated
         at the start of the run).

    Parameters
    ----------
    subset_size : int or None
        Number of images to sample at random from IMAGE_FOLDER. None uses
        every image. Values larger than the number of available images are
        clamped instead of raising ValueError.
    last_turn_only : bool
        Forwarded unchanged to ask_qwen.

    Notes
    -----
    A failure on one image is printed and that image is skipped; the run
    continues with the next image.
    """
    levels = ["L0", "L1", "L2", "L3", "L4"]

    # os.listdir order is arbitrary, so sort to make the candidate pool (and
    # the full-folder iteration order) deterministic.
    all_image_files = sorted(
        f for f in os.listdir(IMAGE_FOLDER)
        if f.lower().endswith((".jpg", ".jpeg", ".png"))
    )

    if subset_size is not None:
        # random.sample raises ValueError when asked for more items than the
        # population holds; clamp so an oversized request uses every image.
        sample_count = min(subset_size, len(all_image_files))
        image_files = random.sample(all_image_files, sample_count)
    else:
        image_files = all_image_files

    print(f"Found {len(image_files)} images.\n")

    with open(QWEN_EXP2_MULTI_OUTPUT_PATH, "w", encoding="utf-8") as out:
        for img_file in tqdm(image_files, desc="Processing"):
            image_id = os.path.splitext(img_file)[0]
            path = os.path.join(IMAGE_FOLDER, img_file)

            try:
                # ---- 1) Caption + questions ----
                generated = generate_questions_exp2(encode_image(path))
                correct_caption = generated["correct_caption"]
                questions = {lvl: generated[lvl] for lvl in levels}

                # ---- 2) Ask each level in order, tracking attention ----
                # NOTE(review): the previous version accumulated a
                # (question, answer) history list here but never passed it to
                # ask_qwen. If earlier turns are meant to be threaded in
                # explicitly, that wiring is missing — confirm how ask_qwen
                # tracks prior turns.
                answers_correct = {}
                metrics = {}
                prev_attn = None

                for lvl, question in questions.items():
                    ans, mdi, attn, inputs = ask_qwen(
                        path,
                        correct_caption,
                        question,
                        return_metrics=True,
                        last_turn_only=last_turn_only,
                    )
                    answers_correct[lvl] = ans

                    ent = compute_attention_entropy(attn)

                    # Shift is only defined once there is a previous turn.
                    if prev_attn is None:
                        shift = None
                    else:
                        shift = compute_attention_shift(prev_attn, attn, inputs)

                    metrics[lvl] = {
                        "mdi": _round_metric(mdi),
                        "entropy": _round_metric(ent),
                        "shift": _round_metric(shift),
                    }
                    prev_attn = attn

                # ---- 3) Score all answers in parallel ----
                # Futures are keyed by level so each score is attached to the
                # right level without relying on list positions.
                with ThreadPoolExecutor(max_workers=8) as pool:
                    futures = {
                        lvl: pool.submit(
                            eval_answer,
                            path,
                            correct_caption,
                            "correct caption condition",
                            question,
                            answers_correct[lvl],
                        )
                        for lvl, question in questions.items()
                    }
                    eval_scores = {lvl: fut.result() for lvl, fut in futures.items()}

                # ---- 4) Assemble the record (key order matches prior output) ----
                output = {
                    "image_id": image_id,
                    "caption": correct_caption,
                    "questions": questions,
                    "answers": answers_correct,
                    "metrics": metrics,
                    "eval_scores": eval_scores,
                }

                # ---- 5) Write one JSON line ----
                out.write(json.dumps(output, ensure_ascii=False) + "\n")
                # Flush per record so finished images survive an interrupted run.
                out.flush()

            except Exception as e:
                # Best-effort: report the failure and move on to the next image.
                print(f"\nError with {image_id}: {e}")

    print(f"\nDone. JSONL saved to: {QWEN_EXP2_MULTI_OUTPUT_PATH}\n")

In [None]:
if __name__ == "__main__":

    # ---------------- Qwen runs ----------------
    # Each generator below receives this as its `subset_size` argument.
    subset_size = 1

    # Experiment 1: single-turn pass, then the multi-turn pass.
    generate_qwen_outputs_exp1_single(subset_size=subset_size)
    generate_qwen_outputs_exp1_multi(
        subset_size=subset_size,
        last_turn_only=False,
    )

    # Experiment 2: same pair of passes.
    generate_qwen_outputs_exp2_single(subset_size=subset_size)
    generate_qwen_outputs_exp2_multi(
        subset_size=subset_size,
        last_turn_only=False,
    )

    # ---------------- Disabled reporting ----------------
    # Fooling-rate / accuracy summary for the Qwen responses, currently
    # switched off. Before re-enabling, point the path at the JSONL file that
    # should be analysed (QWEN_OUTPUT_PATH is not among the output paths
    # visible in the config cell — verify it exists).
#     qwen_pair_stats = pair_stats_by_level(QWEN_OUTPUT_PATH)
#     qwen_fooling_rate_per_level = fooling_rate_by_level(qwen_pair_stats)
#     qwen_acc_per_level = acc_by_level(qwen_pair_stats)

#     print("\n========================")
#     print("FOOLING RATE PER LEVEL")
#     print("========================\n")
#     for lvl, stats in qwen_fooling_rate_per_level.items():
#         print(f"{lvl}: Fooling Rate = {stats['fooled']}/{stats['total']} "
#               f"({stats['rate']:.2f})")

#     print("\n========================")
#     print("ACCURACY PER LEVEL")
#     print("========================\n")
#     for lvl, stats in qwen_acc_per_level.items():
#         print(f"{lvl}:  "
#             f"Acc(correct caption) = {stats['accuracy_correct_caption']:.2f},  "
#             f"Acc(incorrect caption) = {stats['accuracy_incorrect_caption']:.2f}")