# Baseline 3 Evaluation

### Initial package installations

In [1]:
# Install required packages
# NOTE(review): versions are unpinned, so a fresh runtime may resolve different
# releases than the ones this notebook was last executed with.
!pip install -q transformers accelerate bitsandbytes pillow pandas torch opencv-python einops
# qwen-vl-utils provides `process_vision_info`, imported in the setup cell below.
!pip install qwen-vl-utils

# Mount Google Drive
# The figure images and the Grad-CAM overlay directory used later both live
# under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m43.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting qwen-vl-utils
  Downloading qwen_vl_utils-0.0.14-py3-none-any.whl.metadata (9.0 kB)
Collecting av (from qwen-vl-utils)
  Downloading av-16.0.1-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (4.6 kB)
Downloading qwen_vl_utils-0.0.14-py3-none-any.whl (8.1 kB)
Downloading av-16.0.1-cp312-cp312-manylinux_2_28_x86_64.whl (40.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.5/40.5 MB[0m [31m64.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: av, qwen-vl-utils
Successfully installed av-16.0.1 qwen-vl-utils-0.0.14
Mounted at /content/drive


### Setup and Configurations


In [2]:
import os
import random
import re
import torch
import numpy as np
import pandas as pd
from PIL import Image
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
import torch.backends.cudnn as cudnn
import warnings

# Clean Environment
# Silence every Python warning for the rest of the session.
warnings.filterwarnings("ignore")
# Allocator hint for PyTorch's CUDA caching allocator.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
# NOTE(review): confirm that huggingface_hub actually honours this variable;
# if it does, every model load re-downloads the checkpoint shards.
os.environ['HUGGINGFACE_HUB_FORCE_REDOWNLOAD'] = '1'
# NOTE(review): presumably a debugging aid (synchronous CUDA launches) — it is
# expected to slow inference, so confirm it is still wanted for full runs.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

def get_config(
    model_id="Qwen/Qwen2.5-VL-7B-Instruct",
    seed=520,
    image_root_dir="/content/drive/MyDrive/VLM/paper_figures/",
):
    """Build the static configuration dictionary used across the notebook.

    Args:
        model_id: Hugging Face model identifier to load.
        seed: Random seed stored under ``"SEED"``.
        image_root_dir: Directory prefix under which figure images are looked up.

    Returns:
        dict with the keys ``MODEL_ID``, ``SEED``, ``DEVICE`` (``"cuda"`` when
        CUDA is available, otherwise ``"cpu"``), ``IMAGE_ROOT_DIR``,
        ``PANEL_PROMPT`` (template with a ``{claim}`` placeholder) and
        ``VERIFICATION_PROMPT`` (template with ``{panel_label}`` and
        ``{claim}`` placeholders).
    """
    # Template asking the model to pick the panel that best supports a claim.
    panel_prompt = """
    You are a visual-language expert specialized in analyzing multi-panel scientific figures. Each panel may be labeled with letters (A, B, C, D, etc.) or alphanumeric labels (e.g., 1A, A5, 3B, etc.).

    Task
    1. Examine the provided figure carefully and identify all visible panel labels (A, B, 1A, A5, etc.).
    2. Determine which specific labeled panel best supports the following scientific claim:

    Claim:
    "{claim}"

    Instructions
    1. Read and interpret each labeled panel visually – focus on trends, comparisons, correlations, or experimental results.
    2. Identify which panel (by its exact visible label) provides the strongest direct visual evidence supporting the claim.
    3. The label can be alphabetic or alphanumeric (e.g., A, B, 5A, A5, C2, etc.).
    4. If no panel clearly supports the claim, output Panel: None and briefly explain why.
    5. Provide a concise but clear explanation – up to 3 lines having max 20 words, focusing on visual reasoning only.

    Output Format (must follow exactly)
    - Line 1: Panel: <exact visible label or None>
    - Line 2: Reason: <first line of reasoning>
    - Line 3 (optional): <second line of reasoning>
    - Line 4 (optional): <third line of reasoning>

    Do NOT include any extra commentary, numbering, markdown, or quotes.

    Provide your answer in the exact format above.
    """

    # Template asking the model for a SUPPORT / NON_SUPPORT verdict on one panel.
    verification_prompt = """
    You are a visual-language expert specialized in analyzing multi-panel scientific figures.
    Each panel may be labeled with letters (A, B, C, D, etc.) or alphanumeric labels (e.g., 1A, A5, 3B, etc.).
    You will be given a specific ground truth sub-panel from a figure and a scientific claim.
    Your task is to determine if the visual evidence in that specific panel supports the claim.

    Input Data
    - Target Panel: "{panel_label}" (Look ONLY at this panel in the figure)
    - Claim: "{claim}"

    Instructions
    1. Locate the assigned 'Target Panel' in the image. Ignore all other panels.
    2. Analyze the data in that panel (visual trends, bar heights, scatter plots, or microscopy features, comparisons, correlations, or experimental results).
    3. Compare the visual data against the Claim.
    4. Determine the verdict:
      - SUPPORT: The visual trends/data directly match the claim (e.g., Claim says 'increase', Graph shows rising slope).
      - NON_SUPPORT: The visual data contradicts the claim (e.g., Claim says 'increase', Graph shows flat/drop) OR the panel does not contain relevant information.

    Output Format (Strict)
    - Verdict: <SUPPORT or NON_SUPPORT>
    - Reason: <One short sentence (max 20 words) describing the specific visual feature (slope, bar difference, spatial arrangement) that justifies your verdict.>

    Do NOT include any extra text, markdown, or intros.
    """

    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"

    settings = dict(
        MODEL_ID=model_id,
        SEED=seed,
        DEVICE=device,
        IMAGE_ROOT_DIR=image_root_dir,
    )
    settings["PANEL_PROMPT"] = panel_prompt
    settings["VERIFICATION_PROMPT"] = verification_prompt
    return settings

def set_seeds(seed):
    """Seed the Python, NumPy and PyTorch RNGs and pin cuDNN to deterministic mode.

    Args:
        seed: Integer seed applied to every random number generator.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    # CUDA generators are only touched when a GPU is actually present.
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

    cudnn.deterministic = True
    cudnn.benchmark = False

# Build the shared configuration once and seed every RNG from it so the
# later cells read their settings from `config`.
config = get_config()
set_seeds(config["SEED"])
print(f"✅ Configuration loaded. Running on {config['DEVICE']}")

✅ Configuration loaded. Running on cuda


In [None]:
def setup_seeds(seed=520):
    """Seed all random number generators for reproducible runs.

    Mirrors ``set_seeds`` from the configuration cell: Python, NumPy and
    PyTorch (CPU and, when available, every CUDA device) are seeded and cuDNN
    is switched to deterministic mode.

    Args:
        seed: Seed value. Defaults to 520, the value this cell previously
            hard-coded, so existing ``setup_seeds()`` calls behave as before.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed the CUDA generators too, as set_seeds() does.
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    cudnn.benchmark = False
    cudnn.deterministic = True

setup_seeds()
# Always a torch.device (previously a plain "cpu" string on CPU-only hosts),
# so downstream code sees one consistent type.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Display the device object selected in the previous cell.
device

device(type='cuda')

In [None]:
# Display the device name stored in the shared configuration.
config['DEVICE']

'cuda'

### Initialize and Load the Qwen Model

In [None]:
def initialize_model_and_processor(model_id, device):
    """Load the Qwen2.5-VL model and its processor.

    Args:
        model_id: Hugging Face model identifier passed to ``from_pretrained``.
        device: Device the weights are placed on (e.g. ``"cuda"`` or
            ``"cpu"``). It is forwarded as ``device_map``; previously this
            argument was ignored and ``"cuda"`` was always used, which breaks
            on hosts where the configuration resolved to ``"cpu"``.

    Returns:
        Tuple ``(model, processor)``.
    """
    print(f"🧠 Loading model: {model_id}...")
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        model_id,
        dtype=torch.float32,
        device_map=device,
    )
    # Enable gradients on every parameter; the Grad-CAM code later in this
    # notebook backpropagates through the model.
    for param in model.parameters():
        param.requires_grad_(True)

    processor = AutoProcessor.from_pretrained(model_id)
    print("✅ Model and processor loaded.")
    return model, processor

In [None]:
# Load the model/processor named in the shared configuration; `model` and
# `processor` are used as globals by the inference function further down.
model, processor = initialize_model_and_processor(config["MODEL_ID"], config["DEVICE"])

🧠 Loading model: Qwen/Qwen2.5-VL-7B-Instruct...


config.json: 0.00B [00:00, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

model-00003-of-00005.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00002-of-00005.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00004-of-00005.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00001-of-00005.safetensors:   0%|          | 0.00/3.90G [00:00<?, ?B/s]

model-00005-of-00005.safetensors:   0%|          | 0.00/1.09G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/216 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

chat_template.json: 0.00B [00:00, ?B/s]

✅ Model and processor loaded.


In [None]:
# Display the module tree of the loaded model.
model

Qwen2_5_VLForConditionalGeneration(
  (model): Qwen2_5_VLModel(
    (visual): Qwen2_5_VisionTransformerPretrainedModel(
      (patch_embed): Qwen2_5_VisionPatchEmbed(
        (proj): Conv3d(3, 1280, kernel_size=(2, 14, 14), stride=(2, 14, 14), bias=False)
      )
      (rotary_pos_emb): Qwen2_5_VisionRotaryEmbedding()
      (blocks): ModuleList(
        (0-31): 32 x Qwen2_5_VLVisionBlock(
          (norm1): Qwen2RMSNorm((1280,), eps=1e-06)
          (norm2): Qwen2RMSNorm((1280,), eps=1e-06)
          (attn): Qwen2_5_VLVisionAttention(
            (qkv): Linear(in_features=1280, out_features=3840, bias=True)
            (proj): Linear(in_features=1280, out_features=1280, bias=True)
          )
          (mlp): Qwen2_5_VLMLP(
            (gate_proj): Linear(in_features=1280, out_features=3420, bias=True)
            (up_proj): Linear(in_features=1280, out_features=3420, bias=True)
            (down_proj): Linear(in_features=3420, out_features=1280, bias=True)
            (act_fn): SiLUAc

### Load and Prepare the Dataset

In [None]:
import pandas as pd

In [None]:
from datasets import load_dataset
from huggingface_hub import hf_hub_download
from PIL import Image

def load_and_filter_data():
    """Fetch the MuSciClaims test split and keep only fully supported claims.

    A row is kept when both ``label_3class`` and ``label_2class`` equal
    "support" (compared case-insensitively after casting to ``str``).

    Returns:
        A copy of the filtered rows as a pandas DataFrame.
    """
    full_df = load_dataset("StonyBrookNLP/MuSciClaims", split="test").to_pandas()

    # AND together one boolean mask per label column.
    is_support = pd.Series(True, index=full_df.index)
    for label_column in ("label_3class", "label_2class"):
        is_support &= full_df[label_column].astype(str).str.lower().eq("support")

    filtered_df = full_df.loc[is_support].copy()

    print(f"Loaded {len(full_df)} total rows. Filtered down to: {len(filtered_df)} rows")
    return filtered_df

data_df = load_and_filter_data()

README.md: 0.00B [00:00, ?B/s]

test_set.jsonl: 0.00B [00:00, ?B/s]

Generating test split: 0 examples [00:00, ? examples/s]

Loaded 1515 total rows. Filtered down to: 505 rows


In [None]:
# Preprocessing the claim from the dataset to only keep the relevant information
import re
import spacy

# Load the spacy model
# `nlp` is read as a global by denoise_claim() below for part-of-speech tagging.
nlp = spacy.load("en_core_web_sm")

# Explicit figure/panel reference: "Figure 1A", "Fig. 2", "Figure S1", "Panel B".
# The label must look like a label (digits with optional letters, or a single
# letter after whitespace) so ordinary phrases such as "Figure shows" survive.
_FIG_REF = r'(?:Figures?|Figs?\.?|Panels?)(?:\s*[A-Za-z]?\d+[A-Za-z]{0,2}|\s+[A-Za-z])\b'
_PAREN_FIG_REF_RE = re.compile(r'\(\s*(?:see)?\s*' + _FIG_REF + r'\s*\)', flags=re.IGNORECASE)
_FIG_REF_RE = re.compile(r'\b' + _FIG_REF, flags=re.IGNORECASE)
# Longer alternatives first plus a word boundary, so "Furthermore" is not cut
# down to "more" by the shorter "Further".
_LEADING_FILLER_RE = re.compile(
    r'^\s*(?:Furthermore|Further|Moreover|In addition|Finally|Thus)\b,?\s*',
    flags=re.IGNORECASE,
)


def denoise_claim(text: str) -> str:
    """Strip figure references, leading fillers and low-content words from a claim.

    Steps:
      1. Remove parenthesised references such as "(see Fig 1)" or "(Panel A)".
         This runs before step 2; otherwise step 2 would remove the inner
         "Fig 1" and leave "(see )" behind.
      2. Remove bare references such as "Figure 1A", "Fig. 2", "Panel B".
      3. Remove a leading discourse filler ("Furthermore,", "Thus", ...).
      4. Tokenize with the module-level spaCy pipeline ``nlp`` and keep only
         tokens whose part of speech is in ``KEEP_POS``, plus a small set of
         structural punctuation; articles, demonstratives and forms of "be"
         are dropped.

    Args:
        text: Raw claim text. ``None``, empty strings and non-string values
            (e.g. NaN from pandas) yield ``""``.

    Returns:
        The cleaned claim as a single whitespace-normalized string.
    """
    if not isinstance(text, str) or not text:
        return ""

    text = _PAREN_FIG_REF_RE.sub('', text)
    text = _FIG_REF_RE.sub('', text)
    text = _LEADING_FILLER_RE.sub('', text)

    # Semantic filtering of the text
    doc = nlp(text)

    KEEP_POS = {"NOUN", "PROPN", "VERB", "ADJ", "ADV", "NUM", "ADP", "PART", "SYM"}
    REMOVE_WORDS = {"the", "a", "an", "this", "that", "these", "those", "is", "are", "was", "were", "be", "been"}
    # Punctuation that defines structure; appended without a leading space.
    KEEP_PUNCT = {',', ':', ';', '.', '%', '<', '>', '='}
    # No space is inserted after these, so "p < 0.05" becomes "p<0.05".
    NO_SPACE_AFTER = {'<', '>', '=', '-'}

    cleaned_tokens = []
    for token in doc:
        if token.text.lower() in REMOVE_WORDS:
            continue

        if token.text in KEEP_PUNCT:
            cleaned_tokens.append(token.text)
            continue

        if token.pos_ in KEEP_POS:
            needs_space = bool(cleaned_tokens) and cleaned_tokens[-1] not in NO_SPACE_AFTER
            cleaned_tokens.append((" " if needs_space else "") + token.text)

    result = "".join(cleaned_tokens).strip()

    # Collapse whitespace runs and remove any space left before , . :
    result = re.sub(r'\s+', ' ', result)
    result = re.sub(r'\s+([,.:])', r'\1', result)

    return result.strip()

# Apply the above preprocessing logic to our dataset
# NOTE(review): this overwrites `claim_text` in place, so the raw claims are no
# longer available afterwards and re-running the cell denoises already
# denoised text.
data_df['claim_text'] = data_df['claim_text'].apply(denoise_claim)

In [None]:
# Inspect the ground-truth panel labels of the filtered claims.
data_df['associated_figure_panels']

Unnamed: 0,associated_figure_panels
0,[Panel I]
3,[Panel A]
6,[Panel C]
9,[Panel I]
12,[Panel K]
...,...
1500,[Figure 3D]
1503,[Figure 4A]
1506,[Figure 4C]
1509,[Figure 5A]


### Initializing GRAD-CAM for evaluation

In [None]:
import pandas as pd
import torch
import numpy as np
import cv2
import os
import re
from PIL import Image
from einops import rearrange
import torch.nn as nn
import matplotlib.pyplot as plt

In [None]:

class GradCAM():
    """Grad-CAM heatmaps for the answer generated by a vision-language model.

    The class captures the output of ``target_layer`` and the gradient flowing
    back into that output, weights each channel by its pooled (ReLU-ed)
    gradient and turns the result into a colour heatmap over the input image.

    Args:
        model: The model to run; called as ``model(**inputs)`` and expected to
            return an object with a ``logits`` attribute.
        target_layer: ``torch.nn.Module`` whose output is explained.
        input_token_len: Number of prompt tokens at the start of ``output_ids``.
        output_ids: Token ids of prompt + generated answer; only row 0 is used.
    """

    def __init__(self, model, target_layer, input_token_len, output_ids):
        self.model = model
        self.target_layer = target_layer
        self.feature_maps = None
        self.gradients = None

        self.input_token_len = input_token_len
        self.output_ids = output_ids

        # Ids of the generated (non-prompt) tokens whose logits are explained.
        self.target_ids = self.output_ids[0][self.input_token_len:]
        # Hooks are attached inside generate_cam() and removed again there.
        # Registering them here (as before) left one extra pair of hooks on
        # the shared layer for every GradCAM instance ever created.

    def save_feature_maps(self, module, input, output):
        """Forward hook: remember the target layer's output."""
        self.feature_maps = output

    def save_gradients(self, module, grad_input, grad_output):
        """Backward hook: remember the gradient w.r.t. the target layer's output."""
        self.gradients = grad_output[0].detach()

    @staticmethod
    def _grid_shape(num_tokens):
        """Return ``(h, w)`` with ``h * w == num_tokens`` and ``h`` as close to
        ``sqrt(num_tokens)`` as possible (``h <= w``)."""
        h = max(int(np.sqrt(num_tokens)), 1)
        while h > 1 and num_tokens % h != 0:
            h -= 1
        return h, num_tokens // h

    def generate_cam(self, image, inputs, image_len, text_len):
        """Run a forward/backward pass and build the Grad-CAM overlay.

        Args:
            image: PIL image the heatmap is resized to and blended with.
            inputs: Keyword arguments for ``self.model`` (prompt + answer).
            image_len: Unused; kept for interface compatibility.
            text_len: Unused; kept for interface compatibility.

        Returns:
            Tuple ``(heatmap, superimposed_img)``: ``heatmap`` is the uint8
            JET colour map in OpenCV's BGR channel order; ``superimposed_img``
            is a uint8 array in RGB channel order (heatmap blended at 0.4 onto
            the image).

        Raises:
            ValueError: If the prompt length or the logits length cannot cover
                the generated tokens.
            RuntimeError: If the hooks did not capture activations/gradients.
        """
        self.model.eval()
        self.model.zero_grad()
        self.feature_maps = None
        self.gradients = None

        num_targets = len(self.target_ids)
        if self.input_token_len < 1:
            raise ValueError("input_token_len must be >= 1 to locate the logits of generated tokens")

        handles = [
            self.target_layer.register_forward_hook(self.save_feature_maps),
            self.target_layer.register_full_backward_hook(self.save_gradients),
        ]
        try:
            out = self.model(**inputs)

            # A causal LM's logits at position t score the token at position
            # t + 1, so the logits of the generated tokens start one step
            # before the first generated token.
            start = self.input_token_len - 1
            step_logits = out.logits[0, start:start + num_targets, :]
            if step_logits.shape[0] != num_targets:
                raise ValueError(
                    f"Model returned logits for {step_logits.shape[0]} answer positions, "
                    f"expected {num_targets}"
                )
            token_ids = self.target_ids.to(device=step_logits.device, dtype=torch.long)
            positions = torch.arange(num_targets, device=step_logits.device)
            target_logits = step_logits[positions, token_ids].sum()

            target_logits.backward()
        finally:
            # Always detach the hooks so they do not pile up on the layer.
            for handle in handles:
                handle.remove()

        if self.feature_maps is None or self.gradients is None:
            raise RuntimeError("Grad-CAM hooks captured no activations/gradients for the target layer")

        print(f"Feature maps shape: {self.feature_maps.shape}")

        feature_maps = self.feature_maps.detach()
        gradients = self.gradients.detach()
        # Handle both [batch, num_tokens, channels] and [num_tokens, channels];
        # only the first batch element is used.
        if feature_maps.dim() == 3:
            feature_maps = feature_maps[0]
            gradients = gradients[0]

        actual_num_tokens = feature_maps.shape[0]

        # Lay the tokens out on a 2-D grid, square when possible.
        h, w = self._grid_shape(actual_num_tokens)
        if h != w:
            print(f"Warning: Feature map tokens ({actual_num_tokens}) don't form perfect square")

        print(f"Reshaping to spatial grid: {h}x{w} = {h*w} tokens")

        # NOTE(review): this assumes the tokens are in raster (row-major) order.
        # Qwen2.5-VL's vision tower may order patches by merge window instead —
        # confirm the heatmap is spatially aligned with the image.
        self.feature_maps = rearrange(feature_maps, '(h w) c -> c h w', h=h, w=w)
        self.gradients = torch.relu(rearrange(gradients, '(h w) c -> h w c', h=h, w=w))

        # One weight per channel: mean positive gradient over the spatial grid.
        pooled_gradients = torch.mean(self.gradients, dim=(0, 1))

        # Weight feature maps by pooled gradients (broadcast over h and w).
        activation = self.feature_maps * pooled_gradients[:, None, None]

        # Generate heatmap
        heatmap = torch.mean(activation.to(dtype=torch.float32), dim=0).cpu().numpy()
        heatmap = np.maximum(heatmap, 0)

        if np.max(heatmap) > 0:
            heatmap /= np.max(heatmap)

        # Using percentile-based thresholding instead of fixed threshold:
        # keep only the upper half of the positive activations.
        threshold = np.percentile(heatmap[heatmap > 0], 50) if np.any(heatmap > 0) else 0.5
        print(f"Using adaptive threshold: {threshold:.3f}")
        heatmap[heatmap < threshold] = 0

        # Resize to original image dimensions (cv2 expects (width, height)).
        heatmap = cv2.resize(heatmap, (image.size[0], image.size[1]))
        heatmap = np.uint8(255 * heatmap)
        heatmap = cv2.applyColorMap(heatmap, cv2.COLORMAP_JET)  # BGR

        # Create overlay. The PIL image is RGB while the colour map is BGR, so
        # the colour map is converted before blending; convert("RGB") also
        # makes grayscale/RGBA/palette images blendable.
        original_image = np.array(image.convert("RGB"))
        heatmap_rgb = cv2.cvtColor(heatmap, cv2.COLOR_BGR2RGB)
        superimposed_img = heatmap_rgb * 0.4 + original_image
        superimposed_img = np.clip(superimposed_img, 0, 255).astype(np.uint8)

        return heatmap, superimposed_img

### Generate GRAD-CAM

In [None]:
# Directory on the mounted Drive where Grad-CAM overlay images are written;
# read as a global by predict_verification_with_gradcam().
output_dir = "/content/drive/MyDrive/VLM/Baseline3/overlay"

In [None]:
import re
import os
import cv2
import torch
from PIL import Image

def _parse_verification_output(output_text):
    """Extract ``(verdict, reason)`` from the model's raw answer text."""
    # Drop chat special tokens such as <|im_end|> so they do not leak into the reason.
    clean_text = re.sub(r"<\|[^|>]*\|>", "", output_text).strip()

    # NON_SUPPORT is tried first and tolerates "NON-SUPPORT" / "NON SUPPORT".
    verdict_match = re.search(
        r"Verdict\s*[:\-]+\s*(NON[_\-\s]?SUPPORT|SUPPORT)", clean_text, re.IGNORECASE
    )
    reason_match = re.search(r"Reason\s*[:\-]+\s*(.+)", clean_text, re.DOTALL)

    if verdict_match is None:
        verdict = "UNKNOWN"
    elif verdict_match.group(1).upper().startswith("NON"):
        verdict = "NON_SUPPORT"
    else:
        verdict = "SUPPORT"
    reason = reason_match.group(1).strip() if reason_match else clean_text
    return verdict, reason


def predict_verification_with_gradcam(image_path, claim, ground_truth_panel, save_dir=None, perform_gradcam=True):
    """Ask the model whether one panel supports a claim; optionally save a Grad-CAM overlay.

    Uses the notebook globals ``config``, ``processor``, ``model``,
    ``GradCAM`` and (as fallback output directory) ``output_dir``.

    Args:
        image_path: Path of the figure image; it is resized to 384x384.
        claim: Claim text inserted into the verification prompt.
        ground_truth_panel: Panel label inserted into the verification prompt.
        save_dir: Directory for the overlay image. Falls back to the global
            ``output_dir`` when ``None`` (previously this argument was ignored).
        perform_gradcam: When ``False`` only the verdict is produced.

    Returns:
        dict with ``verdict`` ("SUPPORT", "NON_SUPPORT" or "UNKNOWN"),
        ``reason`` and ``raw_output``. After a successful Grad-CAM run it also
        has ``overlay_path``; after a failed one, ``gradcam_failed: True``.
    """
    # resize() returns a new in-memory image, so the file handle can be closed.
    with Image.open(image_path) as raw_image:
        image = raw_image.resize((384, 384), resample=Image.Resampling.BOX)
    prompt_text = config['VERIFICATION_PROMPT'].format(claim=claim, panel_label=ground_truth_panel)

    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": prompt_text}
        ]
    }]

    # Convert chat messages into a text prompt using the model's chat template
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # Preprocess images/videos from the chat messages
    image_inputs, video_inputs = process_vision_info(messages)

    # Estimate image token length (Qwen-VL uses 28x28 = 784 pixel patches)
    image_len = image_inputs[0].size[0] * image_inputs[0].size[1] // (28 * 28)

    # Tokenize and prepare full multimodal input for the model
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of assuming "cuda".
    inputs = inputs.to(model.device)

    # Prepare text-only version to compute text token length (before image_pad tokens)
    ptext = processor(
        text=[text.split("<|image_pad|>")[0]],
        padding=True,
        return_tensors="pt",
    )
    text_len = ptext['input_ids'].shape[1]

    print(f"image_len: {image_len}, text_len: {text_len}")

    assert len(inputs.input_ids) == 1, inputs  # Ensure batch size = 1

    # Run autoregressive generation
    generated_ids = model.generate(**inputs, max_new_tokens=128)

    # Length of input tokens, used to trim outputs
    input_token_len = len(inputs.input_ids[0])

    # Remove input tokens from the generated sequence to isolate new output
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]

    # Decode model output into text. Special tokens are kept because the raw
    # text is re-tokenized below for the Grad-CAM backward pass.
    output_text = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=False,
        clean_up_tokenization_spaces=False
    )[0]
    print(output_text)

    # We look for verdicts "SUPPORT" and "NON_SUPPORT"
    verdict, reason = _parse_verification_output(output_text)

    torch.cuda.empty_cache()

    # Early exit if Grad-CAM is not requested (before building the costly
    # prompt + answer inputs that only Grad-CAM needs).
    if not perform_gradcam:
        return {"verdict": verdict, "reason": reason, "raw_output": output_text}

    try:
        inputs_out = processor(
            text=[text + output_text],   # using [text + output_text] to backprop
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        )
        inputs_out = inputs_out.to(model.device)

        gradcam = GradCAM(model, model.visual.blocks[-1].norm2, input_token_len, generated_ids)

        print("  🎨 Generating Grad-CAM...")

        heatmap, superimposed_img = gradcam.generate_cam(image, inputs=inputs_out, image_len=image_len, text_len=text_len)

        # Write the overlay next to the other outputs, creating the folder if needed.
        target_dir = save_dir if save_dir is not None else output_dir
        os.makedirs(target_dir, exist_ok=True)
        name_only = os.path.splitext(os.path.basename(image_path))[0]
        overlay_path = os.path.join(target_dir, f"{name_only}_overlay.jpg")

        # Swap the R and B channels before handing the array to cv2.imwrite.
        overlay_swapped = cv2.cvtColor(superimposed_img, cv2.COLOR_BGR2RGB)
        if cv2.imwrite(overlay_path, overlay_swapped):
            print(f"Saved: {overlay_path}")
        else:
            # cv2.imwrite signals failure through its return value only.
            print(f"  ⚠️ Could not write overlay to {overlay_path}")

        return {"verdict": verdict, "reason": reason, "raw_output": output_text, "overlay_path": overlay_path}

    except Exception as e:
        # Best effort: a Grad-CAM failure must not lose the verdict.
        print(f"  ⚠️ Grad-CAM failed: {e}")
        import traceback
        traceback.print_exc()
        return {"verdict": verdict, "reason": reason, "raw_output": output_text, "gradcam_failed": True}

    finally:
        # Release the parameter gradients created by the backward pass.
        model.zero_grad(set_to_none=True)
        torch.cuda.empty_cache()

print("✅ Verification function ready")

print("✅ Verification function ready")

✅ Verification function ready


In [None]:
# Calculating accuracy of the model on the filtered claims.
# One record per processed image is collected in `results` for later inspection.
results = []
base_image_dir = config['IMAGE_ROOT_DIR']  # Update this path
num_to_process = len(data_df)
processed_images = 0
skipped_images = 0
matches = 0
for idx, (row_index, row) in enumerate(data_df.head(num_to_process).iterrows()):
    img_name = str(row["associated_figure_filepath"]).strip().split('/')[-1]
    claim = str(row["claim_text"]).strip()
    img_path = os.path.join(base_image_dir, img_name)

    # The panel column holds a list-like of labels; join it so the prompt
    # receives "Panel I" rather than the list repr "['Panel I']".
    panels = row.get("associated_figure_panels", "")
    if isinstance(panels, (list, tuple, np.ndarray)):
        ass_panel = ", ".join(str(panel).strip() for panel in panels)
    else:
        ass_panel = str(panels).strip()

    # Normalize case so the comparison matches the case-insensitive filter
    # used when the dataset was loaded.
    ass_verdict = str(row["label_2class"]).strip().upper()

    print(f"\n{'='*60}")
    print(f"🔍 [{idx+1}/{num_to_process}] {img_name}")
    print(f"📋 Claim: {claim[:80]}...")
    print(f"🎯 Ground Truth: {ass_panel}")
    print(f"🎯 Associated verdict: {ass_verdict}")

    if not os.path.exists(img_path):
        print(f"\n⚠️ Skipping [{idx+1}]: Image not found at {img_path}")
        skipped_images += 1
        continue

    processed_images += 1
    out = predict_verification_with_gradcam(img_path, claim, ass_panel, perform_gradcam=False)

    print(f"✅ Predicted: {out['verdict']}")
    is_match = out['verdict'] == ass_verdict
    if is_match:
        matches += 1

    results.append({
        "row_index": row_index,
        "image": img_name,
        "panel": ass_panel,
        "expected": ass_verdict,
        "predicted": out['verdict'],
        "reason": out['reason'],
        "match": is_match,
    })

# Guard against a run in which no image was found (division by zero).
if processed_images:
    accuracy = matches / processed_images * 100
    print(f"Accuracy: {accuracy}%")
else:
    accuracy = float("nan")
    print("Accuracy: N/A (no images were processed)")
print(f"Processed: {processed_images}, skipped (image missing): {skipped_images}")


🔍 [1/505] bio_PIIS0092867425000455_images_figure_1.jpg
📋 Claim: introduction of stop codon in SunTag frame, insertion of one additional nucleoti...
🎯 Ground Truth: ['Panel I']
🎯 Associated verdict: SUPPORT
image_len: 1369, text_len: 15
Verdict: SUPPORT  
Reason: Bar graph shows significantly lower GFP foci for stop codon compared to sense codon.<|im_end|>
✅ Predicted: SUPPORT

🔍 [2/505] bio_PIIS0092867425000455_images_figure_2.jpg
📋 Claim: Interestingly, measuring GFP foci intensity over time for socRNAs translated by ...
🎯 Ground Truth: ['Panel A']
🎯 Associated verdict: SUPPORT
image_len: 1369, text_len: 15
Verdict: SUPPORT  
Reason: The varying slopes of the curves indicate different elongation speeds among ribosomes.<|im_end|>
✅ Predicted: SUPPORT

🔍 [3/505] bio_PIIS0092867425000455_images_figure_2.jpg
📋 Claim: plateau traces show heterogeneity in slopes as well, heterogeneity substantially...
🎯 Ground Truth: ['Panel C']
🎯 Associated verdict: SUPPORT
image_len: 1369, text_len: 15
V