# Baseline 1 Evaluation

### Initial package installation


In [None]:
# Install required packages
!pip install -q transformers accelerate bitsandbytes pillow pandas torch opencv-python einops
!pip install qwen-vl-utils

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m59.4/59.4 MB[0m [31m45.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting qwen-vl-utils
  Downloading qwen_vl_utils-0.0.14-py3-none-any.whl.metadata (9.0 kB)
Collecting av (from qwen-vl-utils)
  Downloading av-16.0.1-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (4.6 kB)
Downloading qwen_vl_utils-0.0.14-py3-none-any.whl (8.1 kB)
Downloading av-16.0.1-cp312-cp312-manylinux_2_28_x86_64.whl (40.5 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m40.5/40.5 MB[0m [31m48.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: av, qwen-vl-utils
Successfully installed av-16.0.1 qwen-vl-utils-0.0.14
Mounted at /content/drive


### Setup and Configurations

In [None]:
import os
import random
import re
import torch
import numpy as np
import pandas as pd
from PIL import Image
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
import torch.backends.cudnn as cudnn
import warnings

# Environment setup: silence Python warnings and set process-level switches.
warnings.filterwarnings("ignore")
os.environ.update({
    # CUDA caching-allocator option
    'PYTORCH_CUDA_ALLOC_CONF': 'expandable_segments:True',
    # NOTE(review): confirm huggingface_hub actually reads this variable, and that
    # setting it after `transformers` has been imported still has an effect.
    'HUGGINGFACE_HUB_FORCE_REDOWNLOAD': '1',
    # Synchronous kernel launches: easier to debug, slower to run
    'CUDA_LAUNCH_BLOCKING': "1",
})

def get_config(
    model_id="Qwen/Qwen2.5-VL-7B-Instruct",
    seed=520,
    image_root_dir="/content/drive/MyDrive/VLM/paper_figures/",
):
    """Build the static configuration dictionary for this notebook.

    Args:
        model_id: Hugging Face model identifier stored under "MODEL_ID".
        seed: Random seed stored under "SEED".
        image_root_dir: Folder holding the figure images, stored under "IMAGE_ROOT_DIR".

    Returns:
        dict with keys "MODEL_ID", "SEED", "DEVICE" ("cuda" when CUDA is
        available, otherwise "cpu"), "IMAGE_ROOT_DIR" and "PANEL_PROMPT"
        (a template with a single ``{claim}`` placeholder).
    """
    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"

    # Prompt template; the caller fills in {claim} via str.format.
    prompt_template = """
      You are a visual-language expert specialized in analyzing multi-panel scientific figures. Each panel may be labeled with letters (A, B, C, D, etc.) or alphanumeric labels (e.g., 1A, A5, 3B, etc.).

      Task
      1. Examine the provided figure carefully and identify all visible panel labels (A, B, 1A, A5, etc.).
      2. Determine which specific labeled panel best supports the following scientific claim:

      Claim:
      "{claim}"

      Instructions
      1. Read and interpret each labeled panel visually ‚Äì focus on trends, comparisons, correlations, or experimental results.
      2. Identify which panel (by its exact visible label) provides the strongest direct visual evidence supporting the claim.
      3. The label can be alphabetic or alphanumeric (e.g., A, B, 5A, A5, C2, etc.).
      4. If no panel clearly supports the claim, output Panel: None and briefly explain why.
      5. Provide a concise but clear explanation ‚Äì up to 3 lines having max 20 words, focusing on visual reasoning only.

      Output Format (must follow exactly)
      - Line 1: Panel: <exact visible label or None>
      - Line 2: Reason: <first line of reasoning>
      - Line 3 (optional): <second line of reasoning>
      - Line 4 (optional): <third line of reasoning>

      Do NOT include any extra commentary, numbering, markdown, or quotes.

      Provide your answer in the exact format above.
      """

    settings = dict(
        MODEL_ID=model_id,
        SEED=seed,
        DEVICE=device,
        IMAGE_ROOT_DIR=image_root_dir,
        PANEL_PROMPT=prompt_template,
    )
    return settings

def set_seeds(seed):
    """Seed Python, NumPy and PyTorch RNGs and put cuDNN in deterministic mode.

    Args:
        seed: Integer seed passed to every generator.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    # Disable cuDNN autotuning and request deterministic kernels.
    cudnn.benchmark = False
    cudnn.deterministic = True

# Build the configuration once and seed every RNG before any model work.
config = get_config()
set_seeds(config["SEED"])
# Only report keys that get_config() actually returns; it has no 'IMAGE_SIZE'
# entry, so referencing it here raised KeyError on a fresh kernel.
print(f"‚úÖ Configuration loaded. Running on {config['DEVICE']} (seed={config['SEED']})")

‚úÖ Configuration loaded. Running on cuda with image size (768, 768)


### Initialize and Load the Qwen Model

In [None]:
def initialize_model_and_processor(model_id, device):
    """Load the Qwen2.5-VL model and its processor.

    Args:
        model_id: Hugging Face model identifier to load.
        device: Device string (e.g. "cuda" or "cpu") the model is placed on.
            Previously this argument was ignored and "cuda" was hard-coded.

    Returns:
        (model, processor) tuple.
    """
    print(f"üß† Loading model: {model_id}...")
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        model_id,
        dtype=torch.float32,
        device_map=device,
    )
    # Grad-CAM back-propagates through the network, so make sure every
    # parameter participates in autograd.
    for param in model.parameters():
        param.requires_grad_(True)

    processor = AutoProcessor.from_pretrained(model_id)
    print("‚úÖ Model and processor loaded.")
    return model, processor

In [None]:
# Instantiate the model/processor pair from the notebook configuration.
model, processor = initialize_model_and_processor(
    model_id=config["MODEL_ID"],
    device=config["DEVICE"],
)

üß† Loading model: Qwen/Qwen2.5-VL-7B-Instruct...


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/216 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

chat_template.json: 0.00B [00:00, ?B/s]

‚úÖ Model and processor loaded.


### Load and Prepare the Dataset

In [None]:
from datasets import load_dataset
from huggingface_hub import hf_hub_download
from PIL import Image

def load_and_filter_data():
    """Load the MuSciClaims test split and keep only rows labelled "support".

    A row is kept when both 'label_3class' and 'label_2class' equal
    "support" (compared case-insensitively as strings).

    Returns:
        pandas.DataFrame: an independent copy of the matching rows.
    """
    all_rows = load_dataset("StonyBrookNLP/MuSciClaims", split="test").to_pandas()

    def _is_support(column):
        # Case-insensitive comparison on the stringified label column
        return all_rows[column].astype(str).str.lower() == "support"

    support_mask = _is_support('label_3class') & _is_support('label_2class')
    support_rows = all_rows[support_mask].copy()

    print(f"Loaded {len(all_rows)} total rows. Filtered down to: {len(support_rows)} rows")
    return support_rows

data_df = load_and_filter_data()

README.md: 0.00B [00:00, ?B/s]

test_set.jsonl: 0.00B [00:00, ?B/s]

Generating test split: 0 examples [00:00, ? examples/s]

Loaded 1515 total rows. Filtered down to: 505 rows


### Grad-CAM implementation used for evaluation

In [None]:
import pandas as pd
import torch
import numpy as np
import cv2
import os
import re
from PIL import Image
from einops import rearrange
import torch.nn as nn
import matplotlib.pyplot as plt

In [None]:
import math
import torch
import torch.nn as nn
import numpy as np
import cv2
from einops import rearrange

class GradCAM():
    """Grad-CAM over one layer of a vision-language model.

    The generated answer tokens are re-scored in a single forward pass; the
    summed logits of those tokens are back-propagated, and the activations of
    ``target_layer`` are weighted by their pooled gradients to build a heat
    map that is overlaid on the input image.

    Args:
        model: Model called as ``model(**inputs)``; its output must expose ``.logits``.
        target_layer: Module whose output activations/gradients are captured with hooks.
        input_token_len: Number of prompt tokens preceding the generated answer.
        output_ids: Full generated sequence (prompt + answer), indexed as
            ``output_ids[0]``; a tensor (any device) or a nested list.
    """

    def __init__(self, model, target_layer, input_token_len, output_ids):
        self.model = model
        self.target_layer = target_layer

        # Filled by the hooks during generate_cam()
        self.feature_maps = None
        self.gradients = None

        self.input_token_len = input_token_len
        self.output_ids = output_ids

        self._handles = []

        # output_ids may live on CUDA or be a plain list; index it on the CPU.
        out_ids_cpu = torch.as_tensor(output_ids).detach().cpu()
        # Vocabulary ids of the generated (non-prompt) tokens
        self.target_ids = out_ids_cpu[0][self.input_token_len:].long()

    def save_feature_maps(self, module, input, output):
        """Forward hook: remember the target layer's output tensor."""
        fmap = output[0] if isinstance(output, (list, tuple)) else output
        self.feature_maps = fmap
        # retain_grad() raises on tensors that do not require grad, so guard
        # explicitly instead of swallowing the exception.
        if fmap.requires_grad:
            fmap.retain_grad()

    def save_gradients(self, module, grad_input, grad_output):
        """Backward hook: remember the gradient w.r.t. the target layer's output."""
        grad_out = grad_output[0] if isinstance(grad_output, (list, tuple)) else grad_output
        self.gradients = grad_out.detach()

    @staticmethod
    def _normalized_cam(fmap, grad):
        """Turn captured activations and gradients into a square map in [0, 1].

        Args:
            fmap: Activations shaped (tokens, channels) or (batch, tokens, channels).
            grad: Gradients with the same shape as ``fmap``.

        Returns:
            2-D float32 numpy array of shape (side, side), max-normalised.

        Raises:
            RuntimeError: if the shapes are unexpected or there are no tokens.
        """
        # Strip the batch dimension from BOTH tensors (only fmap used to be sliced).
        if fmap.ndim == 3:
            fmap, grad = fmap[0], grad[0]
        if fmap.ndim != 2 or grad.shape != fmap.shape:
            raise RuntimeError(
                f"Unexpected Grad-CAM tensor shapes: feature maps {tuple(fmap.shape)}, "
                f"gradients {tuple(grad.shape)}"
            )

        num_tokens = fmap.shape[0]
        # Largest square grid that fits; trailing tokens beyond side*side are dropped.
        side = math.isqrt(num_tokens)
        if side == 0:
            raise RuntimeError(f"Cannot reshape feature maps with num_tokens={num_tokens}")
        usable = side * side
        fmap = fmap[:usable].to(torch.float32)
        grad = grad[:usable].to(torch.float32)

        # Rectify gradients, then average over all token positions -> one weight per channel.
        channel_weights = torch.relu(grad).mean(dim=0)
        # Weighted channel average per token (non in-place, so the captured
        # activation tensor is never modified), laid out as a side x side grid.
        # NOTE(review): this assumes the layer emits tokens in row-major raster
        # order; confirm for the vision tower in use, since some towers
        # permute or window their tokens.
        cam = (fmap * channel_weights).mean(dim=1).reshape(side, side)

        heatmap = np.maximum(cam.cpu().numpy(), 0)
        peak = heatmap.max()
        return heatmap / (peak if peak > 0 else 1e-6)

    def generate_cam(self, image, inputs, image_len, text_len, threshold=0.5, alpha=0.4):
        """Compute the Grad-CAM heat map and its overlay on ``image``.

        Args:
            image: PIL image the heat map is resized to and blended onto.
            inputs: Keyword inputs for ``model(**inputs)``, already on the model's device.
            image_len: Unused; kept for interface compatibility.
            text_len: Unused; kept for interface compatibility.
            threshold: Normalised heat values below this are set to 0.
            alpha: Weight of the colour map when added to the image.

        Returns:
            (heatmap, superimposed_img): the JET colour map as a uint8 HxWx3
            array in OpenCV's BGR order, and the uint8 HxWx3 overlay in RGB order.

        Raises:
            RuntimeError: if there are no target tokens to score or the hooks
                captured nothing.
        """
        # Register hooks fresh for this call; they are always removed in `finally`.
        f_handle = self.target_layer.register_forward_hook(self.save_feature_maps)
        b_handle = self.target_layer.register_full_backward_hook(self.save_gradients)
        self._handles = [f_handle, b_handle]

        try:
            self.model.eval()
            self.model.zero_grad()

            out = self.model(**inputs)
            logits = out.logits  # (batch, seq_len, vocab)

            seq_len = logits.shape[1]
            start = self.input_token_len
            end_idx = min(start + self.target_ids.numel(), seq_len)
            if start >= end_idx:
                raise RuntimeError(f"Invalid token index range for Grad-CAM: start {start} >= end {end_idx}")

            # In a causal LM the logits at position p score the token at p + 1,
            # so the token at position t is read from logits[t - 1]. A token at
            # position 0 has no preceding logit and is skipped.
            first = max(start, 1)
            targ_ids = self.target_ids[first - start:end_idx - start].to(logits.device)
            if targ_ids.numel() == 0:
                raise RuntimeError("No target token has a preceding position to score for Grad-CAM")
            token_logits = logits[0, first - 1:end_idx - 1, :]  # (L, V)
            positions = torch.arange(token_logits.shape[0], device=token_logits.device)
            target_logits = token_logits[positions, targ_ids].sum()

            # Backward pass fills self.gradients through the backward hook.
            target_logits.backward()

            if self.feature_maps is None or self.gradients is None:
                raise RuntimeError("Grad-CAM hooks failed to capture feature maps or gradients")

            heatmap = self._normalized_cam(self.feature_maps.detach(), self.gradients)
            heatmap[heatmap < threshold] = 0

            width, height = image.size
            heatmap = cv2.resize(heatmap, (width, height))
            heatmap = np.uint8(255 * heatmap)
            heatmap = cv2.applyColorMap(heatmap, cv2.COLORMAP_JET)  # BGR order

            # convert("RGB") normalises grayscale / palette / RGBA inputs to HxWx3.
            original_image = np.array(image.convert("RGB"))

            # Blend in a single channel order: the colour map is BGR while the
            # PIL image is RGB, so flip the colour map before adding.
            heatmap_rgb = cv2.cvtColor(heatmap, cv2.COLOR_BGR2RGB)
            superimposed_img = heatmap_rgb * alpha + original_image
            superimposed_img = np.clip(superimposed_img, 0, 255).astype(np.uint8)

            return heatmap, superimposed_img

        finally:
            for handle in self._handles:
                handle.remove()
            self._handles = []
            self.feature_maps = None
            self.gradients = None
            # Drop parameter gradients accumulated by the backward pass.
            self.model.zero_grad()


### Generate and store Grad-CAM overlays

In [None]:
# Folder on the mounted Drive where Grad-CAM overlay images are written.
output_dir = "/content/drive/MyDrive/VLM/Baseline1/overlay"
# cv2.imwrite reports failure via its return value rather than raising when the
# target folder is missing, so create the folder up front.
os.makedirs(output_dir, exist_ok=True)

In [None]:
def predict_panel_with_gradcam(image_path, claim, save_dir=None, perform_gradcam=True):
    """Ask the model which figure panel supports ``claim`` and optionally save a Grad-CAM overlay.

    Args:
        image_path: Path of the figure image.
        claim: Claim text inserted into config['PANEL_PROMPT'].
        save_dir: Folder for the overlay image; falls back to the global
            ``output_dir`` when None.
        perform_gradcam: When False, only the prediction is returned.

    Returns:
        dict with "panel" (upper-cased label parsed from the answer, or
        "Unknown"), "reason" (parsed explanation without special tokens) and
        "raw_output" (decoded answer including special tokens).

    A Grad-CAM failure is reported on stdout and does not affect the returned
    prediction.
    """
    # Open inside a context manager so the file handle is released, and force
    # RGB so grayscale / palette / RGBA figures all become 3-channel images.
    with Image.open(image_path) as source_image:
        image = source_image.convert("RGB").resize((384, 384), resample=Image.Resampling.BOX)
    prompt_text = config['PANEL_PROMPT'].format(claim=claim)

    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": prompt_text}
        ]
    }]

    # Convert chat messages into a text prompt using the model's chat template
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # Preprocess images/videos from the chat messages
    image_inputs, video_inputs = process_vision_info(messages)

    # Estimated image token count: one token per 28x28 pixel area
    image_len = image_inputs[0].size[0] * image_inputs[0].size[1] // (28 * 28)

    # Tokenize and prepare the full multimodal input, on the model's own device
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt"
    ).to(model.device)

    # Token count of the text that precedes the first <|image_pad|> placeholder
    ptext = processor(
        text=[text.split("<|image_pad|>")[0]],
        padding=True,
        return_tensors="pt",
    )
    text_len = ptext['input_ids'].shape[1]

    print(f"image_len: {image_len}, text_len: {text_len}")

    assert len(inputs.input_ids) == 1, inputs  # Ensure batch size = 1

    # --- Inference: autoregressive generation ---
    generated_ids = model.generate(**inputs, max_new_tokens=128)

    # Prompt length, used to separate the newly generated tokens
    input_token_len = inputs.input_ids.shape[1]
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]

    # Raw decode keeps special tokens so that `text + output_text` can be
    # re-tokenized for the Grad-CAM forward pass.
    output_text = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=False,
        clean_up_tokenization_spaces=False
    )[0]
    print(output_text)

    # Parse from a decode WITHOUT special tokens so that markers such as
    # <|im_end|> do not end up in the stored reason.
    answer_text = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )[0].strip()

    panel_match = re.search(r"Panel\s*[:\-]\s*([A-Za-z0-9]+)", answer_text)
    reason_match = re.search(r"Reason\s*[:\-]\s*(.+)", answer_text, re.DOTALL)

    result = {
        "panel": panel_match.group(1).upper() if panel_match else "Unknown",
        "reason": reason_match.group(1).strip() if reason_match else answer_text,
        "raw_output": output_text,
    }

    torch.cuda.empty_cache()

    if not perform_gradcam:
        return result

    # Prompt + generated answer, re-tokenized for the Grad-CAM forward pass.
    # Built only when Grad-CAM is actually requested.
    inputs_out = processor(
        text=[text + output_text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    ).to(model.device)

    overlay_dir = save_dir if save_dir is not None else output_dir

    try:
        target_layer = model.visual.blocks[-1].norm2
        gradcam = GradCAM(model, target_layer, input_token_len, generated_ids)

        print("  üé® Generating Grad-CAM...")

        _, superimposed_img = gradcam.generate_cam(
            image, inputs=inputs_out, image_len=image_len, text_len=text_len
        )

        os.makedirs(overlay_dir, exist_ok=True)
        name_only = os.path.splitext(os.path.basename(image_path))[0]
        overlay_path = os.path.join(overlay_dir, f"{name_only}_overlay.jpg")

        # The base image inside the overlay comes from a PIL image (RGB order),
        # while cv2.imwrite expects BGR. COLOR_RGB2BGR is the same channel swap
        # the previous COLOR_BGR2RGB performed, named for what it does here.
        overlay_bgr = cv2.cvtColor(superimposed_img, cv2.COLOR_RGB2BGR)
        # imwrite signals failure by returning False, so check before claiming success.
        if not cv2.imwrite(overlay_path, overlay_bgr):
            raise IOError(f"cv2.imwrite could not write {overlay_path}")
        print(f"Saved: {overlay_path}")

    except Exception as e:
        # Best effort: the prediction is still returned when the overlay fails.
        print(f"  ‚ö†Ô∏è Grad-CAM failed: {e}")

    return result

print("‚úÖ Prediction function ready")

‚úÖ Prediction function ready


In [None]:
# Pick a single example (positional row 139 of the filtered frame) for the Grad-CAM run;
# the double brackets keep the result a one-row DataFrame rather than a Series.
data_df_gradcam = data_df.iloc[[139]]

In [None]:
# Display the selected row to check which claim and figure will be analysed.
data_df_gradcam

Unnamed: 0,base_claim_id,claim_id,claim_text,label_3class,label_2class,paper_id,associated_figure_filepath,associated_figure_number,associated_figure_panels,caption,claim_from_which_random_figure_is_taken,domain
417,PIIS0092867425002831_10,PIIS0092867425002831_10_support,"Within each taxonomic class, gut microbes enco...",SUPPORT,SUPPORT,PIIS0092867425002831,paper_figures/bio_PIIS0092867425002831_images_...,Figure 5,[Panel C],Taxonomic distribution and genomic coding patt...,,biology


### Evaluate the model

In [None]:
# Run panel prediction + Grad-CAM on the selected rows and collect the results.
results = []
# Single source of truth for the figure folder (same value as the config default)
base_image_dir = config["IMAGE_ROOT_DIR"]
num_to_process = 1  # number of rows of data_df_gradcam to evaluate

rows_to_process = data_df_gradcam.head(num_to_process)
# Use the real row count so the progress label is right when fewer rows exist
total = len(rows_to_process)

for idx, (_, row) in enumerate(rows_to_process.iterrows()):
    # The dataset stores a relative path; only the file name is used locally
    img_name = os.path.basename(str(row["associated_figure_filepath"])).strip()
    claim = str(row["claim_text"]).strip()
    img_path = os.path.join(base_image_dir, img_name)
    ass_panel = str(row.get("associated_figure_panels", "")).strip()

    print(f"\n{'='*60}")
    print(f"üîç [{idx+1}/{total}] {img_name}")
    print(f"üìã Claim: {claim[:80]}...")
    print(f"üéØ Ground Truth: {ass_panel}")

    out = predict_panel_with_gradcam(img_path, claim, perform_gradcam=True)

    print(f"‚úÖ Predicted: {out['panel']}")

    results.append({
        "imagename": img_name,
        "claim": claim,
        "predicted_panel": out["panel"],
        "ground_truth_panel": ass_panel,
        "reason": out["reason"]
    })

# NOTE(review): predicted_panel is a bare label (e.g. "B") while
# ground_truth_panel is the stringified dataset value (e.g. "['Panel C']");
# normalise one of them before computing accuracy.
results_df = pd.DataFrame(results)
display(results_df)


üîç [1/1] bio_PIIS0092867425002831_images_figure_5.jpg
üìã Claim: Within each taxonomic class, gut microbes encode a larger set of CMs compared wi...
üéØ Ground Truth: ['Panel C']
image_len: 196, text_len: 15
Panel: B
Reason: Box plots show higher median number of CMs in Firmicutes compared to other classes.
None<|im_end|>
  üé® Generating Grad-CAM...
Saved: /content/drive/MyDrive/VLM/Baseline11/overlay/bio_PIIS0092867425002831_images_figure_5_overlay.jpg
‚úÖ Predicted: B


Unnamed: 0,imagename,claim,predicted_panel,ground_truth_panel,reason
0,bio_PIIS0092867425002831_images_figure_5.jpg,"Within each taxonomic class, gut microbes enco...",B,['Panel C'],Box plots show higher median number of CMs in ...
