# Baseline 0 Evaluation

### Initial package installation


In [1]:
# Install required packages
!pip install -q transformers accelerate bitsandbytes pillow pandas torch opencv-python einops
!pip install qwen-vl-utils

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m45.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting qwen-vl-utils
  Downloading qwen_vl_utils-0.0.14-py3-none-any.whl.metadata (9.0 kB)
Collecting av (from qwen-vl-utils)
  Downloading av-16.0.1-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (4.6 kB)
Downloading qwen_vl_utils-0.0.14-py3-none-any.whl (8.1 kB)
Downloading av-16.0.1-cp312-cp312-manylinux_2_28_x86_64.whl (40.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.5/40.5 MB[0m [31m56.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: av, qwen-vl-utils
Successfully installed av-16.0.1 qwen-vl-utils-0.0.14
Mounted at /content/drive


### Setup and Configurations

In [18]:
import os
import random
import re
import torch
import numpy as np
import pandas as pd
from PIL import Image
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
import torch.backends.cudnn as cudnn
import warnings

# Clean Environment
warnings.filterwarnings("ignore")
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
os.environ['HUGGINGFACE_HUB_FORCE_REDOWNLOAD'] = '1'

def get_config(
    model_id="Qwen/Qwen2.5-VL-7B-Instruct",
    seed=520,
    image_size=(192, 192),
    image_root_dir="/content/drive/MyDrive/VLM/images",
):
    """Returns a dictionary containing all configuration settings, excluding dynamic values."""
    device = "cuda" if torch.cuda.is_available() else "cpu"

    PANEL_PROMPT = """
      You are a visual-language expert specialized in analyzing multi-panel scientific figures. Each panel may be labeled with letters (A, B, C, D, etc.) or alphanumeric labels (e.g., 1A, A5, 3B, etc.).

      Task
      1. Examine the provided figure carefully and identify all visible panel labels (A, B, 1A, A5, etc.).
      2. Determine which specific labeled panel best supports the following scientific claim:

      Claim:
      "{claim}"

      Instructions
      1. Read and interpret each labeled panel visually – focus on trends, comparisons, correlations, or experimental results.
      2. Identify which panel (by its exact visible label) provides the strongest direct visual evidence supporting the claim.
      3. The label can be alphabetic or alphanumeric (e.g., A, B, 5A, A5, C2, etc.).
      4. If no panel clearly supports the claim, output Panel: None and briefly explain why.
      5. Provide a concise but clear explanation – up to 3 lines having max 20 words, focusing on visual reasoning only.

      Output Format (must follow exactly)
      - Line 1: Panel: <exact visible label or None>
      - Line 2: Reason: <first line of reasoning>
      - Line 3 (optional): <second line of reasoning>
      - Line 4 (optional): <third line of reasoning>

      Do NOT include any extra commentary, numbering, markdown, or quotes.

      Provide your answer in the exact format above.
      """

    return {
        "MODEL_ID": model_id,
        "SEED": seed,
        "DEVICE": device,
        "IMAGE_SIZE": image_size,
        "IMAGE_ROOT_DIR": image_root_dir,
        "PANEL_PROMPT": PANEL_PROMPT
    }

def set_seeds(seed):
    """Sets random seeds for reproducibility."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    cudnn.benchmark = False
    cudnn.deterministic = True

config = get_config()
set_seeds(config["SEED"])
print(f"✅ Configuration loaded. Running on {config['DEVICE']}")

✅ Configuration loaded. Running on cuda


### Initialize and Load the Qwen Model

In [3]:
def initialize_model_and_processor(model_id, device):
    """Loads the Qwen2.5-VL model and processor."""
    print(f"🧠 Loading model: {model_id}...")
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        model_id,
        dtype=torch.float16,
        device_map="cuda"
    )
    for name, param in model.named_parameters():
        param.requires_grad_(True)

    processor = AutoProcessor.from_pretrained(model_id)
    print("✅ Model and processor loaded.")
    return model, processor

In [4]:
model, processor = initialize_model_and_processor(config["MODEL_ID"], config["DEVICE"])

🧠 Loading model: Qwen/Qwen2.5-VL-7B-Instruct...


config.json: 0.00B [00:00, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

model-00004-of-00005.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00005-of-00005.safetensors:   0%|          | 0.00/1.09G [00:00<?, ?B/s]

model-00003-of-00005.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00001-of-00005.safetensors:   0%|          | 0.00/3.90G [00:00<?, ?B/s]

model-00002-of-00005.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/216 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

chat_template.json: 0.00B [00:00, ?B/s]

✅ Model and processor loaded.


### Load and Prepare the Dataset

In [5]:
from datasets import load_dataset
from huggingface_hub import hf_hub_download
from PIL import Image

def load_and_filter_data():
    """Loads the Dataset from Hugging Face and filters rows based on 'support' labels."""
    ds = load_dataset("StonyBrookNLP/MuSciClaims", split="test")
    df = ds.to_pandas()

    filtered_df = df[
        (df['label_3class'].astype(str).str.lower() == "support") &
        (df['label_2class'].astype(str).str.lower() == "support")
    ].copy()

    print(f"Loaded {len(df)} total rows. Filtered down to: {len(filtered_df)} rows")
    return filtered_df

data_df = load_and_filter_data()

README.md: 0.00B [00:00, ?B/s]

test_set.jsonl: 0.00B [00:00, ?B/s]

Generating test split: 0 examples [00:00, ? examples/s]

Loaded 1515 total rows. Filtered down to: 505 rows


In [6]:
data_df.head()

Unnamed: 0,base_claim_id,claim_id,claim_text,label_3class,label_2class,paper_id,associated_figure_filepath,associated_figure_number,associated_figure_panels,caption,claim_from_which_random_figure_is_taken,domain
0,PIIS0092867425000455_1,PIIS0092867425000455_1_support,"Finally, introduction of a stop codon in the S...",SUPPORT,SUPPORT,PIIS0092867425000455,paper_figures/bio_PIIS0092867425000455_images_...,Figure 1,[Panel I],A method for long-term visualization of single...,,biology
3,PIIS0092867425000455_4,PIIS0092867425000455_4_support,"Interestingly, when measuring GFP foci intensi...",SUPPORT,SUPPORT,PIIS0092867425000455,paper_figures/bio_PIIS0092867425000455_images_...,Figure 2,[Panel A],Transient collisions between translating ribos...,,biology
6,PIIS0092867425000455_6,PIIS0092867425000455_6_support,While plateau traces did show some heterogenei...,SUPPORT,SUPPORT,PIIS0092867425000455,paper_figures/bio_PIIS0092867425000455_images_...,Figure 2,[Panel C],Transient collisions between translating ribos...,,biology
9,PIIS0092867425000455_7,PIIS0092867425000455_7_support,"In this scenario, the trailing ribosome is not...",SUPPORT,SUPPORT,PIIS0092867425000455,paper_figures/bio_PIIS0092867425000455_images_...,Figure 4,[Panel I],Ribosome cooperativity reduces ribosome pausin...,,biology
12,PIIS0092867425000455_8,PIIS0092867425000455_8_support,Comparing experiments with simulations reveale...,SUPPORT,SUPPORT,PIIS0092867425000455,paper_figures/bio_PIIS0092867425000455_images_...,Figure 4,[Panel K],Ribosome cooperativity reduces ribosome pausin...,,biology


In [7]:
data_df.columns

Index(['base_claim_id', 'claim_id', 'claim_text', 'label_3class',
       'label_2class', 'paper_id', 'associated_figure_filepath',
       'associated_figure_number', 'associated_figure_panels', 'caption',
       'claim_from_which_random_figure_is_taken', 'domain'],
      dtype='object')

In [8]:
type(data_df.associated_figure_panels[0])

numpy.ndarray

In [16]:
config

{'MODEL_ID': 'Qwen/Qwen2.5-VL-7B-Instruct',
 'SEED': 520,
 'DEVICE': 'cuda',
 'IMAGE_ROOT_DIR': '/content/drive/MyDrive/VLM/images',
 'PANEL_PROMPT': '\n      You are a visual-language expert specialized in analyzing multi-panel scientific figures. Each panel may be labeled with letters (A, B, C, D, etc.) or alphanumeric labels (e.g., 1A, A5, 3B, etc.).\n\n      Task\n      1. Examine the provided figure carefully and identify all visible panel labels (A, B, 1A, A5, etc.).\n      2. Determine which specific labeled panel best supports the following scientific claim:\n\n      Claim:\n      "{claim}"\n\n      Instructions\n      1. Read and interpret each labeled panel visually – focus on trends, comparisons, correlations, or experimental results.\n      2. Identify which panel (by its exact visible label) provides the strongest direct visual evidence supporting the claim.\n      3. The label can be alphabetic or alphanumeric (e.g., A, B, 5A, A5, C2, etc.).\n      4. If no panel clearly 

### Generate predictions

In [10]:
def predict_panel(model, processor, image_path, claim, config):
    """
    Core function for multimodal inference using Qwen2.5-VL.

    Uses config for DEVICE, IMAGE_SIZE, and PANEL_PROMPT.
    """
    image_size = config["IMAGE_SIZE"]
    device = config["DEVICE"]
    prompt_template = config["PANEL_PROMPT"]

    image = Image.open(image_path).resize(image_size, resample=Image.Resampling.BOX)
    prompt_text = prompt_template.format(claim=claim)

    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": prompt_text}
        ]
    }]

    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)

    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    ).to(device)

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = generated_ids[:, inputs.input_ids.shape[1]:]

    output_text = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=False,
        clean_up_tokenization_spaces=False
    )[0]

    panel_match = re.search(r"Panel\s*[:\-]\s*([A-Za-z0-9]+|None)", output_text, re.IGNORECASE)
    reason_match = re.search(r"Reason\s*[:\-]\s*(.+)", output_text, re.DOTALL | re.IGNORECASE)

    panel = panel_match.group(1).upper() if panel_match else "Unknown"
    reason = reason_match.group(1).strip() if reason_match else output_text

    return {"panel": panel, "reason": reason, "raw_output": output_text}

### Evaluate Model's performance

In [11]:
import os
import re
import numpy as np

def clean_panel_labels(raw_panels):
    """
    Takes the raw `associated_figure_panels` entry (numpy array or list of str)
    and extracts clean labels like ['A', 'C', '4D'].
    """
    if isinstance(raw_panels, np.ndarray):
        raw_panels = raw_panels.tolist()

    clean = []

    for item in raw_panels:
        s = str(item).strip().upper()

        # Case 1: "Panel I" or "Figure 4A"
        m = re.search(r"(PANEL|FIGURE)\s+([A-Z0-9]+)", s)
        if m:
            clean.append(m.group(2))
            continue

        # Case 2: Already a label like "A", "B", "3C", "12D"
        if re.fullmatch(r"[A-Z0-9]{1,3}", s):
            clean.append(s)
            continue

    return clean

def evaluate_model(df, model, processor, config, num_to_process):
    matches = 0
    results = []

    base_image_dir = config["IMAGE_ROOT_DIR"]

    for idx, (i, row) in enumerate(df.head(num_to_process).iterrows()):
        img_name = os.path.basename(str(row["associated_figure_filepath"]))
        claim = str(row["claim_text"]).strip()
        img_path = os.path.join(base_image_dir, img_name)

        raw_panels = row["associated_figure_panels"]
        ass_panel_clean = clean_panel_labels(raw_panels)

        print(f"\n{'='*60}")
        print(f"[{idx+1}/{num_to_process}] {img_name}")
        print(f"Claim: {claim[:80]}...")
        print(f"Ground Truth: {ass_panel_clean}")

        # run model
        try:
            out = predict_panel(model, processor, img_path, claim, config)
            predicted = out.get("panel", "").strip().upper()
        except Exception as e:
            print(f"Error predicting for {img_name}: {e}")
            out = {"panel": "ERROR", "reason": str(e), "raw_output": f"Error: {e}"}
            predicted = "ERROR"

        print(f"Predicted: {predicted}")
        print(f"Reason: {out.get('reason', '')}")
        print(f"Raw Output: {out.get('raw_output', '')}")

        # partial (robust) matfching
        is_match = False
        for true_label in ass_panel_clean:
            # exact match
            if predicted == true_label:
                is_match = True
                break
            # partial: predicted == last char of true
            if len(true_label) > 1 and predicted == true_label[-1]:
                is_match = True
                break

        if is_match:
            matches += 1

        results.append({
            "claim": claim,
            "ground_truth": ass_panel_clean,
            "prediction": predicted,
            "is_match": is_match,
            "reason": out.get("reason", "")
        })

    accuracy = (matches / num_to_process) * 100 if num_to_process > 0 else 0

    print(f"\n{'='*60}")
    print(f"Total processed: {num_to_process}")
    print(f"Accuracy: {accuracy:.2f}% ({matches}/{num_to_process})")

    return results, accuracy

In [19]:
SAMPLES_TO_TEST = len(data_df)
evaluation_results, final_accuracy = evaluate_model(
            data_df,
            model,
            processor,
            config,
            num_to_process=SAMPLES_TO_TEST # Dynamic parameter
        )

[1;30;43mStreaming output truncated to the last 5000 lines.[0m

[24/505] bio_PIIS0092867424014788_images_figure_3.jpg
Claim: In some cases, even close homologs with >80% sequence identity when compared wit...
Ground Truth: ['A']
Predicted: A
Reason: The box plot shows a range of binding scores for proteins with high sequence identity, indicating variability in binding predictions.
- Reason: Some proteins with >80% sequence identity have low predicted binding scores, supporting the claim of spurious original predictions.<|im_end|>
Raw Output: - Panel: A
- Reason: The box plot shows a range of binding scores for proteins with high sequence identity, indicating variability in binding predictions.
- Reason: Some proteins with >80% sequence identity have low predicted binding scores, supporting the claim of spurious original predictions.<|im_end|>

[25/505] bio_PIIS0092867425000352_images_figure_3.jpg
Claim: In contrast to its dual binding modes in GPR133-GAIN, 5a-DHT was found to adopt .

In [20]:
data_df[data_df["associated_figure_filepath"].str.endswith(
    "bio_PIIS0092867425000509_images_figure_1.jpg"
)]


Unnamed: 0,base_claim_id,claim_id,claim_text,label_3class,label_2class,paper_id,associated_figure_filepath,associated_figure_number,associated_figure_panels,caption,claim_from_which_random_figure_is_taken,domain
147,PIIS0092867425000509_2,PIIS0092867425000509_2_support,This corresponded with less pro-inflammatory c...,SUPPORT,SUPPORT,PIIS0092867425000509,paper_figures/bio_PIIS0092867425000509_images_...,Figure 1,"[Panel R, Panel S, Panel T]",Neutrophils produce unique LAND-Vs with anti-i...,,biology
150,PIIS0092867425000509_3,PIIS0092867425000509_3_support,"In a lethal S. aureus pneumonia model, mice tr...",SUPPORT,SUPPORT,PIIS0092867425000509,paper_figures/bio_PIIS0092867425000509_images_...,Figure 1,"[Panel U, Panel V]",Neutrophils produce unique LAND-Vs with anti-i...,,biology
153,PIIS0092867425000509_4,PIIS0092867425000509_4_support,"Notably, despite reducing inflammation, LAND-V...",SUPPORT,SUPPORT,PIIS0092867425000509,paper_figures/bio_PIIS0092867425000509_images_...,Figure 1,"[Panel W, Panel X]",Neutrophils produce unique LAND-Vs with anti-i...,,biology
