## **0. LOAD LIBRARY**

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch, os

## **1. LOAD BASE MODEL - INT4**

In [3]:
!pip install -U bitsandbytes



In [4]:
# Path to save model
DRIVE_PATH = "/content/drive/MyDrive/Meditron3-Gemma2-2B"

# Load model & tokenizer from Drive
print("1. Loading model from Drive...")
tokenizer = AutoTokenizer.from_pretrained(DRIVE_PATH, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    DRIVE_PATH,
    load_in_4bit=True,
    device_map="auto",
    trust_remote_code=True,
)
print("2. Model loaded successfully!")

1. Loading model from Drive...


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
The following generation flags are not valid and may be ignored: ['cache_implementation']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

2. Model loaded successfully!


## **2. EVALUATE WITH DIFFERENT DATASETS**

In [5]:
import json
from tqdm import tqdm
import os
import torch
import psutil
import time
import numpy as np
from tqdm import tqdm

In [6]:
# print out sample data
UNIFIED_DIR = "/content/drive/MyDrive/data_source/unified_format"
DATASETS = ["medqa", "emrqa", "pubmedqa", "medquad"]
SAMPLE_COUNT = 1
for ds_name in DATASETS:
    print(f"\n{'='*20} SAMPLES FOR: {ds_name.upper()} {'='*20}")
    file_path = os.path.join(UNIFIED_DIR, ds_name, "all.jsonl")
    with open(file_path, "r", encoding="utf-8") as f:
        for i, line in enumerate(f):
            if i >= SAMPLE_COUNT: break
            print(json.dumps(json.loads(line), indent=2, ensure_ascii=False))


{
  "id": "A_23-year-old_pregnant_woman_at_22_weeks",
  "dataset": "medqa",
  "split": "all",
  "context": null,
  "question": "A 23-year-old pregnant woman at 22 weeks gestation presents with burning upon urination. She states it started 1 day ago and has been worsening despite drinking more water and taking cranberry extract. She otherwise feels well and is followed by a doctor for her pregnancy. Her temperature is 97.7°F (36.5°C), blood pressure is 122/77 mmHg, pulse is 80/min, respirations are 19/min, and oxygen saturation is 98% on room air. Physical exam is notable for an absence of costovertebral angle tenderness and a gravid uterus. Which of the following is the best treatment for this patient?",
  "text": [
    "Ampicillin",
    "Ceftriaxone",
    "Ciprofloxacin",
    "Doxycycline",
    "Nitrofurantoin"
  ],
  "encode": [
    "A",
    "B",
    "C",
    "D",
    "E"
  ],
  "answer": "E"
}

{
  "id": "The_patient_was_admiWhat_is_her_current_",
  "dataset": "emrqa",
  "split": "

### **2.1 EVALUATE WITH DIFFERENT DATASETS (MCQ tasks)**

In [None]:
UNIFIED_DIR = "/content/drive/MyDrive/data_source/unified_format"

# Dataset medqua
ds_medqa = []
file_path = os.path.join(UNIFIED_DIR, "medqa", "all.jsonl")
with open(file_path, "r", encoding="utf-8") as f:
    for line in f:
        if line.strip():
            ds_medqa.append(json.loads(line))

ds_pubmedqa = []
file_path = os.path.join(UNIFIED_DIR, "pubmedqa", "all.jsonl")
with open(file_path, "r", encoding="utf-8") as f:
    for line in f:
        if line.strip():
            ds_pubmedqa.append(json.loads(line))

In [None]:
def generate_batch_answers_mcq(model, tokenizer, questions, options_list, contexts=None, encodes=None, max_new_tokens=30):
    """
    Generate answers for a batch of MCQ questions.
    """
    # Handle missing contexts or encodes
    if contexts is None:
        contexts = [None] * len(questions)
    if encodes is None:
        encodes = [None] * len(questions) # Fix error if some items lack 'encode'

    prompts = []
    for q, opts, ctx, enc in zip(questions, options_list, contexts, encodes):
        # Ensure encode is a list (e.g., ['A', 'B', 'C', 'D'])
        # If item lacks 'encode', create default encoding
        if enc is None:
            enc = [chr(65 + i) for i in range(len(opts))] # A, B, C...

        options_str = "\n".join([f"{e}. {t}" for e, t in zip(enc, opts)])
        prompt = f"""
You are a medical reasoning assistant. Read the context and question carefully, then choose the best answer.
Context:
{ctx or "No additional context."}
Question:
{q}
Options:
{options_str}
Please respond with only the letter corresponding to the best answer.
Answer:
"""
        prompts.append(prompt)

    # Tokenize the entire batch
    inputs = tokenizer(
        prompts,
        return_tensors="pt",
        padding=True,
        truncation=True
    ).to(model.device)

    # Generate for the entire batch
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            temperature=0.0
        )

    # Decode the entire batch
    decoded_batch = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    # Extract the answer parts
    answer_parts = [decoded.split("Answer:")[-1].strip() for decoded in decoded_batch]
    return answer_parts

In [None]:
# --- UPDATED EVALUATION FUNCTION (USES BATCHING) ---
def evaluate_model_mcq_task(model, tokenizer, dataset, batch_size=16, max_new_tokens=30): # <--- Added batch_size
    correct = 0
    total = len(dataset)
    latencies = [] # Will store per-sample latency
    total_generation_time = 0.0
    process = psutil.Process()

    if torch.cuda.is_available():
        torch.cuda.reset_peak_memory_stats()

    # Get model size on disk
    model_size_gb = 0.0
    model_dir = getattr(model, 'config', None).name_or_path
    if model_dir and os.path.isdir(model_dir): # <--- Added safety check
        try:
            model_size_bytes = sum(os.path.getsize(os.path.join(root, f))
                                   for root, _, files in os.walk(model_dir)
                                   for f in files)
            model_size_gb = model_size_bytes / (1024 ** 3)
        except Exception as e:
            print(f"Warning: Could not calculate model size. Error: {e}")

    # Iterate through the dataset in BATCHES
    for i in tqdm(range(0, total, batch_size), desc="Evaluating Batches"):
        batch_items = dataset[i:i + batch_size]
        num_in_batch = len(batch_items)

        # Prepare data for the batch
        batch_contexts = [item.get("context") for item in batch_items]
        batch_questions = [item["question"] for item in batch_items]
        batch_options = [item["text"] if "text" in item else item["options"] for item in batch_items]
        batch_encodes = [item.get("encode") for item in batch_items]
        batch_true_answers = [item["answer"].strip().upper() for item in batch_items]

        start_time = time.time()

        # Generate answers for the entire batch
        pred_answers = generate_batch_answers_mcq(
            model, tokenizer, batch_questions, batch_options,
            contexts=batch_contexts, encodes=batch_encodes,
            max_new_tokens=max_new_tokens
        )

        batch_time = time.time() - start_time
        total_generation_time += batch_time
        per_sample_latency = batch_time / num_in_batch
        latencies.extend([per_sample_latency] * num_in_batch)

        # Compare results for the batch
        for pred, true_answer in zip(pred_answers, batch_true_answers):
            pred_clean = pred.strip().upper()
            if pred_clean.startswith("ANSWER:"):
                pred_clean = pred_clean.replace("ANSWER:", "").strip()
            if "." in pred_clean:
                pred_clean = pred_clean.split(".")[0].strip()

            if pred_clean == true_answer:
                correct += 1

    # Get model performance metrics
    accuracy = correct / total if total > 0 else 0
    avg_latency = np.mean(latencies) if latencies else 0
    p95_latency = np.percentile(latencies, 95) if latencies else 0
    p99_latency = np.percentile(latencies, 99) if latencies else 0
    vram_peak_gb = torch.cuda.max_memory_allocated() / (1024 ** 3) if torch.cuda.is_available() else 0.0
    cpu_ram_gb = process.memory_info().rss / (1024 ** 3)

    result = {
        "accuracy": accuracy,
        "total_time_sec": total_generation_time, # This is the total generation time
        "avg_latency_sec": avg_latency,     # Average latency per sample
        "p95_latency_sec": p95_latency,
        "p99_latency_sec": p99_latency,
        "vram_peak_gb": vram_peak_gb,
        "cpu_ram_gb": cpu_ram_gb,
        "model_size_gb": model_size_gb,
        "total_samples": total,
        "batch_size": batch_size
    }

    print(f"\n--- Evaluation Complete ---")
    print(f"Batch size: {batch_size}")
    print(f"Accuracy: {accuracy:.2%} ({correct}/{total})")
    print(f"Total time: {total_generation_time:.2f}s | Avg latency/sample: {avg_latency:.3f}s | p95: {p95_latency:.3f}s | p99: {p99_latency:.3f}s")
    print(f"GPU VRAM peak: {vram_peak_gb:.2f} GB | CPU RAM used: {cpu_ram_gb:.2f} GB")
    print(f"Model size on disk: {model_size_gb:.2f} GB" if model_size_gb > 0 else "Model size unknown")

    return result

In [10]:
evaluate_model_mcq_task(model, tokenizer, ds_medqa)

Evaluating Batches:   0%|          | 0/94 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating Batches: 100%|██████████| 94/94 [13:17<00:00,  8.49s/it]


--- Evaluation Complete ---
Batch size: 16
Accuracy: 32.33% (485/1500)
Total time: 797.82s | Avg latency/sample: 0.532s | p95: 0.700s | p99: 0.862s
GPU VRAM peak: 4.55 GB | CPU RAM used: 2.32 GB
Model size on disk: 4.91 GB





{'accuracy': 0.3233333333333333,
 'total_time_sec': 797.8198621273041,
 'avg_latency_sec': np.float64(0.5318799080848694),
 'p95_latency_sec': np.float64(0.699855238199234),
 'p99_latency_sec': np.float64(0.8622446656227112),
 'vram_peak_gb': 4.551327705383301,
 'cpu_ram_gb': 2.323436737060547,
 'model_size_gb': 4.90564379375428,
 'total_samples': 1500,
 'batch_size': 16}

In [11]:
evaluate_model_mcq_task(model, tokenizer, ds_pubmedqa)

Evaluating Batches: 100%|██████████| 63/63 [11:32<00:00, 10.99s/it]


--- Evaluation Complete ---
Batch size: 16
Accuracy: 65.10% (651/1000)
Total time: 692.59s | Avg latency/sample: 0.693s | p95: 0.830s | p99: 0.847s
GPU VRAM peak: 4.52 GB | CPU RAM used: 2.33 GB
Model size on disk: 4.91 GB





{'accuracy': 0.651,
 'total_time_sec': 692.5878157615662,
 'avg_latency_sec': np.float64(0.6925878157615661),
 'p95_latency_sec': np.float64(0.8296231031417847),
 'p99_latency_sec': np.float64(0.8466640114784241),
 'vram_peak_gb': 4.517181396484375,
 'cpu_ram_gb': 2.3306808471679688,
 'model_size_gb': 4.90564379375428,
 'total_samples': 1000,
 'batch_size': 16}

### **2.2 EVALUATE WITH DIFFERENT DATASETS (QnA tasks)**

In [7]:
# Install dependencies
!pip install -q sentence-transformers psutil tqdm
!pip install sentence-transformers -q
import os, re, time, psutil, torch, numpy as np
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, util
import os
import time
import torch
import psutil
import numpy as np
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, util
import re

In [8]:
UNIFIED_DIR = "/content/drive/MyDrive/data_source/unified_format"
# Dataset emrqa
ds_emrqa = []
file_path = os.path.join(UNIFIED_DIR, "emrqa", "all.jsonl")
with open(file_path, "r", encoding="utf-8") as f:
    for line in f:
        if line.strip():
            ds_emrqa.append(json.loads(line))

# Dataset medquad
ds_medquad = []
file_path = os.path.join(UNIFIED_DIR, "medquad", "all.jsonl")
with open(file_path, "r", encoding="utf-8") as f:
    for line in f:
        if line.strip():
            ds_medquad.append(json.loads(line))

In [9]:
# Load BioBERT from Drive
BioBERT_BDRIVE_PATH = "/content/drive/MyDrive/BioBERT/bert_embeddings"
bert_model = SentenceTransformer(BioBERT_BDRIVE_PATH)
print("BioBERT model loaded from Drive.")

BioBERT model loaded from Drive.


In [10]:
# Helper function: keyword recall
def keyword_recall(pred, target):
    """Compute fraction of keywords in target that appear in pred."""
    clean = lambda s: re.findall(r'\w+', s.lower())
    pred_words = set(clean(pred))
    target_words = set(clean(target))
    matched = pred_words & target_words
    recall = len(matched) / len(target_words) if target_words else 0
    return recall

In [11]:
def generate_batch_answers(model, tokenizer, questions, contexts=None, max_new_tokens=100):
    if contexts is None:
        contexts = [None] * len(questions)

    # Create batch prompt
    prompts = []
    for q, c in zip(questions, contexts):
        prompt = f"""
You are a medical reasoning assistant. Read the context and question carefully and answer concisely.
Context:
{c or "No additional context."}
Question:
{q}
Answer:
"""
        prompts.append(prompt)

    # Tokenize batch with padding
    inputs = tokenizer(
        prompts,
        return_tensors="pt",
        padding=True,
        truncation=True
    ).to(model.device)

    # Generate output for batch
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            temperature=0.0
        )

    # Decode for batch
    decoded_batch = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    # Extract answer for each output
    answer_parts = [decoded.split("Answer:")[-1].strip() for decoded in decoded_batch]
    return answer_parts

In [12]:
def evaluate_short_answer_task(llm_model, tokenizer, dataset, batch_size=48, alpha=0.5, beta=0.5, max_new_tokens=100):
    scores = []
    latencies = [] # sample latency
    total_generation_time = 0.0

    process = psutil.Process()
    if torch.cuda.is_available():
        torch.cuda.reset_peak_memory_stats()

    # Model size on disk
    model_size_gb = 0.0
    model_dir = getattr(llm_model, 'config', None).name_or_path
    if model_dir and os.path.isdir(model_dir):
        try:
            model_size_bytes = sum(os.path.getsize(os.path.join(root, f))
                                   for root, _, files in os.walk(model_dir)
                                   for f in files)
            model_size_gb = model_size_bytes / (1024 ** 3)
        except Exception as e:
            print(f"Warning: Could not calculate model size. Error: {e}")

    # processing batch
    for i in tqdm(range(0, len(dataset), batch_size), desc="Evaluating Batches"):
        batch_items = dataset[i:i + batch_size]
        batch_questions = [item["question"] for item in batch_items]
        batch_contexts = [item.get("context") for item in batch_items]
        batch_true_answers = [item["answer"] for item in batch_items]

        num_in_batch = len(batch_items)

        start_time = time.time()

        # Generate predictions for all batch
        pred_answers = generate_batch_answers(
            llm_model, tokenizer, batch_questions, batch_contexts, max_new_tokens=max_new_tokens
        )

        batch_time = time.time() - start_time
        total_generation_time += batch_time

        # sample avg latency
        per_sample_latency = batch_time / num_in_batch
        latencies.extend([per_sample_latency] * num_in_batch)

        # evaluate each question in batch
        for pred_answer, true_answer in zip(pred_answers, batch_true_answers):
            # Keyword recall
            kr = keyword_recall(pred_answer, true_answer)

            # BERT embedding similarity
            emb_pred = bert_model.encode(pred_answer, convert_to_tensor=True)
            emb_true = bert_model.encode(true_answer, convert_to_tensor=True)
            sim = util.cos_sim(emb_pred, emb_true).item()

            # Weighted score
            score = alpha * kr + beta * sim
            scores.append(score)

    # Aggregate metrics
    avg_score = np.mean(scores)
    avg_latency = np.mean(latencies)
    p95_latency = np.percentile(latencies, 95)
    p99_latency = np.percentile(latencies, 99)
    vram_peak_gb = torch.cuda.max_memory_allocated() / (1024 ** 3) if torch.cuda.is_available() else 0.0
    cpu_ram_gb = process.memory_info().rss / (1024 ** 3)

    result = {
        "avg_weighted_score": avg_score,
        "total_time_sec": total_generation_time,
        "avg_latency_sec": avg_latency,     # Avg latency per sample
        "p95_latency_sec": p95_latency,
        "p99_latency_sec": p99_latency,
        "vram_peak_gb": vram_peak_gb,
        "cpu_ram_gb": cpu_ram_gb,
        "model_size_gb": model_size_gb,
        "total_samples": len(dataset),
        "batch_size": batch_size
    }

    print(f"\n--- Evaluation Complete ---")
    print(f"Batch size: {batch_size}")
    print(f"Avg weighted score: {avg_score:.2%}")
    print(f"Total time: {total_generation_time:.2f}s | Avg latency/sample: {avg_latency:.2f}s | p95: {p95_latency:.2f}s | p99: {p99_latency:.2f}s")
    print(f"GPU VRAM peak: {vram_peak_gb:.2f} GB | CPU RAM used: {cpu_ram_gb:.2f} GB")
    print(f"Model size on disk: {model_size_gb:.2f} GB" if model_size_gb > 0 else "Model size unknown (path not a local dir)")

    return result

In [13]:
evaluate_short_answer_task(model, tokenizer, ds_emrqa, alpha=0.5, beta=0.5, max_new_tokens=100)

Evaluating Batches:   0%|          | 0/32 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating Batches: 100%|██████████| 32/32 [29:49<00:00, 55.92s/it]


--- Evaluation Complete ---
Batch size: 48
Avg weighted score: 53.29%
Total time: 1758.65s | Avg latency/sample: 1.17s | p95: 1.20s | p99: 1.20s
GPU VRAM peak: 13.24 GB | CPU RAM used: 2.47 GB
Model size on disk: 4.91 GB





{'avg_weighted_score': np.float64(0.5328610202259287),
 'total_time_sec': 1758.6539869308472,
 'avg_latency_sec': np.float64(1.1724359912872315),
 'p95_latency_sec': np.float64(1.1991039315859477),
 'p99_latency_sec': np.float64(1.20492522418499),
 'vram_peak_gb': 13.238492965698242,
 'cpu_ram_gb': 2.473613739013672,
 'model_size_gb': 4.90564379375428,
 'total_samples': 1500,
 'batch_size': 48}

In [15]:
evaluate_short_answer_task(model, tokenizer, ds_medquad, alpha=0.5, beta=0.5, max_new_tokens=100)

Evaluating Batches: 100%|██████████| 32/32 [08:30<00:00, 15.94s/it]


--- Evaluation Complete ---
Batch size: 48
Avg weighted score: 30.42%
Total time: 477.71s | Avg latency/sample: 0.32s | p95: 0.32s | p99: 0.32s
GPU VRAM peak: 8.87 GB | CPU RAM used: 2.48 GB
Model size on disk: 4.91 GB





{'avg_weighted_score': np.float64(0.30422572295200984),
 'total_time_sec': 477.7092914581299,
 'avg_latency_sec': np.float64(0.3184728609720866),
 'p95_latency_sec': np.float64(0.3190922687451045),
 'p99_latency_sec': np.float64(0.3227714051802953),
 'vram_peak_gb': 8.867308616638184,
 'cpu_ram_gb': 2.480823516845703,
 'model_size_gb': 4.90564379375428,
 'total_samples': 1500,
 'batch_size': 48}