## **0. LOAD LIBRARY**

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch, os

## **1. LOAD BASE MODEL - BF16**

In [3]:
# Path to save model
DRIVE_PATH = "/content/drive/MyDrive/Meditron3-Gemma2-2B"

# Load model & tokenizer from Drive
print("1. Loading model from Drive...")
tokenizer = AutoTokenizer.from_pretrained(DRIVE_PATH, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    DRIVE_PATH,
    torch_dtype=torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16,
    device_map="auto",
    trust_remote_code=True,
)
print("2. Model loaded successfully!")

1. Loading model from Drive...


`torch_dtype` is deprecated! Use `dtype` instead!
The following generation flags are not valid and may be ignored: ['cache_implementation']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

2. Model loaded successfully!


## **2. EVALUATE WITH DIFFERENT DATASETS**

In [5]:
import json
from tqdm import tqdm
import os
import torch
import psutil
import time
import numpy as np
from tqdm import tqdm

In [6]:
# print out sample data
UNIFIED_DIR = "/content/drive/MyDrive/data_source/unified_format"
DATASETS = ["medqa", "emrqa", "pubmedqa", "medquad"]
SAMPLE_COUNT = 1
for ds_name in DATASETS:
    print(f"\n{'='*20} SAMPLES FOR: {ds_name.upper()} {'='*20}")
    file_path = os.path.join(UNIFIED_DIR, ds_name, "all.jsonl")
    with open(file_path, "r", encoding="utf-8") as f:
        for i, line in enumerate(f):
            if i >= SAMPLE_COUNT: break
            print(json.dumps(json.loads(line), indent=2, ensure_ascii=False))


{
  "id": "A_23-year-old_pregnant_woman_at_22_weeks",
  "dataset": "medqa",
  "split": "all",
  "context": null,
  "question": "A 23-year-old pregnant woman at 22 weeks gestation presents with burning upon urination. She states it started 1 day ago and has been worsening despite drinking more water and taking cranberry extract. She otherwise feels well and is followed by a doctor for her pregnancy. Her temperature is 97.7°F (36.5°C), blood pressure is 122/77 mmHg, pulse is 80/min, respirations are 19/min, and oxygen saturation is 98% on room air. Physical exam is notable for an absence of costovertebral angle tenderness and a gravid uterus. Which of the following is the best treatment for this patient?",
  "text": [
    "Ampicillin",
    "Ceftriaxone",
    "Ciprofloxacin",
    "Doxycycline",
    "Nitrofurantoin"
  ],
  "encode": [
    "A",
    "B",
    "C",
    "D",
    "E"
  ],
  "answer": "E"
}

{
  "id": "The_patient_was_admiWhat_is_her_current_",
  "dataset": "emrqa",
  "split": "

### **2.1 EVALUATE WITH DIFFERENT DATASETS (MCQ tasks)**

In [None]:
UNIFIED_DIR = "/content/drive/MyDrive/data_source/unified_format"

# Dataset medqua
ds_medqa = []
file_path = os.path.join(UNIFIED_DIR, "medqa", "all.jsonl")
with open(file_path, "r", encoding="utf-8") as f:
    for line in f:
        if line.strip():
            ds_medqa.append(json.loads(line))

ds_pubmedqa = []
file_path = os.path.join(UNIFIED_DIR, "pubmedqa", "all.jsonl")
with open(file_path, "r", encoding="utf-8") as f:
    for line in f:
        if line.strip():
            ds_pubmedqa.append(json.loads(line))

In [None]:
# Function to receive MCQ prompt input and generate answer
def generate_answer(model, tokenizer, question, options, context=None, encode=None, max_new_tokens=30):
    options_str = "\n".join([f"{e}. {t}" for e, t in zip(encode, options)])
    prompt = f"""
You are a medical reasoning assistant. Read the context and question carefully, then choose the best answer.
Context:
{context or "No additional context."}
Question:
{question}
Options:
{options_str}
Please respond with only the letter corresponding to the best answer.
Answer:
"""
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            temperature=0.0
        )
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
    answer_part = decoded.split("Answer:")[-1].strip()
    return answer_part

In [None]:
# Function to measure accuracy with other performance metrics
def evaluate_model_mcq_task(model, tokenizer, dataset, max_new_tokens=30):
    correct = 0
    total = len(dataset)
    times = []
    process = psutil.Process()
    torch.cuda.reset_peak_memory_stats()

    # Get model size on disk
    model_dir = getattr(model, 'config', None).name_or_path
    if model_dir is None:
        model_dir = "./model_temp_dir"
    model_size_bytes = sum(os.path.getsize(os.path.join(root, f))
                            for root, _, files in os.walk(model_dir)
                            for f in files)
    model_size_gb = model_size_bytes / (1024 ** 3)

    # Extract information from prompt and feed into function generate answer
    for item in tqdm(dataset, desc="Evaluating"):
        start_time = time.time()
        context = item.get("context")
        question = item["question"]
        options = item["text"] if "text" in item else item["options"]
        encode = item.get("encode")
        true_answer = item["answer"].strip().upper()
        pred = generate_answer(model, tokenizer, question, options, context=context, encode=encode, max_new_tokens=max_new_tokens)
        times.append(time.time() - start_time)

        # Get model predictions compare with answers
        pred_clean = pred.strip().upper()
        if pred_clean.startswith("ANSWER:"):
            pred_clean = pred_clean.replace("ANSWER:", "").strip()
        if "." in pred_clean:
            pred_clean = pred_clean.split(".")[0].strip()
        if pred_clean == true_answer:
            correct += 1

    # Get model performance metrics
    accuracy = correct / total if total > 0 else 0
    avg_latency = np.mean(times) if times else 0
    p95_latency = np.percentile(times, 95) if times else 0
    p99_latency = np.percentile(times, 99) if times else 0
    total_time = sum(times)
    vram_peak_gb = torch.cuda.max_memory_allocated() / (1024 ** 3) if torch.cuda.is_available() else 0.0
    cpu_ram_gb = process.memory_info().rss / (1024 ** 3)

    result = {
        "accuracy": accuracy,
        "total_time_sec": total_time,
        "avg_latency_sec": avg_latency,
        "p95_latency_sec": p95_latency,
        "p99_latency_sec": p99_latency,
        "vram_peak_gb": vram_peak_gb,
        "cpu_ram_gb": cpu_ram_gb,
        "model_size_gb": model_size_gb,
        "total_samples": total
    }

    print(f"Accuracy: {accuracy:.2%} ({correct}/{total})")
    print(f"Total time: {total_time:.2f}s | Avg latency: {avg_latency:.2f}s | p95: {p95_latency:.2f}s | p99: {p99_latency:.2f}s")
    print(f"GPU VRAM peak: {vram_peak_gb:.2f} GB | CPU RAM used: {cpu_ram_gb:.2f} GB")
    print(f"Model size on disk: {model_size_gb:.2f} GB" if model_size_gb else "Model size unknown")

    return result


In [None]:
evaluate_model_mcq_task(model, tokenizer, ds_medqa)

Evaluating: 100%|██████████| 1500/1500 [02:30<00:00,  9.97it/s]

Accuracy: 37.27% (559/1500)
Total time: 149.91s | Avg latency: 0.10s | p95: 0.09s | p99: 0.12s
GPU VRAM peak: 5.01 GB | CPU RAM used: 2.35 GB
Model size on disk: 4.91 GB





{'accuracy': 0.37266666666666665,
 'total_time_sec': 149.9098961353302,
 'avg_latency_sec': np.float64(0.0999399307568868),
 'p95_latency_sec': np.float64(0.0947158694267273),
 'p99_latency_sec': np.float64(0.11509149789810179),
 'vram_peak_gb': 5.010180950164795,
 'cpu_ram_gb': 2.3490371704101562,
 'model_size_gb': 4.90564379375428,
 'total_samples': 1500}

In [None]:
evaluate_model_mcq_task(model, tokenizer, ds_pubmedqa)

Evaluating: 100%|██████████| 1000/1000 [01:34<00:00, 10.54it/s]

Accuracy: 74.30% (743/1000)
Total time: 94.47s | Avg latency: 0.09s | p95: 0.11s | p99: 0.12s
GPU VRAM peak: 5.01 GB | CPU RAM used: 2.35 GB
Model size on disk: 4.91 GB





{'accuracy': 0.743,
 'total_time_sec': 94.47320938110352,
 'avg_latency_sec': np.float64(0.09447320938110351),
 'p95_latency_sec': np.float64(0.11222063302993775),
 'p99_latency_sec': np.float64(0.11685961961746215),
 'vram_peak_gb': 5.006504058837891,
 'cpu_ram_gb': 2.3490562438964844,
 'model_size_gb': 4.90564379375428,
 'total_samples': 1000}

### **2.2 EVALUATE WITH DIFFERENT DATASETS (QnA tasks)**

In [8]:
# Install dependencies
!pip install -q sentence-transformers psutil tqdm
!pip install sentence-transformers -q
import os, re, time, psutil, torch, numpy as np
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, util
import os
import time
import torch
import psutil
import numpy as np
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, util
import re

In [9]:
UNIFIED_DIR = "/content/drive/MyDrive/data_source/unified_format"
# Dataset emrqa
ds_emrqa = []
file_path = os.path.join(UNIFIED_DIR, "emrqa", "all.jsonl")
with open(file_path, "r", encoding="utf-8") as f:
    for line in f:
        if line.strip():
            ds_emrqa.append(json.loads(line))

# Dataset medquad
ds_medquad = []
file_path = os.path.join(UNIFIED_DIR, "medquad", "all.jsonl")
with open(file_path, "r", encoding="utf-8") as f:
    for line in f:
        if line.strip():
            ds_medquad.append(json.loads(line))

In [10]:
# Load BioBERT from Drive
BioBERT_BDRIVE_PATH = "/content/drive/MyDrive/BioBERT/bert_embeddings"
bert_model = SentenceTransformer(BioBERT_BDRIVE_PATH)
print("BioBERT model loaded from Drive.")

BioBERT model loaded from Drive.


In [11]:
# Helper function: keyword recall
def keyword_recall(pred, target):
    """Compute fraction of keywords in target that appear in pred."""
    clean = lambda s: re.findall(r'\w+', s.lower())
    pred_words = set(clean(pred))
    target_words = set(clean(target))
    matched = pred_words & target_words
    recall = len(matched) / len(target_words) if target_words else 0
    return recall

In [12]:
def generate_batch_answers(model, tokenizer, questions, contexts=None, max_new_tokens=100):
    """
    Generate answers for a batch of questions and contexts.
    """
    if contexts is None:
        contexts = [None] * len(questions)

    # Create batch prompt
    prompts = []
    for q, c in zip(questions, contexts):
        prompt = f"""
You are a medical reasoning assistant. Read the context and question carefully and answer concisely.
Context:
{c or "No additional context."}
Question:
{q}
Answer:
"""
        prompts.append(prompt)

    # Tokenize batch with padding
    inputs = tokenizer(
        prompts,
        return_tensors="pt",
        padding=True,
        truncation=True
    ).to(model.device)

    # Generate output for batch
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            temperature=0.0
        )

    # Decode for batch
    decoded_batch = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    # Extract answer for each output
    answer_parts = [decoded.split("Answer:")[-1].strip() for decoded in decoded_batch]
    return answer_parts

In [13]:
def evaluate_short_answer_task(llm_model, tokenizer, dataset, batch_size=32, alpha=0.5, beta=0.5, max_new_tokens=100):
    scores = []
    latencies = [] # sample latency
    total_generation_time = 0.0

    process = psutil.Process()
    if torch.cuda.is_available():
        torch.cuda.reset_peak_memory_stats()

    # Model size on disk
    model_size_gb = 0.0
    model_dir = getattr(llm_model, 'config', None).name_or_path
    if model_dir and os.path.isdir(model_dir):
        try:
            model_size_bytes = sum(os.path.getsize(os.path.join(root, f))
                                   for root, _, files in os.walk(model_dir)
                                   for f in files)
            model_size_gb = model_size_bytes / (1024 ** 3)
        except Exception as e:
            print(f"Warning: Could not calculate model size. Error: {e}")

    # processing batch
    for i in tqdm(range(0, len(dataset), batch_size), desc="Evaluating Batches"):
        batch_items = dataset[i:i + batch_size]
        batch_questions = [item["question"] for item in batch_items]
        batch_contexts = [item.get("context") for item in batch_items]
        batch_true_answers = [item["answer"] for item in batch_items]

        num_in_batch = len(batch_items)

        start_time = time.time()

        # Generate predictions for all batch
        pred_answers = generate_batch_answers(
            llm_model, tokenizer, batch_questions, batch_contexts, max_new_tokens=max_new_tokens
        )

        batch_time = time.time() - start_time
        total_generation_time += batch_time

        # sample avg latency
        per_sample_latency = batch_time / num_in_batch
        latencies.extend([per_sample_latency] * num_in_batch)

        # evaluate each question in batch
        for pred_answer, true_answer in zip(pred_answers, batch_true_answers):
            # Keyword recall
            kr = keyword_recall(pred_answer, true_answer)

            # BERT embedding similarity
            emb_pred = bert_model.encode(pred_answer, convert_to_tensor=True)
            emb_true = bert_model.encode(true_answer, convert_to_tensor=True)
            sim = util.cos_sim(emb_pred, emb_true).item()

            # Weighted score
            score = alpha * kr + beta * sim
            scores.append(score)

    # Aggregate metrics
    avg_score = np.mean(scores)
    avg_latency = np.mean(latencies)
    p95_latency = np.percentile(latencies, 95)
    p99_latency = np.percentile(latencies, 99)
    vram_peak_gb = torch.cuda.max_memory_allocated() / (1024 ** 3) if torch.cuda.is_available() else 0.0
    cpu_ram_gb = process.memory_info().rss / (1024 ** 3)

    result = {
        "avg_weighted_score": avg_score,
        "total_time_sec": total_generation_time,
        "avg_latency_sec": avg_latency,     # Avg latency per sample
        "p95_latency_sec": p95_latency,
        "p99_latency_sec": p99_latency,
        "vram_peak_gb": vram_peak_gb,
        "cpu_ram_gb": cpu_ram_gb,
        "model_size_gb": model_size_gb,
        "total_samples": len(dataset),
        "batch_size": batch_size
    }

    print(f"\n--- Evaluation Complete ---")
    print(f"Batch size: {batch_size}")
    print(f"Avg weighted score: {avg_score:.2%}")
    print(f"Total time: {total_generation_time:.2f}s | Avg latency/sample: {avg_latency:.2f}s | p95: {p95_latency:.2f}s | p99: {p99_latency:.2f}s")
    print(f"GPU VRAM peak: {vram_peak_gb:.2f} GB | CPU RAM used: {cpu_ram_gb:.2f} GB")
    print(f"Model size on disk: {model_size_gb:.2f} GB" if model_size_gb > 0 else "Model size unknown (path not a local dir)")

    return result

In [None]:
evaluate_short_answer_task(model, tokenizer, ds_emrqa, alpha=0.5, beta=0.5, max_new_tokens=100)

Evaluating Batches: 100%|██████████| 47/47 [16:10<00:00, 20.65s/it]


--- Evaluation Complete ---
Batch size: 32
Avg weighted score: 41.03%
Total time: 939.97s | Avg latency/sample: 0.63s | p95: 0.64s | p99: 0.66s
GPU VRAM peak: 12.36 GB | CPU RAM used: 2.56 GB
Model size on disk: 4.91 GB





{'avg_weighted_score': np.float64(0.41030950599151267),
 'total_time_sec': 939.9735083580017,
 'avg_latency_sec': np.float64(0.6266490055720011),
 'p95_latency_sec': np.float64(0.6448302492499352),
 'p99_latency_sec': np.float64(0.6552763836724418),
 'vram_peak_gb': 12.362613677978516,
 'cpu_ram_gb': 2.558429718017578,
 'model_size_gb': 4.90564379375428,
 'total_samples': 1500,
 'batch_size': 32}

In [14]:
evaluate_short_answer_task(model, tokenizer, ds_medquad, alpha=0.5, beta=0.5, max_new_tokens=100)

Evaluating Batches:   0%|          | 0/47 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating Batches: 100%|██████████| 47/47 [18:59<00:00, 24.25s/it]


--- Evaluation Complete ---
Batch size: 32
Avg weighted score: 32.57%
Total time: 1094.44s | Avg latency/sample: 0.73s | p95: 0.77s | p99: 0.81s
GPU VRAM peak: 5.98 GB | CPU RAM used: 2.12 GB
Model size on disk: 4.91 GB





{'avg_weighted_score': np.float64(0.3256809326343647),
 'total_time_sec': 1094.4351711273193,
 'avg_latency_sec': np.float64(0.729623447418213),
 'p95_latency_sec': np.float64(0.7698532342910767),
 'p99_latency_sec': np.float64(0.8085334130695888),
 'vram_peak_gb': 5.982316017150879,
 'cpu_ram_gb': 2.1169509887695312,
 'model_size_gb': 4.90564379375428,
 'total_samples': 1500,
 'batch_size': 32}