## **0. LOAD LIBRARY**

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch, os

## **1. LOAD BASE MODEL - INT8**

In [3]:
!pip install -U bitsandbytes



In [4]:
# Path to save model
DRIVE_PATH = "/content/drive/MyDrive/Qwen3-8B"

print("1. Loading Qwen model in 8-bit precision...")

from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# --- Load tokenizer ---
tokenizer = AutoTokenizer.from_pretrained(
    DRIVE_PATH,
    trust_remote_code=True
)

# Left padding for decoder-only models like Qwen
tokenizer.padding_side = "left"
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# --- Load model in 8-bit ---
# Requires bitsandbytes installed: pip install bitsandbytes accelerate
model = AutoModelForCausalLM.from_pretrained(
    DRIVE_PATH,
    device_map="auto",
    load_in_8bit=True,          # <--- main change
    trust_remote_code=True
)

# Sync pad token
model.config.pad_token_id = tokenizer.pad_token_id

print("2. Model loaded successfully in 8-bit mode!")


1. Loading Qwen model in 8-bit precision...


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

2. Model loaded successfully in 8-bit mode!


## **2. EVALUATE WITH DIFFERENT DATASETS**

In [6]:
import json
from tqdm import tqdm
import os
import torch
import psutil
import time
import numpy as np
from tqdm import tqdm

In [7]:
# print out sample data
UNIFIED_DIR = "/content/drive/MyDrive/data_source/unified_format"
DATASETS = ["medqa", "emrqa", "pubmedqa", "medquad"]
SAMPLE_COUNT = 1
for ds_name in DATASETS:
    print(f"\n{'='*20} SAMPLES FOR: {ds_name.upper()} {'='*20}")
    file_path = os.path.join(UNIFIED_DIR, ds_name, "all.jsonl")
    with open(file_path, "r", encoding="utf-8") as f:
        for i, line in enumerate(f):
            if i >= SAMPLE_COUNT: break
            print(json.dumps(json.loads(line), indent=2, ensure_ascii=False))


{
  "id": "A_23-year-old_pregnant_woman_at_22_weeks",
  "dataset": "medqa",
  "split": "all",
  "context": null,
  "question": "A 23-year-old pregnant woman at 22 weeks gestation presents with burning upon urination. She states it started 1 day ago and has been worsening despite drinking more water and taking cranberry extract. She otherwise feels well and is followed by a doctor for her pregnancy. Her temperature is 97.7°F (36.5°C), blood pressure is 122/77 mmHg, pulse is 80/min, respirations are 19/min, and oxygen saturation is 98% on room air. Physical exam is notable for an absence of costovertebral angle tenderness and a gravid uterus. Which of the following is the best treatment for this patient?",
  "text": [
    "Ampicillin",
    "Ceftriaxone",
    "Ciprofloxacin",
    "Doxycycline",
    "Nitrofurantoin"
  ],
  "encode": [
    "A",
    "B",
    "C",
    "D",
    "E"
  ],
  "answer": "E"
}

{
  "id": "The_patient_was_admiWhat_is_her_current_",
  "dataset": "emrqa",
  "split": "

### **2.1 EVALUATE WITH DIFFERENT DATASETS (MCQ tasks)**

In [8]:
UNIFIED_DIR = "/content/drive/MyDrive/data_source/unified_format"

# Dataset medqua
ds_medqa = []
file_path = os.path.join(UNIFIED_DIR, "medqa", "all.jsonl")
with open(file_path, "r", encoding="utf-8") as f:
    for line in f:
        if line.strip():
            ds_medqa.append(json.loads(line))

ds_pubmedqa = []
file_path = os.path.join(UNIFIED_DIR, "pubmedqa", "all.jsonl")
with open(file_path, "r", encoding="utf-8") as f:
    for line in f:
        if line.strip():
            ds_pubmedqa.append(json.loads(line))

In [9]:
import re

def _build_encode(opts, enc):
    """
    Trả về (encodes, valid_labels) đã chuẩn hóa về string UPPER.
    Nếu enc None -> tạo A, B, C, ...
    """
    if enc is None:
        enc = [chr(65 + i) for i in range(len(opts))]  # A, B, C...
    enc = [str(e).strip().upper() for e in enc]
    return enc, set(enc)

def _normalize_true_answer(item, enc, opts_upper):
    """
    Chuẩn hóa ground truth về MÃ trong enc (UPPER).
    - Nếu answer là số / index -> map sang enc[index].
    - Nếu answer là mã (A/B/C/D, 1/2/3/4, ...) -> chuẩn hóa và dùng trực tiếp nếu có trong enc.
    - Nếu answer là VĂN BẢN -> tìm khớp trong options (so sánh UPPER, strip).
    Trả về: mã chuẩn hóa hoặc None nếu không suy ra được.
    """
    ans = item["answer"]
    # case: None / rỗng
    if ans is None:
        return None

    # Nếu là số / digit -> coi là index (0-based hoặc 1-based?)
    # Ưu tiên 0-based nếu nằm trong phạm vi, nếu không thử 1-based.
    try:
        idx = int(ans)
        if 0 <= idx < len(enc):
            return enc[idx]
        if 1 <= idx <= len(enc):
            return enc[idx - 1]
    except Exception:
        pass

    # Nếu là string mã
    if isinstance(ans, str):
        a = ans.strip().upper()
        # Nếu trùng trực tiếp với enc label
        if a in enc:
            return a
        # Một số người ghi "OPTION C", "ANSWER: B"
        m = re.search(r'([A-Z0-9]+)', a)
        if m:
            cand = m.group(1)
            if cand in enc:
                return cand

        # Nếu là văn bản đáp án -> tìm trong options
        # So sánh UPPER đã strip
        a_clean = re.sub(r'\s+', ' ', a).strip()
        # Tạo map text->encode
        for i, opt_text in enumerate(opts_upper):
            if a_clean == opt_text:
                return enc[i]

    return None

def _parse_model_prediction(raw_text, valid_labels):
    """
    Lấy dự đoán từ text model (UPPER), ưu tiên khớp prefix với các nhãn hợp lệ.
    Ví dụ: 'Answer: C.' -> 'C'; 'The best answer is 2' -> '2'
    """
    t = raw_text.strip().upper()

    # Nếu chứa "ANSWER:" tách sau cùng
    parts = t.split("ANSWER:")
    if len(parts) > 1:
        t = parts[-1].strip()

    # Thử khớp trực tiếp một nhãn hợp lệ ở đầu chuỗi
    for lab in sorted(valid_labels, key=len, reverse=True):
        if t.startswith(lab):
            return lab

    # Tìm token chữ/số đầu tiên
    m = re.search(r'([A-Z0-9]+)', t)
    if m:
        tok = m.group(1)
        if tok in valid_labels:
            return tok

    # Không parse được
    return None

def generate_batch_answers_mcq(model, tokenizer, questions, options_list, contexts=None, encodes=None, max_new_tokens=8):
    """
    Generate answers for a batch of MCQ questions.
    Trả về list raw_text (để parser xử lý tiếp).
    """
    if contexts is None:
        contexts = [None] * len(questions)
    if encodes is None:
        encodes = [None] * len(questions)

    prompts = []
    per_q_valid_labels = []
    for q, opts, ctx, enc in zip(questions, options_list, contexts, encodes):
        # chuẩn hóa encode
        enc, valid = _build_encode(opts, enc)

        options_str = "\n".join([f"{e}. {t}" for e, t in zip(enc, opts)])
        allowed_str = ", ".join(enc)  # ví dụ: "A, B, C, D" hoặc "1, 2, 3, 4"

        prompt = (
            "You are a medical reasoning assistant. Read the context and question carefully, then choose the best answer.\n"
            f"Context:\n{ctx or 'No additional context.'}\n"
            f"Question:\n{q}\n"
            f"Options:\n{options_str}\n"
            f"Respond with ONLY one of [{allowed_str}] and nothing else.\n"
            "Answer:"
        )

        prompts.append(prompt)
        per_q_valid_labels.append(valid)

    inputs = tokenizer(
        prompts,
        return_tensors="pt",
        padding=True,
        truncation=True
    ).to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            temperature=0.0,
            pad_token_id=tokenizer.pad_token_id,
        )

    decoded_batch = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    # Chỉ trả về phần sau cùng (tránh dính prompt)
    # nhưng vẫn để parser robust
    # (nếu prompt còn trong chuỗi, parser vẫn tách được nhờ 'Answer:')
    return [x for x in decoded_batch]


In [10]:
# --- UPDATED EVALUATION FUNCTION (USES BATCHING) ---

def evaluate_model_mcq_task(model, tokenizer, dataset, batch_size=48, max_new_tokens=8):
    correct = 0
    total = len(dataset)
    latencies = []
    total_generation_time = 0.0
    process = psutil.Process()

    if torch.cuda.is_available():
        torch.cuda.reset_peak_memory_stats()

    model_size_gb = 0.0
    model_dir = getattr(model, 'config', None).name_or_path
    if model_dir and os.path.isdir(model_dir):
        try:
            model_size_bytes = sum(os.path.getsize(os.path.join(root, f))
                                   for root, _, files in os.walk(model_dir)
                                   for f in files)
            model_size_gb = model_size_bytes / (1024 ** 3)
        except Exception as e:
            print(f"Warning: Could not calculate model size. Error: {e}")

    for i in tqdm(range(0, total, batch_size), desc="Evaluating Batches"):
        batch_items = dataset[i:i + batch_size]
        num_in_batch = len(batch_items)

        batch_contexts = [item.get("context") for item in batch_items]
        batch_questions = [item["question"] for item in batch_items]
        # hỗ trợ cả "text" và "options"
        batch_options = [item["text"] if "text" in item else item["options"] for item in batch_items]
        batch_encodes = [item.get("encode") for item in batch_items]

        # Chuẩn hóa option text (UPPER) để map khi answer là văn bản
        batch_options_upper = [
            [re.sub(r'\s+', ' ', str(opt).strip()).upper() for opt in opts]
            for opts in batch_options
        ]

        # Chuẩn hóa ground-truth -> mã trong enc
        gt_labels = []
        valid_labels_list = []
        enc_list = []

        for item, optsU, enc in zip(batch_items, batch_options_upper, batch_encodes):
            # build enc + valid set
            _enc, _valid = _build_encode(item["text"] if "text" in item else item["options"], enc)
            enc_list.append(_enc)
            valid_labels_list.append(_valid)
            gt = _normalize_true_answer(item, _enc, optsU)
            gt_labels.append(gt)

        start_time = time.time()
        raw_preds = generate_batch_answers_mcq(
            model, tokenizer, batch_questions, batch_options,
            contexts=batch_contexts, encodes=batch_encodes,
            max_new_tokens=max_new_tokens
        )
        batch_time = time.time() - start_time
        total_generation_time += batch_time
        per_sample_latency = batch_time / num_in_batch
        latencies.extend([per_sample_latency] * num_in_batch)

        # Parse & compare
        for raw, gt, valid in zip(raw_preds, gt_labels, valid_labels_list):
            pred = _parse_model_prediction(raw, valid)
            if pred is not None and gt is not None and pred == gt:
                correct += 1

    accuracy = correct / total if total > 0 else 0.0
    avg_latency = float(np.mean(latencies)) if latencies else 0.0
    p95_latency = float(np.percentile(latencies, 95)) if latencies else 0.0
    p99_latency = float(np.percentile(latencies, 99)) if latencies else 0.0
    vram_peak_gb = torch.cuda.max_memory_allocated() / (1024 ** 3) if torch.cuda.is_available() else 0.0
    cpu_ram_gb = process.memory_info().rss / (1024 ** 3)

    print(f"\n--- Evaluation Complete ---")
    print(f"Batch size: {batch_size}")
    print(f"Accuracy: {accuracy:.2%} ({correct}/{total})")
    print(f"Total time: {total_generation_time:.2f}s | Avg latency/sample: {avg_latency:.3f}s | p95: {p95_latency:.3f}s | p99: {p99_latency:.3f}s")
    print(f"GPU VRAM peak: {vram_peak_gb:.2f} GB | CPU RAM used: {cpu_ram_gb:.2f} GB")
    print(f"Model size on disk: {model_size_gb:.2f} GB" if model_size_gb > 0 else "Model size unknown")

    return {
        "accuracy": accuracy,
        "total_time_sec": total_generation_time,
        "avg_latency_sec": avg_latency,
        "p95_latency_sec": p95_latency,
        "p99_latency_sec": p99_latency,
        "vram_peak_gb": vram_peak_gb,
        "cpu_ram_gb": cpu_ram_gb,
        "model_size_gb": model_size_gb,
        "total_samples": total,
        "batch_size": batch_size
    }


In [11]:
evaluate_model_mcq_task(model, tokenizer, ds_medqa)

Evaluating Batches:   0%|          | 0/32 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating Batches: 100%|██████████| 32/32 [02:26<00:00,  4.59s/it]


--- Evaluation Complete ---
Batch size: 48
Accuracy: 56.40% (846/1500)
Total time: 146.87s | Avg latency/sample: 0.098s | p95: 0.120s | p99: 0.129s
GPU VRAM peak: 18.43 GB | CPU RAM used: 2.52 GB
Model size on disk: 15.27 GB





{'accuracy': 0.564,
 'total_time_sec': 146.8746612071991,
 'avg_latency_sec': 0.0979164408047994,
 'p95_latency_sec': 0.1201900988817215,
 'p99_latency_sec': 0.12890859444936117,
 'vram_peak_gb': 18.426454544067383,
 'cpu_ram_gb': 2.5177955627441406,
 'model_size_gb': 15.271300191059709,
 'total_samples': 1500,
 'batch_size': 48}

In [12]:
evaluate_model_mcq_task(model, tokenizer, ds_pubmedqa)

Evaluating Batches: 100%|██████████| 21/21 [01:55<00:00,  5.49s/it]


--- Evaluation Complete ---
Batch size: 48
Accuracy: 71.10% (711/1000)
Total time: 115.29s | Avg latency/sample: 0.115s | p95: 0.126s | p99: 0.145s
GPU VRAM peak: 20.01 GB | CPU RAM used: 2.52 GB
Model size on disk: 15.27 GB





{'accuracy': 0.711,
 'total_time_sec': 115.29039430618286,
 'avg_latency_sec': 0.11529039430618286,
 'p95_latency_sec': 0.12592656910419464,
 'p99_latency_sec': 0.14451164503892264,
 'vram_peak_gb': 20.009077072143555,
 'cpu_ram_gb': 2.5223045349121094,
 'model_size_gb': 15.271300191059709,
 'total_samples': 1000,
 'batch_size': 48}

### **2.2 EVALUATE WITH DIFFERENT DATASETS (QnA tasks)**

In [13]:
# Install dependencies
!pip install -q sentence-transformers psutil tqdm
!pip install sentence-transformers -q
import os, re, time, psutil, torch, numpy as np
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, util
import os
import time
import torch
import psutil
import numpy as np
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, util
import re

In [14]:
UNIFIED_DIR = "/content/drive/MyDrive/data_source/unified_format"
# Dataset emrqa
ds_emrqa = []
file_path = os.path.join(UNIFIED_DIR, "emrqa", "all.jsonl")
with open(file_path, "r", encoding="utf-8") as f:
    for line in f:
        if line.strip():
            ds_emrqa.append(json.loads(line))

# Dataset medquad
ds_medquad = []
file_path = os.path.join(UNIFIED_DIR, "medquad", "all.jsonl")
with open(file_path, "r", encoding="utf-8") as f:
    for line in f:
        if line.strip():
            ds_medquad.append(json.loads(line))

In [15]:
# Load BioBERT from Drive
BioBERT_BDRIVE_PATH = "/content/drive/MyDrive/BioBERT/bert_embeddings"
bert_model = SentenceTransformer(BioBERT_BDRIVE_PATH)
print("BioBERT model loaded from Drive.")

BioBERT model loaded from Drive.


In [16]:
# Helper function: keyword recall
def keyword_recall(pred, target):
    """Compute fraction of keywords in target that appear in pred."""
    clean = lambda s: re.findall(r'\w+', s.lower())
    pred_words = set(clean(pred))
    target_words = set(clean(target))
    matched = pred_words & target_words
    recall = len(matched) / len(target_words) if target_words else 0
    return recall

In [17]:
def generate_batch_answers(model, tokenizer, questions, contexts=None, max_new_tokens=100):
    if contexts is None:
        contexts = [None] * len(questions)

    # Create batch prompt
    prompts = []
    for q, c in zip(questions, contexts):
        prompt = f"""
You are a medical reasoning assistant. Read the context and question carefully and answer concisely.
Context:
{c or "No additional context."}
Question:
{q}
Answer:
"""
        prompts.append(prompt)

    # Tokenize batch with padding
    inputs = tokenizer(
        prompts,
        return_tensors="pt",
        padding=True,
        truncation=True
    ).to(model.device)

    # Generate output for batch
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            temperature=0.0
        )

    # Decode for batch
    decoded_batch = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    # Extract answer for each output
    answer_parts = [decoded.split("Answer:")[-1].strip() for decoded in decoded_batch]
    return answer_parts

In [18]:
def evaluate_short_answer_task(llm_model, tokenizer, dataset, batch_size=64, alpha=0.5, beta=0.5, max_new_tokens=100):
    scores = []
    latencies = [] # sample latency
    total_generation_time = 0.0

    process = psutil.Process()
    if torch.cuda.is_available():
        torch.cuda.reset_peak_memory_stats()

    # Model size on disk
    model_size_gb = 0.0
    model_dir = getattr(llm_model, 'config', None).name_or_path
    if model_dir and os.path.isdir(model_dir):
        try:
            model_size_bytes = sum(os.path.getsize(os.path.join(root, f))
                                   for root, _, files in os.walk(model_dir)
                                   for f in files)
            model_size_gb = model_size_bytes / (1024 ** 3)
        except Exception as e:
            print(f"Warning: Could not calculate model size. Error: {e}")

    # processing batch
    for i in tqdm(range(0, len(dataset), batch_size), desc="Evaluating Batches"):
        batch_items = dataset[i:i + batch_size]
        batch_questions = [item["question"] for item in batch_items]
        batch_contexts = [item.get("context") for item in batch_items]
        batch_true_answers = [item["answer"] for item in batch_items]

        num_in_batch = len(batch_items)

        start_time = time.time()

        # Generate predictions for all batch
        pred_answers = generate_batch_answers(
            llm_model, tokenizer, batch_questions, batch_contexts, max_new_tokens=max_new_tokens
        )

        batch_time = time.time() - start_time
        total_generation_time += batch_time

        # sample avg latency
        per_sample_latency = batch_time / num_in_batch
        latencies.extend([per_sample_latency] * num_in_batch)

        # evaluate each question in batch
        for pred_answer, true_answer in zip(pred_answers, batch_true_answers):
            # Keyword recall
            kr = keyword_recall(pred_answer, true_answer)

            # BERT embedding similarity
            emb_pred = bert_model.encode(pred_answer, convert_to_tensor=True)
            emb_true = bert_model.encode(true_answer, convert_to_tensor=True)
            sim = util.cos_sim(emb_pred, emb_true).item()

            # Weighted score
            score = alpha * kr + beta * sim
            scores.append(score)

    # Aggregate metrics
    avg_score = np.mean(scores)
    avg_latency = np.mean(latencies)
    p95_latency = np.percentile(latencies, 95)
    p99_latency = np.percentile(latencies, 99)
    vram_peak_gb = torch.cuda.max_memory_allocated() / (1024 ** 3) if torch.cuda.is_available() else 0.0
    cpu_ram_gb = process.memory_info().rss / (1024 ** 3)

    result = {
        "avg_weighted_score": avg_score,
        "total_time_sec": total_generation_time,
        "avg_latency_sec": avg_latency,     # Avg latency per sample
        "p95_latency_sec": p95_latency,
        "p99_latency_sec": p99_latency,
        "vram_peak_gb": vram_peak_gb,
        "cpu_ram_gb": cpu_ram_gb,
        "model_size_gb": model_size_gb,
        "total_samples": len(dataset),
        "batch_size": batch_size
    }

    print(f"\n--- Evaluation Complete ---")
    print(f"Batch size: {batch_size}")
    print(f"Avg weighted score: {avg_score:.2%}")
    print(f"Total time: {total_generation_time:.2f}s | Avg latency/sample: {avg_latency:.2f}s | p95: {p95_latency:.2f}s | p99: {p99_latency:.2f}s")
    print(f"GPU VRAM peak: {vram_peak_gb:.2f} GB | CPU RAM used: {cpu_ram_gb:.2f} GB")
    print(f"Model size on disk: {model_size_gb:.2f} GB" if model_size_gb > 0 else "Model size unknown (path not a local dir)")

    return result

In [None]:
# import gc
# import torch

# # Xoá tất cả biến trừ gc và torch
# for obj in list(globals().keys()):
#     if obj not in ['gc', 'torch', '__builtins__', '__name__', '__doc__', '__package__']:
#         del globals()[obj]

# # Giải phóng bộ nhớ GPU
# gc.collect()
# torch.cuda.empty_cache()

# print("✅ Đã xoá biến và giải phóng VRAM thành công.")


✅ Đã xoá biến và giải phóng VRAM thành công.


In [19]:
evaluate_short_answer_task(model, tokenizer, ds_emrqa, alpha=0.5, beta=0.5, max_new_tokens=100)

Evaluating Batches: 100%|██████████| 24/24 [18:19<00:00, 45.82s/it]


--- Evaluation Complete ---
Batch size: 64
Avg weighted score: 70.01%
Total time: 1068.17s | Avg latency/sample: 0.71s | p95: 0.71s | p99: 1.17s
GPU VRAM peak: 29.02 GB | CPU RAM used: 2.68 GB
Model size on disk: 15.27 GB





{'avg_weighted_score': np.float64(0.700075728366123),
 'total_time_sec': 1068.1656415462494,
 'avg_latency_sec': np.float64(0.7121104276974995),
 'p95_latency_sec': np.float64(0.7147824093699455),
 'p99_latency_sec': np.float64(1.1719480071749007),
 'vram_peak_gb': 29.018919944763184,
 'cpu_ram_gb': 2.6760101318359375,
 'model_size_gb': 15.271300191059709,
 'total_samples': 1500,
 'batch_size': 64}

In [20]:
evaluate_short_answer_task(model, tokenizer, ds_medquad, alpha=0.5, beta=0.5, max_new_tokens=100)

Evaluating Batches: 100%|██████████| 24/24 [10:26<00:00, 26.10s/it]


--- Evaluation Complete ---
Batch size: 64
Avg weighted score: 53.53%
Total time: 593.45s | Avg latency/sample: 0.40s | p95: 0.40s | p99: 0.87s
GPU VRAM peak: 10.81 GB | CPU RAM used: 2.70 GB
Model size on disk: 15.27 GB





{'avg_weighted_score': np.float64(0.5353149656586055),
 'total_time_sec': 593.4484632015228,
 'avg_latency_sec': np.float64(0.39563230880101524),
 'p95_latency_sec': np.float64(0.3950095772743225),
 'p99_latency_sec': np.float64(0.8676964044570923),
 'vram_peak_gb': 10.809864044189453,
 'cpu_ram_gb': 2.7022171020507812,
 'model_size_gb': 15.271300191059709,
 'total_samples': 1500,
 'batch_size': 64}