Base Model Inference and Final Evaluation

##Installing Dependencies

In [5]:
%%capture installation_log
!pip install vllm datasets -q

In [6]:
# Core Python libraries
from datasets import load_dataset
from pprint import pprint
import json
import pandas as pd

In [7]:
from google.colab import userdata
import os
os.environ["HF_TOKEN"] = userdata.get('HF_TOKEN')
hf_profile = 'aymangomaa'

In [None]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')
os.chdir('/content/drive/MyDrive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Load Dataset

In [9]:
from datasets import load_dataset

dataset = load_dataset(f"{hf_profile}/entity_extraction_ade_v2_chat_base")
dataset

README.md:   0%|          | 0.00/718 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/771k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/92.0k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/99.3k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3458 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/385 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/428 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'relations', 'messages'],
        num_rows: 3458
    })
    validation: Dataset({
        features: ['text', 'relations', 'messages'],
        num_rows: 385
    })
    test: Dataset({
        features: ['text', 'relations', 'messages'],
        num_rows: 428
    })
})

In [10]:
test_data = dataset["test"]

In [11]:
pprint(test_data[9])

{'messages': [{'content': 'Extract all adverse drug effect (ADE) relationships '
                          'from the sentence. ### TEXT: METHODS: We report a '
                          'patient who had an anaphylactic reaction during the '
                          'intravenous infusion of cyclosporine.',
               'role': 'user'},
              {'content': '[{"ade": "anaphylactic reaction", "drug": '
                          '"cyclosporine"}]',
               'role': 'assistant'}],
 'relations': [{'ade': 'anaphylactic reaction', 'drug': 'cyclosporine'}],
 'text': 'METHODS: We report a patient who had an anaphylactic reaction during '
         'the intravenous infusion of cyclosporine.'}


In [12]:
# Extract Prompts from Dataset
prompts = [x['messages'][0]['content'] for x in test_data]  # extract prompts only
pprint(prompts[4])

('Extract all adverse drug effect (ADE) relationships from the sentence. ### '
 'TEXT: To the best of our knowledge, this is the first case of '
 'lithium-associated CDI and NDI presenting concurrently.')


## Inference with Qwen3-1.7B-Base + QLoRA

In [13]:
from vllm import LLM, SamplingParams
from vllm.lora.request import LoRARequest
from huggingface_hub import snapshot_download
import json

INFO 05-10 18:30:45 [__init__.py:239] Automatically detected platform cuda.


In [15]:
# Load Model
import torch
base_model = "Qwen/Qwen3-1.7B-Base"
lora_repo = "aymangomaa/drug-ade-extraction-finetuned-base-4"
adapter_path = snapshot_download(repo_id=lora_repo)
llm = LLM(model=base_model, enable_lora=True, max_lora_rank=128,dtype=torch.float16)
sampling_params = SamplingParams(temperature=0.0, max_tokens=512)
lora_request = LoRARequest("qwen3_base_adapter", 1, adapter_path)

Fetching 10 files:   0%|          | 0/10 [00:00<?, ?it/s]

INFO 05-10 18:32:50 [config.py:717] This model supports multiple tasks: {'generate', 'classify', 'reward', 'embed', 'score'}. Defaulting to 'generate'.
INFO 05-10 18:32:50 [llm_engine.py:240] Initializing a V0 LLM engine (v0.8.5.post1) with config: model='Qwen/Qwen3-1.7B-Base', speculative_config=None, tokenizer='Qwen/Qwen3-1.7B-Base', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=32768, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='auto', reasoning_backend=None), observability_config=ObservabilityConfig(show_hidden_metrics=False, otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=None, serv

model.safetensors:   0%|          | 0.00/3.44G [00:00<?, ?B/s]

INFO 05-10 18:33:12 [weight_utils.py:281] Time spent downloading weights for Qwen/Qwen3-1.7B-Base: 19.966130 seconds
INFO 05-10 18:33:12 [weight_utils.py:315] No model.safetensors.index.json found in remote.


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 05-10 18:33:14 [loader.py:458] Loading weights took 1.67 seconds
INFO 05-10 18:33:14 [punica_selector.py:18] Using PunicaWrapperGPU.
INFO 05-10 18:33:15 [model_runner.py:1140] Model loading took 3.4901 GiB and 22.734264 seconds
INFO 05-10 18:33:30 [worker.py:287] Memory profiling takes 14.91 seconds
INFO 05-10 18:33:30 [worker.py:287] the current vLLM instance can use total_gpu_memory (14.74GiB) x gpu_memory_utilization (0.90) = 13.27GiB
INFO 05-10 18:33:30 [worker.py:287] model weights take 3.49GiB; non_torch_memory takes 0.05GiB; PyTorch activation peak memory takes 1.65GiB; the rest of the memory reserved for KV Cache is 8.08GiB.
INFO 05-10 18:33:30 [executor_base.py:112] # cuda blocks: 4728, # CPU blocks: 2340
INFO 05-10 18:33:30 [executor_base.py:117] Maximum concurrency for 32768 tokens per request: 2.31x
INFO 05-10 18:33:34 [model_runner.py:1450] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager

Capturing CUDA graph shapes:   0%|          | 0/35 [00:00<?, ?it/s]

INFO 05-10 18:34:19 [model_runner.py:1592] Graph capturing finished in 46 secs, took 0.40 GiB
INFO 05-10 18:34:19 [llm_engine.py:437] init engine (profile, create kv cache, warmup model) took 64.67 seconds


In [16]:
# Generate Predictions
outputs_base = llm.generate(prompts, sampling_params, lora_request=lora_request)
# Save Raw Predictions
with open("outputs_qwen3_base.json", "w") as f:
    json.dump([o.outputs[0].text for o in outputs_base], f, indent=2)

Processed prompts:   0%|          | 0/428 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]



In [17]:
# Clean Generated Outputs and Normalization
# Regex cleanup of raw predictions to extract structured relationships.

import json
import re

with open("outputs_qwen3_base.json", "r") as f:
    raw_outputs = json.load(f)

cleaned_outputs = []

# Regex pattern to match a list of dictionaries with "ade" and "drug"
json_pattern = re.compile(r'\[.*?"ade"\s*:\s*".+?".*?"drug"\s*:\s*".+?".*?\]', re.DOTALL)

for output in raw_outputs:
    match = json_pattern.search(output)
    if match:
        try:
            cleaned = json.loads(match.group())
        except json.JSONDecodeError:
            cleaned = []
    else:
        cleaned = []
    cleaned_outputs.append(cleaned)

# Save cleaned output
with open("outputs_qwen3_base_cleaned.json", "w") as f:
    json.dump(cleaned_outputs, f, indent=2)

# Load cleaned predictions and reference test set
with open("outputs_qwen3_base_cleaned.json") as f:
    preds = json.load(f)

# Normalization function
import re
def normalize(text):
    return re.sub(r"[^a-z0-9]", "", text.lower().strip())

def extract_pairs(rel_list):
    try:
        return set((normalize(d["drug"]), normalize(d["ade"])) for d in rel_list)
    except:
        return set()

In [19]:
# Load predictions and ground truth
with open("outputs_qwen3_base_cleaned.json") as f:
    preds = json.load(f)


# Save comparison results
results = []

for idx in range(len(test_data)):
    true_set = set((rel["drug"].lower(), rel["ade"].lower()) for rel in test_data[idx]["relations"])
    pred_set = set((rel["drug"].lower(), rel["ade"].lower()) for rel in preds[idx])
    correct_set = true_set & pred_set

    results.append({
        "idx": idx,
        "text": test_data[idx]["text"],
        "ground_truth": list(true_set),
        "prediction": list(pred_set),
        "correct": list(correct_set)
    })

# Save to file
with open("qwen3_base_all_evaluated.json", "w") as f:
    json.dump(results, f, indent=2)



# Load from saved output
with open("qwen3_base_all_evaluated.json", "r") as f:
    results = json.load(f)

# Convert to DataFrame and show relevant columns
df = pd.DataFrame(results)
df[["idx", "text", "ground_truth", "prediction", "correct"]].head(10)


Unnamed: 0,idx,text,ground_truth,prediction,correct
0,0,We present a case report of a patient with typ...,"[[chloramphenicol sodium succinate, hypersensi...","[[chloramphenicol sodium succinate, hypersensi...","[[chloramphenicol sodium succinate, hypersensi..."
1,1,The ototoxicity of quinine can accurately be s...,"[[quinine, ototoxicity]]","[[quinine, ototoxicity]]","[[quinine, ototoxicity]]"
2,2,Patient 1 presented bilateral ballism 1 week a...,"[[heroin, bilateral ballism]]","[[heroin, bilateral ballism]]","[[heroin, bilateral ballism]]"
3,3,A 58-year-old woman developed unilateral acute...,"[[scopolamine, unilateral acute angle-closure ...","[[transderm-v, acute angle-closure glaucoma]]",[]
4,4,"To the best of our knowledge, this is the firs...","[[lithium, ndi], [lithium, cdi]]",[],[]
5,5,CONCLUSION: A 26-year-old man with bipolar dis...,"[[carbamazepine, hyperammonemia]]","[[carbamazepine, hyperammonemia]]","[[carbamazepine, hyperammonemia]]"
6,6,RESULTS: Quetiapine was associated with leucop...,"[[quetiapine, leucopenia], [quetiapine, agranu...","[[quetiapine, leucopenia], [quetiapine, agranu...","[[quetiapine, leucopenia], [quetiapine, agranu..."
7,7,Hepatopathy subsided after the cessation of ca...,"[[lynestrenol, hepatopathy], [carbamazepine, h...","[[lynestrenol, hepatopathy], [carbamazepine, h...","[[lynestrenol, hepatopathy], [carbamazepine, h..."
8,8,Carbamazepine induced right bundle branch bloc...,"[[carbamazepine, right bundle branch block]]",[],[]
9,9,METHODS: We report a patient who had an anaphy...,"[[cyclosporine, anaphylactic reaction]]","[[cyclosporine, anaphylactic reaction]]","[[cyclosporine, anaphylactic reaction]]"


In [22]:
# Compute TP / FP / FN and F1-Score
tp, fp, fn = 0, 0, 0
for i in range(len(test_data)):
    true_pairs = extract_pairs(test_data[i]["relations"])
    pred_pairs = extract_pairs(preds[i])

    tp += len(true_pairs & pred_pairs)
    fp += len(pred_pairs - true_pairs)
    fn += len(true_pairs - pred_pairs)

precision = tp / (tp + fp + 1e-8)
recall = tp / (tp + fn + 1e-8)
f1 = 2 * precision * recall / (precision + recall + 1e-8)

# ✅ Step 3: Save results to file
import json

score_dict = {
    "TP": tp,
    "FP": fp,
    "FN": fn,
    "Precision": round(precision, 4),
    "Recall": round(recall, 4),
    "F1": round(f1, 4)
}

with open("eval_score_base.json", "w") as f:
    json.dump(score_dict, f, indent=2)

# ✅ Optional: Print summary
print("✅ Evaluation Results:")
for k, v in score_dict.items():
    print(f"{k}: {v}")

✅ Evaluation Results:
TP: 375
FP: 190
FN: 293
Precision: 0.6637
Recall: 0.5614
F1: 0.6083
