In [1]:
!pip install transformers
!pip install accelerate
!pip install peft
!pip install bitsandbytes


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.wh

In [2]:
import torch
import transformers
import accelerate
import peft
import bitsandbytes

print("PyTorch version:", torch.__version__)
print("Transformers version:", transformers.__version__)
print("Accelerate version:", accelerate.__version__)
print("PEFT version:", peft.__version__)
print("BitsAndBytes version:", bitsandbytes.__version__)
print("CUDA available:", torch.cuda.is_available())
print("Setup successful!")

PyTorch version: 2.6.0+cu124
Transformers version: 4.51.3
Accelerate version: 1.6.0
PEFT version: 0.15.2
BitsAndBytes version: 0.45.5
CUDA available: True
Setup successful!


In [3]:
# Clone the MedQuAD repository
!git clone https://github.com/abachaa/MedQuAD.git

# List the contents of the cloned repository to confirm
!ls MedQuAD

Cloning into 'MedQuAD'...
remote: Enumerating objects: 11310, done.[K
remote: Counting objects: 100% (10/10), done.[K
remote: Compressing objects: 100% (6/6), done.[K
remote: Total 11310 (delta 7), reused 4 (delta 4), pack-reused 11300 (from 1)[K
Receiving objects: 100% (11310/11310), 11.01 MiB | 16.32 MiB/s, done.
Resolving deltas: 100% (6807/6807), done.
10_MPlus_ADAM_QA	     6_NINDS_QA
11_MPlusDrugs_QA	     7_SeniorHealth_QA
12_MPlusHerbsSupplements_QA  8_NHLBI_QA_XML
1_CancerGov_QA		     9_CDC_QA
2_GARD_QA		     LICENSE.txt
3_GHR_QA		     QA-TestSet-LiveQA-Med-Qrels-2479-Answers.zip
4_MPlus_Health_Topics_QA     readme.txt
5_NIDDK_QA


In [4]:
import os
import xml.etree.ElementTree as ET
import glob

# Define a function to parse XML files and extract question-answer pairs
def parse_medquad_xml(directory):
    qa_pairs = []
    xml_files = glob.glob(os.path.join(directory, "**/*.xml"), recursive=True)
    print(f"Found {len(xml_files)} XML files to parse.")

    for xml_file in xml_files:
        try:
            tree = ET.parse(xml_file)
            root = tree.getroot()
            # Iterate through all elements to find Question and Answer pairs
            question = None
            for elem in root.iter():
                if elem.tag == "Question":
                    question = elem.text.strip() if elem.text else ""
                elif elem.tag == "Answer" and question:
                    answer = elem.text.strip() if elem.text else ""
                    if question and answer:  # Only add if both question and answer are non-empty
                        qa_pairs.append({"question": question, "answer": answer})
                    question = None  # Reset question after pairing
        except Exception as e:
            print(f"Error parsing {xml_file}: {e}")
    return qa_pairs

# Parse all XML files in the MedQuAD directory
medquad_data = parse_medquad_xml("MedQuAD")

# Print the number of question-answer pairs and the first example
print(f"\nTotal question-answer pairs: {len(medquad_data)}")
print("First example:", medquad_data[0] if medquad_data else "No data found")

Found 11274 XML files to parse.

Total question-answer pairs: 16407
First example: {'question': "What is (are) Rasmussen's Encephalitis ?", 'answer': 'Rasmussens encephalitis is a rare, chronic inflammatory neurological disease that usually affects only one hemisphere of the brain. It usually occurs in children under the age of 10 (more rarely in adolescents and adults), and is characterized by frequent and severe seizures, loss of motor skills and speech, paralysis on one side of the body (hemiparesis), inflammation of the brain (encephalitis), and mental deterioration. Most individuals with Rasmussens encephalitis will experience frequent seizures and progressive brain damage in the affected hemisphere of the brain over the course of the first 8 to 12 months, and then enter a phase of permanent, but stable, neurological deficits. Rasmussens encephalitis has features of an autoimmune disease in which immune system cells enter the brain and cause inflammation and damage.Research is ong

In [5]:
from sklearn.model_selection import train_test_split

# Split the dataset into train (80%) and temp (20%)
train_data, temp_data = train_test_split(medquad_data, test_size=0.2, random_state=42)

# Split the temp data into validation (10%) and test (10%)
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

# Print the sizes of each split
print(f"Training set size: {len(train_data)}")
print(f"Validation set size: {len(val_data)}")
print(f"Test set size: {len(test_data)}")

Training set size: 13125
Validation set size: 1641
Test set size: 1641


In [6]:
import json

# Save the splits to JSON files
with open("train_data.json", "w", encoding="utf-8") as f:
    json.dump(train_data, f, ensure_ascii=False, indent=4)

with open("val_data.json", "w", encoding="utf-8") as f:
    json.dump(val_data, f, ensure_ascii=False, indent=4)

with open("test_data.json", "w", encoding="utf-8") as f:
    json.dump(test_data, f, ensure_ascii=False, indent=4)

print("Dataset splits saved as JSON files.")

Dataset splits saved as JSON files.


In [7]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the GPT-2 model and tokenizer
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Move the model to GPU if available
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

print("Model and tokenizer loaded successfully!")
print(f"Model is on device: {device}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Model and tokenizer loaded successfully!
Model is on device: cuda


In [8]:
import json

# Load the training data
with open("train_data.json", "r", encoding="utf-8") as f:
    train_data = json.load(f)

# Format the data for GPT-2
formatted_train_data = []
for pair in train_data:
    formatted_text = f"Question: {pair['question']} Answer: {pair['answer']} <|endoftext|>"
    formatted_train_data.append(formatted_text)

# Print the first formatted example
print("First formatted example:")
print(formatted_train_data[0])
print(f"Total formatted training examples: {len(formatted_train_data)}")

First formatted example:
Question: What is (are) Atherosclerosis ? Answer: Espaol
                
Atherosclerosis is a disease in which plaque builds up inside your arteries. Arteries are blood vessels that carry oxygen-rich blood to your heart and other parts of your body.
                
Plaque is made up of fat, cholesterol, calcium, and other substances found in the blood. Over time, plaque hardens and narrows your arteries. This limits the flow of oxygen-rich blood to your organs and other parts of your body.
                
Atherosclerosis can lead to serious problems, including heart attack, stroke, or even death.
                
Atherosclerosis
                

                
Atherosclerosis-Related Diseases
                
Atherosclerosis can affect any artery in the body, including arteries in the heart, brain, arms, legs, pelvis, and kidneys. As a result, different diseases may develop based on which arteries are affected.
                
Coronary Heart Disease
    

In [9]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
from datasets import Dataset
import torch

# Load the tokenizer and model
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Set the pad token (GPT-2 doesn't have a default pad token)
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = model.config.eos_token_id

# Tokenize the formatted training data and include labels
def tokenize_function(examples):
    tokenized = tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=512,
        return_tensors="pt"
    )
    # Set labels to be the same as input_ids (for causal language modeling)
    tokenized["labels"] = tokenized["input_ids"].clone()
    return tokenized

# Create a dataset from the formatted training data
train_dataset = Dataset.from_dict({"text": formatted_train_data})
tokenized_train_dataset = train_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"]
)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./mediguide_gpt2",
    overwrite_output_dir=True,
    num_train_epochs=1,  # Start with 1 epoch for a quick test
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
    logging_steps=500,
    fp16=True,  # Enable mixed precision for faster training on GPU
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
)

# Start fine-tuning
print("Starting fine-tuning...")
trainer.train()
print("Fine-tuning completed!")

# Save the fine-tuned model
model.save_pretrained("./mediguide_gpt2_finetuned")
tokenizer.save_pretrained("./mediguide_gpt2_finetuned")
print("Model and tokenizer saved!")

Map:   0%|          | 0/13125 [00:00<?, ? examples/s]



Starting fine-tuning...


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mdatascience-shrey[0m ([33mdatascience-shrey-indian-institute-of-technology[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
500,1.1031
1000,0.8575
1500,0.8025
2000,0.8344


Step,Training Loss
500,1.1031
1000,0.8575
1500,0.8025
2000,0.8344
2500,0.7603
3000,0.7819


Fine-tuning completed!
Model and tokenizer saved!


In [10]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the fine-tuned model and tokenizer
model = GPT2LMHeadModel.from_pretrained("./mediguide_gpt2_finetuned")
tokenizer = GPT2Tokenizer.from_pretrained("./mediguide_gpt2_finetuned")
model.to("cuda")

# Test the model with a sample question
test_question = "Question: What are the symptoms of diabetes? Answer:"
inputs = tokenizer(test_question, return_tensors="pt").to("cuda")
outputs = model.generate(
    inputs["input_ids"],
    max_length=200,
    num_return_sequences=1,
    no_repeat_ngram_size=2,
    early_stopping=True,
)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Generated response:", response)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Generated response: Question: What are the symptoms of diabetes? Answer: How might diabetes be treated? Diabetes is a disease that affects the body's ability to use glucose. The body uses glucose to make energy. It also uses it to build muscle and to fight infections. Diabetes can be caused by a variety of conditions, including diabetes, diabetes-related disorders, and diabetes mellitus. 


In [11]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
from peft import PromptTuningConfig, get_peft_model, TaskType
from datasets import Dataset
import torch

# Load the tokenizer and model
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Set the pad token
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = model.config.eos_token_id

# Tokenize the formatted training data
def tokenize_function(examples):
    tokenized = tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=512,
        return_tensors="pt"
    )
    tokenized["labels"] = tokenized["input_ids"].clone()
    return tokenized

train_dataset = Dataset.from_dict({"text": formatted_train_data})
tokenized_train_dataset = train_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"]
)

# Configure Prompt Tuning
peft_config = PromptTuningConfig(
    task_type=TaskType.CAUSAL_LM,  # Causal language modeling for GPT-2
    num_virtual_tokens=20,  # Number of virtual tokens to add as prompts
    prompt_tuning_init="TEXT",  # Initialize prompts with a specific text
    prompt_tuning_init_text="Medical question answering:",  # Initial prompt text
    tokenizer_name_or_path=model_name,
)

# Apply Prompt Tuning to the model
peft_model = get_peft_model(model, peft_config)
peft_model.print_trainable_parameters()  # Show the number of trainable parameters

# Define training arguments
training_args = TrainingArguments(
    output_dir="./mediguide_gpt2_prompt_tuning",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
    logging_steps=500,
    fp16=True,
)

# Initialize the Trainer
trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
)

# Start Prompt Tuning
print("Starting Prompt Tuning...")
trainer.train()
print("Prompt Tuning completed!")

# Save the fine-tuned model
peft_model.save_pretrained("./mediguide_gpt2_prompt_tuned")
tokenizer.save_pretrained("./mediguide_gpt2_prompt_tuned")
print("Prompt-tuned model and tokenizer saved!")

Map:   0%|          | 0/13125 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


trainable params: 15,360 || all params: 124,455,168 || trainable%: 0.0123
Starting Prompt Tuning...


Step,Training Loss
500,7.0911
1000,6.7803
1500,6.368
2000,5.4449
2500,4.9469
3000,4.6599


Step,Training Loss
500,7.0911
1000,6.7803
1500,6.368
2000,5.4449
2500,4.9469
3000,4.6599


Prompt Tuning completed!
Prompt-tuned model and tokenizer saved!


In [12]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from peft import PeftModel

# Load the base GPT-2 model and tokenizer
base_model_name = "gpt2"
base_model = GPT2LMHeadModel.from_pretrained(base_model_name)
tokenizer = GPT2Tokenizer.from_pretrained("./mediguide_gpt2_prompt_tuned")
base_model.to("cuda")

# Load the prompt-tuned PEFT model
model = PeftModel.from_pretrained(base_model, "./mediguide_gpt2_prompt_tuned")
model.to("cuda")
model.eval()

# Test the model with the same question
test_question = "Question: What are the symptoms of diabetes? Answer:"
inputs = tokenizer(test_question, return_tensors="pt").to("cuda")

# Generate with adjusted parameters
outputs = model.generate(
    inputs["input_ids"],
    max_length=200,
    min_length=50,
    num_return_sequences=1,
    no_repeat_ngram_size=2,
    early_stopping=False,
    pad_token_id=tokenizer.eos_token_id,
    do_sample=True,
    top_k=50,
    top_p=0.95,
    temperature=1.2,  # Increase temperature for more creative outputs
)

# Print the raw generated token IDs and intermediate decoding for debugging
print("Raw generated token IDs:", outputs[0].tolist())
print("Intermediate tokens:", [tokenizer.decode([token_id]) for token_id in outputs[0].tolist()])
print("Generated response:", tokenizer.decode(outputs[0], skip_special_tokens=True))



Raw generated token IDs: [50256, 383, 262, 12, 262, 357, 262, 383, 12, 389, 262, 262, 8, 284, 262, 13, 464, 12, 326, 262, 338, 6, 262, 837, 262, 6, 355, 366, 366, 379, 366, 262, 705, 198, 198, 464, 318, 262, 287, 12, 286, 257, 262, 286, 262, 198, 12, 318, 257, 705, 764, 198, 11, 262, 318, 366, 290, 286, 198, 1169, 12, 257, 257, 287, 262, 393, 262, 257, 764, 284, 281, 198, 1, 290, 416, 318, 198, 83, 262, 366, 286, 11, 357, 281, 262, 290, 257, 284, 287, 8, 262, 764, 318, 11, 379, 393, 198, 40, 262, 281, 366, 318, 287, 393, 281, 257, 318, 281, 290, 287, 290, 11, 198, 7, 262, 416, 357, 416, 257, 6, 357, 284, 318, 340, 338, 357, 530, 1267, 262, 532, 318, 379, 530, 198, 50256]
Intermediate tokens: ['<|endoftext|>', ' The', ' the', '-', ' the', ' (', ' the', ' The', '-', ' are', ' the', ' the', ')', ' to', ' the', '.', 'The', '-', ' that', ' the', "'s", "'", ' the', ' ,', ' the', "'", ' as', ' "', ' "', ' at', ' "', ' the', " '", '\n', '\n', 'The', ' is', ' the', ' in', '-', ' of', ' a', ' th

In [13]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
from peft import PromptTuningConfig, get_peft_model, TaskType
from datasets import Dataset
import torch

# Load the tokenizer and model
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Set the pad token
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = model.config.eos_token_id

# Tokenize the formatted training data
def tokenize_function(examples):
    tokenized = tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=512,
        return_tensors="pt"
    )
    tokenized["labels"] = tokenized["input_ids"].clone()
    return tokenized

train_dataset = Dataset.from_dict({"text": formatted_train_data})
tokenized_train_dataset = train_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"]
)

# Configure Prompt Tuning with adjusted parameters
peft_config = PromptTuningConfig(
    task_type=TaskType.CAUSAL_LM,
    num_virtual_tokens=30,  # Increased from 20 to 30 for more flexibility
    prompt_tuning_init="TEXT",
    prompt_tuning_init_text="Answer the following medical question:",  # Adjusted prompt for clarity
    tokenizer_name_or_path=model_name,
)

# Apply Prompt Tuning to the model
peft_model = get_peft_model(model, peft_config)
peft_model.print_trainable_parameters()

# Define training arguments with more epochs
training_args = TrainingArguments(
    output_dir="./mediguide_gpt2_prompt_tuning_v2",
    overwrite_output_dir=True,
    num_train_epochs=3,  # Increased to 3 epochs
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
    logging_steps=500,
    fp16=True,
)

# Initialize the Trainer
trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
)

# Start Prompt Tuning
print("Starting Prompt Tuning (re-training)...")
trainer.train()
print("Prompt Tuning (re-training) completed!")

# Save the re-trained model
peft_model.save_pretrained("./mediguide_gpt2_prompt_tuned_v2")
tokenizer.save_pretrained("./mediguide_gpt2_prompt_tuned_v2")
print("Re-trained prompt-tuned model and tokenizer saved!")

Map:   0%|          | 0/13125 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


trainable params: 23,040 || all params: 124,462,848 || trainable%: 0.0185
Starting Prompt Tuning (re-training)...


Step,Training Loss
500,6.9799
1000,6.3321
1500,4.7239
2000,3.6288
2500,2.827
3000,2.295
3500,2.0104
4000,1.8585
4500,1.8025
5000,1.7381


Prompt Tuning (re-training) completed!
Re-trained prompt-tuned model and tokenizer saved!


In [17]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from peft import PeftModel

# Load the base GPT-2 model and tokenizer
base_model_name = "gpt2"
base_model = GPT2LMHeadModel.from_pretrained(base_model_name)
tokenizer = GPT2Tokenizer.from_pretrained("./mediguide_gpt2_prompt_tuned_v2")
base_model.to("cuda")

# Load the re-trained prompt-tuned PEFT model
model = PeftModel.from_pretrained(base_model, "./mediguide_gpt2_prompt_tuned_v2")
model.to("cuda")
model.eval()

# Test the model with a simplified prompt
test_prompt = "What are the symptoms of diabetes? Answer:"
inputs = tokenizer(test_prompt, return_tensors="pt").to("cuda")
outputs = model.generate(
    inputs["input_ids"],
    max_length=80,  # Reduced further to prevent instability
    min_length=20,
    num_return_sequences=1,
    no_repeat_ngram_size=2,
    early_stopping=False,
    pad_token_id=tokenizer.eos_token_id,
    do_sample=False,  # Greedy decoding for predictability
)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Generated response:", response)

Generated response: I have a severe case of a rare, but not fatal, disease. I have been diagnosed with the disease for over a year.


In [18]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model, TaskType
from datasets import Dataset
import torch

# Load the tokenizer and model (without 8-bit quantization)
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Set the pad token
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = model.config.eos_token_id

# Tokenize the formatted training data
def tokenize_function(examples):
    tokenized = tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=512,
        return_tensors="pt"
    )
    tokenized["labels"] = tokenized["input_ids"].clone()
    return tokenized

train_dataset = Dataset.from_dict({"text": formatted_train_data})
tokenized_train_dataset = train_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"]
)

# Configure LoRA
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,  # Rank of the low-rank matrices
    lora_alpha=32,  # Scaling factor
    lora_dropout=0.1,  # Dropout for LoRA layers
    target_modules=["c_attn", "c_proj"],  # GPT-2 modules to apply LoRA to
)

# Apply LoRA to the model
peft_model = get_peft_model(model, lora_config)
peft_model.print_trainable_parameters()

# Define training arguments
training_args = TrainingArguments(
    output_dir="./mediguide_gpt2_lora",
    overwrite_output_dir=True,
    num_train_epochs=1,  # Start with 1 epoch to compare with basic fine-tuning
    per_device_train_batch_size=2,  # Reduced batch size to avoid memory issues
    save_steps=10_000,
    save_total_limit=2,
    logging_steps=500,
    fp16=True,
)

# Initialize the Trainer
trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
)

# Start LoRA fine-tuning
print("Starting LoRA fine-tuning...")
trainer.train()
print("LoRA fine-tuning completed!")

# Save the fine-tuned model
peft_model.save_pretrained("./mediguide_gpt2_lora")
tokenizer.save_pretrained("./mediguide_gpt2_lora")
print("LoRA fine-tuned model and tokenizer saved!")

Map:   0%|          | 0/13125 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


trainable params: 811,008 || all params: 125,250,816 || trainable%: 0.6475
Starting LoRA fine-tuning...


Step,Training Loss
500,1.7456
1000,1.1844
1500,1.0905
2000,1.0499
2500,1.0117
3000,0.9832
3500,0.9852
4000,1.0765
4500,0.9637
5000,0.9295


LoRA fine-tuning completed!
LoRA fine-tuned model and tokenizer saved!


In [22]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from peft import PeftModel

# Load the base GPT-2 model and tokenizer
base_model_name = "gpt2"
base_model = GPT2LMHeadModel.from_pretrained(base_model_name)
tokenizer = GPT2Tokenizer.from_pretrained("./mediguide_gpt2_lora")
base_model.to("cuda")

# Load the LoRA fine-tuned PEFT model
model = PeftModel.from_pretrained(base_model, "./mediguide_gpt2_lora")
model.to("cuda")
model.eval()

# Test the model with a simplified prompt
test_prompt = "List the symptoms of diabetes in this format: 'The symptoms of diabetes include [list symptoms]. Consult a healthcare provider for diagnosis.'"
inputs = tokenizer(test_prompt, return_tensors="pt").to("cuda")
outputs = model.generate(
    inputs["input_ids"],
    max_length=150,  # Increased to allow for a complete response
    min_length=40,
    num_return_sequences=1,
    no_repeat_ngram_size=2,
    early_stopping=False,
    pad_token_id=tokenizer.eos_token_id,
    do_sample=False,  # Greedy decoding for predictability
)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Generated response:", response)

Generated response: List the symptoms of diabetes in this format: 'The symptoms of diabetes include [list symptoms]. Consult a healthcare provider for diagnosis.'  -   The following list of signs and symptoms are from the MedlinePlus Medical Dictionary. You can look up the definitions for these medical terms here.  These medical uses are often covered in other medical journals. In addition, a specific medical condition may be listed as a medical emergency. The Medline Plus Medical Database ( Medline ) has collected information on how often medical procedures are used in the United States. Much of this information comes from Orphanet, an online encyclopedia. For information about how to use this database, see the Wikipedia article. Sometimes, the signs/symptoms listed in these Medical Terms come from information


In [20]:
import json

# Load the test data
with open("test_data.json", "r", encoding="utf-8") as f:
    test_data = json.load(f)

# Search for a question about diabetes symptoms
reference_answer = None
for pair in test_data:
    if "symptoms of diabetes" in pair["question"].lower():
        print("Found matching question:", pair["question"])
        print("Reference answer:", pair["answer"])
        reference_answer = pair["answer"]
        break

if reference_answer is None:
    print("No exact match found for 'symptoms of diabetes' in the test set.")

Found matching question: What are the symptoms of Diabetes ?
Reference answer: Diabetes is often called a "silent" disease because it can cause serious complications even before you have symptoms. Symptoms can also be so mild that you dont notice them. An estimated 8 million people in the United States have type 2 diabetes and dont know it, according to 2012 estimates by the Centers for Disease Control and Prevention (CDC). Common Signs Some common symptoms of diabetes are: - being very thirsty  - frequent urination  - feeling very hungry or tired  - losing weight without trying  - having sores that heal slowly  - having dry, itchy skin  - loss of feeling or tingling in the feet  - having blurry eyesight. being very thirsty frequent urination feeling very hungry or tired losing weight without trying having sores that heal slowly having dry, itchy skin loss of feeling or tingling in the feet having blurry eyesight. Signs of type 1 diabetes usually develop over a short period of time. Th

In [24]:
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=26e65c35e2b9c48d6d93c09c4e40b1afb3f3b3c3cec9c6530710166127d2561b
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [33]:
import os

# List the current working directory and its contents
print("Current working directory:", os.getcwd())
print("Contents of current directory:", os.listdir())

# Check mediguide_gpt2 directory
directory_basic = "./mediguide_gpt2/checkpoint-3282"
if os.path.exists(directory_basic):
    print("Basic Fine-Tuned Directory exists. Contents:", os.listdir(directory_basic))
else:
    print("Basic Fine-Tuned Directory does not exist:", directory_basic)

# Check mediguide_gpt2_prompt_tuned_v2 directory
directory_prompt = "./mediguide_gpt2_prompt_tuned_v2"
if os.path.exists(directory_prompt):
    print("Prompt-Tuned Directory exists. Contents:", os.listdir(directory_prompt))
else:
    print("Prompt-Tuned Directory does not exist:", directory_prompt)

# Check mediguide_gpt2_lora directory
directory_lora = "./mediguide_gpt2_lora"
if os.path.exists(directory_lora):
    print("LoRA Directory exists. Contents:", os.listdir(directory_lora))
else:
    print("LoRA Directory does not exist:", directory_lora)

Current working directory: /content
Contents of current directory: ['.config', 'mediguide_gpt2_prompt_tuning_v2', 'wandb', 'mediguide_gpt2_lora', 'train_data.json', 'MedQuAD', 'mediguide_gpt2_prompt_tuned', 'mediguide_gpt2', 'test_data.json', 'mediguide_gpt2_prompt_tuning', 'val_data.json', 'mediguide_gpt2_finetuned', 'mediguide_gpt2_prompt_tuned_v2', 'sample_data']
Basic Fine-Tuned Directory exists. Contents: ['model.safetensors', 'scaler.pt', 'rng_state.pth', 'generation_config.json', 'optimizer.pt', 'config.json', 'scheduler.pt', 'trainer_state.json', 'training_args.bin']
Prompt-Tuned Directory exists. Contents: ['vocab.json', 'adapter_model.safetensors', 'tokenizer_config.json', 'adapter_config.json', 'merges.txt', 'README.md', 'special_tokens_map.json']
LoRA Directory exists. Contents: ['vocab.json', 'adapter_model.safetensors', 'tokenizer_config.json', 'adapter_config.json', 'checkpoint-6563', 'merges.txt', 'README.md', 'runs', 'special_tokens_map.json']


In [34]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the model from the checkpoint
checkpoint_dir = "./mediguide_gpt2/checkpoint-3282"
print("Loading model from checkpoint:", checkpoint_dir)
model = GPT2LMHeadModel.from_pretrained(checkpoint_dir)

# Load the tokenizer from an existing directory (e.g., mediguide_gpt2_lora)
tokenizer_dir = "./mediguide_gpt2_lora"  # We know this directory has the tokenizer files
print("Loading tokenizer from:", tokenizer_dir)
tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_dir)

# Save the model and tokenizer to mediguide_gpt2
save_dir = "./mediguide_gpt2"
print("Saving model and tokenizer to:", save_dir)
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)
print("Model and tokenizer saved successfully!")

# Verify the updated contents of mediguide_gpt2
import os
print("Updated contents of mediguide_gpt2:", os.listdir(save_dir))

Loading model from checkpoint: ./mediguide_gpt2/checkpoint-3282
Loading tokenizer from: ./mediguide_gpt2_lora
Saving model and tokenizer to: ./mediguide_gpt2
Model and tokenizer saved successfully!
Updated contents of mediguide_gpt2: ['model.safetensors', 'generation_config.json', 'vocab.json', 'checkpoint-3282', 'tokenizer_config.json', 'config.json', 'merges.txt', 'runs', 'special_tokens_map.json']


In [38]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Download the model and tokenizer explicitly
print("Downloading GPT-2 model and tokenizer...")
try:
    model = GPT2LMHeadModel.from_pretrained("gpt2")
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    print("GPT-2 model and tokenizer downloaded successfully!")
except Exception as e:
    print("Error downloading GPT-2 model:", str(e))

Downloading GPT-2 model and tokenizer...
GPT-2 model and tokenizer downloaded successfully!


In [40]:
import time
import os
from rouge_score import rouge_scorer
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from peft import PeftModel

# Reference answer (simplified)
reference_answer = "The symptoms of diabetes include being very thirsty, frequent urination, feeling very hungry or tired, losing weight without trying, having sores that heal slowly, having dry, itchy skin, loss of feeling or tingling in the feet, and having blurry eyesight. Consult a healthcare provider for diagnosis."

# Function to evaluate a model
def evaluate_model(model, tokenizer, test_prompt, reference_answer):
    model.to("cuda")
    model.eval()

    # Measure latency
    start_time = time.time()
    inputs = tokenizer(test_prompt, return_tensors="pt").to("cuda")
    outputs = model.generate(
        inputs["input_ids"],
        max_length=150,
        min_length=40,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        early_stopping=False,
        pad_token_id=tokenizer.eos_token_id,
        do_sample=False,
    )
    latency = time.time() - start_time
    generated_response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Compute ROUGE scores
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge_scores = scorer.score(reference_answer, generated_response)

    # Compute model size
    model_dir = model.config._name_or_path if not isinstance(model, PeftModel) else model.base_model.config._name_or_path
    model_size = sum(os.path.getsize(os.path.join(model_dir, f)) for f in os.listdir(model_dir) if os.path.isfile(os.path.join(model_dir, f))) / (1024 ** 2)  # Size in MB

    return {
        "generated_response": generated_response,
        "rouge1": rouge_scores["rouge1"].fmeasure,
        "rouge2": rouge_scores["rouge2"].fmeasure,
        "rougeL": rouge_scores["rougeL"].fmeasure,
        "latency": latency,
        "model_size_mb": model_size,
        "perplexity": "TBD",
    }

# Test prompt (same for all models)
test_prompt = "List the symptoms of diabetes in this format: 'The symptoms of diabetes include [list symptoms]. Consult a healthcare provider for diagnosis.'"

# Evaluate Basic Fine-Tuned Model
base_model = GPT2LMHeadModel.from_pretrained("./mediguide_gpt2_finetuned")
tokenizer_basic = GPT2Tokenizer.from_pretrained("./mediguide_gpt2_finetuned")
basic_results = evaluate_model(base_model, tokenizer_basic, test_prompt, reference_answer)
print("Basic Fine-Tuned Model Results:", basic_results)

# Evaluate Prompt-Tuned Model
base_model = GPT2LMHeadModel.from_pretrained("./mediguide_gpt2")
tokenizer_prompt = GPT2Tokenizer.from_pretrained("./mediguide_gpt2_prompt_tuned_v2")
prompt_model = PeftModel.from_pretrained(base_model, "./mediguide_gpt2_prompt_tuned_v2")
prompt_results = evaluate_model(prompt_model, tokenizer_prompt, test_prompt, reference_answer)
print("Prompt-Tuned Model Results:", prompt_results)

# Evaluate LoRA Fine-Tuned Model
base_model = GPT2LMHeadModel.from_pretrained("./mediguide_gpt2")
tokenizer_lora = GPT2Tokenizer.from_pretrained("./mediguide_gpt2_lora")
lora_model = PeftModel.from_pretrained(base_model, "./mediguide_gpt2_lora")
lora_results = evaluate_model(lora_model, tokenizer_lora, test_prompt, reference_answer)
print("LoRA Fine-Tuned Model Results:", lora_results)

Basic Fine-Tuned Model Results: {'generated_response': "List the symptoms of diabetes in this format: 'The symptoms of diabetes include [list symptoms]. Consult a healthcare provider for diagnosis.'\n  \nDiabetes is a disease of the pancreas, which is the organ that makes blood sugar. The pancreatic is one of two types of pancrias.\n\nType 1 is characterized by a shortage of insulin, a hormone that helps the body produce insulin. Type 2 is more severe, with a lack of glucose in the blood. ", 'rouge1': 0.2479338842975206, 'rouge2': 0.16806722689075632, 'rougeL': 0.19834710743801653, 'latency': 0.896780252456665, 'model_size_mb': 476.1045093536377, 'perplexity': 'TBD'}
Prompt-Tuned Model Results: {'generated_response': '   Is there a genetic disorder that causes the signs and symptoms of  ?  The Human Phenotype Ontology provides the following list of signs or symptoms for Isolated Scleroderma. If the information is available, the table below includes how often the symptom is seen in peop

In [41]:
import torch
import json
from datasets import Dataset

# Load a subset of the test data
with open("test_data.json", "r", encoding="utf-8") as f:
    test_data = json.load(f)

# Use a subset of 100 examples to compute perplexity
test_subset = test_data[:100]
test_texts = [pair["question"] + " Answer: " + pair["answer"] for pair in test_subset]
test_dataset = Dataset.from_dict({"text": test_texts})

# Load the tokenizer (using the one from mediguide_gpt2 for consistency)
tokenizer_basic = GPT2Tokenizer.from_pretrained("./mediguide_gpt2")
tokenizer_basic.pad_token = tokenizer_basic.eos_token

# Tokenize the test data
def tokenize_function(examples):
    tokenized = tokenizer_basic(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=512,
        return_tensors="pt"
    )
    tokenized["labels"] = tokenized["input_ids"].clone()
    return tokenized

tokenized_test_dataset = test_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"]
)

# Function to compute perplexity
def compute_perplexity(model, dataset):
    model.eval()
    total_loss = 0.0
    total_tokens = 0

    with torch.no_grad():
        for batch in dataset:
            input_ids = torch.tensor(batch["input_ids"]).unsqueeze(0).to("cuda")
            labels = torch.tensor(batch["labels"]).unsqueeze(0).to("cuda")
            outputs = model(input_ids, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()
            total_tokens += (labels != tokenizer_basic.pad_token_id).sum().item()

    avg_loss = total_loss / len(dataset)
    perplexity = torch.exp(torch.tensor(avg_loss)).item()
    return perplexity

# Load models
basic_model = GPT2LMHeadModel.from_pretrained("./mediguide_gpt2")
prompt_model = PeftModel.from_pretrained(GPT2LMHeadModel.from_pretrained("gpt2"), "./mediguide_gpt2_prompt_tuned_v2")
lora_model = PeftModel.from_pretrained(GPT2LMHeadModel.from_pretrained("gpt2"), "./mediguide_gpt2_lora")

# Move models to GPU
basic_model.to("cuda")
prompt_model.to("cuda")
lora_model.to("cuda")

# Compute perplexity for each model
basic_perplexity = compute_perplexity(basic_model, tokenized_test_dataset)
print("Basic Fine-Tuned Model Perplexity:", basic_perplexity)

prompt_perplexity = compute_perplexity(prompt_model, tokenized_test_dataset)
print("Prompt-Tuned Model Perplexity:", prompt_perplexity)

lora_perplexity = compute_perplexity(lora_model, tokenized_test_dataset)
print("LoRA Fine-Tuned Model Perplexity:", lora_perplexity)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Basic Fine-Tuned Model Perplexity: 2.3794732093811035
Prompt-Tuned Model Perplexity: 3.8762712478637695
LoRA Fine-Tuned Model Perplexity: 11.410538673400879


In [42]:
import os
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Verify loading from mediguide_gpt2
print("Loading model and tokenizer from mediguide_gpt2...")
try:
    model = GPT2LMHeadModel.from_pretrained("./mediguide_gpt2")
    tokenizer = GPT2Tokenizer.from_pretrained("./mediguide_gpt2")
    print("Model and tokenizer loaded successfully from mediguide_gpt2!")
except Exception as e:
    print("Error loading from mediguide_gpt2:", str(e))

# Verify loading from mediguide_gpt2_finetuned
print("\nLoading model and tokenizer from mediguide_gpt2_finetuned...")
try:
    model = GPT2LMHeadModel.from_pretrained("./mediguide_gpt2_finetuned")
    tokenizer = GPT2Tokenizer.from_pretrained("./mediguide_gpt2_finetuned")
    print("Model and tokenizer loaded successfully from mediguide_gpt2_finetuned!")
except Exception as e:
    print("Error loading from mediguide_gpt2_finetuned:", str(e))

# Check contents of mediguide_gpt2_finetuned
directory = "./mediguide_gpt2_finetuned"
if os.path.exists(directory):
    print("\nContents of mediguide_gpt2_finetuned:", os.listdir(directory))
else:
    print("Directory mediguide_gpt2_finetuned does not exist:", directory)

Loading model and tokenizer from mediguide_gpt2...
Model and tokenizer loaded successfully from mediguide_gpt2!

Loading model and tokenizer from mediguide_gpt2_finetuned...
Model and tokenizer loaded successfully from mediguide_gpt2_finetuned!

Contents of mediguide_gpt2_finetuned: ['model.safetensors', 'generation_config.json', 'vocab.json', 'tokenizer_config.json', 'config.json', 'merges.txt', 'special_tokens_map.json']


In [43]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Download the model and tokenizer explicitly
print("Downloading GPT-2 model and tokenizer...")
try:
    model = GPT2LMHeadModel.from_pretrained("gpt2")
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    print("GPT-2 model and tokenizer downloaded successfully!")
except Exception as e:
    print("Error downloading GPT-2 model:", str(e))

Downloading GPT-2 model and tokenizer...
GPT-2 model and tokenizer downloaded successfully!


In [44]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from peft import PeftModel

# Load the base GPT-2 model and tokenizer
base_model_name = "gpt2"
base_model = GPT2LMHeadModel.from_pretrained(base_model_name)
tokenizer = GPT2Tokenizer.from_pretrained("./mediguide_gpt2_lora")
base_model.to("cuda")

# Load the LoRA fine-tuned PEFT model
model = PeftModel.from_pretrained(base_model, "./mediguide_gpt2_lora")
model.to("cuda")
model.eval()

# Test the model with a simplified prompt
test_prompt = "List the symptoms of diabetes in this format: 'The symptoms of diabetes include [list symptoms]. Consult a healthcare provider for diagnosis.'"
inputs = tokenizer(test_prompt, return_tensors="pt").to("cuda")
outputs = model.generate(
    inputs["input_ids"],
    max_length=150,
    min_length=40,
    num_return_sequences=1,
    no_repeat_ngram_size=2,
    early_stopping=False,
    pad_token_id=tokenizer.eos_token_id,
    do_sample=False,
)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Generated response:", response)

Generated response: List the symptoms of diabetes in this format: 'The symptoms of diabetes include [list symptoms]. Consult a healthcare provider for diagnosis.'  -   The following list of signs and symptoms are from the MedlinePlus Medical Dictionary. You can look up the definitions for these medical terms here.  These medical uses are often covered in other medical journals. In addition, a specific medical condition may be listed as a medical emergency. The Medline Plus Medical Database ( Medline ) has collected information on how often medical procedures are used in the United States. Much of this information comes from Orphanet, an online encyclopedia. For information about how to use this database, see the Wikipedia article. Sometimes, the signs/symptoms listed in these Medical Terms come from information


In [45]:
import time
import os
from rouge_score import rouge_scorer
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from peft import PeftModel

# Reference answer (simplified)
reference_answer = "The symptoms of diabetes include being very thirsty, frequent urination, feeling very hungry or tired, losing weight without trying, having sores that heal slowly, having dry, itchy skin, loss of feeling or tingling in the feet, and having blurry eyesight. Consult a healthcare provider for diagnosis."

# Function to evaluate a model
def evaluate_model(model, tokenizer, test_prompt, reference_answer):
    model.to("cuda")
    model.eval()

    # Measure latency
    start_time = time.time()
    inputs = tokenizer(test_prompt, return_tensors="pt").to("cuda")
    outputs = model.generate(
        inputs["input_ids"],
        max_length=150,
        min_length=40,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        early_stopping=False,
        pad_token_id=tokenizer.eos_token_id,
        do_sample=False,
    )
    latency = time.time() - start_time
    generated_response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Compute ROUGE scores
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge_scores = scorer.score(reference_answer, generated_response)

    # Compute model size
    model_dir = model.config._name_or_path if not isinstance(model, PeftModel) else model.base_model.config._name_or_path
    model_size = sum(os.path.getsize(os.path.join(model_dir, f)) for f in os.listdir(model_dir) if os.path.isfile(os.path.join(model_dir, f))) / (1024 ** 2)  # Size in MB

    return {
        "generated_response": generated_response,
        "rouge1": rouge_scores["rouge1"].fmeasure,
        "rouge2": rouge_scores["rouge2"].fmeasure,
        "rougeL": rouge_scores["rougeL"].fmeasure,
        "latency": latency,
        "model_size_mb": model_size,
        "perplexity": "TBD",  # We'll update this with the computed values
    }

# Test prompt (same for all models)
test_prompt = "List the symptoms of diabetes in this format: 'The symptoms of diabetes include [list symptoms]. Consult a healthcare provider for diagnosis.'"

# Evaluate Basic Fine-Tuned Model
base_model = GPT2LMHeadModel.from_pretrained("./mediguide_gpt2")
tokenizer_basic = GPT2Tokenizer.from_pretrained("./mediguide_gpt2")
basic_results = evaluate_model(base_model, tokenizer_basic, test_prompt, reference_answer)
print("Basic Fine-Tuned Model Results:", basic_results)

# Evaluate Prompt-Tuned Model
base_model = GPT2LMHeadModel.from_pretrained("./mediguide_gpt2")
tokenizer_prompt = GPT2Tokenizer.from_pretrained("./mediguide_gpt2_prompt_tuned_v2")
prompt_model = PeftModel.from_pretrained(base_model, "./mediguide_gpt2_prompt_tuned_v2")
prompt_results = evaluate_model(prompt_model, tokenizer_prompt, test_prompt, reference_answer)
print("Prompt-Tuned Model Results:", prompt_results)

# Evaluate LoRA Fine-Tuned Model
base_model = GPT2LMHeadModel.from_pretrained("./mediguide_gpt2")
tokenizer_lora = GPT2Tokenizer.from_pretrained("./mediguide_gpt2_lora")
lora_model = PeftModel.from_pretrained(base_model, "./mediguide_gpt2_lora")
lora_results = evaluate_model(lora_model, tokenizer_lora, test_prompt, reference_answer)
print("LoRA Fine-Tuned Model Results:", lora_results)

Basic Fine-Tuned Model Results: {'generated_response': "List the symptoms of diabetes in this format: 'The symptoms of diabetes include [list symptoms]. Consult a healthcare provider for diagnosis.'\n  \nDiabetes is a disease of the pancreas, which is the organ that makes blood sugar. The pancreatic is one of two types of pancrias.\n\nType 1 is characterized by a shortage of insulin, a hormone that helps the body produce insulin. Type 2 is more severe, with a lack of glucose in the blood. ", 'rouge1': 0.2479338842975206, 'rouge2': 0.16806722689075632, 'rougeL': 0.19834710743801653, 'latency': 0.7909693717956543, 'model_size_mb': 476.10461711883545, 'perplexity': 'TBD'}
Prompt-Tuned Model Results: {'generated_response': '   Is there a genetic disorder that causes the signs and symptoms of  ?  The Human Phenotype Ontology provides the following list of signs or symptoms for Isolated Scleroderma. If the information is available, the table below includes how often the symptom is seen in pe