# Install necessary libraries

In [None]:
!pip install transformers accelerate peft datasets evaluate bitsandbytes



In [None]:
import os
import re
import torch
from datasets import load_dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorForSeq2Seq
)
from peft import PrefixTuningConfig, get_peft_model

# Load and preprocess the dataset

In [None]:
raw_dataset = load_dataset("omi-health/medical-dialogue-to-soap-summary")

def preprocess_dialogue(example):
    dialogue = example["dialogue"]
    soap = example["soap"]
    dialogue = re.sub(r'[^A-Za-z0-9\s\.,:?-]', '', dialogue).strip()
    soap = re.sub(r'[^A-Za-z0-9\s\.,:?-]', '', soap).strip()
    dialogue = dialogue.replace("Doctor:", "[Doctor]:").replace("Patient:", "[Patient]:")
    return {"dialogue": dialogue, "soap": soap}
git pul
processed = raw_dataset.map(preprocess_dialogue, batched=False)

# Load tokenizer and model with 8-bit quantization via BitsAndBytesConfig

In [None]:
model_name = "unsloth/Llama-3.2-3B-bnb-4bit"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

bnb_config = BitsAndBytesConfig(load_in_8bit=True)
model = AutoModelForCausalLM.from_pretrained(
    "unsloth/Llama-3.2-3B-bnb-4bit",
    quantization_config=bnb_config,
    device_map="auto"
)

# 3) Configure PEFT prefix tuning (remove unsupported args)
peft_config = PrefixTuningConfig(
    task_type="CAUSAL_LM",
    inference_mode=False,
    num_virtual_tokens=20,
    prefix_projection=True
)
model = get_peft_model(model, peft_config)




# Tokenization function for seq2seq

In [None]:
max_prompt = 512
max_target = 256
max_total = max_prompt + max_target + 1

def tokenize_fn(example):
    prompt = "Summarize the following medical dialogue into a SOAP note:\n" + example["dialogue"]
    prompt_ids = tokenizer(prompt, truncation=True, max_length=max_prompt, add_special_tokens=False)["input_ids"]
    target_ids = tokenizer(example["soap"], truncation=True, max_length=max_target, add_special_tokens=False)["input_ids"]
    seq = prompt_ids + target_ids + [tokenizer.eos_token_id]
    if len(seq) > max_total:
        seq = seq[-max_total:]
    attention_mask = [1] * len(seq)
    pad_len = max_total - len(seq)
    seq += [tokenizer.pad_token_id] * pad_len
    attention_mask += [0] * pad_len
    labels = [-100] * len(prompt_ids) + target_ids + [tokenizer.eos_token_id]
    labels = labels[-max_total:]
    labels += [-100] * pad_len
    return {"input_ids": seq, "attention_mask": attention_mask, "labels": labels}

# Apply tokenization to all splits
tokenized = processed.map(tokenize_fn, batched=False, remove_columns=["dialogue","soap"])

Map:   0%|          | 0/9250 [00:00<?, ? examples/s]

In [None]:
train_ds = tokenized["train"]
eval_ds  = tokenized["validation"]
test_ds  = tokenized["test"]

In [None]:
torch.cuda.empty_cache()

# Training arguments (disable load_best_model_at_end for PEFT)

In [None]:
training_args = TrainingArguments(
    output_dir="/content/llama-7b-prefix-soap",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    learning_rate=5e-5,
    fp16=True,
    logging_steps=100,
    eval_steps=500,
    eval_strategy="steps",
    save_steps=500,
    save_strategy="steps",
    load_best_model_at_end=False,  # disable for prefix tuning
    report_to="none"
)

# Initialize Trainer and start training

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    tokenizer=tokenizer
)

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


# Start training

In [None]:
trainer.train()

Step,Training Loss,Validation Loss
500,0.628,0.612164
1000,0.5866,0.584016
1500,0.5482,0.574753


TrainOutput(global_step=1734, training_loss=0.60922476384604, metrics={'train_runtime': 3282.9658, 'train_samples_per_second': 8.453, 'train_steps_per_second': 0.528, 'total_flos': 3.604659941347246e+17, 'train_loss': 0.60922476384604, 'epoch': 2.9961089494163424})

# Save the prefix-tuned model

In [None]:
model.save_pretrained("/content/llama-3b-prefix-soap")

In [None]:
from google.colab import drive
drive.mount('/content/drive')

# Change this to wherever you’d like in your Drive
drive_path = "/content/drive/MyDrive//content/llama3b-prefix-soap"

# Save both model weights and tokenizer files
model.save_pretrained(drive_path)
tokenizer.save_pretrained(drive_path)

print(f"✅ Model and tokenizer saved to {drive_path}")

Mounted at /content/drive
✅ Model and tokenizer saved to /content/drive/MyDrive//content/llama3b-prefix-soap


# Generation helper and test example

In [None]:
def generate_soap(dialogue_text):
    prompt = "Summarize the following medical dialogue into a SOAP note:\n" + dialogue_text
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    out = model.generate(**inputs, max_length=max_total, num_beams=4, early_stopping=True)
    return tokenizer.decode(out[0], skip_special_tokens=True)

sample = processed["test"][0]
print("Original Dialogue:\n", sample["dialogue"])
print("Reference SOAP:\n", sample["soap"])
print("Generated SOAP:\n", generate_soap(sample["dialogue"]))

Original Dialogue:
 [Doctor]: Hello, can you please tell me about your past medical history?
[Patient]: Hi, I dont have any past medical history.
[Doctor]: Okay. What brings you in today?
[Patient]: Ive been experiencing painless blurry vision in my right eye for a week now. Ive also had intermittent fevers, headache, body aches, and a nonpruritic maculopapular rash on my lower legs for the past 6 months.
[Doctor]: Thank you for sharing that. Have you had any other symptoms such as neck stiffness, nausea, vomiting, Raynauds phenomenon, oral ulcerations, chest pain, shortness of breath, abdominal pain, or photosensitivity?
[Patient]: No, only an isolated episode of left knee swelling and testicular swelling in the past.
[Doctor]: Do you work with any toxic substances or have any habits like smoking, drinking, or illicit drug use?
[Patient]: No, I work as a flooring installer and I dont have any toxic habits.
[Doctor]: Alright. We checked your vital signs and they were normal. During the



Generated SOAP:
 Summarize the following medical dialogue into a SOAP note:
[Doctor]: Hello, can you please tell me about your past medical history?
[Patient]: Hi, I dont have any past medical history.
[Doctor]: Okay. What brings you in today?
[Patient]: Ive been experiencing painless blurry vision in my right eye for a week now. Ive also had intermittent fevers, headache, body aches, and a nonpruritic maculopapular rash on my lower legs for the past 6 months.
[Doctor]: Thank you for sharing that. Have you had any other symptoms such as neck stiffness, nausea, vomiting, Raynauds phenomenon, oral ulcerations, chest pain, shortness of breath, abdominal pain, or photosensitivity?
[Patient]: No, only an isolated episode of left knee swelling and testicular swelling in the past.
[Doctor]: Do you work with any toxic substances or have any habits like smoking, drinking, or illicit drug use?
[Patient]: No, I work as a flooring installer and I dont have any toxic habits.
[Doctor]: Alright. We c

In [None]:
from tqdm.auto import tqdm
import pandas as pd

# 8) Generation helper using max_new_tokens to avoid input length issues
def generate_soap(dialogue_text):
    prompt = "Summarize the following medical dialogue into a SOAP note:\n" + dialogue_text
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=max_prompt
    ).to(model.device)
    out = model.generate(
        **inputs,
        max_new_tokens=max_target,
        num_beams=4,
        early_stopping=True
    )
    return tokenizer.decode(out[0], skip_special_tokens=True)

records = []
for idx, sample in enumerate(tqdm(processed["test"].select(range(100)), total=100, desc="Generating SOAP")):
    ref = sample["soap"]
    gen = generate_soap(sample["dialogue"])
    records.append({"reference_soap": ref, "generated_soap": gen})
    print(f"Sample {idx+1}/100 complete.")

df = pd.DataFrame(records)
output_path = "soap_comparison.csv"
df.to_csv(output_path, index=False)
print(f"Saved {output_path} with first 100 SOAP comparisons.")

Generating SOAP:   0%|          | 0/100 [00:00<?, ?it/s]

Sample 1/100 complete.
Sample 2/100 complete.
Sample 3/100 complete.
Sample 4/100 complete.
Sample 5/100 complete.
Sample 6/100 complete.
Sample 7/100 complete.
Sample 8/100 complete.
Sample 9/100 complete.
Sample 10/100 complete.
Sample 11/100 complete.
Sample 12/100 complete.
Sample 13/100 complete.
Sample 14/100 complete.
Sample 15/100 complete.
Sample 16/100 complete.
Sample 17/100 complete.
Sample 18/100 complete.
Sample 19/100 complete.
Sample 20/100 complete.
Sample 21/100 complete.
Sample 22/100 complete.
Sample 23/100 complete.
Sample 24/100 complete.
Sample 25/100 complete.
Sample 26/100 complete.
Sample 27/100 complete.
Sample 28/100 complete.
Sample 29/100 complete.
Sample 30/100 complete.
Sample 31/100 complete.
Sample 32/100 complete.
Sample 33/100 complete.
Sample 34/100 complete.
Sample 35/100 complete.
Sample 36/100 complete.
Sample 37/100 complete.
Sample 38/100 complete.
Sample 39/100 complete.
Sample 40/100 complete.
Sample 41/100 complete.
Sample 42/100 complete.
S

# Evaluation

In [None]:
!rm -r ~/.cache/huggingface/datasets

In [None]:
!pip install -U bitsandbytes



In [None]:
from huggingface_hub import login
login("")

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import torch
from datasets import load_dataset, load_metric
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from tqdm import tqdm
import pandas as pd
import os # Import the os module
from peft import PeftModel, PeftConfig # Import PeftModel and PeftConfig

# Load the dataset
dataset = load_dataset("omi-health/medical-dialogue-to-soap-summary")
test_data = dataset["test"]

# Load evaluation metrics
# Make sure these libraries are installed if you haven't already
# !pip install rouge_score bert_score sacrebleu
rouge = load_metric("rouge")
bertscore = load_metric("bertscore")
bleu = load_metric("bleu")

# Load your fine-tuned model and tokenizer
model_path = "/content/drive/MyDrive/llama3b-prefix-soap"

# --- Add this check ---
print(f"Checking directory contents at: {model_path}")
if os.path.exists(model_path):
    print("Directory exists. Contents:")
    contents = os.listdir(model_path)
    for item in contents:
        print(f"- {item}")

    # Check for PEFT adapter weights file (usually adapter_model.safetensors or adapter_model.bin)
    peft_weights_found = False
    for item in contents:
        if item.startswith("adapter_model.") and (item.endswith(".safetensors") or item.endswith(".bin")):
            peft_weights_found = True
            break

    if not peft_weights_found:
        print("Error: PEFT adapter weights file (adapter_model.safetensors or .bin) not found in the directory!")
        # Exit or raise an error if the weights are not found
        exit()
else:
    print("Error: Directory does not exist!")
    exit()
# --- End of added check ---

# Load the base model first, potentially with the same quantization used during training
# Assuming you used 8-bit quantization based on the training code
bnb_config = BitsAndBytesConfig(load_in_8bit=True)

# Get the base model name from the saved PEFT config
# You might need to load the config first to get the base model name
peft_config = PeftConfig.from_pretrained(model_path)
base_model_name = peft_config.base_model_name_or_path

# Load the base model
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=bnb_config,
    device_map="auto"
)

# Load the tokenizer from the saved path (PEFT saves the tokenizer too)
tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True)
tokenizer.pad_token = tokenizer.eos_token # Ensure pad token is set

print("Base model and tokenizer loaded successfully.")

# Load the PEFT adapter weights onto the base model
model = PeftModel.from_pretrained(base_model, model_path)
print("PEFT model loaded successfully.")

model.eval()

# Function to generate SOAP note
def generate_soap(dialogue_text):
    prompt = "Summarize the following medical dialogue into a SOAP note:\n" + dialogue_text
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        out = model.generate(**inputs, max_new_tokens=256, do_sample=False)  # greedy, faster
    return tokenizer.decode(out[0], skip_special_tokens=True)


# Generate predictions and collect references
references = []
predictions = []

for sample in tqdm(test_data):
    ref = sample["soap"]
    pred = generate_soap(sample["dialogue"])
    references.append(ref)
    predictions.append(pred)

# Compute ROUGE scores
rouge_result = rouge.compute(predictions=predictions, references=references)

# Compute BERTScore
bertscore_result = bertscore.compute(predictions=predictions, references=references, lang="en")

# Prepare data for BLEU (requires tokenized inputs)
bleu_references = [[ref.split()] for ref in references]
bleu_predictions = [pred.split() for pred in predictions]
bleu_result = bleu.compute(predictions=bleu_predictions, references=bleu_references)

# Combine all evaluation results
results = {
    "ROUGE-1": round(rouge_result["rouge1"].mid.fmeasure, 4),
    "ROUGE-2": round(rouge_result["rouge2"].mid.fmeasure, 4),
    "ROUGE-Lsum": round(rouge_result["rougeLsum"].mid.fmeasure, 4),
    "BERTScore-F1": round(sum(bertscore_result["f1"]) / len(bertscore_result["f1"]), 4),
    "BLEU": round(bleu_result["bleu"], 4)
}

# Display results as a table
results_df = pd.DataFrame([results])
print("Evaluation Metrics:\n", results_df)

Checking directory contents at: /content/drive/MyDrive/llama3b-prefix-soap
Directory exists. Contents:
- README.md
- adapter_model.safetensors
- adapter_config.json
- tokenizer_config.json
- special_tokens_map.json
- tokenizer.json




Base model and tokenizer loaded successfully.
PEFT model loaded successfully.


100%|██████████| 250/250 [20:00<00:00,  4.80s/it]


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation Metrics:
    ROUGE-1  ROUGE-2  ROUGE-Lsum  BERTScore-F1   BLEU
0   0.4804   0.2451      0.4304         0.868  0.129
