In [1]:
pip install transformers datasets peft bitsandbytes torch accelerate rouge_score bert_score

Note: you may need to restart the kernel to use updated packages.


In [4]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [3]:
import torch
print("Visible:", torch.cuda.device_count())

Visible: 1


In [5]:
print("Using device:", torch.cuda.current_device())
print("Device name:", torch.cuda.get_device_name(torch.cuda.current_device()))

Using device: 0
Device name: NVIDIA RTX A6000


In [8]:
# Install Adapter-transformers
!pip install -q transformers datasets accelerate bitsandbytes peft

In [9]:
from peft import get_peft_model, LoraConfig, TaskType

In [10]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, BitsAndBytesConfig
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
from rouge_score import rouge_scorer
from bert_score import score
import re
import os
import pandas as pd
from transformers import pipeline
from rouge_score import rouge_scorer
from bert_score import score as bert_score
import numpy as np

In [11]:
# Load and preprocess the dataset
# Load the full dataset
dataset = load_dataset("omi-health/medical-dialogue-to-soap-summary")
print("Original Training Split Size:", len(dataset["train"]))
print("Original Validation Split Size:", len(dataset["validation"]))
print("Original Test Split Size:", len(dataset["test"]))

# Preprocessing function
def preprocess_dialogue(example):
    dialogue = example["dialogue"]
    soap = example["soap"]

    # Clean text
    dialogue = re.sub(r'[^A-Za-z0-9\s.,:?-]', '', dialogue).lower()
    soap = re.sub(r'[^A-Za-z0-9\s.,:?-]', '', soap).lower()

    # Add role tags
    dialogue = dialogue.replace("Doctor:", "[Doctor]:")
    dialogue = dialogue.replace("Patient:", "[Patient]:")

    return {"dialogue": dialogue, "soap": soap}

# Apply preprocessing
processed_dataset = dataset.map(preprocess_dialogue)

Original Training Split Size: 9250
Original Validation Split Size: 500
Original Test Split Size: 250


In [12]:
# Tokenize the dataset
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
tokenizer.pad_token = tokenizer.eos_token

# Tokenization function with labels
def tokenize_function(examples):
    # Combine dialogue and soap notes
    texts = []
    for dialogue, soap in zip(examples["dialogue"], examples["soap"]):
        # Convert to strings if they are lists
        if isinstance(dialogue, list):
            dialogue = " ".join(dialogue)
        if isinstance(soap, list):
            soap = " ".join(soap)
        texts.append(f"{dialogue} {soap}")

    # Tokenize the batch
    tokenized = tokenizer(
        texts,
        truncation=True,
        padding="max_length",
        max_length=512,
        return_tensors="pt"
    )

    # Add labels
    tokenized["labels"] = tokenized["input_ids"].clone()

    return tokenized

# Apply tokenization with batched processing
tokenized_dataset = processed_dataset.map(
    tokenize_function,
    batched=True,
    batch_size=1000,  # Adjust batch size as needed
    remove_columns=["dialogue", "soap"]  # Remove original columns
)

# Set format for PyTorch
tokenized_dataset.set_format("torch")

# Split into train and eval datasets
train_dataset = tokenized_dataset["train"]
eval_dataset = tokenized_dataset["validation"]

# Debug: Verify 'labels' key
print("Sample from train_dataset:", train_dataset[0].keys())
print("Sample 'labels' shape:", train_dataset[0]["labels"].shape)
print("Sample from eval_dataset:", eval_dataset[0].keys())

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Map:   0%|          | 0/9250 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

Sample from train_dataset: dict_keys(['prompt', 'messages', 'messages_nosystem', 'input_ids', 'attention_mask', 'labels'])
Sample 'labels' shape: torch.Size([512])
Sample from eval_dataset: dict_keys(['prompt', 'messages', 'messages_nosystem', 'input_ids', 'attention_mask', 'labels'])


In [13]:
# Load the model
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float16
)

# Apply LoRA Adapters (PEFT)
lora_config = LoraConfig(
    r=8,                           # Rank: How small the adapter is
    lora_alpha=32,                 # Scaling factor
    target_modules=["q_proj", "v_proj"],  # Adapter inside attention layers
    lora_dropout=0.05,              # Dropout for regularization
    bias="none",
    task_type=TaskType.CAUSAL_LM    # Because we are doing language modeling
)

model = get_peft_model(model, lora_config)

model.print_trainable_parameters()

trainable params: 1,126,400 || all params: 1,101,174,784 || trainable%: 0.1023


In [14]:
# Clear GPU memory
torch.cuda.empty_cache()

In [15]:
# Fine tune the model
# Set Up TrainingArguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=10,
    learning_rate=1e-4,
    warmup_steps=100,
    weight_decay=0.01,
    load_best_model_at_end=True,
    report_to="none",
    save_total_limit=2,
    fp16=True
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer
)

# Train the model (only the adapter will be updated)
trainer.train()

# Save the LoRA Adapter
model.save_pretrained("fine_tuned_adapter_peft")
tokenizer.save_pretrained("fine_tuned_adapter_peft")

print("Fine-tuning complete! Only Adapter was trained and saved.")

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss
1,1.5194,1.500445
2,1.487,1.480144
3,1.4767,1.473941


Fine-tuning complete! Only Adapter was trained and saved.


In [16]:
# Clear memory after training
del trainer, model
torch.cuda.empty_cache()

In [17]:
# Load the model for evaluation
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load model and adapter
base_model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
adapter_path = "fine_tuned_adapter_peft"

# Load base model
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    device_map="auto",
    torch_dtype=torch.float16
)

# Attach adapter
model = PeftModel.from_pretrained(base_model, adapter_path)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
tokenizer.pad_token = tokenizer.eos_token

# Set model to evaluation mode
model.eval()

# Load the test split
dataset = load_dataset("omi-health/medical-dialogue-to-soap-summary", split="test")

In [18]:
# Generate predictions for evaluation
# Load the original dataset (not the tokenized one)
original_dataset = load_dataset("omi-health/medical-dialogue-to-soap-summary", split="validation")

# Take 100 samples (same seed for reproducibility)
sampled_dialogues = original_dataset.shuffle(seed=42).select(range(100))

# Define the generation function (updated to handle the dataset format)
def generate_soap(dialogue, max_new_tokens=300):
    # Create a prompt that matches your training format
    prompt = f"Summarize this medical dialogue into a SOAP note:\n\n{dialogue}\n\nSOAP Note:"

    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            repetition_penalty=1.2,
            pad_token_id=tokenizer.eos_token_id
        )

    # Extract just the generated part (after the prompt)
    full_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    generated_part = full_text[len(prompt):].strip()  # Remove the prompt
    return generated_part

# Generate and save results
import pandas as pd
from tqdm import tqdm 

results = []
for idx, sample in enumerate(tqdm(sampled_dialogues, desc="Generating SOAP notes")):
    try:
        dialogue = sample["dialogue"]
        soap = generate_soap(dialogue)

        results.append({
            "id": idx,
            "dialogue": dialogue,
            "generated_soap": soap
        })
    except Exception as e:
        print(f"Error processing sample {idx}: {str(e)}")
        continue

# Convert to DataFrame and save
df_results = pd.DataFrame(results)
df_results.to_csv("adaptive-learning-results.csv", index=False)

print(f"Successfully generated {len(results)} SOAP notes!")
print(f"Saved to: adaptive-learning-results.csv")

Generating SOAP notes: 100%|██████████████| 100/100 [12:55<00:00,  7.76s/it]

Successfully generated 100 SOAP notes!
Saved to: adaptive-learning-results.csv





In [19]:
from rouge_score import rouge_scorer
from bert_score import score as bert_score

# Extract ground-truth and predictions
ground_truths = [sample["soap"] for sample in sampled_dialogues]
predictions = [item["generated_soap"] for item in results]
# Initialize scorer
rouge = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)

# Store results
rouge1_scores = []
rouge2_scores = []
rougeL_scores = []

for ref, pred in zip(ground_truths, predictions):
    scores = rouge.score(ref, pred)
    rouge1_scores.append(scores["rouge1"].fmeasure)
    rouge2_scores.append(scores["rouge2"].fmeasure)
    rougeL_scores.append(scores["rougeL"].fmeasure)

# Calculate average
print(f"ROUGE-1 Average: {sum(rouge1_scores)/len(rouge1_scores):.4f}")
print(f"ROUGE-2 Average: {sum(rouge2_scores)/len(rouge2_scores):.4f}")
print(f"ROUGE-L Average: {sum(rougeL_scores)/len(rougeL_scores):.4f}")
# Use BERTScore
P, R, F1 = bert_score(predictions, ground_truths, lang="en", verbose=True)

# Print average
print(f"BERTScore F1 Average: {F1.mean().item():.4f}")

ROUGE-1 Average: 0.0764
ROUGE-2 Average: 0.0105
ROUGE-L Average: 0.0467


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/3 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/2 [00:00<?, ?it/s]

done in 4.34 seconds, 23.06 sentences/sec
BERTScore F1 Average: 0.5497


