In [2]:
pip install transformers peft datasets accelerate bitsandbytes

Note: you may need to restart the kernel to use updated packages.


In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [2]:
import torch
print("Visible:", torch.cuda.device_count())

Visible: 1


In [3]:
print("Using device:", torch.cuda.current_device())
print("Device name:", torch.cuda.get_device_name(torch.cuda.current_device()))

Using device: 0
Device name: NVIDIA RTX A6000


In [4]:
from transformers import AutoModelForCausalLM, AutoTokenizer

In [5]:
from peft import get_peft_model, PrefixTuningConfig, TaskType

In [7]:
model_name = "Qwen/Qwen2-7B"

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    load_in_8bit=True,  # optional: saves memory
    device_map={"":0},
    trust_remote_code=True
)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [8]:
from peft import get_peft_model, PrefixTuningConfig

peft_config = PrefixTuningConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    num_virtual_tokens=20,  # Tune as needed
)

model = get_peft_model(model, peft_config)

In [12]:
import json

def load_json_file(path):
    try:
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        print(f"File not found: {path}")
    except json.JSONDecodeError as e:
        print(f"JSON decode error in {path}: {e}")
    return None

total_data = load_json_file("discharge_summaries.json")

In [13]:
from sklearn.model_selection import train_test_split

In [15]:
from datasets import load_dataset

#Split: 90% train, 5% validation, 5% test
train_data, temp_data = train_test_split(total_data, test_size=0.10, random_state=42)
val_data, test_data = train_test_split(temp_data, test_size=0.50, random_state=42)

# Save each split to a seperate JSON file
with open("train.json", "w", encoding="utf-8") as file:
    json.dump(train_data, file, indent=2, ensure_ascii=False)
with open("validation.json", "w", encoding="utf-8") as file:
    json.dump(val_data, file, indent=2, ensure_ascii=False)
with open("test.json", "w", encoding="utf-8") as file:
    json.dump(test_data, file, indent=2, ensure_ascii=False)

print("Dataset split and saved as train.json, validation.json, and test.json.")

Dataset split and saved as train.json, validation.json, and test.json.


In [16]:
from datasets import load_dataset, DatasetDict, Dataset
import json

# Load JSON files into HuggingFace Datasets
def load_json_to_dataset(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    return Dataset.from_list(data)

# Load each split
train_dataset = load_json_to_dataset("train.json")
val_dataset = load_json_to_dataset("validation.json")
test_dataset = load_json_to_dataset("test.json")

# Combine into a DatasetDict
dataset = DatasetDict({
    "train": train_dataset,
    "validation": val_dataset,
    "test": test_dataset,
})

# Tokenization function
def tokenize_function(example):
    inputs = tokenizer(
        example["summary"],
        padding="max_length",
        truncation=True,
        max_length=512,
    )
    inputs["labels"] = inputs["input_ids"].copy()
    return inputs

# Apply tokenization to all splits
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset["train"].column_names  # removes original keys like "patient_id", etc.
)

Map:   0%|          | 0/1800 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [None]:
pip install --upgrade transformers

In [21]:
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    logging_dir="./logs",
    logging_steps=10,
    save_strategy="epoch",
    fp16=True,              # Keep only if on compatible GPU
    report_to=None          # Set to None instead of "none" if older version
)

# Use a data collator suited for Causal LM (doesn't mask inputs like MLM)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Define Trainer with PEFT model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Start training
trainer.train()

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
10,8.1753
20,8.1597
30,8.0169
40,7.9506
50,7.7403
60,7.6475
70,7.418
80,7.4531
90,7.2736
100,7.1766


TrainOutput(global_step=2700, training_loss=4.009802700325295, metrics={'train_runtime': 2275.0256, 'train_samples_per_second': 2.374, 'train_steps_per_second': 1.187, 'total_flos': 1.172930867232768e+17, 'train_loss': 4.009802700325295, 'epoch': 3.0})

In [22]:
model.save_pretrained("qwen2-prefix-tuned-7B-final")
tokenizer.save_pretrained("qwen2-prefix-tuned-7B-final")

('qwen2-prefix-tuned-7B-final/tokenizer_config.json',
 'qwen2-prefix-tuned-7B-final/special_tokens_map.json',
 'qwen2-prefix-tuned-7B-final/chat_template.jinja',
 'qwen2-prefix-tuned-7B-final/vocab.json',
 'qwen2-prefix-tuned-7B-final/merges.txt',
 'qwen2-prefix-tuned-7B-final/added_tokens.json',
 'qwen2-prefix-tuned-7B-final/tokenizer.json')

In [24]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel, PeftConfig

# === Step 1: Load PEFT config ===
peft_config = PeftConfig.from_pretrained("qwen2-prefix-tuned-7B-final")

# === Step 2: Load base model (Qwen2) ===
base_model = AutoModelForCausalLM.from_pretrained(
    peft_config.base_model_name_or_path,
    device_map="auto",
    trust_remote_code=True,
)

# === Step 3: Load PEFT-tuned model ===
model = PeftModel.from_pretrained(base_model, "qwen2-prefix-tuned-7B-final")

# === Step 4: Load tokenizer ===
tokenizer = AutoTokenizer.from_pretrained("qwen2-prefix-tuned-7B-final", trust_remote_code=True)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.


In [1]:
import json

def load_json_file(path):
    try:
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        print(f"File not found: {path}")
    except json.JSONDecodeError as e:
        print(f"JSON decode error in {path}: {e}")
    return None

test_data = load_json_file("test.json")

In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel, PeftConfig
from tqdm import tqdm

# === Load PEFT config and model ===
peft_config = PeftConfig.from_pretrained("qwen2-prefix-tuned-7B-final")
base_model = AutoModelForCausalLM.from_pretrained(
    peft_config.base_model_name_or_path,
    device_map={"":0},
    trust_remote_code=True
)
model = PeftModel.from_pretrained(base_model, "qwen2-prefix-tuned-7B-final")

# === Load tokenizer ===
tokenizer = AutoTokenizer.from_pretrained("qwen2-prefix-tuned-7B-final", trust_remote_code=True)

def format_structured_input(entry):
    return (
        f"Generate a discharge summary for the following patient:\n"
        f"Patient ID: {entry['patient_id']}\n"
        f"Age: {entry['age']}\n"
        f"Gender: {entry['gender']}\n"
        f"Admission Date: {entry['admission_date']}\n"
        f"Discharge Date: {entry['discharge_date']}\n"
        f"Primary Diagnosis: {entry['primary_diagnosis']}\n"
        f"Procedures: {', '.join(entry['procedures'])}\n"
        f"Medications: {', '.join(entry['medications'])}\n"
        f"Follow-up Instructions: {entry['follow_up_instructions']}\n"
    )

processed_data = []
for sample in test_data:
    processed_data.append({
        "input": format_structured_input(sample),
        "summary": sample["summary"]
    })

# def tokenize_function(example):
#     inputs = tokenizer(
#         example["input"],
#         padding="max_length",
#         truncation=True,
#         max_length=512,
#     )
#     targets = tokenizer(
#         example["summary"],
#         padding="max_length",
#         truncation=True,
#         max_length=512,
#     )
#     inputs["labels"] = targets["input_ids"]
#     return inputs

def generate_summary(structured_input):
    inputs = tokenizer(structured_input, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=150,
            do_sample=True,
            top_p=0.9,
            temperature=0.8,
            repetition_penalty=1.1,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.eos_token_id
        )
    return tokenizer.decode(outputs[0], skip_special_tokens=True).strip()

# Prompt
example_input = format_structured_input({
    "patient_id": "001",
    "age": 82,
    "gender": "Female",
    "admission_date": "2025-02-10",
    "discharge_date": "2025-02-17",
    "primary_diagnosis": "Kidney Failure",
    "procedures": ["Hemodialysis", "Renal Function Panel"],
    "medications": ["Erythropoietin", "Calcium Acetate"],
    "follow_up_instructions": "Follow up in 1 week with primary physician. Adhere to prescribed medications."
})

generated_summary = generate_summary(example_input)
print("\n--- Generated Summary ---\n")
print(generated_summary)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]




--- Generated Summary ---

Generate a discharge summary for the following patient:
Patient ID: 001
Age: 82
Gender: Female
Admission Date: 2025-02-10
Discharge Date: 2025-02-17
Primary Diagnosis: Kidney Failure
Procedures: Hemodialysis, Renal Function Panel
Medications: Erythropoietin, Calcium Acetate
Follow-up Instructions: Follow up in 1 week with primary physician. Adhere to prescribed medications.
Mathematically RMsharper, 10 months
Verified Patient, and please between 025. Youarent. If you are anow. But what you ahad completelystalledisconnected Youre widelyavailable. The
Question Time: Moref than that, please. O’Fogerty.
Perotl. This information is not substitute connected to what Presley. The following
Thely. Information. Patient, you are on the Jobath. Show your
relax. That Said, Exercise, and. The
that’swhiledischarged or
followtheand. BeConnected. Disconnected. Youarengel;Schifferically. 69%, Report
Patient Transport, the


In [4]:
import evaluate

predictions = []
references = []

for item in test_data:
    input_text = format_structured_input(item)
    pred = generate_summary(input_text)
    predictions.append(pred)
    references.append(item["summary"])

rouge = evaluate.load("rouge")
results = rouge.compute(predictions=predictions, references=references)
print(results)

{'rouge1': np.float64(0.311275431844502), 'rouge2': np.float64(0.2195560047623867), 'rougeL': np.float64(0.28660513805312204), 'rougeLsum': np.float64(0.30765777579765174)}
