In [4]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [5]:
import torch
print("Visible:", torch.cuda.device_count())

Visible: 1


In [6]:
import numpy as np 
import pandas as pd

In [7]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

model_id = "microsoft/phi-2"  

# Load model and tokenizer from Hugging Face Hub
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map={"":0},
    torch_dtype=torch.float16 
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Input dialogue
dialogue = """
The patient is a 45-year-old male who came to the emergency department with complaints of chest pain and shortness of breath. There is no significant medical history.
Upon examination, the patient is in moderate distress, with bilateral diffuse rales on his chest. His vital signs are BP 110/70 mmHg, HR 95 bpm, and temperature 98.6°F. His oxygen saturation is 95% on room air, and his initial ECG shows nonspecific ST-T changes. A chest x-ray is performed, which reveals a right subpleural infiltrate.
"""

# Prompt
prompt = f"""You are a medical assistant.

Given the following dialogue:

{dialogue}

Write ONLY the SOAP note, starting immediately with:
S: 
O: 
A: 
P: 
No explanations, no answers, no comments, no headings.
"""

# Tokenize
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# Generate
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=400,
        temperature=0.3,
        top_p=0.95,
        do_sample=False,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
    )

# Decode
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Postprocessing
s_indices = [i for i in range(len(generated_text)) if generated_text.startswith("S:", i)]
if len(s_indices) >= 2:
    real_start = s_indices[1]
    generated_text = generated_text[real_start:]
elif len(s_indices) == 1:
    real_start = s_indices[0]
    generated_text = generated_text[real_start:]
generated_soap = generated_text.strip()

# Final output
print(generated_soap)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


S: Chest pain and shortness of breath. Moderate distress, bilateral diffuse rales on chest, initial ECG shows nonspecific ST-T changes.
O: BP 110/70 mmHg, HR 95 bpm, temperature 98


In [12]:
from datasets import load_dataset

dataset = load_dataset("omi-health/medical-dialogue-to-soap-summary")

# print(dataset['test'].column_names)
# Extract the ground truth SOAP notes from the test set
test_data = dataset['test']['soap'][:100]

In [14]:
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from datasets import load_dataset

# Load dataset and convert to pandas DataFrame
dataset = load_dataset("omi-health/medical-dialogue-to-soap-summary")
test_data = dataset['test'].to_pandas()

# Load model and tokenizer
model_path = "microsoft/phi-2"
model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16, device_map={"": 0})
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Take only the first dialogue
first_dialogue = test_data.iloc[0]['dialogue']

# Prepare the prompt
prompt = f"""You are a medical assistant.

Given the following dialogue:

{first_dialogue}

Write ONLY the SOAP note, starting immediately with:
S: 
O: 
A: 
P: 
No explanations, no answers, no comments, no headings.
"""

# Tokenize the prompt
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# Generate the SOAP report
with torch.no_grad():
    output = model.generate(**inputs, max_new_tokens=500)

# Decode the output
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

# Extract only the SOAP report
start_index = generated_text.find("S:")
final_soap_report = generated_text[start_index:].strip() if start_index != -1 else generated_text.strip()

# Save into CSV with exactly two columns
df_save = pd.DataFrame({
    'dialogue': [first_dialogue],
    'generated_soap_report': [final_soap_report]
})

df_save.to_csv("lora-finetunig-results.csv", index=False)

print("done")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


done


In [15]:
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Load model and tokenizer
model_path = "microsoft/phi-2"
model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16, device_map={"":0})
tokenizer = AutoTokenizer.from_pretrained(model_path)


# Function to generate clean SOAP notes
def generate_soap(dialogue):
    # Very clear prompt with strict formatting instructions
    prompt = f"""You are a medical assistant. Given this doctor-patient dialogue, generate ONLY a SOAP note in this exact format:

S: [Subjective findings - patient's complaints and history]
O: [Objective findings - exam results, vitals, test results]
A: [Assessment - diagnosis/differential]
P: [Plan - treatment recommendations]

Dialogue:
{dialogue}

SOAP Note:
S: """
    
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=500,
            temperature=0.3,  # Lower temperature for more focused output
            do_sample=True,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
        )
    
    full_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    # Extract just the SOAP note part
    start_idx = full_text.find("S: ")
    if start_idx == -1:
        return "S: \nO: \nA: \nP: "  # Fallback if format not found
    
    soap_note = full_text[start_idx:]
    
    # Ensure all sections are present
    for section in ["O: ", "A: ", "P: "]:
        if section not in soap_note:
            soap_note += f"\n{section}"
    
    return soap_note

# 4. Process all samples and save to CSV
results = []
for idx, row in test_data.iterrows():
    dialogue = row['dialogue']
    soap_note = generate_soap(dialogue)
    results.append({'dialogue': dialogue, 'generated_soap': soap_note})
    
# 5. Create DataFrame and save
df_results = pd.DataFrame(results)
df_results.to_csv("lora-finetuned-results-new.csv", index=False)

print("SOAP notes generated and saved successfully!")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

SOAP notes generated and saved successfully!


In [16]:
import pandas as pd
from rouge_score import rouge_scorer
from bert_score import score

# Load your data (first 100 rows)
df = pd.read_csv("lora-finetuned-results-new.csv").head(100)

# 2. Create temporary reference notes (for demonstration only)
# WARNING: These are auto-generated so scores won't be meaningful
print("Creating temporary reference notes for demonstration...")
df["reference_soap"] = df["dialogue"].apply(lambda x: 
    "S: Patient reports symptoms\nO: Examination findings\nA: Assessment\nP: Plan")

# 3. Clean data
df = df.dropna(subset=["generated_soap", "reference_soap"])

# 4. Calculate ROUGE scores
print("\nCalculating ROUGE scores...")
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeLsum'], use_stemmer=True)

rouge_results = []
for _, row in df.iterrows():
    scores = scorer.score(row["reference_soap"], row["generated_soap"])
    rouge_results.append({
        'rouge1': scores['rouge1'].fmeasure,
        'rouge2': scores['rouge2'].fmeasure,
        'rougeLsum': scores['rougeLsum'].fmeasure
    })

rouge_df = pd.DataFrame(rouge_results)
df = pd.concat([df, rouge_df], axis=1)

# 5. Calculate BERTScore
print("Calculating BERTScore (this may take a few minutes)...")
_, _, bert_f1 = score(df["generated_soap"].tolist(), 
                     df["reference_soap"].tolist(), 
                     lang="en")
df["bert_f1"] = bert_f1

# 6. Show results
print("\nEvaluation Results (First 100 Reports):")
print(f"ROUGE-1 Average: {df['rouge1'].mean():.3f}")
print(f"ROUGE-2 Average: {df['rouge2'].mean():.3f}")
print(f"ROUGE-L Average: {df['rougeLsum'].mean():.3f}")
print(f"BERTScore-F1 Average: {df['bert_f1'].mean():.3f}")

Creating temporary reference notes for demonstration...

Calculating ROUGE scores...
Calculating BERTScore (this may take a few minutes)...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Evaluation Results (First 100 Reports):
ROUGE-1 Average: 0.026
ROUGE-2 Average: 0.008
ROUGE-L Average: 0.026
BERTScore-F1 Average: 0.816
