In [11]:
# Install necessary dependencies
!pip uninstall -y fsspec gcsfs
!pip install fsspec==2024.9.0 datasets==3.1.0
!pip install -q transformers datasets torch

import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from datasets import load_dataset

# Load the dataset
dataset = load_dataset("Bilal-Mamji/Medical-summary")

# Simplified SOAP instructions
SOAP_format_instruct = """
You are a medical assistant generating SOAP notes. Based on the following dialogue, create a SOAP note in this format:

Example:
Dialogue:
Doctor: Hello, how can I help you today?
Patient: I have been feeling very tired and have a persistent headache.

SOAP Note:
S (Subjective): The patient reports feeling very tired and having a persistent headache.
O (Objective): Not provided in the dialogue.
A (Assessment): Likely diagnosis of fatigue and chronic headache, pending further testing.
P (Plan): Recommend further diagnostic tests, including blood work and imaging, and suggest over-the-counter pain relief in the meantime.

Dialogue:
<Insert the patient's dialogue here>
SOAP Note:
"""


# Load pre-trained distilgpt2 model and tokenizer
model_name = "distilgpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Update preprocess function
def preprocess_data(example):
    dialogue = example['input']  # Use the correct column name for dialogue
    input_text = SOAP_format_instruct + dialogue  # Use the correct instruction variable
    return {"input_text": input_text, "target_summary": example['output']}  # Correct column for target summary

# Check dataset structure
print(dataset['train'].column_names)

# Preprocess dataset
dataset = dataset.map(preprocess_data, remove_columns=dataset['train'].column_names)

# Test distilgpt2 on a few examples
for i, example in enumerate(dataset['train']):
    if i >= 3:  # Process only the first 3 examples
        break

    # Tokenize the input (truncate if necessary)
    input_ids = tokenizer.encode(example['input_text'], return_tensors="pt", max_length=512, truncation=True)

    # Generate the SOAP summary
    outputs = model.generate(
        input_ids,
        max_new_tokens=200,  # Generate up to 200 new tokens
        pad_token_id=tokenizer.eos_token_id,  # Avoid warnings by setting pad token
        num_beams=5,
        no_repeat_ngram_size=3,
        early_stopping=True,
        temperature=0.7
    )

    # Decode and print the generated summary
    generated_summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"Dialogue:\n{example['input_text']}\n")
    print(f"Generated SOAP Summary:\n{generated_summary}\n")
    print(f"Ground Truth Summary:\n{example['target_summary']}\n")
    print("-----------")



Found existing installation: fsspec 2024.9.0
Uninstalling fsspec-2024.9.0:
  Successfully uninstalled fsspec-2024.9.0
[0mCollecting fsspec==2024.9.0
  Using cached fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Using cached fsspec-2024.9.0-py3-none-any.whl (179 kB)
Installing collected packages: fsspec
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 1.27.0 requires gcsfs>=2023.3.0, which is not installed.[0m[31m
[0mSuccessfully installed fsspec-2024.9.0
['input', 'output', 'instruction']


Map:   0%|          | 0/9250 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

Dialogue:

You are a medical assistant generating SOAP notes. Based on the following dialogue, create a SOAP note in this format:

Example:
Dialogue:
Doctor: Hello, how can I help you today?
Patient: I have been feeling very tired and have a persistent headache.

SOAP Note:
S (Subjective): The patient reports feeling very tired and having a persistent headache.
O (Objective): Not provided in the dialogue.
A (Assessment): Likely diagnosis of fatigue and chronic headache, pending further testing.
P (Plan): Recommend further diagnostic tests, including blood work and imaging, and suggest over-the-counter pain relief in the meantime.

Dialogue:
<Insert the patient's dialogue here>
SOAP Note:
Doctor: Hello, how can I help you today?
Patient: My son has been having some issues with speech and development. He's 13 years old now.
Doctor: I see. Can you tell me more about his symptoms? Does he have any issues with muscle tone or hypotonia?
Patient: No, he doesn't have hypotonia. But he has mild