# Generating Radiology Report Impression with Large Language Model on AWS
### Fine-Tuning State-of-the-Art LLMs (Flan-T5 XL) to generate impressions from findings in radiology reports

This notebook demonstrates a strategy for fine-tuning third-party pretrained large language models (LLMs) for the task of radiology report summarization using AWS services. LLMs have demonstrated remarkable capabilities in natural language understanding and generation, serving as foundation models that can be adapted to various domains and tasks. We fine-tune the Flan-T5 XL model for the summarization task on 91,544 free-text radiology reports obtained from the MIMIC-CXR dataset. We also evaluate the pretrained model out of the box for comparison.



In [None]:
!pip install nest-asyncio==1.5.5 --quiet
!pip install ipywidgets==8.0.4 --quiet
!pip install sagemaker==2.148.0 --quiet

In [None]:
import boto3
import sagemaker

# Resolve the AWS region and IAM role from the current session. The S3 bucket
# is a fixed, hard-coded name (not the SageMaker default bucket); later cells
# build the training-data and model-output locations from it.
aws_region = boto3.Session().region_name
aws_role = sagemaker.session.Session().get_caller_identity_arn()
output_bucket = 'llm-radiology-bucket'

# Newline plus ANSI escape codes for bold on/off, reused by print statements
# throughout the notebook
newline, bold, unbold = "\n", "\033[1m", "\033[0m"

print(f"{bold}aws_region:{unbold} {aws_region}")
print(f"{bold}aws_role:{unbold} {aws_role}")
print(f"{bold}output_bucket:{unbold} {output_bucket}")

In [None]:
import IPython
from ipywidgets import Dropdown
from sagemaker.jumpstart.filters import And
from sagemaker.jumpstart.notebook_utils import list_jumpstart_models

# Default model choice; also used as the dropdown's initial value
model_id = "huggingface-text2text-flan-t5-xl"

# Identify FLAN T5 models that support fine-tuning: filter the JumpStart
# catalog by task/framework/training support, then keep IDs containing "flan-t5"
filter_value = And("task == text2text", "framework == huggingface", "training_supported == true")
model_list = [m for m in list_jumpstart_models(filter=filter_value) if "flan-t5" in m]

# Display the model IDs in a dropdown, for user to select
# NOTE(review): the initial `value` presumably has to be one of `options`;
# confirm the default model ID is always present in model_list.
dropdown = Dropdown(
    value=model_id,
    options=model_list,
    description="FLAN T5 models available for fine-tuning:",
    style={"description_width": "initial"},
    layout={"width": "max-content"},
)
# `display` is not imported in this cell; it is assumed to be the built-in
# provided by the notebook environment.
display(IPython.display.Markdown("### Select a pre-trained model from the dropdown below"))
display(dropdown)

In [None]:
from sagemaker.instance_types import retrieve_default

# Take the model selected in the dropdown; "*" is passed as the version
# (presumably meaning "latest available" — confirm against the SDK docs)
model_id, model_version = dropdown.value, "*"

# Instance types for training and inference (the defaults for this model)
training_instance_type = retrieve_default(
    model_id=model_id, model_version=model_version, scope="training"
)
inference_instance_type = retrieve_default(
    model_id=model_id, model_version=model_version, scope="inference"
)

print(f"{bold}model_id:{unbold} {model_id}")
print(f"{bold}training_instance_type:{unbold} {training_instance_type}")
print(f"{bold}inference_instance_type:{unbold} {inference_instance_type}")

### Prepare Training Data in JSONL Format

In [None]:
import pandas as pd
# Training split, read from a local JSON file in the working directory
train_df = pd.read_json('train.json')

# dev1 is the evaluation dataset from the MIMIC CXR dataset
dev1 = pd.read_json('dev.json')

# dev2 is the evaluation dataset from Indiana University
dev2 = pd.read_json('indiana_dev.json')

In [None]:
# Keep only the findings/impression columns and rename them to
# prompt/completion (presumably the field names the fine-tuning script
# expects — confirm against the JumpStart training script)
train_df = train_df[['findings', 'impression']].rename(columns={'findings': 'prompt', 'impression': 'completion'})

In [None]:
# Convert to JSON Lines format: one JSON record per line
train_df.to_json('train.jsonl', orient='records', lines=True)

In [None]:
# Local JSON Lines file written by the previous cell
local_data_file = "train.jsonl" 

from sagemaker.s3 import S3Uploader

# Upload the file under the train_data/ prefix of the output bucket; this S3
# prefix is later passed to the estimator as the "training" channel
train_data_location = f"s3://{output_bucket}/train_data"
S3Uploader.upload(local_data_file, train_data_location)
print(f"{bold}training data:{unbold} {train_data_location}")

In [None]:
from sagemaker import image_uris, model_uris, script_uris

# Training instance will use this container image
train_image_uri = image_uris.retrieve(
    region=aws_region,
    framework=None,  # automatically inferred from model_id
    model_id=model_id,
    model_version=model_version,
    image_scope="training",
    instance_type=training_instance_type,
)

# Pre-trained model artifacts used as the starting point for fine-tuning
train_model_uri = model_uris.retrieve(
    model_id=model_id, model_version=model_version, model_scope="training"
)

# Script to execute on the training instance
train_script_uri = script_uris.retrieve(
    model_id=model_id, model_version=model_version, script_scope="training"
)

# S3 prefix for training output; the fine-tuned model URI is derived from it
# in the deployment cell
output_location = f"s3://{output_bucket}/demo-llm-rad-fine-tune-flan-t5/"

print(f"{bold}image uri:{unbold} {train_image_uri}")
print(f"{bold}model uri:{unbold} {train_model_uri}")
print(f"{bold}script uri:{unbold} {train_script_uri}")
print(f"{bold}output location:{unbold} {output_location}")

In [None]:
from sagemaker import hyperparameters

# Retrieve the default hyper-parameters for fine-tuning the model.
# Note: this rebinds the name `hyperparameters` from the imported module to
# the returned value, so the module is only reachable again after re-running
# the import at the top of this cell.
hyperparameters = hyperparameters.retrieve_default(model_id=model_id, model_version=model_version)

# We will override some default hyperparameters with custom values
# (the value is given as a string)
hyperparameters["epochs"] = "3"
print(hyperparameters)

# Note that the maximum output length is set to 128 tokens by default.
# The targets in your data (i.e., ground truth responses) will be truncated to this size.
# You can override this behavior, e.g.,
# hyperparameters["max_output_length"] = "256"

In [None]:
from sagemaker.estimator import Estimator
from sagemaker.utils import name_from_base

# Drop the first two dash-separated tokens of the model ID
# (e.g. "huggingface-text2text-flan-t5-xl" -> "flan-t5-xl")
model_name = "-".join(model_id.split("-")[2:])  # get the most informative part of ID
training_job_name = name_from_base(f"js-demo-{model_name}-{hyperparameters['epochs']}")
print(f"{bold}job name:{unbold} {training_job_name}")

# Regexes used to extract metrics from the training log lines
training_metric_definitions = [
    {"Name": "val_loss", "Regex": "'eval_loss': ([0-9\\.]+)"},
    {"Name": "train_loss", "Regex": "'loss': ([0-9\\.]+)"},
    {"Name": "epoch", "Regex": "'epoch': ([0-9\\.]+)"},
]

# Create SageMaker Estimator instance
sm_estimator = Estimator(
    role=aws_role,
    image_uri=train_image_uri,
    model_uri=train_model_uri,
    source_dir=train_script_uri,
    entry_point="transfer_learning.py",
    instance_count=1,
    instance_type=training_instance_type,
    volume_size=300,
    max_run=360000,
    hyperparameters=hyperparameters,
    output_path=output_location,
    metric_definitions=training_metric_definitions,
)

# Launch a SageMaker training job over data located in the given S3 path.
# Training jobs can take hours. With wait=True (as set here) this call is
# expected to block until the job finishes; to keep the notebook responsive,
# set wait=False and monitor job status through the SageMaker console.
# The later deployment cells need the finished job's model artifact.
sm_estimator.fit({"training": train_data_location}, job_name=training_job_name, wait=True)

In [None]:
from sagemaker import TrainingJobAnalytics

# Wait for a couple of minutes for the job to start before running this cell
# This can be called while the job is still running
# Fetch the training job's metrics into a DataFrame
df = TrainingJobAnalytics(training_job_name=training_job_name).dataframe()

In [None]:
# Preview the first rows of the training-metrics DataFrame
df.head()

## Deploy Inference Endpoints for Both Original Pretrained and Finetuned Models

In [None]:
from sagemaker import image_uris

# Retrieve the inference docker image URI. This is the base HuggingFace container image.
# The same image is used for both the pre-trained and the fine-tuned endpoint.
deploy_image_uri = image_uris.retrieve(
    region=aws_region,
    framework=None,  # automatically inferred from model_id
    model_id=model_id,
    model_version=model_version,
    image_scope="inference",
    instance_type=inference_instance_type,
)

In [None]:
from sagemaker import model_uris, script_uris
from sagemaker.model import Model
from sagemaker.predictor import Predictor
from sagemaker.utils import name_from_base

# Retrieve the URI of the pre-trained model
pre_trained_model_uri = model_uris.retrieve(
    model_id=model_id, model_version=model_version, model_scope="inference"
)

# Container environment for the large models: a single model-server worker.
# Also reused by the fine-tuned model cell.
large_model_env = {"SAGEMAKER_MODEL_SERVER_WORKERS": "1", "TS_DEFAULT_WORKERS_PER_MODEL": "1"}

# One generated name is used for both the SageMaker model and its endpoint
pre_trained_name = name_from_base(f"jumpstart-demo-pre-trained-{model_id}")

# Create the SageMaker model instance of the pre-trained model
if ("small" in model_id) or ("base" in model_id):
    # Small/base variants need the inference script supplied explicitly
    deploy_source_uri = script_uris.retrieve(
        model_id=model_id, model_version=model_version, script_scope="inference"
    )
    pre_trained_model = Model(
        image_uri=deploy_image_uri,
        source_dir=deploy_source_uri,
        entry_point="inference.py",
        model_data=pre_trained_model_uri,
        role=aws_role,
        predictor_cls=Predictor,
        name=pre_trained_name,
    )
else:
    # For those large models, we already repack the inference script and model
    # artifacts for you, so the `source_dir` argument to Model is not required.
    pre_trained_model = Model(
        image_uri=deploy_image_uri,
        model_data=pre_trained_model_uri,
        role=aws_role,
        predictor_cls=Predictor,
        name=pre_trained_name,
        env=large_model_env,
    )

print(f"{bold}image URI:{unbold}{newline} {deploy_image_uri}")
print(f"{bold}model URI:{unbold}{newline} {pre_trained_model_uri}")
print("Deploying an endpoint ...")

# Deploy the pre-trained model. Note that we need to pass Predictor class when we deploy model
# through Model class, for being able to run inference through the SageMaker API
pre_trained_predictor = pre_trained_model.deploy(
    initial_instance_count=1,
    instance_type=inference_instance_type,
    predictor_cls=Predictor,
    endpoint_name=pre_trained_name,
)
print(f"{newline}Deployed an endpoint {pre_trained_name}")

In [None]:
from sagemaker.model import Model
from sagemaker.predictor import Predictor
from sagemaker.utils import name_from_base

# One generated name is used for both the SageMaker model and its endpoint
fine_tuned_name = name_from_base(f"jumpstart-demo-fine-tuned-{model_id}")
# Model artifact path built from the training output prefix and job name
# NOTE(review): assumes the training job has completed and wrote its artifact
# to <output_location><job name>/output/model.tar.gz — confirm.
fine_tuned_model_uri = f"{output_location}{training_job_name}/output/model.tar.gz"

# Create the SageMaker model instance of the fine-tuned model.
# `large_model_env` is defined in the pre-trained deployment cell; unlike that
# cell, no small/base branch with an explicit inference script is used here.
fine_tuned_model = Model(
    image_uri=deploy_image_uri,
    model_data=fine_tuned_model_uri,
    role=aws_role,
    predictor_cls=Predictor,
    name=fine_tuned_name,
    env=large_model_env,
)

print(f"{bold}image URI:{unbold}{newline} {deploy_image_uri}")
print(f"{bold}model URI:{unbold}{newline} {fine_tuned_model_uri}")
print("Deploying an endpoint ...")

# Deploy the fine-tuned model.
fine_tuned_predictor = fine_tuned_model.deploy(
    initial_instance_count=1,
    instance_type=inference_instance_type,
    predictor_cls=Predictor,
    endpoint_name=fine_tuned_name,
)
print(f"{newline}Deployed an endpoint {fine_tuned_name}")

### Preprocess Evaluation Dataset and Run inference queries
As the name suggests, a Text2Text model such as FLAN T5 receives a piece of text as input and generates text as output. The input text contains the description of the task. In this demo, the task is to generate impressions from the findings section of a radiology report. The impressions must be relevant to the findings, but the findings themselves should not already contain the answer.

In [None]:
# Preview the first rows of the dev1 evaluation set
dev1.head()

In [None]:
# Prompt template; "{context}" is replaced with the findings text at query time
prompt = "Generate radiology report impressions based on the following findings. Findings: {context}"

# Input sample paragraphs from dev1 evaluation set (MIMIC CXR findings).
# The two entries below are empty placeholders: paste sample findings text
# into them before running the comparison cell.

test_paragraphs_dev1 = [
    """
    """,
    """
    """
    ]

In [None]:
import boto3
import json

# Parameters of (output) text generation. A great introduction to generation
# parameters can be found at https://huggingface.co/blog/how-to-generate
# These are merged into every request payload sent to the endpoints.
parameters = {
    "max_length": 80,  # restrict the length of the generated text
    "num_return_sequences": 1,  # return a single sequence per prompt
    "num_beams": 10,  # use beam search
}


# Helper functions for running inference queries
def query_endpoint_with_json_payload(payload, endpoint_name):
    """Invoke a SageMaker endpoint with a JSON request body.

    Args:
        payload: JSON-serializable object to send as the request body.
        endpoint_name: Name of the endpoint to invoke.

    Returns:
        The raw response returned by ``invoke_endpoint``.
    """
    # Serialize first so an unserializable payload fails before any client is created
    body = json.dumps(payload).encode("utf-8")
    runtime = boto3.client("runtime.sagemaker")
    return runtime.invoke_endpoint(
        EndpointName=endpoint_name,
        ContentType="application/json",
        Body=body,
    )


def parse_response_multiple_texts(query_response):
    """Extract the generated texts from an endpoint response.

    Args:
        query_response: Mapping whose "Body" entry is a readable stream
            containing a JSON document with a "generated_texts" key.

    Returns:
        The value stored under "generated_texts" in the decoded body.

    Raises:
        KeyError: If "Body" or "generated_texts" is missing.
    """
    # Read from the function argument rather than from a global variable
    model_predictions = json.loads(query_response["Body"].read())
    return model_predictions['generated_texts']


def generate_impressions(endpoint_name, text):
    """Query an endpoint with the findings `text` and print every generated response.

    The module-level `prompt` template and generation `parameters` are used
    to build the request. Nothing is returned.
    """
    request_body = {"text_inputs": prompt.replace("{context}", text), **parameters}
    raw_response = query_endpoint_with_json_payload(request_body, endpoint_name=endpoint_name)
    for idx, candidate in enumerate(parse_response_multiple_texts(raw_response)):
        print(f"Response {idx}: {candidate}{newline}")

In [None]:
# Show the prompt template, then for each dev1 sample paragraph print the
# paragraph followed by the responses of the pre-trained and fine-tuned endpoints
print(f"{bold}Prompt:{unbold} {repr(prompt)}")
for paragraph in test_paragraphs_dev1:
    print("-" * 80)
    print(paragraph)
    print("-" * 80)
    print(f"{bold}pre-trained{unbold}")
    generate_impressions(pre_trained_name, paragraph)
    print(f"{bold}fine-tuned{unbold}")
    generate_impressions(fine_tuned_name, paragraph)

In [None]:
# Prompt template (same text as in the dev1 cell); "{context}" is replaced
# with the findings text at query time
prompt = "Generate radiology report impressions based on the following findings. Findings: {context}"

# Sources: Indiana University Radiology Findings
# "____" appears verbatim in the sample text.
test_paragraphs_dev2 = [
    """
    The heart is normal in size and contour. There is no mediastinal widening. Low lung volumes. No focal airspace disease. No large pleural effusion or pneumothorax. The ____ are intact.
    """,
    """
    The cardiomediastinal silhouette is within normal limits for appearance. No focal areas of pulmonary consolidation. No pneumothorax. No pleural effusion. The thoracic spine appears intact. No acute, displaced rib fractures.
    """,
    """
    The cardiac and mediastinal contours are within normal limits. There are calcifications of the aortic ____. The lungs are hyperinflated with increased retrosternal airspace and flattening of hemidiaphragms. There is haziness in the right lung apex. There is a 1.7 cm nodular density in the medial right lung base seen on the frontal view, not identified on the lateral view. This may represent a vessel on end. There is no consolidation, pneumothorax, or effusion. There are mild degenerative changes of the spine.
    """

    ]

In [None]:
# Show the prompt template, then for each dev2 sample paragraph print the
# paragraph followed by the responses of the pre-trained and fine-tuned endpoints
print(f"{bold}Prompt:{unbold} {repr(prompt)}")
for paragraph in test_paragraphs_dev2:
    print("-" * 80)
    print(paragraph)
    print("-" * 80)
    print(f"{bold}pre-trained{unbold}")
    generate_impressions(pre_trained_name, paragraph)
    print(f"{bold}fine-tuned{unbold}")
    generate_impressions(fine_tuned_name, paragraph)

## Model Evaluation and ROUGE Score Computation

Compute ROUGE scores for both the pretrained and the fine-tuned deployed models on the dev1 and dev2 evaluation sets.

In [None]:
def generate_impressions(endpoint_name, text):
    """Query an endpoint with the findings `text` and return the first generated text.

    This redefinition replaces the earlier printing version for the
    evaluation cells. Returns None when the endpoint yields no generated texts.
    """
    request_body = {"text_inputs": prompt.replace("{context}", text), **parameters}
    raw_response = query_endpoint_with_json_payload(request_body, endpoint_name=endpoint_name)
    candidates = parse_response_multiple_texts(raw_response)
    # First element if any, otherwise None
    return next(iter(candidates), None)

In [None]:
# Findings texts of the dev1 set, as a plain list in row order
dev1_sentences = dev1["findings"].to_list()

In [None]:
# Generate impressions for dev1 using the pre-trained and fine-tuned models.
# Each finding is sent to both endpoints; predictions are collected in row order.
pred_dev1_pretrained = []
pred_dev1_finetuned = []
for paragraph in dev1_sentences:
    dev1_pretrained = generate_impressions(pre_trained_name, paragraph)
    dev1_finetuned = generate_impressions(fine_tuned_name, paragraph)
    pred_dev1_pretrained.append(dev1_pretrained)
    pred_dev1_finetuned.append(dev1_finetuned)

# Build the DataFrames once, after the loop, instead of rebuilding them on
# every iteration; they are also defined when dev1_sentences is empty.
pred_dev1_pretrained_df = pd.DataFrame(pred_dev1_pretrained)
pred_dev1_finetuned_df = pd.DataFrame(pred_dev1_finetuned)

In [None]:
# Attach the predictions to dev1 as new columns.
# NOTE(review): the values are single-column DataFrames, so pandas presumably
# aligns them on index; this assumes dev1 has a default 0..n-1 index — confirm.
dev1["impression_pretrained"] = pred_dev1_pretrained_df
dev1["impression_finetuned"] = pred_dev1_finetuned_df

In [None]:
# Inspect the last rows to check that the prediction columns were filled
dev1.tail()

In [None]:
#compute dev1 rouge score for finetuned and pretrained models

import evaluate
from rouge_score import rouge_scorer, scoring
#from transformers import AutoTokenizer, BartTokenizer


# Load the ROUGE metric once; the later evaluation cells reuse `rouge_score`.
rouge_score = evaluate.load("rouge")

# Aggregated ROUGE of the pre-trained model's impressions against the
# reference impressions of dev1.
result_pretrained_dev1 = rouge_score.compute(
    predictions=list(dev1["impression_pretrained"]),
    references=list(dev1["impression"]),
    use_aggregator=True,
)
print("ROUGE Score for Pretrained Flan-T5 XL model on Dev1 Set:")
print(result_pretrained_dev1)

# Same computation for the fine-tuned model.
results_finetuned_dev1 = rouge_score.compute(
    predictions=list(dev1["impression_finetuned"]),
    references=list(dev1["impression"]),
    use_aggregator=True,
)
print("ROUGE Score for FineTuned Flan-T5 XL model on Dev1 Set:")
print(results_finetuned_dev1)

In [None]:
# `plt` is used below; import it here so the cell runs on its own.
import matplotlib.pyplot as plt

# Per-example (non-aggregated) ROUGE scores of the pre-trained model on dev1,
# shown as one box per ROUGE variant.
results_pretrained_dev1_all = rouge_score.compute(predictions=list(dev1["impression_pretrained"]),references=list(dev1["impression"]), use_aggregator=False)
results_pretrained_dev1_all_df = pd.DataFrame(results_pretrained_dev1_all)
results_pretrained_dev1_all_df.plot(kind='box')
plt.show()

In [None]:
# Summary statistics of the per-example ROUGE scores (pre-trained, dev1)
results_pretrained_dev1_all_df.describe()

In [None]:
# `plt` is used below; import it here so the cell runs on its own.
import matplotlib.pyplot as plt

# Per-example (non-aggregated) ROUGE scores of the fine-tuned model on dev1,
# shown as one box per ROUGE variant. The DataFrame is later joined onto dev1.
results_finetuned_dev1_all = rouge_score.compute(predictions=list(dev1["impression_finetuned"]),references=list(dev1["impression"]), use_aggregator=False)
results_finetuned_dev1_all_df = pd.DataFrame(results_finetuned_dev1_all)
results_finetuned_dev1_all_df.plot(kind='box')
plt.show()

In [None]:
# Summary statistics of the per-example ROUGE scores (fine-tuned, dev1)
results_finetuned_dev1_all_df.describe()

In [None]:
# Findings texts of the dev2 set, as a plain list in row order
dev2_sentences = dev2["findings"].to_list()

In [None]:
# Generate impressions for dev2 using the pre-trained and fine-tuned models.
# Each finding is sent to both endpoints; predictions are collected in row order.
pred_dev2_pretrained = []
pred_dev2_finetuned = []
for paragraph in dev2_sentences:
    dev2_pretrained = generate_impressions(pre_trained_name, paragraph)
    dev2_finetuned = generate_impressions(fine_tuned_name, paragraph)
    pred_dev2_pretrained.append(dev2_pretrained)
    pred_dev2_finetuned.append(dev2_finetuned)

# Build the DataFrames once, after the loop, instead of rebuilding them on
# every iteration; they are also defined when dev2_sentences is empty.
pred_dev2_pretrained_df = pd.DataFrame(pred_dev2_pretrained)
pred_dev2_finetuned_df = pd.DataFrame(pred_dev2_finetuned)

In [None]:
# Attach the predictions to dev2 as new columns.
# NOTE(review): the values are single-column DataFrames, so pandas presumably
# aligns them on index; this assumes dev2 has a default 0..n-1 index — confirm.
dev2["impression_pretrained"] = pred_dev2_pretrained_df
dev2["impression_finetuned"] = pred_dev2_finetuned_df

In [None]:
# Compute dev2 ROUGE scores for the fine-tuned and pre-trained models.

# Aggregated ROUGE of the pre-trained model's impressions against the
# reference impressions of dev2.
result_pretrained_dev2 = rouge_score.compute(
    predictions=list(dev2["impression_pretrained"]),
    references=list(dev2["impression"]),
    use_aggregator=True,
)
print("ROUGE Score for Pretrained Flan-T5 XL model on Dev2 (Indiana Uni) Set:")
print(result_pretrained_dev2)

# Same computation for the fine-tuned model.
results_finetuned_dev2 = rouge_score.compute(
    predictions=list(dev2["impression_finetuned"]),
    references=list(dev2["impression"]),
    use_aggregator=True,
)
print("ROUGE Score for FineTuned Flan-T5 XL model Dev2 (Indiana Uni) Set:")
print(results_finetuned_dev2)

In [None]:
# Per-example (non-aggregated) ROUGE scores of the pre-trained model on dev2
result_pretrained_dev2_all = rouge_score.compute(predictions=list(dev2["impression_pretrained"]), references=list(dev2["impression"]), use_aggregator=False)

In [None]:
# Wrap the per-example scores in a DataFrame for plotting and summary statistics
result_pretrained_dev2_all_df = pd.DataFrame(result_pretrained_dev2_all)

In [None]:
# `plt` is used below; import it here so the cell runs on its own.
import matplotlib.pyplot as plt

# Box plot of the per-example ROUGE scores (pre-trained, dev2)
result_pretrained_dev2_all_df.plot(kind='box')
plt.show()

In [None]:
# `plt` is used below; import it here so the cell runs on its own.
import matplotlib.pyplot as plt

# Per-example (non-aggregated) ROUGE scores of the fine-tuned model on dev2,
# shown as one box per ROUGE variant. The DataFrame is later joined onto dev2.
results_finetuned_dev2_all = rouge_score.compute(predictions=list(dev2["impression_finetuned"]),references=list(dev2["impression"]), use_aggregator=False)
results_finetuned_dev2_all_df = pd.DataFrame(results_finetuned_dev2_all)
results_finetuned_dev2_all_df.plot(kind='box')
plt.show()

In [None]:
# Summary statistics of the per-example ROUGE scores (pre-trained, dev2)
result_pretrained_dev2_all_df.describe()

In [None]:
# Summary statistics of the per-example ROUGE scores (fine-tuned, dev2)
results_finetuned_dev2_all_df.describe()

In [None]:
# Add the fine-tuned model's per-example ROUGE scores to dev1 as columns
# (joined on index); only the fine-tuned scores are kept for export
dev1 = dev1.join(results_finetuned_dev1_all_df)

In [None]:
# Add the fine-tuned model's per-example ROUGE scores to dev2 as columns
# (joined on index); only the fine-tuned scores are kept for export
dev2 = dev2.join(results_finetuned_dev2_all_df)

In [None]:
# Tag every row with its originating dataset so the two sets remain
# distinguishable after they are concatenated
dev1['data source'] = "MIMIC CXR"
dev2['data source'] = "Indiana Uni"

In [None]:
# Stack both evaluation sets into one frame. This rebinds `df`, which earlier
# held the training-job metrics. Original row indices are kept, so index
# values can repeat across the two sets.
df = pd.concat([dev1, dev2])

In [None]:
# Select the columns to export and rename the fine-tuned predictions column.
# reset_index() is called without drop=True, so the previous per-dataset index
# is presumably kept as an extra "index" column — confirm this is intended.
df_radiology_pred = df[['findings', 'background', 'impression_finetuned', 'data source', 'rouge1', 'rouge2', 'rougeL']].rename(columns={'impression_finetuned': 'generated impressions'}).reset_index()

In [None]:
import boto3

# Write the combined predictions to a local JSON file
file_name = "radiology_pred.json"
df_radiology_pred.to_json(file_name)

# Instantiate the S3 resource and upload the file. The bucket is taken from
# `output_bucket` (defined in the setup cell) so that all notebook artifacts
# go to one configurable bucket instead of a duplicated literal.
s3 = boto3.resource('s3')
s3.meta.client.upload_file(file_name, output_bucket, 'prediction_data/radiology_pred.json')

### Delete Resources

In [None]:
# Delete resources: remove the SageMaker model and endpoint of both the
# pre-trained and the fine-tuned deployment so they stop incurring cost.
# NOTE(review): the model is deleted before its endpoint in each pair;
# presumably delete_model() looks the model up through the endpoint, so keep
# this order — confirm against the SDK docs.
pre_trained_predictor.delete_model()
pre_trained_predictor.delete_endpoint()
fine_tuned_predictor.delete_model()
fine_tuned_predictor.delete_endpoint()