In [1]:
!pip install git+https://github.com/huggingface/transformers.git@88d960937c81a32bfb63356a2e8ecf7999619681 gradio

Collecting git+https://github.com/huggingface/transformers.git@88d960937c81a32bfb63356a2e8ecf7999619681
  Cloning https://github.com/huggingface/transformers.git (to revision 88d960937c81a32bfb63356a2e8ecf7999619681) to /tmp/pip-req-build-l10qam5f
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers.git /tmp/pip-req-build-l10qam5f
  Running command git rev-parse -q --verify 'sha^88d960937c81a32bfb63356a2e8ecf7999619681'
  Running command git fetch -q https://github.com/huggingface/transformers.git 88d960937c81a32bfb63356a2e8ecf7999619681
  Running command git checkout -q 88d960937c81a32bfb63356a2e8ecf7999619681
  Resolved https://github.com/huggingface/transformers.git to commit 88d960937c81a32bfb63356a2e8ecf7999619681
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting tokenizers<0.21,>=0.20 (from transformers==

In [6]:
from transformers import AutoModelForCausalLM, AutoProcessor
from pathlib import Path
import torch

In [None]:
from huggingface_hub import notebook_login

notebook_login()

In [8]:
model = AutoModelForCausalLM.from_pretrained("microsoft/maira-2", trust_remote_code=True)
processor = AutoProcessor.from_pretrained("microsoft/maira-2", trust_remote_code=True)

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

In [3]:
import requests
from PIL import Image

def get_sample_data() -> dict[str, Image.Image | str]:
    """
    Download chest X-rays from IU-Xray, which we didn't train MAIRA-2 on. License is CC.
    We modified this function from the Rad-DINO repository on Huggingface.
    """
    frontal_image_url = "https://openi.nlm.nih.gov/imgs/512/145/145/CXR145_IM-0290-1001.png"
    lateral_image_url = "https://openi.nlm.nih.gov/imgs/512/145/145/CXR145_IM-0290-2001.png"

    def download_and_open(url: str) -> Image.Image:
        response = requests.get(url, headers={"User-Agent": "MAIRA-2"}, stream=True)
        return Image.open(response.raw)

    frontal_image = download_and_open(frontal_image_url)
    lateral_image = download_and_open(lateral_image_url)

    sample_data = {
        "frontal": frontal_image,
        "lateral": lateral_image,
        "indication": "Dyspnea.",
        "comparison": "None.",
        "technique": "PA and lateral views of the chest.",
        "phrase": "Pleural effusion."  # For the phrase grounding example. This patient has pleural effusion.
    }
    return sample_data

sample_data = get_sample_data()

In [4]:
sample_data

{'frontal': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=512x624>,
 'lateral': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=512x624>,
 'indication': 'Dyspnea.',
 'comparison': 'None.',
 'technique': 'PA and lateral views of the chest.',
 'phrase': 'Pleural effusion.'}

In [9]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

Maira2ForConditionalGeneration(
  (vision_tower): Dinov2Backbone(
    (embeddings): Dinov2Embeddings(
      (patch_embeddings): Dinov2PatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(14, 14), stride=(14, 14))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): Dinov2Encoder(
      (layer): ModuleList(
        (0-11): 12 x Dinov2Layer(
          (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
          (attention): Dinov2SdpaAttention(
            (attention): Dinov2SdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): Dinov2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inp

In [10]:
processed_inputs = processor.format_and_preprocess_reporting_input(
    current_frontal=sample_data["frontal"],
    current_lateral=sample_data["lateral"],
    prior_frontal=None,  # Our example has no prior
    indication=sample_data["indication"],
    technique=sample_data["technique"],
    comparison=sample_data["comparison"],
    prior_report=None,  # Our example has no prior
    return_tensors="pt",
    get_grounding=False,  # For this example we generate a non-grounded report
)

processed_inputs = processed_inputs.to(device)
with torch.no_grad():
    output_decoding = model.generate(
        **processed_inputs,
        max_new_tokens=450,  # Set to 450 for grounded reporting
        use_cache=True,
    )
prompt_length = processed_inputs["input_ids"].shape[-1]
decoded_text = processor.decode(output_decoding[0][prompt_length:], skip_special_tokens=True)
decoded_text = decoded_text.lstrip()  # Findings generation completions have a single leading space
prediction = processor.convert_output_to_plaintext_or_grounded_sequence(decoded_text)
print("Parsed prediction:", prediction)

Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


Parsed prediction: There is a large right pleural effusion with associated right basilar atelectasis. The left lung is clear. No pneumothorax is identified. The cardiomediastinal silhouette and hilar contours are normal. There is no free air under the diaphragm. Surgical clips are noted in the right upper quadrant of the abdomen.


## Gradio App

In [11]:
def download_image(url: str) -> Image.Image:
    """
    Download the image from the given URL and return as a PIL Image.
    """
    response = requests.get(url, headers={"User-Agent": "MAIRA-2"}, stream=True)
    return Image.open(response.raw)

In [12]:
def generate_findings(
    frontal_url: str,
    lateral_url: str,
    indication: str,
    comparison: str,
    technique: str
):
    """
    1. Download the frontal & lateral images from the provided URLs.
    2. Format & preprocess the input for the model using `processor`.
    3. Generate the findings from the model.
    4. Return the two images and the generated findings text.
    """
    # 1. Download images
    frontal_image = download_image(frontal_url)
    lateral_image = download_image(lateral_url)

    # 2. Prepare inputs for the model
    processed_inputs = processor.format_and_preprocess_reporting_input(
        current_frontal=frontal_image,
        current_lateral=lateral_image,
        prior_frontal=None,  # Example doesn't use prior images
        indication=indication,
        technique=technique,
        comparison=comparison,
        prior_report=None,   # Example doesn't use prior reports
        return_tensors="pt",
        get_grounding=False, # For a non-grounded report
    )
    processed_inputs = processed_inputs.to(model.device)

    # 3. Generate the findings
    with torch.no_grad():
        output_decoding = model.generate(
            **processed_inputs,
            max_new_tokens=450,
            use_cache=True,
        )

    # Skip the prompt portion for a cleaner result
    prompt_length = processed_inputs["input_ids"].shape[-1]
    decoded_text = processor.decode(output_decoding[0][prompt_length:], skip_special_tokens=True)
    decoded_text = decoded_text.lstrip()

    # Convert the model output into plain text
    prediction = processor.convert_output_to_plaintext_or_grounded_sequence(decoded_text)

    # Return:
    # - frontal/lateral images so they can be displayed in Gradio
    # - the generated findings
    return frontal_image, lateral_image, prediction

In [13]:
import gradio as gr

In [14]:
app_name = "MAIRA-2 CXR Report Generator"
app_description = """
Enter URLs for the frontal and lateral chest X-ray images and relevant metadata.
Click "Generate Findings" to see the automatic radiology report findings.
"""

with gr.Blocks(title=app_name) as demo:
    gr.Markdown(f"## {app_name}")
    gr.Markdown(app_description)

    with gr.Row():
        frontal_url = gr.Textbox(
            label="Frontal Image URL",
            value="https://openi.nlm.nih.gov/imgs/512/145/145/CXR145_IM-0290-1001.png"
        )
        lateral_url = gr.Textbox(
            label="Lateral Image URL",
            value="https://openi.nlm.nih.gov/imgs/512/145/145/CXR145_IM-0290-2001.png"
        )

    indication = gr.Textbox(label="Indication", value="Dyspnea.")
    comparison = gr.Textbox(label="Comparison", value="None.")
    technique = gr.Textbox(label="Technique", value="PA and lateral views of the chest.")

    generate_button = gr.Button("Generate Findings")

    with gr.Row():
        frontal_image_out = gr.Image(label="Frontal Image")
        lateral_image_out = gr.Image(label="Lateral Image")
    result_text_out = gr.Textbox(label="Generated Findings", lines=6)

    generate_button.click(
        fn=generate_findings,
        inputs=[frontal_url, lateral_url, indication, comparison, technique],
        outputs=[frontal_image_out, lateral_image_out, result_text_out]
    )

In [15]:
if __name__ == "__main__":
    demo.launch()

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://bf26da9c02bd11815e.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
