In [None]:
from transformers import pipeline
import pandas as pd

# Load the transcription dataset; the path is resolved relative to the
# notebook's current working directory.
df = pd.read_csv("../data/medical_transcription.csv")

# Build the summarization pipeline once at module level so the functions
# below can reuse the already-loaded model.
# framework="pt" explicitly requests the PyTorch backend rather than
# leaving the backend choice to the library.
summarizer = pipeline(
    task="summarization",
    model="facebook/bart-large-cnn",
    framework="pt",
)


In [None]:
def summarize_report(text):
    """Summarize a report, chunking it first when it is long.

    Texts shorter than 400 whitespace-separated words are summarized in a
    single pass. Longer texts are split with ``chunk_text``; each chunk is
    summarized on its own and the partial summaries are joined with spaces.
    If the joined text is still longer than 150 words it is summarized once
    more to produce the final result.

    Args:
        text: The report text. Non-string values (for example the NaN that
            pandas yields for a missing cell) and blank strings are treated
            as "nothing to summarize".

    Returns:
        The summary string. ``""`` is returned when there is no input text
        or when no chunk of a long text could be summarized.

    Raises:
        Exception: Errors raised by ``summarizer`` on the short-text path
            are propagated unchanged. Failures on individual chunks or on
            the second pass are logged and handled best-effort instead.
    """
    import logging

    log = logging.getLogger(__name__)

    # Guard against missing cells / blank text: without this a NaN crashes
    # on ``.split()`` and an empty string makes the model generate text
    # from nothing because of the ``min_length`` constraint.
    if not isinstance(text, str) or not text.strip():
        return ""

    def _summarize(passage, max_length, min_length):
        # truncation=True keeps over-long inputs within the model's input
        # limit instead of failing inside the model.
        result = summarizer(
            passage,
            max_length=max_length,
            min_length=min_length,
            do_sample=False,
            truncation=True,
        )
        return result[0]["summary_text"]

    # Short text: summarize directly.
    if len(text.split()) < 400:
        return _summarize(text, 120, 50)

    # Long text: chunk + summarize. A failing chunk is skipped so one bad
    # chunk does not discard the rest, but the failure is logged rather
    # than silently swallowed.
    chunk_summaries = []
    for index, chunk in enumerate(chunk_text(text)):
        try:
            chunk_summaries.append(_summarize(chunk, 120, 40))
        except Exception:
            log.warning(
                "Skipping chunk %d: summarization failed", index, exc_info=True
            )

    if not chunk_summaries:
        log.warning("No chunk could be summarized; returning an empty summary")
        return ""

    combined_summary = " ".join(chunk_summaries)

    # The joined chunk summaries are already short enough to return as-is.
    if len(combined_summary.split()) <= 150:
        return combined_summary

    # Second pass to condense the joined summaries; fall back to the joined
    # text if it fails so the work already done is not lost.
    try:
        return _summarize(combined_summary, 150, 60)
    except Exception:
        log.warning(
            "Second-pass summarization failed; returning joined chunk summaries",
            exc_info=True,
        )
        return combined_summary


In [None]:
# Notice appended to a generated summary before it is displayed, making
# clear that the model output is not clinical guidance.
DISCLAIMER = " ".join(
    (
        "This summary is for informational purposes only",
        "and does not constitute medical advice or diagnosis.",
    )
)


In [None]:
# Demo: summarize the first transcription in the dataset.
text = df["transcription"].iloc[0]

# Separate the summary from the disclaimer with a blank line.
final_output = "\n\n".join([summarize_report(text), DISCLAIMER])
print(final_output)
