In [1]:
import pandas as pd
from transformers import T5ForConditionalGeneration, T5Tokenizer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import time
import nltk
import torch

In [2]:
torch.cuda.empty_cache()

In [4]:
model_name = "t5-small" 
model = T5ForConditionalGeneration.from_pretrained(model_name).to('cuda')
tokenizer = T5Tokenizer.from_pretrained(model_name)

In [5]:
og_med_data = pd.read_json("hf://datasets/ncbi/Open-Patients/Open-Patients.jsonl", lines=True)
og_med_data.head()

Unnamed: 0,_id,description
0,trec-cds-2014-1,A 58-year-old African-American woman presents ...
1,trec-cds-2014-2,An 8-year-old male presents in March to the ER...
2,trec-cds-2014-3,A 58-year-old nonsmoker white female with mild...
3,trec-cds-2014-4,A 2-year-old boy is brought to the emergency d...
4,trec-cds-2014-5,A 56-year-old female on 20th day post-left mas...


In [7]:
og_med_data_subset = og_med_data.iloc[0:30000,:]

In [8]:
def generate_t5_summary_batch(texts, batch_size=4, max_input_length=512, max_output_length=150):
    summaries = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        inputs = tokenizer(batch, truncation=True, padding=True, max_length=max_input_length, return_tensors="pt").to("cuda")
        outputs = model.generate(inputs["input_ids"], max_length=max_output_length, num_beams=4, early_stopping=True)
        decoded_batch = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        summaries.extend(decoded_batch)
    return summaries

In [9]:
def tokenize_texts(texts):
    return [" ".join(nltk.word_tokenize(text.lower())) for text in texts]

In [10]:
descriptions = og_med_data_subset["description"].tolist()

In [11]:
def calculate_sentence_bleu(reference, generated):
    smoothing_function = SmoothingFunction().method1
    scores = [
        sentence_bleu([nltk.word_tokenize(ref)], nltk.word_tokenize(gen), smoothing_function=smoothing_function)
        for ref, gen in zip(reference, generated)
    ]
    return sum(scores) / len(scores)

In [12]:
start_time = time.time()
t5_summaries = generate_t5_summary_batch(descriptions, batch_size=2)
t5_time = time.time() - start_time

In [13]:
reference_tokenized = tokenize_texts(descriptions)
generated_tokenized = tokenize_texts(t5_summaries)

In [14]:
sentence_bleu_score = calculate_sentence_bleu(reference_tokenized, generated_tokenized)

In [15]:
print(f"t5 Summary Generation Metrics:")
print(f"  Time taken: {t5_time:.2f} seconds")
print(f"  Sentence-Level BLEU Score: {sentence_bleu_score:.4f}")

t5 Summary Generation Metrics:
  Time taken: 432.14 seconds
  Sentence-Level BLEU Score: 0.0493


In [19]:
avg_rouge1, avg_rouge2, avg_rougeL = og_med_data_subset[['rouge1', 'rouge2', 'rougeL']].mean()

In [20]:
print(f"Average Test Rouge1 Score: {avg_rouge1:.4f}")
print(f"Average Test Rouge2 Score: {avg_rouge2:.4f}")
print(f"Average Test RougeL Score: {avg_rougeL:.4f}")

Average Test Rouge1 Score: 0.1468
Average Test Rouge2 Score: 0.0465
Average Test RougeL Score: 0.1131


In [16]:
for ref, gen in zip(reference_tokenized[:5], generated_tokenized[:5]):
    print("\n---")
    print(f"Reference: {ref}")
    print(f"Generated: {gen}")


---
Reference: a 58-year-old african-american woman presents to the er with episodic pressing/burning anterior chest pain that began two days earlier for the first time in her life . the pain started while she was walking , radiates to the back , and is accompanied by nausea , diaphoresis and mild dyspnea , but is not increased on inspiration . the latest episode of pain ended half an hour prior to her arrival . she is known to have hypertension and obesity . she denies smoking , diabetes , hypercholesterolemia , or a family history of heart disease . she currently takes no medications . physical examination is normal . the ekg shows nonspecific changes .
Generated: african-american woman , 58 , presents to the er with chest pain that began two days earlier . the pain began while she was walking , radiates to the back , and is accompanied by nausea , diaphoresis and mild dyspnea .

---
Reference: an 8-year-old male presents in march to the er with fever up to 39 c , dyspnea and cough 