In [1]:
import torch
from transformers import PegasusTokenizer, PegasusForConditionalGeneration, Trainer, TrainingArguments, DataCollatorForSeq2Seq
from datasets import Dataset
from nltk.translate.bleu_score import sentence_bleu
import pandas as pd
from sklearn.model_selection import train_test_split
import nltk
import os
os.environ["WANDB_DISABLED"] = "true"


In [2]:
df = pd.read_csv('Open-Patients-With-Summaries.csv')

In [3]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [4]:
torch.cuda.empty_cache()

In [5]:
model_name = 'google/pegasus-large'
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name).to('cuda')

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
def preprocess_function(examples):
    inputs = tokenizer(
        examples['original_text'], padding="max_length", truncation=True, max_length=256, return_tensors="pt"
    )
    labels = tokenizer(
        examples['summary'], padding="max_length", truncation=True, max_length=128, return_tensors="pt"
    )

    labels["input_ids"] = torch.tensor([
        [(label if label != tokenizer.pad_token_id else -100) for label in seq]
        for seq in labels["input_ids"]
    ])

    inputs = {key: value.to('cuda') for key, value in inputs.items()}
    inputs["labels"] = labels["input_ids"].to('cuda')  # Move labels to GPU
    return inputs

In [7]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [8]:
train_dataset = Dataset.from_pandas(train_df)
eval_dataset = Dataset.from_pandas(test_df)

In [9]:
train_dataset = train_dataset.map(preprocess_function, batched=True)
eval_dataset = eval_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/2400 [00:00<?, ? examples/s]

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

In [10]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, padding=True)

In [11]:

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    warmup_steps=500,
    weight_decay=0.01,
    save_total_limit=2,
    save_strategy="epoch",
    evaluation_strategy="epoch",
    logging_dir='./logs',
    fp16=True,
    report_to=None)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator)

trainer.train()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss
1,1.1793,0.881459


Non-default generation parameters: {'max_length': 256, 'num_beams': 8, 'length_penalty': 0.8, 'forced_eos_token_id': 1}
Non-default generation parameters: {'max_length': 256, 'num_beams': 8, 'length_penalty': 0.8, 'forced_eos_token_id': 1}


TrainOutput(global_step=600, training_loss=1.1536206309000652, metrics={'train_runtime': 2836.7908, 'train_samples_per_second': 0.846, 'train_steps_per_second': 0.212, 'total_flos': 1733678648524800.0, 'train_loss': 1.1536206309000652, 'epoch': 1.0})

In [12]:
save_path = './pegasus-finetuned'
trainer.save_model(save_path)
tokenizer.save_pretrained(save_path)

Non-default generation parameters: {'max_length': 256, 'num_beams': 8, 'length_penalty': 0.8, 'forced_eos_token_id': 1}


('./pegasus-finetuned\\tokenizer_config.json',
 './pegasus-finetuned\\special_tokens_map.json',
 './pegasus-finetuned\\spiece.model',
 './pegasus-finetuned\\added_tokens.json')

In [19]:
def calculate_metrics(row):
    inputs = tokenizer(
        row['original_text'], return_tensors="pt", padding="max_length", truncation=True, max_length=256
    ).input_ids.to(device)
    
    target = tokenizer(
        row['summary'], return_tensors="pt", padding="max_length", truncation=True, max_length=128
    ).input_ids.to(device)

    generated_ids = model.generate(inputs, max_length=128, num_beams=5, early_stopping=True)
    predicted_summary = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

    bleu = sentence_bleu([row['summary'].split()], predicted_summary.split())
    
    return bleu, predicted_summary

In [20]:
test_metrics = test_df.apply(calculate_metrics, axis=1, result_type='expand')
test_metrics.columns = ['bleu_score', 'predicted_summary']

In [30]:
test_df = pd.concat([test_df, test_metrics], axis=1)

In [31]:
duplicate_columns = test_df.columns.duplicated(keep='last') 

test_df = test_df.loc[:, duplicate_columns | ~test_df.columns.duplicated(keep='first')]

print(test_df.columns)

Index(['case_id', 'original_text', 'summary', 'avg_relevance', 'avg_diversity',
       'avg_mmr', 'generated_summary', 'similarity_score', 'bleu_score',
       'rouge1', 'rouge2', 'rougeL', 'predicted_summary'],
      dtype='object')


In [32]:
avg_bleu = test_df['bleu_score'].mean()
print(f"Average Test BLEU Score: {avg_bleu:.4f}")

Average Test BLEU Score: 0.0861


In [33]:
print(test_df['bleu_score'].head())  
print(test_df['bleu_score'].dtype)  

1801    0.046268
1190    0.129621
1817    0.087839
251     0.083179
2505    0.136461
Name: bleu_score, dtype: float64
float64


In [26]:
output_path = './Test-Predicted-With-Metrics-Pegasus.csv'
test_df.to_csv(output_path, index=False)

In [27]:
print(f"Test results saved to {output_path}")

Test results saved to ./Test-Predicted-With-Metrics-Pegasus.csv


In [35]:
df_results = pd.read_csv('Test-Predicted-With-Metrics-Pegasus.csv')

In [36]:
df_results.head()

Unnamed: 0,case_id,original_text,summary,avg_relevance,avg_diversity,avg_mmr,generated_summary,similarity_score,bleu_score,rouge1,rouge2,rougeL,bleu_score_predicted,predicted_summary
0,pmc-6076118-1,A 36-year-old man who denied previous systemic...,A 36-year-old man who denied previous systemic...,0.927933,0.788866,0.481566,A 36-year-old man who denied previous systemic...,0.746881,0.046268,0.409524,0.339713,0.357143,0.078022,A 36-year-old man who denied previous systemic...
1,pmc-6032496-1,A 65-year-old woman presented with a three-mon...,A 65-year-old woman presented with a three-mon...,0.840577,0.755277,0.425655,A 65-year-old woman presented with a three-mon...,0.872979,0.129621,0.504,0.41129,0.424,0.031381,A 65-year-old woman presented with a three-mon...
2,pmc-6076939-1,A 30-year-old gravida 1 para 1 female presente...,A 30-year-old gravida 1 para 1 female presente...,0.898212,0.768605,0.464839,A 30-year-old gravida 1 para 1 female presente...,0.70173,0.087839,0.456954,0.393333,0.437086,0.154417,A 30-year-old gravida 1 para 1 female presente...
3,pmc-8676056-2,A 35-year-old male patient presented with a si...,A 35-year-old male patient presented with a si...,0.887862,0.76272,0.607263,A 35-year-old male patient presented with a si...,0.75927,0.083179,0.423423,0.263636,0.324324,0.516986,A 35-year-old male patient presented with a si...
4,pmc-6116383-1,A 30-year-old female presented with complaints...,A 30-year-old female presented with complaints...,0.870243,0.7624,0.445755,A 30-year-old female presented with complaints...,0.747018,0.136461,0.503497,0.401408,0.412587,0.219651,A 30-year-old female presented with complaints...


In [37]:
avg_bleu = df_results['bleu_score_predicted'].mean()
print(f"Average Test BLEU Score: {avg_bleu:.4f}")

Average Test BLEU Score: 0.2576


In [41]:
df[['rouge1','rouge2','rougeL']].head()

Unnamed: 0,rouge1,rouge2,rougeL
0,0.621951,0.493827,0.487805
1,0.736842,0.586667,0.539474
2,0.551724,0.45614,0.534483
3,0.427083,0.252632,0.354167
4,0.469799,0.326531,0.402685


In [42]:
avg_rouge1, avg_rouge2, avg_rougeL = df[['rouge1','rouge2','rougeL']].mean()

In [43]:
print(f"Average Test Rouge1 Score: {avg_rouge1:.4f}")
print(f"Average Test Rouge2 Score: {avg_rouge2:.4f}")
print(f"Average Test RougeL Score: {avg_rougeL:.4f}")

Average Test Rouge1 Score: 0.4034
Average Test Rouge2 Score: 0.2897
Average Test RougeL Score: 0.3319
