In [16]:
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
import nltk
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.model_selection import train_test_split
import torch
import os
os.environ["WANDB_DISABLED"] = "true"

In [23]:
df = pd.read_csv('Open-Patients-With-Summaries.csv')

In [3]:
t5_tokenizer = T5Tokenizer.from_pretrained('t5-small')
t5_model = T5ForConditionalGeneration.from_pretrained('t5-small').to('cuda')

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [4]:
def preprocess_function(examples):
    inputs = t5_tokenizer(
        examples['original_text'],
        padding='max_length',
        truncation=True,
        max_length=512
    ).to('cuda')
    labels = t5_tokenizer(
        examples['summary'],
        padding='max_length',
        truncation=True,
        max_length=150
    ).to('cuda')
    inputs['labels'] = labels['input_ids']
    return inputs

In [5]:
train_df, test_df = train_test_split(df[['original_text', 'summary']], test_size=0.2, random_state=42)

In [6]:
train_dataset = Dataset.from_pandas(train_df)

In [7]:
train_dataset = train_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/2400 [00:00<?, ? examples/s]

In [8]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    evaluation_strategy='no',
    save_strategy='epoch',
    run_name="t5_training_run",
    report_to=None)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [9]:
trainer = Trainer(
    model=t5_model,
    args=training_args,
    train_dataset=train_dataset)

In [10]:
trainer.train()

Step,Training Loss


TrainOutput(global_step=300, training_loss=2.074310099283854, metrics={'train_runtime': 66.6804, 'train_samples_per_second': 35.993, 'train_steps_per_second': 4.499, 'total_flos': 324820323532800.0, 'train_loss': 2.074310099283854, 'epoch': 1.0})

In [11]:
save_path = 't5-finetuned'
trainer.save_model(save_path)
t5_tokenizer.save_pretrained(save_path)

('t5-finetuned\\tokenizer_config.json',
 't5-finetuned\\special_tokens_map.json',
 't5-finetuned\\spiece.model',
 't5-finetuned\\added_tokens.json')

In [12]:
t5_tokenizer = T5Tokenizer.from_pretrained('t5-finetuned')
t5_model = T5ForConditionalGeneration.from_pretrained('t5-finetuned')

In [13]:
def generate_summary(text):
    inputs = t5_tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    summary_ids = t5_model.generate(inputs['input_ids'], max_length=150, num_beams=5, length_penalty=2.0, early_stopping=True)
    summary = t5_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

In [14]:
generated_summaries = []
for idx, row in test_df.iterrows():
    description = row['original_text']
    generated_summary = generate_summary(description)
    generated_summaries.append(generated_summary)


In [15]:
test_df['generated_summary_model'] = generated_summaries

In [20]:
def calculate_bleu_scores(references, generated):

    reference_tokenized = [[nltk.word_tokenize(ref)] for ref in references]  
    generated_tokenized = [nltk.word_tokenize(gen) for gen in generated]  
    
 
    smoothing = SmoothingFunction().method4
    bleu_score = corpus_bleu(reference_tokenized, generated_tokenized, smoothing_function=smoothing)
    return bleu_score


bleu_score = calculate_bleu_scores(
    test_df['summary'].tolist(),
    test_df['generated_summary_model'].tolist()
)

print(f"Average BLEU Score: {bleu_score:.4f}")



Average BLEU Score: 0.1705


In [21]:
bleu_score = calculate_bleu_scores(test_df['summary'].tolist(), test_df['generated_summary_model'].tolist())
print(f"Average BLEU Score: {bleu_score:.4f}")



Average BLEU Score: 0.1705


In [6]:
avg_rouge1, avg_rouge2, avg_rougeL = df[['rouge1','rouge2','rougeL']].mean()

In [7]:
print(f"Average Test Rouge1 Score: {avg_rouge1:.4f}")
print(f"Average Test Rouge2 Score: {avg_rouge2:.4f}")
print(f"Average Test RougeL Score: {avg_rougeL:.4f}")

Average Test Rouge1 Score: 0.3840
Average Test Rouge2 Score: 0.2732
Average Test RougeL Score: 0.3150


In [22]:
output_path = 'test_samples_results-T5.csv'
columns_to_save = ['original_text', 'generated_summary_model', 'summary']
test_df[columns_to_save].to_csv(output_path, index=False)
print(f"Results saved to {output_path}")

Results saved to test_samples_results-T5.csv
