## Setup and Dataset Download




In [1]:
!pip install rouge_score
!pip install evaluate
import numpy as np 
import pandas as pd
import kagglehub
import os
import kagglehub
import re
from datasets import Dataset
import transformers
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
import evaluate

os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))




path = kagglehub.dataset_download("gowrishankarp/newspaper-text-summarization-cnn-dailymail")

print("Path to dataset files:", path)

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=ec6a79ac94fe714ba455b8b109e0d96bc1115157023ef2d7cc4cc21fc7ae17e4
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2
Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting fsspec>=2021.05.0 (from fsspec[http]>=2021.05.0->evaluate)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.7 MB/s[0m eta [36m0:

2025-11-02 15:51:38.261744: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1762098698.464437      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1762098698.530029      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


/kaggle/input/newspaper-text-summarization-cnn-dailymail/cnn_dailymail/validation.csv
/kaggle/input/newspaper-text-summarization-cnn-dailymail/cnn_dailymail/train.csv
/kaggle/input/newspaper-text-summarization-cnn-dailymail/cnn_dailymail/test.csv
Path to dataset files: /kaggle/input/newspaper-text-summarization-cnn-dailymail


# Reading the data

## Load Dataset

###  Load the training, validation, and test splits into pandas DataFrames

In [2]:
# Build the CSV locations from the folder returned by kagglehub instead of
# hard-coding the Kaggle mount point. On Kaggle this resolves to the same
# /kaggle/input/... paths as before.
DATA_DIR = os.path.join(path, "cnn_dailymail")

training_set = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
test_set = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))
validation_set = pd.read_csv(os.path.join(DATA_DIR, "validation.csv"))

# Preview the first rows of the training split.
training_set.head()

Unnamed: 0,id,article,highlights
0,0001d1afc246a7964130f43ae940af6bc6c57f01,By . Associated Press . PUBLISHED: . 14:11 EST...,"Bishop John Folda, of North Dakota, is taking ..."
1,0002095e55fcbd3a2f366d9bf92a95433dc305ef,(CNN) -- Ralph Mata was an internal affairs li...,Criminal complaint: Cop used his role to help ...
2,00027e965c8264c35cc1bc55556db388da82b07f,A drunk driver who killed a young woman in a h...,"Craig Eccleston-Todd, 27, had drunk at least t..."
3,0002c17436637c4fe1837c935c04de47adb18e9a,(CNN) -- With a breezy sweep of his pen Presid...,Nina dos Santos says Europe must be ready to a...
4,0003ad6ef0c37534f80b55b4235108024b407f0b,Fleetwood are the only team still to have a 10...,Fleetwood top of League One after 2-0 win at S...


###  Drop the unused `id` column

In [3]:
# Remove the article hash; only `article` and `highlights` are used below.
# Reassigning (rather than inplace=True) together with errors='ignore' keeps
# this cell safe to re-run: a second execution no longer raises KeyError.
training_set = training_set.drop(columns=['id'], errors='ignore')
test_set = test_set.drop(columns=['id'], errors='ignore')
validation_set = validation_set.drop(columns=['id'], errors='ignore')

## checking for nulls

In [4]:
# Per-column count of missing values, followed by the (rows, columns) shape
# of the training split.
null_counts = training_set.isna().sum()
print(null_counts, training_set.shape, sep="\n")

article       0
highlights    0
dtype: int64
(287113, 2)


###  Drop duplicate rows

In [5]:
# Keep only the first occurrence of each identical (article, highlights) row.
training_set = training_set.drop_duplicates()

## checking the data size

In [6]:
# Report (rows, columns) for the training, validation and test splits, in
# that order.
for split_frame in (training_set, validation_set, test_set):
    print(split_frame.shape)

(284015, 2)
(13368, 2)
(11490, 2)


## Sampling the Dataset

###  Since the CNN/DailyMail dataset is large,  
###  we take smaller random samples for training

In [7]:
# Down-sample the training split to a fixed, reproducible subset.
TRAIN_SAMPLE_SIZE = 40_000
SAMPLE_SEED = 42

training_set = training_set.sample(
    n=TRAIN_SAMPLE_SIZE,
    random_state=SAMPLE_SEED,
)

In [8]:
# Display the (rows, columns) shape of the down-sampled training frame.
training_set.shape


(40000, 2)

## Text Cleaning

###  For summarization, we apply minimal cleaning:  
###  - Remove extra spaces and line breaks.  
###  - Keep punctuation, casing, and sentence structure intact (important for meaning).  

###  This ensures the text remains close to the original while removing noise.


In [9]:
def clean_text(text):
    """Collapse every run of whitespace to one space and trim both ends.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string. Punctuation, casing and word order are left
        untouched.
    """
    whitespace_run = re.compile(r'\s+')
    collapsed = whitespace_run.sub(' ', text)
    return collapsed.strip()

In [10]:
# Normalize whitespace in both text columns of every split. The order matches
# the original cell: training, validation, test; article before highlights.
for split_frame in (training_set, validation_set, test_set):
    for text_column in ('article', 'highlights'):
        split_frame[text_column] = split_frame[text_column].apply(clean_text)

## Convert DataFrames to Hugging Face Datasets

###   Convert the pandas DataFrames for training, validation, and testing  
###   into the `datasets.Dataset` format, which is required for use with the Hugging Face Trainer API.


In [11]:
# Convert each pandas split to a Hugging Face `Dataset`.
# preserve_index=False stops the pandas index from being carried over as an
# extra `__index_level_0__` column. That would otherwise happen for the
# training split, whose index is no longer a plain range after sampling.
train_dataset = Dataset.from_pandas(training_set, preserve_index=False)
test_dataset = Dataset.from_pandas(test_set, preserve_index=False)
validation_dataset = Dataset.from_pandas(validation_set, preserve_index=False)

###   Load the pretrained **T5-base tokenizer** from Hugging Face.  
###   The tokenizer converts raw text into token IDs that the model can understand,  
###   and will also handle decoding model outputs back into text.

In [12]:
# Checkpoint identifier; reused later when the model itself is loaded.
model_name = 't5-base'

# Request the fast tokenizer implementation for this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast=True,
)

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

## Preprocessing Function

###   Define a preprocessing function to tokenize the dataset:  
###   - **Articles** are truncated/padded to a maximum length of 512 tokens.  
###   - **Summaries (highlights)** are truncated/padded to a maximum length of 128 tokens.  
###   - The tokenized summaries are stored as labels for training.

In [13]:
max_input_length = 512
max_target_length = 128

# Label value that the loss function skips.
LABEL_IGNORE_INDEX = -100


def preprocess_data(data):
    """Tokenize articles and highlights into fixed-length model features.

    Args:
        data: A mapping with "article" and "highlights" entries, each either
            a single string or a list of strings (as passed by
            `Dataset.map(..., batched=True)`).

    Returns:
        The tokenizer output for the articles (truncated/padded to
        `max_input_length`) with an added "labels" entry holding the
        highlight token ids (truncated/padded to `max_target_length`).
        Padding positions in the labels are set to -100 so they do not
        contribute to the training loss.
    """
    model_inputs = tokenizer(
        data["article"],
        max_length=max_input_length,
        truncation=True,
        padding="max_length",
    )

    labels = tokenizer(
        data["highlights"],
        max_length=max_target_length,
        truncation=True,
        padding="max_length",
    )

    pad_id = tokenizer.pad_token_id

    def _mask_padding(token_ids):
        # Swap pad ids for the ignore index; real tokens are left untouched.
        return [
            tok if tok != pad_id else LABEL_IGNORE_INDEX
            for tok in token_ids
        ]

    label_ids = labels["input_ids"]
    if label_ids and isinstance(label_ids[0], (list, tuple)):
        # Batched call: one list of token ids per example.
        model_inputs["labels"] = [_mask_padding(seq) for seq in label_ids]
    else:
        # Single-example call: one flat list of token ids.
        model_inputs["labels"] = _mask_padding(label_ids)

    return model_inputs

In [None]:
# Tokenize every split in batches. Splits are processed in the order
# train -> validation -> test and rebound to the same names.
train_dataset, validation_dataset, test_dataset = [
    split.map(preprocess_data, batched=True)
    for split in (train_dataset, validation_dataset, test_dataset)
]

Map:   0%|          | 0/40000 [00:00<?, ? examples/s]

## Load Pretrained Model and Data Collator

### - Load the pretrained **T5-base** model for sequence-to-sequence learning.  
###   - Use a `DataCollatorForSeq2Seq` to assemble batches during training (inputs and labels are already padded to fixed lengths in the preprocessing step).  
###   - Suppress unnecessary log messages for cleaner output.


In [None]:
# Only show error-level messages from the transformers library.
transformers.logging.set_verbosity_error()

# Pretrained seq2seq weights for the checkpoint named by `model_name`.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Collator that turns lists of tokenized examples into model-ready batches.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
)

## Evaluation Metric (ROUGE)

###  Define the evaluation function using **ROUGE scores**, which are standard for text summarization:  
###  - Decode model predictions and labels back into text.  
###  - Replace `-100` values in labels (ignored tokens) with the padding token ID.  
###  - Compute ROUGE metrics (ROUGE-1, ROUGE-2, ROUGE-L) with stemming enabled.  
###  - Return the F-measure for each metric.


In [None]:
rouge = evaluate.load("rouge")


def compute_metrics(eval_pred):
    """Compute ROUGE scores for a batch of generated summaries.

    Args:
        eval_pred: A (predictions, labels) pair of token-id arrays as passed
            by the trainer. `predictions` may also be a tuple whose first
            element holds the generated token ids.

    Returns:
        A dict mapping each ROUGE metric name to its score. When the metric
        returns aggregate objects exposing `.mid`, the mid F-measure is
        returned; otherwise the value is passed through unchanged.
    """
    predictions, labels = eval_pred

    # Some model outputs arrive as a tuple; the token ids come first.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id

    # -100 is not a valid vocabulary id and cannot be decoded. It marks
    # ignored label positions and can also appear as padding in gathered
    # predictions, so it is replaced in both arrays before decoding.
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True,
    )

    return {
        k: v.mid.fmeasure if hasattr(v, "mid") else v
        for k, v in result.items()
    }


## Training the Model

###  Set up the `Seq2SeqTrainer` with the model, datasets, tokenizer, and data collator.  
###  - Evaluate and save checkpoints every 500 steps.  
###  - Use gradient accumulation, mixed precision (FP16), and gradient checkpointing for efficiency.  
###  - Track ROUGE scores and load the best model at the end.  
###  - Save the trained model and tokenizer for later use.


In [None]:
# Enable gradient checkpointing on the model before training.
model.gradient_checkpointing_enable()

training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    # Evaluate and checkpoint on the same 500-step schedule, so that
    # load_best_model_at_end can match each evaluation to a checkpoint.
    eval_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=3,
    predict_with_generate=True,
    # Without this, evaluation-time generation falls back to the model's
    # default generation length, which is far shorter than the 128-token
    # reference summaries and therefore distorts ROUGE. Generating longer
    # outputs makes each evaluation pass slower.
    generation_max_length=max_target_length,
    fp16=True,
    logging_dir="./logs",
    logging_steps=50,
    warmup_ratio=0.1,
    report_to="none",
    load_best_model_at_end=True,
    metric_for_best_model="rougeL",
    greater_is_better=True,
    disable_tqdm=False,
)

# NOTE(review): recent transformers releases deprecate the `tokenizer=`
# argument in favour of `processing_class=` - confirm against the installed
# version before upgrading.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

# Persist the fine-tuned weights and the tokenizer side by side.
model.save_pretrained("/kaggle/working/model")
tokenizer.save_pretrained("/kaggle/working/model")

## Continue Training for Metric Improvement

Continuing training for 1 additional epoch to boost ROUGE metrics, using:
- Lower learning rate (1e-5) for stable updates
- Larger effective batch size (gradient accumulation = 8)
- Resume from last checkpoint

This enhances performance without restarting from scratch.


In [None]:
# One extra epoch, starting from the weights currently held in `model`.
#
# `resume_from_checkpoint=True` is deliberately NOT used here:
#   * resuming restores the saved optimizer/scheduler state, so a changed
#     learning rate would not take effect;
#   * doubling gradient accumulation halves the optimizer steps per epoch, so
#     the restored global step would already exceed the new total and the
#     training loop would exit without doing any work.
# A fresh argument object also keeps this cell idempotent: the original
# `num_train_epochs += 1` grew on every re-run.
continue_args = Seq2SeqTrainingArguments(
    # Separate folder so these checkpoints do not mix with the first run's.
    output_dir="./results_epoch4",
    # Evaluate and save once, at the end of the single extra epoch.
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=8,
    num_train_epochs=1,
    weight_decay=0.01,
    save_total_limit=3,
    predict_with_generate=True,
    # Carry over whatever generation length the first run was configured with.
    generation_max_length=training_args.generation_max_length,
    fp16=True,
    logging_dir="./logs",
    logging_steps=50,
    warmup_ratio=0.1,
    report_to="none",
    load_best_model_at_end=True,
    metric_for_best_model="rougeL",
    greater_is_better=True,
    disable_tqdm=False,
)

# A new trainer gets a fresh optimizer and learning-rate schedule.
trainer = Seq2SeqTrainer(
    model=model,
    args=continue_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

model.save_pretrained("/kaggle/working/model_epoch4")
tokenizer.save_pretrained("/kaggle/working/model_epoch4")


# Evaluating the model

#### epoch 3

In [None]:
# This cell runs after the continued-training cell, so the trainer's live
# model is no longer guaranteed to be the 3-epoch model. Reload the weights
# saved at the end of the first training run so the numbers reported here
# really belong to "epoch 3".
epoch3_model = AutoModelForSeq2SeqLM.from_pretrained("/kaggle/working/model")
trainer.model = epoch3_model.to(trainer.args.device)

print("Evaluation started...")
metrics = trainer.evaluate(eval_dataset=test_dataset)
print("Evaluation finished.")
print(f"Test Metrics = {metrics}")

#### epoch 4   

In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch

# Folder written by the continued-training cell.
EPOCH4_DIR = "/kaggle/working/model_epoch4"

# Use the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Reload the saved tokenizer and weights, then move the model to `device`.
tokenizer = T5Tokenizer.from_pretrained(EPOCH4_DIR)
model = T5ForConditionalGeneration.from_pretrained(EPOCH4_DIR).to(device)

# Point the existing trainer at the reloaded model for evaluation.
trainer.model = model

print("Evaluation started...")
metrics = trainer.evaluate(eval_dataset=test_dataset)
print("Evaluation finished.")
print(f"Test Metrics = {metrics}")
