In [1]:
import os 
# os.environ["WANDB_DISABLED"] = "true"
import pandas as pd
import torch
from tqdm import tqdm
import torch.nn.functional as F
from tqdm.auto import tqdm
from datetime import datetime
import wandb
import time
import os
from transformers import PegasusTokenizer, PegasusForConditionalGeneration, PegasusConfig, get_cosine_schedule_with_warmup
from torch.optim import AdamW

from torch.utils.data import DataLoader, Dataset
from rouge import Rouge
from torch.optim.lr_scheduler import _LRScheduler
import math


2025-05-08 13:54:25.853161: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-08 13:54:26.223416: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746687266.359370   33644 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746687266.400935   33644 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1746687266.716609   33644 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

In [2]:
NUM_EPOCHS = 10
BATCH_SIZE = 4
FRAC_SAMPLE = 0.001
MAX_LENGTH_ARTICLE = 512
MIN_LENGTH_ARTICLE = 50
MAX_LENGTH_SUMMARY = 128
MIN_LENGTH_SUMMARY = 20
HIDDEN_DIM = 128
LEARNING_RATE = 1e-4
PATIENCE = 5  # For early stopping
WEIGHT_DECAY = 1e-4
NUM_CYCLES = 5


model_dir = "../Model"
datafilter = "../dataft"
save_dir = "fine_tuned_pegasus_custom"
output_path = os.path.join(datafilter, "test_pred_5.csv")
os.makedirs(datafilter, exist_ok=True)
os.makedirs(model_dir, exist_ok=True)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
os.makedirs(save_dir, exist_ok=True)

In [3]:
train_data = pd.read_csv("../dataset/train.csv")
validation_data = pd.read_csv("../dataset/validation.csv")
test_data = pd.read_csv("../dataset/test.csv")

# add col
train_data.rename(columns={"highlights": "summaries", "article":"articles"}, inplace=True)
validation_data.rename(columns={"highlights": "summaries","article":"articles"}, inplace=True)
test_data.rename(columns={"highlights": "summaries", "article":"articles"}, inplace=True)

train_data["article_word_count"] = train_data["articles"].astype(str).apply(lambda x: len(x.split()))
train_data["summary_word_count"] = train_data["summaries"].astype(str).apply(lambda x: len(x.split()))

validation_data["article_word_count"] = validation_data["articles"].astype(str).apply(lambda x: len(x.split()))
validation_data["summary_word_count"] = validation_data["summaries"].astype(str).apply(lambda x: len(x.split()))

test_data["article_word_count"] = test_data["articles"].astype(str).apply(lambda x: len(x.split()))
test_data["summary_word_count"] = test_data["summaries"].astype(str).apply(lambda x: len(x.split()))

# filter range
train_data = train_data[
    (train_data["article_word_count"] <= MAX_LENGTH_ARTICLE) & 
    (train_data["article_word_count"] >= MIN_LENGTH_ARTICLE) &
    (train_data["summary_word_count"] <= MAX_LENGTH_SUMMARY) &
    (train_data["summary_word_count"] >= MIN_LENGTH_SUMMARY)
]

validation_data = validation_data[
    (validation_data["article_word_count"] <= MAX_LENGTH_ARTICLE) & 
    (validation_data["article_word_count"] >= MIN_LENGTH_ARTICLE) &
    (validation_data["summary_word_count"] <= MAX_LENGTH_SUMMARY) &
    (validation_data["summary_word_count"] >= MIN_LENGTH_SUMMARY)
]
test_data = test_data[
    (test_data["article_word_count"] <= MAX_LENGTH_ARTICLE) & 
    (test_data["article_word_count"] >= MIN_LENGTH_ARTICLE) &
    (test_data["summary_word_count"] <= MAX_LENGTH_SUMMARY) &
    (test_data["summary_word_count"] >= MIN_LENGTH_SUMMARY)
]

train_sample = train_data.sample(frac=FRAC_SAMPLE, random_state=1)
validation_sample = validation_data.sample(frac=FRAC_SAMPLE*100, random_state=1)
test_sample = test_data.sample(frac=0.1, random_state=1)
train_sample.info()
print("\n")
validation_sample.info()
train_sample.to_csv(os.path.join(datafilter,"train_sample.csv"), index=False)
test_sample.to_csv(os.path.join(datafilter,"test_sample.csv"), index=False)
validation_sample.to_csv(os.path.join(datafilter,"validation_sample.csv"), index=False)


<class 'pandas.core.frame.DataFrame'>
Index: 96 entries, 144417 to 16783
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   id                  96 non-null     object
 1   articles            96 non-null     object
 2   summaries           96 non-null     object
 3   article_word_count  96 non-null     int64 
 4   summary_word_count  96 non-null     int64 
dtypes: int64(2), object(3)
memory usage: 4.5+ KB


<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, 8901 to 9955
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   id                  5 non-null      object
 1   articles            5 non-null      object
 2   summaries           5 non-null      object
 3   article_word_count  5 non-null      int64 
 4   summary_word_count  5 non-null      int64 
dtypes: int64(2), object(3)
memory usage: 240.0+ bytes


In [4]:
train_sample = pd.read_csv("../dataft/train_sample.csv")
validation_sample = pd.read_csv("../dataft/validation_sample.csv")
test_sample = pd.read_csv("../dataft/test_sample.csv")
train_sample.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96 entries, 0 to 95
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   id                  96 non-null     object
 1   articles            96 non-null     object
 2   summaries           96 non-null     object
 3   article_word_count  96 non-null     int64 
 4   summary_word_count  96 non-null     int64 
dtypes: int64(2), object(3)
memory usage: 3.9+ KB


In [5]:
class SummarizationDataset(Dataset):
    def __init__(self, data, tokenizer, max_input_length=MAX_LENGTH_ARTICLE, max_output_length=MAX_LENGTH_SUMMARY):
        self.data = data
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length
        self.max_output_length = max_output_length
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        article = self.data.iloc[index]["articles"]
        summary = self.data.iloc[index]["summaries"]
        input_ids = self.tokenizer.encode(article, max_length=self.max_input_length, truncation=True, padding="max_length")
        output_ids = self.tokenizer.encode(summary, max_length=self.max_output_length, truncation=True, padding="max_length")
        return {
            "input_ids": input_ids,
            "attention_mask": [int(token_id != 0) for token_id in input_ids],
            "decoder_input_ids": output_ids[:-1],
            "decoder_attention_mask": [1] * (len(output_ids) - 1),
            "labels": output_ids[1:]
        }
        
train_df = train_sample
test_df = test_sample
val_df = validation_sample


In [6]:
# Initialize model and tokenizer
tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-large")
config = PegasusConfig(
    vocab_size=tokenizer.vocab_size,
    d_model=512,  # Increased from 256
    encoder_layers=6,  # Increased from 4
    decoder_layers=6,  # Increased from 4
    encoder_attention_heads=8,
    decoder_attention_heads=8,
    dropout=0.1,
    max_position_embeddings=MAX_LENGTH_ARTICLE,
    scale_embedding=True
)
model = PegasusForConditionalGeneration.from_pretrained("google/pegasus-large", config=config).to(device)




Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized because the shapes did not match:
- model.shared.weight: found shape torch.Size([96103, 1024]) in the checkpoint and torch.Size([96103, 512]) in the model instantiated
- model.encoder.embed_tokens.weight: found shape torch.Size([96103, 1024]) in the checkpoint and torch.Size([96103, 512]) in the model instantiated
- model.encoder.layers.0.self_attn.k_proj.weight: found shape torch.Size([1024, 1024]) in the checkpoint and torch.Size([512, 512]) in the model instantiated
- model.encoder.layers.0.self_attn.k_proj.b

In [7]:
train_dataset = SummarizationDataset(train_df, tokenizer)
val_dataset = SummarizationDataset(val_df, tokenizer)
test_dataset = SummarizationDataset(test_df, tokenizer)

In [8]:
def collate_fn(batch):
    input_ids = [item["input_ids"] for item in batch]
    attention_mask = [item["attention_mask"] for item in batch]
    decoder_input_ids = [item["decoder_input_ids"] for item in batch]
    decoder_attention_mask = [item["decoder_attention_mask"] for item in batch]
    labels = [item["labels"] for item in batch]
    max_input_length = max(len(ids) for ids in input_ids)
    max_output_length = max(len(ids) for ids in decoder_input_ids)
    input_ids = [ids + [0] * (max_input_length - len(ids)) for ids in input_ids]
    attention_mask = [mask + [0] * (max_input_length - len(mask)) for mask in attention_mask]
    decoder_input_ids = [ids + [0] * (max_output_length - len(ids)) for ids in decoder_input_ids]
    decoder_attention_mask = [mask + [0] * (max_output_length - len(mask)) for mask in decoder_attention_mask]
    labels = [ids + [-100] * (max_output_length - len(ids)) for ids in labels]
    return {
        "input_ids": torch.tensor(input_ids),
        "attention_mask": torch.tensor(attention_mask),
        "decoder_input_ids": torch.tensor(decoder_input_ids),
        "decoder_attention_mask": torch.tensor(decoder_attention_mask),
        "labels": torch.tensor(labels)
    }


In [9]:
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn)

In [10]:
no_decay = ['bias', 'LayerNorm.weight']    # cần ít regularization hơn weight
optimizer_grouped_parameters = [
    {
        'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
        'weight_decay': WEIGHT_DECAY,
    },
    {
        'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0,
    },
]
# optimizer = AdamW(optimizer_grouped_parameters, lr=LEARNING_RATE)

# Scheduler
num_training_steps = len(train_loader) * NUM_EPOCHS
# scheduler = get_cosine_schedule_with_warmup(
#     optimizer,
#     num_warmup_steps=int(0.2 * num_training_steps),
#     num_training_steps=num_training_steps,
#     num_cycles=NUM_CYCLES
# )



In [11]:
# Add custom scheduler class (place this at the top of your script)
class CosineWarmupWithBounds(_LRScheduler):
    def __init__(self, optimizer, num_warmup_steps, num_training_steps, num_cycles=0.5, max_lr=1e-4, min_lr=1e-5, last_epoch=-1):
        self.num_warmup_steps = num_warmup_steps
        self.num_training_steps = num_training_steps
        self.num_cycles = num_cycles
        self.max_lr = max_lr
        self.min_lr = min_lr
        super(CosineWarmupWithBounds, self).__init__(optimizer, last_epoch)

    def get_lr(self):
        step = self.last_epoch + 1
        if step < self.num_warmup_steps:
            lr = self.min_lr + (self.max_lr - self.min_lr) * step / self.num_warmup_steps
        else:
            progress = (step - self.num_warmup_steps) / (self.num_training_steps - self.num_warmup_steps)
            cosine_factor = 0.5 * (1.0 + math.cos(math.pi * progress * self.num_cycles))
            lr = self.min_lr + (self.max_lr - self.min_lr) * cosine_factor
        lr = max(self.min_lr, min(self.max_lr, lr))
        return [lr for _ in self.optimizer.param_groups]

# Update optimizer and scheduler
optimizer = AdamW(optimizer_grouped_parameters, lr=1e-4)
scheduler = CosineWarmupWithBounds(
    optimizer,
    num_warmup_steps=int(0.2 * num_training_steps),
    num_training_steps=num_training_steps,
    num_cycles=NUM_CYCLES,
    max_lr=1e-4,
    min_lr=1e-5
)

In [12]:
# Weights & Biases initialization
wandb.init(
    project="Finetune-Summarization",
    name=f"pegasus-custom-{datetime.now().strftime('%Y%m%d-%H%M%S')}",
    config={
        "model": "Pegasus_custom",
        "batch_size": BATCH_SIZE,
        "learning_rate": LEARNING_RATE,
        "weight_decay": WEIGHT_DECAY,
        "num_epochs": NUM_EPOCHS,
        "num_cycles": NUM_CYCLES,
        "data_ratio": FRAC_SAMPLE,
        "warm_up": "Cosine",
    }
)

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mvubkk67[0m ([33mvubkk67-hanoi-university-of-science-and-technology[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [13]:
# Save best model and early stopping
best_val_loss = float("inf")
epochs_no_improve = 0
wandb.watch(model)

# Training loop
for epoch in range(NUM_EPOCHS):
    start_time = time.time()
    model.train()
    train_loss = 0.0

    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1} [Train]"):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss

        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

        train_loss += loss.item()

    train_loss /= len(train_loader)

    # Validation
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Epoch {epoch+1} [Val]"):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            val_loss += outputs.loss.item()
        val_loss /= len(val_loader)
        current_lr = scheduler.get_last_lr()[0]

    # W&B log
    wandb.log({
        "epoch": epoch + 1,
        "train_loss": train_loss,
        "val_loss": val_loss,
        "lr": current_lr,
        "best_val_loss": best_val_loss
    })

    # Save best model
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        model.save_pretrained(save_dir)
        tokenizer.save_pretrained(save_dir)
        print(f"Saved best model to `{save_dir}` at epoch {epoch+1}")
        epochs_no_improve = 0
    else:
        epochs_no_improve += 1

    # Early stopping
    if epochs_no_improve >= PATIENCE:
        print(f"Early stopping triggered at epoch {epoch+1}")
        break

    print(
        f"Epoch {epoch+1:02d} | "
        f"Train Loss: {train_loss:.4f} | "
        f"Val Loss: {val_loss:.4f} | "
        f"LR: {current_lr:.6f} | "
        f"Time: {time.time() - start_time:.2f}s"
    )

# W&B end
wandb.finish()


Epoch 1 [Train]:   0%|          | 0/24 [00:00<?, ?it/s]

Epoch 1 [Val]:   0%|          | 0/2 [00:00<?, ?it/s]

Saved best model to `fine_tuned_pegasus_custom` at epoch 1
Epoch 01 | Train Loss: 6.3165 | Val Loss: 8.2625 | LR: 0.000057 | Time: 9.17s


Epoch 2 [Train]:   0%|          | 0/24 [00:00<?, ?it/s]

Epoch 2 [Val]:   0%|          | 0/2 [00:00<?, ?it/s]

Saved best model to `fine_tuned_pegasus_custom` at epoch 2
Epoch 02 | Train Loss: 4.8005 | Val Loss: 7.1994 | LR: 0.000100 | Time: 8.22s


Epoch 3 [Train]:   0%|          | 0/24 [00:00<?, ?it/s]

Epoch 3 [Val]:   0%|          | 0/2 [00:00<?, ?it/s]

Saved best model to `fine_tuned_pegasus_custom` at epoch 3
Epoch 03 | Train Loss: 3.9359 | Val Loss: 6.4831 | LR: 0.000034 | Time: 8.29s


Epoch 4 [Train]:   0%|          | 0/24 [00:00<?, ?it/s]

Epoch 4 [Val]:   0%|          | 0/2 [00:00<?, ?it/s]

Saved best model to `fine_tuned_pegasus_custom` at epoch 4
Epoch 04 | Train Loss: 3.6124 | Val Loss: 6.3459 | LR: 0.000026 | Time: 8.39s


Epoch 5 [Train]:   0%|          | 0/24 [00:00<?, ?it/s]

Epoch 5 [Val]:   0%|          | 0/2 [00:00<?, ?it/s]

Saved best model to `fine_tuned_pegasus_custom` at epoch 5
Epoch 05 | Train Loss: 3.4280 | Val Loss: 6.0042 | LR: 0.000098 | Time: 8.46s


Epoch 6 [Train]:   0%|          | 0/24 [00:00<?, ?it/s]

Epoch 6 [Val]:   0%|          | 0/2 [00:00<?, ?it/s]

Saved best model to `fine_tuned_pegasus_custom` at epoch 6
Epoch 06 | Train Loss: 3.0785 | Val Loss: 5.7593 | LR: 0.000051 | Time: 8.25s


Epoch 7 [Train]:   0%|          | 0/24 [00:00<?, ?it/s]

Epoch 7 [Val]:   0%|          | 0/2 [00:00<?, ?it/s]

Saved best model to `fine_tuned_pegasus_custom` at epoch 7
Epoch 07 | Train Loss: 2.9139 | Val Loss: 5.7260 | LR: 0.000015 | Time: 8.40s


Epoch 8 [Train]:   0%|          | 0/24 [00:00<?, ?it/s]

Epoch 8 [Val]:   0%|          | 0/2 [00:00<?, ?it/s]

Saved best model to `fine_tuned_pegasus_custom` at epoch 8
Epoch 08 | Train Loss: 2.8703 | Val Loss: 5.6894 | LR: 0.000089 | Time: 8.46s


Epoch 9 [Train]:   0%|          | 0/24 [00:00<?, ?it/s]

Epoch 9 [Val]:   0%|          | 0/2 [00:00<?, ?it/s]

Saved best model to `fine_tuned_pegasus_custom` at epoch 9
Epoch 09 | Train Loss: 2.7737 | Val Loss: 5.6409 | LR: 0.000069 | Time: 8.39s


Epoch 10 [Train]:   0%|          | 0/24 [00:00<?, ?it/s]

Epoch 10 [Val]:   0%|          | 0/2 [00:00<?, ?it/s]

Saved best model to `fine_tuned_pegasus_custom` at epoch 10
Epoch 10 | Train Loss: 2.6835 | Val Loss: 5.6369 | LR: 0.000010 | Time: 8.28s


VBox(children=(Label(value='0.008 MB of 0.008 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
best_val_loss,█▅▃▃▂▁▁▁▁
epoch,▁▂▃▃▄▅▆▆▇█
lr,▅█▃▂█▄▁▇▆▁
train_loss,█▅▃▃▂▂▁▁▁▁
val_loss,█▅▃▃▂▁▁▁▁▁

0,1
best_val_loss,5.64091
epoch,10.0
lr,1e-05
train_loss,2.68354
val_loss,5.63685


In [14]:
tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-large")
model = PegasusForConditionalGeneration.from_pretrained(save_dir).to(device)

In [15]:
# Evaluation
model.to(device)
model.eval()

predictions = []

with torch.no_grad():
    for step, batch in enumerate(tqdm(test_loader, desc="Generating summaries")):
        batch = {k: v.to(device) for k, v in batch.items()}
        output_ids = model.generate(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
            max_length=MAX_LENGTH_SUMMARY,
            num_beams=4,
            length_penalty=2.0,
            early_stopping=True
        )
        batch_preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
        predictions.extend(batch_preds)

test_sample = test_df.iloc[:len(predictions)].copy()
test_sample["predicted_summary"] = predictions
test_sample.to_csv(output_path, index=False)

print(f"✅ File has been saved at: {output_path}")

Generating summaries:   0%|          | 0/106 [00:00<?, ?it/s]

✅ File has been saved at: ../dataft/test_pred_5.csv


In [16]:
test_pred = pd.read_csv(output_path)

In [17]:
display(test_pred[["articles","summaries", "predicted_summary"]].head(10))

Unnamed: 0,articles,summaries,predicted_summary
0,A Florida bus passenger was arrested for throw...,"Joel Parker, 33, was riding the bus in St John...",
1,Aston Villa may be able to sign Cordoba strike...,Aston Villa have held talks over Cordoba strik...,
2,A South Carolina mother of four died over the ...,"Adam Leheup ran Fitness 535 in Columbia, South...",
3,A playground in Tokyo has been found to have d...,Soil underneath a slide in the park showed ext...,
4,A lonely shepherd has been found dead alongsid...,"Body of Jose Alberto, 58, discovered at home i...",
5,"Four years after signing for Arsenal, Wellingt...",Wellington Silva signed for Arsenal in 2011 fo...,
6,Joey Barton has urged QPR to use their win ove...,Queens Park Rangers strode to a 4-1 victory ag...,
7,An Indian woman who holds several awards for t...,Smita Srivastava currently holds record for lo...,
8,Drilling threes on the buzzer is all good as f...,LeBron James posted an unhappy picture at the ...,
9,A teenager killed by police in Illinois on Sat...,"Justus Howell, 17, was running from scene of a...",


In [18]:
# Tính điểm ROUGE
if "summaries" in test_pred.columns:
    rouge = Rouge()
    scores = rouge.get_scores(predictions, test_sample["summaries"].tolist(), avg=True)

    print("ROUGE scores:")
    print(f"ROUGE-1: {scores['rouge-1']['f']:.4f}")
    print(f"ROUGE-2: {scores['rouge-2']['f']:.4f}")
    print(f"ROUGE-L: {scores['rouge-l']['f']:.4f}")
else:
    print("⚠️ Không tìm thấy cột 'summaries' để tính ROUGE.")

ValueError: Hypothesis is empty.