In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
from torchvision import transforms
import torch.optim as optim
import random 
from tqdm import tqdm
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import torch.nn.functional as F
from tqdm.auto import tqdm
from datetime import datetime
import wandb
import time
import os
import re
import math
from transformers import PegasusForConditionalGeneration, PegasusTokenizer, get_linear_schedule_with_warmup
from torch.optim import AdamW

from torch.utils.data import DataLoader, Dataset
from rouge import Rouge


2025-04-15 13:37:18.584646: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-15 13:37:19.141830: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744699039.347036    1467 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744699039.404705    1467 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1744699039.888053    1467 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

In [2]:
NUM_EPOCHS = 50
BATCH_SIZE = 8
FRAC_SAMPLE = 0.01
MAX_LENGTH_ARTICLE = 512
MIN_LENGTH_ARTICLE = 50
MAX_LENGTH_SUMMARY = 128
MIN_LENGTH_SUMMARY = 20
HIDDEN_DIM = 128
LEARNING_RATE = 1e-5
MAX_PLATEAU_COUNT = 7
WEIGHT_DECAY = 1e-4
USE_SCHEDULER = True
NUM_CYCLES = 5


model_dir = "../Model"
datafilter = "../dataft"
save_dir = "fine_tuned_bart_cosine_4"
output_path = os.path.join(datafilter, "test_pred_4.csv")
os.makedirs(datafilter, exist_ok=True)
os.makedirs(model_dir, exist_ok=True)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
os.makedirs(save_dir, exist_ok=True)

In [3]:
train_data = pd.read_csv("../dataset/train.csv")
validation_data = pd.read_csv("../dataset/validation.csv")
test_data = pd.read_csv("../dataset/test.csv")

# add col
train_data.rename(columns={"highlights": "summaries", "article":"articles"}, inplace=True)
validation_data.rename(columns={"highlights": "summaries","article":"articles"}, inplace=True)
test_data.rename(columns={"highlights": "summaries", "article":"articles"}, inplace=True)

train_data["article_word_count"] = train_data["articles"].astype(str).apply(lambda x: len(x.split()))
train_data["summary_word_count"] = train_data["summaries"].astype(str).apply(lambda x: len(x.split()))

validation_data["article_word_count"] = validation_data["articles"].astype(str).apply(lambda x: len(x.split()))
validation_data["summary_word_count"] = validation_data["summaries"].astype(str).apply(lambda x: len(x.split()))

test_data["article_word_count"] = test_data["articles"].astype(str).apply(lambda x: len(x.split()))
test_data["summary_word_count"] = test_data["summaries"].astype(str).apply(lambda x: len(x.split()))

# filter range
train_data = train_data[
    (train_data["article_word_count"] <= MAX_LENGTH_ARTICLE) & 
    (train_data["article_word_count"] >= MIN_LENGTH_ARTICLE) &
    (train_data["summary_word_count"] <= MAX_LENGTH_SUMMARY) &
    (train_data["summary_word_count"] >= MIN_LENGTH_SUMMARY)
]

validation_data = validation_data[
    (validation_data["article_word_count"] <= MAX_LENGTH_ARTICLE) & 
    (validation_data["article_word_count"] >= MIN_LENGTH_ARTICLE) &
    (validation_data["summary_word_count"] <= MAX_LENGTH_SUMMARY) &
    (validation_data["summary_word_count"] >= MIN_LENGTH_SUMMARY)
]
test_data = test_data[
    (test_data["article_word_count"] <= MAX_LENGTH_ARTICLE) & 
    (test_data["article_word_count"] >= MIN_LENGTH_ARTICLE) &
    (test_data["summary_word_count"] <= MAX_LENGTH_SUMMARY) &
    (test_data["summary_word_count"] >= MIN_LENGTH_SUMMARY)
]

train_sample = train_data.sample(frac=FRAC_SAMPLE, random_state=1)
validation_sample = validation_data.sample(frac=FRAC_SAMPLE, random_state=1)
test_sample = test_data.sample(frac=1, random_state=1)
train_sample.info()
print("\n")
validation_sample.info()
train_sample.to_csv(os.path.join(datafilter,"train_sample.csv"), index=False)
test_sample.to_csv(os.path.join(datafilter,"test_sample.csv"), index=False)
validation_sample.to_csv(os.path.join(datafilter,"validation_sample.csv"), index=False)


<class 'pandas.core.frame.DataFrame'>
Index: 19198 entries, 144417 to 201560
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   id                  19198 non-null  object
 1   articles            19198 non-null  object
 2   summaries           19198 non-null  object
 3   article_word_count  19198 non-null  int64 
 4   summary_word_count  19198 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 899.9+ KB


<class 'pandas.core.frame.DataFrame'>
Index: 994 entries, 8901 to 8365
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   id                  994 non-null    object
 1   articles            994 non-null    object
 2   summaries           994 non-null    object
 3   article_word_count  994 non-null    int64 
 4   summary_word_count  994 non-null    int64 
dtypes: int64(2), object(3)
memory usage: 46.6+ KB


In [4]:
train_sample = pd.read_csv("../dataft/train_sample.csv")
validation_sample = pd.read_csv("../dataft/validation_sample.csv")
test_sample = pd.read_csv("../dataft/test_sample.csv")
train_sample.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19198 entries, 0 to 19197
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   id                  19198 non-null  object
 1   articles            19198 non-null  object
 2   summaries           19198 non-null  object
 3   article_word_count  19198 non-null  int64 
 4   summary_word_count  19198 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 750.1+ KB


In [5]:
class SummarizationDataset(Dataset):
    def __init__(self, data, tokenizer, max_input_length=MAX_LENGTH_ARTICLE, max_output_length=MAX_LENGTH_SUMMARY):
        self.data = data
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length
        self.max_output_length = max_output_length
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        article = self.data.iloc[index]["articles"]
        summary = self.data.iloc[index]["summaries"]
        input_ids = self.tokenizer.encode(article, max_length=self.max_input_length, truncation=True, padding="max_length")
        output_ids = self.tokenizer.encode(summary, max_length=self.max_output_length, truncation=True, padding="max_length")
        return {"input_ids": input_ids, "attention_mask": [int(token_id != 0) for token_id in input_ids], "decoder_input_ids": output_ids[:-1], "decoder_attention_mask": [1] * (len(output_ids) - 1), "labels": output_ids[1:]}
train_df = train_sample
test_df = test_sample
val_df = validation_sample

In [6]:
from transformers import BartTokenizer, BartForConditionalGeneration

tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
model = BartForConditionalGeneration.from_pretrained("facebook/bart-base").to(device)


In [7]:
train_dataset = SummarizationDataset(train_df, tokenizer)
val_dataset = SummarizationDataset(val_df, tokenizer)

In [8]:
def collate_fn(batch):
    input_ids = [item["input_ids"] for item in batch]
    attention_mask = [item["attention_mask"] for item in batch]
    decoder_input_ids = [item["decoder_input_ids"] for item in batch]
    decoder_attention_mask = [item["decoder_attention_mask"] for item in batch]
    labels = [item["labels"] for item in batch]
    max_input_length = max(len(ids) for ids in input_ids)
    max_output_length = max(len(ids) for ids in decoder_input_ids)
    input_ids = [ids + [0] * (max_input_length - len(ids)) for ids in input_ids]
    attention_mask = [mask + [0] * (max_input_length - len(mask)) for mask in attention_mask]
    decoder_input_ids = [ids + [0] * (max_output_length - len(ids)) for ids in decoder_input_ids]
    decoder_attention_mask = [mask + [0] * (max_output_length - len(mask)) for mask in decoder_attention_mask]
    labels = [ids + [-100] * (max_output_length - len(ids)) for ids in labels]
    return {"input_ids": torch.tensor(input_ids), "attention_mask": torch.tensor(attention_mask), "decoder_input_ids": torch.tensor(decoder_input_ids), "decoder_attention_mask": torch.tensor(decoder_attention_mask), "labels": torch.tensor(labels)}

In [9]:
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE,collate_fn=collate_fn)

In [10]:
from transformers import get_cosine_schedule_with_warmup

optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
num_training_steps = (len(train_loader) * NUM_EPOCHS)

scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.2*num_training_steps),
    num_training_steps=num_training_steps,
    num_cycles=NUM_CYCLES 
)


In [11]:
len(train_loader),len(val_loader)

(2400, 125)

In [12]:
wandb.init(
    project="Finetune-Summarization",
    name=f"bartbase-{datetime.now().strftime('%Y%m%d-%H%M%S')}",
    config={
        "model": "Bartbase_cosine_3",
        "batch_size": BATCH_SIZE,
        "learning_rate": LEARNING_RATE,
        "weight_decay": WEIGHT_DECAY,
        "num_epochs": NUM_EPOCHS,
        "num_cycles": NUM_CYCLES,
        "data_ratio": FRAC_SAMPLE,
        
    }
)

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mvubkk67[0m ([33mvubkk67-hanoi-university-of-science-and-technology[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [13]:
# Save best model
best_val_loss = float("inf")
# W&B setup
wandb.watch(model)
# Training loop
for epoch in range(NUM_EPOCHS):
    start_time = time.time()
    model.train()
    train_loss = 0.0

    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1} [Train]"):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss

        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

        train_loss += loss.item()

    train_loss /= len(train_loader)

    # Validation
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Epoch {epoch+1} [Val]"):
            batch = {k: v.to(device) for k, v in batch.items()}  # Thêm dòng này
            outputs = model(**batch)
            val_loss += outputs.loss.item()
        val_loss /= len(val_loader)
        current_lr = scheduler.get_last_lr()[0]

    # W&B log
    wandb.log({
        "epoch": epoch + 1,
        "train_loss": train_loss,
        "val_loss": val_loss,
        "lr": current_lr,
        "best_val_loss": best_val_loss
    })

    # Save best model
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        model.save_pretrained(save_dir)
        tokenizer.save_pretrained(save_dir)
        print(f"Saved best model to `{save_dir}` at epoch {epoch+1}")

    print(
        f"Epoch {epoch+1:02d} | "
        f"Train Loss: {train_loss:.4f} | "
        f"Val Loss: {val_loss:.4f} | "
        f"LR: {current_lr:.6f} | "
        f"Time: {time.time() - start_time:.2f}s"
    )

# W&B end
wandb.finish()


Epoch 1 [Train]:   0%|          | 0/2400 [00:00<?, ?it/s]

Epoch 1 [Val]:   0%|          | 0/125 [00:00<?, ?it/s]



Saved best model to `fine_tuned_bart_cosine_3` at epoch 1
Epoch 01 | Train Loss: 6.6842 | Val Loss: 1.7433 | LR: 0.000002 | Time: 3545.45s


Epoch 2 [Train]:   0%|          | 0/2400 [00:00<?, ?it/s]

Epoch 2 [Val]:   0%|          | 0/125 [00:00<?, ?it/s]

Saved best model to `fine_tuned_bart_cosine_3` at epoch 2
Epoch 02 | Train Loss: 1.4059 | Val Loss: 1.0862 | LR: 0.000004 | Time: 3536.71s


Epoch 3 [Train]:   0%|          | 0/2400 [00:00<?, ?it/s]

Epoch 3 [Val]:   0%|          | 0/125 [00:00<?, ?it/s]

Saved best model to `fine_tuned_bart_cosine_3` at epoch 3
Epoch 03 | Train Loss: 1.0780 | Val Loss: 0.9893 | LR: 0.000006 | Time: 3536.24s


Epoch 4 [Train]:   0%|          | 0/2400 [00:00<?, ?it/s]

Epoch 4 [Val]:   0%|          | 0/125 [00:00<?, ?it/s]

Saved best model to `fine_tuned_bart_cosine_3` at epoch 4
Epoch 04 | Train Loss: 0.9909 | Val Loss: 0.9517 | LR: 0.000008 | Time: 3535.62s


Epoch 5 [Train]:   0%|          | 0/2400 [00:00<?, ?it/s]

Epoch 5 [Val]:   0%|          | 0/125 [00:00<?, ?it/s]

Saved best model to `fine_tuned_bart_cosine_3` at epoch 5
Epoch 05 | Train Loss: 0.9349 | Val Loss: 0.9183 | LR: 0.000010 | Time: 3541.56s


Epoch 6 [Train]:   0%|          | 0/2400 [00:00<?, ?it/s]

Epoch 6 [Val]:   0%|          | 0/125 [00:00<?, ?it/s]

Saved best model to `fine_tuned_bart_cosine_3` at epoch 6
Epoch 06 | Train Loss: 0.8750 | Val Loss: 0.8912 | LR: 0.000008 | Time: 4649.90s


Epoch 7 [Train]:   0%|          | 0/2400 [00:00<?, ?it/s]

Epoch 7 [Val]:   0%|          | 0/125 [00:00<?, ?it/s]

Saved best model to `fine_tuned_bart_cosine_3` at epoch 7
Epoch 07 | Train Loss: 0.8250 | Val Loss: 0.8646 | LR: 0.000003 | Time: 6934.88s


Epoch 8 [Train]:   0%|          | 0/2400 [00:00<?, ?it/s]

Epoch 8 [Val]:   0%|          | 0/125 [00:00<?, ?it/s]

Saved best model to `fine_tuned_bart_cosine_3` at epoch 8
Epoch 08 | Train Loss: 0.7915 | Val Loss: 0.8617 | LR: 0.000000 | Time: 6693.46s


Epoch 9 [Train]:   0%|          | 0/2400 [00:00<?, ?it/s]

Epoch 9 [Val]:   0%|          | 0/125 [00:00<?, ?it/s]

Epoch 09 | Train Loss: 0.7839 | Val Loss: 0.8642 | LR: 0.000001 | Time: 6689.07s


Epoch 10 [Train]:   0%|          | 0/2400 [00:00<?, ?it/s]

Epoch 10 [Val]:   0%|          | 0/125 [00:00<?, ?it/s]

Saved best model to `fine_tuned_bart_cosine_3` at epoch 10
Epoch 10 | Train Loss: 0.7824 | Val Loss: 0.8483 | LR: 0.000006 | Time: 6691.95s


Epoch 11 [Train]:   0%|          | 0/2400 [00:00<?, ?it/s]

Epoch 11 [Val]:   0%|          | 0/125 [00:00<?, ?it/s]

Saved best model to `fine_tuned_bart_cosine_3` at epoch 11
Epoch 11 | Train Loss: 0.7708 | Val Loss: 0.8478 | LR: 0.000010 | Time: 6680.27s


Epoch 12 [Train]:   0%|          | 0/2400 [00:00<?, ?it/s]

Epoch 12 [Val]:   0%|          | 0/125 [00:00<?, ?it/s]

Saved best model to `fine_tuned_bart_cosine_3` at epoch 12
Epoch 12 | Train Loss: 0.7463 | Val Loss: 0.8467 | LR: 0.000009 | Time: 6680.45s


Epoch 13 [Train]:   0%|          | 0/2400 [00:00<?, ?it/s]

Epoch 13 [Val]:   0%|          | 0/125 [00:00<?, ?it/s]

Saved best model to `fine_tuned_bart_cosine_3` at epoch 13
Epoch 13 | Train Loss: 0.7134 | Val Loss: 0.8409 | LR: 0.000005 | Time: 6687.05s


Epoch 14 [Train]:   0%|          | 0/2400 [00:00<?, ?it/s]

Epoch 14 [Val]:   0%|          | 0/125 [00:00<?, ?it/s]

Epoch 14 | Train Loss: 0.6858 | Val Loss: 0.8419 | LR: 0.000001 | Time: 6680.70s


Epoch 15 [Train]:   0%|          | 0/2400 [00:00<?, ?it/s]

Epoch 15 [Val]:   0%|          | 0/125 [00:00<?, ?it/s]

Epoch 15 | Train Loss: 0.6724 | Val Loss: 0.8431 | LR: 0.000000 | Time: 6689.28s


Epoch 16 [Train]:   0%|          | 0/2400 [00:00<?, ?it/s]

Epoch 16 [Val]:   0%|          | 0/125 [00:00<?, ?it/s]

Epoch 16 | Train Loss: 0.6726 | Val Loss: 0.8419 | LR: 0.000004 | Time: 6681.27s


Epoch 17 [Train]:   0%|          | 0/2400 [00:00<?, ?it/s]

Epoch 17 [Val]:   0%|          | 0/125 [00:00<?, ?it/s]

Epoch 17 | Train Loss: 0.6742 | Val Loss: 0.8486 | LR: 0.000008 | Time: 6680.23s


Epoch 18 [Train]:   0%|          | 0/2400 [00:00<?, ?it/s]

Epoch 18 [Val]:   0%|          | 0/125 [00:00<?, ?it/s]

Epoch 18 | Train Loss: 0.6655 | Val Loss: 0.8519 | LR: 0.000010 | Time: 6681.74s


Epoch 19 [Train]:   0%|          | 0/2400 [00:00<?, ?it/s]

Epoch 19 [Val]:   0%|          | 0/125 [00:00<?, ?it/s]

Epoch 19 | Train Loss: 0.6440 | Val Loss: 0.8519 | LR: 0.000007 | Time: 6674.54s


Epoch 20 [Train]:   0%|          | 0/2400 [00:00<?, ?it/s]

Epoch 20 [Val]:   0%|          | 0/125 [00:00<?, ?it/s]

Epoch 20 | Train Loss: 0.6165 | Val Loss: 0.8522 | LR: 0.000003 | Time: 6665.81s


Epoch 21 [Train]:   0%|          | 0/2400 [00:00<?, ?it/s]

Epoch 21 [Val]:   0%|          | 0/125 [00:00<?, ?it/s]

Epoch 21 | Train Loss: 0.5965 | Val Loss: 0.8545 | LR: 0.000000 | Time: 6869.01s


Epoch 22 [Train]:   0%|          | 0/2400 [00:00<?, ?it/s]

Epoch 22 [Val]:   0%|          | 0/125 [00:00<?, ?it/s]

Epoch 22 | Train Loss: 0.5931 | Val Loss: 0.8561 | LR: 0.000002 | Time: 6947.32s


Epoch 23 [Train]:   0%|          | 0/2400 [00:00<?, ?it/s]

Epoch 23 [Val]:   0%|          | 0/125 [00:00<?, ?it/s]

Epoch 23 | Train Loss: 0.5973 | Val Loss: 0.8610 | LR: 0.000007 | Time: 6680.22s


Epoch 24 [Train]:   0%|          | 0/2400 [00:00<?, ?it/s]

Epoch 24 [Val]:   0%|          | 0/125 [00:00<?, ?it/s]

Epoch 24 | Train Loss: 0.5963 | Val Loss: 0.8610 | LR: 0.000010 | Time: 6680.61s


Epoch 25 [Train]:   0%|          | 0/2400 [00:00<?, ?it/s]

In [14]:
# tokenizer = PegasusTokenizer.from_pretrained(save_dir)
# model = PegasusForConditionalGeneration.from_pretrained(save_dir).to(device)
from transformers import BartTokenizer, BartForConditionalGeneration

tokenizer = BartTokenizer.from_pretrained(save_dir)
model = BartForConditionalGeneration.from_pretrained(save_dir).to(device)


In [15]:
test_dataset = SummarizationDataset(test_df, tokenizer)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE,collate_fn=collate_fn)
len(test_loader)

528

In [16]:
model.to(device)
model.eval()

predictions = []

with torch.no_grad():
    for step, batch in enumerate(tqdm(test_loader, desc="Generating summaries")):
        batch = {k: v.to(device) for k, v in batch.items()}

        output_ids = model.generate(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
            max_length=MAX_LENGTH_SUMMARY,
            decoder_start_token_id=tokenizer.pad_token_id,
            num_beams=4,
            length_penalty=2.0,
            early_stopping=True
        )

        batch_preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
        predictions.extend(batch_preds)
test_sample = test_sample.iloc[:len(predictions)].copy()
test_sample["predicted_summary"] = predictions
test_sample.to_csv(output_path, index=False)

print(f"✅ File has been saved at: {output_path}")

Generating summaries:   0%|          | 0/528 [00:00<?, ?it/s]

✅ File has been saved at: ../dataft/test_pred_3.csv


In [17]:
test_pred = pd.read_csv(output_path)

In [18]:
display(test_pred[["articles","summaries", "predicted_summary"]].head(2))


Unnamed: 0,articles,summaries,predicted_summary
0,A Florida bus passenger was arrested for throw...,"Joel Parker, 33, was riding the bus in St John...","Joel Parker, 33, was about to get off the Sun..."
1,Aston Villa may be able to sign Cordoba strike...,Aston Villa have held talks over Cordoba strik...,ston Villa may be able to sign Cordoba striker...


In [19]:
# Tính điểm ROUGE
if "summaries" in test_pred.columns:
    rouge = Rouge()
    scores = rouge.get_scores(predictions, test_sample["summaries"].tolist(), avg=True)

    print("ROUGE scores:")
    print(f"ROUGE-1: {scores['rouge-1']['f']:.4f}")
    print(f"ROUGE-2: {scores['rouge-2']['f']:.4f}")
    print(f"ROUGE-L: {scores['rouge-l']['f']:.4f}")
else:
    print("⚠️ Không tìm thấy cột 'summaries' để tính ROUGE.")

ROUGE scores:
ROUGE-1: 0.3807
ROUGE-2: 0.1660
ROUGE-L: 0.3589
