In [23]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
from torchvision import transforms
import torch.optim as optim
import random 
from tqdm import tqdm
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import torch.nn.functional as F
from tqdm.auto import tqdm
from datetime import datetime
import wandb
import time
import os
import re
import math
from transformers import PegasusForConditionalGeneration, PegasusTokenizer, get_linear_schedule_with_warmup
from torch.optim import AdamW

from torch.utils.data import DataLoader, Dataset
from rouge import Rouge


In [24]:
NUM_EPOCHS = 50
BATCH_SIZE = 8
FRAC_SAMPLE = 0.005
MAX_LENGTH_ARTICLE = 512
MIN_LENGTH_ARTICLE = 50
MAX_LENGTH_SUMMARY = 128
MIN_LENGTH_SUMMARY = 20
HIDDEN_DIM = 128
LEARNING_RATE = 1e-5
MAX_PLATEAU_COUNT = 7
WEIGHT_DECAY = 1e-4
CLIP = 1
USE_PRETRAINED_EMB = True
USE_SCHEDULER = True
SCHEDULER_TYPE = "plateau"  # hoặc cosine, linear
NUM_CYCLES = 7


model_dir = "../Model"
datafilter = "../dataft"
save_dir = "fine_tuned_bart_cosine_3"
output_path = os.path.join(datafilter, "test_pred_3.csv")
os.makedirs(datafilter, exist_ok=True)
os.makedirs(model_dir, exist_ok=True)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
os.makedirs(save_dir, exist_ok=True)

In [25]:
train_data = pd.read_csv("../dataset/train.csv")
validation_data = pd.read_csv("../dataset/validation.csv")
test_data = pd.read_csv("../dataset/test.csv")

# add col
train_data.rename(columns={"highlights": "summaries", "article":"articles"}, inplace=True)
validation_data.rename(columns={"highlights": "summaries","article":"articles"}, inplace=True)
test_data.rename(columns={"highlights": "summaries", "article":"articles"}, inplace=True)

train_data["article_word_count"] = train_data["articles"].astype(str).apply(lambda x: len(x.split()))
train_data["summary_word_count"] = train_data["summaries"].astype(str).apply(lambda x: len(x.split()))

validation_data["article_word_count"] = validation_data["articles"].astype(str).apply(lambda x: len(x.split()))
validation_data["summary_word_count"] = validation_data["summaries"].astype(str).apply(lambda x: len(x.split()))

test_data["article_word_count"] = test_data["articles"].astype(str).apply(lambda x: len(x.split()))
test_data["summary_word_count"] = test_data["summaries"].astype(str).apply(lambda x: len(x.split()))

# filter range
train_data = train_data[
    (train_data["article_word_count"] <= MAX_LENGTH_ARTICLE) & 
    (train_data["article_word_count"] >= MIN_LENGTH_ARTICLE) &
    (train_data["summary_word_count"] <= MAX_LENGTH_SUMMARY) &
    (train_data["summary_word_count"] >= MIN_LENGTH_SUMMARY)
]

validation_data = validation_data[
    (validation_data["article_word_count"] <= MAX_LENGTH_ARTICLE) & 
    (validation_data["article_word_count"] >= MIN_LENGTH_ARTICLE) &
    (validation_data["summary_word_count"] <= MAX_LENGTH_SUMMARY) &
    (validation_data["summary_word_count"] >= MIN_LENGTH_SUMMARY)
]
test_data = test_data[
    (test_data["article_word_count"] <= MAX_LENGTH_ARTICLE) & 
    (test_data["article_word_count"] >= MIN_LENGTH_ARTICLE) &
    (test_data["summary_word_count"] <= MAX_LENGTH_SUMMARY) &
    (test_data["summary_word_count"] >= MIN_LENGTH_SUMMARY)
]

train_sample = train_data.sample(frac=FRAC_SAMPLE, random_state=1)
validation_sample = validation_data.sample(frac=FRAC_SAMPLE, random_state=1)
test_sample = test_data.sample(frac=1, random_state=1)
train_sample.info()
print("\n")
validation_sample.info()
train_sample.to_csv(os.path.join(datafilter,"train_sample.csv"), index=False)
test_sample.to_csv(os.path.join(datafilter,"test_sample.csv"), index=False)
validation_sample.to_csv(os.path.join(datafilter,"validation_sample.csv"), index=False)


In [4]:
train_sample = pd.read_csv("../dataft/train_sample.csv")
validation_sample = pd.read_csv("../dataft/validation_sample.csv")
test_sample = pd.read_csv("../dataft/test_sample.csv")
train_sample.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19198 entries, 0 to 19197
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   id                  19198 non-null  object
 1   articles            19198 non-null  object
 2   summaries           19198 non-null  object
 3   article_word_count  19198 non-null  int64 
 4   summary_word_count  19198 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 750.1+ KB


In [5]:
class SummarizationDataset(Dataset):
    def __init__(self, data, tokenizer, max_input_length=MAX_LENGTH_ARTICLE, max_output_length=MAX_LENGTH_SUMMARY):
        self.data = data
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length
        self.max_output_length = max_output_length
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        article = self.data.iloc[index]["articles"]
        summary = self.data.iloc[index]["summaries"]
        input_ids = self.tokenizer.encode(article, max_length=self.max_input_length, truncation=True, padding="max_length")
        output_ids = self.tokenizer.encode(summary, max_length=self.max_output_length, truncation=True, padding="max_length")
        return {"input_ids": input_ids, "attention_mask": [int(token_id != 0) for token_id in input_ids], "decoder_input_ids": output_ids[:-1], "decoder_attention_mask": [1] * (len(output_ids) - 1), "labels": output_ids[1:]}
train_df = train_sample
test_df = test_sample
val_df = validation_sample

In [6]:
from transformers import BartTokenizer, BartForConditionalGeneration

tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
model = BartForConditionalGeneration.from_pretrained("facebook/bart-base").to(device)


In [7]:
train_dataset = SummarizationDataset(train_df, tokenizer)
val_dataset = SummarizationDataset(val_df, tokenizer)

In [8]:
def collate_fn(batch):
    input_ids = [item["input_ids"] for item in batch]
    attention_mask = [item["attention_mask"] for item in batch]
    decoder_input_ids = [item["decoder_input_ids"] for item in batch]
    decoder_attention_mask = [item["decoder_attention_mask"] for item in batch]
    labels = [item["labels"] for item in batch]
    max_input_length = max(len(ids) for ids in input_ids)
    max_output_length = max(len(ids) for ids in decoder_input_ids)
    input_ids = [ids + [0] * (max_input_length - len(ids)) for ids in input_ids]
    attention_mask = [mask + [0] * (max_input_length - len(mask)) for mask in attention_mask]
    decoder_input_ids = [ids + [0] * (max_output_length - len(ids)) for ids in decoder_input_ids]
    decoder_attention_mask = [mask + [0] * (max_output_length - len(mask)) for mask in decoder_attention_mask]
    labels = [ids + [-100] * (max_output_length - len(ids)) for ids in labels]
    return {"input_ids": torch.tensor(input_ids), "attention_mask": torch.tensor(attention_mask), "decoder_input_ids": torch.tensor(decoder_input_ids), "decoder_attention_mask": torch.tensor(decoder_attention_mask), "labels": torch.tensor(labels)}

In [9]:
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE,collate_fn=collate_fn)

In [10]:
from transformers import get_cosine_schedule_with_warmup

optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
num_training_steps = (len(train_loader) * NUM_EPOCHS)

scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=500,
    num_training_steps=num_training_steps,
    num_cycles=NUM_CYCLES 
)


In [11]:
len(train_loader),len(val_loader)

(2400, 125)

In [12]:
wandb.init(
    project="Finetune-Summarization",
    name=f"bartbase-{datetime.now().strftime('%Y%m%d-%H%M%S')}",
    config={
        "model": "Bartbase_cosine_3",
        "batch_size": BATCH_SIZE,
        "learning_rate": LEARNING_RATE,
        "weight_decay": WEIGHT_DECAY,
        "num_epochs": NUM_EPOCHS,
        "num_cycles": NUM_CYCLES,
        "data_ratio": FRAC_SAMPLE,
        
    }
)

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mvubkk67[0m ([33mvubkk67-hanoi-university-of-science-and-technology[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [13]:
# Save best model
best_val_loss = float("inf")
# W&B setup
wandb.watch(model)
# Training loop
for epoch in range(NUM_EPOCHS):
    start_time = time.time()
    model.train()
    train_loss = 0.0

    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1} [Train]"):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss

        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

        train_loss += loss.item()

    train_loss /= len(train_loader)

    # Validation
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Epoch {epoch+1} [Val]"):
            batch = {k: v.to(device) for k, v in batch.items()}  # Thêm dòng này
            outputs = model(**batch)
            val_loss += outputs.loss.item()
        val_loss /= len(val_loader)
        current_lr = scheduler.get_last_lr()[0]

    # W&B log
    wandb.log({
        "epoch": epoch + 1,
        "train_loss": train_loss,
        "val_loss": val_loss,
        "lr": current_lr,
        "best_val_loss": best_val_loss
    })

    # Save best model
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        model.save_pretrained(save_dir)
        tokenizer.save_pretrained(save_dir)
        print(f"Saved best model to `{save_dir}` at epoch {epoch+1}")

    print(
        f"Epoch {epoch+1:02d} | "
        f"Train Loss: {train_loss:.4f} | "
        f"Val Loss: {val_loss:.4f} | "
        f"LR: {current_lr:.6f} | "
        f"Time: {time.time() - start_time:.2f}s"
    )

# W&B end
wandb.finish()


Epoch 1 [Train]:   0%|          | 0/2400 [00:00<?, ?it/s]

Epoch 1 [Val]:   0%|          | 0/125 [00:00<?, ?it/s]



Saved best model to `fine_tuned_bart` at epoch 1
Epoch 01 | Train Loss: 1.5771 | Val Loss: 0.9106 | LR: 0.000040 | Time: 4116.16s


Epoch 2 [Train]:   0%|          | 0/2400 [00:00<?, ?it/s]

Epoch 2 [Val]:   0%|          | 0/125 [00:00<?, ?it/s]

Saved best model to `fine_tuned_bart` at epoch 2
Epoch 02 | Train Loss: 0.8859 | Val Loss: 0.8841 | LR: 0.000027 | Time: 4098.80s


Epoch 3 [Train]:   0%|          | 0/2400 [00:00<?, ?it/s]

Epoch 3 [Val]:   0%|          | 0/125 [00:00<?, ?it/s]

Saved best model to `fine_tuned_bart` at epoch 3
Epoch 03 | Train Loss: 0.7899 | Val Loss: 0.8580 | LR: 0.000015 | Time: 4080.54s


Epoch 4 [Train]:   0%|          | 0/2400 [00:00<?, ?it/s]

Epoch 4 [Val]:   0%|          | 0/125 [00:00<?, ?it/s]

Saved best model to `fine_tuned_bart` at epoch 4
Epoch 04 | Train Loss: 0.7173 | Val Loss: 0.8447 | LR: 0.000002 | Time: 4084.26s


Epoch 5 [Train]:   0%|          | 0/2400 [00:00<?, ?it/s]

Epoch 5 [Val]:   0%|          | 0/125 [00:00<?, ?it/s]

Epoch 05 | Train Loss: 0.6793 | Val Loss: 0.8482 | LR: 0.000000 | Time: 4063.83s


Epoch 6 [Train]:   0%|          | 0/2400 [00:00<?, ?it/s]

Epoch 6 [Val]:   0%|          | 0/125 [00:00<?, ?it/s]

Epoch 06 | Train Loss: 0.6790 | Val Loss: 0.8482 | LR: 0.000000 | Time: 4064.70s


Epoch 7 [Train]:   0%|          | 0/2400 [00:00<?, ?it/s]

Epoch 7 [Val]:   0%|          | 0/125 [00:00<?, ?it/s]

Epoch 07 | Train Loss: 0.6788 | Val Loss: 0.8482 | LR: 0.000000 | Time: 4071.02s


Epoch 8 [Train]:   0%|          | 0/2400 [00:00<?, ?it/s]

Epoch 8 [Val]:   0%|          | 0/125 [00:00<?, ?it/s]

Epoch 08 | Train Loss: 0.6787 | Val Loss: 0.8482 | LR: 0.000000 | Time: 4059.81s


Epoch 9 [Train]:   0%|          | 0/2400 [00:00<?, ?it/s]

Epoch 9 [Val]:   0%|          | 0/125 [00:00<?, ?it/s]

Epoch 09 | Train Loss: 0.6792 | Val Loss: 0.8482 | LR: 0.000000 | Time: 4057.35s


Epoch 10 [Train]:   0%|          | 0/2400 [00:00<?, ?it/s]

Epoch 10 [Val]:   0%|          | 0/125 [00:00<?, ?it/s]

Epoch 10 | Train Loss: 0.6793 | Val Loss: 0.8482 | LR: 0.000000 | Time: 4057.46s


Epoch 11 [Train]:   0%|          | 0/2400 [00:00<?, ?it/s]

Epoch 11 [Val]:   0%|          | 0/125 [00:00<?, ?it/s]

Epoch 11 | Train Loss: 0.6788 | Val Loss: 0.8482 | LR: 0.000000 | Time: 4058.10s


Epoch 12 [Train]:   0%|          | 0/2400 [00:00<?, ?it/s]

Epoch 12 [Val]:   0%|          | 0/125 [00:00<?, ?it/s]

Epoch 12 | Train Loss: 0.6783 | Val Loss: 0.8482 | LR: 0.000000 | Time: 4054.64s


Epoch 13 [Train]:   0%|          | 0/2400 [00:00<?, ?it/s]

Epoch 13 [Val]:   0%|          | 0/125 [00:00<?, ?it/s]

Epoch 13 | Train Loss: 0.6789 | Val Loss: 0.8482 | LR: 0.000000 | Time: 4051.47s


Epoch 14 [Train]:   0%|          | 0/2400 [00:00<?, ?it/s]

Epoch 14 [Val]:   0%|          | 0/125 [00:00<?, ?it/s]

Epoch 14 | Train Loss: 0.6789 | Val Loss: 0.8482 | LR: 0.000000 | Time: 4051.31s


Epoch 15 [Train]:   0%|          | 0/2400 [00:00<?, ?it/s]

Epoch 15 [Val]:   0%|          | 0/125 [00:00<?, ?it/s]

Epoch 15 | Train Loss: 0.6793 | Val Loss: 0.8482 | LR: 0.000000 | Time: 4050.04s


Epoch 16 [Train]:   0%|          | 0/2400 [00:00<?, ?it/s]

Epoch 16 [Val]:   0%|          | 0/125 [00:00<?, ?it/s]

Epoch 16 | Train Loss: 0.6788 | Val Loss: 0.8482 | LR: 0.000000 | Time: 4050.63s


Epoch 17 [Train]:   0%|          | 0/2400 [00:00<?, ?it/s]

Epoch 17 [Val]:   0%|          | 0/125 [00:00<?, ?it/s]

Epoch 17 | Train Loss: 0.6789 | Val Loss: 0.8482 | LR: 0.000000 | Time: 4051.43s


Epoch 18 [Train]:   0%|          | 0/2400 [00:00<?, ?it/s]

Epoch 18 [Val]:   0%|          | 0/125 [00:00<?, ?it/s]

Epoch 18 | Train Loss: 0.6795 | Val Loss: 0.8482 | LR: 0.000000 | Time: 4051.82s


Epoch 19 [Train]:   0%|          | 0/2400 [00:00<?, ?it/s]

In [19]:
# tokenizer = PegasusTokenizer.from_pretrained(save_dir)
# model = PegasusForConditionalGeneration.from_pretrained(save_dir).to(device)
from transformers import BartTokenizer, BartForConditionalGeneration

tokenizer = BartTokenizer.from_pretrained(save_dir)
model = BartForConditionalGeneration.from_pretrained(save_dir).to(device)


In [20]:
test_dataset = SummarizationDataset(test_df, tokenizer)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE,collate_fn=collate_fn)
len(test_loader)

2532

In [23]:
model.to(device)
model.eval()

predictions = []

with torch.no_grad():
    for step, batch in enumerate(tqdm(test_loader, desc="Generating summaries")):
        batch = {k: v.to(device) for k, v in batch.items()}

        output_ids = model.generate(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
            max_length=128,
            decoder_start_token_id=tokenizer.pad_token_id,
            num_beams=4,
            length_penalty=2.0,
            early_stopping=True
        )

        batch_preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
        predictions.extend(batch_preds)
test_sample = test_sample.iloc[:len(predictions)].copy()
test_sample["predicted_summary"] = predictions
test_sample.to_csv(output_path, index=False)

print(f"✅ File has been saved at: {output_path}")

Generating summaries:   0%|          | 0/2532 [00:00<?, ?it/s]

✅ File đã được lưu tại: ../dataft/test_sample.csv


In [None]:
test_pred = pd.read_csv(output_path)

In [24]:
display(test_pred[["articles","summaries", "predicted_summary"]].head(2))


Unnamed: 0,articles,summaries,predicted_summary
0,Supermarket giant Coles has been ordered to pa...,Coles ordered to pay $2.5 million in penalties...,Supermarket giant Coles has been ordered to pa...
1,PSV Eindhoven beat Heerenveen 4-1 to win the D...,PSV beat Heerenveen to lift their first Erediv...,PSV Eindhoven beat Heerenveen 4-1 on Saturday...


In [25]:
# Tính điểm ROUGE
if "summaries" in test_pred.columns:
    rouge = Rouge()
    scores = rouge.get_scores(predictions, test_sample["summaries"].tolist(), avg=True)

    print("ROUGE scores:")
    print(f"ROUGE-1: {scores['rouge-1']['f']:.4f}")
    print(f"ROUGE-2: {scores['rouge-2']['f']:.4f}")
    print(f"ROUGE-L: {scores['rouge-l']['f']:.4f}")
else:
    print("⚠️ Không tìm thấy cột 'summaries' để tính ROUGE.")


🎯 ROUGE scores:
ROUGE-1: 0.3800
ROUGE-2: 0.1634
ROUGE-L: 0.3549
