In [1]:
import pandas as pd
import torch
from tqdm import tqdm
from tqdm.auto import tqdm
from datetime import datetime
import wandb
import time
import os
from torch.optim import AdamW

from rouge import Rouge
from bert_score import score as bert_score
from nltk.translate.meteor_score import single_meteor_score
from nltk.tokenize import word_tokenize
from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import _LRScheduler
import math

In [21]:
NUM_EPOCHS = 10
BATCH_SIZE = 8
FRAC_SAMPLE = 0.01
MAX_LENGTH_ARTICLE = 512
MIN_LENGTH_ARTICLE = 50
MAX_LENGTH_SUMMARY = 128
MIN_LENGTH_SUMMARY = 20
HIDDEN_DIM = 128
LEARNING_RATE = 1e-5
# MAX_PLATEAU_COUNT = 7
WEIGHT_DECAY = 1e-4
USE_SCHEDULER = True
NUM_CYCLES = 5


model_dir = "../Model"
datafilter = "../dataft"
save_dir = "fine_tuned_bart_cosine_3"
output_path = os.path.join(datafilter, "test_pred_3.csv")
os.makedirs(datafilter, exist_ok=True)
os.makedirs(model_dir, exist_ok=True)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
os.makedirs(save_dir, exist_ok=True)

In [5]:
train_data = pd.read_csv("../dataset/train.csv")
validation_data = pd.read_csv("../dataset/validation.csv")
test_data = pd.read_csv("../dataset/test.csv")

# add col
train_data.rename(columns={"highlights": "summaries", "article":"articles"}, inplace=True)
validation_data.rename(columns={"highlights": "summaries","article":"articles"}, inplace=True)
test_data.rename(columns={"highlights": "summaries", "article":"articles"}, inplace=True)

train_data["article_word_count"] = train_data["articles"].astype(str).apply(lambda x: len(x.split()))
train_data["summary_word_count"] = train_data["summaries"].astype(str).apply(lambda x: len(x.split()))

validation_data["article_word_count"] = validation_data["articles"].astype(str).apply(lambda x: len(x.split()))
validation_data["summary_word_count"] = validation_data["summaries"].astype(str).apply(lambda x: len(x.split()))

test_data["article_word_count"] = test_data["articles"].astype(str).apply(lambda x: len(x.split()))
test_data["summary_word_count"] = test_data["summaries"].astype(str).apply(lambda x: len(x.split()))

# filter range
train_data = train_data[
    (train_data["article_word_count"] <= MAX_LENGTH_ARTICLE) & 
    (train_data["article_word_count"] >= MIN_LENGTH_ARTICLE) &
    (train_data["summary_word_count"] <= MAX_LENGTH_SUMMARY) &
    (train_data["summary_word_count"] >= MIN_LENGTH_SUMMARY)
]

validation_data = validation_data[
    (validation_data["article_word_count"] <= MAX_LENGTH_ARTICLE) & 
    (validation_data["article_word_count"] >= MIN_LENGTH_ARTICLE) &
    (validation_data["summary_word_count"] <= MAX_LENGTH_SUMMARY) &
    (validation_data["summary_word_count"] >= MIN_LENGTH_SUMMARY)
]
test_data = test_data[
    (test_data["article_word_count"] <= MAX_LENGTH_ARTICLE) & 
    (test_data["article_word_count"] >= MIN_LENGTH_ARTICLE) &
    (test_data["summary_word_count"] <= MAX_LENGTH_SUMMARY) &
    (test_data["summary_word_count"] >= MIN_LENGTH_SUMMARY)
]

train_sample = train_data.sample(frac=FRAC_SAMPLE, random_state=1)
validation_sample = validation_data.sample(frac=FRAC_SAMPLE, random_state=1)
test_sample = test_data.sample(frac= 0.1, random_state=1)
train_sample.info()
print("\n")
validation_sample.info()
train_sample.to_csv(os.path.join(datafilter,"train_sample.csv"), index=False)
test_sample.to_csv(os.path.join(datafilter,"test_sample.csv"), index=False)
validation_sample.to_csv(os.path.join(datafilter,"validation_sample.csv"), index=False)


<class 'pandas.core.frame.DataFrame'>
Index: 960 entries, 144417 to 108633
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   id                  960 non-null    object
 1   articles            960 non-null    object
 2   summaries           960 non-null    object
 3   article_word_count  960 non-null    int64 
 4   summary_word_count  960 non-null    int64 
dtypes: int64(2), object(3)
memory usage: 45.0+ KB


<class 'pandas.core.frame.DataFrame'>
Index: 50 entries, 8901 to 12116
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   id                  50 non-null     object
 1   articles            50 non-null     object
 2   summaries           50 non-null     object
 3   article_word_count  50 non-null     int64 
 4   summary_word_count  50 non-null     int64 
dtypes: int64(2), object(3)
memory usage: 2.3+ KB


In [6]:
train_sample = pd.read_csv("../dataft/train_sample.csv")
validation_sample = pd.read_csv("../dataft/validation_sample.csv")
test_sample = pd.read_csv("../dataft/test_sample.csv")
train_sample.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 960 entries, 0 to 959
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   id                  960 non-null    object
 1   articles            960 non-null    object
 2   summaries           960 non-null    object
 3   article_word_count  960 non-null    int64 
 4   summary_word_count  960 non-null    int64 
dtypes: int64(2), object(3)
memory usage: 37.6+ KB


In [7]:
class SummarizationDataset(Dataset):
    def __init__(self, data, tokenizer, max_input_length=MAX_LENGTH_ARTICLE, max_output_length=MAX_LENGTH_SUMMARY):
        self.data = data
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length
        self.max_output_length = max_output_length
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        article = self.data.iloc[index]["articles"]
        summary = self.data.iloc[index]["summaries"]
        input_ids = self.tokenizer.encode(article, max_length=self.max_input_length, truncation=True, padding="max_length")
        output_ids = self.tokenizer.encode(summary, max_length=self.max_output_length, truncation=True, padding="max_length")
        return {"input_ids": input_ids, "attention_mask": [int(token_id != 0) for token_id in input_ids], "decoder_input_ids": output_ids[:-1], "decoder_attention_mask": [1] * (len(output_ids) - 1), "labels": output_ids[1:]}
train_df = train_sample
test_df = test_sample
val_df = validation_sample

In [8]:
from transformers import BartTokenizer, BartForConditionalGeneration

tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
model = BartForConditionalGeneration.from_pretrained("facebook/bart-base").to(device)


2025-05-11 00:01:38.218389: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-11 00:01:38.823330: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746896499.040342    1049 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746896499.102616    1049 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1746896499.624186    1049 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

In [9]:
train_dataset = SummarizationDataset(train_df, tokenizer)
val_dataset = SummarizationDataset(val_df, tokenizer)

In [10]:
def collate_fn(batch):
    input_ids = [item["input_ids"] for item in batch]
    attention_mask = [item["attention_mask"] for item in batch]
    decoder_input_ids = [item["decoder_input_ids"] for item in batch]
    decoder_attention_mask = [item["decoder_attention_mask"] for item in batch]
    labels = [item["labels"] for item in batch]
    max_input_length = max(len(ids) for ids in input_ids)
    max_output_length = max(len(ids) for ids in decoder_input_ids)
    input_ids = [ids + [0] * (max_input_length - len(ids)) for ids in input_ids]
    attention_mask = [mask + [0] * (max_input_length - len(mask)) for mask in attention_mask]
    decoder_input_ids = [ids + [0] * (max_output_length - len(ids)) for ids in decoder_input_ids]
    decoder_attention_mask = [mask + [0] * (max_output_length - len(mask)) for mask in decoder_attention_mask]
    labels = [ids + [-100] * (max_output_length - len(ids)) for ids in labels]
    return {"input_ids": torch.tensor(input_ids), "attention_mask": torch.tensor(attention_mask), "decoder_input_ids": torch.tensor(decoder_input_ids), "decoder_attention_mask": torch.tensor(decoder_attention_mask), "labels": torch.tensor(labels)}

In [11]:
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE,collate_fn=collate_fn)

In [12]:
from transformers import get_cosine_schedule_with_warmup

optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
num_training_steps = (len(train_loader) * NUM_EPOCHS)

scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.2*num_training_steps),
    num_training_steps=num_training_steps,
    num_cycles=NUM_CYCLES 
)


In [13]:
no_decay = ['bias', 'LayerNorm.weight']    # cần ít regularization hơn weight
optimizer_grouped_parameters = [
    {
        'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
        'weight_decay': WEIGHT_DECAY,
    },
    {
        'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0,
    },
]
optimizer = AdamW(optimizer_grouped_parameters, lr=LEARNING_RATE)

# # Scheduler
num_training_steps = len(train_loader) * NUM_EPOCHS
scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.2 * num_training_steps),
    num_training_steps=num_training_steps,
    num_cycles=NUM_CYCLES
)

In [14]:
wandb.init(
    project="Finetune-Summarization",
    name=f"bartbase-{datetime.now().strftime('%Y%m%d-%H%M%S')}",
    config={
        "model": "Bartbase_cosine_3",
        "batch_size": BATCH_SIZE,
        "learning_rate": LEARNING_RATE,
        "weight_decay": WEIGHT_DECAY,
        "num_epochs": NUM_EPOCHS,
        "num_cycles": NUM_CYCLES,
        "data_ratio": FRAC_SAMPLE,
        "warm-up": "Cosine"
        
    }
)

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mvubkk67[0m ([33mvubkk67-hanoi-university-of-science-and-technology[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [15]:
# Save best model
best_val_loss = float("inf")
# W&B setup
wandb.watch(model)
# Training loop
for epoch in range(NUM_EPOCHS):
    start_time = time.time()
    model.train()
    train_loss = 0.0

    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1} [Train]"):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss

        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

        train_loss += loss.item()

    train_loss /= len(train_loader)

    # Validation
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Epoch {epoch+1} [Val]"):
            batch = {k: v.to(device) for k, v in batch.items()}  # Thêm dòng này
            outputs = model(**batch)
            val_loss += outputs.loss.item()
        val_loss /= len(val_loader)
        current_lr = scheduler.get_last_lr()[0]

    # W&B log
    wandb.log({
        "epoch": epoch + 1,
        "train_loss": train_loss,
        "val_loss": val_loss,
        "lr": current_lr,
        "best_val_loss": best_val_loss
    })

    # Save best model
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        model.save_pretrained(save_dir)
        tokenizer.save_pretrained(save_dir)
        print(f"Saved best model to `{save_dir}` at epoch {epoch+1}")

    print(
        f"Epoch {epoch+1:02d} | "
        f"Train Loss: {train_loss:.4f} | "
        f"Val Loss: {val_loss:.4f} | "
        f"LR: {current_lr:.6f} | "
        f"Time: {time.time() - start_time:.2f}s"
    )

# W&B end
wandb.finish()


Epoch 1 [Train]:   0%|          | 0/120 [00:00<?, ?it/s]

Epoch 1 [Val]:   0%|          | 0/7 [00:00<?, ?it/s]



Saved best model to `fine_tuned_bart_cosine_3` at epoch 1
Epoch 01 | Train Loss: 10.7183 | Val Loss: 7.1967 | LR: 0.000005 | Time: 214.46s


Epoch 2 [Train]:   0%|          | 0/120 [00:00<?, ?it/s]

Epoch 2 [Val]:   0%|          | 0/7 [00:00<?, ?it/s]

Saved best model to `fine_tuned_bart_cosine_3` at epoch 2
Epoch 02 | Train Loss: 5.7195 | Val Loss: 2.9165 | LR: 0.000010 | Time: 213.17s


Epoch 3 [Train]:   0%|          | 0/120 [00:00<?, ?it/s]

Epoch 3 [Val]:   0%|          | 0/7 [00:00<?, ?it/s]

Saved best model to `fine_tuned_bart_cosine_3` at epoch 3
Epoch 03 | Train Loss: 2.9457 | Val Loss: 2.2008 | LR: 0.000001 | Time: 211.63s


Epoch 4 [Train]:   0%|          | 0/120 [00:00<?, ?it/s]

Epoch 4 [Val]:   0%|          | 0/7 [00:00<?, ?it/s]

Saved best model to `fine_tuned_bart_cosine_3` at epoch 4
Epoch 04 | Train Loss: 2.3530 | Val Loss: 1.5769 | LR: 0.000005 | Time: 211.74s


Epoch 5 [Train]:   0%|          | 0/120 [00:00<?, ?it/s]

Epoch 5 [Val]:   0%|          | 0/7 [00:00<?, ?it/s]

Saved best model to `fine_tuned_bart_cosine_3` at epoch 5
Epoch 05 | Train Loss: 1.7839 | Val Loss: 1.4397 | LR: 0.000009 | Time: 211.58s


Epoch 6 [Train]:   0%|          | 0/120 [00:00<?, ?it/s]

Epoch 6 [Val]:   0%|          | 0/7 [00:00<?, ?it/s]

Saved best model to `fine_tuned_bart_cosine_3` at epoch 6
Epoch 06 | Train Loss: 1.4895 | Val Loss: 1.3007 | LR: 0.000000 | Time: 211.48s


Epoch 7 [Train]:   0%|          | 0/120 [00:00<?, ?it/s]

Epoch 7 [Val]:   0%|          | 0/7 [00:00<?, ?it/s]

Saved best model to `fine_tuned_bart_cosine_3` at epoch 7
Epoch 07 | Train Loss: 1.3787 | Val Loss: 1.2382 | LR: 0.000009 | Time: 212.18s


Epoch 8 [Train]:   0%|          | 0/120 [00:00<?, ?it/s]

Epoch 8 [Val]:   0%|          | 0/7 [00:00<?, ?it/s]

Saved best model to `fine_tuned_bart_cosine_3` at epoch 8
Epoch 08 | Train Loss: 1.2699 | Val Loss: 1.2076 | LR: 0.000005 | Time: 211.47s


Epoch 9 [Train]:   0%|          | 0/120 [00:00<?, ?it/s]

Epoch 9 [Val]:   0%|          | 0/7 [00:00<?, ?it/s]

Saved best model to `fine_tuned_bart_cosine_3` at epoch 9
Epoch 09 | Train Loss: 1.2140 | Val Loss: 1.1712 | LR: 0.000001 | Time: 212.81s


Epoch 10 [Train]:   0%|          | 0/120 [00:00<?, ?it/s]

Epoch 10 [Val]:   0%|          | 0/7 [00:00<?, ?it/s]

Saved best model to `fine_tuned_bart_cosine_3` at epoch 10
Epoch 10 | Train Loss: 1.1692 | Val Loss: 1.1551 | LR: 0.000010 | Time: 211.61s


VBox(children=(Label(value='0.423 MB of 0.423 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
best_val_loss,█▃▂▁▁▁▁▁▁
epoch,▁▂▃▃▄▅▆▆▇█
lr,▅█▂▅▇▁▇▅▂█
train_loss,█▄▂▂▁▁▁▁▁▁
val_loss,█▃▂▁▁▁▁▁▁▁

0,1
best_val_loss,1.17121
epoch,10.0
lr,1e-05
train_loss,1.16916
val_loss,1.15506


In [16]:
# tokenizer = PegasusTokenizer.from_pretrained(save_dir)
# model = PegasusForConditionalGeneration.from_pretrained(save_dir).to(device)
from transformers import BartTokenizer, BartForConditionalGeneration

tokenizer = BartTokenizer.from_pretrained(save_dir)
model = BartForConditionalGeneration.from_pretrained(save_dir).to(device)


In [17]:
test_dataset = SummarizationDataset(test_df, tokenizer)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE,collate_fn=collate_fn)
len(test_loader)

53

In [18]:
model.to(device)
model.eval()

predictions = []

with torch.no_grad():
    for step, batch in enumerate(tqdm(test_loader, desc="Generating summaries")):
        batch = {k: v.to(device) for k, v in batch.items()}

        output_ids = model.generate(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
            max_length=MAX_LENGTH_SUMMARY,
            decoder_start_token_id=tokenizer.pad_token_id,
            num_beams=4,
            length_penalty=2.0,
            early_stopping=True
        )

        batch_preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
        predictions.extend(batch_preds)
test_sample = test_sample.iloc[:len(predictions)].copy()
test_sample["predicted_summary"] = predictions
test_sample.to_csv(output_path, index=False)

print(f"✅ File has been saved at: {output_path}")

Generating summaries:   0%|          | 0/53 [00:00<?, ?it/s]

✅ File has been saved at: ../dataft/test_pred_3.csv


In [20]:
test_pred = pd.read_csv(output_path)
display(test_pred[["articles","summaries", "predicted_summary"]].head(2))

# Kiểm tra cột cần thiết
if "summaries" in test_pred.columns and "predicted_summary" in test_pred.columns:
    references = test_pred["summaries"].fillna("<empty>").astype(str).tolist()
    predictions = test_pred["predicted_summary"].fillna("<empty>").astype(str).tolist()

    # Lọc các cặp hợp lệ
    valid_pairs = [
        (pred, ref) for pred, ref in zip(predictions, references)
        if pred.strip() and pred != "<empty>" and ref.strip()
    ]
    if not valid_pairs:
        print("Không có cặp hợp lệ để tính điểm.")
    else:
        filtered_preds, filtered_refs = zip(*valid_pairs)

        # ROUGE
        rouge = Rouge()
        rouge_scores = rouge.get_scores(filtered_preds, filtered_refs, avg=True)
        print("ROUGE scores:")
        print(f"ROUGE-1: {rouge_scores['rouge-1']['f']:.4f}")
        print(f"ROUGE-2: {rouge_scores['rouge-2']['f']:.4f}")
        print(f"ROUGE-L: {rouge_scores['rouge-l']['f']:.4f}")

        # BERTScore
        P, R, F1 = bert_score(filtered_preds, filtered_refs, lang="en", verbose=False)
        print("BERTScore:")
        print(f"Precision: {P.mean().item():.4f}")
        print(f"Recall:    {R.mean().item():.4f}")
        print(f"F1:        {F1.mean().item():.4f}")

        # METEOR
        print("METEOR Score (trung bình):")
        meteor_scores = [single_meteor_score(ref.split(), pred.split()) 
                        for pred, ref in zip(filtered_preds, filtered_refs)]
        print(f"METEOR: {sum(meteor_scores)/len(meteor_scores):.4f}")

else:
    print("Không tìm thấy đủ cột 'summaries' và 'predicted_summary' để tính điểm.")


Unnamed: 0,articles,summaries,predicted_summary
0,A Florida bus passenger was arrested for throw...,"Joel Parker, 33, was riding the bus in St John...","Joel Parker, 33, was arrested for throwing a ..."
1,Aston Villa may be able to sign Cordoba strike...,Aston Villa have held talks over Cordoba strik...,Aston Villa could sign Cordoba striker Florin...


ROUGE scores:
ROUGE-1: 0.3843
ROUGE-2: 0.1697
ROUGE-L: 0.3613


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTScore:
Precision: 0.8800
Recall:    0.8887
F1:        0.8842
METEOR Score (trung bình):
METEOR: 0.3590
