In [17]:
import pandas as pd
import torch
import os 
# os.environ["WANDB_DISABLED"] = "true"
from tqdm import tqdm
from tqdm.auto import tqdm
from datetime import datetime
import wandb
import time
import os
from transformers import get_cosine_schedule_with_warmup
from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch.optim import AdamW

from torch.utils.data import DataLoader, Dataset
from rouge import Rouge
from bert_score import score as bert_score
from nltk.translate.meteor_score import single_meteor_score
from nltk.tokenize import word_tokenize



In [2]:
NUM_EPOCHS = 30
BATCH_SIZE = 8
FRAC_SAMPLE = 0.01
MAX_LENGTH_ARTICLE = 512
MIN_LENGTH_ARTICLE = 50
MAX_LENGTH_SUMMARY = 128
MIN_LENGTH_SUMMARY = 20
HIDDEN_DIM = 128
LEARNING_RATE = 1e-5
MAX_PLATEAU_COUNT = 5
WEIGHT_DECAY = 1e-4
NUM_CYCLES = 5


model_dir = "../Model"
datafilter = "../dataft"
save_dir = "fine_tuned_t5_small"
output_path = os.path.join(datafilter, "test_pred_t5_small_0.csv")
os.makedirs(datafilter, exist_ok=True)
os.makedirs(model_dir, exist_ok=True)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
os.makedirs(save_dir, exist_ok=True)

In [3]:
# train_data = pd.read_csv("../dataset/train.csv")
# validation_data = pd.read_csv("../dataset/validation.csv")
# test_data = pd.read_csv("../dataset/test.csv")

# # add col
# train_data.rename(columns={"highlights": "summaries", "article":"articles"}, inplace=True)
# validation_data.rename(columns={"highlights": "summaries","article":"articles"}, inplace=True)
# test_data.rename(columns={"highlights": "summaries", "article":"articles"}, inplace=True)

# train_data["article_word_count"] = train_data["articles"].astype(str).apply(lambda x: len(x.split()))
# train_data["summary_word_count"] = train_data["summaries"].astype(str).apply(lambda x: len(x.split()))

# validation_data["article_word_count"] = validation_data["articles"].astype(str).apply(lambda x: len(x.split()))
# validation_data["summary_word_count"] = validation_data["summaries"].astype(str).apply(lambda x: len(x.split()))

# test_data["article_word_count"] = test_data["articles"].astype(str).apply(lambda x: len(x.split()))
# test_data["summary_word_count"] = test_data["summaries"].astype(str).apply(lambda x: len(x.split()))

# # filter range
# train_data = train_data[
#     (train_data["article_word_count"] <= MAX_LENGTH_ARTICLE) & 
#     (train_data["article_word_count"] >= MIN_LENGTH_ARTICLE) &
#     (train_data["summary_word_count"] <= MAX_LENGTH_SUMMARY) &
#     (train_data["summary_word_count"] >= MIN_LENGTH_SUMMARY)
# ]

# validation_data = validation_data[
#     (validation_data["article_word_count"] <= MAX_LENGTH_ARTICLE) & 
#     (validation_data["article_word_count"] >= MIN_LENGTH_ARTICLE) &
#     (validation_data["summary_word_count"] <= MAX_LENGTH_SUMMARY) &
#     (validation_data["summary_word_count"] >= MIN_LENGTH_SUMMARY)
# ]
# test_data = test_data[
#     (test_data["article_word_count"] <= MAX_LENGTH_ARTICLE) & 
#     (test_data["article_word_count"] >= MIN_LENGTH_ARTICLE) &
#     (test_data["summary_word_count"] <= MAX_LENGTH_SUMMARY) &
#     (test_data["summary_word_count"] >= MIN_LENGTH_SUMMARY)
# ]

# train_sample = train_data.sample(frac=FRAC_SAMPLE, random_state=1)
# validation_sample = validation_data.sample(frac=FRAC_SAMPLE, random_state=1)
# test_sample = test_data.sample(frac=1, random_state=1)
# train_sample.info()
# print("\n")
# validation_sample.info()
# train_sample.to_csv(os.path.join(datafilter,"train_sample.csv"), index=False)
# test_sample.to_csv(os.path.join(datafilter,"test_sample.csv"), index=False)
# validation_sample.to_csv(os.path.join(datafilter,"validation_sample.csv"), index=False)


In [4]:
train_sample = pd.read_csv("../dataft/train_sample.csv")
validation_sample = pd.read_csv("../dataft/validation_sample.csv")
test_sample = pd.read_csv("../dataft/test_sample.csv")
train_sample.info()
test_sample.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 960 entries, 0 to 959
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   id                  960 non-null    object
 1   articles            960 non-null    object
 2   summaries           960 non-null    object
 3   article_word_count  960 non-null    int64 
 4   summary_word_count  960 non-null    int64 
dtypes: int64(2), object(3)
memory usage: 37.6+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42 entries, 0 to 41
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   id                  42 non-null     object
 1   articles            42 non-null     object
 2   summaries           42 non-null     object
 3   article_word_count  42 non-null     int64 
 4   summary_word_count  42 non-null     int64 
dtypes: int64(2), object(3)
memory usage: 1.8+ KB


In [5]:
# Custom Dataset
class SummarizationDataset(Dataset):
    def __init__(self, data, tokenizer, max_input_length=MAX_LENGTH_ARTICLE, max_output_length=MAX_LENGTH_SUMMARY):
        self.data = data
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length
        self.max_output_length = max_output_length
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        article = self.data.iloc[index]["articles"]
        summary = self.data.iloc[index]["summaries"]
        
        # T5 need prefix:
        input_text = "summarize: " + article
        inputs = self.tokenizer(
            input_text,
            max_length=self.max_input_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt"
        )
        outputs = self.tokenizer(
            summary,
            max_length=self.max_output_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt"
        )
        
        return {
            "input_ids": inputs.input_ids.squeeze(),
            "attention_mask": inputs.attention_mask.squeeze(),
            "labels": outputs.input_ids.squeeze()
        }
train_df = train_sample
val_df = validation_sample
test_df = test_sample


In [6]:
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small").to(device)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [7]:
train_dataset = SummarizationDataset(train_df, tokenizer)
val_dataset = SummarizationDataset(val_df, tokenizer)
test_dataset = SummarizationDataset(test_df, tokenizer)

In [8]:
# DataLoader
def collate_fn(batch):
    input_ids = torch.stack([item["input_ids"] for item in batch])
    attention_mask = torch.stack([item["attention_mask"] for item in batch])
    labels = torch.stack([item["labels"] for item in batch])
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }

In [9]:
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn)

In [10]:
# Optimizer and Scheduler
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
num_training_steps = len(train_loader) * NUM_EPOCHS
scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.2 * num_training_steps),
    num_training_steps=num_training_steps,
    num_cycles=NUM_CYCLES
)

In [11]:
wandb.init(
    project="Finetune-Summarization",
    name=f"t5small-{datetime.now().strftime('%Y%m%d-%H%M%S')}",
    config={
        "model": "t5-small",
        "batch_size": BATCH_SIZE,
        "learning_rate": LEARNING_RATE,
        "weight_decay": WEIGHT_DECAY,
        "num_epochs": NUM_EPOCHS,
        "num_cycles": NUM_CYCLES,
        "data_ratio": FRAC_SAMPLE,
        "warm_up": "Cosine"
    }
)


[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mvubkk67[0m ([33mvubkk67-hanoi-university-of-science-and-technology[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [12]:
# Training loop
best_val_loss = float("inf")
plateau_count = 0
wandb.watch(model)

for epoch in range(NUM_EPOCHS):
    start_time = time.time()
    model.train()
    train_loss = 0.0

    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1} [Train]"):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss

        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

        train_loss += loss.item()

    train_loss /= len(train_loader)

    # Validation
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Epoch {epoch+1} [Val]"):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            val_loss += outputs.loss.item()
        val_loss /= len(val_loader)
        current_lr = scheduler.get_last_lr()[0]

    # W&B log
    wandb.log({
        "epoch": epoch + 1,
        "train_loss": train_loss,
        "val_loss": val_loss,
        "lr": current_lr,
        "best_val_loss": best_val_loss
    })

    # Save best model or increment plateau counter
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        plateau_count = 0  # Reset plateau counter
        model.save_pretrained(save_dir)
        tokenizer.save_pretrained(save_dir)
        print(f"Saved best model to `{save_dir}` at epoch {epoch+1}")
    else:
        plateau_count += 1
        print(f"No improvement in val_loss. Plateau count: {plateau_count}/{MAX_PLATEAU_COUNT}")

    # Early stopping check
    if plateau_count >= MAX_PLATEAU_COUNT:
        print(f"Early stopping triggered at epoch {epoch+1} due to {plateau_count} consecutive non-improvements.")
        break

    print(
        f"Epoch {epoch+1:02d} | "
        f"Train Loss: {train_loss:.4f} | "
        f"Val Loss: {val_loss:.4f} | "
        f"LR: {current_lr:.6f} | "
        f"Time: {time.time() - start_time:.2f}s"
    )

wandb.finish()

Epoch 1 [Train]:   0%|          | 0/120 [00:00<?, ?it/s]

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch 1 [Val]:   0%|          | 0/7 [00:00<?, ?it/s]

Saved best model to `fine_tuned_t5_small` at epoch 1
Epoch 01 | Train Loss: 10.6276 | Val Loss: 9.4441 | LR: 0.000002 | Time: 115.44s


Epoch 2 [Train]:   0%|          | 0/120 [00:00<?, ?it/s]

Epoch 2 [Val]:   0%|          | 0/7 [00:00<?, ?it/s]

Saved best model to `fine_tuned_t5_small` at epoch 2
Epoch 02 | Train Loss: 9.1191 | Val Loss: 7.3647 | LR: 0.000003 | Time: 118.83s


Epoch 3 [Train]:   0%|          | 0/120 [00:00<?, ?it/s]

Epoch 3 [Val]:   0%|          | 0/7 [00:00<?, ?it/s]

Saved best model to `fine_tuned_t5_small` at epoch 3
Epoch 03 | Train Loss: 6.1366 | Val Loss: 3.8329 | LR: 0.000005 | Time: 118.67s


Epoch 4 [Train]:   0%|          | 0/120 [00:00<?, ?it/s]

Epoch 4 [Val]:   0%|          | 0/7 [00:00<?, ?it/s]

Saved best model to `fine_tuned_t5_small` at epoch 4
Epoch 04 | Train Loss: 2.8895 | Val Loss: 1.4977 | LR: 0.000007 | Time: 118.38s


Epoch 5 [Train]:   0%|          | 0/120 [00:00<?, ?it/s]

Epoch 5 [Val]:   0%|          | 0/7 [00:00<?, ?it/s]

Saved best model to `fine_tuned_t5_small` at epoch 5
Epoch 05 | Train Loss: 1.8569 | Val Loss: 1.2144 | LR: 0.000008 | Time: 119.77s


Epoch 6 [Train]:   0%|          | 0/120 [00:00<?, ?it/s]

Epoch 6 [Val]:   0%|          | 0/7 [00:00<?, ?it/s]

Saved best model to `fine_tuned_t5_small` at epoch 6
Epoch 06 | Train Loss: 1.6044 | Val Loss: 1.1754 | LR: 0.000010 | Time: 117.34s


Epoch 7 [Train]:   0%|          | 0/120 [00:00<?, ?it/s]

Epoch 7 [Val]:   0%|          | 0/7 [00:00<?, ?it/s]

Saved best model to `fine_tuned_t5_small` at epoch 7
Epoch 07 | Train Loss: 1.4568 | Val Loss: 1.1598 | LR: 0.000006 | Time: 117.51s


Epoch 8 [Train]:   0%|          | 0/120 [00:00<?, ?it/s]

Epoch 8 [Val]:   0%|          | 0/7 [00:00<?, ?it/s]

Saved best model to `fine_tuned_t5_small` at epoch 8
Epoch 08 | Train Loss: 1.3743 | Val Loss: 1.1545 | LR: 0.000001 | Time: 125.60s


Epoch 9 [Train]:   0%|          | 0/120 [00:00<?, ?it/s]

Epoch 9 [Val]:   0%|          | 0/7 [00:00<?, ?it/s]

Saved best model to `fine_tuned_t5_small` at epoch 9
Epoch 09 | Train Loss: 1.3478 | Val Loss: 1.1538 | LR: 0.000001 | Time: 122.91s


Epoch 10 [Train]:   0%|          | 0/120 [00:00<?, ?it/s]

Epoch 10 [Val]:   0%|          | 0/7 [00:00<?, ?it/s]

Saved best model to `fine_tuned_t5_small` at epoch 10
Epoch 10 | Train Loss: 1.3266 | Val Loss: 1.1457 | LR: 0.000007 | Time: 116.22s


Epoch 11 [Train]:   0%|          | 0/120 [00:00<?, ?it/s]

Epoch 11 [Val]:   0%|          | 0/7 [00:00<?, ?it/s]

Saved best model to `fine_tuned_t5_small` at epoch 11
Epoch 11 | Train Loss: 1.2315 | Val Loss: 1.1361 | LR: 0.000010 | Time: 116.68s


Epoch 12 [Train]:   0%|          | 0/120 [00:00<?, ?it/s]

Epoch 12 [Val]:   0%|          | 0/7 [00:00<?, ?it/s]

Saved best model to `fine_tuned_t5_small` at epoch 12
Epoch 12 | Train Loss: 1.1349 | Val Loss: 1.1260 | LR: 0.000005 | Time: 117.41s


Epoch 13 [Train]:   0%|          | 0/120 [00:00<?, ?it/s]

Epoch 13 [Val]:   0%|          | 0/7 [00:00<?, ?it/s]

Saved best model to `fine_tuned_t5_small` at epoch 13
Epoch 13 | Train Loss: 1.1089 | Val Loss: 1.1232 | LR: 0.000000 | Time: 114.86s


Epoch 14 [Train]:   0%|          | 0/120 [00:00<?, ?it/s]

Epoch 14 [Val]:   0%|          | 0/7 [00:00<?, ?it/s]

Saved best model to `fine_tuned_t5_small` at epoch 14
Epoch 14 | Train Loss: 1.1066 | Val Loss: 1.1224 | LR: 0.000002 | Time: 113.93s


Epoch 15 [Train]:   0%|          | 0/120 [00:00<?, ?it/s]

Epoch 15 [Val]:   0%|          | 0/7 [00:00<?, ?it/s]

Saved best model to `fine_tuned_t5_small` at epoch 15
Epoch 15 | Train Loss: 1.0896 | Val Loss: 1.1125 | LR: 0.000009 | Time: 113.44s


Epoch 16 [Train]:   0%|          | 0/120 [00:00<?, ?it/s]

Epoch 16 [Val]:   0%|          | 0/7 [00:00<?, ?it/s]

Saved best model to `fine_tuned_t5_small` at epoch 16
Epoch 16 | Train Loss: 1.0507 | Val Loss: 1.0955 | LR: 0.000009 | Time: 116.88s


Epoch 17 [Train]:   0%|          | 0/120 [00:00<?, ?it/s]

Epoch 17 [Val]:   0%|          | 0/7 [00:00<?, ?it/s]

Saved best model to `fine_tuned_t5_small` at epoch 17
Epoch 17 | Train Loss: 1.0265 | Val Loss: 1.0831 | LR: 0.000004 | Time: 121.02s


Epoch 18 [Train]:   0%|          | 0/120 [00:00<?, ?it/s]

Epoch 18 [Val]:   0%|          | 0/7 [00:00<?, ?it/s]

Saved best model to `fine_tuned_t5_small` at epoch 18
Epoch 18 | Train Loss: 1.0161 | Val Loss: 1.0808 | LR: 0.000000 | Time: 115.40s


Epoch 19 [Train]:   0%|          | 0/120 [00:00<?, ?it/s]

Epoch 19 [Val]:   0%|          | 0/7 [00:00<?, ?it/s]

Saved best model to `fine_tuned_t5_small` at epoch 19
Epoch 19 | Train Loss: 1.0143 | Val Loss: 1.0786 | LR: 0.000004 | Time: 117.50s


Epoch 20 [Train]:   0%|          | 0/120 [00:00<?, ?it/s]

Epoch 20 [Val]:   0%|          | 0/7 [00:00<?, ?it/s]

Saved best model to `fine_tuned_t5_small` at epoch 20
Epoch 20 | Train Loss: 1.0092 | Val Loss: 1.0647 | LR: 0.000009 | Time: 118.94s


Epoch 21 [Train]:   0%|          | 0/120 [00:00<?, ?it/s]

Epoch 21 [Val]:   0%|          | 0/7 [00:00<?, ?it/s]

Saved best model to `fine_tuned_t5_small` at epoch 21
Epoch 21 | Train Loss: 0.9866 | Val Loss: 1.0497 | LR: 0.000009 | Time: 116.82s


Epoch 22 [Train]:   0%|          | 0/120 [00:00<?, ?it/s]

Epoch 22 [Val]:   0%|          | 0/7 [00:00<?, ?it/s]

Saved best model to `fine_tuned_t5_small` at epoch 22
Epoch 22 | Train Loss: 0.9675 | Val Loss: 1.0430 | LR: 0.000003 | Time: 116.96s


Epoch 23 [Train]:   0%|          | 0/120 [00:00<?, ?it/s]

Epoch 23 [Val]:   0%|          | 0/7 [00:00<?, ?it/s]

Saved best model to `fine_tuned_t5_small` at epoch 23
Epoch 23 | Train Loss: 0.9647 | Val Loss: 1.0420 | LR: 0.000000 | Time: 116.49s


Epoch 24 [Train]:   0%|          | 0/120 [00:00<?, ?it/s]

Epoch 24 [Val]:   0%|          | 0/7 [00:00<?, ?it/s]

Saved best model to `fine_tuned_t5_small` at epoch 24
Epoch 24 | Train Loss: 0.9605 | Val Loss: 1.0395 | LR: 0.000005 | Time: 115.72s


Epoch 25 [Train]:   0%|          | 0/120 [00:00<?, ?it/s]

Epoch 25 [Val]:   0%|          | 0/7 [00:00<?, ?it/s]

Saved best model to `fine_tuned_t5_small` at epoch 25
Epoch 25 | Train Loss: 0.9540 | Val Loss: 1.0322 | LR: 0.000010 | Time: 117.43s


Epoch 26 [Train]:   0%|          | 0/120 [00:00<?, ?it/s]

Epoch 26 [Val]:   0%|          | 0/7 [00:00<?, ?it/s]

Saved best model to `fine_tuned_t5_small` at epoch 26
Epoch 26 | Train Loss: 0.9434 | Val Loss: 1.0253 | LR: 0.000007 | Time: 117.47s


Epoch 27 [Train]:   0%|          | 0/120 [00:00<?, ?it/s]

Epoch 27 [Val]:   0%|          | 0/7 [00:00<?, ?it/s]

Saved best model to `fine_tuned_t5_small` at epoch 27
Epoch 27 | Train Loss: 0.9342 | Val Loss: 1.0224 | LR: 0.000001 | Time: 118.19s


Epoch 28 [Train]:   0%|          | 0/120 [00:00<?, ?it/s]

Epoch 28 [Val]:   0%|          | 0/7 [00:00<?, ?it/s]

Saved best model to `fine_tuned_t5_small` at epoch 28
Epoch 28 | Train Loss: 0.9299 | Val Loss: 1.0222 | LR: 0.000001 | Time: 117.50s


Epoch 29 [Train]:   0%|          | 0/120 [00:00<?, ?it/s]

Epoch 29 [Val]:   0%|          | 0/7 [00:00<?, ?it/s]

Saved best model to `fine_tuned_t5_small` at epoch 29
Epoch 29 | Train Loss: 0.9273 | Val Loss: 1.0201 | LR: 0.000006 | Time: 122.34s


Epoch 30 [Train]:   0%|          | 0/120 [00:00<?, ?it/s]

Epoch 30 [Val]:   0%|          | 0/7 [00:00<?, ?it/s]

Saved best model to `fine_tuned_t5_small` at epoch 30
Epoch 30 | Train Loss: 0.9209 | Val Loss: 1.0156 | LR: 0.000010 | Time: 116.57s


VBox(children=(Label(value='0.225 MB of 0.225 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
best_val_loss,█▆▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
epoch,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
lr,▂▃▅▆▇█▅▁▂▆█▅▁▃▇█▄▁▄█▇▃▁▅█▆▂▁▅█
train_loss,█▇▅▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_loss,█▆▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
best_val_loss,1.02014
epoch,30.0
lr,1e-05
train_loss,0.9209
val_loss,1.01561


In [13]:
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained(save_dir).to(device)


In [14]:
len(test_loader)

6

In [15]:
# Generate predictions
model.eval()
predictions = []

with torch.no_grad():
    for step, batch in enumerate(tqdm(test_loader, desc="Generating summaries")):
        batch = {k: v.to(device) for k, v in batch.items()}

        output_ids = model.generate(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
            max_length=MAX_LENGTH_SUMMARY,
            num_beams=4,
            length_penalty=2.0,
            early_stopping=True
        )

        batch_preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
        predictions.extend(batch_preds)

# Save predictions
test_sample = test_df.iloc[:len(predictions)].copy()
test_sample["predicted_summary"] = predictions
test_sample.to_csv(output_path, index=False)

print(f"✅ File has been saved at: {output_path}")

Generating summaries:   0%|          | 0/6 [00:00<?, ?it/s]

✅ File has been saved at: ../dataft/test_pred_t5_small_0.csv


In [19]:
test_pred = pd.read_csv(output_path)
display(test_pred[["articles","summaries", "predicted_summary"]].head(2))

# Kiểm tra cột cần thiết
if "summaries" in test_pred.columns and "predicted_summary" in test_pred.columns:
    references = test_pred["summaries"].fillna("<empty>").astype(str).tolist()
    predictions = test_pred["predicted_summary"].fillna("<empty>").astype(str).tolist()

    # Lọc các cặp hợp lệ
    valid_pairs = [
        (pred, ref) for pred, ref in zip(predictions, references)
        if pred.strip() and pred != "<empty>" and ref.strip()
    ]
    if not valid_pairs:
        print("Không có cặp hợp lệ để tính điểm.")
    else:
        filtered_preds, filtered_refs = zip(*valid_pairs)

        # ROUGE
        rouge = Rouge()
        rouge_scores = rouge.get_scores(filtered_preds, filtered_refs, avg=True)
        print("ROUGE scores:")
        print(f"ROUGE-1: {rouge_scores['rouge-1']['f']:.4f}")
        print(f"ROUGE-2: {rouge_scores['rouge-2']['f']:.4f}")
        print(f"ROUGE-L: {rouge_scores['rouge-l']['f']:.4f}")

        # BERTScore
        P, R, F1 = bert_score(filtered_preds, filtered_refs, lang="en", verbose=False)
        print("BERTScore:")
        print(f"Precision: {P.mean().item():.4f}")
        print(f"Recall:    {R.mean().item():.4f}")
        print(f"F1:        {F1.mean().item():.4f}")

        # METEOR
        print("METEOR Score (trung bình):")
        meteor_scores = [single_meteor_score(ref.split(), pred.split()) 
                        for pred, ref in zip(filtered_preds, filtered_refs)]
        print(f"METEOR: {sum(meteor_scores)/len(meteor_scores):.4f}")

else:
    print("Không tìm thấy đủ cột 'summaries' và 'predicted_summary' để tính điểm.")


Unnamed: 0,articles,summaries,predicted_summary
0,A Florida bus passenger was arrested for throw...,"Joel Parker, 33, was riding the bus in St John...","Joel Parker, 33, was about to get off the Suns..."
1,Aston Villa may be able to sign Cordoba strike...,Aston Villa have held talks over Cordoba strik...,Aston Villa could sign Cordoba striker Florin ...


ROUGE scores:
ROUGE-1: 0.4039
ROUGE-2: 0.1948
ROUGE-L: 0.3863


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTScore:
Precision: 0.8880
Recall:    0.8710
F1:        0.8793
METEOR Score (trung bình):
METEOR: 0.2937
