## Weighted Averaging

In [1]:
import torch
import pandas as pd
from torch.utils.data import DataLoader, Dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, BartTokenizer, BartForConditionalGeneration, PegasusTokenizer, PegasusForConditionalGeneration
from tqdm import tqdm
import os
import numpy as np
from rouge import Rouge


2025-05-11 12:09:08.029441: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-11 12:09:08.176261: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746940148.294715   14567 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746940148.339106   14567 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1746940148.465545   14567 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

In [2]:
NUM_EPOCHS = 50
BATCH_SIZE = 8
FRAC_SAMPLE = 0.01
MAX_LENGTH_ARTICLE = 512
MIN_LENGTH_ARTICLE = 50
MAX_LENGTH_SUMMARY = 128
MIN_LENGTH_SUMMARY = 20
HIDDEN_DIM = 128
LEARNING_RATE = 1e-5
PATIENCE = 5  # For early stopping
WEIGHT_DECAY = 1e-4
NUM_CYCLES = 5

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
datafilter = "../dataft"
output_path = os.path.join(datafilter, "test_pred_ensemble.csv")
save_dir_bart = "fine_tuned_bart_cosine_3"
save_dir_t5 = "fine_tuned_t5_small"
save_dir_pegasus = "fine_tuned_pegasus_custom"

In [3]:
train_data = pd.read_csv("../dataset/train.csv")
validation_data = pd.read_csv("../dataset/validation.csv")
test_data = pd.read_csv("../dataset/test.csv")

# add col
train_data.rename(columns={"highlights": "summaries", "article":"articles"}, inplace=True)
validation_data.rename(columns={"highlights": "summaries","article":"articles"}, inplace=True)
test_data.rename(columns={"highlights": "summaries", "article":"articles"}, inplace=True)

train_data["article_word_count"] = train_data["articles"].astype(str).apply(lambda x: len(x.split()))
train_data["summary_word_count"] = train_data["summaries"].astype(str).apply(lambda x: len(x.split()))

validation_data["article_word_count"] = validation_data["articles"].astype(str).apply(lambda x: len(x.split()))
validation_data["summary_word_count"] = validation_data["summaries"].astype(str).apply(lambda x: len(x.split()))

test_data["article_word_count"] = test_data["articles"].astype(str).apply(lambda x: len(x.split()))
test_data["summary_word_count"] = test_data["summaries"].astype(str).apply(lambda x: len(x.split()))

# filter range
train_data = train_data[
    (train_data["article_word_count"] <= MAX_LENGTH_ARTICLE) & 
    (train_data["article_word_count"] >= MIN_LENGTH_ARTICLE) &
    (train_data["summary_word_count"] <= MAX_LENGTH_SUMMARY) &
    (train_data["summary_word_count"] >= MIN_LENGTH_SUMMARY)
]

validation_data = validation_data[
    (validation_data["article_word_count"] <= MAX_LENGTH_ARTICLE) & 
    (validation_data["article_word_count"] >= MIN_LENGTH_ARTICLE) &
    (validation_data["summary_word_count"] <= MAX_LENGTH_SUMMARY) &
    (validation_data["summary_word_count"] >= MIN_LENGTH_SUMMARY)
]
test_data = test_data[
    (test_data["article_word_count"] <= MAX_LENGTH_ARTICLE) & 
    (test_data["article_word_count"] >= MIN_LENGTH_ARTICLE) &
    (test_data["summary_word_count"] <= MAX_LENGTH_SUMMARY) &
    (test_data["summary_word_count"] >= MIN_LENGTH_SUMMARY)
]

train_sample = train_data.sample(frac=FRAC_SAMPLE, random_state=1)
validation_sample = validation_data.sample(frac=FRAC_SAMPLE, random_state=1)
test_sample = test_data.sample(frac=0.1, random_state=1)
train_sample.info()
print("\n")
validation_sample.info()
# train_sample.to_csv(os.path.join(datafilter,"train_sample.csv"), index=False)
# test_sample.to_csv(os.path.join(datafilter,"test_sample.csv"), index=False)
# validation_sample.to_csv(os.path.join(datafilter,"validation_sample.csv"), index=False)


<class 'pandas.core.frame.DataFrame'>
Index: 960 entries, 144417 to 108633
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   id                  960 non-null    object
 1   articles            960 non-null    object
 2   summaries           960 non-null    object
 3   article_word_count  960 non-null    int64 
 4   summary_word_count  960 non-null    int64 
dtypes: int64(2), object(3)
memory usage: 45.0+ KB


<class 'pandas.core.frame.DataFrame'>
Index: 50 entries, 8901 to 12116
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   id                  50 non-null     object
 1   articles            50 non-null     object
 2   summaries           50 non-null     object
 3   article_word_count  50 non-null     int64 
 4   summary_word_count  50 non-null     int64 
dtypes: int64(2), object(3)
memory usage: 2.3+ KB


In [4]:
test_df = test_sample

In [5]:
class BartBaseDataset(Dataset):
    def __init__(self, data, tokenizer, max_input_length=MAX_LENGTH_ARTICLE, max_output_length=MAX_LENGTH_SUMMARY):
        self.data = data
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length
        self.max_output_length = max_output_length
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        article = self.data.iloc[index]["articles"]
        summary = self.data.iloc[index]["summaries"]
        input_ids = self.tokenizer.encode(article, max_length=self.max_input_length, truncation=True, padding="max_length")
        output_ids = self.tokenizer.encode(summary, max_length=self.max_output_length, truncation=True, padding="max_length")
        return {"input_ids": input_ids, "attention_mask": [int(token_id != 0) for token_id in input_ids], "decoder_input_ids": output_ids[:-1], "decoder_attention_mask": [1] * (len(output_ids) - 1), "labels": output_ids[1:]}

def collate_bart(batch):
    input_ids = [item["input_ids"] for item in batch]
    attention_mask = [item["attention_mask"] for item in batch]
    decoder_input_ids = [item["decoder_input_ids"] for item in batch]
    decoder_attention_mask = [item["decoder_attention_mask"] for item in batch]
    labels = [item["labels"] for item in batch]
    max_input_length = max(len(ids) for ids in input_ids)
    max_output_length = max(len(ids) for ids in decoder_input_ids)
    input_ids = [ids + [0] * (max_input_length - len(ids)) for ids in input_ids]
    attention_mask = [mask + [0] * (max_input_length - len(mask)) for mask in attention_mask]
    decoder_input_ids = [ids + [0] * (max_output_length - len(ids)) for ids in decoder_input_ids]
    decoder_attention_mask = [mask + [0] * (max_output_length - len(mask)) for mask in decoder_attention_mask]
    labels = [ids + [-100] * (max_output_length - len(ids)) for ids in labels]
    return {"input_ids": torch.tensor(input_ids), "attention_mask": torch.tensor(attention_mask), "decoder_input_ids": torch.tensor(decoder_input_ids), "decoder_attention_mask": torch.tensor(decoder_attention_mask), "labels": torch.tensor(labels)}

bart_tokenizer = BartTokenizer.from_pretrained(save_dir_bart)
bart_model = BartForConditionalGeneration.from_pretrained(save_dir_bart).to(device)

test_dataset_bart = BartBaseDataset(test_df, bart_tokenizer)
test_loader_bart = DataLoader(test_dataset_bart, batch_size=8, collate_fn=collate_bart)

In [6]:
class T5SmallDataset(Dataset):
    def __init__(self, data, tokenizer, max_input_length=MAX_LENGTH_ARTICLE, max_output_length=MAX_LENGTH_SUMMARY):
        self.data = data
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length
        self.max_output_length = max_output_length
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        article = self.data.iloc[index]["articles"]
        summary = self.data.iloc[index]["summaries"]
        
        # T5 need prefix:
        input_text = "summarize: " + article
        inputs = self.tokenizer(
            input_text,
            max_length=self.max_input_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt"
        )
        outputs = self.tokenizer(
            summary,
            max_length=self.max_output_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt"
        )
        
        return {
            "input_ids": inputs.input_ids.squeeze(),
            "attention_mask": inputs.attention_mask.squeeze(),
            "labels": outputs.input_ids.squeeze()
        }
def collate_t5(batch):
    input_ids = torch.stack([item["input_ids"] for item in batch])
    attention_mask = torch.stack([item["attention_mask"] for item in batch])
    labels = torch.stack([item["labels"] for item in batch])
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }
    
t5_tokenizer = T5Tokenizer.from_pretrained(save_dir_t5)
t5_model = T5ForConditionalGeneration.from_pretrained(save_dir_t5).to(device)

test_dataset_t5 = T5SmallDataset(test_df, t5_tokenizer)
test_loader_t5 = DataLoader(test_dataset_t5, batch_size=8, collate_fn=collate_t5)

In [7]:
pegasus_tokenizer = PegasusTokenizer.from_pretrained(save_dir_pegasus)
pegasus_model = PegasusForConditionalGeneration.from_pretrained(save_dir_pegasus).to(device)

class PegaCustomDataset(Dataset):
    def __init__(self, data, tokenizer, max_input_length=MAX_LENGTH_ARTICLE, max_output_length=MAX_LENGTH_SUMMARY):
        self.data = data
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length
        self.max_output_length = max_output_length
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        article = self.data.iloc[index]["articles"]
        summary = self.data.iloc[index]["summaries"]
        input_ids = self.tokenizer.encode(article, max_length=self.max_input_length, truncation=True, padding="max_length")
        output_ids = self.tokenizer.encode(summary, max_length=self.max_output_length, truncation=True, padding="max_length")
        return {"input_ids": input_ids, "attention_mask": [int(token_id != 0) for token_id in input_ids], "decoder_input_ids": output_ids[:-1], "decoder_attention_mask": [1] * (len(output_ids) - 1), "labels": output_ids[1:]}

def collate_pega(batch):
    input_ids = [item["input_ids"] for item in batch]
    attention_mask = [item["attention_mask"] for item in batch]
    decoder_input_ids = [item["decoder_input_ids"] for item in batch]
    decoder_attention_mask = [item["decoder_attention_mask"] for item in batch]
    labels = [item["labels"] for item in batch]
    max_input_length = max(len(ids) for ids in input_ids)
    max_output_length = max(len(ids) for ids in decoder_input_ids)
    input_ids = [ids + [0] * (max_input_length - len(ids)) for ids in input_ids]
    attention_mask = [mask + [0] * (max_input_length - len(mask)) for mask in attention_mask]
    decoder_input_ids = [ids + [0] * (max_output_length - len(ids)) for ids in decoder_input_ids]
    decoder_attention_mask = [mask + [0] * (max_output_length - len(mask)) for mask in decoder_attention_mask]
    labels = [ids + [-100] * (max_output_length - len(ids)) for ids in labels]
    
    labels = torch.tensor(labels)
    labels[labels == pegasus_tokenizer.pad_token_id] = -100
    
    return {"input_ids": torch.tensor(input_ids), "attention_mask": torch.tensor(attention_mask), "decoder_input_ids": torch.tensor(decoder_input_ids), "decoder_attention_mask": torch.tensor(decoder_attention_mask), "labels": torch.tensor(labels)}

test_dataset_pegasus = PegaCustomDataset(test_df, pegasus_tokenizer)
test_loader_pegasus = DataLoader(test_dataset_pegasus, batch_size=8, collate_fn=collate_pega)

In [8]:
def generate_summaries(model, tokenizer, data_loader, model_type="bart"):
    model.eval()
    predictions = []
    with torch.no_grad():
        for batch in tqdm(data_loader, desc=f"Generating {model_type} summaries"):
            batch = {k: v.to(device) for k, v in batch.items()}
            if model_type == "t5":
                output_ids = model.generate(
                    input_ids=batch["input_ids"],
                    attention_mask=batch["attention_mask"],
                    max_length=MAX_LENGTH_SUMMARY,
                    num_beams=4,
                    length_penalty=2.0,
                    early_stopping=True
                )
            else:  # BART hoặc Pegasus
                output_ids = model.generate(
                    input_ids=batch["input_ids"],
                    attention_mask=batch["attention_mask"],
                    max_length=MAX_LENGTH_SUMMARY,
                    num_beams=4,
                    length_penalty=2.0,
                    early_stopping=True,
                    decoder_start_token_id=tokenizer.pad_token_id
                )

            batch_preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
            predictions.extend(batch_preds)
    return predictions

In [9]:
bart_predictions = generate_summaries(bart_model, bart_tokenizer, test_loader_bart, "bart")
t5_predictions = generate_summaries(t5_model, t5_tokenizer, test_loader_t5, "t5")
pegasus_predictions = generate_summaries(pegasus_model, pegasus_tokenizer, test_loader_pegasus, "pegasus")


Generating bart summaries: 100%|██████████| 53/53 [03:05<00:00,  3.49s/it]
Generating t5 summaries: 100%|██████████| 53/53 [01:55<00:00,  2.18s/it]
  return {"input_ids": torch.tensor(input_ids), "attention_mask": torch.tensor(attention_mask), "decoder_input_ids": torch.tensor(decoder_input_ids), "decoder_attention_mask": torch.tensor(decoder_attention_mask), "labels": torch.tensor(labels)}
Generating pegasus summaries: 100%|██████████| 53/53 [00:48<00:00,  1.09it/s]


In [10]:
# Lưu dự đoán vào DataFrame
test_df["bart_summary"] = bart_predictions
test_df["t5_summary"] = t5_predictions
test_df["pegasus_summary"] = pegasus_predictions

In [11]:
display(test_df.head(2))

Unnamed: 0,id,articles,summaries,article_word_count,summary_word_count,bart_summary,t5_summary,pegasus_summary
9204,fc8f37cb5bc8fe97794175fae6b876f07cf3fda4,A Florida bus passenger was arrested for throw...,"Joel Parker, 33, was riding the bus in St John...",143,58,"Joel Parker, 33, was arrested for throwing a ...","Joel Parker, 33, was about to get off the Suns...","the driver, 33, was arrested at St James Parke..."
10729,8d1da9b0197d9c733db56bdfa62332d04144398d,Aston Villa may be able to sign Cordoba strike...,Aston Villa have held talks over Cordoba strik...,189,38,Aston Villa could sign Cordoba striker Florin...,Aston Villa could sign Cordoba striker Florin ...,d to raise £2.5million for the Spanish side . ...


In [12]:
# test_pred = pd.read_csv(output_path)
print("bart-base")
# Tính điểm ROUGE
if "summaries" in test_df.columns:
    rouge = Rouge()
    scores = rouge.get_scores(test_df["bart_summary"].tolist(), test_df["summaries"].tolist(), avg=True)

    print("ROUGE scores:")
    print(f"ROUGE-1: {scores['rouge-1']['f']:.4f}")
    print(f"ROUGE-2: {scores['rouge-2']['f']:.4f}")
    print(f"ROUGE-L: {scores['rouge-l']['f']:.4f}")
else:
    print("⚠️ Không tìm thấy cột 'summaries' để tính ROUGE.")
print("t5")    
# Tính điểm ROUGE
if "summaries" in test_df.columns:
    rouge = Rouge()
    scores = rouge.get_scores(test_df["t5_summary"].tolist(), test_df["summaries"].tolist(), avg=True)

    print("ROUGE scores:")
    print(f"ROUGE-1: {scores['rouge-1']['f']:.4f}")
    print(f"ROUGE-2: {scores['rouge-2']['f']:.4f}")
    print(f"ROUGE-L: {scores['rouge-l']['f']:.4f}")
else:
    print("⚠️ Không tìm thấy cột 'summaries' để tính ROUGE.")

print("pegasus")    
# Tính điểm ROUGE
if "summaries" in test_df.columns:
    rouge = Rouge()
    scores = rouge.get_scores(test_df["pegasus_summary"].tolist(), test_df["summaries"].tolist(), avg=True)

    print("ROUGE scores:")
    print(f"ROUGE-1: {scores['rouge-1']['f']:.4f}")
    print(f"ROUGE-2: {scores['rouge-2']['f']:.4f}")
    print(f"ROUGE-L: {scores['rouge-l']['f']:.4f}")
else:
    print("⚠️ Không tìm thấy cột 'summaries' để tính ROUGE.")

bart-base
ROUGE scores:
ROUGE-1: 0.3843
ROUGE-2: 0.1697
ROUGE-L: 0.3613
t5
ROUGE scores:
ROUGE-1: 0.4071
ROUGE-2: 0.1924
ROUGE-L: 0.3867
pegasus
ROUGE scores:
ROUGE-1: 0.2602
ROUGE-2: 0.0602
ROUGE-L: 0.2446


In [13]:
from nltk.tokenize import sent_tokenize
WEIGHTS = {"bart": 0.40, "t5": 0.50, "pegasus": 0.10}  # Trọng số mới

# Hàm xử lý input mới và tạo tóm tắt từ từng mô hình
def generate_single_summary(model, tokenizer, article, model_type="bart"):
    model.eval()
    with torch.no_grad():
        if model_type == "t5":
            input_text = "summarize: " + article
            inputs = t5_tokenizer(
                input_text,
                max_length=MAX_LENGTH_ARTICLE,
                truncation=True,
                padding="max_length",
                return_tensors="pt"
            ).to(device)
            output_ids = model.generate(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_length=MAX_LENGTH_SUMMARY,
                num_beams=4,
                length_penalty=2.0,
                early_stopping=True
            )
        else:  # BART hoặc Pegasus
            inputs = tokenizer(
                article,
                max_length=MAX_LENGTH_ARTICLE,
                truncation=True,
                padding="max_length",
                return_tensors="pt"
            ).to(device)
            output_ids = model.generate(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_length=MAX_LENGTH_SUMMARY,
                num_beams=4,
                length_penalty=2.0,
                early_stopping=True,
                decoder_start_token_id=tokenizer.pad_token_id
            )
        summary = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return summary

# Hàm hậu xử lý tóm tắt
def post_process_summary(summary, max_length=MAX_LENGTH_SUMMARY):
    sentences = sent_tokenize(summary)
    unique_sentences = list(dict.fromkeys(sentences))  # Loại bỏ trùng lặp
    tokenized = bart_tokenizer.encode(" ".join(unique_sentences), truncation=True, max_length=max_length)
    return bart_tokenizer.decode(tokenized, skip_special_tokens=True)

# Hàm ensemble dựa trên ROUGE-L
def ensemble_summaries(article):
    # Tạo tóm tắt từ từng mô hình
    bart_summary = generate_single_summary(bart_model, bart_tokenizer, article, "bart")
    t5_summary = generate_single_summary(t5_model, t5_tokenizer, article, "t5")
    pegasus_summary = generate_single_summary(pegasus_model, pegasus_tokenizer, article, "pegasus")
    
    # Danh sách tóm tắt
    summaries = {
        "bart": bart_summary,
        "t5": t5_summary,
        "pegasus": pegasus_summary
    }
    
    # Tính ROUGE-L để chọn tóm tắt tốt nhất
    candidates = [summaries["bart"], summaries["t5"], summaries["pegasus"]]
    rouge = Rouge()
    scores = []
    
    for i, cand in enumerate(candidates):
        others = [c for j, c in enumerate(candidates) if j != i]
        # Tính điểm ROUGE-L trung bình so với các tóm tắt khác
        if others:  # Đảm bảo có tóm tắt khác để so sánh
            rouge_scores = rouge.get_scores([cand] * len(others), others, avg=True)
            rouge_l_score = rouge_scores["rouge-l"]["f"]
            weighted_score = rouge_l_score * WEIGHTS[list(summaries.keys())[i]]
            scores.append(weighted_score)
        else:
            scores.append(0.0)  # Nếu không có tóm tắt khác, gán điểm 0
    
    # Chọn tóm tắt có điểm cao nhất
    best_idx = np.argmax(scores)
    best_summary = candidates[best_idx]
    
    # Hậu xử lý
    final_summary = post_process_summary(best_summary)
    return final_summary

# Hàm chính để gọi từ ngoài
def summarize_article(article):
    """
    Nhận một bài viết mới và trả về tóm tắt ensemble.
    
    Args:
        article (str): Bài viết cần tóm tắt.
    
    Returns:
        str: Tóm tắt ensemble.
    """
    return ensemble_summaries(article)

# Ví dụ sử dụng
if __name__ == "__main__":
    # Input mới
    new_article = """
Animal stories for kids fascinate and intrigue their curious minds. These tales do more than just entertain – they plant seeds of wisdom that help little ones learn about right and wrong, caring for others, and how to be a good person in the world. Children learn important life lessons in a fun and memorable way through the adventures of furry and feathered friends. Aesop’s Fables, Roald Dahl’s The Fantastic Mr. Fox, Rudyard Kipling’s The Jungle Book, and the Panchatantra are exciting adventure stories on animals that kids may enjoy.

Children can be taught about various aspects of life through storytelling. While some messages from the moral stories in English are simple and easy to follow, others may be intense and cannot be delivered directly. Science has proven that using animals enables authors to tell a powerful story while also maintaining emotional distance (1).

Here is a compilation of some of the best short stories for kids that they may enjoy hearing. Encourage your child to explore these stories together and discuss the morals they convey.
    """
    
    # Tạo tóm tắt
    summary = summarize_article(new_article)
    print("Tóm tắt ensemble:")
    print(summary)
    print(summary == new_article)

Tóm tắt ensemble:
Aesop’s Fables, Roald Dahl’s The Fantastic Mr. Fox, Rudyard Kipling’s The Jungle Book, and the Panchatantra are exciting adventure stories on animals. Children can be taught about various aspects of life through storytelling.
False


In [15]:
from tqdm.auto import tqdm
tqdm.pandas() 
test_df["predicted_ensemble"] = test_df["articles"].progress_apply(summarize_article)

  0%|          | 0/422 [00:00<?, ?it/s]

In [16]:
# Tính điểm ROUGE
if "summaries" in test_df.columns:
    rouge = Rouge()
    scores = rouge.get_scores(test_df["predicted_ensemble"].tolist(), test_df["summaries"].tolist(), avg=True)

    print("ROUGE scores:")
    print(f"ROUGE-1: {scores['rouge-1']['f']:.4f}")
    print(f"ROUGE-2: {scores['rouge-2']['f']:.4f}")
    print(f"ROUGE-L: {scores['rouge-l']['f']:.4f}")
else:
    print("⚠️ Không tìm thấy cột 'summaries' để tính ROUGE.")

ROUGE scores:
ROUGE-1: 0.4078
ROUGE-2: 0.1930
ROUGE-L: 0.3878


In [17]:
test_df.to_csv(output_path, index=False)