## Weighted Averaging

In [None]:
import torch
import pandas as pd
from torch.utils.data import DataLoader, Dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, BartTokenizer, BartForConditionalGeneration, PegasusTokenizer, PegasusForConditionalGeneration
from tqdm import tqdm
import os
import numpy as np
from rouge_score import rouge_scorer


In [None]:
# --- Optimisation hyper-parameters -----------------------------------------
# NOTE(review): several of these (epochs, hidden dim, learning rate, patience,
# weight decay, cycles) are not read by the cells visible in this notebook
# section; presumably shared with a training notebook — confirm.
NUM_EPOCHS = 50
BATCH_SIZE = 8
HIDDEN_DIM = 128
LEARNING_RATE = 1e-5
WEIGHT_DECAY = 1e-4
PATIENCE = 5  # For early stopping
NUM_CYCLES = 5

# --- Data selection ---------------------------------------------------------
# Fraction of the filtered train/validation rows that is sampled.
FRAC_SAMPLE = 0.01
# Length bounds: applied to whitespace word counts when filtering rows, and
# reused as tokenizer / generation length limits further down.
MIN_LENGTH_ARTICLE = 50
MAX_LENGTH_ARTICLE = 512
MIN_LENGTH_SUMMARY = 20
MAX_LENGTH_SUMMARY = 128

# --- Runtime and file locations ---------------------------------------------
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device(
    'cuda' if torch.cuda.is_available() else 'cpu'
)

# Directories holding the fine-tuned checkpoints of each model.
save_dir_bart = "fine_tuned_bart_cosine_3"
save_dir_t5 = "fine_tuned_t5_small"
save_dir_pegasus = "fine_tuned_pegasus_custom"

# Directory for the sampled CSVs and the CSV with the ensemble predictions.
datafilter = "../dataft"
output_path = os.path.join(datafilter, "test_pred_ensemble.csv")

In [None]:
def _prepare_split(frame):
    """Normalise column names, add word counts and drop out-of-range rows.

    Renames ``highlights`` -> ``summaries`` and ``article`` -> ``articles``,
    adds ``article_word_count`` / ``summary_word_count`` (whitespace-separated
    token counts of the stringified cells) and keeps only the rows whose
    counts lie inside the inclusive MIN/MAX length bounds.
    """
    frame = frame.rename(columns={"highlights": "summaries", "article": "articles"})
    frame["article_word_count"] = frame["articles"].astype(str).apply(lambda x: len(x.split()))
    frame["summary_word_count"] = frame["summaries"].astype(str).apply(lambda x: len(x.split()))

    # Series.between is inclusive on both ends, matching the <= / >= checks.
    in_range = (
        frame["article_word_count"].between(MIN_LENGTH_ARTICLE, MAX_LENGTH_ARTICLE)
        & frame["summary_word_count"].between(MIN_LENGTH_SUMMARY, MAX_LENGTH_SUMMARY)
    )
    return frame[in_range]


# Load the raw splits and apply the same cleaning to each of them.
train_data = _prepare_split(pd.read_csv("../dataset/train.csv"))
validation_data = _prepare_split(pd.read_csv("../dataset/validation.csv"))
test_data = _prepare_split(pd.read_csv("../dataset/test.csv"))

# Train/validation are sub-sampled; the test split keeps every row
# (frac=1 only shuffles it). random_state makes the sampling reproducible.
train_sample = train_data.sample(frac=FRAC_SAMPLE, random_state=1)
validation_sample = validation_data.sample(frac=FRAC_SAMPLE, random_state=1)
test_sample = test_data.sample(frac=1, random_state=1)
train_sample.info()
print("\n")
validation_sample.info()

# Make sure the output directory exists before writing, otherwise to_csv
# raises when the notebook is run from a fresh checkout.
os.makedirs(datafilter, exist_ok=True)
train_sample.to_csv(os.path.join(datafilter, "train_sample.csv"), index=False)
test_sample.to_csv(os.path.join(datafilter, "test_sample.csv"), index=False)
validation_sample.to_csv(os.path.join(datafilter, "validation_sample.csv"), index=False)


In [None]:
# Reload the sampled splits from disk. The paths are derived from
# `datafilter` so they always point at the directory the samples are written
# to, instead of repeating the directory name as a literal.
train_sample = pd.read_csv(os.path.join(datafilter, "train_sample.csv"))
validation_sample = pd.read_csv(os.path.join(datafilter, "validation_sample.csv"))
test_sample = pd.read_csv(os.path.join(datafilter, "test_sample.csv"))
train_sample.info()


In [None]:
# Tokenizers: each one is restored from the directory of its own checkpoint.
bart_tokenizer = BartTokenizer.from_pretrained(save_dir_bart)
t5_tokenizer = T5Tokenizer.from_pretrained(save_dir_t5)
pegasus_tokenizer = PegasusTokenizer.from_pretrained(save_dir_pegasus)

# Models: load the fine-tuned weights, then move them to the selected device.
bart_model = BartForConditionalGeneration.from_pretrained(save_dir_bart)
bart_model = bart_model.to(device)
t5_model = T5ForConditionalGeneration.from_pretrained(save_dir_t5)
t5_model = t5_model.to(device)
pegasus_model = PegasusForConditionalGeneration.from_pretrained(save_dir_pegasus)
pegasus_model = pegasus_model.to(device)

In [None]:
class SummarizationDataset(Dataset):
    """Dataset of (article, summary) pairs tokenized on access.

    Args:
        data: DataFrame with an ``articles`` and a ``summaries`` column.
        tokenizer: Tokenizer applied to both the article and the summary.
            When it is a ``T5Tokenizer`` the article is prefixed with
            ``"summarize: "``.
        max_input_length: Articles are truncated/padded to this many tokens.
        max_output_length: Summaries are truncated/padded to this many tokens.

    Each item is a dict with ``input_ids``, ``attention_mask`` and ``labels``
    tensors plus the raw ``article`` and ``summary`` strings.
    """

    def __init__(self, data, tokenizer, max_input_length=MAX_LENGTH_ARTICLE, max_output_length=MAX_LENGTH_SUMMARY):
        self.data = data
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length
        self.max_output_length = max_output_length

    def __len__(self):
        return len(self.data)

    @staticmethod
    def _as_text(value):
        """Return *value* as a string, mapping missing cells (NaN/None) to ''."""
        if isinstance(value, str):
            return value
        return "" if pd.isna(value) else str(value)

    def __getitem__(self, index):
        # Fetch the row once instead of indexing the frame per column.
        row = self.data.iloc[index]
        # Cells read back from CSV can be NaN/non-string; concatenating those
        # with the T5 prefix would raise a TypeError.
        article = self._as_text(row["articles"])
        summary = self._as_text(row["summaries"])

        input_text = "summarize: " + article if isinstance(self.tokenizer, T5Tokenizer) else article
        inputs = self.tokenizer(
            input_text,
            max_length=self.max_input_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt"
        )
        outputs = self.tokenizer(
            summary,
            max_length=self.max_output_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt"
        )
        # squeeze(0) removes only the batch dimension added by
        # return_tensors="pt"; a bare squeeze() would also collapse a
        # length-1 sequence into a scalar.
        # NOTE(review): labels keep the pad token ids as-is; confirm whether
        # any training code consuming this dataset expects them masked.
        return {
            "input_ids": inputs.input_ids.squeeze(0),
            "attention_mask": inputs.attention_mask.squeeze(0),
            "labels": outputs.input_ids.squeeze(0),
            "article": article,
            "summary": summary
        }

In [None]:
# Frame the test predictions are generated for.
test_df = test_sample
# The dataset below is tokenized with the T5 tokenizer, so its pre-computed
# input_ids/attention_mask are T5 ids (with the "summarize: " prefix); every
# item also carries the raw article and summary text.
# NOTE(review): token ids are tokenizer-specific — confirm that whatever
# consumes this loader for BART/Pegasus does not rely on these input_ids.
test_dataset = SummarizationDataset(data=test_df, tokenizer=t5_tokenizer)
# shuffle=False keeps predictions aligned with the row order of test_df.
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
)


In [None]:
def generate_summaries(model, tokenizer, loader, is_t5=False):
    """Generate one summary per example in *loader* with beam search.

    Token ids are only meaningful for the tokenizer that produced them, and
    the loader may have been built with a different tokenizer than the one
    belonging to *model*. Therefore, whenever a batch carries the raw text
    under the ``"article"`` key, the text is re-tokenized here with
    *tokenizer*; the pre-computed ``input_ids`` / ``attention_mask`` are used
    only as a fallback for batches without raw text.

    Args:
        model: Seq2seq model exposing ``eval()`` and ``generate()``.
        tokenizer: Tokenizer matching *model*; used to encode the articles
            and to decode the generated ids.
        loader: Iterable of batches (dict-like) with either an ``"article"``
            list of strings or ``"input_ids"`` and ``"attention_mask"``.
        is_t5: When true, ``"summarize: "`` is prepended to every article
            before tokenization (only applies to re-tokenized batches).

    Returns:
        List of decoded summaries, in loader order.
    """
    model.eval()
    predictions = []
    # Reuse the input length limit of the dataset when it exposes one.
    max_input_length = getattr(
        getattr(loader, "dataset", None), "max_input_length", MAX_LENGTH_ARTICLE
    )
    with torch.no_grad():
        for batch in tqdm(loader, desc="Generating summaries"):
            if "article" in batch:
                prefix = "summarize: " if is_t5 else ""
                texts = [prefix + str(article) for article in batch["article"]]
                encoded = tokenizer(
                    texts,
                    max_length=max_input_length,
                    truncation=True,
                    padding=True,  # pad to the longest article in the batch
                    return_tensors="pt"
                )
                input_ids = encoded["input_ids"].to(device)
                attention_mask = encoded["attention_mask"].to(device)
            else:
                # No raw text available: trust the ids supplied by the loader.
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
            output_ids = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=MAX_LENGTH_SUMMARY,
                min_length=MIN_LENGTH_SUMMARY,
                num_beams=4,
                length_penalty=2.0,
                early_stopping=True
            )
            batch_preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
            predictions.extend(batch_preds)
    return predictions

In [None]:
# Produce the test-set summaries of every fine-tuned model, one model at a
# time, over the same loader (so the three lists share the same row order).
bart_preds = generate_summaries(
    model=bart_model,
    tokenizer=bart_tokenizer,
    loader=test_loader,
)
t5_preds = generate_summaries(
    model=t5_model,
    tokenizer=t5_tokenizer,
    loader=test_loader,
    is_t5=True,
)
pegasus_preds = generate_summaries(
    model=pegasus_model,
    tokenizer=pegasus_tokenizer,
    loader=test_loader,
)

In [None]:
# Ensemble function (Weighted Averaging)
def ensemble_summaries(bart_preds, t5_preds, pegasus_preds, weights=(0.4, 0.2, 0.4), top_k=3):
    """Build one extractive ensemble summary per example.

    For every example the three candidate summaries are split into sentences
    (naively, on ``". "``). Each distinct sentence gets a score: for every
    candidate that contains the sentence, the ROUGE-1 F-measure between that
    candidate and the sentence, multiplied by the candidate's weight; the
    weighted values are summed. The ``top_k`` best sentences are joined into
    the ensemble summary, highest score first.

    Args:
        bart_preds, t5_preds, pegasus_preds: Parallel lists of summaries.
            Iteration stops at the shortest list.
        weights: Three weights, ordered (BART, T5, Pegasus).
        top_k: Number of sentences kept per summary (must be >= 1).

    Returns:
        List of ensemble summaries, one per example.

    Raises:
        ValueError: If *weights* does not hold exactly three values or
            *top_k* is smaller than 1.
    """
    weights = tuple(weights)
    if len(weights) != 3:
        # zip() below would otherwise silently ignore a model.
        raise ValueError("weights must contain exactly three values (BART, T5, Pegasus)")
    if top_k < 1:
        raise ValueError("top_k must be at least 1")

    # Only ROUGE-1 is read below, so the other metrics are not computed.
    scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)
    ensemble_preds = []

    for bart_pred, t5_pred, pegasus_pred in zip(bart_preds, t5_preds, pegasus_preds):
        candidates = [bart_pred, t5_pred, pegasus_pred]

        # Collect distinct sentences in first-seen order. A dict (instead of a
        # set) keeps the order deterministic, so ties in the ranking below are
        # always broken the same way. The trailing period of a summary's last
        # sentence is removed so "X" and "X." count as the same sentence and
        # joining never yields "..".
        unique_sentences = {}
        for cand in candidates:
            for piece in cand.split('. '):  # simple sentence split
                sentence = piece.strip().rstrip('.').strip()
                if sentence:
                    unique_sentences.setdefault(sentence, None)
        all_sentences = list(unique_sentences)

        if not all_sentences:
            ensemble_preds.append(bart_pred)  # Fallback to BART if no sentences
            continue

        # Score each sentence against every candidate that contains it.
        sentence_scores = []
        for sent in all_sentences:
            scores = [
                scorer.score(cand, sent)['rouge1'].fmeasure if sent in cand else 0.0
                for cand in candidates
            ]
            weighted_score = sum(w * s for w, s in zip(weights, scores))
            sentence_scores.append((sent, weighted_score))

        # list.sort is stable: equally scored sentences keep first-seen order.
        sentence_scores.sort(key=lambda pair: pair[1], reverse=True)
        selected_sentences = [sent for sent, _ in sentence_scores[:top_k]]
        ensemble_summary = '. '.join(selected_sentences)
        # Restore the final period that was stripped while de-duplicating.
        if not ensemble_summary.endswith(('.', '!', '?')):
            ensemble_summary += '.'
        ensemble_preds.append(ensemble_summary)

    return ensemble_preds

# Combine the three prediction lists into the ensemble summaries.
# The weights follow the argument order: BART, T5, Pegasus.
weights = [0.5, 0.3, 0.2]
ensemble_preds = ensemble_summaries(
    bart_preds=bart_preds,
    t5_preds=t5_preds,
    pegasus_preds=pegasus_preds,
    weights=weights,
)


In [None]:
# Attach every model's predictions (and the ensemble) to the test rows and
# persist the result. Only the first len(ensemble_preds) rows are kept so the
# frame and the prediction lists have the same length.
test_sample = test_df.iloc[:len(ensemble_preds)].copy()
for column_name, column_values in (
    ("bart_summary", bart_preds),
    ("t5_summary", t5_preds),
    ("pegasus_summary", pegasus_preds),
    ("ensemble_summary", ensemble_preds),
):
    test_sample[column_name] = column_values
test_sample.to_csv(output_path, index=False)

print(f"✅ File has been saved at: {output_path}")

In [None]:
test_pred = pd.read_csv(output_path)
# Compute ROUGE scores of the ensemble summaries against the references.
# Uses rouge_scorer (already imported by this notebook); both the references
# and the predictions are read from the same saved file so rows stay aligned.
# NOTE(review): the ensemble column is assumed to be the one to evaluate —
# confirm, or repeat for the per-model summary columns.
if "summaries" in test_pred.columns:
    eval_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    # Empty strings are read back from CSV as NaN; normalise them to "".
    references = test_pred["summaries"].fillna("").astype(str).tolist()
    predictions = test_pred["ensemble_summary"].fillna("").astype(str).tolist()

    # Average the per-example F-measure of every metric.
    totals = {"rouge1": 0.0, "rouge2": 0.0, "rougeL": 0.0}
    for reference, prediction in zip(references, predictions):
        result = eval_scorer.score(reference, prediction)
        for metric in totals:
            totals[metric] += result[metric].fmeasure
    num_examples = max(len(references), 1)  # avoid dividing by zero on an empty file
    scores = {metric: total / num_examples for metric, total in totals.items()}

    print("ROUGE scores:")
    print(f"ROUGE-1: {scores['rouge1']:.4f}")
    print(f"ROUGE-2: {scores['rouge2']:.4f}")
    print(f"ROUGE-L: {scores['rougeL']:.4f}")
else:
    print("⚠️ Không tìm thấy cột 'summaries' để tính ROUGE.")