# Vietnamese Text Summarization - mT5-Small Fine-tuning

‚úÖ **Model**: google/mt5-small (300M params)  
‚úÖ **Task**: Abstractive Summarization for Vietnamese  
‚úÖ **Strategy**: Properly structured seq2seq with optimized hyperparameters  
‚úÖ **Dataset**: Vietnamese documents with human-written summaries  

---

## Key Improvements in This Version

1. **Standardized Summarization Task Format**
   - Proper prefix: "t√≥m t·∫Øt: " for all inputs
   - Consistent max lengths (input: 512, output: 128)

2. **Stable Training Configuration**
   - Learning rate: 2e-4 (optimal for mT5)
   - Batch size: 2 with gradient accumulation: 8 (effective batch: 16)
   - FP16 enabled on CUDA GPUs
   - Warmup steps: 500

3. **Comprehensive Evaluation**
   - ROUGE-1, ROUGE-2, ROUGE-L metrics
   - Sample output inspection (metrics aren't everything!)

4. **Optimized Inference**
   - Beam search: 4-6 beams
   - Length penalty: 1.0-1.5
   - Repetition penalty: 1.2

## 1. Install Packages

In [None]:
# Install required packages
!pip install -q transformers datasets accelerate sentencepiece evaluate rouge-score py-rouge scikit-learn protobuf torch --root-user-action=ignore

print("‚úÖ All packages installed!")

## 2. Load and Verify Data

In [None]:
import re
import pandas as pd
import numpy as np
from datasets import Dataset, DatasetDict

# Simple Vietnamese sentence tokenizer
def sent_tokenize(text: str) -> list[str]:
    """Vietnamese sentence tokenizer"""
    pattern = r'(?<=[.!?])\s+(?=[A-Z√Ä√Å·∫†·∫¢√É√Ç·∫¶·∫§·∫¨·∫®·∫™ƒÇ·∫∞·∫Æ·∫∂·∫≤·∫¥√à√â·∫∏·∫∫·∫º√ä·ªÄ·∫æ·ªÜ·ªÇ·ªÑ√å√ç·ªä·ªàƒ®√í√ì·ªå·ªé√ï√î·ªí·ªê·ªò·ªî·ªñ∆†·ªú·ªö·ª¢·ªû·ª†√ô√ö·ª§·ª¶≈®∆Ø·ª™·ª®·ª∞·ª¨·ªÆ·ª≤√ù·ª¥·ª∂·ª∏ƒê])'
    sentences = re.split(pattern, text)
    return [s.strip() for s in sentences if s.strip()]

# Load dataset
print("üìä Loading Vietnamese Summarization Dataset...")
train_df = pd.read_csv("data/train.csv")
val_df = pd.read_csv("data/validation.csv")
test_df = pd.read_csv("data/test.csv")

print(f"‚úì Train: {len(train_df):,} samples")
print(f"‚úì Validation: {len(val_df):,} samples")
print(f"‚úì Test: {len(test_df):,} samples")

# Analyze data statistics
def analyze_lengths(df: pd.DataFrame, name: str):
    doc_words = df['document'].apply(lambda x: len(x.split()))
    sum_words = df['summary'].apply(lambda x: len(x.split()))
    compression_ratio = (sum_words.mean() / doc_words.mean() * 100)
    
    print(f"\n{name}:")
    print(f"  Avg document: {doc_words.mean():.0f} words, Avg summary: {sum_words.mean():.0f} words")
    print(f"  Compression ratio: {compression_ratio:.1f}%")

analyze_lengths(train_df, "Train")
analyze_lengths(val_df, "Validation")
analyze_lengths(test_df, "Test")

# Convert to HuggingFace Dataset
dataset = DatasetDict({
    'train': Dataset.from_pandas(train_df[['document', 'summary']], preserve_index=False),
    'validation': Dataset.from_pandas(val_df[['document', 'summary']], preserve_index=False),
    'test': Dataset.from_pandas(test_df[['document', 'summary']], preserve_index=False)
})

print(f"\nüìù Sample:")
sample = dataset['train'][0]
print(f"Document: {sample['document'][:200]}...")
print(f"Summary: {sample['summary'][:150]}...")

In [None]:
# ============================================================================
# üçé MAC FIX: FORCE CPU TO AVOID MPS BUGS
# ============================================================================
import os

# FORCE CPU - MPS has too many bugs with mT5
os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'
os.environ['PYTORCH_MPS_HIGH_WATERMARK_RATIO'] = '0.0'

# Additional fix: completely disable MPS
os.environ['PYTORCH_FORCE_CPU'] = '1'

print("üçé Mac compatibility mode enabled")
print("   MPS COMPLETELY DISABLED - forcing CPU")
print("   (Training will be slow but stable)")
print("   ")
print("‚ö†Ô∏è  IMPORTANT: You MUST restart your kernel after running this cell!")
print("   Kernel ‚Üí Restart ‚Üí Run cells from beginning")

In [8]:
# ============================================================================
# TextRank Implementation
# ============================================================================
class TextRankSummarizer:
    """TextRank algorithm for extractive summarization"""
    
    def __init__(self, top_n: int = 3, damping: float = 0.85):
        self.top_n = top_n
        self.damping = damping
        self.tokenizer = AutoTokenizer.from_pretrained('vinai/phobert-base')
        self.model = AutoModel.from_pretrained('vinai/phobert-base')
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model.to(self.device)
    
    def get_sentence_embedding(self, sentence: str) -> np.ndarray:
        """Get PhoBERT embedding for a sentence"""
        inputs = self.tokenizer(
            sentence, 
            return_tensors='pt', 
            truncation=True, 
            max_length=256
        ).to(self.device)
        
        with torch.no_grad():
            outputs = self.model(**inputs)
            # Use CLS token embedding
            embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        
        return embedding[0]
    
    def build_similarity_matrix(self, sentences: list[str]) -> np.ndarray:
        """Build similarity matrix between sentences"""
        print(f"  Computing embeddings for {len(sentences)} sentences...")
        embeddings = []
        
        for sent in tqdm(sentences, desc="Encoding"):
            emb = self.get_sentence_embedding(sent)
            embeddings.append(emb)
        
        embeddings = np.array(embeddings)
        similarity_matrix = cosine_similarity(embeddings)
        
        return similarity_matrix
    
    def textrank(self, similarity_matrix: np.ndarray) -> np.ndarray:
        """Run TextRank algorithm (PageRank on sentence graph)"""
        # Create graph from similarity matrix
        nx_graph = nx.from_numpy_array(similarity_matrix)
        
        # Compute PageRank scores
        scores = nx.pagerank(nx_graph, alpha=self.damping)
        
        return np.array(list(scores.values()))
    
    def summarize(self, document: str, num_sentences: int = None) -> str:
        """Generate extractive summary using TextRank"""
        if num_sentences is None:
            num_sentences = self.top_n
        
        # Split into sentences
        sentences = sent_tokenize(document)
        
        if len(sentences) <= num_sentences:
            return document
        
        # Build similarity matrix
        similarity_matrix = self.build_similarity_matrix(sentences)
        
        # Run TextRank
        scores = self.textrank(similarity_matrix)
        
        # Select top sentences
        ranked_indices = np.argsort(scores)[::-1][:num_sentences]
        
        # Sort by original order to maintain coherence
        ranked_indices = sorted(ranked_indices)
        
        # Extract summary
        summary_sentences = [sentences[i] for i in ranked_indices]
        summary = ' '.join(summary_sentences)
        
        return summary

print("‚úÖ TextRank Summarizer created!")

‚úÖ TextRank Summarizer created!


from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)
import evaluate

print("‚úÖ Transformers imported successfully!")

In [12]:
def preprocess_function(examples):
    """Tokenize inputs and targets"""
    # Add prefix
    inputs = ["t√≥m t·∫Øt: " + doc for doc in examples["document"]]
    
    # Tokenize inputs
    model_inputs = tokenizer(
        inputs,
        max_length=512,
        truncation=True,
        padding=False
    )
    
    # Tokenize targets
    labels = tokenizer(
        text_target=examples["summary"],
        max_length=128,
        truncation=True,
        padding=False
    )
    
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

print("üîÑ Tokenizing dataset...")
tokenized_datasets = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset["train"].column_names,
    desc="Tokenizing"
)

# Verify
sample = tokenized_datasets["train"][0]
print(f"\nSample tokenized data:")
print(f"Input length: {len(sample['input_ids'])}")
print(f"Label length: {len(sample['labels'])}")
print(f"Input IDs (first 20): {sample['input_ids'][:20]}")
print(f"Labels (first 20): {sample['labels'][:20]}")

# Decode to verify
decoded_input = tokenizer.decode(sample['input_ids'][:50])
decoded_label = tokenizer.decode(sample['labels'][:50])
print(f"\nDecoded input: {decoded_input}")
print(f"Decoded label: {decoded_label}")

print("\n‚úÖ Tokenization complete!")

üîÑ Tokenizing dataset...


Tokenizing:   0%|          | 0/15620 [00:00<?, ? examples/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Tokenizing:   0%|          | 0/1952 [00:00<?, ? examples/s]

Tokenizing:   0%|          | 0/1953 [00:00<?, ? examples/s]


Sample tokenized data:
Input length: 512
Label length: 128
Input IDs (first 20): [259, 164459, 259, 270, 2289, 270, 267, 26965, 441, 317, 708, 262, 4650, 276, 441, 259, 29828, 382, 2291, 441]
Labels (first 20): [458, 1858, 382, 2291, 261, 300, 908, 562, 1075, 4501, 718, 369, 273, 331, 2294, 7790, 370, 562, 1075, 261]

Decoded input: t√≥m t·∫Øt: L√° N c·ªßa c√¢y N l√¥ h·ªôi N ch·ª©a V ƒë·∫ßy A ch·∫•t N gel N v√† b·∫°n N c√≥ th·ªÉ h√°i V m·ªói khi N c
Decoded label: L√¥ h·ªôi, v·ªõi ch·∫•t gel gi√†u d∆∞·ª°ng ch·∫•t, c√≥ th·ªÉ s·ª≠ d·ª•ng ƒë·ªÉ ch·ªØa l√†nh c√°c v·∫•n ƒë·ªÅ v·ªÅ da nh∆∞ b·ªè

‚úÖ Tokenization complete!


In [13]:
print("=== DETAILED LABEL CHECK ===")
sample = tokenized_datasets["train"][0]

print(f"Input IDs (first 20): {sample['input_ids'][:20]}")
print(f"Labels (first 20): {sample['labels'][:20]}")

# Count -100
num_neg100 = sum(1 for l in sample['labels'] if l == -100)
num_valid = len(sample['labels']) - num_neg100

print(f"\nTotal labels: {len(sample['labels'])}")
print(f"Number of -100: {num_neg100}")
print(f"Valid labels: {num_valid}")
print(f"Percentage valid: {num_valid/len(sample['labels'])*100:.1f}%")

# Decode valid labels
valid_labels = [l for l in sample['labels'] if l != -100]
if valid_labels:
    decoded = tokenizer.decode(valid_labels)
    print(f"\nDecoded valid labels: {decoded}")
else:
    print("\n‚ùå‚ùå‚ùå NO VALID LABELS - ALL ARE -100! ‚ùå‚ùå‚ùå")

=== DETAILED LABEL CHECK ===
Input IDs (first 20): [259, 164459, 259, 270, 2289, 270, 267, 26965, 441, 317, 708, 262, 4650, 276, 441, 259, 29828, 382, 2291, 441]
Labels (first 20): [458, 1858, 382, 2291, 261, 300, 908, 562, 1075, 4501, 718, 369, 273, 331, 2294, 7790, 370, 562, 1075, 261]

Total labels: 128
Number of -100: 0
Valid labels: 128
Percentage valid: 100.0%

Decoded valid labels: L√¥ h·ªôi, v·ªõi ch·∫•t gel gi√†u d∆∞·ª°ng ch·∫•t, c√≥ th·ªÉ s·ª≠ d·ª•ng ƒë·ªÉ ch·ªØa l√†nh c√°c v·∫•n ƒë·ªÅ v·ªÅ da nh∆∞ b·ªèng n·∫Øng, g√†u v√† da kh√¥. B·∫°n c√≥ th·ªÉ s·ª≠ d·ª•ng l√° l√¥ h·ªôi t∆∞∆°i ƒë·ªÉ l·∫•y gel, b√¥i tr·ª±c ti·∫øp l√™n da b·ªã t·ªïn th∆∞∆°ng. L∆∞u √Ω, gel l√¥ h·ªôi kh√¥ng n√™n b√¥i l√™n v√πng</s>


# Load mT5-Small model and tokenizer
MODEL_NAME = "google/mt5-small"

print(f"üì• Loading model: {MODEL_NAME}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

print(f"‚úÖ Model loaded successfully!")
print(f"   Parameters: {model.num_parameters():,}")
print(f"   Vocab size: {tokenizer.vocab_size:,}")

# Move to device
model = model.to(device)
print(f"   Device: {device}")

In [15]:
print("=== DETAILED LABEL CHECK ===")
sample = tokenized_datasets["train"][0]

print(f"Input IDs (first 20): {sample['input_ids'][:20]}")
print(f"Labels (first 20): {sample['labels'][:20]}")

# Count -100
num_neg100 = sum(1 for l in sample['labels'] if l == -100)
num_valid = len(sample['labels']) - num_neg100

print(f"\nTotal labels: {len(sample['labels'])}")
print(f"Number of -100: {num_neg100}")
print(f"Valid labels: {num_valid}")
print(f"Percentage valid: {num_valid/len(sample['labels'])*100:.1f}%")

# Decode valid labels
valid_labels = [l for l in sample['labels'] if l != -100]
if valid_labels:
    decoded = tokenizer.decode(valid_labels)
    print(f"\nDecoded valid labels: {decoded}")
else:
    print("\n‚ùå‚ùå‚ùå NO VALID LABELS - ALL ARE -100! ‚ùå‚ùå‚ùå")

=== DETAILED LABEL CHECK ===
Input IDs (first 20): [259, 164459, 259, 270, 2289, 270, 267, 26965, 441, 317, 708, 262, 4650, 276, 441, 259, 29828, 382, 2291, 441]
Labels (first 20): [458, 1858, 382, 2291, 261, 300, 908, 562, 1075, 4501, 718, 369, 273, 331, 2294, 7790, 370, 562, 1075, 261]

Total labels: 128
Number of -100: 0
Valid labels: 128
Percentage valid: 100.0%

Decoded valid labels: L√¥ h·ªôi, v·ªõi ch·∫•t gel gi√†u d∆∞·ª°ng ch·∫•t, c√≥ th·ªÉ s·ª≠ d·ª•ng ƒë·ªÉ ch·ªØa l√†nh c√°c v·∫•n ƒë·ªÅ v·ªÅ da nh∆∞ b·ªèng n·∫Øng, g√†u v√† da kh√¥. B·∫°n c√≥ th·ªÉ s·ª≠ d·ª•ng l√° l√¥ h·ªôi t∆∞∆°i ƒë·ªÉ l·∫•y gel, b√¥i tr·ª±c ti·∫øp l√™n da b·ªã t·ªïn th∆∞∆°ng. L∆∞u √Ω, gel l√¥ h·ªôi kh√¥ng n√™n b√¥i l√™n v√πng</s>


## 8. Train Model üöÄ

In [None]:
# ============================================================================
# 6Ô∏è‚É£ METRIC ‚Äì ƒê·ª™NG CH·ªà NH√åN LOSS!
# ============================================================================
# Chu·∫©n ƒë√°nh gi√°: ROUGE-1, ROUGE-2, ROUGE-L (quan tr·ªçng nh·∫•t)
# ‚ö†Ô∏è Nh∆∞ng: ROUGE cao ‚â† t√≥m t·∫Øt hay
# üëâ Lu√¥n ƒë·ªçc sample output b·∫±ng m·∫Øt ng∆∞·ªùi

rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    """
    Compute ROUGE scores v√† hi·ªÉn th·ªã sample predictions
    ƒë·ªÉ ƒë√°nh gi√° ch·∫•t l∆∞·ª£ng th·ª±c t·∫ø
    """
    predictions, labels = eval_pred
    
    # N·∫øu predictions l√† logits, l·∫•y argmax
    if len(predictions.shape) == 3:
        predictions = np.argmax(predictions, axis=-1)
    
    # Decode predictions
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    
    # Replace -100 in labels (padding tokens)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # üëÅÔ∏è LU√îN HI·ªÇN TH·ªä SAMPLE ƒë·ªÉ ki·ªÉm tra ch·∫•t l∆∞·ª£ng th·ª±c t·∫ø
    if len(decoded_preds) > 0:
        print(f"\n{'='*70}")
        print("üìù SAMPLE PREDICTION (ƒë·ªÉ ƒë√°nh gi√° ch·∫•t l∆∞·ª£ng th·ª±c t·∫ø):")
        print(f"{'='*70}")
        print(f"Prediction: {decoded_preds[0][:200]}")
        print(f"Reference:  {decoded_labels[0][:200]}")
        print(f"{'='*70}\n")
    
    # Clean text
    decoded_preds = ["\n".join(pred.strip().split()) for pred in decoded_preds]
    decoded_labels = ["\n".join(label.strip().split()) for label in decoded_labels]
    
    # Compute ROUGE
    result = rouge.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=False
    )
    
    # Return scores
    return {
        "rouge1": result["rouge1"],
        "rouge2": result["rouge2"],
        "rougeL": result["rougeL"],
        "rougeLsum": result["rougeLsum"],
    }

print("‚úÖ Metrics defined")

## 9. Evaluate on Test Set

## 10. Test Inference

In [None]:
# ============================================================================
# 5Ô∏è‚É£ TRAINING STRATEGY (PH·∫¶N X∆Ø∆†NG S·ªêNG)
# ============================================================================

# Data collator for dynamic padding
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    label_pad_token_id=-100
)

# 5.2 Training arguments (baseline ·ªïn ƒë·ªãnh)
training_args = Seq2SeqTrainingArguments(
    output_dir="./mt5_vi_sum",
    
    # Batch size strategy
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,  # Gi·∫£ l·∫≠p batch size 16
    
    # Learning rate cho mT5
    # üëâ 1e-4 ‚Üí ·ªïn ƒë·ªãnh
    # üëâ 2e-4 ‚Üí nhanh h∆°n (recommended)
    # üëâ >3e-4 ‚Üí d·ªÖ n·ªï loss üí£
    learning_rate=2e-4,
    warmup_steps=500,
    num_train_epochs=3,
    weight_decay=0.01,
    
    # Evaluation strategy
    eval_strategy="steps",
    eval_steps=1000,
    save_strategy="steps",
    save_steps=1000,
    
    # Generation settings for evaluation
    predict_with_generate=True,
    generation_max_length=128,
    generation_num_beams=4,
    
    # Optimization
    fp16=USE_FP16,  # Enable FP16 on CUDA
    gradient_checkpointing=USE_GRAD_CHECKPOINT,
    
    # Logging
    logging_steps=100,
    logging_first_step=True,
    save_total_limit=2,
    
    # Best model selection
    load_best_model_at_end=True,
    metric_for_best_model="rougeL",  # ROUGE-L l√† quan tr·ªçng nh·∫•t
    greater_is_better=True,
    
    report_to="none",
)

# Create Seq2Seq Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

print("‚úÖ Trainer initialized!")
print(f"\nüìä Training Configuration:")
print(f"   Device: {device}")
print(f"   FP16: {USE_FP16}")
print(f"   Per-device batch size: {training_args.per_device_train_batch_size}")
print(f"   Gradient accumulation: {training_args.gradient_accumulation_steps}")
print(f"   Effective batch size: {training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}")
print(f"   Learning rate: {training_args.learning_rate}")
print(f"   Warmup steps: {training_args.warmup_steps}")
print(f"   Total epochs: {training_args.num_train_epochs}")
print(f"   Eval every: {training_args.eval_steps} steps")

## 12. Quick Test with New Text