# Intrinsic Evaluation: Vietnamese Dialect-to-Standard Translation

This notebook evaluates various sequence-to-sequence models for Vietnamese dialect-to-standard translation.

## Models Evaluated
- `bmd1905/vietnamese-correction-v2`
- `vinai/bartpho-syllable-base`
- `vinai/bartpho-word-base`
- `VietAI/vit5-base`
- `facebook/mbart-large-50`

## Reproducibility
- **Random Seed**: 42 (set for Python, NumPy, PyTorch, and Transformers)
- All experiments use the same seed for reproducibility
- Model initialization, data shuffling, and training are deterministic

## Usage
1. Install dependencies (Cell 1)
2. Set `model_name` in Cell 2 (currently set to `facebook/mbart-large-50`)
3. Run all cells to train and evaluate the model
4. Results are saved in `./results` and model in `./best_model`

## Metrics
- BLEU: Bilingual Evaluation Understudy
- ROUGE-L: Recall-Oriented Understudy for Gisting Evaluation, based on the Longest Common Subsequence
- METEOR: Metric for Evaluation of Translation with Explicit ORdering
- WER: Word Error Rate
- CER: Character Error Rate


In [None]:
!pip install datasets==4.0.0 evaluate==0.4.6 accelerate==1.11.0 rouge_score==0.1.2 jiwer==4.0.0 editdistance==0.8.1 -q

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    EarlyStoppingCallback
)
import evaluate
import torch
from jiwer import wer as calculate_wer
import editdistance
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
import random
import os

# Single seed shared by set_seed() and by the trainer's `seed` / `data_seed` arguments.
RANDOM_SEED = 42  # Seed value used for all experiments

def set_seed(seed):
    """Seed Python, NumPy and PyTorch (CPU and CUDA) and pin cuDNN to deterministic mode.

    Args:
        seed: Integer seed handed to every random number generator.
    """
    # Every generator takes the same value; they are seeded in a fixed order.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,  # covers the multi-GPU case
    )
    for seed_generator in seeders:
        seed_generator(seed)

    # Deterministic cuDNN kernels, no auto-tuner.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # NOTE(review): the running interpreter fixed its hash seed at startup, so
    # this presumably only matters for processes spawned afterwards.
    os.environ["PYTHONHASHSEED"] = str(seed)

# Apply the seed before any data or model is loaded
set_seed(RANDOM_SEED)
print(f"Random seed set to: {RANDOM_SEED}")

# Download necessary NLTK resources
# (presumably the WordNet data that nltk's meteor_score relies on — confirm)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)

# Load the ViDia2Std train/dev/test CSV splits from the Hugging Face Hub and
# drop every row that has a missing value.
train_df = pd.read_csv("https://huggingface.co/datasets/Biu3010/ViDia2Std/resolve/main/train.csv").dropna()
valid_df = pd.read_csv("https://huggingface.co/datasets/Biu3010/ViDia2Std/resolve/main/dev.csv").dropna()
test_df = pd.read_csv("https://huggingface.co/datasets/Biu3010/ViDia2Std/resolve/main/test.csv").dropna()
# Wrap the frames as `datasets.Dataset` objects so they can be mapped and fed to the trainer.
train_dataset = Dataset.from_pandas(train_df)
valid_dataset = Dataset.from_pandas(valid_df)
test_dataset = Dataset.from_pandas(test_df)

# Checkpoint to fine-tune; the trailing list names the alternatives evaluated in this notebook.
model_name = "facebook/mbart-large-50" # [bmd1905/vietnamese-correction-v2, vinai/bartpho-syllable-base, vinai/bartpho-word-base, VietAI/vit5-base, facebook/mbart-large-50]
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Move model to device (GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
print(f"Model loaded on device: {device}")

# Token limit applied to both source and target sequences in preprocess_function.
max_length = 50

def preprocess_function(examples):
    """Tokenize a batch of dialect/standard sentence pairs for seq2seq training.

    Args:
        examples: Batch mapping with a "dialect" list (source sentences) and a
            "standard" list (target sentences).

    Returns:
        The tokenizer output for the source sentences with an added "labels"
        entry holding the target token ids. Both are truncated/padded to
        ``max_length``; padded label positions are set to -100.
    """
    # Set the language codes BEFORE tokenizing anything: assigning them after
    # the source call left the first batch encoded with the tokenizer's default
    # source language.
    tokenizer.src_lang = "vi_VN"
    tokenizer.tgt_lang = "vi_VN"

    # `text_target` tokenizes the targets in target mode and stores them under
    # "labels"; it replaces the deprecated `as_target_tokenizer()` context.
    model_inputs = tokenizer(
        examples["dialect"],
        text_target=examples["standard"],
        max_length=max_length,
        truncation=True,
        padding="max_length",
    )

    # Labels are padded to a fixed length here, so the data collator has nothing
    # left to pad; mask the padding ourselves so the loss ignores it.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in label_ids]
        for label_ids in model_inputs["labels"]
    ]
    return model_inputs

# Tokenize every split in batches with preprocess_function.
tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_valid = valid_dataset.map(preprocess_function, batched=True)
tokenized_test = test_dataset.map(preprocess_function, batched=True)

# Batches examples for the trainer; the model is passed in alongside the tokenizer.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Load metrics
rouge = evaluate.load("rouge")
bleu = evaluate.load("bleu")
# NOTE(review): `meteor` is not referenced by compute_metrics in the visible
# code, which calls nltk's meteor_score directly.
meteor = evaluate.load("meteor")

def compute_metrics(eval_preds):
    """Decode predictions and references and score them with five metrics.

    Args:
        eval_preds: ``(predictions, labels)`` pair of token-id arrays. The
            predictions may arrive as a tuple, in which case its first element
            is used.

    Returns:
        Dict with "rouge_l", "bleu", "meteor", "wer" and "cer", rounded to four
        decimals. A metric that cannot be computed is reported as NaN (the
        error is printed) instead of aborting evaluation.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 marks ignored positions and cannot be decoded; map it to the pad id.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    # Strip surrounding whitespace so decoding artefacts are not counted as
    # character/word errors.
    decoded_preds = [text.strip() for text in tokenizer.batch_decode(preds, skip_special_tokens=True)]
    decoded_labels = [text.strip() for text in tokenizer.batch_decode(labels, skip_special_tokens=True)]

    result = {}

    # ROUGE-L. The metric's default tokenizer keeps only [a-z0-9] characters,
    # which breaks Vietnamese words apart at every diacritic, and its stemmer
    # is English-only. Tokenize on whitespace instead (as BLEU fallback,
    # METEOR and WER below already do).
    try:
        rouge_result = rouge.compute(
            predictions=decoded_preds,
            references=decoded_labels,
            tokenizer=lambda text: text.split(),
        )
        result["rouge_l"] = rouge_result["rougeL"]
    except Exception as e:
        print(f"ROUGE calculation error: {e}")
        result["rouge_l"] = float('nan')

    # BLEU (corpus level); one reference string per prediction.
    try:
        bleu_result = bleu.compute(
            predictions=decoded_preds,
            references=decoded_labels
        )
        result["bleu"] = bleu_result["bleu"]
    except Exception as e:
        print(f"BLEU calculation error: {e}")
        # Fallback: mean smoothed sentence-level BLEU from NLTK.
        try:
            bleu_scores = []
            smoothing_function = SmoothingFunction().method1
            for pred, label in zip(decoded_preds, decoded_labels):
                pred_tokens = pred.split()
                label_tokens = label.split()
                if len(label_tokens) > 0:
                    score = sentence_bleu([label_tokens], pred_tokens, smoothing_function=smoothing_function)
                    bleu_scores.append(score)
            result["bleu"] = np.mean(bleu_scores) if bleu_scores else float('nan')
        except Exception as e2:
            print(f"Manual BLEU calculation also failed: {e2}")
            result["bleu"] = float('nan')

    # METEOR: mean of sentence-level scores over pairs with a non-empty reference.
    try:
        meteor_scores = []
        for pred, label in zip(decoded_preds, decoded_labels):
            # meteor_score expects pre-tokenized input
            pred_tokens = pred.split()
            label_tokens = label.split()
            if len(label_tokens) > 0:  # Avoid empty references
                meteor_scores.append(meteor_score([label_tokens], pred_tokens))

        result["meteor"] = np.mean(meteor_scores) if meteor_scores else float('nan')
    except Exception as e:
        print(f"METEOR calculation error: {e}")
        result["meteor"] = float('nan')

    # WER: mean of per-sentence word error rates; empty references are skipped.
    try:
        valid_pairs = [(ref, pred) for ref, pred in zip(decoded_labels, decoded_preds) if len(ref.strip()) > 0]

        if valid_pairs:
            wer_scores = [calculate_wer(ref, pred) for ref, pred in valid_pairs]
            result["wer"] = np.mean(wer_scores)
        else:
            result["wer"] = float('nan')
    except Exception as e:
        print(f"WER calculation error: {e}")
        result["wer"] = float('nan')

    def calculate_cer(ref, pred):
        """Character edit distance divided by the reference length.

        An empty reference scores 1.0 against a non-empty prediction and 0.0
        against an empty one.
        """
        if len(ref) == 0:
            return 1.0 if len(pred) > 0 else 0.0
        return editdistance.eval(ref, pred) / len(ref)

    # CER: mean of per-sentence character error rates.
    try:
        cer_scores = [calculate_cer(ref, pred) for ref, pred in zip(decoded_labels, decoded_preds)]
        result["cer"] = np.mean(cer_scores)
    except Exception as e:
        print(f"CER calculation error: {e}")
        result["cer"] = float('nan')

    # Round for readability; NaN placeholders are passed through untouched.
    return {k: v if isinstance(v, float) and np.isnan(v) else round(v, 4) for k, v in result.items()}

# Training configuration: evaluate and checkpoint once per epoch and track
# validation BLEU ("eval_bleu", produced by compute_metrics) as the model-selection metric.
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=10,
    predict_with_generate=True,
    fp16=torch.cuda.is_available(),  # Mixed precision only when a GPU is present
    logging_steps=50,
    report_to="none",
    metric_for_best_model="eval_bleu",  # Primary metric for early stopping decision
    greater_is_better=True,  # Higher BLEU scores are better
    save_strategy="epoch",  # Save model after each epoch
    load_best_model_at_end=True,  # Load the best performing model at the end
    eval_accumulation_steps=1,
    save_steps=500,  # NOTE(review): presumably ignored while save_strategy="epoch" — confirm
    logging_first_step=True,  # Log the first training step
    dataloader_pin_memory=torch.cuda.is_available(),  # Performance optimization
    seed=RANDOM_SEED,  # Set seed for data shuffling and training
    data_seed=RANDOM_SEED,  # Set seed for data sampling
)

# Early stopping on the metric named by `metric_for_best_model` in training_args.
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=1,  # Stop after 1 evaluation without sufficient improvement
    early_stopping_threshold=0.01  # Minimum improvement threshold
)

# Assemble the trainer: train on the tokenized training split, evaluate on the
# validation split with compute_metrics, and attach the early-stopping callback.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping_callback]
)

# Print the run configuration before training starts
print("Starting training with early stopping...")
print(f"Maximum epochs: {training_args.num_train_epochs}")
print(f"Early stopping patience: {early_stopping_callback.early_stopping_patience}")
print(f"Monitoring metric: {training_args.metric_for_best_model}")

In [None]:
# Train the model (up to num_train_epochs, subject to the early-stopping callback)
trainer.train()

In [None]:
# Score the held-out test split; metric keys are prefixed with "test".
test_results = trainer.evaluate(tokenized_test, metric_key_prefix="test")
print(f"Test results: {test_results}")

# Function for inference on new text
def normalize_text(text, max_length=50):
    """Convert dialect text to its standard form with the notebook's model.

    Args:
        text: Input sentence (or list of sentences; only the first generated
            sequence is decoded and returned).
        max_length: Token limit used both for the encoded input and for the
            generated output.

    Returns:
        The decoded prediction with special tokens removed.
    """
    # Tokenize input text
    inputs = tokenizer(text, return_tensors="pt", max_length=max_length, truncation=True, padding="max_length")

    # Drop token_type_ids if the tokenizer produced them, exactly as translate()
    # does: they are not a generation input and would be forwarded to
    # generate() otherwise.
    inputs.pop("token_type_ids", None)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    # Generate prediction with deterministic settings
    output = model.generate(
        **inputs,
        max_length=max_length,
        do_sample=False,  # Use greedy decoding for reproducibility
        num_beams=1,  # No beam search for deterministic results
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )

    # Decode prediction
    normalized_text = tokenizer.decode(output[0], skip_special_tokens=True)
    return normalized_text

# Write the trainer's model and the tokenizer into the same directory.
# With load_best_model_at_end=True in training_args this is presumably the
# best checkpoint rather than the last one — confirm.
model_save_path = "./best_model"
trainer.save_model(model_save_path)
tokenizer.save_pretrained(model_save_path)
print(f"Model saved to {model_save_path}")

In [None]:
def translate(text, max_length=50):
    """Run greedy generation on ``text`` and return the decoded output.

    Args:
        text: Input to tokenize and feed to the model.
        max_length: Token limit for both the padded/truncated input and the
            generated sequence.

    Returns:
        The first generated sequence decoded to a string, special tokens
        skipped.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )

    # Keep everything except token_type_ids and move it to the model's device.
    model_inputs = {
        name: tensor.to(model.device)
        for name, tensor in encoded.items()
        if name != "token_type_ids"
    }

    # Greedy, single-beam decoding keeps the output reproducible.
    decoding_options = {
        "max_length": max_length,
        "do_sample": False,
        "num_beams": 1,
        "pad_token_id": tokenizer.pad_token_id,
        "eos_token_id": tokenizer.eos_token_id,
    }
    generated = model.generate(**model_inputs, **decoding_options)

    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

In [None]:
# Smoke test: translate one dialect sentence with the model currently in memory.
print(translate("Cấy mấn ni nỏ hở rứa mô, hấn có phần lót bên trong là áo da màu nude mà"))