<a href="https://colab.research.google.com/github/bukhosi-eugene-mpande/nlp-project/blob/eugene.mpande%2Ftasks%2Fcombination/evaluation_flores101.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%pip install datasets>=2.15.0
%pip install transformers>=4.36.0
%pip install torch>=2.1.0
%pip install sacrebleu>=2.3.1
%pip install tqdm>=4.66.1
%pip install numpy>=1.24.0
%pip install regex>=2023.10.3
%pip install accelerate>=0.26.0
%pip install bert-score>=0.3.13
%pip install sentence-transformers>=2.2.2

In [2]:
from datasets import load_dataset
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from tqdm import tqdm
import logging
from sacrebleu import corpus_bleu
import os
import json
from sacrebleu.metrics import BLEU, CHRF
from bert_score import score as bert_score
from sentence_transformers import SentenceTransformer
import numpy as np
from collections import Counter
import re

In [3]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("Using CUDA (NVIDIA GPU) for acceleration")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
    print("Using MPS (Metal Performance Shaders) for acceleration")
else:
    device = torch.device("cpu")
    print("No GPU acceleration available, using CPU")

Using CUDA (NVIDIA GPU) for acceleration


In [4]:
def load_and_prepare_data(target_lang, split="dev"):
    """Load and prepare the FLORES-101 dataset for the specified language."""
    try:
        dataset_name = "facebook/flores"
        source_lang = "eng_Latn"

        # Map language codes to their full format with script
        lang_code_map = {
            "hau": "hau_Latn",
            "nso": "nso_Latn",
            "zul": "zul_Latn"
        }

        target_lang_full = lang_code_map.get(target_lang)
        if not target_lang_full:
            raise ValueError(f"Unsupported target language code: {target_lang}")

        # Load source and target datasets
        source_dataset = load_dataset(dataset_name, name=source_lang, split=split, trust_remote_code=True)
        target_dataset = load_dataset(dataset_name, name=target_lang_full, split=split, trust_remote_code=True)

        # Create training pairs
        training_data = {
            "input_text": [src["sentence"] for src in source_dataset],
            "target_text": [tgt["sentence"] for tgt in target_dataset]
        }

        # Convert to HuggingFace Dataset
        from datasets import Dataset
        return Dataset.from_dict(training_data)
    except Exception as e:
        print(f"Error loading dataset: {e}")
        raise


In [5]:
def initialize_model(model_path="facebook/nllb-200-distilled-600M"):
    """Initialize the NLLB model and tokenizer."""
    try:
        # Check for MPS (Metal Performance Shaders) availability
        if torch.cuda.is_available():
            device = torch.device("cuda")
            print("Using CUDA (NVIDIA GPU) for acceleration")
        elif torch.backends.mps.is_available():
            device = torch.device("mps")
            print("Using MPS (Metal Performance Shaders) for acceleration")
        else:
            device = torch.device("cpu")
            print("No GPU acceleration available, using CPU")

        # Load model and tokenizer
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_path).to(device)

        return tokenizer, model, device
    except Exception as e:
        print(f"Error initializing model: {e}")
        raise


In [6]:
def translate_english_to_target_lang(model, tokenizer, device, ref_sentences, output_file, target_lang_code, batch_size=16):
    """Translate English sentences to target language using the trained NLLB model."""
    try:
        # Map language codes to NLLB language codes
        nllb_lang_codes = {
            "hau": "hau_Latn",
            "nso": "nso_Latn",
            "zul": "zul_Latn"
        }

        target_lang = nllb_lang_codes.get(target_lang_code)
        if not target_lang:
            raise ValueError(f"Unsupported target language code: {target_lang_code}")

        forced_bos_token_id = tokenizer.convert_tokens_to_ids(target_lang)

        print(f"Starting translation to {target_lang}...")

        # Prepare all sentences for batch processing
        all_sentences = ref_sentences["input_text"]

        # Ensure output directory exists
        os.makedirs(os.path.dirname(output_file), exist_ok=True)

        with open(output_file, "w", encoding="utf-8") as f:
            # Process in batches
            for i in tqdm(range(0, len(all_sentences), batch_size), desc=f"Translating to {target_lang}"):
                batch_sentences = all_sentences[i:i + batch_size]

                # Tokenize batch without length limits
                inputs = tokenizer(
                    batch_sentences,
                    return_tensors="pt",
                    padding=True,
                    truncation=False
                ).to(device)

                with torch.no_grad():
                    translated = model.generate(
                        **inputs,
                        forced_bos_token_id=forced_bos_token_id,
                        num_beams=4,
                        early_stopping=True
                    )

                # Decode and write batch results
                decoded = tokenizer.batch_decode(translated, skip_special_tokens=True)
                for translation in decoded:
                    f.write(translation.strip() + "\n")

        print(f"Translation completed. Results saved to {output_file}")
    except Exception as e:
        print(f"Error during translation: {e}")
        raise


In [7]:
def calculate_metrics(hypotheses, references, lang):
    """Calculate various translation metrics."""
    metrics = {}

    try:
        # BLEU score
        bleu = BLEU()
        metrics['BLEU'] = bleu.corpus_score(hypotheses, [references]).score

        # chrF score
        chrf = CHRF()
        metrics['chrF'] = chrf.corpus_score(hypotheses, [references]).score

        # BERTScore
        P, R, F1 = bert_score(hypotheses, references, lang=lang, device='mps' if torch.backends.mps.is_available() else 'cpu')
        metrics['BERTScore'] = F1.mean().item()

        # Semantic Similarity Score (replacing COMET)
        model = SentenceTransformer('all-MiniLM-L6-v2')
        hyp_embeddings = model.encode(hypotheses, convert_to_tensor=True)
        ref_embeddings = model.encode(references, convert_to_tensor=True)
        similarity = torch.nn.functional.cosine_similarity(hyp_embeddings, ref_embeddings)
        metrics['Semantic_Score'] = similarity.mean().item()

        # Error Type Frequency Distribution
        error_types = analyze_errors(hypotheses, references)
        metrics['Error_Distribution'] = error_types

    except Exception as e:
        print(f"Error calculating metrics: {e}")
        metrics['error'] = str(e)

    return metrics


In [8]:
def analyze_errors(hypotheses, references):
    """Analyze translation errors and their distribution."""
    error_types = Counter()

    for hyp, ref in zip(hypotheses, references):
        # Word order errors
        hyp_words = set(hyp.split())
        ref_words = set(ref.split())
        if hyp_words == ref_words and hyp != ref:
            error_types['word_order'] += 1

        # Missing words
        missing = ref_words - hyp_words
        if missing:
            error_types['missing_words'] += len(missing)

        # Extra words
        extra = hyp_words - ref_words
        if extra:
            error_types['extra_words'] += len(extra)

        # Case errors
        if hyp.lower() == ref.lower() and hyp != ref:
            error_types['case_errors'] += 1

        # Punctuation errors
        hyp_no_punct = re.sub(r'[^\w\s]', '', hyp)
        ref_no_punct = re.sub(r'[^\w\s]', '', ref)
        if hyp_no_punct == ref_no_punct and hyp != ref:
            error_types['punctuation_errors'] += 1

    return dict(error_types)


In [9]:
    # Configuration
    languages = {
        "hausa": "hau",
        "northern-sotho": "nso",
        "zulu": "zul"
    }

    model_name = "nllb-200-distilled-600M"
    model_path = "facebook/nllb-200-distilled-600M"
    output_base_dir = "output"

    # Initialize model and tokenizer
    tokenizer, model, device = initialize_model(model_path)

    # Translate for each language
    for lang, code in languages.items():
        output_file = f"{output_base_dir}/{model_name}/flores101.{lang}.hyp.txt"

        # Check if translation already exists
        if os.path.exists(output_file) and os.path.getsize(output_file) > 0:
            print(f"Translation for {lang} already exists. Skipping translation...")
        else:
            print(f"Translating to {lang}...")
            # Use devtest split for testing
            test_data = load_and_prepare_data(target_lang=code, split="devtest")
            translate_english_to_target_lang(model, tokenizer, device, test_data, output_file, code)

        # Calculate metrics
        try:
            with open(output_file, "r", encoding="utf-8") as f:
                hyps = [line.strip() for line in f]

            with open(f"{lang}/flores101.{lang}.ref.test.txt", "r", encoding="utf-8") as f:
                refs = [line.strip() for line in f]

            metrics = calculate_metrics(hyps, refs, lang)

            print(f"\nMetrics for {lang}:")
            print("-" * 30)
            for metric_name, value in metrics.items():
                if metric_name != 'Error_Distribution':
                    print(f"{metric_name:15}: {value:.4f}")

            if 'Error_Distribution' in metrics:
                print("\nError Distribution:")
                for error_type, count in metrics['Error_Distribution'].items():
                    print(f"{error_type:20}: {count}")

        except Exception as e:
            print(f"Error calculating metrics for {lang}: {e}")
            print(f"Could not calculate metrics for {lang}")

    # Print summary of all metrics
    print("\nSummary of all metrics:")
    print("-" * 60)
    print(f"{'Language':15} {'BLEU':>8} {'chrF':>8} {'BERTScore':>10} {'Semantic':>8}")
    print("-" * 60)

    for lang in languages:
        try:
            output_file = f"{output_base_dir}/{model_name}/flores101.{lang}.hyp.txt"
            with open(output_file, "r", encoding="utf-8") as f:
                hyps = [line.strip() for line in f]

            with open(f"{lang}/flores101.{lang}.ref.test.txt", "r", encoding="utf-8") as f:
                refs = [line.strip() for line in f]

            metrics = calculate_metrics(hyps, refs, lang)
            print(f"{lang:15} {metrics['BLEU']:8.2f} {metrics['chrF']:8.2f} {metrics['BERTScore']:10.2f} {metrics['Semantic_Score']:8.2f}")
        except Exception as e:
            print(f"Error calculating metrics for {lang}: {e}")
            print(f"{lang:15} {'Error':>8} {'Error':>8} {'Error':>10} {'Error':>8}")

    print("-" * 60)

Using CUDA (NVIDIA GPU) for acceleration


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

(…)6cea38b9e3d5efcdcb9c251d6b40538e1aab555a:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

(…)b3c438311629547285129b0b81dadabd01bca665:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/3.55k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

(…)1ecdf1e485509035f6b51dfe84f1ada83eefcc42:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

Translating to hausa...


README.md:   0%|          | 0.00/11.8k [00:00<?, ?B/s]

flores.py:   0%|          | 0.00/11.2k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/25.6M [00:00<?, ?B/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Generating devtest split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Generating devtest split: 0 examples [00:00, ? examples/s]

Starting translation to hau_Latn...



Translating to hau_Latn:   0%|          | 0/64 [00:00<?, ?it/s][A
Translating to hau_Latn:   2%|▏         | 1/64 [00:14<15:18, 14.58s/it][A
Translating to hau_Latn:   3%|▎         | 2/64 [00:20<09:53,  9.57s/it][A
Translating to hau_Latn:   5%|▍         | 3/64 [00:23<06:29,  6.38s/it][A
Translating to hau_Latn:   6%|▋         | 4/64 [00:29<06:11,  6.19s/it][A
Translating to hau_Latn:   8%|▊         | 5/64 [00:32<04:57,  5.04s/it][A
Translating to hau_Latn:   9%|▉         | 6/64 [00:34<03:58,  4.11s/it][A
Translating to hau_Latn:  11%|█         | 7/64 [00:37<03:42,  3.91s/it][A
Translating to hau_Latn:  12%|█▎        | 8/64 [00:40<03:17,  3.53s/it][A
Translating to hau_Latn:  14%|█▍        | 9/64 [00:42<02:52,  3.14s/it][A
Translating to hau_Latn:  16%|█▌        | 10/64 [00:45<02:40,  2.98s/it][A
Translating to hau_Latn:  17%|█▋        | 11/64 [00:50<03:08,  3.55s/it][A
Translating to hau_Latn:  19%|█▉        | 12/64 [00:53<02:50,  3.28s/it][A
Translating to hau_Latn:  20%

Translation completed. Results saved to output/nllb-200-distilled-600M/flores101.hausa.hyp.txt


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]


Metrics for hausa:
------------------------------
BLEU           : 23.7964
chrF           : 51.3562
BERTScore      : 0.8214
Semantic_Score : 0.7891

Error Distribution:
missing_words       : 11561
extra_words         : 10500
Translating to northern-sotho...


Generating dev split: 0 examples [00:00, ? examples/s]

Generating devtest split: 0 examples [00:00, ? examples/s]

Starting translation to nso_Latn...


Translating to nso_Latn: 100%|██████████| 64/64 [04:00<00:00,  3.75s/it]


Translation completed. Results saved to output/nllb-200-distilled-600M/flores101.northern-sotho.hyp.txt
Error calculating metrics for northern-sotho: [Errno 2] No such file or directory: 'northern-sotho/flores101.northern-sotho.ref.test.txt'
Could not calculate metrics for northern-sotho
Translating to zulu...


Generating dev split: 0 examples [00:00, ? examples/s]

Generating devtest split: 0 examples [00:00, ? examples/s]

Starting translation to zul_Latn...


Translating to zul_Latn: 100%|██████████| 64/64 [03:31<00:00,  3.31s/it]


Translation completed. Results saved to output/nllb-200-distilled-600M/flores101.zulu.hyp.txt

Metrics for zulu:
------------------------------
BLEU           : 16.8243
chrF           : 56.2718
BERTScore      : 0.8334
Semantic_Score : 0.7892

Error Distribution:
missing_words       : 9805
extra_words         : 9112

Summary of all metrics:
------------------------------------------------------------
Language            BLEU     chrF  BERTScore Semantic
------------------------------------------------------------
hausa              23.80    51.36       0.82     0.79
Error calculating metrics for northern-sotho: [Errno 2] No such file or directory: 'northern-sotho/flores101.northern-sotho.ref.test.txt'
northern-sotho     Error    Error      Error    Error
zulu               16.82    56.27       0.83     0.79
------------------------------------------------------------
