In [None]:
%pip install transformers torch
%pip install --upgrade pip setuptools wheel


Collecting transformers
  Using cached transformers-4.52.3-py3-none-any.whl.metadata (40 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Using cached tokenizers-0.21.1-cp39-abi3-macosx_11_0_arm64.whl.metadata (6.8 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Using cached safetensors-0.5.3-cp38-abi3-macosx_11_0_arm64.whl.metadata (3.8 kB)
Using cached transformers-4.52.3-py3-none-any.whl (10.5 MB)
Using cached safetensors-0.5.3-cp38-abi3-macosx_11_0_arm64.whl (418 kB)
Using cached tokenizers-0.21.1-cp39-abi3-macosx_11_0_arm64.whl (2.7 MB)
Installing collected packages: safetensors, tokenizers, transformers
Successfully installed safetensors-0.5.3 tokenizers-0.21.1 transformers-4.52.3

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart t

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
from datasets import load_dataset
from tqdm import tqdm
import logging

# Configure logging: emit INFO-level (and more severe) records so the progress
# and error messages logged by the functions below show up in the cell output.
logging.basicConfig(level=logging.INFO)
# Module-level logger used by every function defined in this cell.
logger = logging.getLogger(__name__)

def load_and_prepare_data(dataset_name="gsarti/flores_101", language="eng", split="devtest"):
    """Load one split of a dataset and keep only its sentences.

    Args:
        dataset_name: Dataset identifier handed to ``load_dataset``.
        language: Passed to ``load_dataset`` as the configuration ``name``.
        split: Name of the split to load.

    Returns:
        A list with one ``{"eng": <sentence>}`` dictionary per dataset entry,
        built from each entry's ``"sentence"`` field. The key is always
        ``"eng"``, whatever ``language`` was requested.

    Raises:
        Exception: Any failure while loading or converting is logged and
            re-raised unchanged.
    """
    try:
        records = load_dataset(dataset_name, name=language, split=split, trust_remote_code=True)
        prepared = []
        for record in records:
            prepared.append({"eng": record["sentence"]})
        return prepared
    except Exception as err:
        logger.error(f"Error loading dataset: {err}")
        raise

def initialize_model(model_name="facebook/nllb-200-distilled-600M"):
    """Load the tokenizer and seq2seq model for ``model_name``.

    The model is moved to the CUDA device when one is available and to the
    CPU otherwise.

    Args:
        model_name: Identifier passed to both ``from_pretrained`` calls.

    Returns:
        A ``(tokenizer, model, device)`` tuple, where ``model`` is the result
        of moving the loaded model to ``device``.

    Raises:
        Exception: Any failure during loading is logged and re-raised unchanged.
    """
    try:
        nllb_tokenizer = AutoTokenizer.from_pretrained(model_name)
        nllb_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        if torch.cuda.is_available():
            target_device = torch.device("cuda")
        else:
            target_device = torch.device("cpu")
        return nllb_tokenizer, nllb_model.to(target_device), target_device
    except Exception as err:
        logger.error(f"Error initializing model: {err}")
        raise

def translate_english_to_zulu(ref_sentences, output_file="flores101.hyp.txt",
                              target_lang="zul_Latn", batch_size=1):
    """
    Translate English sentences with the NLLB-200 model and write exactly one
    translation per line to ``output_file``.

    Args:
        ref_sentences: Sized iterable of dictionaries with "eng" keys containing sentences
        output_file: Path to save the translations (opened in "w" mode, UTF-8)
        target_lang: NLLB language code to translate into; the default
            "zul_Latn" is Zulu.
        batch_size: Number of sentences tokenized and passed to one
            ``model.generate`` call. The default of 1 keeps the
            sentence-at-a-time behaviour; larger values mean fewer calls.
            The progress bar counts batches, not sentences.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1, or ``target_lang`` is
            not a token known to the tokenizer.
        Exception: Any other failure is logged and re-raised unchanged.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    try:
        tokenizer, model, device = initialize_model()

        # Get the target language token ID; it is forced as the first generated
        # token so the model decodes into that language.
        forced_bos_token_id = tokenizer.convert_tokens_to_ids(target_lang)
        # An unrecognised code would otherwise map to the unknown-token id and
        # silently produce output in an unintended language.
        if forced_bos_token_id is None or forced_bos_token_id == getattr(tokenizer, "unk_token_id", None):
            raise ValueError(f"Unknown target language code: {target_lang!r}")

        source_texts = [entry["eng"] for entry in ref_sentences]
        logger.info(f"Starting translation of {len(source_texts)} sentences...")

        with open(output_file, "w", encoding="utf-8") as f:
            for start in tqdm(range(0, len(source_texts), batch_size), desc="Translating"):
                batch_texts = source_texts[start:start + batch_size]

                # Tokenize the batch; padding aligns sentences of different length
                inputs = tokenizer(
                    batch_texts,
                    return_tensors="pt",
                    padding=True,
                    truncation=True
                ).to(device)

                # Generate translations with target language specification
                with torch.no_grad():
                    translated = model.generate(
                        **inputs,
                        forced_bos_token_id=forced_bos_token_id
                    )

                for sequence in translated:
                    decoded = tokenizer.decode(sequence, skip_special_tokens=True)
                    # Collapse all whitespace (including embedded line breaks) so
                    # each hypothesis occupies exactly one line and the file stays
                    # aligned line-by-line with the input sentences.
                    f.write(" ".join(decoded.split()) + "\n")

        logger.info(f"Translation completed. Results saved to {output_file}")
    except Exception as e:
        logger.error(f"Error during translation: {e}")
        raise

# Entry point: load the sentences and translate them. The log lines emitted by
# this cell carry the logger name "__main__", so the guard is true when the
# cell is executed in the notebook.
if __name__ == "__main__":
    # Load data: with the default arguments this returns a list of
    # {"eng": <sentence>} dictionaries.
    ref_sentences = load_and_prepare_data()
    
    # Perform translation: writes one translated line per sentence to the
    # function's default output file.
    translate_english_to_zulu(ref_sentences)


INFO:__main__:Starting translation of 1012 sentences...
Translating: 100%|██████████| 1012/1012 [19:10<00:00,  1.14s/it]
INFO:__main__:Translation completed. Results saved to flores101.hyp.txt
