# AMT - AUTOMATIC MACHINE TRANSLATION

@alessioborgi

### 0: IMPORTING LIBRARIES

In [8]:
!pip install datasets huggingface-hub pandas transformers datasets tiktoken protobuf sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.2.0-cp39-cp39-macosx_10_9_x86_64.whl.metadata (7.7 kB)
Downloading sentencepiece-0.2.0-cp39-cp39-macosx_10_9_x86_64.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m811.5 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.2.0


In [11]:
# Importing libraries for step 1).
import os
import pandas as pd
from huggingface_hub import login
from datasets import load_dataset

# Importing libraries for step 2).
from transformers import MBartForConditionalGeneration, MBart50Tokenizer

### 1: LOADING THE DATASET

#### 1.1: PUSH THE DATASET TO HUGGING-FACE

In [5]:
def upload_to_hf_dataset(
    hf_token: str,
    data_file_path: str,
    repo_name: str,
    file_format: str = "csv",
    split_name: str = "test",
):
    """
    Uploads a local file as a Hugging Face Dataset.

    Args:
        hf_token: Your Hugging Face access token.
        data_file_path: Path to the local data file.
        repo_name: The target repo on HF (e.g. "username/my-dataset").
        file_format: Loader to use, e.g. "csv", "json" or "tsv". Default "csv".
            "tsv" is handled as tab-separated CSV.
        split_name: Name of the dataset split (e.g. "train", "test"). Default "test".

    Raises:
        FileNotFoundError: If `data_file_path` is a plain local path that does not
            point to an existing file.
    """
    # 0) Fail fast, before any network call, when a plain local path is wrong.
    #    URLs and glob patterns are left for `load_dataset` to resolve.
    is_plain_local_path = "://" not in data_file_path and not any(
        ch in data_file_path for ch in "*?["
    )
    if is_plain_local_path and not os.path.isfile(data_file_path):
        raise FileNotFoundError(f"Data file not found: {data_file_path}")

    # "tsv" is not a loader name of its own; read it with the CSV loader
    # and a tab separator instead.
    builder_name = file_format
    builder_kwargs = {}
    if file_format.lower() == "tsv":
        builder_name = "csv"
        builder_kwargs = {"sep": "\t"}

    # 1) Authenticate to HuggingFace.
    login(token=hf_token)

    # 2) Load local file.
    data_files = {split_name: data_file_path}
    dataset = load_dataset(builder_name, data_files=data_files, **builder_kwargs)

    # 3) Push to Hub.
    dataset.push_to_hub(repo_name, token=hf_token)
    print(f"Dataset available at https://huggingface.co/datasets/{repo_name}")

In [6]:
# SECURITY: never store an access token in the notebook. A token that has been
# committed in a notebook must be treated as leaked: revoke it in the Hugging Face
# account settings and create a new one.
hf_token = os.environ.get("HF_TOKEN")
if not hf_token:
    raise RuntimeError("Set the HF_TOKEN environment variable to a Hugging Face write token.")

# The data file location can be overridden through AMT_DATA_FILE so the notebook
# also runs on other machines; the default keeps the original location.
local_path = os.environ.get(
    "AMT_DATA_FILE",
    "/Users/alessioborgi/GitHub/AMT-AutomaticMachineTranslation/test_data/dataset_cleaned.csv",
)
repo_name  = "Alessio-Borgi/archaic-italian-cleaned-test"

upload_to_hf_dataset(
    hf_token=hf_token,
    data_file_path=local_path,
    repo_name=repo_name,
    file_format="csv",
    split_name="test",
)

Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 134.88ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  1.59it/s]
No files have been modified since last commit. Skipping to prevent empty commit.


Dataset available at https://huggingface.co/datasets/Alessio-Borgi/archaic-italian-cleaned-test


#### 1.2: LOADING DATASET FROM HUGGING-FACE

In [3]:
# Pull the published dataset back from the Hub (no split requested, so all splits are returned).
ds = load_dataset(path="Alessio-Borgi/archaic-italian-cleaned-test")

In [4]:
# Display the loaded object: its splits, feature names and row counts.
ds

DatasetDict({
    test: Dataset({
        features: ['Author', 'Date', 'Region', 'Sentence'],
        num_rows: 97
    })
})

#### 1.3: EXPLORING THE TEST DATASET

In [25]:
def explore_dataset(dataset_name, split="test", text_column="Sentence"):
    '''
    Print a quick overview of one split of a Hugging Face dataset.

    Args:
        dataset_name: Name of the dataset to load (passed to `load_dataset`).
        split: Split to inspect. Default "test".
        text_column: Column holding the sentences. Default "Sentence".

    Raises:
        KeyError: If the split or the text column does not exist.
    '''

    # Loading the dataset.
    ds = load_dataset(dataset_name)
    if split not in ds:
        raise KeyError(f"Split {split!r} not found; available splits: {list(ds.keys())}")
    df = pd.DataFrame(ds[split])
    if text_column not in df.columns:
        raise KeyError(f"Column {text_column!r} not found; available columns: {df.columns.tolist()}")

    # 1) Number of examples.
    print("Number of examples:", len(df))

    # 2) Preview first 5 examples.
    print("First 5 examples:")
    print(df.head(5), "\n")

    # 3) Sentence-length statistics (whitespace tokens).
    #    Missing or non-string sentences count as 0 tokens instead of crashing.
    df["length_tokens"] = df[text_column].map(
        lambda sentence: len(sentence.split()) if isinstance(sentence, str) else 0
    )
    print("Sentence length (tokens) stats:")
    print(df["length_tokens"].describe(), "\n")

    # 4) Print the column names (including the helper column added above).
    print("Column names:", df.columns.tolist(), "\n")

In [26]:
# Explore the dataset: prints the number of examples, a 5-row preview,
# sentence-length statistics and the column names of the "test" split.
explore_dataset(dataset_name="Alessio-Borgi/archaic-italian-cleaned-test")

Number of examples: 97
First 5 examples:
                        Author     Date Region  \
0              Brunetto Latini  1260-61  fior.   
1                Bono Giamboni     1292  fior.   
2     Valerio Massimo (red. V1     1336  fior.   
3  Lucano volg. (ed. Marinoni)  1330/40  prat.   
4              Brunetto Latini  1260-61  fior.   

                                            Sentence  
0  quella guerra ben fatta l' opera perché etc. E...  
1  crudele, e di tutte le colpe pigli vendetta, c...  
2  Non d' altra forza d' animo fue ornato Ponzio ...  
3  Se questo piace a tutti e se 'l tempo hae biso...  
4  Officio di questa arte pare che sia dicere app...   

Sentence length (tokens) stats:
count    97.000000
mean     20.041237
std       5.996384
min       6.000000
25%      16.000000
50%      20.000000
75%      24.000000
max      31.000000
Name: length_tokens, dtype: float64 

Column names: ['Author', 'Date', 'Region', 'Sentence', 'length_tokens'] 



### 2: AMT - TRANSFORMER-BASED 

#### 2.1: mBART (MULTILINGUAL BART)

**ARCHITECTURE & SIZE** 
This Transformer-based solution consists of a 12-layer encoder and a 12-layer decoder (≈610 M parameters).

**DESCRIPTION**
- **Pretraining**: It has been pretrained via Denoising auto-encoding on monolingual corpora in 50 languages (mBART-50).
- **Multilingual MT**: It has been fine-tuned on many-to-many bitext and supports direct “it→it” by forcing Italian as both source & target.

**REFERENCE INFORMATION**
- Hugging-Face Reference page: https://huggingface.co/docs/transformers/model_doc/mbart 
- Specific Model employed: *facebook/mbart-large-50-many-to-many-mmt*


In [12]:
# 1) Loading mBART-50 Model & Tokenizer.
model_name = "facebook/mbart-large-50-many-to-many-mmt"

# Tokenizer first; the source language is set to Italian right away.
mBART_tokenizer = MBart50Tokenizer.from_pretrained(model_name)
mBART_tokenizer.src_lang = "it_IT"

# Model weights from the same checkpoint.
mBART_model = MBartForConditionalGeneration.from_pretrained(model_name)

# 2) Batched translation function.
def modernize_mbart(sentences, batch_size=8):
    '''
    Translate sentences with mBART, one batch at a time.

    Args:
        sentences: Sliceable sequence of input sentences.
        batch_size: Number of sentences per generation call. Default 8.

    Returns:
        List with the decoded output for every input sentence, in input order.
    '''
    results = []

    for start in range(0, len(sentences), batch_size):
        # Slice out the current batch and turn it into padded model inputs.
        chunk = sentences[start : start + batch_size]
        encoded = mBART_tokenizer(chunk, return_tensors="pt", padding=True, truncation=True)

        # Italian is forced as the first generated token (target language).
        italian_bos = mBART_tokenizer.lang_code_to_id["it_IT"]
        generated = mBART_model.generate(
            **encoded,
            forced_bos_token_id=italian_bos,
            max_length=512,
        )

        # Back to plain text, without special tokens.
        decoded = mBART_tokenizer.batch_decode(generated, skip_special_tokens=True)
        results += decoded

    return results

# 3) Run on the test split. `ds` is a DatasetDict, so the split has to be selected
#    first; the archaic sentences are stored in the "Sentence" column.
arch_sentences = list(ds["test"]["Sentence"])
mbart_outputs = modernize_mbart(arch_sentences)

# 4) Attach back to the dataset the translations (add_column is a method of the split,
#    not of the DatasetDict).
translated_ds_mbart = ds["test"].add_column("mbart_translation", mbart_outputs)


ImportError: 
MBart50Tokenizer requires the SentencePiece library but it was not found in your environment. Checkout the instructions on the
installation page of its repo: https://github.com/google/sentencepiece#installation and follow the ones
that match your environment. Please note that you may need to restart your runtime after installation.


#### 2.2: NLLB-200 (NO LANGUAGE LEFT BEHIND)

In [None]:
# Cell 2: Translate with NLLB-200-3.3B

from datasets import load_dataset
# AutoTokenizer picks the tokenizer class that belongs to the NLLB checkpoint.
# M2M100Tokenizer is a different tokenizer (it does not know codes such as "ita_Latn");
# its import is kept only so that other cells relying on the name keep working.
from transformers import AutoTokenizer, M2M100ForConditionalGeneration, M2M100Tokenizer

# 1) Load the test dataset
ds = load_dataset("Alessio-Borgi/archaic-italian-cleaned-test")

# 2) Load NLLB-200-3.3B model & tokenizer
model_name = "facebook/nllb-200-3.3B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = M2M100ForConditionalGeneration.from_pretrained(model_name)
# Source language: Italian (NLLB language code).
tokenizer.src_lang = "ita_Latn"

# 3) Batched translation function
def modernize_nllb(sentences, batch_size=8):
    '''
    Translate sentences with the NLLB model, one batch at a time.

    Args:
        sentences: Sliceable sequence of input sentences.
        batch_size: Number of sentences per generation call. Default 8.

    Returns:
        List with the decoded output for every input sentence, in input order.
    '''
    # Look the target-language token up in the vocabulary. The `lang_code_to_id`
    # mapping is not available on every tokenizer class / transformers version,
    # while `convert_tokens_to_ids` is part of the common tokenizer interface.
    target_lang_id = tokenizer.convert_tokens_to_ids("ita_Latn")

    translations = []
    for i in range(0, len(sentences), batch_size):
        # Create and tokenize the current batch.
        batch = sentences[i : i + batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True)

        # Generate with Italian forced as the first (target-language) token.
        gen = model.generate(
            **inputs,
            forced_bos_token_id=target_lang_id,
            max_length=512,
        )

        # Decode the generated tokens and collect the results.
        translations.extend(tokenizer.batch_decode(gen, skip_special_tokens=True))
    return translations

# 4) Run on the test split; the archaic sentences are stored in the "Sentence" column
#    (the dataset has no "text" column).
arch_sentences = list(ds["test"]["Sentence"])
nllb_outputs = modernize_nllb(arch_sentences)

# 5) (Optional) Attach back to the dataset
translated_ds_nllb = ds["test"].add_column("nllb_translation", nllb_outputs)
