# AMT - AUTOMATIC MACHINE TRANSLATION

@alessioborgi

### 0: IMPORTING LIBRARIES

In [1]:
!pip install -U datasets bitsandbytes accelerate
!pip install huggingface-hub pandas transformers tiktoken protobuf sentencepiece tqdm

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting accelerate
  Downloading accelerate-1.7.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==

In [13]:
# Importing libraries for step 1).
import os
import torch
import random
import pandas as pd
from tqdm.auto import tqdm
from huggingface_hub import login
from datasets import load_dataset
from huggingface_hub import hf_hub_download


# Importing libraries for step 2).
from transformers import MBartForConditionalGeneration, MBart50Tokenizer, AutoTokenizer, AutoModelForSeq2SeqLM, BitsAndBytesConfig, pipeline, AutoModelForCausalLM

### 1: LOADING THE DATASET

#### 1.1: PUSH THE DATASET TO HUGGING-FACE

In [None]:
def upload_to_hf_dataset(
    hf_token: str,
    data_file_path: str,
    repo_name: str,
    file_format: str = "csv",
    split_name: str = "test",
):
    """
    Uploads a local file as a Hugging Face Dataset.

    Args:
        hf_token: Your Hugging Face access token.
        data_file_path: Path to the local data file.
        repo_name: The target repo on HF (e.g. "username/my-dataset").
        file_format: One of "csv", "json", "tsv", etc. Default "csv".
        split_name: Name of the dataset split (e.g. "train", "test"). Default "test".
    """
    # 1) Authenticate to HuggingFace.
    login(token=hf_token)

    # 2) Load local file.
    data_files = { split_name: data_file_path }
    dataset = load_dataset(file_format, data_files=data_files)

    # 3) Push to Hub.
    dataset.push_to_hub(repo_name, token=hf_token)
    print(f"Dataset available at https://huggingface.co/datasets/{repo_name}")

In [None]:
hf_token = "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"
local_path = "/Users/alessioborgi/GitHub/AMT-AutomaticMachineTranslation/test_data/dataset_cleaned.csv"
repo_name  = "Alessio-Borgi/archaic-italian-cleaned-test"

upload_to_hf_dataset(
    hf_token=hf_token,
    data_file_path=local_path,
    repo_name=repo_name,
    file_format="csv",
    split_name="test",
)

#### 1.2: LOADING DATASET FROM HUGGING-FACE

In [14]:
ds = load_dataset("Alessio-Borgi/archaic-italian-cleaned-test")

In [15]:
ds

DatasetDict({
    test: Dataset({
        features: ['Author', 'Date', 'Region', 'Sentence'],
        num_rows: 97
    })
})

#### 1.3: EXPLORING THE TEST DATASET

In [5]:
def explore_dataset(dataset_name):
    ''' Function to explore a dataset. '''

    # Loading the dataset.
    ds = load_dataset(dataset_name)
    df = pd.DataFrame(ds["test"])

    # 1) Number of examples.
    print("Number of examples:", len(df))

    # 2) Preview first 5 examples.
    print("First 5 examples:")
    print(df.head(5), "\n")

    # 3) Sentence-length statistics.
    df["length_tokens"] = df["Sentence"].apply(lambda x: len(x.split()))
    print("Sentence length (tokens) stats:")
    print(df["length_tokens"].describe(), "\n")

    # 4 Take out the column names.
    print("Column names:", df.columns.tolist(), "\n")

In [6]:
# Explore the dataset.
explore_dataset(dataset_name="Alessio-Borgi/archaic-italian-cleaned-test")

Number of examples: 97
First 5 examples:
                        Author     Date Region  \
0              Brunetto Latini  1260-61  fior.   
1                Bono Giamboni     1292  fior.   
2     Valerio Massimo (red. V1     1336  fior.   
3  Lucano volg. (ed. Marinoni)  1330/40  prat.   
4              Brunetto Latini  1260-61  fior.   

                                            Sentence  
0  quella guerra ben fatta l' opera perché etc. E...  
1  crudele, e di tutte le colpe pigli vendetta, c...  
2  Non d' altra forza d' animo fue ornato Ponzio ...  
3  Se questo piace a tutti e se 'l tempo hae biso...  
4  Officio di questa arte pare che sia dicere app...   

Sentence length (tokens) stats:
count    97.000000
mean     20.041237
std       5.996384
min       6.000000
25%      16.000000
50%      20.000000
75%      24.000000
max      31.000000
Name: length_tokens, dtype: float64 

Column names: ['Author', 'Date', 'Region', 'Sentence', 'length_tokens'] 



### 2: AMT - TRANSFORMER-BASED

#### 2.1: mBART (MULTILINGUAL BART)

**ARCHITECTURE & SIZE**
This Transformer-based solution consists in 12-layer encoder + 12-layer decoder Transformer (≈610 M parameters).

**DESCRIPTION**
- **Pretraining**: It has been pretrained via Denoising auto-encoding on monolingual corpora in 50 languages (mBART-50).
- **Multilingual MT**: It has been fine-tuned on many-to-many bitext and supports direct “it→it” by forcing Italian as both source & target.

**REFERENCE INFORMATION**
- Hugging-Face Reference page: https://huggingface.co/docs/transformers/model_doc/mbart
- Paper: https://arxiv.org/abs/2001.08210
- Specific Model employed: *facebook/mbart-large-50-many-to-many-mmt*


In [7]:
# 1) Loading mBART-50 Model & Tokenizer.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
model_name = "facebook/mbart-large-50-many-to-many-mmt"
mBART_tokenizer = MBart50Tokenizer.from_pretrained(model_name)
mBART_model = MBartForConditionalGeneration.from_pretrained(model_name).to(device)
mBART_tokenizer.src_lang = "it_IT"
mBART_tokenizer.model_max_length = 512


# 2) Updated batched translation with device placement
def modernize_mbart(sentences, batch_size=8):
    """
    Translate sentences using mBART on GPU (if available),
    showing a tqdm progress bar.
    """
    translations = []
    total_batches = (len(sentences) + batch_size - 1) // batch_size

    for i in tqdm(
        range(0, len(sentences), batch_size),
        total=total_batches,
        desc="mBART Translation",
        unit="batch",
        leave=True
    ):
        batch = sentences[i : i + batch_size]

        # Tokenization.
        inputs = mBART_tokenizer(batch, return_tensors="pt", padding=True, truncation=True)
        inputs = { name: tensor.to(device) for name, tensor in inputs.items() }

        # Generation of the Translations.
        with torch.no_grad():
            gen = mBART_model.generate(
                **inputs,
                forced_bos_token_id=mBART_tokenizer.lang_code_to_id["it_IT"],
                max_length=512,
            )
        # Decoding the extensions from tokenizer and add the translations to the list.
        translations.extend(mBART_tokenizer.batch_decode(gen, skip_special_tokens=True))
    return translations

# 3) Run on the test split.
arch_sentences = ds["test"]["Sentence"]
mbart_outputs = modernize_mbart(arch_sentences)

# 4) Attach back to the dataset the translations.
ds = ds["test"].add_column("mbart_translation", mbart_outputs)

# 5) Save the dataset with the mBART Translations.
df = ds.to_pandas()
output_path = "dataset_with_mbart_translations.csv"
df.to_csv(output_path, index=False)


Using device: cuda


tokenizer_config.json:   0%|          | 0.00/529 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

mBART Translation:   0%|          | 0/13 [00:00<?, ?batch/s]

In [8]:
ds["mbart_translation"]

["E poi, Aiaces, un po' di soldi, un po' di soldi, un po' di soldi, un po' di soldi, un po' di soldi, un po' di soldi, un po' di soldi, un po' di soldi, un po' di soldi, un po' di soldi, un po' di soldi, un po' di soldi, un po' di soldi, un po' di soldi, un po' di soldi, un po' di soldi, un po' di soldi, un po' di soldi, un po' di soldi, un po' di soldi, un po' di soldi, un po' di soldi, un po' di soldi, un po' di soldi, un po' di soldi, un po' di soldi, un po' di soldi.",
 'Crudele, e per tutte le colpe vendetta, come dice la legge, e per tutte le colpe vendetta, come dice la legge, e per tutte le colpe vendetta.',
 "Non c'è altra forza d' animosità che è stato venerato il Ponzio dell'Umiliare, un romano cavaliere.",
 'Se questo piace a tutti e se il tempo ha bisogno di Pompei per ridere e non per compagno, non riterrò più fati.',
 "L'offiziere di questo arte sembra essere solo per far credere, fine, per far credere.",
 "E' un' larghezza di vento, e' un' larghezza di nebbia, e' un' la

In [9]:
# 1) Sample 10 random indices
indices = random.sample(range(len(ds)), 10)

# 2) Print the pairs
for idx in indices:
    print(f"Archaic Sentence: {ds[idx]['Sentence']}")
    print(f"mBART Translation: {ds[idx]['mbart_translation']}\n")


Archaic Sentence: Io spero in messer Iesù di mandare tosto a voi Timoteo, acciocché io sia d'animo buono
mBART Translation: Io spero, in Messier Iesù, di mandare un tosto a Timoteo, perche' io abbia un buon umore.

Archaic Sentence: l'armi et insieme con loro passaseno tra li nimici, perçò se alcuno non avesse ardire de questo et sì avevano questo animo.
mBART Translation: l'armi e con loro passavano tra i nullai, quindi se non c'era un'armi e non c'era un'armi, loro passavano tra i nullai, quindi se non c'era un'armi e non c'era un'armi.

Archaic Sentence: Corbio nipote d' Ortensio menò sua vita più bassa e più viziosa
mBART Translation: Corbio, nephew of Ortensio, ha fatto la vita più bassa e più visiva.

Archaic Sentence: quello che sopra tutti gli altri perdonasse a' cittadini, e a cui più sicuramente possiate credere; poi ch'egli fu vostro comandatore.
mBART Translation: Quello che perdono per i cittadini, e che most di sicuro voi possiate credere; e poi lui fu il vostro capo.

Ar

#### 2.2: NLLB (No Language Left Behind)

**ARCHITECTURE & SIZE**
This Transformer-based solution comes from the Meta family. It's a many-to-many multilingual Seq2Seq that can be used as a rewriting model for Italian→Italian..

**DESCRIPTION**
- **High Capacity/Quality**: The flagship nllb-200-3.3B has shown state-of-the-art BLEU/COMET on many low-resource ↔ high-resource pairs, and handles morphological/orthographic variation robustly.
- **Multilingual MT**: It supports 200 languages and has full support for ita_Latn (Italian in Latin script).

**REFERENCE INFORMATION**
- Hugging-Face Reference page: https://huggingface.co/docs/transformers/en/model_doc/nllb
- Paper: https://arxiv.org/abs/2207.04672
- Specific Model employed: *facebook/nllb-200-3.3B*

In [10]:
# Set up the 8-bit quantized NLLB pipeline for Italian→Italian.
# 1) Set up the device specifics.
device = 0 if torch.cuda.is_available() else -1
print("Using device:", "cuda" if device == 0 else "cpu")

# 2) 8-bit + offload config.
bnb = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_threshold=6.0,
    llm_int8_enable_fp32_cpu_offload=True
)

# 3) Load model in 8-bit.
model_name = "facebook/nllb-200-3.3B"
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    quantization_config=bnb,
    device_map="auto"
)

# 4) Load tokenizer with src/tgt languages set.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    src_lang="ita_Latn",
    tgt_lang="ita_Latn"
)

# 5) Build the translation pipeline.
translator = pipeline(
    "translation",
    model=model,
    tokenizer=tokenizer,
    src_lang="ita_Latn",
    tgt_lang="ita_Latn",
)

# 6) Taking the sentences to translate and translate in batches.
arch = ds["Sentence"]
results = translator(arch, batch_size=8)

# 7) Extract the Italian text.
italian_translations = [r["translation_text"] for r in results]

# 8) Attach & save to csv file.
ds = ds.add_column("nllb_translation", italian_translations)
df = ds.to_pandas()
#df.to_csv("dataset_with_nllb_italian_translations.csv", index=False)
df.to_csv("dataset_with_mBART_NLLB_translations.csv", index=False)


Using device: cuda


config.json:   0%|          | 0.00/808 [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/90.0k [00:00<?, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


(…)cf3e5af37956607f4c667d891ec069aa276be0be:   0%|          | 0.00/8.55G [00:00<?, ?B/s]

(…)18c1b46dc95be6e106c36df87d13175418b3972c:   0%|          | 0.00/6.93G [00:00<?, ?B/s]

(…)4436d65cf94380c5ddd8f524cb878e090b27bb50:   0%|          | 0.00/2.10G [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/94.1k [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


(…)6cea38b9e3d5efcdcb9c251d6b40538e1aab555a:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/3.55k [00:00<?, ?B/s]

Device set to use cuda:0


In [11]:
ds["nllb_translation"]

["E d'altra parte Aiaces era un cavaliere franco e prodigioso alle armi, di grande guisa, ma non era pieno di grande senno",
 'crudele, e per ogni colpa prendi vendetta, come dice la legge, e a nessun cavaliere perdona i peccati.',
 "Non per altra forza d'animo fu decorato Ponzio Aufidiano, cavaliere romano.",
 'Se a tutti piace e se il tempo ha bisogno di Pompei come cavaliere e non come compagno, non ritengo più i destini.',
 "L'obiettivo di questa arte sembra essere quello di dire in modo insidioso per far credere, il fine è far credere per dirlo.",
 'Ecco, i venti venti larghi scaricano nubi risolute, e potresti credere che il cielo intero cadesse nel mare.',
 'Ma chi spera che io possa avere questa speranza, questi che non credono ancora in Cristo, vedono già con noi, e non potendo negarlo, grideranno i denti.',
 'La vendita dei morti e la presa dei vivi fecero la frode di un re feroce.',
 'Perché lui, che ora per le sue grandi regalità è feroce e onorevole, lui di ogni male affli

In [12]:
# 1) Sample 10 random indices
indices = random.sample(range(len(ds)), 10)

# 2) Print the pairs
for idx in indices:
    print(f"Archaic Sentence: {ds[idx]['Sentence']}")
    print(f"NLLB Translation: {ds[idx]['nllb_translation']}\n")


Archaic Sentence: Altressì uno amante chiamando merzé alla sua donna dice parole e ragioni molte, et ella si difende in suo dire.
NLLB Translation: Altro amante chiama Merzé alla sua donna dice parole e ragioni molte, ed ella si difende nel suo dire.

Archaic Sentence: Gorgone, e ho questa proprietà che io volo per l'aire sì come uno ucello".
NLLB Translation: Gorgone, e ho questa proprietà che volo per aria sì come un uccello".

Archaic Sentence: Non voglio, che insuperbischi per lo santo proposito, e voto della verginità vedendo le sue laudi
NLLB Translation: Non voglio che insuperbischi per il santo proposito, e voto della verginità vedendo le sue lodi

Archaic Sentence: Quando li serpenti invellenava di giorno alcuno Romano, allora iera la maraviglia a vedere come li Psille si combattevano al veleno, ché elli imolavano tutto inazzi della loro salive
NLLB Translation: Quando i serpenti venivano avvelenati di giorno da un romano, allora era una meraviglia vedere come i Psille si comb

### 3: AMT - LLM-BASED

In [28]:
hf_token = "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"

#### 3.1: LLAMA 3.1-8B

**Hugging-Face Reference Page:** https://huggingface.co/meta-llama/Llama-3.1-8B

In [29]:
# 1) Load LLAMA 3.1-8B model & tokenizer.
llama_checkpoint = "meta-llama/Llama-3.1-8B"
llama_tokenizer = AutoTokenizer.from_pretrained(llama_checkpoint, hf_token=hf_token)
llama_model = AutoModelForCausalLM.from_pretrained(
    llama_checkpoint,
    device_map="auto",
    torch_dtype="auto",
    trust_remote_code=True
)
# 2) Taking the sentences to translate and translate in batches.
sentences = ds["Sentence"]

OSError: You are trying to access a gated repo.
Make sure to have access to it at https://huggingface.co/meta-llama/Llama-3.1-8B.
403 Client Error. (Request ID: Root=1-68349efa-55de5ecf52b5388a22738b5a;e0ac158a-377a-41a7-ab5c-840125e1c1cc)

Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-3.1-8B/resolve/main/config.json.
Your request to access model meta-llama/Llama-3.1-8B is awaiting a review from the repo authors.

##### 3.1.1: DIRECT TRANSLATION

In [None]:
# 1) Build translation pipeline.
llama_translator = pipeline(
    "text2text-generation",
    model=llama_model,
    tokenizer=llama_tokenizer,
    trust_remote_code=True,
    device_map="auto"
)

# 2) Prepare simple prompts.
prompts = [f"Translate this archaic Italian sentence to modern Italian: {s}" for s in sentences]

# 3) Generate translations.
results = llama_translator(prompts, batch_size=8)
llama_outputs = [r["generated_text"] for r in results]

# 4) Attach and save.
translated_ds = ds.add_column("llama_translation", llama_outputs)
translated_ds.to_pandas().to_csv("llama_translations.csv", index=False)

#### 3.2: MISTRAL 7B-Instruct

**Hugging-Face Reference Page:** https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3

In [38]:
# 1) Load MISTRAL 3.1-8B model & tokenizer.
mistral_checkpoint = "mistralai/Mistral-7B-Instruct-v0.3"
mistral_tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
mistral_model = AutoModelForCausalLM.from_pretrained(
    mistral_checkpoint,
    device_map="auto",
    torch_dtype="auto",
    trust_remote_code=True
)
# 2) Taking the sentences to translate and translate in batches.
sentences = ds["Sentence"]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]



Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]



##### 3.2.1: DIRECT TRANSLATION

In [41]:
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Use the correct pipeline type for causal LM
translator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=256,
    temperature=0.0
)

# Prompts for plain translation
prompts = [f"Translate this archaic Italian sentence to modern Italian: {s}" for s in arch_sentences]

# Generate
results = translator(prompts, batch_size=8)
mistral_outputs = [r[0]["generated_text"].replace(prompts[i], '').strip() for i, r in enumerate(results)]

# Save
ds = ds.add_column("mistral_translation", mistral_outputs)
ds.to_pandas().to_csv("mistral_translations.csv", index=False)
print("✅ Mistral translations saved.")

Device set to use cuda:0
The model 'M2M100ForConditionalGeneration' is not supported for text-generation. Supported models are ['AriaTextForCausalLM', 'BambaForCausalLM', 'BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'Cohere2ForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'DeepseekV3ForCausalLM', 'DiffLlamaForCausalLM', 'ElectraForCausalLM', 'Emu3ForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FalconMambaForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'Gemma3ForConditionalGeneration', 'Gemma3ForCausalLM', 'GitForCausalLM', 'GlmForCausalLM', 'Glm4ForCausalLM', 'GotOcr2ForConditionalGeneration', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM',

KeyboardInterrupt: 

In [None]:
# Cell 3: Falcon-7B-Instruct Translation

from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# Load model & tokenizer
checkpoint = "tiiuae/falcon-7b-instruct"
tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
model     = AutoModelForCausalLM.from_pretrained(
    checkpoint,
    device_map="auto",
    torch_dtype="auto",
    trust_remote_code=True
)

translator = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    trust_remote_code=True,
    device_map="auto"
)

# Generate translations
prompts = [f"Translate this archaic Italian sentence to modern Italian: {s}" for s in sentences]
results = translator(prompts, batch_size=8)
falcon_outputs = [r["generated_text"] for r in results]

# Attach and save
translated_ds = ds.add_column("falcon_translation", falcon_outputs)
translated_ds.to_pandas().to_csv("falcon_translations.csv", index=False)

# Cell 4: Vicuna-13B Translation

from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# Load model & tokenizer
checkpoint = "lmsys/vicuna-13b-1.3-chat"
tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
model     = AutoModelForCausalLM.from_pretrained(
    checkpoint,
    device_map="auto",
    torch_dtype="auto",
    trust_remote_code=True
)

translator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    trust_remote_code=True,
    device_map="auto"
)

# Generate translations
prompts = [f"Translate this archaic Italian sentence to modern Italian: {s}" for s in sentences]
results = translator(prompts, batch_size=8)
vicuna_outputs = [r["generated_text"] for r in results]

# Attach and save
translated_ds = ds.add_column("vicuna_translation", vicuna_outputs)
translated_ds.to_pandas().to_csv("vicuna_translations.csv", index=False)
