# AMT - AUTOMATIC MACHINE TRANSLATION

@alessioborgi

### 0: IMPORTING LIBRARIES

In [1]:
!pip install -U datasets bitsandbytes accelerate
!pip install huggingface-hub pandas transformers tiktoken protobuf sentencepiece tqdm google-generativeai tenacity

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.46.0-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting accelerate
  Downloading accelerate-1.8.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9

In [2]:
# Importing libraries for step 1).
import os
import time
import json
import torch
import random
import pandas as pd
from tqdm.auto import tqdm
from huggingface_hub import login
from datasets import load_dataset
from huggingface_hub import hf_hub_download

# Importing libraries for step 2).
import re
import google.generativeai as genai
from tenacity import retry, wait_random_exponential, stop_after_attempt
from transformers import MBartForConditionalGeneration, MBart50Tokenizer, AutoTokenizer, AutoModelForSeq2SeqLM, BitsAndBytesConfig, pipeline, AutoModelForCausalLM

In [3]:
# PUT HERE AL THE KEYS (HF and GEMINI).
hf_token = "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"
gemini_api_key = "AIzaSyD7EbS5LLkqPbeAMmr9BR29sP98iOLtpgQ"

In [4]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


### 1: LOADING THE DATASET

#### 1.1: PUSH THE DATASET TO HUGGING-FACE

In [None]:
def upload_to_hf_dataset(
    hf_token: str,
    data_file_path: str,
    repo_name: str,
    file_format: str = "csv",
    split_name: str = "test",
):
    """
    Uploads a local file as a Hugging Face Dataset.

    Args:
        hf_token: Your Hugging Face access token.
        data_file_path: Path to the local data file.
        repo_name: The target repo on HF (e.g. "username/my-dataset").
        file_format: One of "csv", "json", "tsv", etc. Default "csv".
        split_name: Name of the dataset split (e.g. "train", "test"). Default "test".
    """
    # 1) Authenticate to HuggingFace.
    login(token=hf_token)

    # 2) Load local file.
    data_files = { split_name: data_file_path }
    dataset = load_dataset(file_format, data_files=data_files)

    # 3) Push to Hub.
    dataset.push_to_hub(repo_name, token=hf_token)
    print(f"Dataset available at https://huggingface.co/datasets/{repo_name}")

In [None]:
hf_token = "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"
local_path = "/Users/alessioborgi/GitHub/AMT-AutomaticMachineTranslation/test_data/dataset_cleaned.csv"
repo_name  = "Alessio-Borgi/archaic-italian-cleaned-test"

upload_to_hf_dataset(
    hf_token=hf_token,
    data_file_path=local_path,
    repo_name=repo_name,
    file_format="csv",
    split_name="test",
)

#### 1.2: LOADING DATASET FROM HUGGING-FACE

In [5]:
ds = load_dataset("Alessio-Borgi/archaic-italian-cleaned-test")

README.md:   0%|          | 0.00/370 [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/11.8k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/97 [00:00<?, ? examples/s]

In [6]:
ds

DatasetDict({
    test: Dataset({
        features: ['Author', 'Date', 'Region', 'Sentence'],
        num_rows: 97
    })
})

#### 1.3: EXPLORING THE TEST DATASET

In [None]:
def explore_dataset(dataset_name):
    ''' Function to explore a dataset. '''

    # Loading the dataset.
    ds = load_dataset(dataset_name)
    df = pd.DataFrame(ds["test"])

    # 1) Number of examples.
    print("Number of examples:", len(df))

    # 2) Preview first 5 examples.
    print("First 5 examples:")
    print(df.head(5), "\n")

    # 3) Sentence-length statistics.
    df["length_tokens"] = df["Sentence"].apply(lambda x: len(x.split()))
    print("Sentence length (tokens) stats:")
    print(df["length_tokens"].describe(), "\n")

    # 4 Take out the column names.
    print("Column names:", df.columns.tolist(), "\n")

In [None]:
# Explore the dataset.
explore_dataset(dataset_name="Alessio-Borgi/archaic-italian-cleaned-test")

Number of examples: 97
First 5 examples:
                        Author     Date Region  \
0              Brunetto Latini  1260-61  fior.   
1                Bono Giamboni     1292  fior.   
2     Valerio Massimo (red. V1     1336  fior.   
3  Lucano volg. (ed. Marinoni)  1330/40  prat.   
4              Brunetto Latini  1260-61  fior.   

                                            Sentence  
0  quella guerra ben fatta l' opera perché etc. E...  
1  crudele, e di tutte le colpe pigli vendetta, c...  
2  Non d' altra forza d' animo fue ornato Ponzio ...  
3  Se questo piace a tutti e se 'l tempo hae biso...  
4  Officio di questa arte pare che sia dicere app...   

Sentence length (tokens) stats:
count    97.000000
mean     20.041237
std       5.996384
min       6.000000
25%      16.000000
50%      20.000000
75%      24.000000
max      31.000000
Name: length_tokens, dtype: float64 

Column names: ['Author', 'Date', 'Region', 'Sentence', 'length_tokens'] 



### 2: AMT - TRANSFORMER-BASED

#### 2.1: mBART (MULTILINGUAL BART)

**ARCHITECTURE & SIZE**
This Transformer-based solution consists in 12-layer encoder + 12-layer decoder Transformer (≈610 M parameters).

**DESCRIPTION**
- **Pretraining**: It has been pretrained via Denoising auto-encoding on monolingual corpora in 50 languages (mBART-50).
- **Multilingual MT**: It has been fine-tuned on many-to-many bitext and supports direct “it→it” by forcing Italian as both source & target.

**REFERENCE INFORMATION**
- Hugging-Face Reference page: https://huggingface.co/docs/transformers/model_doc/mbart
- Paper: https://arxiv.org/abs/2001.08210
- Specific Model employed: *facebook/mbart-large-50-many-to-many-mmt*


In [None]:
# 1) Loading mBART-50 Model & Tokenizer.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
model_name = "facebook/mbart-large-50-many-to-many-mmt"
mBART_tokenizer = MBart50Tokenizer.from_pretrained(model_name)
mBART_model = MBartForConditionalGeneration.from_pretrained(model_name).to(device)
mBART_tokenizer.src_lang = "it_IT"
mBART_tokenizer.model_max_length = 512


# 2) Updated batched translation with device placement
def modernize_mbart(sentences, batch_size=8):
    """
    Translate sentences using mBART on GPU (if available),
    showing a tqdm progress bar.
    """
    translations = []
    total_batches = (len(sentences) + batch_size - 1) // batch_size

    for i in tqdm(
        range(0, len(sentences), batch_size),
        total=total_batches,
        desc="mBART Translation",
        unit="batch",
        leave=True
    ):
        batch = sentences[i : i + batch_size]

        # Tokenization.
        inputs = mBART_tokenizer(batch, return_tensors="pt", padding=True, truncation=True)
        inputs = { name: tensor.to(device) for name, tensor in inputs.items() }

        # Generation of the Translations.
        with torch.no_grad():
            gen = mBART_model.generate(
                **inputs,
                forced_bos_token_id=mBART_tokenizer.lang_code_to_id["it_IT"],
                max_length=512,
            )
        # Decoding the extensions from tokenizer and add the translations to the list.
        translations.extend(mBART_tokenizer.batch_decode(gen, skip_special_tokens=True))
    return translations

# 3) Run on the test split.
arch_sentences = ds["test"]["Sentence"]
mbart_outputs = modernize_mbart(arch_sentences)

# 4) Attach back to the dataset the translations.
ds = ds["test"].add_column("mbart_translation", mbart_outputs)

# 5) Save the dataset with the mBART Translations.
df = ds.to_pandas()
output_path = "dataset_with_mbart_translations.csv"
df.to_csv(output_path, index=False)


Using device: cuda


tokenizer_config.json:   0%|          | 0.00/529 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

mBART Translation:   0%|          | 0/13 [00:00<?, ?batch/s]

In [None]:
ds["mbart_translation"]

["E poi, Aiaces, un po' di soldi, un po' di soldi, un po' di soldi, un po' di soldi, un po' di soldi, un po' di soldi, un po' di soldi, un po' di soldi, un po' di soldi, un po' di soldi, un po' di soldi, un po' di soldi, un po' di soldi, un po' di soldi, un po' di soldi, un po' di soldi, un po' di soldi, un po' di soldi, un po' di soldi, un po' di soldi, un po' di soldi, un po' di soldi, un po' di soldi, un po' di soldi, un po' di soldi, un po' di soldi, un po' di soldi.",
 'Crudele, e per tutte le colpe vendetta, come dice la legge, e per tutte le colpe vendetta, come dice la legge, e per tutte le colpe vendetta.',
 "Non c'è altra forza d' animosità che è stato venerato il Ponzio dell'Umiliare, un romano cavaliere.",
 'Se questo piace a tutti e se il tempo ha bisogno di Pompei per ridere e non per compagno, non riterrò più fati.',
 "L'offiziere di questo arte sembra essere solo per far credere, fine, per far credere.",
 "E' un' larghezza di vento, e' un' larghezza di nebbia, e' un' la

In [None]:
# 1) Sample 10 random indices
indices = random.sample(range(len(ds)), 10)

# 2) Print the pairs
for idx in indices:
    print(f"Archaic Sentence: {ds[idx]['Sentence']}")
    print(f"mBART Translation: {ds[idx]['mbart_translation']}\n")


Archaic Sentence: Io spero in messer Iesù di mandare tosto a voi Timoteo, acciocché io sia d'animo buono
mBART Translation: Io spero, in Messier Iesù, di mandare un tosto a Timoteo, perche' io abbia un buon umore.

Archaic Sentence: l'armi et insieme con loro passaseno tra li nimici, perçò se alcuno non avesse ardire de questo et sì avevano questo animo.
mBART Translation: l'armi e con loro passavano tra i nullai, quindi se non c'era un'armi e non c'era un'armi, loro passavano tra i nullai, quindi se non c'era un'armi e non c'era un'armi.

Archaic Sentence: Corbio nipote d' Ortensio menò sua vita più bassa e più viziosa
mBART Translation: Corbio, nephew of Ortensio, ha fatto la vita più bassa e più visiva.

Archaic Sentence: quello che sopra tutti gli altri perdonasse a' cittadini, e a cui più sicuramente possiate credere; poi ch'egli fu vostro comandatore.
mBART Translation: Quello che perdono per i cittadini, e che most di sicuro voi possiate credere; e poi lui fu il vostro capo.

Ar

#### 2.2: NLLB (No Language Left Behind)

**ARCHITECTURE & SIZE**
This Transformer-based solution comes from the Meta family. It's a many-to-many multilingual Seq2Seq that can be used as a rewriting model for Italian→Italian..

**DESCRIPTION**
- **High Capacity/Quality**: The flagship nllb-200-3.3B has shown state-of-the-art BLEU/COMET on many low-resource ↔ high-resource pairs, and handles morphological/orthographic variation robustly.
- **Multilingual MT**: It supports 200 languages and has full support for ita_Latn (Italian in Latin script).

**REFERENCE INFORMATION**
- Hugging-Face Reference page: https://huggingface.co/docs/transformers/en/model_doc/nllb
- Paper: https://arxiv.org/abs/2207.04672
- Specific Model employed: *facebook/nllb-200-3.3B*

In [None]:
# Set up the 8-bit quantized NLLB pipeline for Italian→Italian.
# 1) Set up the device specifics.
device = 0 if torch.cuda.is_available() else -1
print("Using device:", "cuda" if device == 0 else "cpu")

# 2) 8-bit + offload config.
bnb = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_threshold=6.0,
    llm_int8_enable_fp32_cpu_offload=True
)

# 3) Load model in 8-bit.
model_name = "facebook/nllb-200-3.3B"
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    quantization_config=bnb,
    device_map="auto"
)

# 4) Load tokenizer with src/tgt languages set.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    src_lang="ita_Latn",
    tgt_lang="ita_Latn"
)

# 5) Build the translation pipeline.
translator = pipeline(
    "translation",
    model=model,
    tokenizer=tokenizer,
    src_lang="ita_Latn",
    tgt_lang="ita_Latn",
)

# 6) Taking the sentences to translate and translate in batches.
arch = ds["Sentence"]
results = translator(arch, batch_size=8)

# 7) Extract the Italian text.
italian_translations = [r["translation_text"] for r in results]

# 8) Attach & save to csv file.
ds = ds.add_column("nllb_translation", italian_translations)
df = ds.to_pandas()
#df.to_csv("dataset_with_nllb_italian_translations.csv", index=False)
df.to_csv("dataset_with_mBART_NLLB_translations.csv", index=False)


Using device: cuda


config.json:   0%|          | 0.00/808 [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/90.0k [00:00<?, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


(…)cf3e5af37956607f4c667d891ec069aa276be0be:   0%|          | 0.00/8.55G [00:00<?, ?B/s]

(…)18c1b46dc95be6e106c36df87d13175418b3972c:   0%|          | 0.00/6.93G [00:00<?, ?B/s]

(…)4436d65cf94380c5ddd8f524cb878e090b27bb50:   0%|          | 0.00/2.10G [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/94.1k [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


(…)6cea38b9e3d5efcdcb9c251d6b40538e1aab555a:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/3.55k [00:00<?, ?B/s]

Device set to use cuda:0


In [None]:
ds["nllb_translation"]

["E d'altra parte Aiaces era un cavaliere franco e prodigioso alle armi, di grande guisa, ma non era pieno di grande senno",
 'crudele, e per ogni colpa prendi vendetta, come dice la legge, e a nessun cavaliere perdona i peccati.',
 "Non per altra forza d'animo fu decorato Ponzio Aufidiano, cavaliere romano.",
 'Se a tutti piace e se il tempo ha bisogno di Pompei come cavaliere e non come compagno, non ritengo più i destini.',
 "L'obiettivo di questa arte sembra essere quello di dire in modo insidioso per far credere, il fine è far credere per dirlo.",
 'Ecco, i venti venti larghi scaricano nubi risolute, e potresti credere che il cielo intero cadesse nel mare.',
 'Ma chi spera che io possa avere questa speranza, questi che non credono ancora in Cristo, vedono già con noi, e non potendo negarlo, grideranno i denti.',
 'La vendita dei morti e la presa dei vivi fecero la frode di un re feroce.',
 'Perché lui, che ora per le sue grandi regalità è feroce e onorevole, lui di ogni male affli

In [None]:
# 1) Sample 10 random indices
indices = random.sample(range(len(ds)), 10)

# 2) Print the pairs
for idx in indices:
    print(f"Archaic Sentence: {ds[idx]['Sentence']}")
    print(f"NLLB Translation: {ds[idx]['nllb_translation']}\n")


Archaic Sentence: Altressì uno amante chiamando merzé alla sua donna dice parole e ragioni molte, et ella si difende in suo dire.
NLLB Translation: Altro amante chiama Merzé alla sua donna dice parole e ragioni molte, ed ella si difende nel suo dire.

Archaic Sentence: Gorgone, e ho questa proprietà che io volo per l'aire sì come uno ucello".
NLLB Translation: Gorgone, e ho questa proprietà che volo per aria sì come un uccello".

Archaic Sentence: Non voglio, che insuperbischi per lo santo proposito, e voto della verginità vedendo le sue laudi
NLLB Translation: Non voglio che insuperbischi per il santo proposito, e voto della verginità vedendo le sue lodi

Archaic Sentence: Quando li serpenti invellenava di giorno alcuno Romano, allora iera la maraviglia a vedere come li Psille si combattevano al veleno, ché elli imolavano tutto inazzi della loro salive
NLLB Translation: Quando i serpenti venivano avvelenati di giorno da un romano, allora era una meraviglia vedere come i Psille si comb

### 3: AMT - LLM-BASED

#### 3.1: LLAMA-2-7b-chat-hf

**Hugging-Face Reference Page:** https://huggingface.co/meta-llama/Llama-2-7b-chat-hf

**#Params:** 7B

**GPU-RAM:** 12.9GB

In [None]:
device = 0 if torch.cuda.is_available() else -1
print("Using device:", "cuda" if device == 0 else "cpu")
# 1) Load LLAMA 3.1-8B model & tokenizer.
llama_checkpoint = "meta-llama/Llama-2-7b-chat-hf"
llama_tokenizer = AutoTokenizer.from_pretrained(llama_checkpoint, device_map="auto", torch_dtype="auto", hf_token=hf_token)
llama_model = AutoModelForCausalLM.from_pretrained(
    llama_checkpoint,
    device_map="auto",
    torch_dtype="auto",
    trust_remote_code=True
)
# 2) Taking the sentences to translate and translate in batches.
sentences = ds["test"]["Sentence"]

##### 3.1.1: ZERO-SHOT TRANSLATION


**TOTAL TIME-TO-RUN:** 276.91 seconds

**AVG-per-SENTENCE TIME:** 2.85 seconds

In [None]:
# 3) Set pad token for batching.
if llama_tokenizer.pad_token is None:
    llama_tokenizer.padding_side = "left"

# 4) Build translation pipeline.
llama_translator = pipeline(
    "text-generation",
    model=llama_model,
    tokenizer=llama_tokenizer,
    max_new_tokens=256,
    do_sample=False,
)

# 5) Starting the batched translation.
batch_size = 8
n = len(sentences)
llama_outputs = []

total_start = time.time()
for start in tqdm(range(0, n, batch_size), desc="Translating with Llama"):
    # Format prompts in each batch
    batch_sentences = sentences[start:start+batch_size]
    batch_prompts = [f"Traduci la seguente frase dall'italiano arcaico all'italiano moderno. Solo la traduzione, senza spiegazioni:\n{s}\nRisposta:"
    for s in batch_sentences
]

    batch_results = llama_translator(batch_prompts)
    for i, r in enumerate(batch_results):
        # Remove prompt prefix from output.
        completion = r[0]["generated_text"]
        result = completion.replace(batch_prompts[i], "").strip()
        llama_outputs.append(result)
total_end = time.time()
print(f"\nTotal time: {total_end - total_start:.2f} seconds")
print(f"Average per sentence: {(total_end - total_start)/n:.2f} seconds")

# 6) Save translations.
output_path = "BorgiNonModernToModern-hw2_transl-llama.jsonl"
sentences_out = ds["test"]["Sentence"]
translations_out = llama_outputs

with open(output_path, "w", encoding="utf-8") as f:
    for s, t in zip(sentences_out, translations_out):
        entry = {
            "archaic_sentence": s,
            "llama_translation": t
        }
        json.dump(entry, f, ensure_ascii=False)
        f.write("\n")


Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Translating with Llama:   0%|          | 0/13 [00:00<?, ?it/s]

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Total time: 257.13 seconds
Average per sentence: 2.65 seconds


In [None]:
llama_outputs = "BorgiNonModernToModern-hw2_transl-llama.jsonl"
with open(llama_outputs, encoding="utf-8") as f:
    data = [json.loads(line) for line in f]

n_samples = min(10, len(data))
indices = random.sample(range(len(data)), n_samples)

for idx in indices:
    print(f"Archaic Sentence: {data[idx]['archaic_sentence']}")
    print(f"Llama Translation: {data[idx]['llama_translation']}\n")

Archaic Sentence: E dunque, da che queste cose son così, Catellina, e tu non puoi buonamente qui dimorare, dubiti tu d'andartene in alcuna terra ed usare questa vita fuggendo per li diserti
Llama Translation: E dunque, da che queste cose sono così, Catellina, e tu non puoi buonamente qui dimorare, dubiti tu di andartene in alcuna terra e usare questa vita fuggendo per i deserti.

Archaic Sentence: Unde ragionevolemente Iob è interpretato dolente;
Llama Translation: Perché Ioppe è interpretato dolente.

La frase è tratta da un testo del 1400-1500 secolo e utilizza termini e structures arcaiche.

Archaic Sentence: Però che or chi spererebbe quello che eziandio questi che non vogliono ancora credere in Cristo, già veggiono con noi, e perché nol possono negare, stridono colli denti.
Llama Translation: Ma che or chi spererebbe che quelli che non vogliono ancora credere in Cristo, già veggiono con noi, e perché non possono negare, stridono colli denti.

Archaic Sentence: Se questo piace a tu

##### 3.1.2: FEW-SHOT TRANSLATION

**TOTAL TIME-TO-RUN:** 925.33 seconds

**AVG-per-SENTENCE TIME:** 9.54 seconds

In [None]:
few_shot_examples = [
    ("Ove non sia chi ti conforti, confortati da te stesso.", "Se non c'è nessuno a consolarti, consola te stesso."),
    ("Così nel suo cammino solingo andava pensoso e lento.", "Così nel suo cammino solitario procedeva pensieroso e lento."),
    ("Non è oro tutto quel che riluce.", "Non è tutto oro ciò che luccica."),
]

def build_fewshot_prompt(archaic_sentence):
    intro = "Traduci dall'italiano arcaico all'italiano moderno. Solo la traduzione, senza spiegazioni e senza riportare gli esempi precedenti!"
    shots = "\n".join([
        f"Esempio {i+1}:\nFrase: {a}\nTraduzione: {m}" for i, (a, m) in enumerate(few_shot_examples)
    ])
    test = f"Frase: {archaic_sentence}\nTraduzione:"
    return f"{intro}\n\n{shots}\n\n{test}"

def clean_translation(raw, prompt):
    # Remove the prompt
    answer = raw.replace(prompt, "").strip()
    # Cut at first "Esempio" or "Frase:" if the model keeps generating
    for stop_word in ["Esempio", "Frase:", "Traduzione:", "\n\n"]:
        idx = answer.find(stop_word)
        if idx > 1:
            answer = answer[:idx].strip()
    # Optionally: cut at first linebreak if output is multi-line
    answer = answer.split("\n")[0].strip()
    return answer



# Tokenization step.
if llama_tokenizer.pad_token is None:
    llama_tokenizer.pad_token = llama_tokenizer.eos_token
llama_tokenizer.padding_side = "left"

# Building the translation pipeline.
llama_translator = pipeline(
    "text-generation",
    model=llama_model,
    tokenizer=llama_tokenizer,
    max_new_tokens=512,
    do_sample=False,
)

batch_size = 8
n = len(sentences)
llama_outputs = []

total_start = time.time()
for start in tqdm(range(0, n, batch_size), desc="Few-Shot Translating with Llama"):
    batch_sentences = sentences[start:start+batch_size]
    batch_prompts = [build_fewshot_prompt(s) for s in batch_sentences]
    batch_results = llama_translator(batch_prompts)
    for i, r in enumerate(batch_results):
        completion = r[0]["generated_text"]
        result = clean_translation(completion, batch_prompts[i])
        llama_outputs.append(result)

total_end = time.time()
print(f"\nTotal time: {total_end - total_start:.2f} seconds")
print(f"Average per sentence: {(total_end - total_start)/n:.2f} seconds")

# Saving the Translations.
output_path = "BorgiNonModernToModern-hw2_transl-llama-fewshot.jsonl"
sentences_out = ds["test"]["Sentence"]
translations_out = llama_outputs

with open(output_path, "w", encoding="utf-8") as f:
    for s, t in zip(sentences_out, translations_out):
        entry = {
            "archaic_sentence": s,
            "llama_translation": t
        }
        json.dump(entry, f, ensure_ascii=False)
        f.write("\n")

Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Few-Shot Translating with Llama:   0%|          | 0/13 [00:00<?, ?it/s]

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Total time: 925.33 seconds
Average per sentence: 9.54 seconds


In [None]:
# Pritning out 10 translations to see how the translation process has gone.
with open(output_path, encoding="utf-8") as f:
    data = [json.loads(line) for line in f]

n_samples = min(10, len(data))
indices = random.sample(range(len(data)), n_samples)

for idx in indices:
    print(f"Archaic Sentence: {data[idx]['archaic_sentence']}")
    print(f"Llama Translation: {data[idx]['llama_translation']}\n")

Archaic Sentence: Dice il poeta: oh, che bella cosa è vedere apertamente con gli occhi quando tu glel fai diretro o in culo o in altro
Llama Translation: Il poeta dice: oh, che bella cosa è vedere chiaramente con gli occhi quando tu guardi in direzione opposta o in basso o in altro.

Archaic Sentence: la moltitudine de' quali tu ài potuto vedere e riguardare lo studio e poco dinanzi udire le voci, e lle cui mani e lance apena posso ritenere.
Llama Translation: La folla di persone che hai potuto vedere e osservare nel corso del studio e poco prima di udire le voci, e le mani e le lance che penso di poter ricordare.

Archaic Sentence: Ora spaventerai li disidirosi cervi con varie e diverse paure, o lo porco cenghiare caggia in terra passato collo spiedo.
Llama Translation: Adesso ti spaventerai i cervi con diverse e varie paure, o il porco si caccia in terra con il collo spesso.

Archaic Sentence: quando nella matricola si scrivono giurano per Dio, e per Cristo, e per lo Spirito Santo
Ll

##### 3.1.3: CHAIN-OF-THOUGHT TRANSLATION

**TOTAL TIME-TO-RUN:** x seconds

**AVG-per-SENTENCE TIME:** x seconds

In [None]:
import json
import time
import random
from tqdm import tqdm
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer

# 1. Define your few-shot CoT examples
few_shot_cot_examples = [
    (
        "Ove non sia chi ti conforti, confortati da te stesso.",
        "Qui la frase suggerisce che, in assenza di consolatori esterni, una persona deve trovare forza dentro di sé.",
        "Se non c'è nessuno a consolarti, consola te stesso."
    ),
    (
        "Così nel suo cammino solingo andava pensoso e lento.",
        "La parola 'solingo' oggi si direbbe 'solitario', e 'pensoso e lento' suggerisce un'andatura riflessiva.",
        "Così nel suo cammino solitario procedeva pensieroso e lento."
    ),
    (
        "Non è oro tutto quel che riluce.",
        "Questa è una metafora che indica che non tutto ciò che sembra prezioso lo è veramente; si usa una versione moderna.",
        "Non è tutto oro ciò che luccica."
    ),
]

# 2. Chain-of-Thought few-shot prompt builder
def build_cot_fewshot_prompt(archaic_sentence):
    intro = (
        "Traduci dall'italiano arcaico all'italiano moderno. "
        "Prima spiega brevemente il ragionamento, poi fornisci la traduzione finale. "
        "Segui l'esempio. Non ripetere gli esempi precedenti!"
    )
    shots = "\n".join([
        f"Esempio {i+1}:\nFrase: {a}\nRagionamento: {r}\nTraduzione: {m}"
        for i, (a, r, m) in enumerate(few_shot_cot_examples)
    ])
    test = f"Frase: {archaic_sentence}\nRagionamento:"
    return f"{intro}\n\n{shots}\n\n{test}"

# 3. Clean translation (extract only model translation)
def clean_translation_cot(raw, prompt):
    answer = raw.replace(prompt, "").strip()
    # Extract after 'Traduzione:'
    if "Traduzione:" in answer:
        answer = answer.split("Traduzione:", 1)[-1].strip()
    # Cut if model keeps generating
    for stop_word in ["Esempio", "Frase:", "\n\n"]:
        idx = answer.find(stop_word)
        if idx > 1:
            answer = answer[:idx].strip()
    answer = answer.split("\n")[0].strip()
    return answer

# 4. Load your Llama model & tokenizer (change checkpoint if you want)
llama_checkpoint = "meta-llama/Llama-2-7b-chat-hf"  # Or Llama-3 if you have it!
llama_tokenizer = AutoTokenizer.from_pretrained(llama_checkpoint, trust_remote_code=True)
llama_model = AutoModelForCausalLM.from_pretrained(
    llama_checkpoint,
    device_map="auto",
    torch_dtype="auto",
    trust_remote_code=True
)

# 5. Prepare your test set
# Example: If using HuggingFace datasets: ds["test"]["Sentence"]
# Here, I'll show as loading from your test file (change as needed)
with open("/content/drive/MyDrive/BorgiNonModernToModern/llm_based/zero-shot_prompting/BorgiNonModernToModern-hw2_transl-llama.jsonl", encoding="utf-8") as f:
    sentences = [json.loads(line)["archaic_sentence"] for line in f]

# 6. Tokenizer padding
if llama_tokenizer.pad_token is None:
    llama_tokenizer.pad_token = llama_tokenizer.eos_token
llama_tokenizer.padding_side = "left"

# 7. Build pipeline
llama_translator = pipeline(
    "text-generation",
    model=llama_model,
    tokenizer=llama_tokenizer,
    max_new_tokens=512,
    do_sample=False,
)

# 8. Translation (CoT Few-shot) batched
batch_size = 8
n = len(sentences)
llama_outputs = []

total_start = time.time()
for start in tqdm(range(0, n, batch_size), desc="CoT Few-Shot Translating with Llama"):
    batch_sentences = sentences[start:start+batch_size]
    batch_prompts = [build_cot_fewshot_prompt(s) for s in batch_sentences]
    batch_results = llama_translator(batch_prompts)
    for i, r in enumerate(batch_results):
        completion = r[0]["generated_text"]
        result = clean_translation_cot(completion, batch_prompts[i])
        llama_outputs.append(result)
total_end = time.time()
print(f"\nTotal time: {total_end - total_start:.2f} seconds")
print(f"Average per sentence: {(total_end - total_start)/n:.2f} seconds")

# 9. Save as requested
output_path = "BorgiNonModernToModern-hw2_transl-llama-fewshot-cot.jsonl"
with open(output_path, "w", encoding="utf-8") as f:
    for s, t in zip(sentences, llama_outputs):
        entry = {
            "archaic_sentence": s,
            "llama_translation": t
        }
        json.dump(entry, f, ensure_ascii=False)
        f.write("\n")


tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
CoT Few-Shot Translating with Llama:   0%|          | 0/13 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not val


Total time: 1087.68 seconds
Average per sentence: 11.21 seconds





In [None]:
# 10. Print out 10 random translations
with open(output_path, encoding="utf-8") as f:
    data = [json.loads(line) for line in f]

n_samples = min(10, len(data))
indices = random.sample(range(len(data)), n_samples)

for idx in indices:
    print(f"Archaic Sentence: {data[idx]['archaic_sentence']}")
    print(f"Llama Translation: {data[idx]['llama_translation']}\n")

Archaic Sentence: ma pur questo fu colpa tua, però che volesti cercare di vedere con gli occhi corporali la cosa invisibile.
Llama Translation: ma pur questo fu colpa tua, però che volesti cercare di vedere con gli occhi materiali la cosa invisibile.

Archaic Sentence: Ulecois, ebe un uomo rico e nobile: Orgentore fue chiamato per nome.
Llama Translation: Un tempo, un uomo ricco e nobile era chiamato Orgentore.

Archaic Sentence: sanza fallo lo porco salvatico avanza l' uomo d' udire e 'l lupo cerviere del vedere
Llama Translation: senza errore l'uomo di udito e 'l lupo cerviere' del vedere.

Archaic Sentence: L'oro verrà dall'Aquilone. Che figuriamo noi per l'Aquilone, se non il populo Gentile congelato dal freddo del peccato, il qual populo tenne sotto il giogo della sua tirannia
Llama Translation: L'oro verrà dall'Aquilone. Che cosa intendiamo per Aquilone, se non il popolo Gentile che è stato congelato dal freddo del peccato, il quale popolo ha tenuto sotto il giogo della sua tiran

##### 3.2.4: RE-ACT (REASON+ACT) TRANSLATION

**TOTAL TIME-TO-RUN:** 1030.91 seconds

**AVG-per-SENTENCE TIME:** 10.63 seconds

In [None]:
import json
import time
import random
from tqdm import tqdm
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
import torch

# --- SETUP ---
device = 0 if torch.cuda.is_available() else -1
print("Using device:", "cuda" if device == 0 else "cpu")

hf_token = "<YOUR_HF_TOKEN_HERE>"  # <-- Insert your HF token if needed!

# 1) Load LLAMA 3.1-8B model & tokenizer.
llama_checkpoint = "meta-llama/Llama-2-7b-chat-hf"
llama_tokenizer = AutoTokenizer.from_pretrained(llama_checkpoint, device_map="auto", torch_dtype="auto", hf_token=hf_token)
llama_model = AutoModelForCausalLM.from_pretrained(
    llama_checkpoint,
    device_map="auto",
    torch_dtype="auto",
    trust_remote_code=True
)
# 2) Load sentences to translate.
# If using Huggingface datasets:
# sentences = ds["test"]["Sentence"]
# If using a file (adapt path as needed):
with open("/content/drive/MyDrive/BorgiNonModernToModern/llm_based/zero-shot_prompting/BorgiNonModernToModern-hw2_transl-llama.jsonl", encoding="utf-8") as f:
    sentences = [json.loads(line)["archaic_sentence"] for line in f]

# 3) Robust ReAct few-shot examples for Llama.
few_shot_react_examples = [
    (
        "Ove non sia chi ti conforti, confortati da te stesso.",
        "La frase sottolinea il valore dell'autosufficienza emotiva. 'Conforti' è un verbo arcaico per 'consolare'. Il messaggio è che bisogna consolarsi da soli se nessuno lo fa per noi.",
        "Traduci in italiano moderno.",
        "Se non c'è nessuno a consolarti, consola te stesso."
    ),
    (
        "Così nel suo cammino solingo andava pensoso e lento.",
        "'Solingo' oggi si direbbe 'solitario'. Descrive una persona che cammina da sola, lentamente, persa nei pensieri.",
        "Traduci in italiano moderno.",
        "Così nel suo cammino solitario procedeva pensieroso e lento."
    ),
    (
        "Non è oro tutto quel che riluce.",
        "È un proverbio che consiglia di non fidarsi delle apparenze; la parola moderna è 'luccica' invece di 'riluce'.",
        "Traduci in italiano moderno.",
        "Non è tutto oro ciò che luccica."
    ),
    # (Add more real examples for better performance!)
]

# 4) Robust ReAct prompt builder.
def build_react_prompt(archaic_sentence):
    intro = (
        "Traduci la seguente frase dall'italiano arcaico all'italiano moderno seguendo il metodo Reason+Act (ReAct). "
        "Per ciascun esempio, prima ragiona brevemente sul significato e sulle parole arcaiche (Pensiero), poi indica l'azione (Azione) e infine fornisci la traduzione moderna (Traduzione). "
        "Non riportare gli esempi precedenti. "
        "Ecco degli esempi:"
    )
    shots = "\n".join([
        f"Esempio {i+1}:\nFrase: {a}\nPensiero: {thought}\nAzione: {action}\nTraduzione: {modern}"
        for i, (a, thought, action, modern) in enumerate(few_shot_react_examples)
    ])
    test = f"Esempio {len(few_shot_react_examples)+1}:\nFrase: {archaic_sentence}\nPensiero:"
    return f"{intro}\n\n{shots}\n\n{test}"

# 5) Improved clean_translation function
def clean_translation_react(raw, prompt):
    answer = raw.replace(prompt, "").strip()
    if "Traduzione:" in answer:
        answer = answer.split("Traduzione:", 1)[-1].strip()
    for stop_word in ["Esempio", "Frase:", "Pensiero:", "Azione:", "\n\n"]:
        idx = answer.find(stop_word)
        if idx > 1:
            answer = answer[:idx].strip()
    answer = answer.split("\n")[0].strip()
    return answer

# 6) Tokenizer padding setup
if llama_tokenizer.pad_token is None:
    llama_tokenizer.pad_token = llama_tokenizer.eos_token
llama_tokenizer.padding_side = "left"

# 7) Build translation pipeline
llama_translator = pipeline(
    "text-generation",
    model=llama_model,
    tokenizer=llama_tokenizer,
    max_new_tokens=512,
    do_sample=False,
)

# 8) ReAct translation in batch
batch_size = 8
n = len(sentences)
llama_outputs = []

total_start = time.time()
for start in tqdm(range(0, n, batch_size), desc="ReAct Translating with Llama"):
    batch_sentences = sentences[start:start+batch_size]
    batch_prompts = [build_react_prompt(s) for s in batch_sentences]
    batch_results = llama_translator(batch_prompts)
    for i, r in enumerate(batch_results):
        completion = r[0]["generated_text"]
        result = clean_translation_react(completion, batch_prompts[i])
        llama_outputs.append(result)
total_end = time.time()
print(f"\nTotal time: {total_end - total_start:.2f} seconds")
print(f"Average per sentence: {(total_end - total_start)/n:.2f} seconds")

# 9) Save results
output_path = "BorgiNonModernToModern-hw2_transl-llama-react.jsonl"
with open(output_path, "w", encoding="utf-8") as f:
    for archaic, modern in zip(sentences, llama_outputs):
        record = {
            "archaic_sentence": archaic,
            "llama_translation": modern
        }
        f.write(json.dumps(record, ensure_ascii=False) + "\n")

Using device: cuda


tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
ReAct Translating with Llama:   0%|          | 0/13 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and 


Total time: 1030.91 seconds
Average per sentence: 10.63 seconds
Archaic Sentence: molto di maggiore memoria saranno faccendole al re, perciò che nella nostra cittade sempre fue santo e glorioso il nome reale, e sse furono compagni fue il loro nome santissimo;
Llama Translation: Ci saranno molte memorie che il re farà, perciò il suo nome sarà sempre onorato e reverito nella città, e i suoi fedeli lo seguiranno con devozione.

Archaic Sentence: Tarentini, i quali erano nati di quegli di Lacedemonia et facta da lloro nobile cittade de' Greci.
Llama Translation: I Tarantini, che erano nati da quelli di Sparta e creati da loro nobile città dei Greci.

Archaic Sentence: Dio, per la quale si dispensano et iudicano tutte le cose.
Llama Translation: Dio, per il quale si dispensano e giudicano tutte le cose.

Archaic Sentence: Tarco cretense che in aiuto era alli nimici avrebbe potuto fuggire, ma la vergogna d'abbandonare li nobilissimi cavalieri della gente achea poco avanti da lui
Llama Trans




In [None]:
# 10) Print 10 random translations
with open(output_path, encoding="utf-8") as f:
    data = [json.loads(line) for line in f]

n_samples = min(10, len(data))
indices = random.sample(range(len(data)), n_samples)
for idx in indices:
    print(f"Archaic Sentence: {data[idx]['archaic_sentence']}")
    print(f"Llama Translation: {data[idx]['llama_translation']}\n")

Archaic Sentence: e quella cosa, la quale è diricta et onesta, e con virtute, quella sola penso essere lo bene.
Llama Translation: E quella cosa, che è diretta e onesta, e con virtù, quella sola penso essere il bene.

Archaic Sentence: Teseo reguardò Achelao fortemente meravigliandose e disse così: "O messer Achelao, io vi prego che voi me diciate in che modo voi perdeste
Llama Translation: Teseo guardò Achelao con meraviglia e disse: "O signore Achelao, mi prego di raccontare come perdesti".

Archaic Sentence: Vero è, ma non tine rispondo in questo tempo, perciò che ttu se' mio servo, o perciò ch' è tempo feriato, o perciò ch' io non debbo risponderti
Llama Translation: È vero che non posso rispondere adesso, perchè non sono il tuo servitore, o perchè è un giorno di festa, o perchè non devo risponderti.

Archaic Sentence: Non voglio, che insuperbischi per lo santo proposito, e voto della verginità vedendo le sue laudi
Llama Translation: Non voglio superare l'intenzione, che è onesta, 

#### 3.2: GEMMA 2B-Instruct

**Hugging-Face Reference Page:** https://huggingface.co/google/gemma-2b-it

**#Params:** 2B

**GPU-RAM:** 5.8GB

In [None]:
# 1) Load model & tokenizer.
gemma_checkpoint = "google/gemma-2b-it"
gemma_tokenizer = AutoTokenizer.from_pretrained(gemma_checkpoint, trust_remote_code=True)
gemma_model     = AutoModelForCausalLM.from_pretrained(
    gemma_checkpoint,
    device_map="auto",
    torch_dtype="auto",
    trust_remote_code=True
)
# 2) Taking the sentences to translate and translate in batches.
sentences = ds["test"]["Sentence"]

tokenizer_config.json:   0%|          | 0.00/34.2k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

##### 3.2.1: ZERO-SHOT TRANSLATION

**TOTAL TIME-TO-RUN:** 105.27 seconds

**AVG-per-SENTENCE TIME:** 1.09 seconds

In [None]:
# 3) Set pad token for batching.
if gemma_tokenizer.pad_token is None:
    gemma_tokenizer.pad_token = gemma_tokenizer.eos_token
gemma_tokenizer.padding_side = "left"

# 4) Use "text-generation" pipeline.
falcon_translator = pipeline(
    "text-generation",
    model=gemma_model,
    tokenizer=gemma_tokenizer,
    trust_remote_code=True,
    device_map="auto",
    max_new_tokens=256,
    do_sample=False,
)

# 5) Prepare translation prompts.
prompts = [f"Traduci la seguente frase dall'italiano arcaico all'italiano moderno. Solo la traduzione, senza spiegazioni:\n{s}\nRisposta:" for s in sentences]

# 5) Batched generation.
batch_size = 8
n = len(sentences)
total_start = time.time()
gemma_outputs = []
for start in tqdm(range(0, len(prompts), batch_size), desc="Translating with Gemma"):
    batch_prompts = prompts[start:start+batch_size]
    batch_results = falcon_translator(batch_prompts)
    for i, r in enumerate(batch_results):
        # Remove prompt from output
        gen = r[0]["generated_text"]
        translation = gen.replace(batch_prompts[i], "").strip()
        gemma_outputs.append(translation)

# 5.5) Compute time complexity.
total_end = time.time()
print(f"\nTotal time: {total_end - total_start:.2f} seconds")
print(f"Average per sentence: {(total_end - total_start)/n:.2f} seconds")

# 6) Save or attach as usual.
jsonl_path = "BorgiNonModernToModern-hw2_transl-gemma.jsonl"
with open(jsonl_path, "w", encoding="utf-8") as f:
    for archaic, modern in zip(sentences, gemma_outputs):
        record = {
            "archaic_sentence": archaic,
            "gemma_translation": modern
        }
        f.write(json.dumps(record, ensure_ascii=False) + "\n")

Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Translating with Gemma:   0%|          | 0/13 [00:00<?, ?it/s]

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignore


Total time: 105.12 seconds
Average per sentence: 1.08 seconds


In [None]:
gemma_outputs = "BorgiNonModernToModern-hw2_transl-gemma.jsonl"
with open(gemma_outputs, encoding="utf-8") as f:
    data = [json.loads(line) for line in f]

n_samples = min(10, len(data))
indices = random.sample(range(len(data)), n_samples)

for idx in indices:
    print(f"Archaic Sentence: {data[idx]['archaic_sentence']}")
    print(f"Gemma Translation: {data[idx]['gemma_translation']}\n")

Archaic Sentence: crudele, e di tutte le colpe pigli vendetta, come dice la legge, ed a neuno cavaliere perdoni che pecchi.
Gemma Translation: La frase originale è un parlamento che descrive un comportamento doloso e senza speranza.

Archaic Sentence: Sicchè dolore è a udire, quando l' usare l' arme e la fatica ricusano, con grandissimo disonore come pecore essere
Gemma Translation: Se il dolore è a udire, quando si usa l'arma e si ricusano, con grandissimo disonore come pecore essere.

Archaic Sentence: quando nella matricola si scrivono giurano per Dio, e per Cristo, e per lo Spirito Santo
Gemma Translation: quando nella matricola si scrivono giurano per Dio, Cristo e Spirito Santo.

Archaic Sentence: Quando averai nel cavaliere i detti segni veduti non andare a grandezza caendo, perchè nelle battaglie sono più utili i forti che i grandi.
Gemma Translation: Quando avrai nel cavaliere i dettagli visibili non andranno a grandezza caendo, perché nelle battaglie sono più utili i difensor

##### 3.2.2: FEW-SHOT TRANSLATION

**TOTAL TIME-TO-RUN:** 2520.21 seconds

**AVG-per-SENTENCE TIME:** 25.98 seconds

In [None]:
few_shot_examples = [
    ("Ove non sia chi ti conforti, confortati da te stesso.", "Se non c'è nessuno a consolarti, consola te stesso."),
    ("Così nel suo cammino solingo andava pensoso e lento.", "Così nel suo cammino solitario procedeva pensieroso e lento."),
    ("Non è oro tutto quel che riluce.", "Non è tutto oro ciò che luccica."),
]

def build_fewshot_prompt(archaic_sentence):
    intro = "Traduci dall'italiano arcaico all'italiano moderno. Solo la traduzione, senza spiegazioni."
    shots = "\n".join([
        f"Esempio {i+1}:\nFrase: {a}\nTraduzione: {m}" for i, (a, m) in enumerate(few_shot_examples)
    ])
    test = f"Frase: {archaic_sentence}\nTraduzione:"
    return f"{intro}\n\n{shots}\n\n{test}"

def clean_translation(raw, prompt):
    # Remove the prompt
    answer = raw.replace(prompt, "").strip()
    # Cut at first "Esempio" or "Frase:" if the model keeps generating
    for stop_word in ["Esempio", "Frase:", "Traduzione:", "\n\n"]:
        idx = answer.find(stop_word)
        if idx > 1:
            answer = answer[:idx].strip()
    # Optionally: cut at first linebreak if output is multi-line
    answer = answer.split("\n")[0].strip()
    return answer

# Tokenization step.
if gemma_tokenizer.pad_token is None:
    gemma_tokenizer.pad_token = gemma_tokenizer.eos_token
gemma_tokenizer.padding_side = "left"

# Building Translation Pipeline.
gemma_translator = pipeline(
    "text-generation",
    model=gemma_model,
    tokenizer=gemma_tokenizer,
    max_new_tokens=512,
    do_sample=False,
)

# Batched few-shot Translations.
batch_size = 8
n = len(sentences)
gemma_outputs = []

total_start = time.time()
for start in tqdm(range(0, n, batch_size), desc="Few-Shot Translating with Gemma"):
    batch_sentences = sentences[start:start+batch_size]
    batch_prompts = [build_fewshot_prompt(s) for s in batch_sentences]
    batch_results = gemma_translator(batch_prompts)
    for i, r in enumerate(batch_results):
        completion = r[0]["generated_text"]
        result = clean_translation(completion, batch_prompts[i])
        gemma_outputs.append(result)
total_end = time.time()
print(f"\nTotal time: {total_end - total_start:.2f} seconds")
print(f"Average per sentence: {(total_end - total_start)/n:.2f} seconds")

# Saving the Translations.
jsonl_path = "BorgiNonModernToModern-hw2_transl-gemma-fewshot.jsonl"
with open(jsonl_path, "w", encoding="utf-8") as f:
    for archaic, modern in zip(sentences, gemma_outputs):
        record = {
            "archaic_sentence": archaic,
            "gemma_translation": modern
        }
        f.write(json.dumps(record, ensure_ascii=False) + "\n")

Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Few-Shot Translating with Gemma:   0%|          | 0/13 [00:00<?, ?it/s]

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignore


Total time: 2520.21 seconds
Average per sentence: 25.98 seconds


In [None]:
# Pritning out 10 translations to see how the translation process has gone.
with open(jsonl_path, encoding="utf-8") as f:
    data = [json.loads(line) for line in f]

n_samples = min(10, len(data))
indices = random.sample(range(len(data)), n_samples)

for idx in indices:
    print(f"Archaic Sentence: {data[idx]['archaic_sentence']}")
    print(f"Gemma Translation: {data[idx]['gemma_translation']}\n")

Archaic Sentence: Io gli apersi, e quelli fuggitte. E che bisogno è, che lo cuore tuo stia chiuso al tuo sposo Cristo?
Gemma Translation: Io gli apersi, e quelli fuggono. E che bisogno è, che il cuore tuo sia chiuso al tuo sposo Cristo?

Archaic Sentence: che prendessero la paga dal camarlingo per loro dispensa et immantenente andassero alla presenzia di messer lo papa per contradiare il passamento de' cavalieri che veniano di Cecilia in Toscana
Gemma Translation: che prendessero la paga dal camarlingo per loro dispensa e andassero alla presenzia di messer lo papa per contradiare il passamento de' cavalieri che venivano da Cecilia in Toscana.

Archaic Sentence: Alexandri, ciò è il genero e 'l figliuolo, da Phausonia, gentile iovane di Macedonia, stando in uno luogo strecto sanza guardia, fue morto.
Gemma Translation: Alessandro, questo è il genere e l'uomo, da Phausonia, gentile iovane di Macedonia, fu assassinato in un luogo stretto senza guardia.

Archaic Sentence: Se questo piace a 

##### 3.2.3: CHAIN-OF-THOUGHT TRANSLATION

**TOTAL TIME-TO-RUN:** 251.41 seconds

**AVG-per-SENTENCE TIME:** 2.59 seconds

In [None]:
import json
import time
import random
from tqdm import tqdm
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer

# 1. Define Chain-of-Thought few-shot examples for GEMMA
few_shot_cot_examples = [
    (
        "Ove non sia chi ti conforti, confortati da te stesso.",
        "Qui il significato è che bisogna trovare la forza dentro di sé quando non si ha nessun altro a cui appoggiarsi.",
        "Se non c'è nessuno a consolarti, consola te stesso."
    ),
    (
        "Così nel suo cammino solingo andava pensoso e lento.",
        "La parola 'solingo' corrisponde a 'solitario', e l'espressione suggerisce una camminata lenta e riflessiva.",
        "Così nel suo cammino solitario procedeva pensieroso e lento."
    ),
    (
        "Non è oro tutto quel che riluce.",
        "Questo è un modo di dire che insegna a non fidarsi delle apparenze; la versione moderna usa 'luccica'.",
        "Non è tutto oro ciò che luccica."
    ),
]

# 2. Chain-of-Thought prompt builder
def build_cot_fewshot_prompt(archaic_sentence):
    intro = (
        "Traduci dall'italiano arcaico all'italiano moderno. "
        "Prima spiega brevemente il ragionamento, poi fornisci la traduzione finale. "
        "Segui l'esempio. Non ripetere gli esempi precedenti!"
    )
    shots = "\n".join([
        f"Esempio {i+1}:\nFrase: {a}\nRagionamento: {r}\nTraduzione: {m}"
        for i, (a, r, m) in enumerate(few_shot_cot_examples)
    ])
    test = f"Frase: {archaic_sentence}\nRagionamento:"
    return f"{intro}\n\n{shots}\n\n{test}"

# 3. Clean translation (extract only model translation)
def clean_translation_cot(raw, prompt):
    answer = raw.replace(prompt, "").strip()
    # Extract after 'Traduzione:'
    if "Traduzione:" in answer:
        answer = answer.split("Traduzione:", 1)[-1].strip()
    # Cut if model keeps generating
    for stop_word in ["Esempio", "Frase:", "\n\n"]:
        idx = answer.find(stop_word)
        if idx > 1:
            answer = answer[:idx].strip()
    answer = answer.split("\n")[0].strip()
    return answer

# 4. Load your Gemma model & tokenizer
gemma_checkpoint = "google/gemma-2b-it"
gemma_tokenizer = AutoTokenizer.from_pretrained(gemma_checkpoint, trust_remote_code=True)
gemma_model = AutoModelForCausalLM.from_pretrained(
    gemma_checkpoint,
    device_map="auto",
    torch_dtype="auto",
    trust_remote_code=True
)

# 5. Load sentences to translate (adapt this line to your source!)
with open("/content/drive/MyDrive/BorgiNonModernToModern/llm_based/zero-shot_prompting/BorgiNonModernToModern-hw2_transl-llama.jsonl", encoding="utf-8") as f:
    sentences = [json.loads(line)["archaic_sentence"] for line in f]

# 6. Tokenizer padding
if gemma_tokenizer.pad_token is None:
    gemma_tokenizer.pad_token = gemma_tokenizer.eos_token
gemma_tokenizer.padding_side = "left"

# 7. Build pipeline
gemma_translator = pipeline(
    "text-generation",
    model=gemma_model,
    tokenizer=gemma_tokenizer,
    max_new_tokens=512,
    do_sample=False,
)

# 8. Translation (CoT Few-shot) batched
batch_size = 8
n = len(sentences)
gemma_outputs = []

total_start = time.time()
for start in tqdm(range(0, n, batch_size), desc="CoT Few-Shot Translating with Gemma"):
    batch_sentences = sentences[start:start+batch_size]
    batch_prompts = [build_cot_fewshot_prompt(s) for s in batch_sentences]
    batch_results = gemma_translator(batch_prompts)
    for i, r in enumerate(batch_results):
        completion = r[0]["generated_text"]
        result = clean_translation_cot(completion, batch_prompts[i])
        gemma_outputs.append(result)
total_end = time.time()
print(f"\nTotal time: {total_end - total_start:.2f} seconds")
print(f"Average per sentence: {(total_end - total_start)/n:.2f} seconds")

# 9. Save results
jsonl_path = "BorgiNonModernToModern-hw2_transl-gemma-fewshot-cot.jsonl"
with open(jsonl_path, "w", encoding="utf-8") as f:
    for archaic, modern in zip(sentences, gemma_outputs):
        record = {
            "archaic_sentence": archaic,
            "gemma_translation": modern
        }
        f.write(json.dumps(record, ensure_ascii=False) + "\n")


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
CoT Few-Shot Translating with Gemma:   0%|          | 0/13 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORM


Total time: 251.41 seconds
Average per sentence: 2.59 seconds





In [None]:
# 10. Print 10 random translations
with open(jsonl_path, encoding="utf-8") as f:
    data = [json.loads(line) for line in f]

n_samples = min(10, len(data))
indices = random.sample(range(len(data)), n_samples)

for idx in indices:
    print(f"Archaic Sentence: {data[idx]['archaic_sentence']}")
    print(f"Gemma Translation: {data[idx]['gemma_translation']}\n")

Archaic Sentence: Vede anche le ragioni del volo degli uccielli e di tutte le cose sa rendere vero giudicio.
Gemma Translation: Si vedono anche le ragioni del viaggio degli uccielli e di tutte le cose che possono rendere vero un giudizio.

Archaic Sentence: sia in mezzo tra me e te: con noi non puo' tu già più lungamente dimorare, ch'io non lo sofferrò e non lo lascerò.
Gemma Translation: Bevi in mezzo tra noi e noi non puoi più lungamente dimorare, come se non avessi più la forza per seguirci.

Archaic Sentence: Et se l' occhio è nobile membro del corpo dell' uomo, dunque la salutazione è nobile parte della pistola, c' altressì allumina tutta la lettera come l' occhio allumina l' uomo.
Gemma Translation: Se l'occhio è un membro nobile del corpo dell'uomo, la salute è una parte importante della bellezza, e la salutazione deve essere tale da essere visibilmente più grande dell'occhio.

Archaic Sentence: Quando averai nel cavaliere i detti segni veduti non andare a grandezza caendo, perc

##### 3.2.4: RE-ACT (REASON+ACT) TRANSLATION

**TOTAL TIME-TO-RUN:** 298.97 seconds

**AVG-per-SENTENCE TIME:** 3.08 seconds

In [None]:
import json
import time
import random
from tqdm import tqdm
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer

# 1. Define robust ReAct few-shot examples for GEMMA
few_shot_react_examples = [
    (
        "Ove non sia chi ti conforti, confortati da te stesso.",
        "La frase sottolinea il valore dell'autosufficienza emotiva. 'Conforti' è un verbo arcaico per 'consolare'. Il messaggio è che bisogna consolarsi da soli se nessuno lo fa per noi.",
        "Traduci in italiano moderno.",
        "Se non c'è nessuno a consolarti, consola te stesso."
    ),
    (
        "Così nel suo cammino solingo andava pensoso e lento.",
        "'Solingo' oggi si direbbe 'solitario'. Descrive una persona che cammina da sola, lentamente, persa nei pensieri.",
        "Traduci in italiano moderno.",
        "Così nel suo cammino solitario procedeva pensieroso e lento."
    ),
    (
        "Non è oro tutto quel che riluce.",
        "È un proverbio che consiglia di non fidarsi delle apparenze; la parola moderna è 'luccica' invece di 'riluce'.",
        "Traduci in italiano moderno.",
        "Non è tutto oro ciò che luccica."
    ),
    # (Aggiungi almeno altri 2-3 esempi reali se vuoi massimizzare le performance!)
]

# 2. Robust ReAct prompt builder
def build_react_prompt(archaic_sentence):
    intro = (
        "Traduci la seguente frase dall'italiano arcaico all'italiano moderno seguendo il metodo Reason+Act (ReAct). "
        "Per ciascun esempio, prima ragiona brevemente sul significato e sulle parole arcaiche (Pensiero), poi indica l'azione (Azione) e infine fornisci la traduzione moderna (Traduzione). "
        "Non riportare gli esempi precedenti. "
        "Ecco degli esempi:"
    )
    shots = "\n".join([
        f"Esempio {i+1}:\nFrase: {a}\nPensiero: {thought}\nAzione: {action}\nTraduzione: {modern}"
        for i, (a, thought, action, modern) in enumerate(few_shot_react_examples)
    ])
    test = f"Esempio {len(few_shot_react_examples)+1}:\nFrase: {archaic_sentence}\nPensiero:"
    return f"{intro}\n\n{shots}\n\n{test}"

# 3. Improved clean_translation function
def clean_translation_react(raw, prompt):
    answer = raw.replace(prompt, "").strip()
    # Find the translation after "Traduzione:"
    if "Traduzione:" in answer:
        answer = answer.split("Traduzione:", 1)[-1].strip()
    # Cut off generation if model keeps generating more
    for stop_word in ["Esempio", "Frase:", "Pensiero:", "Azione:", "\n\n"]:
        idx = answer.find(stop_word)
        if idx > 1:
            answer = answer[:idx].strip()
    # Only keep the first line (just in case)
    answer = answer.split("\n")[0].strip()
    return answer

# 4. Load Gemma model & tokenizer
gemma_checkpoint = "google/gemma-2b-it"
gemma_tokenizer = AutoTokenizer.from_pretrained(gemma_checkpoint, trust_remote_code=True)
gemma_model = AutoModelForCausalLM.from_pretrained(
    gemma_checkpoint,
    device_map="auto",
    torch_dtype="auto",
    trust_remote_code=True
)

# 5. Load sentences to translate
with open("/content/drive/MyDrive/BorgiNonModernToModern/llm_based/zero-shot_prompting/BorgiNonModernToModern-hw2_transl-llama.jsonl", encoding="utf-8") as f:
    sentences = [json.loads(line)["archaic_sentence"] for line in f]

# 6. Tokenizer padding setup
if gemma_tokenizer.pad_token is None:
    gemma_tokenizer.pad_token = gemma_tokenizer.eos_token
gemma_tokenizer.padding_side = "left"

# 7. Build translation pipeline
gemma_translator = pipeline(
    "text-generation",
    model=gemma_model,
    tokenizer=gemma_tokenizer,
    max_new_tokens=512,
    do_sample=False,
)

# 8. ReAct translation in batch
batch_size = 8
n = len(sentences)
gemma_outputs = []

total_start = time.time()
for start in tqdm(range(0, n, batch_size), desc="ReAct Translating with Gemma"):
    batch_sentences = sentences[start:start+batch_size]
    batch_prompts = [build_react_prompt(s) for s in batch_sentences]
    batch_results = gemma_translator(batch_prompts)
    for i, r in enumerate(batch_results):
        completion = r[0]["generated_text"]
        result = clean_translation_react(completion, batch_prompts[i])
        gemma_outputs.append(result)
total_end = time.time()
print(f"\nTotal time: {total_end - total_start:.2f} seconds")
print(f"Average per sentence: {(total_end - total_start)/n:.2f} seconds")

# 9. Save results
jsonl_path = "BorgiNonModernToModern-hw2_transl-gemma-react.jsonl"
with open(jsonl_path, "w", encoding="utf-8") as f:
    for archaic, modern in zip(sentences, gemma_outputs):
        record = {
            "archaic_sentence": archaic,
            "gemma_translation": modern
        }
        f.write(json.dumps(record, ensure_ascii=False) + "\n")




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
ReAct Translating with Gemma:   0%|          | 0/13 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VER


Total time: 298.97 seconds
Average per sentence: 3.08 seconds





In [None]:
# 10. Print 10 random translations for inspection
with open(jsonl_path, encoding="utf-8") as f:
    data = [json.loads(line) for line in f]

n_samples = min(10, len(data))
indices = random.sample(range(len(data)), n_samples)

for idx in indices:
    print(f"Archaic Sentence: {data[idx]['archaic_sentence']}")
    print(f"Gemma Translation: {data[idx]['gemma_translation']}\n")

Archaic Sentence: L'oro verrà dall'Aquilone. Che figuriamo noi per l'Aquilone, se non il populo Gentile congelato dal freddo del peccato, il qual populo tenne sotto il giogo della sua tirannia
Gemma Translation: L'oro verrà dall'Aquilone. Che figuriamo noi per l'Aquilone, se non il populo Gentile è congelato dal freddo del peccato, il qual populo tenne sotto il giogo della sua tirannia.

Archaic Sentence: che prendessero la paga dal camarlingo per loro dispensa et immantenente andassero alla presenzia di messer lo papa per contradiare il passamento de' cavalieri che veniano di Cecilia in Toscana
Gemma Translation: Quale prendesse la paga dal camarlingo per loro dispensa e andò alla presenzia di messer lo papa per contradire il passamento de' cavalieri che venivano di Cecilia in Toscana.

Archaic Sentence: Dio, per la quale si dispensano et iudicano tutte le cose.
Gemma Translation: Dio, per la quale si dispensano tutte le cose.

Archaic Sentence: l'armi et insieme con loro passaseno tr

### 4: LLM-AS-A-JUDGE EVALUATION

#### 4.1: GEMINI-2.0-FLASH - GENERAL EVALUATION

We will use here Gemini-2.0-Flash Model in a "General" setting evaluation. This will simply give us a single score.

In [None]:
# Information to-set:
input_jsonl = "/content/drive/MyDrive/BorgiNonModernToModern/llm_based/zero-shot_prompting/BorgiNonModernToModern-hw2_transl-gemma.jsonl"
llm_type = "gemma"
model_name = "gemini"
judge_type = "general"

# This mustn't be changed.
translation_col = f"{llm_type}_translation"

In [None]:
# Define the function to run teh Judge.
@retry(wait=wait_random_exponential(min=10, max=60), stop=stop_after_attempt(5))
def llm_judge_general(sentence, translation):

    prompt = f"""
    You are an expert evaluator of machine translations from Archaic Italian to Modern Italian.

    For each translation, assign a score from 1 (worst) to 5 (best), using this rubric:

    1: Completely unacceptable translation. The translation has no pertinence with the original meaning; the generated sentence is either gibberish or makes no sense.
    2: Severe semantic errors, omissions or substantial additions on the original sentence. The errors are semantic and syntactic in nature. It’s still something no human would ever write.
    3: Partially wrong translation. The translation is lackluster; it contains errors, but mostly minor errors, like typos, or small semantic errors.
    4: Good translation. The translation is mostly right, substantially faithful to the original text, but the style does not perfectly match the original sentence; still fluent and comprehensible, and could be semantically acceptable.
    5: Perfect translation. The translation is accurate, fluent, complete and coherent. It retained the original meaning as much as it could.

    Evaluate ONLY the translation quality according to these guidelines.

    Original (Archaic Italian): {sentence}

    Translation (Modern Italian): {translation}

    Your score (1-5):
    """
    response = LLM_as_a_Judge_model.generate_content(prompt, generation_config={"temperature": 0.0})
    score_str = response.text.strip()
    match = re.search(r"\b([1-5])\b", score_str)
    if match:
        return int(match.group(1))
    return None

In [None]:
# 1) Configure Gemini API and model.
genai.configure(api_key=gemini_api_key)
LLM_as_a_Judge_model = genai.GenerativeModel("gemini-2.0-flash")

# 2) Load DataFrame from JSONL.
ds = pd.read_json(input_jsonl, lines=True)

# 3) Score all translations, pausing every 15 requests (to avoid API rate limits).
tqdm.pandas()
judge_col = f"{translation_col}_{judge_type}_judge_score"

scores = []
for i, row in tqdm(ds.iterrows(), total=len(ds)):
    score = llm_judge_general(row["archaic_sentence"], row[translation_col])
    scores.append(score)
    if (i + 1) % 15 == 0:
        print("🕒 Sleeping for 60 seconds to avoid API rate limit...")
        time.sleep(60)

ds[judge_col] = scores

# Save as JSONL.
output_cols = ["archaic_sentence", translation_col, judge_col]
jsonl_filename = f"BorgiNonModernToModern-hw2_transl-judge_{model_name}-{judge_type}_{llm_type}-model.jsonl"
with open(jsonl_filename, "w", encoding="utf8") as fout:
    for record in ds[output_cols].to_dict(orient="records"):
        fout.write(json.dumps(record, ensure_ascii=False) + "\n")

  0%|          | 0/97 [00:00<?, ?it/s]

🕒 Sleeping for 60 seconds to avoid API rate limit...
🕒 Sleeping for 60 seconds to avoid API rate limit...
🕒 Sleeping for 60 seconds to avoid API rate limit...
🕒 Sleeping for 60 seconds to avoid API rate limit...
🕒 Sleeping for 60 seconds to avoid API rate limit...
🕒 Sleeping for 60 seconds to avoid API rate limit...


#### 4.2: GEMINI-2.0-FLASH - MULTI-CRITERIA EVALUATION

We will use here Gemini-2.0-Flash Model in a "Multi-Criteria" setting evaluation. This will give us a set of 4 scores for each sentence, evaluating the Adequacy, Fluency, Style and Completeness.

In [None]:
# Information to-set:
input_jsonl = "/content/drive/MyDrive/BorgiNonModernToModern/llm_based/zero-shot_prompting/BorgiNonModernToModern-hw2_transl-llama.jsonl"
llm_type = "llama"
model_name = "gemini"
judge_type = "MultiCriteria"

# This mustn't be changed.
translation_col = f"{llm_type}_translation"

In [None]:
@retry(wait=wait_random_exponential(min=10, max=60), stop=stop_after_attempt(5))
def llm_judge_multicrit(sentence, translation):
    prompt = f"""
You are an expert evaluator of machine translations from Archaic Italian to Modern Italian.
For each translation, assign a score from 1 (worst) to 5 (best) on the following four criteria. Here is the meaning of each score for each criterion:

Adequacy:
1 - The translation does not capture the original meaning at all.
2 - The translation is mostly wrong; the main meaning is lost, but there are rare fragments of meaning.
3 - Some meaning is preserved, but important information is lost or altered.
4 - Most meaning is present, with only minor issues; very little is lost.
5 - All essential meaning from the original is preserved.

Fluency:
1 - The translation is unreadable or ungrammatical; clearly machine-generated.
2 - The translation has severe grammar errors, unnatural phrasing, or frequent awkwardness.
3 - Some awkwardness or minor grammar issues, but still understandable.
4 - Mostly fluent and grammatical, only rare awkward or unnatural expressions.
5 - Perfectly fluent, fully natural Italian.

Style:
1 - The tone/register is completely lost or inappropriate.
2 - The style is mostly lost; it is awkward or inappropriate for the context.
3 - The style is partially preserved but inconsistent or awkward.
4 - The style is almost fully preserved, with only minor slips.
5 - The style, tone, and register are perfectly matched to the original.

Completeness:
1 - Major parts are omitted or unnecessary parts are added.
2 - The translation is incomplete; many elements are missing or excessive additions present.
3 - Minor omissions/additions, but most information is present.
4 - Almost everything is present, with only trivial information missing or added.
5 - Complete; nothing important is lost or added.

Output ONLY the four scores as numbers 1-5, in exactly this format (no extra text):

Adequacy: <score>
Fluency: <score>
Style: <score>
Completeness: <score>

Original (Archaic Italian): {sentence}

Translation (Modern Italian): {translation}
    """
    response = LLM_as_a_Judge_model.generate_content(prompt, generation_config={"temperature": 0.0})
    text = response.text.strip()
    adequacy = re.search(r"Adequacy:\s*([1-5])", text)
    fluency = re.search(r"Fluency:\s*([1-5])", text)
    style = re.search(r"Style:\s*([1-5])", text)
    completeness = re.search(r"Completeness:\s*([1-5])", text)
    return {
        "AdequacyScore": int(adequacy.group(1)) if adequacy else None,
        "FluencyScore": int(fluency.group(1)) if fluency else None,
        "StyleScore": int(style.group(1)) if style else None,
        "CompletenessScore": int(completeness.group(1)) if completeness else None,
    }

In [None]:
# 1) Configure Gemini API and model.
genai.configure(api_key=gemini_api_key)
LLM_as_a_Judge_model = genai.GenerativeModel("gemini-2.0-flash")

# 2) Load DataFrame from JSONL.
ds = pd.read_json(input_jsonl, lines=True)

# 3) Score all translations, pausing every 15 requests (to avoid API rate limits).
tqdm.pandas()
judge_col = f"{translation_col}_{judge_type}_judge_scores"   # <-- "_scores" for dict

scores = []
for i, row in tqdm(ds.iterrows(), total=len(ds)):
    # This returns a dictionary with four scores
    score_dict = llm_judge_multicrit(row["archaic_sentence"], row[translation_col])
    scores.append(score_dict)
    if (i + 1) % 15 == 0:
        print("🕒 Sleeping for 60 seconds to avoid API rate limit...")
        time.sleep(60)

ds[judge_col] = scores

# Save as JSONL.
output_cols = ["archaic_sentence", translation_col, judge_col]
jsonl_filename = f"BorgiNonModernToModern-hw2_transl-judge_{model_name}-{judge_type}_{llm_type}-model.jsonl"
with open(jsonl_filename, "w", encoding="utf8") as fout:
    for record in ds[output_cols].to_dict(orient="records"):
        fout.write(json.dumps(record, ensure_ascii=False) + "\n")

  0%|          | 0/97 [00:00<?, ?it/s]

🕒 Sleeping for 60 seconds to avoid API rate limit...
🕒 Sleeping for 60 seconds to avoid API rate limit...
🕒 Sleeping for 60 seconds to avoid API rate limit...
🕒 Sleeping for 60 seconds to avoid API rate limit...


ERROR:tornado.access:503 POST /v1beta/models/gemini-2.0-flash:generateContent?%24alt=json%3Benum-encoding%3Dint (127.0.0.1) 303.53ms


🕒 Sleeping for 60 seconds to avoid API rate limit...
🕒 Sleeping for 60 seconds to avoid API rate limit...


#### 4.3: QWEN-OPENELM-GEMINI DEBATE-and-CONSENSUS: REFERENCE-FREE SELF-IMPROVING LLM EVALUATION for MACHINE TRANSLATION

This evaluation framework introduces a novel, fully reference-free pipeline for scoring machine translations by leveraging the **debate and consensus** paradigm with large language models (LLMs):

**MULTI-CRITERIA DUAL-JUDGE SCORING**

Each machine translation is independently evaluated by two strong local LLMs, **Qwen** and **OpenELM**, according to four established dimensions: adequacy, fluency, style, and completeness. Each judge provides a score from 1 (worst) to 5 (best) for each criterion.

**DEBATE & CONSENSUS RESOLUTION**

The scores from both judges, along with the original sentence and translation, are then submitted to a third model, **Gemini** (via API). Gemini receives a specialized prompt to "debate" the merits of each judge's scores and determine if any criteria should be adjusted. This process simulates expert panel discussion, encouraging self-correction and consensus formation **without the need for gold-standard reference translations**.

**ROBUST REFERENCE-FREE EVALUATION**

By combining independent perspectives from different models and refining them through debate, this pipeline increases evaluation robustness and mitigates single-model bias. All debate prompts and consensus decisions are logged, providing transparency and a rich resource for future analysis.

This approach enables self-improving, reference-free machine translation evaluation, supporting large-scale, reliable assessment of translation quality in scenarios where high-quality human references are unavailable.


##### **FURTHER DETAILS**:
**QWEN 3.1.7B - Hugging-Face Reference Page:** https://huggingface.co/Qwen/Qwen3-1.7B

**OPENELM-3B-Instruct - Hugging-Face Reference Page:** https://huggingface.co/apple/OpenELM-3B-Instruct

**QWEN+OPENELM GPU-RAM:** 9.7GB
**SELENE+ARCTIC GPU-RAM:** 12.0GB

In [7]:
# Information to-set:
input_jsonl = "/content/drive/MyDrive/BorgiNonModernToModern/llm_based/zero-shot_prompting/BorgiNonModernToModern-hw2_transl-llama.jsonl"
llm_type = "llama"
model_name = "gemini"
judge_type = "MultiCriteria-Qwen-OpenElm-Debate&Consensus"

# This mustn't be changed.
translation_col = f"{llm_type}_translation"

In [None]:
import re, json, time
import pandas as pd
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from tenacity import retry, wait_random_exponential, stop_after_attempt

# — Configuration —
INPUT_JSONL = "/content/drive/MyDrive/BorgiNonModernToModern/llm_based/zero-shot_prompting/BorgiNonModernToModern-hw2_transl-llama.jsonl"
llm_type = "llama"
model_name = "gemini"
#judge_type = "MultiCriteria-Qwen-OpenElm-Debate&Consensus"
judge_type = "qwen"
TRANSLATION_COL = f"{llm_type}_translation"
OUTPUT_JSONL = f"BorgiNonModernToModern-hw2_transl-judge_{judge_type}-{llm_type}-model.jsonl"
MULTICRIT_PROMPT = """
You are an expert evaluator of machine translations from Archaic Italian to Modern Italian.
For each translation, assign a score from 1 (worst) to 5 (best) on the following four criteria.

Adequacy:
1 - The translation does not capture the original meaning at all.
2 - The translation is mostly wrong; the main meaning is lost, but there are rare fragments of meaning.
3 - Some meaning is preserved, but important information is lost or altered.
4 - Most meaning is present, with only minor issues; very little is lost.
5 - All essential meaning from the original is preserved.

Fluency:
1 - The translation is unreadable or ungrammatical; clearly machine-generated.
2 - The translation has severe grammar errors, unnatural phrasing, or frequent awkwardness.
3 - Some awkwardness or minor grammar issues, but still understandable.
4 - Mostly fluent and grammatical, only rare awkward or unnatural expressions.
5 - Perfectly fluent, fully natural Italian.

Style:
1 - The tone/register is completely lost or inappropriate.
2 - The style is mostly lost; it is awkward or inappropriate for the context.
3 - The style is partially preserved but inconsistent or awkward.
4 - The style is almost fully preserved, with only minor slips.
5 - The style, tone, and register are perfectly matched to the original.

Completeness:
1 - Major parts are omitted or unnecessary parts are added.
2 - The translation is incomplete; many elements are missing or excessive additions present.
3 - Minor omissions/additions, but most information is present.
4 - Almost everything is present, with only trivial information missing or added.
5 - Complete; nothing important is lost or added.

Output ONLY the four scores as numbers 1-5, in exactly this format (no extra text):

Adequacy: <score>
Fluency: <score>
Style: <score>
Completeness: <score>

Original (Archaic Italian): {sentence}

Translation (Modern Italian): {translation}
"""
def parse_multicrit_scores(text):
    get = lambda label: int(
        re.search(rf"{label}:\s*([1-5])", text).group(1)
    )
    return {
        "Adequacy":    get("Adequacy"),
        "Fluency":     get("Fluency"),
        "Style":       get("Style"),
        "Completeness":get("Completeness"),
    }

@retry(wait=wait_random_exponential(10,60), stop=stop_after_attempt(5))
def qwen_judge(pipe, sentence, translation):
    prompt = MULTICRIT_PROMPT.format(sentence=sentence, translation=translation)
    out = pipe(prompt)[0]["generated_text"]
    return parse_multicrit_scores(out)

def main():
    # Load data
    df = pd.read_json(INPUT_JSONL, lines=True)

    # Load Qwen pipeline
    qwen_ckpt = "rd211/Qwen3-1.7B-Instruct-RAG"
    qwen_pipe = pipeline(
        "text-generation",
        model=AutoModelForCausalLM.from_pretrained(qwen_ckpt, device_map="auto", torch_dtype="auto", trust_remote_code=True),
        tokenizer=AutoTokenizer.from_pretrained(qwen_ckpt, trust_remote_code=True),
        max_new_tokens=512,
        do_sample=False,
    )

    # Run judgments
    results = []
    for idx, row in tqdm(df.iterrows(), total=len(df)):
        sent  = row["archaic_sentence"]
        trans = row[TRANSLATION_COL]
        scores = qwen_judge(qwen_pipe, sent, trans)
        results.append({
            "idx": idx,
            "archaic_sentence": sent,
            TRANSLATION_COL: trans,
            "qwen_scores": scores,
        })

    # Save
    with open(OUTPUT_JSONL, "w", encoding="utf8") as fout:
        for rec in results:
            fout.write(json.dumps(rec, ensure_ascii=False) + "\n")
    print(f"✔ Qwen scores written to {OUTPUT_JSONL}")

if __name__ == "__main__":
    main()

config.json:   0%|          | 0.00/975 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.44G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/9.76k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/707 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/613 [00:00<?, ?B/s]

Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
  0%|          | 0/97 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
  1%|          | 1/97 [08:40<13:52:34, 520.36s/it]The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
  2%|▏         | 2/97 [17:03<13:28:16, 510.49s/it]The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
  3%|▎         | 3/97 [25:13<13:04:31, 500.76s/it]The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
  4%|▍         | 4/97 [33:39<12:59:29, 502.90s/it]The following generation flags are not valid and may be ignored: ['

In [None]:
import re
import json
import pandas as pd
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from tenacity import retry, wait_random_exponential, stop_after_attempt

# — Configuration —
INPUT_JSONL     = "/content/drive/MyDrive/BorgiNonModernToModern/llm_based/zero-shot_prompting/BorgiNonModernToModern-hw2_transl-llama.jsonl"
llm_type        = "llama"
judge_type      = "openELM"
TRANSLATION_COL = f"{llm_type}_translation"
OUTPUT_JSONL    = f"BorgiNonModernToModern-hw2_transl-judge_{judge_type}-{llm_type}-model.jsonl"

# — Prompt template (same as Qwen) —
MULTICRIT_PROMPT = """
You are an expert evaluator of machine translations from Archaic Italian to Modern Italian.
For each translation, assign a score from 1 (worst) to 5 (best) on the following four criteria.

Adequacy:
1 - The translation does not capture the original meaning at all.
2 - The translation is mostly wrong; the main meaning is lost, but there are rare fragments of meaning.
3 - Some meaning is preserved, but important information is lost or altered.
4 - Most meaning is present, with only minor issues; very little is lost.
5 - All essential meaning from the original is preserved.

Fluency:
1 - The translation is unreadable or ungrammatical; clearly machine-generated.
2 - The translation has severe grammar errors, unnatural phrasing, or frequent awkwardness.
3 - Some awkwardness or minor grammar issues, but still understandable.
4 - Mostly fluent and grammatical, only rare awkward or unnatural expressions.
5 - Perfectly fluent, fully natural Italian.

Style:
1 - The tone/register is completely lost or inappropriate.
2 - The style is mostly lost; it is awkward or inappropriate for the context.
3 - The style is partially preserved but inconsistent or awkward.
4 - The style is almost fully preserved, with only minor slips.
5 - The style, tone, and register are perfectly matched to the original.

Completeness:
1 - Major parts are omitted or unnecessary parts are added.
2 - The translation is incomplete; many elements are missing or excessive additions present.
3 - Minor omissions/additions, but most information is present.
4 - Almost everything is present, with only trivial information missing or added.
5 - Complete; nothing important is lost or added.

Output ONLY the four scores as numbers 1-5, in exactly this format (no extra text):

Adequacy: <score>
Fluency: <score>
Style: <score>
Completeness: <score>

Original (Archaic Italian): {sentence}

Translation (Modern Italian): {translation}
"""

def parse_multicrit_scores(text):
    get = lambda label: int(re.search(rf"{label}:\s*([1-5])", text).group(1))
    return {
        "Adequacy":    get("Adequacy"),
        "Fluency":     get("Fluency"),
        "Style":       get("Style"),
        "Completeness":get("Completeness"),
    }

@retry(wait=wait_random_exponential(10, 60), stop=stop_after_attempt(5))
def openelm_judge(pipe, sentence, translation):
    prompt = MULTICRIT_PROMPT.format(sentence=sentence, translation=translation)
    out = pipe(prompt)[0]["generated_text"]
    return parse_multicrit_scores(out)

def main():
    # Load data
    df = pd.read_json(INPUT_JSONL, lines=True)

    # Load OpenELM pipeline
    openelm_ckpt = "apple/OpenELM-3B-Instruct"
    try:
        tokenizer = AutoTokenizer.from_pretrained(openelm_ckpt, trust_remote_code=True)
    except ValueError:
        # Fallback to Llama2 tokenizer
        tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf", trust_remote_code=True)

    openelm_pipe = pipeline(
        "text-generation",
        model=AutoModelForCausalLM.from_pretrained(
            openelm_ckpt,
            device_map="auto",
            torch_dtype="auto",
            trust_remote_code=True
        ),
        tokenizer=tokenizer,
        max_new_tokens=512,
        do_sample=False,
    )

    # Run judgments
    results = []
    for idx, row in tqdm(df.iterrows(), total=len(df)):
        sent  = row["archaic_sentence"]
        trans = row[TRANSLATION_COL]
        scores = openelm_judge(openelm_pipe, sent, trans)
        results.append({
            "idx": idx,
            "archaic_sentence": sent,
            TRANSLATION_COL: trans,
            "openelm_scores": scores,
        })

    # Save to JSONL
    with open(OUTPUT_JSONL, "w", encoding="utf8") as fout:
        for rec in results:
            fout.write(json.dumps(rec, ensure_ascii=False) + "\n")
    print(f"✔ OpenELM scores written to {OUTPUT_JSONL}")

if __name__ == "__main__":
    main()


tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
  0%|          | 0/97 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  1%|          | 1/97 [00:19<31:05, 19.43s/it]The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  2%|▏         | 2/97 [00:22<15:04,  9.52s/it]The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  3%|▎         | 3/97 [00:24<09:54,  6.33s/it]The following generation flags are not valid and may be ign

✔ OpenELM scores written to BorgiNonModernToModern-hw2_transl-judge_OpenELM-llama-model.jsonl





In [None]:
!pip install genai

Collecting genai
  Downloading genai-2.1.0-py3-none-any.whl.metadata (6.5 kB)
Collecting ipython<9.0.0,>=8.10.0 (from genai)
  Downloading ipython-8.37.0-py3-none-any.whl.metadata (5.1 kB)
Collecting openai<0.28.0,>=0.27.0 (from genai)
  Downloading openai-0.27.10-py3-none-any.whl.metadata (13 kB)
Collecting tiktoken<0.4.0,>=0.3.2 (from genai)
  Downloading tiktoken-0.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.2 kB)
Collecting jedi>=0.16 (from ipython<9.0.0,>=8.10.0->genai)
  Downloading jedi-0.19.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting stack_data (from ipython<9.0.0,>=8.10.0->genai)
  Downloading stack_data-0.6.3-py3-none-any.whl.metadata (18 kB)
Collecting traitlets>=5.13.0 (from ipython<9.0.0,>=8.10.0->genai)
  Downloading traitlets-5.14.3-py3-none-any.whl.metadata (10 kB)
Collecting executing>=1.2.0 (from stack_data->ipython<9.0.0,>=8.10.0->genai)
  Downloading executing-2.2.0-py2.py3-none-any.whl.metadata (8.9 kB)
Collecting asttokens>=2.

In [11]:
!pip install google-generativeai




In [21]:
import re
import json
import time
import google.generativeai as genai
from tenacity import retry, wait_random_exponential, stop_after_attempt

# — Configuration —
llm_type        = "llama"
scores_qwen     = f"/content/drive/MyDrive/BorgiNonModernToModern/llm_based/zero-shot_prompting/eval/multi-criteria-debate&consensus(qwen-openelm-gemini)/BorgiNonModernToModern-hw2_transl-judge_qwen-llama-model.jsonl"
scores_openelm  = f"/content/drive/MyDrive/BorgiNonModernToModern/llm_based/zero-shot_prompting/eval/multi-criteria-debate&consensus(qwen-openelm-gemini)/BorgiNonModernToModern-hw2_transl-judge_openELM-llama-model.jsonl"
output_jsonl    = f"BorgiNonModernToModern-hw2_transl-judge_DebateConsensusQwenOpenELMGemini-{llm_type}-model.jsonl"
logs_jsonl      = f"BorgiNonModernToModern-hw2_transl-judge_DebateConsensusQwenOpenELMGemini-{llm_type}-logs.jsonl"

# 1) Configure Gemini API and model.
genai.configure(api_key=gemini_api_key)
gemini = genai.get_model("gemini-2.0-flash")

def parse_multicrit_scores(text):
    get = lambda label: int(re.search(rf"{label}:\s*([1-5])", text).group(1))
    return {
        "Adequacy":    get("Adequacy"),
        "Fluency":     get("Fluency"),
        "Style":       get("Style"),
        "Completeness":get("Completeness"),
    }

@retry(wait=wait_random_exponential(10, 60), stop=stop_after_attempt(5))
def gemini_debate_consensus(sentence, translation, s1, s2):
    prompt = f"""
Two expert judges scored this translation independently.

Expert 1 scores:
Adequacy: {s1['Adequacy']}
Fluency: {s1['Fluency']}
Style: {s1['Style']}
Completeness: {s1['Completeness']}

Expert 2 scores:
Adequacy: {s2['Adequacy']}
Fluency: {s2['Fluency']}
Style: {s2['Style']}
Completeness: {s2['Completeness']}

Original (Archaic Italian): {sentence}
Translation (Modern Italian): {translation}

Please debate which scores are most accurate and, if any should change,
output ONLY the final four scores in exactly this format (no extra text):

Adequacy: <1–5>
Fluency: <1–5>
Style: <1–5>
Completeness: <1–5>
"""
    # Directly call generate_text, passing the model name
    resp = genai.GenerativeModel("gemini-2.0-flash").generate_content(prompt, generation_config={"temperature": 0.0})
    text = resp.text.strip()
    return parse_multicrit_scores(text), prompt, text

def main():
    # Load the Qwen & OpenELM score files into dicts keyed by idx
    qwen_data    = {r["idx"]: r for r in map(json.loads, open(scores_qwen, encoding="utf8"))}
    openelm_data = {r["idx"]: r for r in map(json.loads, open(scores_openelm, encoding="utf8"))}

    final_records = []
    debate_logs   = []

    for idx in sorted(qwen_data):
        rec1 = qwen_data[idx]
        rec2 = openelm_data[idx]

        sent  = rec1["archaic_sentence"]
        trans = rec1[f"{llm_type}_translation"]
        s1    = rec1["qwen_scores"]
        s2    = rec2["openelm_scores"]

        consensus, prompt, response = gemini_debate_consensus(sent, trans, s1, s2)
        changed = (consensus != s1 and consensus != s2)

        final_records.append({
            "idx":                  idx,
            "archaic_sentence":     sent,
            f"{llm_type}_translation": trans,
            "qwen_scores":          s1,
            "openelm_scores":       s2,
            "debate_consensus":     consensus,
            "debate_model":         "gemini",
            "debate_changed":       changed,
        })
        debate_logs.append({
            "idx":             idx,
            "prompt":          prompt,
            "response":        response,
            "consensus_scores": consensus,
        })

        # throttle to respect rate limits
        if (idx + 1) % 15 == 0:
            time.sleep(60)

    # Write out the final judgments
    with open(output_jsonl, "w", encoding="utf8") as fout:
        for rec in final_records:
            fout.write(json.dumps(rec, ensure_ascii=False) + "\n")

    # Write out the debate logs
    with open(logs_jsonl, "w", encoding="utf8") as fout:
        for log in debate_logs:
            fout.write(json.dumps(log, ensure_ascii=False) + "\n")

    print(f"✔ Debate & consensus written to {output_jsonl}")
    print(f"✔ Logs written to {logs_jsonl}")

if __name__ == "__main__":
    main()

✔ Debate & consensus written to BorgiNonModernToModern-hw2_transl-judge_DebateConsensusQwenOpenELMGemini-llama-model.jsonl
✔ Logs written to BorgiNonModernToModern-hw2_transl-judge_DebateConsensusQwenOpenELMGemini-llama-logs.jsonl


#### 4.3: M-PROMETHEUS - GENERAL EVALUATION

I will use here the M-Prometheus to evaluate the translations as suggested.

In [None]:
!pip install -U prometheus-eval
!pip install vllm



In [None]:
# !pip install prometheus-eval==0.2.7  # Run once in Colab or locally

import pandas as pd
from prometheus_eval.vllm import VLLM
from prometheus_eval import PrometheusEval
from prometheus_eval.prompts import ABSOLUTE_PROMPT, SCORE_RUBRIC_TEMPLATE
from tqdm import tqdm
import json
import time

# --- 1. Load your translations ---
input_jsonl = "/content/drive/MyDrive/BorgiNonModernToModern/llm_based/zero-shot_prompting/BorgiNonModernToModern-hw2_transl-gemma.jsonl"
df = pd.read_json(input_jsonl, lines=True)

# --- 2. Set up the M-Prometheus-7B model ---
judge_model = VLLM(model="Unbabel/M-Prometheus-7B")
judge = PrometheusEval(model=judge_model, absolute_grade_template=ABSOLUTE_PROMPT)

# --- 3. Prepare a general rubric for translation quality ---
rubric_data = {
    "criteria": "General translation quality from Archaic Italian to Modern Italian.",
    "score1_description": "Completely unacceptable translation. The translation has no pertinence with the original meaning, or makes no sense.",
    "score2_description": "Severe semantic errors, omissions or substantial additions. Errors are semantic/syntactic in nature. No human would write this.",
    "score3_description": "Partially wrong translation. The translation contains errors, but mostly minor ones (typos or small semantic errors).",
    "score4_description": "Good translation. Substantially faithful, but the style does not perfectly match the original; still fluent and acceptable.",
    "score5_description": "Perfect translation. Accurate, fluent, complete, and coherent. It retains the original meaning as much as possible."
}
score_rubric = SCORE_RUBRIC_TEMPLATE.format(**rubric_data)

# --- 4. Define the judge function using Prometheus ---
def prometheus_judge(sentence, translation):
    # The instruction (as suggested by the paper and official demos)
    instruction = "You are a professional linguist. Evaluate the modern Italian translation of an archaic Italian sentence."
    response = judge.absolute_grade(
        instruction=instruction,
        orig_response=translation,
        orig_reference_answer=None,  # No gold ref available
        orig_criteria=rubric_data["criteria"],
        orig_score1_description=rubric_data["score1_description"],
        orig_score2_description=rubric_data["score2_description"],
        orig_score3_description=rubric_data["score3_description"],
        orig_score4_description=rubric_data["score4_description"],
        orig_score5_description=rubric_data["score5_description"],
        input_sentence=sentence
    )
    # Prometheus returns a dict: {'score': int, 'feedback': str}
    return response["score"], response.get("feedback", "")

# --- 5. Run evaluation on your dataset (sleep every 15 to avoid OOM/throttling if needed) ---
translation_col = "gemma_translation"
judge_col = f"{translation_col}_prometheus_general_judge_score"
judge_feedback_col = f"{translation_col}_prometheus_general_judge_feedback"

scores = []
feedbacks = []
for i, row in tqdm(df.iterrows(), total=len(df)):
    score, feedback = prometheus_judge(row["archaic_sentence"], row[translation_col])
    scores.append(score)
    feedbacks.append(feedback)
    if (i+1) % 15 == 0:
        print("🕒 Sleeping for 10 seconds to avoid OOM/throttling...")
        time.sleep(10)  # Adjust for your hardware

df[judge_col] = scores
df[judge_feedback_col] = feedbacks

# --- 6. Save as JSONL ---
output_cols = ["archaic_sentence", translation_col, judge_col, judge_feedback_col]
output_jsonl = f"BorgiNonModernToModern-hw2_transl-judge_m-prometheus-general_{translation_col}.jsonl"

with open(output_jsonl, "w", encoding="utf8") as fout:
    for record in df[output_cols].to_dict(orient="records"):
        fout.write(json.dumps(record, ensure_ascii=False) + "\n")

print(f"✅ All translations scored by M-Prometheus-7B and saved to {output_jsonl}")


INFO 05-28 21:26:34 [__init__.py:31] Available plugins for group vllm.general_plugins:
INFO 05-28 21:26:34 [__init__.py:33] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver
INFO 05-28 21:26:34 [__init__.py:36] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load.


config.json:   0%|          | 0.00/790 [00:00<?, ?B/s]

INFO 05-28 21:26:54 [config.py:793] This model supports multiple tasks: {'reward', 'embed', 'generate', 'score', 'classify'}. Defaulting to 'generate'.
INFO 05-28 21:26:54 [llm_engine.py:230] Initializing a V0 LLM engine (v0.9.0) with config: model='Unbabel/M-Prometheus-7B', speculative_config=None, tokenizer='Unbabel/M-Prometheus-7B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=32768, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, co

tokenizer_config.json:   0%|          | 0.00/7.31k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/613 [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/243 [00:00<?, ?B/s]

INFO 05-28 21:27:02 [cuda.py:240] Cannot use FlashAttention-2 backend for Volta and Turing GPUs.
INFO 05-28 21:27:02 [cuda.py:289] Using XFormers backend.
INFO 05-28 21:27:03 [parallel_state.py:1064] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0
INFO 05-28 21:27:03 [model_runner.py:1170] Starting to load model Unbabel/M-Prometheus-7B...
INFO 05-28 21:27:04 [weight_utils.py:291] Using model weights format ['*.safetensors']


model-00001-of-00004.safetensors:   0%|          | 0.00/4.88G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.33G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.09G [00:00<?, ?B/s]

INFO 05-28 21:35:41 [weight_utils.py:307] Time spent downloading weights for Unbabel/M-Prometheus-7B: 517.368201 seconds


model.safetensors.index.json:   0%|          | 0.00/27.8k [00:00<?, ?B/s]

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]


INFO 05-28 21:36:47 [default_loader.py:280] Loading weights took 65.17 seconds
INFO 05-28 21:36:48 [model_runner.py:1202] Model loading took 14.2488 GiB and 584.227320 seconds


OutOfMemoryError: CUDA out of memory. Tried to allocate 224.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 106.12 MiB is free. Process 66765 has 14.63 GiB memory in use. Of the allocated memory 14.47 GiB is allocated by PyTorch, and 54.51 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

# TO-DO: STRICTLY REQUIRED
- Do the manual annotation over the generated text and study the correlation between the judge and your manual annotation. Here I will need to:
  - Do the evaluation as if I was the LLM-as-a-Judge for all the models' results.
  - Compute the Cohen's Kappa Correlation.  
- Use M-Prometheus as-LLM-as-a-Judge, and compute agreement with
your manual annotations.

# TO-DO: PROMPTING TECHNIQUES
- Role-Playing Prompt
- Meta-Prompting / Self-Consistency
-ReAct

# TO-DO: EVALUATION
- Automatic Reference-Free Metrics (LLM-based).
- Ensemble of Judges.