# AMT - AUTOMATIC MACHINE TRANSLATION

@alessioborgi

### 0: IMPORTING LIBRARIES

In [1]:
!pip install -U datasets bitsandbytes accelerate
!pip install huggingface-hub pandas transformers tiktoken protobuf sentencepiece tqdm google-generativeai tenacity

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.46.0-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3

In [2]:
# Standard library.
import json
import os
import random
import re
import time
from getpass import getpass

# Third-party libraries for step 1) (data loading / Hugging Face Hub).
import pandas as pd
import torch
from datasets import load_dataset
from huggingface_hub import hf_hub_download
from huggingface_hub import login
from tqdm.auto import tqdm

# Third-party libraries for step 2) (translation models and LLM-as-a-judge).
import google.generativeai as genai
from tenacity import retry, wait_random_exponential, stop_after_attempt
from transformers import MBartForConditionalGeneration, MBart50Tokenizer, AutoTokenizer, AutoModelForSeq2SeqLM, BitsAndBytesConfig, pipeline, AutoModelForCausalLM

In [3]:
# Credentials (Hugging Face and Gemini).
# They are read from the HF_TOKEN / GEMINI_API_KEY environment variables and,
# when a variable is not set, requested interactively without echoing.
# Never paste tokens into the notebook: any key that was previously committed
# in this cell must be considered leaked and revoked.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
gemini_api_key = os.environ.get("GEMINI_API_KEY") or getpass("Gemini API key: ")

In [4]:
# Mount Google Drive at /content/drive (the `google.colab` module is only
# importable inside a Colab runtime).
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


### 1: LOADING THE DATASET

#### 1.1: PUSH THE DATASET TO HUGGING-FACE

In [None]:
def upload_to_hf_dataset(
    hf_token: str,
    data_file_path: str,
    repo_name: str,
    file_format: str = "csv",
    split_name: str = "test",
):
    """
    Publish one local data file to the Hugging Face Hub as a dataset.

    Args:
        hf_token: Hugging Face access token, used both to log in and to push.
        data_file_path: Location of the local file to upload.
        repo_name: Destination dataset repo (e.g. "username/my-dataset").
        file_format: Loader name handed to `load_dataset` (e.g. "csv", "json").
            Default "csv".
        split_name: Split under which the file is registered
            (e.g. "train", "test"). Default "test".
    """
    # Open an authenticated Hub session first.
    login(token=hf_token)

    # Read the file with the chosen loader, mapped onto the requested split.
    local_dataset = load_dataset(file_format, data_files={split_name: data_file_path})

    # Upload the result and report where it can be found.
    local_dataset.push_to_hub(repo_name, token=hf_token)
    print(f"Dataset available at https://huggingface.co/datasets/{repo_name}")

In [None]:
# `hf_token` comes from the credentials cell at the top of the notebook; it is
# intentionally not repeated here so no secret is stored in this cell.
# The CSV location can be overridden with the AMT_DATASET_CSV environment
# variable; the default is the original author's local checkout.
local_path = os.environ.get(
    "AMT_DATASET_CSV",
    "/Users/alessioborgi/GitHub/AMT-AutomaticMachineTranslation/test_data/dataset_cleaned.csv",
)
repo_name  = "Alessio-Borgi/archaic-italian-cleaned-test"

upload_to_hf_dataset(
    hf_token=hf_token,
    data_file_path=local_path,
    repo_name=repo_name,
    file_format="csv",
    split_name="test",
)

#### 1.2: LOADING DATASET FROM HUGGING-FACE

In [5]:
# Fetch the published dataset from the Hub; `ds` is a DatasetDict holding a
# single "test" split (see the summary printed by the next cell).
ds = load_dataset("Alessio-Borgi/archaic-italian-cleaned-test")

README.md:   0%|          | 0.00/370 [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/11.8k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/97 [00:00<?, ? examples/s]

In [6]:
# Display the splits, column names and row counts.
ds

DatasetDict({
    test: Dataset({
        features: ['Author', 'Date', 'Region', 'Sentence'],
        num_rows: 97
    })
})

#### 1.3: EXPLORING THE TEST DATASET

In [None]:
def explore_dataset(dataset_name, split="test"):
    """
    Print a quick overview of one split of a Hugging Face dataset.

    The split is loaded with `load_dataset`, converted to a pandas DataFrame
    and summarised on stdout: number of rows, first 5 rows, statistics of the
    whitespace-token length of the "Sentence" column, and the column names
    (which include the helper "length_tokens" column added by this function).

    Args:
        dataset_name: Dataset identifier passed to `load_dataset`.
        split: Name of the split to inspect. Default "test".

    Returns:
        None. Everything is printed.
    """
    # Loading the dataset.
    ds = load_dataset(dataset_name)
    df = pd.DataFrame(ds[split])

    # 1) Number of examples.
    print("Number of examples:", len(df))

    # 2) Preview first 5 examples.
    print("First 5 examples:")
    print(df.head(5), "\n")

    # 3) Sentence-length statistics (tokens = whitespace-separated chunks).
    df["length_tokens"] = df["Sentence"].apply(lambda x: len(x.split()))
    print("Sentence length (tokens) stats:")
    print(df["length_tokens"].describe(), "\n")

    # 4) Column names (printed after step 3, so "length_tokens" is listed too).
    print("Column names:", df.columns.tolist(), "\n")

In [None]:
# Explore the test split of the published dataset (size, preview, sentence
# lengths and column names are printed below).
explore_dataset(dataset_name="Alessio-Borgi/archaic-italian-cleaned-test")

Number of examples: 97
First 5 examples:
                        Author     Date Region  \
0              Brunetto Latini  1260-61  fior.   
1                Bono Giamboni     1292  fior.   
2     Valerio Massimo (red. V1     1336  fior.   
3  Lucano volg. (ed. Marinoni)  1330/40  prat.   
4              Brunetto Latini  1260-61  fior.   

                                            Sentence  
0  quella guerra ben fatta l' opera perché etc. E...  
1  crudele, e di tutte le colpe pigli vendetta, c...  
2  Non d' altra forza d' animo fue ornato Ponzio ...  
3  Se questo piace a tutti e se 'l tempo hae biso...  
4  Officio di questa arte pare che sia dicere app...   

Sentence length (tokens) stats:
count    97.000000
mean     20.041237
std       5.996384
min       6.000000
25%      16.000000
50%      20.000000
75%      24.000000
max      31.000000
Name: length_tokens, dtype: float64 

Column names: ['Author', 'Date', 'Region', 'Sentence', 'length_tokens'] 



### 2: AMT - TRANSFORMER-BASED

#### 2.1: mBART (MULTILINGUAL BART)

**ARCHITECTURE & SIZE**
This Transformer-based solution consists of a 12-layer encoder and a 12-layer decoder (≈610 M parameters).

**DESCRIPTION**
- **Pretraining**: It has been pretrained via Denoising auto-encoding on monolingual corpora in 50 languages (mBART-50).
- **Multilingual MT**: It has been fine-tuned on many-to-many bitext and supports direct “it→it” by forcing Italian as both source & target.

**REFERENCE INFORMATION**
- Hugging-Face Reference page: https://huggingface.co/docs/transformers/model_doc/mbart
- Paper: https://arxiv.org/abs/2001.08210
- Specific Model employed: *facebook/mbart-large-50-many-to-many-mmt*


In [None]:
# 1) Loading mBART-50 Model & Tokenizer.
# Pick the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
model_name = "facebook/mbart-large-50-many-to-many-mmt"
mBART_tokenizer = MBart50Tokenizer.from_pretrained(model_name)
# The model is moved to `device`; `modernize_mbart` moves its inputs there too.
mBART_model = MBartForConditionalGeneration.from_pretrained(model_name).to(device)
# Source language is Italian; the target language is forced at generation time.
mBART_tokenizer.src_lang = "it_IT"
# Cap used by `truncation=True` when tokenizing the input sentences.
mBART_tokenizer.model_max_length = 512


# 2) Updated batched translation with device placement
def modernize_mbart(sentences, batch_size=8, **generate_kwargs):
    """
    Rewrite sentences with mBART-50 (Italian forced as the output language),
    batch by batch, showing a tqdm progress bar.

    Relies on the notebook globals `mBART_tokenizer`, `mBART_model` and
    `device` defined in the loading step.

    Args:
        sentences: Iterable of input strings.
        batch_size: Number of sentences per generation call (must be >= 1).
        **generate_kwargs: Extra keyword arguments forwarded to
            `mBART_model.generate`; they override the defaults used here
            (`forced_bos_token_id` for "it_IT" and `max_length=512`). Useful,
            for example, to pass `no_repeat_ngram_size` or `num_beams` when
            the output degenerates into repeated phrases.

    Returns:
        A list with one decoded string per input sentence, in input order.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # Materialise once so every slice below is a plain list of strings,
    # whatever sequence-like object the caller passed in.
    sentences = list(sentences)
    if not sentences:
        return []

    # Defaults reproduce the original call; caller-supplied options win.
    generation_options = {
        "forced_bos_token_id": mBART_tokenizer.lang_code_to_id["it_IT"],
        "max_length": 512,
    }
    generation_options.update(generate_kwargs)

    translations = []
    for i in tqdm(
        range(0, len(sentences), batch_size),
        desc="mBART Translation",
        unit="batch",
        leave=True
    ):
        batch = sentences[i : i + batch_size]

        # Tokenization (padded to the longest sentence of the batch).
        inputs = mBART_tokenizer(batch, return_tensors="pt", padding=True, truncation=True)
        inputs = { name: tensor.to(device) for name, tensor in inputs.items() }

        # Generation of the translations; no gradients are needed for inference.
        with torch.no_grad():
            gen = mBART_model.generate(**inputs, **generation_options)

        # Decode the generated ids and collect the translations.
        translations.extend(mBART_tokenizer.batch_decode(gen, skip_special_tokens=True))
    return translations

# 3) Run on the test split.
# `ds` is a DatasetDict on a fresh kernel, but step 4 rebinds it to the bare
# test split; accept both so this cell can be re-run without a KeyError.
test_split = ds["test"] if isinstance(ds, dict) else ds
if "mbart_translation" in test_split.column_names:
    # Drop the column left by a previous run; add_column refuses duplicates.
    test_split = test_split.remove_columns("mbart_translation")
arch_sentences = test_split["Sentence"]
mbart_outputs = modernize_mbart(arch_sentences)

# 4) Attach back to the dataset the translations.
# From here on `ds` is the test split itself (a Dataset, not a DatasetDict).
ds = test_split.add_column("mbart_translation", mbart_outputs)

# 5) Save the dataset with the mBART Translations.
df = ds.to_pandas()
output_path = "dataset_with_mbart_translations.csv"
df.to_csv(output_path, index=False)


Using device: cuda


tokenizer_config.json:   0%|          | 0.00/529 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

mBART Translation:   0%|          | 0/13 [00:00<?, ?batch/s]

In [None]:
# Inspect the mBART outputs (this displays the whole column).
ds["mbart_translation"]

["E poi, Aiaces, un po' di soldi, un po' di soldi, un po' di soldi, un po' di soldi, un po' di soldi, un po' di soldi, un po' di soldi, un po' di soldi, un po' di soldi, un po' di soldi, un po' di soldi, un po' di soldi, un po' di soldi, un po' di soldi, un po' di soldi, un po' di soldi, un po' di soldi, un po' di soldi, un po' di soldi, un po' di soldi, un po' di soldi, un po' di soldi, un po' di soldi, un po' di soldi, un po' di soldi, un po' di soldi, un po' di soldi.",
 'Crudele, e per tutte le colpe vendetta, come dice la legge, e per tutte le colpe vendetta, come dice la legge, e per tutte le colpe vendetta.',
 "Non c'è altra forza d' animosità che è stato venerato il Ponzio dell'Umiliare, un romano cavaliere.",
 'Se questo piace a tutti e se il tempo ha bisogno di Pompei per ridere e non per compagno, non riterrò più fati.',
 "L'offiziere di questo arte sembra essere solo per far credere, fine, per far credere.",
 "E' un' larghezza di vento, e' un' larghezza di nebbia, e' un' la

In [None]:
# 1) Sample up to 10 random indices (fewer when the dataset is smaller, so
#    random.sample never raises on a short split).
n_samples = min(10, len(ds))
indices = random.sample(range(len(ds)), n_samples)

# 2) Print the pairs
for idx in indices:
    row = ds[idx]  # fetch the row once and reuse it for both fields
    print(f"Archaic Sentence: {row['Sentence']}")
    print(f"mBART Translation: {row['mbart_translation']}\n")


Archaic Sentence: Io spero in messer Iesù di mandare tosto a voi Timoteo, acciocché io sia d'animo buono
mBART Translation: Io spero, in Messier Iesù, di mandare un tosto a Timoteo, perche' io abbia un buon umore.

Archaic Sentence: l'armi et insieme con loro passaseno tra li nimici, perçò se alcuno non avesse ardire de questo et sì avevano questo animo.
mBART Translation: l'armi e con loro passavano tra i nullai, quindi se non c'era un'armi e non c'era un'armi, loro passavano tra i nullai, quindi se non c'era un'armi e non c'era un'armi.

Archaic Sentence: Corbio nipote d' Ortensio menò sua vita più bassa e più viziosa
mBART Translation: Corbio, nephew of Ortensio, ha fatto la vita più bassa e più visiva.

Archaic Sentence: quello che sopra tutti gli altri perdonasse a' cittadini, e a cui più sicuramente possiate credere; poi ch'egli fu vostro comandatore.
mBART Translation: Quello che perdono per i cittadini, e che most di sicuro voi possiate credere; e poi lui fu il vostro capo.

Ar

#### 2.2: NLLB (No Language Left Behind)

**ARCHITECTURE & SIZE**
This Transformer-based solution comes from the Meta family. It is a many-to-many multilingual Seq2Seq model that can be used as a rewriting model for Italian→Italian.

**DESCRIPTION**
- **High Capacity/Quality**: The flagship nllb-200-3.3B has shown state-of-the-art BLEU/COMET on many low-resource ↔ high-resource pairs, and handles morphological/orthographic variation robustly.
- **Multilingual MT**: It supports 200 languages and has full support for ita_Latn (Italian in Latin script).

**REFERENCE INFORMATION**
- Hugging-Face Reference page: https://huggingface.co/docs/transformers/en/model_doc/nllb
- Paper: https://arxiv.org/abs/2207.04672
- Specific Model employed: *facebook/nllb-200-3.3B*

In [None]:
# Set up the 8-bit quantized NLLB pipeline for Italian→Italian.
# 1) Set up the device specifics.
# `device` stays a torch.device, as in the mBART section: rebinding it to an
# integer index (-1 on CPU) would break `tensor.to(device)` in modernize_mbart.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device.type)

# 2) 8-bit + offload config.
bnb = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_threshold=6.0,
    llm_int8_enable_fp32_cpu_offload=True
)

# 3) Load model in 8-bit. Placement is delegated to device_map="auto", so the
#    model must not be moved with .to(...) afterwards.
model_name = "facebook/nllb-200-3.3B"
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    quantization_config=bnb,
    device_map="auto"
)

# 4) Load tokenizer with src/tgt languages set (Italian on both sides).
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    src_lang="ita_Latn",
    tgt_lang="ita_Latn"
)

# 5) Build the translation pipeline.
translator = pipeline(
    "translation",
    model=model,
    tokenizer=tokenizer,
    src_lang="ita_Latn",
    tgt_lang="ita_Latn",
)

# 6) Taking the sentences to translate and translate in batches.
# `ds` is a DatasetDict on a fresh kernel and a plain Dataset once the mBART
# cell has run; handle both so this cell does not depend on execution order.
test_split = ds["test"] if isinstance(ds, dict) else ds
if "nllb_translation" in test_split.column_names:
    # Drop the column left by a previous run; add_column refuses duplicates.
    test_split = test_split.remove_columns("nllb_translation")
arch = list(test_split["Sentence"])
results = translator(arch, batch_size=8)

# 7) Extract the Italian text.
italian_translations = [r["translation_text"] for r in results]

# 8) Attach & save to csv file.
# From here on `ds` is the test split itself (a Dataset, not a DatasetDict).
ds = test_split.add_column("nllb_translation", italian_translations)
df = ds.to_pandas()
df.to_csv("dataset_with_mBART_NLLB_translations.csv", index=False)


Using device: cuda


config.json:   0%|          | 0.00/808 [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/90.0k [00:00<?, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


(…)cf3e5af37956607f4c667d891ec069aa276be0be:   0%|          | 0.00/8.55G [00:00<?, ?B/s]

(…)18c1b46dc95be6e106c36df87d13175418b3972c:   0%|          | 0.00/6.93G [00:00<?, ?B/s]

(…)4436d65cf94380c5ddd8f524cb878e090b27bb50:   0%|          | 0.00/2.10G [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/94.1k [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


(…)6cea38b9e3d5efcdcb9c251d6b40538e1aab555a:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/3.55k [00:00<?, ?B/s]

Device set to use cuda:0


In [None]:
# Inspect the NLLB outputs (this displays the whole column).
ds["nllb_translation"]

["E d'altra parte Aiaces era un cavaliere franco e prodigioso alle armi, di grande guisa, ma non era pieno di grande senno",
 'crudele, e per ogni colpa prendi vendetta, come dice la legge, e a nessun cavaliere perdona i peccati.',
 "Non per altra forza d'animo fu decorato Ponzio Aufidiano, cavaliere romano.",
 'Se a tutti piace e se il tempo ha bisogno di Pompei come cavaliere e non come compagno, non ritengo più i destini.',
 "L'obiettivo di questa arte sembra essere quello di dire in modo insidioso per far credere, il fine è far credere per dirlo.",
 'Ecco, i venti venti larghi scaricano nubi risolute, e potresti credere che il cielo intero cadesse nel mare.',
 'Ma chi spera che io possa avere questa speranza, questi che non credono ancora in Cristo, vedono già con noi, e non potendo negarlo, grideranno i denti.',
 'La vendita dei morti e la presa dei vivi fecero la frode di un re feroce.',
 'Perché lui, che ora per le sue grandi regalità è feroce e onorevole, lui di ogni male affli

In [None]:
# 1) Sample up to 10 random indices (fewer when the dataset is smaller, so
#    random.sample never raises on a short split).
n_samples = min(10, len(ds))
indices = random.sample(range(len(ds)), n_samples)

# 2) Print the pairs
for idx in indices:
    row = ds[idx]  # fetch the row once and reuse it for both fields
    print(f"Archaic Sentence: {row['Sentence']}")
    print(f"NLLB Translation: {row['nllb_translation']}\n")


Archaic Sentence: Altressì uno amante chiamando merzé alla sua donna dice parole e ragioni molte, et ella si difende in suo dire.
NLLB Translation: Altro amante chiama Merzé alla sua donna dice parole e ragioni molte, ed ella si difende nel suo dire.

Archaic Sentence: Gorgone, e ho questa proprietà che io volo per l'aire sì come uno ucello".
NLLB Translation: Gorgone, e ho questa proprietà che volo per aria sì come un uccello".

Archaic Sentence: Non voglio, che insuperbischi per lo santo proposito, e voto della verginità vedendo le sue laudi
NLLB Translation: Non voglio che insuperbischi per il santo proposito, e voto della verginità vedendo le sue lodi

Archaic Sentence: Quando li serpenti invellenava di giorno alcuno Romano, allora iera la maraviglia a vedere come li Psille si combattevano al veleno, ché elli imolavano tutto inazzi della loro salive
NLLB Translation: Quando i serpenti venivano avvelenati di giorno da un romano, allora era una meraviglia vedere come i Psille si comb

### 3: AMT - LLM-BASED

#### 3.1: LLAMA-2-7b-chat-hf

**Hugging-Face Reference Page:** https://huggingface.co/meta-llama/Llama-2-7b-chat-hf

**#Params:** 7B

**GPU-RAM:** 12.9GB

**TOTAL TIME-TO-RUN:** 276.91 seconds

**AVG-per-SENTENCE TIME:** 2.85 seconds

In [None]:
# `device` stays a torch.device, as in the mBART section: rebinding it to an
# integer index (-1 on CPU) would break `tensor.to(device)` in modernize_mbart.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device.type)

# 1) Load the Llama-2-7B-chat model & tokenizer.
# The repository is gated, so the Hugging Face token is passed through the
# `token` argument to both loaders. Device placement and dtype only apply to
# the model, not to the tokenizer. No remote code is needed for this
# architecture, so `trust_remote_code` is not enabled.
llama_checkpoint = "meta-llama/Llama-2-7b-chat-hf"
llama_tokenizer = AutoTokenizer.from_pretrained(llama_checkpoint, token=hf_token)
llama_model = AutoModelForCausalLM.from_pretrained(
    llama_checkpoint,
    device_map="auto",
    torch_dtype="auto",
    token=hf_token,
)

# 2) Taking the sentences to translate.
# `ds` is a DatasetDict on a fresh kernel and a plain Dataset once the
# mBART/NLLB cells have run; handle both so this cell works in either order.
test_split = ds["test"] if isinstance(ds, dict) else ds
sentences = list(test_split["Sentence"])

Using device: cuda


tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

##### 3.1.1: ZERO-SHOT TRANSLATION

In [None]:
# 3) Set pad token for batching.
# The Llama tokenizer ships without a pad token; reuse EOS and pad on the left
# so that, in a batch, every prompt ends right where generation starts.
if llama_tokenizer.pad_token is None:
    llama_tokenizer.pad_token = llama_tokenizer.eos_token
llama_tokenizer.padding_side = "left"

# 4) Build translation pipeline.
# return_full_text=False makes the pipeline return only the completion, so the
# prompt does not have to be stripped from the output by string replacement.
llama_translator = pipeline(
    "text-generation",
    model=llama_model,
    tokenizer=llama_tokenizer,
    max_new_tokens=256,
    do_sample=False,
    return_full_text=False,
)

# 5) Starting the batched translation.
batch_size = 8
n = len(sentences)
llama_outputs = []

total_start = time.time()
for start in tqdm(range(0, n, batch_size), desc="Translating with Llama"):
    # Format prompts in each batch
    batch_sentences = sentences[start:start+batch_size]
    batch_prompts = [f"Traduci la seguente frase dall'italiano arcaico all'italiano moderno. Solo la traduzione, senza spiegazioni:\n{s}\nRisposta:"
    for s in batch_sentences
]

    # batch_size must be passed explicitly: without it the pipeline processes
    # the prompts one at a time. Lower it if the GPU runs out of memory.
    batch_results = llama_translator(batch_prompts, batch_size=batch_size)
    for r in batch_results:
        llama_outputs.append(r[0]["generated_text"].strip())
total_end = time.time()
print(f"\nTotal time: {total_end - total_start:.2f} seconds")
# max(n, 1) avoids a ZeroDivisionError when there is nothing to translate.
print(f"Average per sentence: {(total_end - total_start)/max(n, 1):.2f} seconds")

# 6) Save translations.
# The inputs are taken from `sentences` (what was actually translated) rather
# than re-read from `ds`, which other cells may have rebound.
output_path = "BorgiNonModernToModern-hw2_transl-llama.jsonl"
sentences_out = sentences
translations_out = llama_outputs

with open(output_path, "w", encoding="utf-8") as f:
    for s, t in zip(sentences_out, translations_out):
        entry = {
            "archaic_sentence": s,
            "llama_translation": t
        }
        json.dump(entry, f, ensure_ascii=False)
        f.write("\n")


Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Translating with Llama:   0%|          | 0/13 [00:00<?, ?it/s]

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Total time: 257.13 seconds
Average per sentence: 2.65 seconds


In [None]:
# Read the saved translations back from disk. A dedicated name is used for the
# path so the `llama_outputs` list produced above is not overwritten by a string.
llama_jsonl_path = "BorgiNonModernToModern-hw2_transl-llama.jsonl"
with open(llama_jsonl_path, encoding="utf-8") as f:
    data = [json.loads(line) for line in f if line.strip()]

# Show up to 10 random (archaic, translation) pairs.
n_samples = min(10, len(data))
indices = random.sample(range(len(data)), n_samples)

for idx in indices:
    print(f"Archaic Sentence: {data[idx]['archaic_sentence']}")
    print(f"Llama Translation: {data[idx]['llama_translation']}\n")

Archaic Sentence: E dunque, da che queste cose son così, Catellina, e tu non puoi buonamente qui dimorare, dubiti tu d'andartene in alcuna terra ed usare questa vita fuggendo per li diserti
Llama Translation: E dunque, da che queste cose sono così, Catellina, e tu non puoi buonamente qui dimorare, dubiti tu di andartene in alcuna terra e usare questa vita fuggendo per i deserti.

Archaic Sentence: Unde ragionevolemente Iob è interpretato dolente;
Llama Translation: Perché Ioppe è interpretato dolente.

La frase è tratta da un testo del 1400-1500 secolo e utilizza termini e structures arcaiche.

Archaic Sentence: Però che or chi spererebbe quello che eziandio questi che non vogliono ancora credere in Cristo, già veggiono con noi, e perché nol possono negare, stridono colli denti.
Llama Translation: Ma che or chi spererebbe che quelli che non vogliono ancora credere in Cristo, già veggiono con noi, e perché non possono negare, stridono colli denti.

Archaic Sentence: Se questo piace a tu

##### 3.1.2: FEW-SHOT TRANSLATION

##### 3.1.3: CHAIN-OF-THOUGHT TRANSLATION

#### 3.2: GEMMA 2B-Instruct

**Hugging-Face Reference Page:** https://huggingface.co/google/gemma-2b-it

**#Params:** 2B

**GPU-RAM:** 5.8GB

**TOTAL TIME-TO-RUN:** 105.27 seconds

**AVG-per-SENTENCE TIME:** 1.09 seconds

In [None]:
# 1) Load model & tokenizer.
# The Gemma repository is gated, so the Hugging Face token is passed to both
# loaders. No remote code is needed for this architecture, so
# `trust_remote_code` is not enabled.
gemma_checkpoint = "google/gemma-2b-it"
gemma_tokenizer = AutoTokenizer.from_pretrained(gemma_checkpoint, token=hf_token)
gemma_model     = AutoModelForCausalLM.from_pretrained(
    gemma_checkpoint,
    device_map="auto",
    torch_dtype="auto",
    token=hf_token,
)

# 2) Taking the sentences to translate.
# `ds` is a DatasetDict on a fresh kernel and a plain Dataset once the
# mBART/NLLB cells have run; handle both so this cell works in either order.
test_split = ds["test"] if isinstance(ds, dict) else ds
sentences = list(test_split["Sentence"])

tokenizer_config.json:   0%|          | 0.00/34.2k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

##### 3.2.1: ZERO-SHOT TRANSLATION

In [None]:
# 3) Set pad token for batching (left padding keeps every prompt adjacent to
#    the position where generation starts).
if gemma_tokenizer.pad_token is None:
    gemma_tokenizer.pad_token = gemma_tokenizer.eos_token
gemma_tokenizer.padding_side = "left"

# 4) Use "text-generation" pipeline.
# The model is already loaded and placed, so no device_map/trust_remote_code is
# passed here. return_full_text=False returns only the completion, so the
# prompt does not have to be stripped from the output by string replacement.
gemma_translator = pipeline(
    "text-generation",
    model=gemma_model,
    tokenizer=gemma_tokenizer,
    max_new_tokens=256,
    do_sample=False,
    return_full_text=False,
)
# Backward-compatible alias for the name this pipeline had before.
falcon_translator = gemma_translator

# 5) Prepare translation prompts.
prompts = [f"Traduci la seguente frase dall'italiano arcaico all'italiano moderno. Solo la traduzione, senza spiegazioni:\n{s}\nRisposta:" for s in sentences]

# 6) Batched generation.
batch_size = 8
n = len(sentences)
total_start = time.time()
gemma_outputs = []
for start in tqdm(range(0, len(prompts), batch_size), desc="Translating with Gemma"):
    batch_prompts = prompts[start:start+batch_size]
    # batch_size must be passed explicitly: without it the pipeline processes
    # the prompts one at a time. Lower it if the GPU runs out of memory.
    batch_results = gemma_translator(batch_prompts, batch_size=batch_size)
    for r in batch_results:
        gemma_outputs.append(r[0]["generated_text"].strip())

# 7) Report the elapsed time.
total_end = time.time()
print(f"\nTotal time: {total_end - total_start:.2f} seconds")
# max(n, 1) avoids a ZeroDivisionError when there is nothing to translate.
print(f"Average per sentence: {(total_end - total_start)/max(n, 1):.2f} seconds")

# 8) Save one JSON record per line.
jsonl_path = "BorgiNonModernToModern-hw2_transl-gemma.jsonl"
with open(jsonl_path, "w", encoding="utf-8") as f:
    for archaic, modern in zip(sentences, gemma_outputs):
        record = {
            "archaic_sentence": archaic,
            "gemma_translation": modern
        }
        f.write(json.dumps(record, ensure_ascii=False) + "\n")

Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Translating with Gemma:   0%|          | 0/13 [00:00<?, ?it/s]

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignore


Total time: 105.12 seconds
Average per sentence: 1.08 seconds


In [None]:
# Read the saved translations back from disk. A dedicated name is used for the
# path so the `gemma_outputs` list produced above is not overwritten by a string.
gemma_jsonl_path = "BorgiNonModernToModern-hw2_transl-gemma.jsonl"
with open(gemma_jsonl_path, encoding="utf-8") as f:
    data = [json.loads(line) for line in f if line.strip()]

# Show up to 10 random (archaic, translation) pairs.
n_samples = min(10, len(data))
indices = random.sample(range(len(data)), n_samples)

for idx in indices:
    print(f"Archaic Sentence: {data[idx]['archaic_sentence']}")
    print(f"Gemma Translation: {data[idx]['gemma_translation']}\n")

Archaic Sentence: crudele, e di tutte le colpe pigli vendetta, come dice la legge, ed a neuno cavaliere perdoni che pecchi.
Gemma Translation: La frase originale è un parlamento che descrive un comportamento doloso e senza speranza.

Archaic Sentence: Sicchè dolore è a udire, quando l' usare l' arme e la fatica ricusano, con grandissimo disonore come pecore essere
Gemma Translation: Se il dolore è a udire, quando si usa l'arma e si ricusano, con grandissimo disonore come pecore essere.

Archaic Sentence: quando nella matricola si scrivono giurano per Dio, e per Cristo, e per lo Spirito Santo
Gemma Translation: quando nella matricola si scrivono giurano per Dio, Cristo e Spirito Santo.

Archaic Sentence: Quando averai nel cavaliere i detti segni veduti non andare a grandezza caendo, perchè nelle battaglie sono più utili i forti che i grandi.
Gemma Translation: Quando avrai nel cavaliere i dettagli visibili non andranno a grandezza caendo, perché nelle battaglie sono più utili i difensor

##### 3.2.2: FEW-SHOT TRANSLATION

##### 3.2.3: CHAIN-OF-THOUGHT TRANSLATION

### 4: LLM-AS-A-JUDGE EVALUATION

#### 4.1: GEMINI-2.0-FLASH - GENERAL EVALUATION

Here we use the Gemini-2.0-Flash model as a judge in a "general" evaluation setting: each translation receives a single overall score from 1 to 5.

In [None]:
# Information to-set:
# JSONL file with the translations to be judged (one record per line).
input_jsonl = "/content/drive/MyDrive/BorgiNonModernToModern/llm_based/zero-shot_prompting/BorgiNonModernToModern-hw2_transl-gemma.jsonl"
# Translator whose outputs are judged; it selects the "<llm_type>_translation" field.
llm_type = "gemma"
# NOTE(review): this rebinds `model_name`, which earlier cells use for model
# checkpoints; how the value is used afterwards is not visible here — confirm.
model_name = "gemini"
# Evaluation setting; it becomes part of the score column name.
judge_type = "general"

# This mustn't be changed: it must match the key written by the translation cells.
translation_col = f"{llm_type}_translation"

In [None]:
# Define the function to run the Judge.
def _extract_judge_score(reply):
    """Return the 1-5 score contained in the judge's reply, or None if absent."""
    # Remove an echoed rubric range ("1-5") and score denominators ("/5",
    # "out of 5") so their digits are not mistaken for the score itself.
    cleaned = re.sub(r"\b1\s*[-–]\s*5\b", " ", reply)
    cleaned = re.sub(r"(?:/|\bout of)\s*5\b", " ", cleaned)
    match = re.search(r"\b([1-5])\b", cleaned)
    return int(match.group(1)) if match else None


@retry(wait=wait_random_exponential(min=10, max=60), stop=stop_after_attempt(5))
def llm_judge_general(sentence, translation):
    """
    Ask the judge model for a single 1-5 quality score of one translation.

    Uses the notebook global `LLM_as_a_Judge_model`, which must be created
    before this function is called. Any exception raised during the request is
    retried with random exponential back-off (10-60 s), up to 5 attempts.

    Args:
        sentence: The original Archaic Italian sentence.
        translation: The candidate Modern Italian translation.

    Returns:
        The score as an int between 1 and 5, or None when no score could be
        found in the model's reply.
    """
    prompt = f"""
    You are an expert evaluator of machine translations from Archaic Italian to Modern Italian.

    For each translation, assign a score from 1 (worst) to 5 (best), using this rubric:

    1: Completely unacceptable translation. The translation has no pertinence with the original meaning; the generated sentence is either gibberish or makes no sense.
    2: Severe semantic errors, omissions or substantial additions on the original sentence. The errors are semantic and syntactic in nature. It’s still something no human would ever write.
    3: Partially wrong translation. The translation is lackluster; it contains errors, but mostly minor errors, like typos, or small semantic errors.
    4: Good translation. The translation is mostly right, substantially faithful to the original text, but the style does not perfectly match the original sentence; still fluent and comprehensible, and could be semantically acceptable.
    5: Perfect translation. The translation is accurate, fluent, complete and coherent. It retained the original meaning as much as it could.

    Evaluate ONLY the translation quality according to these guidelines.

    Original (Archaic Italian): {sentence}

    Translation (Modern Italian): {translation}

    Your score (1-5):
    """
    # Temperature 0 keeps the judge as deterministic as the API allows.
    response = LLM_as_a_Judge_model.generate_content(prompt, generation_config={"temperature": 0.0})
    return _extract_judge_score(response.text.strip())

In [None]:
# 1) Configure Gemini API and model.
# LLM_as_a_Judge_model is the module-level name that llm_judge_general reads at call time.
genai.configure(api_key=gemini_api_key)
LLM_as_a_Judge_model = genai.GenerativeModel("gemini-2.0-flash")

# 2) Load DataFrame from JSONL (one JSON record per line).
ds = pd.read_json(input_jsonl, lines=True)

# 3) Score all translations, pausing every 15 requests (to avoid API rate limits).
tqdm.pandas()
judge_col = f"{translation_col}_{judge_type}_judge_score"

scores = []
# `i` is the DataFrame index label yielded by iterrows(); the pause logic below
# treats it as a 0-based row counter.
for i, row in tqdm(ds.iterrows(), total=len(ds)):
    score = llm_judge_general(row["archaic_sentence"], row[translation_col])
    scores.append(score)
    if (i + 1) % 15 == 0:
        print("🕒 Sleeping for 60 seconds to avoid API rate limit...")
        time.sleep(60)

# Scores are attached only after the loop has finished for every row.
ds[judge_col] = scores

# Save as JSONL: one record per line, non-ASCII characters written as-is.
output_cols = ["archaic_sentence", translation_col, judge_col]
jsonl_filename = f"BorgiNonModernToModern-hw2_transl-judge_{model_name}-{judge_type}_{llm_type}-model.jsonl"
with open(jsonl_filename, "w", encoding="utf8") as fout:
    for record in ds[output_cols].to_dict(orient="records"):
        fout.write(json.dumps(record, ensure_ascii=False) + "\n")

  0%|          | 0/97 [00:00<?, ?it/s]

🕒 Sleeping for 60 seconds to avoid API rate limit...
🕒 Sleeping for 60 seconds to avoid API rate limit...
🕒 Sleeping for 60 seconds to avoid API rate limit...
🕒 Sleeping for 60 seconds to avoid API rate limit...
🕒 Sleeping for 60 seconds to avoid API rate limit...
🕒 Sleeping for 60 seconds to avoid API rate limit...


#### 4.2: GEMINI-2.0-FLASH - MULTI-CRITERIA EVALUATION

Here we use the Gemini-2.0-Flash model in a "Multi-Criteria" evaluation setting, which yields four scores per sentence: Adequacy, Fluency, Style, and Completeness.

In [None]:
# Settings to adjust for each run:
# - input_jsonl: JSONL file holding the translations to be judged.
# - llm_type:    translator whose output is judged; selects the "<llm_type>_translation" column.
# - model_name:  judge label, used in the output file name.
# - judge_type:  evaluation flavour, used in the score column name and the output file name.
input_jsonl = "/content/drive/MyDrive/BorgiNonModernToModern/llm_based/zero-shot_prompting/BorgiNonModernToModern-hw2_transl-llama.jsonl"
llm_type = "llama"
model_name = "gemini"
judge_type = "MultiCriteria"

# Derived from llm_type; do not edit by hand.
translation_col = f"{llm_type}_translation"

In [None]:
@retry(wait=wait_random_exponential(min=10, max=60), stop=stop_after_attempt(5))
def llm_judge_multicrit(sentence, translation):
    """Ask the judge model for four 1-5 scores (adequacy, fluency, style, completeness).

    Args:
        sentence: The original Archaic Italian sentence.
        translation: The candidate Modern Italian translation.

    Returns:
        A dict with keys ``AdequacyScore``, ``FluencyScore``, ``StyleScore`` and
        ``CompletenessScore``; each value is an int in 1-5, or None when the
        corresponding "<Criterion>: <digit>" pattern is absent from the reply.

    Note:
        Relies on the module-level ``LLM_as_a_Judge_model`` being set before
        the first call.
    """
    prompt = f"""
You are an expert evaluator of machine translations from Archaic Italian to Modern Italian.
For each translation, assign a score from 1 (worst) to 5 (best) on the following four criteria. Here is the meaning of each score for each criterion:

Adequacy:
1 - The translation does not capture the original meaning at all.
2 - The translation is mostly wrong; the main meaning is lost, but there are rare fragments of meaning.
3 - Some meaning is preserved, but important information is lost or altered.
4 - Most meaning is present, with only minor issues; very little is lost.
5 - All essential meaning from the original is preserved.

Fluency:
1 - The translation is unreadable or ungrammatical; clearly machine-generated.
2 - The translation has severe grammar errors, unnatural phrasing, or frequent awkwardness.
3 - Some awkwardness or minor grammar issues, but still understandable.
4 - Mostly fluent and grammatical, only rare awkward or unnatural expressions.
5 - Perfectly fluent, fully natural Italian.

Style:
1 - The tone/register is completely lost or inappropriate.
2 - The style is mostly lost; it is awkward or inappropriate for the context.
3 - The style is partially preserved but inconsistent or awkward.
4 - The style is almost fully preserved, with only minor slips.
5 - The style, tone, and register are perfectly matched to the original.

Completeness:
1 - Major parts are omitted or unnecessary parts are added.
2 - The translation is incomplete; many elements are missing or excessive additions present.
3 - Minor omissions/additions, but most information is present.
4 - Almost everything is present, with only trivial information missing or added.
5 - Complete; nothing important is lost or added.

Output ONLY the four scores as numbers 1-5, in exactly this format (no extra text):

Adequacy: <score>
Fluency: <score>
Style: <score>
Completeness: <score>

Original (Archaic Italian): {sentence}

Translation (Modern Italian): {translation}
    """
    reply_text = LLM_as_a_Judge_model.generate_content(
        prompt, generation_config={"temperature": 0.0}
    ).text.strip()

    # One lookup per criterion: first "<Criterion>: <digit 1-5>" occurrence in the reply.
    parsed = {}
    for criterion in ("Adequacy", "Fluency", "Style", "Completeness"):
        found = re.search(rf"{criterion}:\s*([1-5])", reply_text)
        parsed[f"{criterion}Score"] = int(found.group(1)) if found else None
    return parsed

In [None]:
# 1) Configure Gemini API and model.
# LLM_as_a_Judge_model is the module-level name that llm_judge_multicrit reads at call time.
genai.configure(api_key=gemini_api_key)
LLM_as_a_Judge_model = genai.GenerativeModel("gemini-2.0-flash")

# 2) Load DataFrame from JSONL (one JSON record per line).
ds = pd.read_json(input_jsonl, lines=True)

# 3) Score all translations, pausing every 15 requests (to avoid API rate limits).
tqdm.pandas()
judge_col = f"{translation_col}_{judge_type}_judge_scores"   # "_scores" (plural): each cell holds a dict of four scores

scores = []
# `i` is the DataFrame index label yielded by iterrows(); the pause logic below
# treats it as a 0-based row counter.
for i, row in tqdm(ds.iterrows(), total=len(ds)):
    # This returns a dictionary with four scores
    score_dict = llm_judge_multicrit(row["archaic_sentence"], row[translation_col])
    scores.append(score_dict)
    if (i + 1) % 15 == 0:
        print("🕒 Sleeping for 60 seconds to avoid API rate limit...")
        time.sleep(60)

# Scores are attached only after the loop has finished for every row.
ds[judge_col] = scores

# Save as JSONL: one record per line, non-ASCII characters written as-is.
output_cols = ["archaic_sentence", translation_col, judge_col]
jsonl_filename = f"BorgiNonModernToModern-hw2_transl-judge_{model_name}-{judge_type}_{llm_type}-model.jsonl"
with open(jsonl_filename, "w", encoding="utf8") as fout:
    for record in ds[output_cols].to_dict(orient="records"):
        fout.write(json.dumps(record, ensure_ascii=False) + "\n")

  0%|          | 0/97 [00:00<?, ?it/s]

🕒 Sleeping for 60 seconds to avoid API rate limit...
🕒 Sleeping for 60 seconds to avoid API rate limit...
🕒 Sleeping for 60 seconds to avoid API rate limit...
🕒 Sleeping for 60 seconds to avoid API rate limit...
🕒 Sleeping for 60 seconds to avoid API rate limit...
🕒 Sleeping for 60 seconds to avoid API rate limit...


#### 4.3: QWEN-OPENELM-GEMINI DEBATE-and-CONSENSUS: REFERENCE-FREE SELF-IMPROVING LLM EVALUATION for MACHINE TRANSLATION

This evaluation framework introduces a novel, fully reference-free pipeline for scoring machine translations by leveraging the **debate and consensus** paradigm with large language models (LLMs):

**MULTI-CRITERIA DUAL-JUDGE SCORING**

Each machine translation is independently evaluated by two strong local LLMs, **Qwen** and **OpenELM**, according to four established dimensions: adequacy, fluency, style, and completeness. Each judge provides a score from 1 (worst) to 5 (best) for each criterion.

**DEBATE & CONSENSUS RESOLUTION**

The scores from both judges, along with the original sentence and translation, are then submitted to a third model, **Gemini** (via API). Gemini receives a specialized prompt to "debate" the merits of each judge's scores and determine if any criteria should be adjusted. This process simulates expert panel discussion, encouraging self-correction and consensus formation **without the need for gold-standard reference translations**.

**ROBUST REFERENCE-FREE EVALUATION**

By combining independent perspectives from different models and refining them through debate, this pipeline increases evaluation robustness and mitigates single-model bias. All debate prompts and consensus decisions are logged, providing transparency and a rich resource for future analysis.

This approach enables self-improving, reference-free machine translation evaluation, supporting large-scale, reliable assessment of translation quality in scenarios where high-quality human references are unavailable.


##### **FURTHER DETAILS**:
**QWEN3-1.7B - Hugging-Face Reference Page:** https://huggingface.co/Qwen/Qwen3-1.7B

**OPENELM-3B-Instruct - Hugging-Face Reference Page:** https://huggingface.co/apple/OpenELM-3B-Instruct

**GPU-RAM:** 9.7GB

In [None]:
# Settings to adjust for each run:
# - input_jsonl: JSONL file holding the translations to be judged.
# - llm_type:    translator whose output is judged; selects the "<llm_type>_translation" column.
# - model_name / judge_type: labels used in the output file name.
input_jsonl = "/content/drive/MyDrive/BorgiNonModernToModern/llm_based/zero-shot_prompting/BorgiNonModernToModern-hw2_transl-llama.jsonl"
llm_type = "llama"
model_name = "gemini"
judge_type = "MultiCriteria-Qwen-OpenElm-Debate&Consensus"

# Derived from llm_type; do not edit by hand.
translation_col = f"{llm_type}_translation"

In [None]:
def parse_multicrit_scores(text):
    '''Extract the four criterion scores from a judge's raw text output.

    For each of Adequacy, Fluency, Style and Completeness, the first occurrence
    of "<Criterion>:" followed by optional whitespace (newlines included) and a
    digit 1-5 is taken. A criterion with no such occurrence maps to None.

    Returns a dict keyed by "AdequacyScore", "FluencyScore", "StyleScore" and
    "CompletenessScore".
    '''
    def first_score(label):
        # Single search per label; the captured group is the score digit.
        found = re.search(rf"{label}:\s*([1-5])", text)
        return int(found.group(1)) if found else None

    criteria = ("Adequacy", "Fluency", "Style", "Completeness")
    return {f"{label}Score": first_score(label) for label in criteria}

@retry(wait=wait_random_exponential(min=10, max=60), stop=stop_after_attempt(5))
def qwen_multicrit_judge(sentence, translation):
    '''Use the Qwen model to judge the translation quality on multiple criteria.

    Args:
        sentence: The original Archaic Italian sentence.
        translation: The candidate Modern Italian translation.

    Returns:
        The dict produced by parse_multicrit_scores for the model's completion
        (keys AdequacyScore, FluencyScore, StyleScore, CompletenessScore).

    Relies on the module-level MULTICRIT_PROMPT and qwen_pipe being defined
    before the first call.
    '''

    prompt = MULTICRIT_PROMPT.format(sentence=sentence, translation=translation)
    # Ask for the completion only. By default the text-generation pipeline returns
    # prompt + completion, and the rubric inside the prompt ("Adequacy:\n1 - ...")
    # would be matched by the score regex before the model's actual answer.
    output = qwen_pipe(prompt, return_full_text=False)[0]["generated_text"]
    return parse_multicrit_scores(output)

@retry(wait=wait_random_exponential(min=10, max=60), stop=stop_after_attempt(5))
def openelm_multicrit_judge(sentence, translation):
    '''Use the OpenELM model to judge the translation quality on multiple criteria.

    Args:
        sentence: The original Archaic Italian sentence.
        translation: The candidate Modern Italian translation.

    Returns:
        The dict produced by parse_multicrit_scores for the model's completion
        (keys AdequacyScore, FluencyScore, StyleScore, CompletenessScore).

    Relies on the module-level MULTICRIT_PROMPT and openelm_pipe being defined
    before the first call.
    '''

    prompt = MULTICRIT_PROMPT.format(sentence=sentence, translation=translation)
    # Ask for the completion only. By default the text-generation pipeline returns
    # prompt + completion, and the rubric inside the prompt ("Adequacy:\n1 - ...")
    # would be matched by the score regex before the model's actual answer.
    output = openelm_pipe(prompt, return_full_text=False)[0]["generated_text"]
    return parse_multicrit_scores(output)

@retry(wait=wait_random_exponential(min=10, max=60), stop=stop_after_attempt(5))
def gemini_debate_consensus(sentence, translation, score1, score2):
    '''Have the Gemini model arbitrate between two judges' multi-criteria scores.

    Args:
        sentence: The original Archaic Italian sentence.
        translation: The candidate Modern Italian translation.
        score1: Expert 1 scores, a dict with keys AdequacyScore, FluencyScore,
            StyleScore and CompletenessScore.
        score2: Expert 2 scores, same keys as score1.

    Returns:
        A 3-tuple (consensus_scores, debate_prompt, reply_text): the scores
        parsed from the reply, the exact prompt sent, and the trimmed raw reply.

    Relies on the module-level gemini_model being set before the first call.
    '''

    # Both experts' scores are placed ahead of the texts they refer to.
    debate_prompt = f"""
        Two experts have independently scored the translation below. Expert 1 gave these scores:
        Adequacy: {score1['AdequacyScore']}
        Fluency: {score1['FluencyScore']}
        Style: {score1['StyleScore']}
        Completeness: {score1['CompletenessScore']}

        Expert 2 gave these scores:
        Adequacy: {score2['AdequacyScore']}
        Fluency: {score2['FluencyScore']}
        Style: {score2['StyleScore']}
        Completeness: {score2['CompletenessScore']}

        Carefully review the original sentence and the translation, then debate which scores are more correct and why. If you believe any score should be changed, state the new score. Output ONLY the revised scores in the same format.

        Original (Archaic Italian): {sentence}
        Translation (Modern Italian): {translation}
    """
    # Query the arbiter with temperature 0.0 and keep the trimmed reply for logging.
    reply_text = gemini_model.generate_content(
        debate_prompt, generation_config={"temperature": 0.0}
    ).text.strip()
    consensus_scores = parse_multicrit_scores(reply_text)
    return consensus_scores, debate_prompt, reply_text

In [None]:
# Gemini Setup (Master).
# gemini_model is the module-level name that gemini_debate_consensus reads at call time.
genai.configure(api_key=gemini_api_key)
gemini_model_name = "gemini-2.0-flash"
gemini_model = genai.GenerativeModel(gemini_model_name)

# Qwen Judge Model.
# Loaded in float16 when CUDA is available, float32 otherwise.
qwen_checkpoint = "Qwen/Qwen3-1.7B"
qwen_tokenizer = AutoTokenizer.from_pretrained(qwen_checkpoint, trust_remote_code=True)
qwen_model = AutoModelForCausalLM.from_pretrained(
    qwen_checkpoint,
    device_map="auto",
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    trust_remote_code=True
)
# Greedy decoding (do_sample=False), at most 512 newly generated tokens per call.
qwen_pipe = pipeline(
    "text-generation",
    model=qwen_model,
    tokenizer=qwen_tokenizer,
    max_new_tokens=512,
    do_sample=False,
)

# OpenELM Judge Model.
openelm_checkpoint = "apple/OpenELM-3B-Instruct"
# Try the checkpoint's own tokenizer first; on any failure, report the error and
# load the tokenizer from the Llama-2 repository instead.
try:
    openelm_tokenizer = AutoTokenizer.from_pretrained(
        openelm_checkpoint, trust_remote_code=True
    )
except Exception as e:
    print("AutoTokenizer failed! Falling back to Llama2 tokenizer workaround:", e)
    llama2_tokenizer_id = "meta-llama/Llama-2-7b-hf"
    openelm_tokenizer = AutoTokenizer.from_pretrained(llama2_tokenizer_id)

# Loaded in float16 when CUDA is available, float32 otherwise.
openelm_model = AutoModelForCausalLM.from_pretrained(
    openelm_checkpoint,
    device_map="auto",
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    trust_remote_code=True
)
# Greedy decoding (do_sample=False), at most 512 newly generated tokens per call.
openelm_pipe = pipeline(
    "text-generation",
    model=openelm_model,
    tokenizer=openelm_tokenizer,
    max_new_tokens=512,
    do_sample=False,
)

# 1) Load DataFrame from JSONL translation file (one JSON record per line).
df = pd.read_json(input_jsonl, lines=True)

# 2) Multi-Criteria Scoring Prompt.
# str.format template with two placeholders, {sentence} and {translation}; it is
# filled in by qwen_multicrit_judge and openelm_multicrit_judge.
# NOTE(review): each rubric heading ("Adequacy:" etc.) is followed by a line starting
# with "1", which the "<Criterion>:\s*([1-5])" score regex also matches — make sure
# only the model's completion, not this prompt text, is handed to the parser.
MULTICRIT_PROMPT = """
You are an expert evaluator of machine translations from Archaic Italian to Modern Italian.
For each translation, assign a score from 1 (worst) to 5 (best) on the following four criteria.

Adequacy:
1 - The translation does not capture the original meaning at all.
2 - The translation is mostly wrong; the main meaning is lost, but there are rare fragments of meaning.
3 - Some meaning is preserved, but important information is lost or altered.
4 - Most meaning is present, with only minor issues; very little is lost.
5 - All essential meaning from the original is preserved.

Fluency:
1 - The translation is unreadable or ungrammatical; clearly machine-generated.
2 - The translation has severe grammar errors, unnatural phrasing, or frequent awkwardness.
3 - Some awkwardness or minor grammar issues, but still understandable.
4 - Mostly fluent and grammatical, only rare awkward or unnatural expressions.
5 - Perfectly fluent, fully natural Italian.

Style:
1 - The tone/register is completely lost or inappropriate.
2 - The style is mostly lost; it is awkward or inappropriate for the context.
3 - The style is partially preserved but inconsistent or awkward.
4 - The style is almost fully preserved, with only minor slips.
5 - The style, tone, and register are perfectly matched to the original.

Completeness:
1 - Major parts are omitted or unnecessary parts are added.
2 - The translation is incomplete; many elements are missing or excessive additions present.
3 - Minor omissions/additions, but most information is present.
4 - Almost everything is present, with only trivial information missing or added.
5 - Complete; nothing important is lost or added.

Output ONLY the four scores as numbers 1-5, in exactly this format (no extra text):

Adequacy: <score>
Fluency: <score>
Style: <score>
Completeness: <score>

Original (Archaic Italian): {sentence}

Translation (Modern Italian): {translation}
"""



# 3) Main Loop
# For every row: collect both local judges' scores, let Gemini arbitrate, then
# record the outcome and the full debate exchange.
results = []
debate_logs = []
debate_calls = 0  # number of Gemini debate requests issued so far (one per row)

for idx, row in tqdm(df.iterrows(), total=len(df)):
    sentence = row["archaic_sentence"]
    translation = row[translation_col]

    # Get scores from both judges (Qwen, OpenELM).
    score1 = qwen_multicrit_judge(sentence, translation)
    score2 = openelm_multicrit_judge(sentence, translation)

    # Run debate (Gemini reviews both scores and possibly changes them).
    consensus, debate_prompt, debate_response = gemini_debate_consensus(
        sentence, translation, score1, score2
    )
    debate_calls += 1

    # Check for changes.
    # True only when the consensus dict differs from BOTH judges' dicts,
    # i.e. Gemini did not simply adopt one judge's scores wholesale.
    changed = consensus != score1 and consensus != score2

    # Append main result row.
    result_row = {
        "archaic_sentence": sentence,
        translation_col: translation,
        "qwen_scores": score1,
        "openelm_scores": score2,
        "debate_consensus": consensus,
        "debate_model": "gemini",
        "debate_changed": changed
    }
    results.append(result_row)

    # Log debate conversation.
    # idx is the DataFrame index label of the row the debate belongs to.
    debate_logs.append({
        "idx": idx,
        "prompt": debate_prompt,
        "response": debate_response,
        "consensus_scores": consensus
    })

    # Pause every 15 Gemini calls, for agreeing with free-tier rate limits.
    if debate_calls % 15 == 0:
        print("🕒 Sleeping for 60 seconds to avoid Gemini API rate limit...")
        time.sleep(60)

# 4) Save Results and Logs.
# Both files are written only after the whole loop has finished.
output_jsonl = f"BorgiNonModernToModern-hw2_transl-judge_{model_name}-{judge_type}_{llm_type}-model.jsonl"
with open(output_jsonl, "w", encoding="utf8") as fout:
    for record in results:
        fout.write(json.dumps(record, ensure_ascii=False) + "\n")

# The debate log sits next to the results file, with a "_debate-conversations" suffix.
debate_log_file = output_jsonl.replace(".jsonl", "_debate-conversations.jsonl")
with open(debate_log_file, "w", encoding="utf8") as fout:
    for conv in debate_logs:
        fout.write(json.dumps(conv, ensure_ascii=False) + "\n")

tokenizer_config.json:   0%|          | 0.00/9.73k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/726 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.6k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model-00001-of-00002.safetensors:   0%|          | 0.00/3.44G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/622M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


config.json:   0%|          | 0.00/1.77k [00:00<?, ?B/s]

configuration_openelm.py:   0%|          | 0.00/14.3k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/apple/OpenELM-3B-Instruct:
- configuration_openelm.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


AutoTokenizer failed! Falling back to Llama2 tokenizer workaround: Unrecognized configuration class <class 'transformers_modules.apple.OpenELM-3B-Instruct.8288250ae190b81ed7ad515ea661e880b9cde2ba.configuration_openelm.OpenELMConfig'> to build an AutoTokenizer.
Model type should be one of AlbertConfig, AlignConfig, AriaConfig, AyaVisionConfig, BarkConfig, BartConfig, BertConfig, BertGenerationConfig, BigBirdConfig, BigBirdPegasusConfig, BioGptConfig, BitNetConfig, BlenderbotConfig, BlenderbotSmallConfig, BlipConfig, Blip2Config, BloomConfig, BridgeTowerConfig, BrosConfig, CamembertConfig, CanineConfig, ChameleonConfig, ChineseCLIPConfig, ClapConfig, CLIPConfig, CLIPSegConfig, ClvpConfig, LlamaConfig, CodeGenConfig, CohereConfig, Cohere2Config, ColPaliConfig, ConvBertConfig, CpmAntConfig, CTRLConfig, Data2VecAudioConfig, Data2VecTextConfig, DbrxConfig, DebertaConfig, DebertaV2Config, DeepseekV3Config, DiffLlamaConfig, DistilBertConfig, DPRConfig, ElectraConfig, Emu3Config, ErnieConfig, E

tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

modeling_openelm.py:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/apple/OpenELM-3B-Instruct:
- modeling_openelm.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json:   0%|          | 0.00/24.2k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.13G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
  0%|          | 0/97 [00:00<?, ?it/s]`generation_config` default values have been modified to match model-specific defaults: {'do_sample': True}. If this is not desired, please set these values explicitly.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  1%|          | 1/97 [00:59<1:35:16, 59.54s/it]The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  2%|▏         | 2/97 [02:02<1:37:10, 61.37s/it]The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🕒 Sleeping for 60 seconds to avoid Gemini API rate limit...


 15%|█▌        | 15/97 [15:58<1:46:24, 77.87s/it]The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
 16%|█▋        | 16/97 [17:00<1:38:45, 73.15s/it]The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
 18%|█▊        | 17/97 [17:59<1:32:00, 69.01s/it]The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
 19%|█▊        | 18/97 [18:59<1:27:15, 66.27s/it]The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generat

🕒 Sleeping for 60 seconds to avoid Gemini API rate limit...


 31%|███       | 30/97 [32:14<1:27:31, 78.38s/it]The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
 32%|███▏      | 31/97 [33:13<1:19:53, 72.63s/it]The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
 33%|███▎      | 32/97 [34:13<1:14:30, 68.77s/it]The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
 34%|███▍      | 33/97 [35:14<1:10:46, 66.35s/it]The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generat

🕒 Sleeping for 60 seconds to avoid Gemini API rate limit...


 46%|████▋     | 45/97 [47:57<1:06:45, 77.03s/it]The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
 47%|████▋     | 46/97 [48:56<1:00:55, 71.68s/it]The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
 48%|████▊     | 47/97 [49:55<56:33, 67.88s/it]  The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
 49%|████▉     | 48/97 [50:54<53:21, 65.34s/it]The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generatio

🕒 Sleeping for 60 seconds to avoid Gemini API rate limit...


 62%|██████▏   | 60/97 [1:03:40<47:41, 77.34s/it]The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
 63%|██████▎   | 61/97 [1:04:38<42:56, 71.57s/it]The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
 64%|██████▍   | 62/97 [1:05:35<39:14, 67.27s/it]The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
 65%|██████▍   | 63/97 [1:06:35<36:54, 65.14s/it]The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generat

🕒 Sleeping for 60 seconds to avoid Gemini API rate limit...


 77%|███████▋  | 75/97 [1:19:24<28:05, 76.63s/it]The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
 78%|███████▊  | 76/97 [1:20:23<24:59, 71.40s/it]The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
 79%|███████▉  | 77/97 [1:21:21<22:23, 67.16s/it]The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
 80%|████████  | 78/97 [1:22:20<20:33, 64.91s/it]The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generat

🕒 Sleeping for 60 seconds to avoid Gemini API rate limit...


 93%|█████████▎| 90/97 [1:35:05<08:57, 76.73s/it]The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
 94%|█████████▍| 91/97 [1:36:04<07:07, 71.27s/it]The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
 95%|█████████▍| 92/97 [1:37:01<05:36, 67.22s/it]The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
 96%|█████████▌| 93/97 [1:38:00<04:18, 64.57s/it]The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generat

✅ Results saved to BorgiNonModernToModern-hw2_transl-judge_debate-consensus_qwen-openelm_vs_gemini.jsonl
✅ All debate conversations saved to BorgiNonModernToModern-hw2_transl-judge_debate-consensus_qwen-openelm_vs_gemini_debate-conversations.jsonl





#### 4.4: M-PROMETHEUS - GENERAL EVALUATION

Here I use M-Prometheus as the judge model to evaluate the translations, as suggested.

In [None]:
!pip install -U prometheus-eval
!pip install vllm



In [None]:
# !pip install prometheus-eval==0.2.7  # Run once in Colab or locally

import pandas as pd
from prometheus_eval.vllm import VLLM
from prometheus_eval import PrometheusEval
from prometheus_eval.prompts import ABSOLUTE_PROMPT, ABSOLUTE_PROMPT_WO_REF, SCORE_RUBRIC_TEMPLATE
from tqdm import tqdm
import json
import time

# --- 1. Load your translations ---
input_jsonl = "/content/drive/MyDrive/BorgiNonModernToModern/llm_based/zero-shot_prompting/BorgiNonModernToModern-hw2_transl-gemma.jsonl"
df = pd.read_json(input_jsonl, lines=True)

# --- 2. Set up the M-Prometheus-7B model ---
judge_model = VLLM(model="Unbabel/M-Prometheus-7B")
# No gold reference translations are available, so use the reference-free grading
# template; ABSOLUTE_PROMPT expects a score-5 reference answer to be filled in.
judge = PrometheusEval(model=judge_model, absolute_grade_template=ABSOLUTE_PROMPT_WO_REF)

# --- 3. Prepare a general rubric for translation quality ---
# One description per score level (1-5), rendered into a single rubric string
# through SCORE_RUBRIC_TEMPLATE.
rubric_data = {
    "criteria": "General translation quality from Archaic Italian to Modern Italian.",
    "score1_description": "Completely unacceptable translation. The translation has no pertinence with the original meaning, or makes no sense.",
    "score2_description": "Severe semantic errors, omissions or substantial additions. Errors are semantic/syntactic in nature. No human would write this.",
    "score3_description": "Partially wrong translation. The translation contains errors, but mostly minor ones (typos or small semantic errors).",
    "score4_description": "Good translation. Substantially faithful, but the style does not perfectly match the original; still fluent and acceptable.",
    "score5_description": "Perfect translation. Accurate, fluent, complete, and coherent. It retains the original meaning as much as possible."
}
score_rubric = SCORE_RUBRIC_TEMPLATE.format(**rubric_data)

# --- 4. Define the judge function using Prometheus ---
def prometheus_judge(sentence, translation):
    """Grade one translation with the M-Prometheus judge (absolute, reference-free).

    Args:
        sentence: The archaic Italian source sentence.
        translation: The candidate modern Italian translation to be graded.

    Returns:
        A ``(score, feedback)`` tuple: the judge's score on the rubric scale
        and its textual justification ("" when no feedback is returned).
    """
    # The judge only sees what is placed in the prompt, so the source sentence is
    # embedded in the instruction; otherwise it would grade the translation blind.
    instruction = (
        "You are a professional linguist. Evaluate the modern Italian translation of an archaic Italian sentence."
        f"\n\nArchaic Italian sentence: {sentence}"
    )
    # prometheus-eval's single-item API takes the pre-rendered rubric string and
    # returns a (feedback, score) tuple, not a dict.
    feedback, score = judge.single_absolute_grade(
        instruction=instruction,
        response=translation,
        rubric=score_rubric,
        reference_answer=None,  # No gold ref available
    )
    return score, feedback or ""

# --- 5. Run evaluation on your dataset ---
# The judge runs locally in this process, so there is no API rate limit to respect,
# and pausing between rows does not release GPU memory: rows are graded back to back.
translation_col = "gemma_translation"
judge_col = f"{translation_col}_prometheus_general_judge_score"
judge_feedback_col = f"{translation_col}_prometheus_general_judge_feedback"

# Fail fast with a readable message if the input file lacks an expected column.
missing_cols = {"archaic_sentence", translation_col} - set(df.columns)
if missing_cols:
    raise KeyError(f"Input JSONL is missing required column(s): {sorted(missing_cols)}")

scores = []
feedbacks = []
# Iterate over the two columns directly; this does not depend on the frame's index labels.
for sentence, translation in tqdm(zip(df["archaic_sentence"], df[translation_col]), total=len(df)):
    score, feedback = prometheus_judge(sentence, translation)
    scores.append(score)
    feedbacks.append(feedback)

df[judge_col] = scores
df[judge_feedback_col] = feedbacks

# --- 6. Save as JSONL ---
# One JSON object per line, keeping non-ASCII (accented) characters readable.
output_cols = ["archaic_sentence", translation_col, judge_col, judge_feedback_col]
output_jsonl = f"BorgiNonModernToModern-hw2_transl-judge_m-prometheus-general_{translation_col}.jsonl"

with open(output_jsonl, "w", encoding="utf8") as fout:
    for record in df[output_cols].to_dict(orient="records"):
        fout.write(json.dumps(record, ensure_ascii=False) + "\n")

print(f"✅ All translations scored by M-Prometheus-7B and saved to {output_jsonl}")


INFO 05-28 21:26:34 [__init__.py:31] Available plugins for group vllm.general_plugins:
INFO 05-28 21:26:34 [__init__.py:33] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver
INFO 05-28 21:26:34 [__init__.py:36] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load.


config.json:   0%|          | 0.00/790 [00:00<?, ?B/s]

INFO 05-28 21:26:54 [config.py:793] This model supports multiple tasks: {'reward', 'embed', 'generate', 'score', 'classify'}. Defaulting to 'generate'.
INFO 05-28 21:26:54 [llm_engine.py:230] Initializing a V0 LLM engine (v0.9.0) with config: model='Unbabel/M-Prometheus-7B', speculative_config=None, tokenizer='Unbabel/M-Prometheus-7B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=32768, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, co

tokenizer_config.json:   0%|          | 0.00/7.31k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/613 [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/243 [00:00<?, ?B/s]

INFO 05-28 21:27:02 [cuda.py:240] Cannot use FlashAttention-2 backend for Volta and Turing GPUs.
INFO 05-28 21:27:02 [cuda.py:289] Using XFormers backend.
INFO 05-28 21:27:03 [parallel_state.py:1064] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0
INFO 05-28 21:27:03 [model_runner.py:1170] Starting to load model Unbabel/M-Prometheus-7B...
INFO 05-28 21:27:04 [weight_utils.py:291] Using model weights format ['*.safetensors']


model-00001-of-00004.safetensors:   0%|          | 0.00/4.88G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.33G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.09G [00:00<?, ?B/s]

INFO 05-28 21:35:41 [weight_utils.py:307] Time spent downloading weights for Unbabel/M-Prometheus-7B: 517.368201 seconds


model.safetensors.index.json:   0%|          | 0.00/27.8k [00:00<?, ?B/s]

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]


INFO 05-28 21:36:47 [default_loader.py:280] Loading weights took 65.17 seconds
INFO 05-28 21:36:48 [model_runner.py:1202] Model loading took 14.2488 GiB and 584.227320 seconds


OutOfMemoryError: CUDA out of memory. Tried to allocate 224.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 106.12 MiB is free. Process 66765 has 14.63 GiB memory in use. Of the allocated memory 14.47 GiB is allocated by PyTorch, and 54.51 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

# TO-DO: STRICTLY REQUIRED
- Manually annotate the generated translations and study the correlation between the judge's scores and your manual annotations.
- Use M-Prometheus as an LLM-as-a-Judge, and compute its agreement with your manual annotations.

# TO-DO: PROMPTING TECHNIQUES
- Role-Playing Prompt
- Meta-Prompting / Self-Consistency
- ReAct

# TO-DO: EVALUATION
- Automatic Reference-Free Metrics (LLM-based).
- Ensemble of Judges.