## MOUNT DRIVE

In [None]:
from google.colab import drive
import pandas as pd

# Mount Google Drive at /content/drive so the Drive paths used in later cells resolve
drive.mount('/content/drive')

Mounted at /content/drive


## DATASET

In [None]:
# Path to the cleaned dataset CSV on the mounted Drive
csv_path = '/content/drive/MyDrive/AleBERTs&FraBERT_shared_folder_HW2/dataset_cleaned.csv'

# Load the dataset; if it has no "LLaMA" column yet, add one filled with empty
# strings so the translation loops can detect rows that still need a translation
df = pd.read_csv(csv_path)
if "LLaMA" not in df.columns:
    df["LLaMA"] = ""

## IMPORTS

In [None]:
import pandas as pd
import requests
import time
import json
import spacy
from tqdm import tqdm
import re


# **ZERO SHOT**

In [None]:


# === API PARAMETERS ===
import os  # local to this cell: only needed to read the API key from the environment

API_URL = "https://openrouter.ai/api/v1/chat/completions"  # OpenRouter API endpoint
# Read the key from the OPENROUTER_API_KEY environment variable instead of
# pasting it into the notebook. The original "..." placeholder is kept as the
# fallback (also when the variable is set but empty) so the cell still runs.
API_KEY = os.environ.get("OPENROUTER_API_KEY") or "..."
MODEL = "meta-llama/llama-3-8b-instruct"  # The model used for translation
HEADERS = {
    "Authorization": f"Bearer {API_KEY}",  # API authentication header
    "Content-Type": "application/json"     # Request content type
}

# === BUILD ZERO-SHOT PROMPT ===
def build_prompt_zero_shot(sentence):
    """
    Build the two-message chat prompt used for zero-shot translation.

    Args:
        sentence: the archaic Italian sentence to translate; it is formatted
            into the user message after the "Input: " label.

    Returns:
        A list with a "system" message (the translation instructions) followed
        by a "user" message carrying the sentence.
    """
    instruction_parts = [
        "You are a professional linguist specializing in philology and the translation of archaic Italian ",
        "(Florentine, 1300–1500)  into modern Italian. Your task is to translate sentences written in archaic Italian ",
        "into contemporary, fluent Italian.\n\n",
        "Your translations must be:\n",
        "1. Faithful to the original meaning,\n",
        "2. Written in a natural and idiomatic modern style,\n",
        "3. Clear and immediately comprehensible to a modern reader.\n\n",
        "Avoid archaic or overly literal structures. You may restructure the syntax and update the vocabulary ",
        "as needed to improve readability, as long as the core meaning and tone remain intact.",
    ]
    system_message = {"role": "system", "content": "".join(instruction_parts)}

    # str.format applies the same formatting as the f-string it replaces,
    # so non-string inputs are rendered identically.
    user_message = {"role": "user", "content": "Input: {}".format(sentence)}

    return [system_message, user_message]


jsonl_data = []  # One {"Sentence", "Translated Sentence"} record per successful translation

# === TRANSLATION LOOP ===
# The loop runs inside try/finally so that whatever has been translated so far
# is still written to disk if the run is interrupted (e.g. KeyboardInterrupt).
try:
    for i, row in df.iterrows():

        # Only rows whose "LLaMA" cell is still empty are sent to the API
        if not (pd.isna(row["LLaMA"]) or row["LLaMA"] == ""):
            continue

        sentence = row["Sentence"]  # Get the archaic sentence
        messages = build_prompt_zero_shot(sentence)

        # Prepare the request payload for the API
        payload = {
            "model": MODEL,
            "messages": messages,
            "temperature": 0.0,  # deterministic output
            "max_tokens": 256    # Maximum length of the translation
        }

        try:
            # Send the request; the timeout keeps one stalled connection from
            # hanging the whole run (a timeout is caught and logged below)
            response = requests.post(API_URL, headers=HEADERS, json=payload, timeout=60)
            response.raise_for_status()  # Raise exception if request failed

            # Extract the translation from the response
            result = response.json()
            output = result["choices"][0]["message"]["content"].strip()

            df.at[i, "LLaMA"] = output
            jsonl_data.append({
                "Sentence": sentence,
                "Translated Sentence": output
            })

            print(f"{i} → {output}")
            time.sleep(2)  # Delay to avoid rate limiting

        except Exception as e:
            # Best-effort: log the failure and leave the row empty so a later
            # run of this cell retries it
            print(f"ERROR at row {i}: {e}")
            continue
finally:
    # === SAVE RESULTS TO FILES ===
    df.to_csv("/content/dataset_with_LLaMA_zero_shot.csv", index=False)  # Save updated DataFrame to CSV

    # Save translations in JSONL format (one JSON object per line)
    with open("/content/dataset_with_LLaMA_zero_shot.jsonl", "w", encoding="utf-8") as f:
        for entry in jsonl_data:
            json.dump(entry, f, ensure_ascii=False)
            f.write("\n")



0 → "Quella guerra fu un'opera ben fatta, ma dall'altra parte Aiace era un cavaliere francese e valoroso, ma non aveva grande intelligenza."
1 → "Cruel, e prende vendetta di tutte le colpe, come richiede la legge, e perdonerà a nessun cavaliere che pecca."
2 → Ponzio Aufidiano, cavaliere romano, non aveva un'anima più forte.
3 → "Se questo piace a tutti e se il tempo richiede un Pompeo per cavaliere e non per compagno, non considererò più i fatti."
4 → "L'ufficio di questo arte sembra essere quello di dire qualcosa in modo da far credere, e il fine è far credere attraverso il dire."
5 → "Ecco che i temporali estesi si abbassano sulle nebbie decise; e sembrerebbe che tutto il cielo stesse precipitando nel mare"
6 → Ma chi spererebbe che anche quelli che non credono ancora in Cristo siano già con noi, e non possono negarlo, quindi si lamentano a denti stretti.
7 → "Il commercio dei morti e le pretese dei vivi fece la frode di un re crudele."
8 → "Affinché colui, che ora per le sue gravi 

# **FEW SHOT**





In [None]:
# === API PARAMETERS ===
import os  # local to this cell: only needed to read the API key from the environment

API_URL = "https://openrouter.ai/api/v1/chat/completions"  # OpenRouter API endpoint
# Read the key from the OPENROUTER_API_KEY environment variable instead of
# pasting it into the notebook. The original "..." placeholder is kept as the
# fallback (also when the variable is set but empty) so the cell still runs.
API_KEY = os.environ.get("OPENROUTER_API_KEY") or "..."
MODEL = "meta-llama/llama-3-8b-instruct"  # The model used for translation
HEADERS = {
    "Authorization": f"Bearer {API_KEY}",  # API authentication header
    "Content-Type": "application/json"     # Request content type
}


# === BUILD FEW-SHOT CONVERSATION PROMPT ===
def build_prompt_conversation(sentence):
    """
    Build the few-shot chat prompt: a system instruction, three worked
    user/assistant example pairs, and a final user turn with `sentence`.

    Every user turn (examples and the real input) uses the same "Traduci: "
    label, and every assistant example contains only the translation. The
    earlier version mixed "Traduci:"/"Input:" labels and prefixed two of the
    three example answers with "Output:", which the model then copied into its
    own answers.

    Args:
        sentence: the archaic Italian sentence to translate.

    Returns:
        A list of 8 chat messages (dicts with "role" and "content").
    """
    system_message = {
        "role": "system",
        "content": (
            "You are a professional linguist specializing in philology and the translation of archaic Italian "
            "(Florentine, 1300–1500) into modern Italian. Your task is to translate sentences written in archaic Italian "
            "into contemporary, fluent Italian. Your translations must be: (1) Faithful to the original meaning, "
            "(2) Written in a natural and idiomatic modern style, (3) Clear and immediately comprehensible to a modern reader. "
            "Avoid archaic or overly literal structures. You may restructure the syntax and update the vocabulary as needed "
            "to improve readability, as long as the core meaning and tone remain intact."
        )
    }

    # (archaic sentence, modern translation) pairs shown to the model as examples
    examples = [
        (
            "E però che la seconda suole talora per la grande provedenza fare timoroso, e la prima per l’ardire rendere altrui matto, assai utile è la presente distinzione.",
            "Poiché la seconda, per la sua grande previdenza, può talora incutere timore, e la prima, per l’ardire, condurre alla follia, questa distinzione si rivela assai utile.",
        ),
        (
            "Vergine bella, che di sol vestita,  coronata di stelle, al sommo Sole  piacesti sì, che n te sua luce ascose;  amor mi spinge a dir di te parole.",
            "Bella Vergine, vestita di sole e coronata di stelle, piacesti tanto al sommo Dio che nascose in te la sua luce; è l’amore a spingermi a parlare di te.",
        ),
        (
            "Era una giovane di maravigliosa bellezza e di costumi onesti, la quale con leggiadria e onestà si portava.",
            "Era una giovane di straordinaria bellezza e di buoni costumi, che si comportava con eleganza e compostezza.",
        ),
    ]

    messages = [system_message]
    for archaic, modern in examples:
        messages.append({"role": "user", "content": f"Traduci: {archaic}"})
        messages.append({"role": "assistant", "content": modern})

    # Actual input sentence, labelled exactly like the examples
    messages.append({"role": "user", "content": f"Traduci: {sentence}"})
    return messages

jsonl_data = []  # One {"original", "translation"} record per successful translation

# NOTE(review): `df` is shared with the zero-shot cell. If that cell has already
# filled "LLaMA" in this kernel session, every row is skipped below. Reload `df`
# first if a fresh few-shot run is intended.

# === TRANSLATION LOOP ===
# The loop runs inside try/finally so that whatever has been translated so far
# is still written to disk if the run is interrupted (e.g. KeyboardInterrupt).
try:
    for i, row in df.iterrows():
        # Only rows whose "LLaMA" cell is still empty are sent to the API
        if not (pd.isna(row["LLaMA"]) or row["LLaMA"] == ""):
            continue

        sentence = row["Sentence"]  # Get the archaic sentence from the dataset
        messages = build_prompt_conversation(sentence)

        # Prepare the API request payload
        payload = {
            "model": MODEL,
            "messages": messages,
            "temperature": 0.0,  #  deterministic output
            "max_tokens": 256
        }

        try:
            # Send the request; the timeout keeps one stalled connection from
            # hanging the whole run (a timeout is caught and logged below)
            response = requests.post(API_URL, headers=HEADERS, json=payload, timeout=60)
            response.raise_for_status()  # Raise error for bad responses

            # Extract the model output (translation)
            result = response.json()
            output = result["choices"][0]["message"]["content"].strip()

            df.at[i, "LLaMA"] = output
            jsonl_data.append({
                "original": sentence,
                "translation": output
            })

            print(f"{i} → {output}")
            time.sleep(2)  # Short delay to avoid rate limits

        except Exception as e:
            # Best-effort: log the failure and leave the row empty so a later
            # run of this cell retries it
            print(f"ERROR at row {i}: {e}")
            continue
finally:
    # === SAVE RESULTS TO FILES ===
    df.to_csv("/content/dataset_with_LLaMA_few_shot.csv", index=False)

    # One JSON object per line
    with open("/content/dataset_with_LLaMA_few_shot.jsonl", "w", encoding="utf-8") as f:
        for entry in jsonl_data:
            json.dump(entry, f, ensure_ascii=False)
            f.write("\n")


0 → Output: Quella guerra è stata eseguita con successo proprio per il fatto che... Dall'altra parte, Aiace era un cavaliere coraggioso e valoroso in battaglia, in grado di farlo tutto, ma non era particolarmente dotato di grande saggezza.
1 → Output: Crudele e di tutte le colpe vendicatore, come dice la legge, e non perdonerà a nessun cavaliere che pecca.
2 → Output: Ponzio Aufidiano, un nobile cavaliere romano, non ebbe altro fiore di animo.
3 → Output: Se questo ti piace a te e se il tempo richiede Pompeio come cavaliere e non come compagno, non considererò più il mio destino.
4 → Output: L'officio di questa arte sembra essere quello di parlare in modo da convincere, e il fine è convincere attraverso il discorso.
5 → Output: Ecco, i larghi venti si aggiogano alle poderose nebbie; e pare impossibile che tutta la volta celeste non cadesse nel mare.


KeyboardInterrupt: 

# **FEW SHOT + POS TAGGING**

In [None]:
# Download the Italian spaCy pipeline "it_core_news_lg", which a later cell loads with spacy.load
!python -m spacy download it_core_news_lg

Collecting it-core-news-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/it_core_news_lg-3.8.0/it_core_news_lg-3.8.0-py3-none-any.whl (567.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m567.9/567.9 MB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: it-core-news-lg
Successfully installed it-core-news-lg-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('it_core_news_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:

# Load the Italian spaCy pipeline; `nlp` is the object get_pos_tags calls to tag each sentence
nlp = spacy.load("it_core_news_lg")

In [None]:

# Register tqdm with pandas so that Series.progress_apply is available below
tqdm.pandas()


# Reload the cleaned dataset from Drive and check it has the column that will be tagged
df = pd.read_csv("/content/drive/MyDrive/AleBERTs&FraBERT_shared_folder_HW2/dataset_cleaned.csv")
assert "Sentence" in df.columns, "The 'Sentence' column is missing"

# Function to extract POS tags from a sentence
def get_pos_tags(sentence):
    """
    Run the loaded spaCy pipeline on `sentence` and return the `pos_` value of
    every token, joined by single spaces.

    The input is converted with str() first, so non-string cells (e.g. NaN)
    are tagged as their string form instead of raising.
    """
    parsed = nlp(str(sentence))
    tags = []
    for tok in parsed:
        tags.append(tok.pos_)
    return " ".join(tags)

# Tag every sentence (with a tqdm progress bar) and store the tag string in a new "pos_tags" column
df["pos_tags"] = df["Sentence"].progress_apply(get_pos_tags)

# Save the result to a new CSV file (relative path, so it lands in the kernel's working directory)
# NOTE(review): the translation cell further down reads output_con_pos_tags.csv from a
# Drive folder, while this cell does not write there — confirm the file is copied over.
df.to_csv("output_con_pos_tags.csv", index=False)

100%|██████████| 97/97 [00:01<00:00, 84.87it/s]


In [None]:
# === API PARAMETERS ===
import os  # local to this cell: only needed to read the API key from the environment

API_URL = "https://openrouter.ai/api/v1/chat/completions"  # OpenRouter API endpoint
# Read the key from the OPENROUTER_API_KEY environment variable instead of
# pasting it into the notebook. The original "..." placeholder is kept as the
# fallback (also when the variable is set but empty) so the cell still runs.
API_KEY = os.environ.get("OPENROUTER_API_KEY") or "..."
MODEL = "meta-llama/llama-3-8b-instruct"  # The model used for translation
HEADERS = {
    "Authorization": f"Bearer {API_KEY}",  # API authentication header
    "Content-Type": "application/json"     # Request content type
}


def build_prompt_conversation(sentence, pos_tags):
    """
    Build the few-shot chat prompt that pairs each sentence with its POS tags.

    The prompt is a system instruction, three worked user/assistant example
    pairs, and a final user turn for `sentence`. Every user turn uses the same
    layout ("Traduci: <sentence>" then "POS tagging: <tags>" on the next line),
    and every assistant example contains only the translation, matching the
    system instruction "Provide only the translated sentence". The earlier
    version labelled the real input "Translate:" and prefixed the example
    answers with "Output:", which the model then copied into its answers.

    Args:
        sentence: the archaic Italian sentence to translate.
        pos_tags: the POS tags for `sentence`, formatted into the prompt as-is.

    Returns:
        A list of 8 chat messages (dicts with "role" and "content").
    """
    system_message = {
        "role": "system",
        "content": (
            "You are an expert Italian translator and philologist. Your task is to translate literary and poetic texts from arcaic Italian (Florentine, 1200–1400) to fluent Modern Italian.\n"
            "- Preserve the full meaning of the original without omissions or unjustified additions.\n"
            "- Use correct and elegant grammar (verb tenses, agreement, punctuation).\n"
            "- Adapt archaic or obsolete words and expressions into natural, idiomatic modern equivalents.\n"
            "- Render metaphors, poetic turns, and complex structures with clarity and sensitivity.\n"
            "- Maintain the tone, emotional register, and stylistic elegance of the original.\n"
            "- Avoid literal translations that sound unnatural or archaic in contemporary Italian.\n"
            "- Provide only the translated sentence. Do not include explanations or commentary.\n"
            "- Do not invent or transliterate unknown or archaic words: interpret their meaning from context and render with modern equivalents.\n"
        )
    }

    # (archaic sentence, its POS tags, modern translation) triples shown as examples
    examples = [
        (
            "I’ vidi angelic’ forma e in vista umana un lume che, se ‘l vero m’è mostrato,fu spirito celeste in corpo manto.",
            "PRON VERB ADJ NOUN CCONJ ADP NOUN ADJ DET NOUN SCONJ PRON NOUN AUX VERB PUNCT VERB NOUN ADJ ADP NOUN NOUN PUNCT",
            "Vidi una figura angelica con sembianze umane, una luce che, se davvero vidi il vero, era uno spirito celeste in un corpo terreno.",
        ),
        (
            "mi prese del costui piacer sì forte,che, come vedi, ancor non m’abbandona",
            "PRON VERB ADP DET PRON NOUN ADV ADJ PUNCT SCONJ ADV VERB PUNCT ADV ADV PRON VERB",
            "Mi rapì la bellezza di questo in modo così forte che, come vedi, ancora non mi abbandona.",
        ),
        (
            "Non è meraviglia se da l’uno e l’altro idiota è fuggito lo volgare illustre",
            "PART AUX NOUN SCONJ ADP DET PRON CCONJ DET NOUN AUX VERB DET NOUN ADJ",
            "Non stupisce che il volgare illustre sia sfuggito sia ai colti che agli incolti.",
        ),
    ]

    messages = [system_message]
    for archaic, tags, modern in examples:
        messages.append({"role": "user", "content": f"Traduci: {archaic}\nPOS tagging: {tags}"})
        messages.append({"role": "assistant", "content": modern})

    # Actual input, laid out exactly like the examples
    messages.append({"role": "user", "content": f"Traduci: {sentence}\nPOS tagging: {pos_tags}"})
    return messages


In [None]:


# === FUNCTION TO EXTRACT THE FINAL TRANSLATION FROM THE OUTPUT TEXT ===
def _strip_output_label(text):
    """Remove one leading "Output:" label (any letter case) and surrounding whitespace."""
    return re.sub(r"^\s*output\s*:\s*", "", text, count=1, flags=re.IGNORECASE).strip()


def extract_final_translation(output_text):
    """
    Extract the translated sentence from the model's raw response.

    Lookup order:
      1. A **Correction:** section: its first non-blank line is returned,
         unless that line is a "no correction" placeholder.
      2. A **Translation:** section: its content is returned.
      3. Otherwise the whole text is returned.

    In every case a leading "Output:" label is removed, because the model
    tends to echo that label in front of its answer.

    Args:
        output_text: the response text returned by the model (a string).

    Returns:
        The extracted translation, stripped of surrounding whitespace.
    """
    # Placeholders meaning "nothing to correct"; compared after lower-casing
    # and dropping trailing periods/spaces, so "None needed" and
    # "None needed." are treated alike.
    no_correction = {"none needed", "no correction", "none"}

    # First, try to find a correction section (if present)
    match_corr = re.search(r"\*\*Correction:\*\*\s*(.*?)(?:\n\s*\*\*|$)", output_text, re.DOTALL)
    if match_corr:
        corr_lines = [line.strip() for line in match_corr.group(1).splitlines() if line.strip()]
        if corr_lines and corr_lines[0].lower().rstrip(". ") not in no_correction:
            return _strip_output_label(corr_lines[0])  # First actual correction line

    # If there's no correction or it's "none", try to find a **Translation:**
    match_trans = re.search(r"\*\*Translation:\*\*\s*(.*?)(?:\n\s*\*{2}|$)", output_text, re.DOTALL)
    if match_trans:
        return _strip_output_label(match_trans.group(1))

    # If no specific tags were found, fall back to the whole text
    return _strip_output_label(output_text)


In [None]:


# === LOAD DATASET ===
# Load a CSV file that contains at least two columns:
# "Sentence" (archaic Italian text) and "pos_tags" (precomputed part-of-speech tags)
# NOTE(review): this Drive folder name differs from the one the cleaned dataset is
# read from in the earlier cells — confirm this is where output_con_pos_tags.csv lives.
df = pd.read_csv('/content/drive/MyDrive/AleBERTs&FraBERTS_shared_folder_2/output_con_pos_tags.csv')

jsonl_data = []  # This will store the processed data in JSONL format

# === MAIN LOOP: FOR EACH SENTENCE IN THE DATASET ===
# The loop runs inside try/finally so that whatever has been translated so far
# is still written to disk if the run is interrupted (e.g. KeyboardInterrupt).
try:
    for idx, row in df.iterrows():
        sentence = row["Sentence"]  # Original sentence to translate
        pos_tags = row["pos_tags"]  # POS tags as a string (assumed pre-formatted)

        messages = build_prompt_conversation(sentence, pos_tags)

        # Prepare the request payload for the OpenRouter API
        payload = {
            "model": MODEL,
            "messages": messages,
            "temperature": 0.0,  # deterministic output
            "max_tokens": 256
        }

        try:
            # Make the POST request; the timeout keeps one stalled connection from
            # hanging the whole run (a timeout is caught and recorded below)
            response = requests.post(API_URL, headers=HEADERS, data=json.dumps(payload), timeout=60)
            response.raise_for_status()  # Raise an error if the request fails

            # Extract the content of the model's response
            output_text = response.json()["choices"][0]["message"]["content"].strip()

            # Clean and isolate the final translated sentence from the response
            final_translation = extract_final_translation(output_text)

            print(f"[{idx+1}/{len(df)}] {final_translation}")

            df.at[idx, "LLaMA"] = final_translation

            jsonl_data.append({
                "Sentence": sentence,
                "POS": pos_tags,
                "Translated Sentence": final_translation
            })

            time.sleep(1.5)  # Delay to respect API rate limits

        except Exception as e:
            # Log any error and mark the row as failed
            print(f"[{idx+1}]  Error: {e}")
            df.at[idx, "LLaMA"] = "Error"
finally:
    # === SAVE THE UPDATED DATASET ===
    df.to_csv("/content/dataset_with_LLaMA_few_shot_pos_tagging.csv", index=False)

    # One JSON object per line
    with open("/content/dataset_with_LLaMA_few_shot_pos_tagging.jsonl", "w", encoding="utf-8") as f:
        for item in jsonl_data:
            json.dump(item, f, ensure_ascii=False)
            f.write("\n")


[1/97] Output: Quell'opera fu ben fatta perciò... Dall'altra parte, Aiace era un cavaliere franco e valoroso in guerra, in modo grande, ma non era pieno di grande sapienza.
[2/97] Output: Siate crudeli, e perseguitate tutti i colpevoli, come vuole la legge, e lasciate perdono a nessun cavaliere, che pecca.
[3/97] Output: Nessun'altra virtù d'animo ha adornato Ponzio Aufidiano, il valoroso cavaliere romano.
[4/97] Output: Se questo è gradevole a tutti e se il tempo ha bisogno di Pompeio come cavaliere e non come compagno, non mi piegherò più a colpa.
[5/97] Output: Il compito di questa arte sembra essere espungere parole inopportune per fare credere, il fine vero è far credere per quel che si dice.
[6/97] Output: Ecco che cadono vasti cumuli di nebbia, e sembreresti credere che tutto il cielo si gettasse nel mare.
[7/97] Output: Se ancora sperasse qualcuno che queste persone, che non credono ancora in Cristo, già lo scoprono con noi e, perché non possono negarlo, soffocano a stento.
[8/