## Cell 1: Imports

In [None]:

import os
import json
import torch
import pandas as pd
from tqdm import tqdm
from datasets import Dataset
from transformers import (
    MarianMTModel,
    MarianTokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
)

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

PyTorch version: 2.9.0+cu126
CUDA available: True
GPU: NVIDIA A100-SXM4-80GB
GPU Memory: 85.2 GB


## Cell 2: Configuration

In [None]:

CONFIG = {
    # Helsinki-NLP Opus-MT model - PRE-TRAINED on Latin!
    "model_name": "Helsinki-NLP/opus-mt-itc-en",  # Italic languages (Latin, Italian, etc.) to English
    "train_file": "train.jsonl",
    "csv_file": "latin_english_dataset.csv",
    "output_dir": "models/opus-mt-latin",
    "predictions_file": "evaluation/opus_mt_predictions.jsonl",
    "metrics_file": "evaluation/opus_mt_metrics.json",
    # Training settings
    "learning_rate": 2e-5,
    "batch_size": 8,
    "gradient_accumulation_steps": 4,
    "num_epochs": 3,
    "max_source_length": 256,
    "max_target_length": 256,
    "warmup_ratio": 0.1,
    "fp16": True,
}

print("Configuration (Opus-MT - Pre-trained Latin):")
for key, value in CONFIG.items():
    print(f"  {key}: {value}")

Configuration (Opus-MT - Pre-trained Latin):
  model_name: Helsinki-NLP/opus-mt-itc-en
  train_file: train.jsonl
  csv_file: latin_english_dataset.csv
  output_dir: models/opus-mt-latin
  predictions_file: evaluation/opus_mt_predictions.jsonl
  metrics_file: evaluation/opus_mt_metrics.json
  learning_rate: 2e-05
  batch_size: 8
  gradient_accumulation_steps: 4
  num_epochs: 3
  max_source_length: 256
  max_target_length: 256
  warmup_ratio: 0.1
  fp16: True


## Cell 3: Load Dataset

In [None]:

def load_train_data(filepath):
    data = []
    with open(filepath, "r", encoding="utf-8") as f:
        for line in f:
            msg = json.loads(line)
            latin = msg["messages"][1]["content"].replace("Translate: ", "")
            english = msg["messages"][2]["content"]
            data.append({"latin": latin, "english": english})
    return data

def load_test_data(filepath):
    df = pd.read_csv(filepath)
    test_df = df[df["split"].isin(["valid", "test"])]
    data = []
    for _, row in test_df.iterrows():
        data.append({"latin": row["la_text"], "english": row["eng_text"]})
    return data

train_data = load_train_data(CONFIG["train_file"])
test_data = load_test_data(CONFIG["csv_file"])

print(f"Training samples: {len(train_data)}")
print(f"Test samples: {len(test_data)}")
print(f"Sample - Latin: {train_data[0]['latin'][:80]}...")
print(f"Sample - English: {train_data[0]['english'][:80]}...")

Training samples: 50000
Test samples: 2028
Sample - Latin: inter haec fremere Arelatenses, quo loci res agebatur, et quaerere quem poetarum...
Sample - English: At this the people of Arelate, which was the scene of the incident, began to rag...


## Cell 4: Load Model and Tokenizer

In [None]:

print(f"Loading pre-trained model: {CONFIG['model_name']}")
print("This model is already trained on Latin translation!")

tokenizer = MarianTokenizer.from_pretrained(CONFIG["model_name"])
model = MarianMTModel.from_pretrained(CONFIG["model_name"])

total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"Model loaded: {CONFIG['model_name']}")
print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")
print(f"Model size: {total_params * 4 / 1e9:.2f} GB (FP32)")

# Check supported languages
print(f"Source languages: {tokenizer.supported_language_codes if hasattr(tokenizer, 'supported_language_codes') else 'Latin + Italic'}")

Loading pre-trained model: Helsinki-NLP/opus-mt-itc-en
This model is already trained on Latin translation!


tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/792k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/787k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]



pytorch_model.bin:   0%|          | 0.00/296M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

Model loaded: Helsinki-NLP/opus-mt-itc-en
Total parameters: 73,868,800
Trainable parameters: 73,868,800
Model size: 0.30 GB (FP32)
Source languages: []


## Cell 5: Test Pre-trained Model (Before Fine-tuning)

In [None]:

def translate(texts, model, tokenizer, device="cuda"):
    if isinstance(texts, str):
        texts = [texts]

    model = model.to(device)
    model.eval()

    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=256)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=256, num_beams=4)

    return tokenizer.batch_decode(outputs, skip_special_tokens=True)

test_sentences = [
    "Gallia est omnis divisa in partes tres.",
    "Veni, vidi, vici.",
    "Cogito, ergo sum.",
    "In principio creavit Deus caelum et terram.",
    "Alea iacta est.",
]

print("PRE-TRAINED Model Translations (before fine-tuning):")
device = "cuda" if torch.cuda.is_available() else "cpu"
for latin in test_sentences:
    english = translate(latin, model, tokenizer, device)[0]
    print(f"Latin: {latin}")
    print(f"English: {english}")
    print()

PRE-TRAINED Model Translations (before fine-tuning):


model.safetensors:   0%|          | 0.00/296M [00:00<?, ?B/s]

Latin: Gallia est omnis divisa in partes tres.
English: Gallia is all divided into three parts.

Latin: Veni, vidi, vici.
English: Come, look, vice.

Latin: Cogito, ergo sum.
English: I think so, I'm here.

Latin: In principio creavit Deus caelum et terram.
English: In the beginning God created the heavens and the earth.

Latin: Alea iacta est.
English: That's right.



## Cell 6: Preprocess Data

In [None]:

def preprocess_function(examples):
    inputs = examples["latin"]
    targets = examples["english"]

    model_inputs = tokenizer(
        inputs,
        max_length=CONFIG["max_source_length"],
        truncation=True,
        padding="max_length",
    )

    labels = tokenizer(
        text_target=targets,
        max_length=CONFIG["max_target_length"],
        truncation=True,
        padding="max_length",
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

print("Tokenizing training data...")
train_dataset = Dataset.from_list(train_data)
train_dataset = train_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=["latin", "english"],
    desc="Tokenizing train",
)

print("Tokenizing eval data...")
eval_subset = test_data[:500]
eval_dataset = Dataset.from_list(eval_subset)
eval_dataset = eval_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=["latin", "english"],
    desc="Tokenizing eval",
)

print(f"Train dataset: {len(train_dataset)} samples")
print(f"Eval dataset: {len(eval_dataset)} samples")

Tokenizing training data...


Tokenizing train:   0%|          | 0/50000 [00:00<?, ? examples/s]

Tokenizing eval data...


Tokenizing eval:   0%|          | 0/500 [00:00<?, ? examples/s]

Train dataset: 50000 samples
Eval dataset: 500 samples


## Cell 7: Training Setup

In [None]:

os.makedirs(CONFIG["output_dir"], exist_ok=True)
os.makedirs("evaluation", exist_ok=True)

training_args = Seq2SeqTrainingArguments(
    output_dir=CONFIG["output_dir"],
    num_train_epochs=CONFIG["num_epochs"],
    per_device_train_batch_size=CONFIG["batch_size"],
    per_device_eval_batch_size=CONFIG["batch_size"],
    gradient_accumulation_steps=CONFIG["gradient_accumulation_steps"],
    learning_rate=CONFIG["learning_rate"],
    warmup_ratio=CONFIG["warmup_ratio"],
    fp16=CONFIG["fp16"],
    logging_steps=100,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    predict_with_generate=True,
    generation_max_length=CONFIG["max_target_length"],
    report_to="none",
)

data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

effective_batch = CONFIG["batch_size"] * CONFIG["gradient_accumulation_steps"]
total_steps = (len(train_dataset) // effective_batch) * CONFIG["num_epochs"]

print("Training setup complete")
print(f"Epochs: {CONFIG['num_epochs']}")
print(f"Effective batch size: {effective_batch}")
print(f"Total steps: {total_steps}")

Training setup complete
Epochs: 3
Effective batch size: 32
Total steps: 4686


  trainer = Seq2SeqTrainer(


## Cell 8: Train Model

In [None]:

print("Starting fine-tuning on Latin-English data...")
print("Model already knows Latin - fine-tuning will improve on your specific domain")

train_result = trainer.train()

print(f"Training complete")
print(f"Training time: {train_result.metrics['train_runtime'] / 60:.1f} minutes")
print(f"Final loss: {train_result.metrics['train_loss']:.4f}")

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.


Starting fine-tuning on Latin-English data...
Model already knows Latin - fine-tuning will improve on your specific domain


Epoch,Training Loss,Validation Loss
1,0.4666,0.442329
2,0.4326,0.422135
3,0.4371,0.416576


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.encoder.embed_positions.weight', 'model.decoder.embed_tokens.weight', 'model.decoder.embed_positions.weight', 'lm_head.weight'].


Training complete
Training time: 17.5 minutes
Final loss: 0.4805


## Cell 9: Save Model

In [None]:

trainer.save_model(CONFIG["output_dir"])
tokenizer.save_pretrained(CONFIG["output_dir"])

print(f"Model saved to: {CONFIG['output_dir']}")
print(f"Files: {os.listdir(CONFIG['output_dir'])}")

Model saved to: models/opus-mt-latin
Files: ['training_args.bin', 'model.safetensors', 'source.spm', 'generation_config.json', 'vocab.json', 'checkpoint-4689', 'special_tokens_map.json', 'tokenizer_config.json', 'checkpoint-1563', 'config.json', 'target.spm', 'checkpoint-3126']


## Cell 10: Load Fine-tuned Model

In [None]:

del model
torch.cuda.empty_cache()

model = MarianMTModel.from_pretrained(CONFIG["output_dir"])
tokenizer = MarianTokenizer.from_pretrained(CONFIG["output_dir"])
model = model.to("cuda" if torch.cuda.is_available() else "cpu")
model.eval()

print("Fine-tuned model loaded")

Fine-tuned model loaded




## Cell 11: Test Fine-tuned Translations

In [None]:

print("FINE-TUNED Model Translations:")
device = "cuda" if torch.cuda.is_available() else "cpu"
for latin in test_sentences:
    english = translate(latin, model, tokenizer, device)[0]
    print(f"Latin: {latin}")
    print(f"English: {english}")
    print()

FINE-TUNED Model Translations:
Latin: Gallia est omnis divisa in partes tres.
English: All Gaul is divided into three parts.

Latin: Veni, vidi, vici.
English: Come, I have seen, I have overcome.

Latin: Cogito, ergo sum.
English: I suppose I am, then I am.

Latin: In principio creavit Deus caelum et terram.
English: In the beginning God created heaven and earth.

Latin: Alea iacta est.
English: Alas, she was shot.



## Cell 12: Generate All Predictions

In [None]:

test_data = load_test_data(CONFIG["csv_file"])
predictions = []

print(f"Generating predictions for {len(test_data)} samples...")

batch_size = 16
for i in tqdm(range(0, len(test_data), batch_size), desc="Translating"):
    batch = test_data[i:i+batch_size]
    latin_texts = [item["latin"] for item in batch]

    translated = translate(latin_texts, model, tokenizer, device)

    for j, item in enumerate(batch):
        predictions.append({
            "latin": item["latin"],
            "reference": item["english"],
            "prediction": translated[j],
        })

with open(CONFIG["predictions_file"], "w", encoding="utf-8") as f:
    for pred in predictions:
        f.write(json.dumps(pred, ensure_ascii=False) + "\n")

print(f"Predictions saved to: {CONFIG['predictions_file']}")

Generating predictions for 2028 samples...


Translating: 100%|██████████| 127/127 [01:50<00:00,  1.15it/s]

Predictions saved to: evaluation/opus_mt_predictions.jsonl





## Cell 13: Calculate BLEU and chrF

In [None]:
!pip install sacrebleu

Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/51.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Downloading portalocker-3.2.0-py3-none-any.whl (22 kB)
Installing collected packages: portalocker, colorama, sacrebleu
Successfully installed colorama-0.4.6 portalocker-3.2.0 sacrebleu-2.5.1


In [None]:
import sacrebleu
import re
import unicodedata

def clean_text(text):
    text = text.replace("▁", " ")
    text = unicodedata.normalize("NFKC", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

references_clean = [[clean_text(p["reference"])] for p in predictions]
hypotheses_clean = [clean_text(p["prediction"]) for p in predictions]

bleu = sacrebleu.corpus_bleu(hypotheses_clean, references_clean)
chrf = sacrebleu.corpus_chrf(hypotheses_clean, references_clean)

print(f"BLEU score: {bleu.score:.2f}")
print(f"chrF score: {chrf.score:.2f}")

print()
print("Sample translations:")
for i in range(3):
    print(f"Latin: {predictions[i]['latin'][:60]}...")
    print(f"Ref: {predictions[i]['reference'][:60]}...")
    print(f"Pred: {hypotheses_clean[i][:60]}...")
    print()

BLEU score: 21.22
chrF score: 32.87

Sample translations:
Latin: Tibi autem, qui sapis, quam potest denuntio ipsi mihi indice...
Pred: But the boy, who is able to tell me what you know of them, y...

Latin: currens autem vir de Beniamin ex acie venit in Silo in die i...
Ref: And there ran a man of Benjamin out of the army, and came to...
Pred: And there ran a man of Benjamin out of the army, and came to...

Latin: Exposuit quid iniquitas loci posset, quid ipse ad Avaricum s...
Ref: and stated, what the disadvantage of the ground could effect...
Pred: He explained what the crime of the place could have done, wh...



## Cell 14: Set Quality Score

In [None]:

# Using chrF as quality metric (most reliable)
comet_score = chrf.score / 100  # Normalize to 0-1 scale

print(f"Quality score (chrF normalized): {comet_score:.4f}")

Quality score (chrF normalized): 0.3287


## Cell 15: Final Results

In [None]:
metrics = {
    "model": "Helsinki-NLP/opus-mt-itc-en (Fine-tuned)",
    "architecture": "MarianMT (Transformer)",
    "pretrained_on_latin": True,
    "bleu": bleu.score,
    "chrf": chrf.score,
    "quality_score": comet_score,
    "test_samples": len(predictions),
}

with open(CONFIG["metrics_file"], "w") as f:
    json.dump(metrics, f, indent=2)

print(f"Metrics saved to: {CONFIG['metrics_file']}")
print()
print("FINAL COMPARISON:")
print(f"{'Model':<30} {'BLEU':>8} {'chrF':>8} {'Pre-trained Latin?':<20}")
print("-" * 70)
print(f"{'GPT-4o-mini FT':<30} {'29.97':>8} {'52.03':>8} {'Yes (internet data)':<20}")
print(f"{'mBART-50 LoRA':<30} {'37.25':>8} {'30.48':>8} {'No':<20}")
print(f"{'Opus-MT Latin FT':<30} {bleu.score:>8.2f} {chrf.score:>8.2f} {'Yes (pre-trained)':<20}")

print()
if chrf.score > 52.03:
    print("Opus-MT beats GPT-4o-mini!")
elif chrf.score > 30.48:
    print("Opus-MT is better than mBART-50 LoRA")
else:
    print("Results need review")

Metrics saved to: evaluation/opus_mt_metrics.json

FINAL COMPARISON:
Model                              BLEU     chrF Pre-trained Latin?  
----------------------------------------------------------------------
GPT-4o-mini FT                    29.97    52.03 Yes (internet data) 
mBART-50 LoRA                     37.25    30.48 No                  
Opus-MT Latin FT                  21.22    32.87 Yes (pre-trained)   

Opus-MT is better than mBART-50 LoRA


## Cell 16: Save to Google Drive

In [None]:
import shutil

from google.colab import drive
drive.mount('/content/drive')

DRIVE_BASE = "/content/drive/MyDrive/LatinTranslation"
DRIVE_EVAL = f"{DRIVE_BASE}/evaluation"
DRIVE_MODEL = f"{DRIVE_BASE}/opus_mt_model"

os.makedirs(DRIVE_EVAL, exist_ok=True)
os.makedirs(DRIVE_MODEL, exist_ok=True)

# Save model
if os.path.exists(CONFIG["output_dir"]):
    print("Copying model to Google Drive...")
    if os.path.exists(DRIVE_MODEL):
        shutil.rmtree(DRIVE_MODEL)
    shutil.copytree(CONFIG["output_dir"], DRIVE_MODEL)
    print(f"Model saved to: {DRIVE_MODEL}")

# Save predictions
if os.path.exists(CONFIG["predictions_file"]):
    shutil.copy(CONFIG["predictions_file"], f"{DRIVE_EVAL}/opus_mt_predictions.jsonl")
    print("Predictions saved")

# Save metrics
if os.path.exists(CONFIG["metrics_file"]):
    shutil.copy(CONFIG["metrics_file"], f"{DRIVE_EVAL}/opus_mt_metrics.json")
    print("Metrics saved")

# Update comparison
comparison_path = f"{DRIVE_EVAL}/model_comparison.json"
if os.path.exists(comparison_path):
    with open(comparison_path, 'r') as f:
        comparison = json.load(f)
else:
    comparison = {}

comparison["opus_mt_latin"] = {
    "model": "Helsinki-NLP/opus-mt-itc-en",
    "bleu": bleu.score,
    "chrf": chrf.score,
    "pretrained_latin": True,
    "test_samples": len(predictions),
}

with open(comparison_path, 'w') as f:
    json.dump(comparison, f, indent=2)

print()
print("All saved to Google Drive")
print(f"Model: {DRIVE_MODEL}")
print(f"Evaluation: {DRIVE_EVAL}")

Mounted at /content/drive
Copying model to Google Drive...
Model saved to: /content/drive/MyDrive/LatinTranslation/opus_mt_model
Predictions saved
Metrics saved

All saved to Google Drive
Model: /content/drive/MyDrive/LatinTranslation/opus_mt_model
Evaluation: /content/drive/MyDrive/LatinTranslation/evaluation
