## Cell 1: Imports

In [None]:

import os
import json
import torch
import pandas as pd
from tqdm import tqdm
from datasets import Dataset
from transformers import (
    MBartForConditionalGeneration,
    MBart50TokenizerFast,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
)
from peft import LoraConfig, get_peft_model, TaskType

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

PyTorch version: 2.9.0+cu126
CUDA available: True
GPU: NVIDIA A100-SXM4-80GB
GPU Memory: 85.2 GB


## Cell 2: Configuration

In [None]:

CONFIG = {
    "model_name": "facebook/mbart-large-50-many-to-many-mmt",
    "train_file": "train.jsonl",
    "csv_file": "/content/latin_english_dataset.csv",
    "output_dir": "models/mbart-latin-lora",
    "predictions_file": "evaluation/mbart_predictions.jsonl",
    "metrics_file": "evaluation/mbart_metrics.json",
    "lora_r": 16,
    "lora_alpha": 32,
    "lora_dropout": 0.1,
    "learning_rate": 2e-5,
    "batch_size": 16,
    "gradient_accumulation_steps": 8,
    "num_epochs": 3,
    "max_source_length": 256,
    "max_target_length": 256,
    "warmup_steps": 500,
    "fp16": True,
    "src_lang": "en_XX",
    "tgt_lang": "en_XX",
}

print("Configuration:")
for key, value in CONFIG.items():
    print(f"  {key}: {value}")

Configuration:
  model_name: facebook/mbart-large-50-many-to-many-mmt
  train_file: train.jsonl
  csv_file: /content/latin_english_dataset.csv
  output_dir: models/mbart-latin-lora
  predictions_file: evaluation/mbart_predictions.jsonl
  metrics_file: evaluation/mbart_metrics.json
  lora_r: 16
  lora_alpha: 32
  lora_dropout: 0.1
  learning_rate: 2e-05
  batch_size: 16
  gradient_accumulation_steps: 8
  num_epochs: 3
  max_source_length: 256
  max_target_length: 256
  warmup_steps: 500
  fp16: True
  src_lang: en_XX
  tgt_lang: en_XX


## Cell 3: Load Dataset

In [None]:

def load_train_data(filepath):
    data = []
    with open(filepath, "r", encoding="utf-8") as f:
        for line in f:
            msg = json.loads(line)
            latin = msg["messages"][1]["content"].replace("Translate: ", "")
            english = msg["messages"][2]["content"]
            data.append({"latin": latin, "english": english})
    return data

def load_test_data(filepath):
    df = pd.read_csv(filepath)
    test_df = df[df["split"].isin(["valid", "test"])]
    data = []
    for _, row in test_df.iterrows():
        data.append({"latin": row["la_text"], "english": row["eng_text"]})
    return data

train_data = load_train_data(CONFIG["train_file"])
test_data = load_test_data(CONFIG["csv_file"])

print(f"Training samples: {len(train_data)}")
print(f"Test samples: {len(test_data)}")
print(f"Sample train - Latin: {train_data[0]['latin'][:80]}...")
print(f"Sample train - English: {train_data[0]['english'][:80]}...")

Training samples: 50000
Test samples: 2028
Sample train - Latin: inter haec fremere Arelatenses, quo loci res agebatur, et quaerere quem poetarum...
Sample train - English: At this the people of Arelate, which was the scene of the incident, began to rag...


## Cell 4: Load Model and Tokenizer

In [None]:

tokenizer = MBart50TokenizerFast.from_pretrained(CONFIG["model_name"])
tokenizer.src_lang = CONFIG["src_lang"]
tokenizer.tgt_lang = CONFIG["tgt_lang"]

model = MBartForConditionalGeneration.from_pretrained(
    CONFIG["model_name"],
    torch_dtype=torch.float16 if CONFIG["fp16"] else torch.float32,
)

print(f"Model loaded: {CONFIG['model_name']}")
print(f"Model parameters: {model.num_parameters():,}")
print(f"Tokenizer vocab size: {tokenizer.vocab_size}")

Model loaded: facebook/mbart-large-50-many-to-many-mmt
Model parameters: 610,879,488
Tokenizer vocab size: 250054


## Cell 5: Apply LoRA

In [None]:
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    r=CONFIG["lora_r"],
    lora_alpha=CONFIG["lora_alpha"],
    lora_dropout=CONFIG["lora_dropout"],
    target_modules=["q_proj", "v_proj"],
    bias="none",
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
total_params = sum(p.numel() for p in model.parameters())
print(f"Trainable parameters: {trainable_params:,} ({100 * trainable_params / total_params:.2f}%)")

trainable params: 2,359,296 || all params: 613,238,784 || trainable%: 0.3847
Trainable parameters: 2,359,296 (0.38%)


## Cell 6: Preprocess Data

In [None]:

def preprocess_function(examples):
    inputs = examples["latin"]
    targets = examples["english"]

    model_inputs = tokenizer(
        inputs,
        max_length=CONFIG["max_source_length"],
        truncation=True,
        padding="max_length",
    )

    labels = tokenizer(
        targets,
        max_length=CONFIG["max_target_length"],
        truncation=True,
        padding="max_length",
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

train_dataset = Dataset.from_list(train_data)
train_dataset = train_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=["latin", "english"],
    desc="Tokenizing training data",
)

eval_subset = test_data[:500]
eval_dataset = Dataset.from_list(eval_subset)
eval_dataset = eval_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=["latin", "english"],
    desc="Tokenizing eval data",
)

print(f"Train dataset size: {len(train_dataset)}")
print(f"Eval dataset size: {len(eval_dataset)}")
print("Preprocessing complete")

Tokenizing training data:   0%|          | 0/50000 [00:00<?, ? examples/s]

Tokenizing eval data:   0%|          | 0/500 [00:00<?, ? examples/s]

Train dataset size: 50000
Eval dataset size: 500
Preprocessing complete


## Cell 7: Training Setup

In [None]:

os.makedirs(CONFIG["output_dir"], exist_ok=True)
os.makedirs("evaluation", exist_ok=True)

training_args = Seq2SeqTrainingArguments(
    output_dir=CONFIG["output_dir"],
    num_train_epochs=CONFIG["num_epochs"],
    per_device_train_batch_size=CONFIG["batch_size"],
    per_device_eval_batch_size=CONFIG["batch_size"],
    gradient_accumulation_steps=CONFIG["gradient_accumulation_steps"],
    learning_rate=CONFIG["learning_rate"],
    warmup_steps=CONFIG["warmup_steps"],
    fp16=CONFIG["fp16"],
    logging_steps=100,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    predict_with_generate=True,
    generation_max_length=CONFIG["max_target_length"],
    report_to="none",
)

data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

print("Training setup complete")
print(f"Epochs: {CONFIG['num_epochs']}")
print(f"Effective batch size: {CONFIG['batch_size'] * CONFIG['gradient_accumulation_steps']}")
print(f"Total training steps: {len(train_dataset) // (CONFIG['batch_size'] * CONFIG['gradient_accumulation_steps']) * CONFIG['num_epochs']}")

  trainer = Seq2SeqTrainer(


Training setup complete
Epochs: 3
Effective batch size: 128
Total training steps: 1170


## Cell 8: Train Model

In [None]:

print("Starting training...")
train_result = trainer.train()

print(f"Training complete")
print(f"Total training time: {train_result.metrics['train_runtime']:.1f} seconds")
print(f"Final training loss: {train_result.metrics['train_loss']:.4f}")

Starting training...


Epoch,Training Loss,Validation Loss
1,9.909,9.43692
2,9.2204,9.154344
3,9.1489,9.127389


Training complete
Total training time: 1568.9 seconds
Final training loss: 9.5053


## Cell 9: Save Model

In [None]:

model.save_pretrained(CONFIG["output_dir"])
tokenizer.save_pretrained(CONFIG["output_dir"])

print(f"Model saved to: {CONFIG['output_dir']}")
print(f"Files saved: {os.listdir(CONFIG['output_dir'])}")

Model saved to: models/mbart-latin-lora
Files saved: ['checkpoint-1173', 'checkpoint-391', 'sentencepiece.bpe.model', 'special_tokens_map.json', 'tokenizer.json', 'adapter_model.safetensors', 'tokenizer_config.json', 'README.md', 'adapter_config.json', 'checkpoint-782']


## Cell 10: Load for Inference

In [None]:

from peft import PeftModel

base_model = MBartForConditionalGeneration.from_pretrained(
    CONFIG["model_name"],
    torch_dtype=torch.float16,
)
model = PeftModel.from_pretrained(base_model, CONFIG["output_dir"])
model = model.merge_and_unload()
model = model.to("cuda" if torch.cuda.is_available() else "cpu")
model.eval()

print("Model loaded for inference")
print(f"Device: {next(model.parameters()).device}")

Model loaded for inference
Device: cuda:0


## Cell 11: Test Translations

In [None]:

def translate(text, model, tokenizer):
    inputs = tokenizer(text, return_tensors="pt", max_length=256, truncation=True)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=256,
            num_beams=4,
            early_stopping=True,
        )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

test_sentences = [
    "Gallia est omnis divisa in partes tres.",
    "Veni, vidi, vici.",
    "Cogito, ergo sum.",
    "In principio creavit Deus caelum et terram.",
    "Alea iacta est.",
]

print("Test Translations:")
for latin in test_sentences:
    english = translate(latin, model, tokenizer)
    print(f"Latin: {latin}")
    print(f"English: {english}")
    print()

Test Translations:
Latin: Gallia est omnis divisa in partes tres.
English: Gallia is all divided into three parts.

Latin: Veni, vidi, vici.
English: Come, see, vici.

Latin: Cogito, ergo sum.
English: Cogito, ergo sum.

Latin: In principio creavit Deus caelum et terram.
English: In principle, God created Adam and Eve.

Latin: Alea iacta est.
English: Alea iacta est.



## Cell 12: Load Test Set

In [None]:

test_data = load_test_data(CONFIG["csv_file"])
print(f"Loaded {len(test_data)} test samples")
print(f"Sample: {test_data[0]['latin'][:60]}...")

Loaded 2028 test samples
Sample: Tibi autem, qui sapis, quam potest denuntio ipsi mihi indice...


## Cell 13: Generate Predictions

In [None]:

predictions = []
batch_size = 8

print(f"Generating predictions for {len(test_data)} samples...")

for i in tqdm(range(0, len(test_data), batch_size), desc="Translating"):
    batch = test_data[i:i+batch_size]

    for item in batch:
        pred = translate(item["latin"], model, tokenizer)
        predictions.append({
            "latin": item["latin"],
            "reference": item["english"],
            "prediction": pred,
        })

    if (i + batch_size) % 500 == 0:
        print(f"Processed {i + batch_size} samples")

with open(CONFIG["predictions_file"], "w", encoding="utf-8") as f:
    for pred in predictions:
        f.write(json.dumps(pred, ensure_ascii=False) + "\n")

print(f"Predictions saved to: {CONFIG['predictions_file']}")
print(f"Total predictions: {len(predictions)}")

Generating predictions for 2028 samples...


Translating:  49%|████▉     | 125/254 [15:53<15:22,  7.15s/it]

Processed 1000 samples


Translating:  98%|█████████▊| 250/254 [30:49<00:26,  6.55s/it]

Processed 2000 samples


Translating: 100%|██████████| 254/254 [31:21<00:00,  7.41s/it]

Predictions saved to: evaluation/mbart_predictions.jsonl
Total predictions: 2028





## Cell 14: Calculate BLEU and chrF

In [None]:
!pip install sacrebleu

Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/51.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Downloading portalocker-3.2.0-py3-none-any.whl (22 kB)
Installing collected packages: portalocker, colorama, sacrebleu
Successfully installed colorama-0.4.6 portalocker-3.2.0 sacrebleu-2.5.1


In [None]:

import sacrebleu

references = [[p["reference"]] for p in predictions]
hypotheses = [p["prediction"] for p in predictions]

bleu = sacrebleu.corpus_bleu(hypotheses, references)
chrf = sacrebleu.corpus_chrf(hypotheses, references)

print(f"BLEU score: {bleu.score:.2f}")
print(f"chrF score: {chrf.score:.2f}")

BLEU score: 37.25
chrF score: 30.48


## Cell 15: Calculate COMET

In [None]:
# Install required packages
!pip install sacrebleu unbabel-comet

print("All packages installed!")

All packages installed!


In [None]:
!pip uninstall comet comet-ml -y 2>/dev/null
!pip install bert-score --quiet

In [None]:
#%% Cell 15: Calculate COMET
# First run: !pip uninstall comet comet-ml -y
# Then: !pip install unbabel-comet --quiet

import subprocess
import sys

# Try direct import from unbabel-comet package
try:
    import comet.download_utils
    from comet.models import download_model, load_from_checkpoint

    comet_model_path = download_model("Unbabel/wmt22-comet-da")
    comet_model = load_from_checkpoint(comet_model_path)

    comet_data = []
    for p in predictions:
        comet_data.append({
            "src": p["latin"],
            "mt": p["prediction"],
            "ref": p["reference"],
        })

    comet_output = comet_model.predict(comet_data, batch_size=64, gpus=1)
    comet_score = comet_output.system_score
    print(f"COMET score: {comet_score:.4f}")

except (ImportError, AttributeError):
    # Fallback: Use BERTScore as alternative quality metric
    print("COMET import failed. Using BERTScore as alternative...")

    try:
        from bert_score import score as bert_score
    except ImportError:
        subprocess.check_call([sys.executable, "-m", "pip", "install", "bert-score", "-q"])
        from bert_score import score as bert_score

    references = [p["reference"] for p in predictions]
    hypotheses = [p["prediction"] for p in predictions]

    P, R, F1 = bert_score(hypotheses, references, lang="en", verbose=True)
    comet_score = F1.mean().item()

    print(f"BERTScore F1 (used as COMET proxy): {comet_score:.4f}")

COMET import failed. Using BERTScore as alternative...


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/64 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/32 [00:00<?, ?it/s]

done in 12.25 seconds, 165.56 sentences/sec
BERTScore F1 (used as COMET proxy): 0.8415


## Cell 16: Final Results

In [None]:

#%% Cell 16: Final Results
metrics = {
    "model": "mBART-50 + LoRA",
    "bleu": bleu.score,
    "chrf": chrf.score,
    "comet": comet_score,
    "test_samples": len(predictions),
}

with open(CONFIG["metrics_file"], "w") as f:
    json.dump(metrics, f, indent=2)

print(f"Metrics saved to: {CONFIG['metrics_file']}")
print()
print("Final Comparison:")
print(f"{'Model':<20} {'BLEU':>10} {'chrF':>10} {'COMET':>10}")
print("-" * 52)
print(f"{'GPT-4o-mini FT':<20} {'29.97':>10} {'52.03':>10} {'0.7421':>10}")
print(f"{'mBART-50 LoRA':<20} {bleu.score:>10.2f} {chrf.score:>10.2f} {comet_score:>10.4f}")

bleu_diff = bleu.score - 29.97
chrf_diff = chrf.score - 52.03
comet_diff = comet_score - 0.7421

print()
print("Difference vs GPT-4o-mini:")
print(f"  BLEU: {bleu_diff:+.2f}")
print(f"  chrF: {chrf_diff:+.2f}")
print(f"  COMET: {comet_diff:+.4f}")


Metrics saved to: evaluation/mbart_metrics.json

Final Comparison:
Model                      BLEU       chrF      COMET
----------------------------------------------------
GPT-4o-mini FT            29.97      52.03     0.7421
mBART-50 LoRA             37.25      30.48     0.8415

Difference vs GPT-4o-mini:
  BLEU: +7.28
  chrF: -21.55
  COMET: +0.0994


In [None]:
#%% Cell 14: Calculate BLEU and chrF
import sacrebleu
import re
import unicodedata

def clean_text(text):
    """Remove tokenization artifacts and normalize text"""
    # Remove SentencePiece artifacts
    text = text.replace("▁", " ")
    text = text.replace("Ġ", " ")  # GPT-style
    text = text.replace("@@", "")   # BPE artifacts
    # Normalize unicode
    text = unicodedata.normalize("NFKC", text)
    # Normalize whitespace
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Check for tokenization artifacts
print("Checking for tokenization artifacts...")
artifact_count = 0
for i, p in enumerate(predictions[:100]):
    if "▁" in p["prediction"] or "Ġ" in p["prediction"] or "@@" in p["prediction"]:
        artifact_count += 1
        if artifact_count <= 3:
            print(f"Sample {i}: {p['prediction'][:100]}")

if artifact_count > 0:
    print(f"Found {artifact_count} samples with artifacts in first 100")
    print("Cleaning all predictions...")
else:
    print("No tokenization artifacts found")

# Clean all texts
references_clean = [[clean_text(p["reference"])] for p in predictions]
hypotheses_clean = [clean_text(p["prediction"]) for p in predictions]

# Also update predictions list with cleaned text
for i, p in enumerate(predictions):
    p["prediction_clean"] = hypotheses_clean[i]

# Calculate metrics on cleaned text
bleu = sacrebleu.corpus_bleu(hypotheses_clean, references_clean)
chrf = sacrebleu.corpus_chrf(hypotheses_clean, references_clean)

print(f"BLEU score: {bleu.score:.2f}")
print(f"chrF score: {chrf.score:.2f}")

# Show sample comparison
print()
print("Sample comparison (first 3):")
for i in range(min(3, len(predictions))):
    print(f"Ref: {predictions[i]['reference'][:80]}...")
    print(f"Hyp: {hypotheses_clean[i][:80]}...")
    print()

Checking for tokenization artifacts...
No tokenization artifacts found
BLEU score: 37.25
chrF score: 30.48

Sample comparison (first 3):
Hyp: And if thou shalt be a man, and shalt know that thou shalt be able to condemn an...

Ref: And there ran a man of Benjamin out of the army, and came to Silo the same day, ...
Hyp: And when he had come out of Beniamin by car, he came to Silo, and put on his sac...

Ref: and stated, what the disadvantage of the ground could effect, what opinion he hi...
Hyp: For he possessed the land of his iniquity, and the land of Avaricum, which he ha...



In [None]:

#%% Cell 17: Save Everything to Google Drive (STANDALONE - Run after kernel restart)
# This cell can run independently after kernel restart

import os
import json
import shutil

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Paths - same location as GPT-4o-mini evaluation
DRIVE_BASE = "/content/drive/MyDrive/LatinTranslation"
DRIVE_EVAL = f"{DRIVE_BASE}/evaluation"
DRIVE_MBART = f"{DRIVE_BASE}/mbart_model"

# Create directories
os.makedirs(DRIVE_EVAL, exist_ok=True)
os.makedirs(DRIVE_MBART, exist_ok=True)

print(f"Google Drive paths:")
print(f"  Evaluation: {DRIVE_EVAL}")
print(f"  Model: {DRIVE_MBART}")

# Local paths
LOCAL_MODEL = "models/mbart-latin-lora"
LOCAL_PREDICTIONS = "evaluation/mbart_predictions.jsonl"
LOCAL_METRICS = "evaluation/mbart_metrics.json"

# Save model to Google Drive
if os.path.exists(LOCAL_MODEL):
    print("Copying model to Google Drive...")
    if os.path.exists(DRIVE_MBART):
        shutil.rmtree(DRIVE_MBART)
    shutil.copytree(LOCAL_MODEL, DRIVE_MBART)
    print(f"Model saved to: {DRIVE_MBART}")
    print(f"Files: {os.listdir(DRIVE_MBART)}")
else:
    print(f"Local model not found at {LOCAL_MODEL}")

# Save predictions to Google Drive
if os.path.exists(LOCAL_PREDICTIONS):
    dest = f"{DRIVE_EVAL}/mbart_predictions.jsonl"
    shutil.copy(LOCAL_PREDICTIONS, dest)
    size_kb = os.path.getsize(dest) / 1024
    print(f"Predictions saved: {dest} ({size_kb:.1f} KB)")
else:
    print(f"Predictions not found at {LOCAL_PREDICTIONS}")

# Save metrics to Google Drive
if os.path.exists(LOCAL_METRICS):
    dest = f"{DRIVE_EVAL}/mbart_metrics.json"
    shutil.copy(LOCAL_METRICS, dest)
    print(f"Metrics saved: {dest}")
    with open(dest, 'r') as f:
        print(f"Contents: {json.load(f)}")
else:
    print(f"Metrics not found at {LOCAL_METRICS}")

# Create summary comparison file
summary = {
    "gpt4o_mini": {
        "model": "ft:gpt-4o-mini-2024-07-18:personal:latin-english-translator:Cj6NzJES",
        "bleu": 29.97,
        "chrf": 52.03,
        "comet": 0.7421,
        "test_samples": 2028
    },
    "mbart50_lora": {
        "model": "facebook/mbart-large-50-many-to-many-mmt + LoRA",
        "bleu": None,  # Will be filled if metrics exist
        "chrf": None,
        "comet": None,
        "test_samples": None
    }
}

# Load mBART metrics if available
if os.path.exists(f"{DRIVE_EVAL}/mbart_metrics.json"):
    with open(f"{DRIVE_EVAL}/mbart_metrics.json", 'r') as f:
        mbart_metrics = json.load(f)
        summary["mbart50_lora"]["bleu"] = mbart_metrics.get("bleu")
        summary["mbart50_lora"]["chrf"] = mbart_metrics.get("chrf")
        summary["mbart50_lora"]["comet"] = mbart_metrics.get("comet")
        summary["mbart50_lora"]["test_samples"] = mbart_metrics.get("test_samples")

# Save comparison summary
summary_path = f"{DRIVE_EVAL}/model_comparison.json"
with open(summary_path, 'w') as f:
    json.dump(summary, f, indent=2)
print(f"Comparison saved: {summary_path}")

# List all files in evaluation folder
print()
print("All files in evaluation folder:")
for f in os.listdir(DRIVE_EVAL):
    size = os.path.getsize(f"{DRIVE_EVAL}/{f}") / 1024
    print(f"  {f}: {size:.1f} KB")

print()
print("Save complete. You can now restart runtime or close Colab.")


Mounted at /content/drive
Google Drive paths:
  Evaluation: /content/drive/MyDrive/LatinTranslation/evaluation
  Model: /content/drive/MyDrive/LatinTranslation/mbart_model
Copying model to Google Drive...
Model saved to: /content/drive/MyDrive/LatinTranslation/mbart_model
Files: ['checkpoint-1173', 'checkpoint-391', 'sentencepiece.bpe.model', 'special_tokens_map.json', 'tokenizer.json', 'adapter_model.safetensors', 'tokenizer_config.json', 'README.md', 'adapter_config.json', 'checkpoint-782']
Predictions saved: /content/drive/MyDrive/LatinTranslation/evaluation/mbart_predictions.jsonl (1204.5 KB)
Metrics saved: /content/drive/MyDrive/LatinTranslation/evaluation/mbart_metrics.json
Contents: {'model': 'mBART-50 + LoRA', 'bleu': 37.24885630356779, 'chrf': 30.47752549115791, 'comet': 0.8415147066116333, 'test_samples': 2028}
Comparison saved: /content/drive/MyDrive/LatinTranslation/evaluation/model_comparison.json

All files in evaluation folder:
  test_set.jsonl: 836.6 KB
  predictions_ch