In [None]:
!pip install transformers datasets evaluate --quiet

In [None]:
from google.colab import files

# Select your local `damaged_latin_dataset.jsonl` in the upload dialog.
# The next cell opens the file by that exact name, so do not rename it.
uploaded = files.upload()

In [None]:
import json

# Sanity-check the uploaded JSONL file before building a dataset from it.
with open("damaged_latin_dataset.jsonl", "r", encoding="utf-8") as f:
    lines = f.readlines()

print(f"Total lines: {len(lines)}")

# Only the first five lines are inspected; this is a quick smoke test, not a
# full validation of the file.
for i, line in enumerate(lines[:5]):
    if not line.strip():
        # The dataset loader skips blank lines, so they are not reported as
        # JSON errors here either.
        print(f"Line {i+1} is blank (ignored)")
        continue
    try:
        record = json.loads(line)
        print(f"Line {i+1} OK:", record)
    except json.JSONDecodeError as err:
        # Include the parser's message so the problem can be located.
        print(f"Line {i+1} NOT valid JSON ({err}):", line.rstrip("\n"))

In [None]:
import json
from datasets import Dataset

from google.colab import drive
drive.mount('/content/drive')

file_path = "/content/drive/MyDrive/damaged_latin_dataset.jsonl"

# Read the JSONL file into a list of records, one JSON object per non-blank
# line. A malformed line aborts with the file name and line number so it can
# be fixed at the source.
data = []
with open(file_path, "r", encoding="utf-8") as f:
    for line_no, line in enumerate(f, start=1):
        if not line.strip():
            continue
        try:
            data.append(json.loads(line))
        except json.JSONDecodeError as err:
            raise ValueError(f"{file_path}:{line_no}: invalid JSON ({err})") from err

# Fail early with a clear message instead of an IndexError on dataset[0].
if not data:
    raise ValueError(f"No records found in {file_path}")

# Convert to Hugging Face Dataset
dataset = Dataset.from_list(data)

print(dataset[0])


In [None]:
from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM

# Token budgets applied when tokenizing the damaged input and the ground-truth
# target, respectively.
max_input_length = max_target_length = 128

# One checkpoint name drives both the model and its tokenizer.
model_name = "facebook/bart-base"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

def preprocess(example):
    """Tokenize damaged text as model input and ground truth as labels.

    Args:
        example: A mapping with "damaged_text" and "ground_truth" entries.
            Each entry may be a single string or a list of strings (the
            latter is what `Dataset.map(..., batched=True)` passes in).

    Returns:
        The tokenizer output for the damaged text, with an added "labels"
        entry holding the target token ids. Padding positions in the labels
        are replaced by -100 so the loss ignores them.
    """
    model_input = tokenizer(
        example["damaged_text"],
        max_length=max_input_length,
        truncation=True,
        padding="max_length"
    )
    # `text_target=` replaces the deprecated `as_target_tokenizer()` context
    # manager for tokenizing targets.
    labels = tokenizer(
        text_target=example["ground_truth"],
        max_length=max_target_length,
        truncation=True,
        padding="max_length"
    )

    pad_id = tokenizer.pad_token_id

    def _mask_padding(ids):
        # Targets are padded to a fixed length; without masking, most of the
        # training loss would come from predicting padding tokens.
        return [-100 if token == pad_id else token for token in ids]

    label_ids = labels["input_ids"]
    if label_ids and isinstance(label_ids[0], (list, tuple)):
        # Batched call: one id sequence per example.
        model_input["labels"] = [_mask_padding(ids) for ids in label_ids]
    else:
        # Single example: a flat id sequence.
        model_input["labels"] = _mask_padding(label_ids)
    return model_input

# Tokenize the whole dataset. With batched=True, `preprocess` is handed batches
# of examples rather than one example at a time.
tokenized_dataset = dataset.map(preprocess, batched=True)

In [None]:
!pip install -q transformers

In [None]:
from transformers import AutoModelForSeq2SeqLM, TrainingArguments, Trainer

# Reload the pretrained checkpoint so that re-running this cell restarts
# fine-tuning from the original weights rather than continuing a previous run.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Hold out 10% of the examples for evaluation. Evaluating on the training set
# itself yields a validation loss that cannot reveal overfitting. The fixed
# seed keeps the split reproducible across runs.
# train_test_split needs at least one example on each side, hence the guard.
if len(tokenized_dataset) >= 2:
    _splits = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
    train_split, eval_split = _splits["train"], _splits["test"]
else:
    # Too few examples to hold any out; fall back to evaluating on the
    # training data.
    train_split = eval_split = tokenized_dataset

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",       # evaluate once per epoch
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="no",          # no checkpoints are written to output_dir
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=eval_split,
)

trainer.train()

Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


In [None]:
import evaluate

# BLEU scorer used by compute_metrics below.
bleu = evaluate.load("bleu")
# NOTE(review): `accuracy` is loaded here, but compute_metrics below computes
# exact-match accuracy by hand and never calls this object.
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Compute BLEU and exact-match accuracy for a batch of predictions.

    Args:
        eval_pred: A `(predictions, labels)` pair. `predictions` may be
            token ids of shape (batch, seq), logits of shape
            (batch, seq, vocab), or a tuple whose first element is one of
            those. `labels` are token ids, optionally using -100 for
            positions to ignore.

    Returns:
        A dict with "bleu" (score reported by the `bleu` metric) and
        "accuracy" (fraction of predictions equal to their label after
        stripping surrounding whitespace).
    """
    import numpy as np  # local import keeps this cell self-contained

    predictions, labels = eval_pred
    # Some models return extra arrays next to the logits; the first entry
    # holds the logits.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)

    pad_id = tokenizer.pad_token_id
    ignored = labels == -100

    if predictions.ndim == 3:
        # Logits: pick the most likely token at every position.
        predictions = predictions.argmax(axis=-1)
        if predictions.shape == labels.shape:
            # Positions the loss ignores carry arbitrary predictions; blank
            # them so they do not leak into the decoded text.
            predictions = np.where(ignored, pad_id, predictions)

    # -100 is not a real token id and cannot be decoded.
    predictions = np.where(predictions == -100, pad_id, predictions)
    labels = np.where(ignored, pad_id, labels)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Nothing to score: avoid dividing by zero below.
    if not decoded_preds:
        return {"bleu": 0.0, "accuracy": 0.0}

    # BLEU
    bleu_score = bleu.compute(predictions=decoded_preds, references=[[l] for l in decoded_labels])

    # Accuracy (Exact Match)
    exact_matches = sum(
        1 for pred, label in zip(decoded_preds, decoded_labels) if pred.strip() == label.strip()
    )
    acc_score = exact_matches / len(decoded_preds)

    return {"bleu": bleu_score["bleu"], "accuracy": acc_score}

# Attach the metric function to the already-constructed trainer.
# NOTE(review): in cell order this runs after trainer.train(), so the metrics
# presumably only apply to evaluation calls made after this point — confirm.
trainer.compute_metrics = compute_metrics

In [None]:
from difflib import SequenceMatcher

def highlight_insertions(damaged, reconstructed):
    """Return `reconstructed` with the words that differ from `damaged` coloured.

    Both texts are split on whitespace and aligned word by word. Words of the
    reconstruction that were inserted or that replace damaged words are wrapped
    in ANSI blue escape codes; unchanged words are passed through as-is. Words
    present only in the damaged text do not appear in the result.

    Args:
        damaged: The original (damaged) text.
        reconstructed: The model's reconstruction of that text.

    Returns:
        The reconstructed words joined by single spaces, with changed words
        highlighted.
    """
    source_words = damaged.split()
    output_words = reconstructed.split()
    pieces = []
    for op, _a0, _a1, b0, b1 in SequenceMatcher(None, source_words, output_words).get_opcodes():
        if op == "delete":
            # Nothing from the reconstruction corresponds to a deletion.
            continue
        segment = output_words[b0:b1]
        if op != "equal":
            # "insert" and "replace" both introduce new words: colour them blue.
            segment = [f"\033[94m{word}\033[0m" for word in segment]
        pieces += segment
    return " ".join(pieces)

In [None]:
import torch

def reconstruct(text, max_new_tokens=50):
    """Generate a reconstruction of `text` with the notebook's global model.

    Args:
        text: The damaged text to reconstruct.
        max_new_tokens: Upper bound on the number of generated tokens.
            Defaults to 50, the value this function used before it became
            configurable.

    Returns:
        The decoded first generated sequence, with special tokens removed.

    Side effects:
        Puts the global `model` into evaluation mode.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

    # Move inputs to the same device as model
    device = next(model.parameters()).device
    inputs = {key: val.to(device) for key, val in inputs.items()}

    # Make sure dropout is disabled; nothing else guarantees the model is in
    # eval mode at this point.
    model.eval()

    output = model.generate(**inputs, max_new_tokens=max_new_tokens)
    return tokenizer.decode(output[0], skip_special_tokens=True)

In [None]:
import os
from getpass import getpass

import google.generativeai as genai

# SECURITY: never store an API key in the notebook itself, since anyone with
# access to the file can use it. A key was previously hard-coded in this cell;
# treat that key as leaked and revoke it.
# The key is read from the GOOGLE_API_KEY environment variable when set, and
# otherwise requested interactively without echoing it to the output.
api_key = os.environ.get("GOOGLE_API_KEY") or getpass("Gemini API key: ")
genai.configure(api_key=api_key)

In [None]:
def translate_with_gemini(text):
    """Ask the Gemini model for an English translation of `text`.

    Args:
        text: Latin text to translate; it is inserted verbatim into the prompt.

    Returns:
        The text of the model's response with surrounding whitespace removed.
    """
    prompt = f"""You are a Latin language expert. Translate the following Latin (possibly with corrupted Cyrillic letters) into English:

    Latin: {text}

    English:"""

    # Named `gemini` so it is not confused with the notebook's global
    # seq2seq `model`.
    gemini = genai.GenerativeModel("gemini-2.0-flash")
    reply = gemini.generate_content(prompt)
    return reply.text.strip()

In [None]:
# damaged = "Zаmfіrаkе. nꙋ'lꙋ꙼ аltъ"
# reconstructed = reconstruct(damaged)

# print("Damaged Text:", damaged)
# print("Reconstructed:", reconstructed)
# print("Highlighted Insertions:", highlight_insertions(damaged, reconstructed))

In [None]:
# End-to-end demo: reconstruct a damaged sentence, translate the
# reconstruction, then print every stage on its own line.
damaged = "Gallia est omnis divisa"

reconstructed = reconstruct(damaged)
translation = translate_with_gemini(reconstructed)

print(
    f"🟥 Damaged Text: {damaged}",
    f"🟩 Reconstructed: {reconstructed}",
    f"Highlighted Insertions: {highlight_insertions(damaged, reconstructed)}",
    f"🟦 English Translation: {translation}",
    sep="\n",
)