# Data Preparation

In [76]:
import os
import glob
import re

import numpy as np
import torch
from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET
import spacy
from transformers import T5ForConditionalGeneration, T5Tokenizer, AdamW
import explacy
from matplotlib import pyplot as plt
from torch.utils.data import Dataset, DataLoader
from cltk.stop.greek.stops import STOPS_LIST
from cltk.corpus.greek.beta_to_unicode import Replacer
from cltk.corpus.greek.alphabet import expand_iota_subscript

## Data Collection

In [25]:
def get_qualified_pairs(xml_paths):
    qualified_pairs = []
    for xml_path in xml_paths:
        match = re.search(r"(?P<name>[^_]*)_(?P<language>gk|eng).(?P<extension>xml)", xml_path)
        if match is None: continue
        if match.group("language") == "eng":
            gk_file = xml_path.replace("eng", "gk")
            if os.path.isfile(gk_file):
                qualified_pairs.append(gk_file)
                qualified_pairs.append(xml_path)
    return qualified_pairs

In [29]:
def perseus_tei_xml_to_text():
    xml_dir = os.path.normpath("./data/Classics/*/*/*.xml")
    xml_paths = glob.glob(xml_dir)

    xml_paths = get_qualified_pairs(xml_paths)
    new_dir = os.path.normpath("./data/Pairs/")
    if not os.path.isdir(new_dir):
        os.makedirs(new_dir)

    for xml_path in xml_paths:
        xml_names = os.path.split(xml_path)
        xml_name = xml_names[1].rstrip(".xml")
        xml_name += ".txt"

        with open(xml_path) as file_open:
            soup = BeautifulSoup(file_open, "lxml")
        title = soup.title
        author = soup.author

        tei_header = soup.find('teiHeader')
        if tei_header:
            tei_header.decompose()

        for tag in soup(['lb', 'pb', 'lpar', 'rpar']):
            tag.decompose()

        body = soup.body
        text = body.get_text()
        new_plain_text_path = os.path.join(new_dir, xml_name)
        with open(new_plain_text_path, "w") as file_open:
            file_open.write(text)

In [30]:
perseus_tei_xml_to_text()

## Data Cleaning



In [25]:
class TranslationDataset(Dataset):
    def __init__(self, data_dir, src_lang, tgt_lang, src_model, tgt_model):
        r = Replacer()
        self.data = []
        for file in os.listdir(data_dir):
            if file.endswith(f"{src_lang}.txt"):
                tgt_file = file.replace(src_lang, tgt_lang)
                if tgt_file in os.listdir(data_dir):
                    with open(os.path.join(data_dir, file), "r") as f_src, open(os.path.join(data_dir, tgt_file), "r") as f_tgt:
                        src_txt = r.beta_code(r"%s", f_src.read().strip())
                        src_sents = [sent.text.strip() for doc in src_model(str(src_txt)) for sent in doc.sents]
                        tgt_sents = [sent.text.strip() for doc in tgt_model(f_src.read().strip()) for sent in doc.sents]
                        for src_sent, tgt_sent in zip(src_sents, tgt_sents):
                            self.data.append((src_sent, tgt_sent))

        self.src_model = src_model
        self.tgt_model = tgt_model

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        src_sent, tgt_sent = self.data[index]

        src_tokens = [token.text for token in self.src_model(src_sent)]
        tgt_tokens = [token.text for token in self.tgt_model(tgt_sent)]

        src_tokens = ["<s>"] + src_tokens + ["</s>"]
        tgt_tokens = ["<s>"] + tgt_tokens + ["</s>"]

        src_ids = torch.tensor(self.src_model.convert_tokens_to_ids(src_tokens))
        tgt_ids = torch.tensor(self.tgt_model.convert_tokens_to_ids(tgt_tokens))

        return {
            "input_ids": src_ids,
            "attention_mask": torch.ones(len(src_ids)),
            "decoder_input_ids": tgt_ids[:-1],
            "decoder_attention_mask": torch.ones(len(tgt_ids)-1),
            "labels": tgt_ids[1:]
        }

In [122]:
class TranslationDataset(Dataset):
    def __init__(self, data_dir, source_lang, target_lang, nlp_src, nlp_tgt, max_seq_len):
        self.nlp_source = nlp_src
        self.nlp_target = nlp_tgt
        self.max_seq_len = max_seq_len

        # Read file pairs from the data directory
        self.file_pairs = self.read_file_pairs(data_dir, source_lang, target_lang)

    def __len__(self):
        return len(self.file_pairs)

    def __getitem__(self, index):
        r = Replacer()
        file_pair = self.file_pairs[index]
        with open(file_pair["source"]) as f_src, open(file_pair["target"], "r") as f_tgt:
            source_text = r.beta_code(r"%s" % f_src.read())
            target_text = f_tgt.read()

        # Tokenize the source and target sentences
        source_tokens = self.preprocess(source_text, self.nlp_source)
        target_tokens = self.preprocess(target_text, self.nlp_target)
        # Add special tokens to the input and output sequences
        input_ids = [self.nlp_source.vocab.strings[source_token.text] for source_token in source_tokens]
        labels = [self.nlp_target.vocab.strings[target_token.text] for target_token in target_tokens]

        # Pad or truncate the input and output sequences to max_seq_len
        if len(input_ids) > self.max_seq_len:
            input_ids = input_ids[:self.max_seq_len]
        else:
            input_ids = input_ids + [0] * (self.max_seq_len - len(input_ids))
        if len(labels) > self.max_seq_len:
            labels = labels[:self.max_seq_len]
        else:
            labels = labels + [0] * (self.max_seq_len - len(labels))

        # Create attention masks to ignore padded tokens
        attention_mask = [1] * len(input_ids)

        # Convert to PyTorch tensors and return as dictionary
        return {
            "input_ids": torch.LongTensor(input_ids),
            "attention_mask": torch.tensor(attention_mask),
            "labels": torch.LongTensor(labels),
        }

    def read_file_pairs(self, data_dir, source_lang, target_lang):
        file_pairs = []
        for filename in os.listdir(data_dir):
            if filename.endswith(f"{source_lang}.txt"):
                source_file = filename
                target_file = filename.replace(f"_{source_lang}", f"_{target_lang}")
                file_pairs.append({"source": data_dir + "/" + source_file, "target": data_dir + "/" + target_file})
        return file_pairs

    def preprocess(self, text, nlp):
        doc = nlp(text)
        return [token for token in doc.sents][0]

In [123]:
# model_name = "t5-small"
# tokenizer = T5Tokenizer.from_pretrained(model_name)
# model = T5ForConditionalGeneration.from_pretrained(model_name)

src_lang = "gk"
tgt_lang = "eng"
nlp_grc = spacy.load("grc_ud_proiel_md")
nlp_eng = spacy.load("en_core_web_md")

data_dir = "./data/Pairs"
translation_dataset = TranslationDataset(data_dir, src_lang, tgt_lang, nlp_grc, nlp_eng, 512)
translation_dataset.__getitem__(1)
#
# batch_size = 8
#
# data_loader = DataLoader(translation_dataset, batch_size=batch_size, shuffle=True)
#
# optimizer = AdamW(model.parameters(), lr=1e-4)
# num_epochs = 5
#
# for epoch in range(num_epochs):
#     model.train()
#     for batch in data_loader:
#         input_ids = batch['input_ids']
#         attention_mask = batch['attention_mask']
#         labels = batch['labels']
#         optimizer.zero_grad()
#         outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
#         loss = outputs.loss
#         loss.backward()
#         optimizer.step()
#         print('Epoch:', epoch, 'Batch loss:', loss.item())
#
#     # Save the model after each epoch
#     save_dir = f'./saved_models/t5_{epoch}.pt'
#     torch.save(model.state_dict(), save_dir)
#
# print('Training complete!')



RuntimeError: Overflow when unpacking long

In [None]:
# def clean_doc(raw_txt):
#     with open(raw_txt, "r") as file:
#         text = file.read()
#     doc = nlp(text)
#     for sent in doc.sents:
#         words = [ word for word in sent if not word in STOPS_LIST ]
print(sample:=[*doc.sents][150])

In [23]:
explacy.print_parse_info(nlp, u"%s" % sample)

Dep tree             Token      Dep type  Lemma      Part of Sp
──────────────────── ────────── ───────── ────────── ──────────
         ┌─────────► πνοαὶ      nsubj     πνοή       NOUN      
         │┌────────► δ’         discourse δ’         ADV       
         ││      ┌─► ἀπὸ        case      ἀπό        ADP       
         ││   ┌─►└── Στρυμόνος  nmod      Στρυμών    PROPN     
         ││┌─►└───── μολοῦσαι   advcl     βλώσκω     VERB      
┌───────►└┴┴──────── 
          advcl     
          CCONJ     
│          ┌─►┌┬┬┬── κακόσχολοι amod      κακόσχολος ADJ       
│          │  │││└─► νήστιδες   conj      νήστις     ADJ       
│          │  ││└──► δύσορμοι   conj      δύσορμος   ADJ       
│          │  │└───► ,          cc        ,          PUNCT     
│          │  └─►┌── 
          conj      
          CCONJ     
│          │     └─► βροτῶν     nmod      βροτός     NOUN      
│   ┌─►┌┬──┴─────┬── ἄλαι       obj       ἄλαι       ADJ       
│   │  ││        └─► ,          dep     

In [18]:
from cltk.corpus.utils.importer import CorpusImporter
corpus_importer = CorpusImporter("latin")
print(corpus_importer.list_corpora)

['latin_text_perseus', 'latin_treebank_perseus', 'latin_text_latin_library', 'phi5', 'phi7', 'latin_proper_names_cltk', 'latin_models_cltk', 'latin_pos_lemmata_cltk', 'latin_treebank_index_thomisticus', 'latin_lexica_perseus', 'latin_training_set_sentence_cltk', 'latin_word2vec_cltk', 'latin_text_antique_digiliblt', 'latin_text_corpus_grammaticorum_latinorum', 'latin_text_poeti_ditalia', 'latin_text_tesserae']


In [19]:
corpus_importer.import_corpus("latin_treebank_perseus")

Downloaded 100% 9.40 MiB | 11.80 MiB/s 

In [21]:
from cltk.corpus.readers import get_corpus_reader
greek_corpus = get_corpus_reader(corpus_name="latin_text_perseus", language="latin")

AttributeError: 'property' object has no attribute 'replace'

In [140]:
from cltk.corpus.greek.beta_to_unicode import Replacer
import spacy

r = Replacer()
nlp = spacy.load("en_core_web_md")
# sen_tokenizer = SentenceTokenizer()
split_pattern = re.compile(r"\n{2,}\s*")
with open("./data/Pairs/aesch.ag_gk.txt") as grc_f, open("./data/Pairs/aesch.ag_eng.txt") as eng_f:
    gk_text = r.beta_code(r"%s" % grc_f.read())
    eng_text = eng_f.read()

gk_pars = split_pattern.split(gk_text)
en_pars = split_pattern.split(eng_text)
pair = list(zip(gk_pars, en_pars))
print(pair[1])
# # gr_sents = sen_tokenizer.tokenize(gk_text)
# en_sents = list(doc.sents)
# gk_sents = list(g_doc.sents)
# print(f"gr{len(gk_sents)}")
# print(f"en{len(en_sents)}")

('θεοὺς μὲν αἰτῶ τῶνδ’ ἀπαλλαγὴν πόνων\nφρουρᾶς ἐτείας μῆκος, ἣν κοιμώμενοσ\nστέγαις Ἀτρειδῶν ἄγκαθεν, κυνὸς δίκην,\nἄστρων κάτοιδα νυκτέρων ὁμήγυριν,\nκαὶ τοὺς φέροντας χεῖμα καὶ θέρος βροτοῖσ\nλαμπροὺς δυνάστας, ἐμπρέποντας αἰθέρι\nἀστέρας, ὅταν φθίνωσιν, ἀντολάς τε τῶν.\n καὶ νῦν φυλάσσω λαμπάδος τό σύμβολον,\nαὐγὴν πυρὸς φέρουσαν ἐκ Τροίας φάτιν\nἁλώσιμόν τε βάξιν· ὧδε γὰρ κρατεῖ\nγυναικὸς ἀνδρόβουλον ἐλπίζον κέαρ.\nεὖτ’ ἂν δὲ νυκτίπλαγκτον ἔνδροσόν τ’ ἔχω\nεὐνὴν ὀνείροις οὐκ ἐπισκοπουμένην\nἐμήν· φόβος γὰρ ἀνθ’ ὕπνου παραστατεῖ,\nτὸ μὴ βεβαίως βλέφαρα συμβαλεῖν ὕπνῳ·\nὅταν δ’ ἀείδειν ἢ μινύρεσθαι δοκῶ,\nὕπνου τόδ’ ἀντίμολπον ἐντέμνων ἄκος,\nκλαίω τότ’ οἴκου τοῦδε συμφορὰν στένων\nοὐχ ὡς τὰ πρόσθ’ ἄριστα διαπονουμένου.\nνῦν δ’ εὐτυχὴς γένοιτ’ ἀπαλλαγὴ πόνων\nεὐαγγέλου φανέντος ὀρφναίου πυρός.\n ὦ χαῖρε λαμπτὴρ νυκτός, ἡμερήσιον\nφάος πιφαύσκων καὶ χορῶν κατάστασιν\nπολλῶν ἐν Ἄργει, τῆσδε συμφορᾶς χάριν.\nἰοὺ ἰού.\n Ἀγαμέμνονος γυναικὶ σημαίνω τορῶσ\nεὐνῆς ἐπαντείλασαν ὡς τάχος δόμο