<a href="https://colab.research.google.com/github/anokhina-rgb/Multilingual-Corpus-for-EU-Studies/blob/main/Lexonomy_Bilingual_Converter.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Step 1: Install required libraries
!pip install -q spacy lxml
!python -m spacy download en_core_web_sm

In [None]:
# Step 2: Import libraries
import spacy
import xml.etree.ElementTree as ET
from google.colab import files
import io

In [None]:
# Step 3: Load spaCy model
# Same pipeline name that the download command in Step 1 fetches.
SPACY_MODEL_NAME = "en_core_web_sm"
nlp = spacy.load(SPACY_MODEL_NAME)

In [None]:
# Step 4: Upload two files: English and Ukrainian
def _upload_text_file(prompt):
    """Prompt for a file upload and return ``(filename, text)``.

    Args:
        prompt: Message printed before the upload widget is shown.

    Returns:
        A tuple of the uploaded file's name and its content decoded as UTF-8.
        If several files are selected, only the first one is used.

    Raises:
        ValueError: If the upload returned no file (e.g. it was cancelled).
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    print(prompt)
    uploaded = files.upload()
    if not uploaded:
        # Without this guard an empty result surfaces as a bare StopIteration.
        raise ValueError("No file was uploaded; re-run this cell and choose a file.")
    filename = next(iter(uploaded))
    # 'utf-8-sig' decodes ordinary UTF-8 and additionally drops a leading
    # byte-order mark, which would otherwise become part of the first sentence.
    return filename, uploaded[filename].decode('utf-8-sig')


eng_filename, eng_text = _upload_text_file("🔁 Upload English text file:")
ukr_filename, ukr_text = _upload_text_file("🔁 Upload Ukrainian translation file:")

In [None]:
# Step 5: Split both texts into sentences
# English is segmented by the spaCy pipeline. The Ukrainian text is split on
# line breaks only, so each non-empty line is treated as one translated unit
# (assumes the translation file has one sentence per line; confirm with the data).
eng_doc = nlp(eng_text)
ukr_doc = [line for line in ukr_text.strip().split('\n') if line]

# Whitespace-only segments are dropped on BOTH sides, so a blank segment can
# neither produce an empty entry nor shift the pairing of the segments after it.
eng_sentences = [text for text in (sent.text.strip() for sent in eng_doc.sents) if text]
ukr_sentences = [text for text in (line.strip() for line in ukr_doc) if text]

In [None]:
# Step 6: Ensure lengths match
# Entries are paired by position, so both lists are cut to the shorter length.
eng_count, ukr_count = len(eng_sentences), len(ukr_sentences)
min_len = min(eng_count, ukr_count)

if eng_count != ukr_count:
    # Report the truncation instead of discarding segments silently: a count
    # mismatch usually means the two texts are segmented differently.
    print(
        f"⚠️ Segment counts differ (English: {eng_count}, Ukrainian: {ukr_count}); "
        f"keeping only the first {min_len} pairs. Check the alignment of the output."
    )
elif min_len == 0:
    print("⚠️ No sentence pairs found; the generated dictionary will be empty.")

eng_sentences = eng_sentences[:min_len]
ukr_sentences = ukr_sentences[:min_len]

In [None]:
# Step 7: Build XML
# Code points that XML 1.0 does not allow in character data: the C0 control
# characters other than tab, line feed and carriage return, plus U+FFFE/U+FFFF.
# ElementTree writes them out unchanged, which makes the file unparseable.
_XML_ILLEGAL_CHARS = dict.fromkeys(
    [code for code in range(0x20) if code not in (0x09, 0x0A, 0x0D)] + [0xFFFE, 0xFFFF]
)


def build_bilingual_xml(eng_sents, ukr_sents):
    """Build an XML tree of positionally paired sentences.

    The tree has a ``<dictionary>`` root with one ``<entry>`` per pair; each
    entry holds the English sentence in ``<form>`` and the Ukrainian sentence
    in ``<translation>``.

    Args:
        eng_sents: Iterable of English sentence strings.
        ukr_sents: Iterable of Ukrainian sentence strings, in the same order.

    Returns:
        An ``xml.etree.ElementTree.ElementTree`` wrapping the root element.
        Pairing uses ``zip``, so extra items in the longer input are ignored.
    """
    root = ET.Element("dictionary")
    for en, uk in zip(eng_sents, ukr_sents):
        entry = ET.SubElement(root, "entry")
        # Strip characters that would make the serialized XML not well-formed.
        ET.SubElement(entry, "form").text = en.translate(_XML_ILLEGAL_CHARS)
        ET.SubElement(entry, "translation").text = uk.translate(_XML_ILLEGAL_CHARS)
    return ET.ElementTree(root)

# Build the tree from the length-matched sentence lists of Step 6.
xml_tree = build_bilingual_xml(
    eng_sents=eng_sentences,
    ukr_sents=ukr_sentences,
)

In [None]:
# Step 8: Save to file
# Serialize the tree built in Step 7 as UTF-8 with an explicit XML declaration.
output_file = "lexonomy_dictionary.xml"
write_options = {"encoding": 'utf-8', "xml_declaration": True}
xml_tree.write(output_file, **write_options)
print("✅ Dictionary created as:", output_file)

In [None]:
# Step 9: Download the XML file
# Passes the path stored in `output_file` (set in Step 8) to the `files`
# helper imported from google.colab in Step 2, so both of those cells must
# have been run first.
files.download(output_file)