<a href="https://colab.research.google.com/github/anokhina-rgb/Multilingual-Corpus-for-EU-Studies/blob/main/Untitled15.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import libraries
import re
import xml.etree.ElementTree as ET
from xml.dom import minidom
import pandas as pd
from IPython.display import display
from google.colab import files

# --- Step 1: Upload raw text files (English and Ukrainian) ---
# Each files.upload() call returns a mapping keyed by uploaded file name; the
# first key is taken as the file to process and is then opened by that name.

print("Upload English raw text file:")
eng_uploaded = files.upload()
if not eng_uploaded:
    # Nothing was uploaded: fail with a clear message instead of an IndexError.
    raise ValueError("No English file was uploaded.")
eng_file = next(iter(eng_uploaded))

print("Upload Ukrainian raw text file:")
ukr_uploaded = files.upload()
if not ukr_uploaded:
    raise ValueError("No Ukrainian file was uploaded.")
ukr_file = next(iter(ukr_uploaded))

# Read uploaded files.
# 'utf-8-sig' decodes plain UTF-8 and additionally drops a leading byte-order
# mark, which would otherwise stay attached to the first word of the text.
with open(eng_file, encoding='utf-8-sig') as f:
    eng_text = f.read()

with open(ukr_file, encoding='utf-8-sig') as f:
    ukr_text = f.read()

# --- Step 2: Sentence segmentation (simple) ---

def split_sentences(text):
    """Split *text* into sentences with a simple punctuation heuristic.

    A boundary is any run of whitespace that directly follows '.', '!' or
    '?'. The punctuation stays attached to the preceding sentence. This is a
    deliberately simple splitter: any period followed by whitespace (for
    example inside an abbreviation) also produces a boundary.

    Args:
        text: Raw text to segment.

    Returns:
        A list of sentence strings, or an empty list when *text* is empty or
        contains only whitespace.
    """
    stripped = text.strip()
    if not stripped:
        # re.split would return [''] here, i.e. one bogus empty sentence.
        return []
    return re.split(r'(?<=[.!?])\s+', stripped)

# Segment both texts with the same splitter (English first, then Ukrainian).
eng_sents, ukr_sents = split_sentences(eng_text), split_sentences(ukr_text)

# Report how many sentences each text yielded, one line per language.
print(
    f"English sentences found: {len(eng_sents)}",
    f"Ukrainian sentences found: {len(ukr_sents)}",
    sep="\n",
)

# --- Step 3: Extract MWUs (multiword units) - simple heuristic: bigrams and trigrams ---

def extract_mwus(text, min_len=2, max_len=3):
    """Collect candidate multiword units as lowercase word n-grams.

    The text is lowercased and tokenized into words, then every contiguous
    run of ``min_len`` to ``max_len`` tokens is joined with single spaces.
    N-grams are taken over the whole token stream, so they may span
    punctuation and sentence boundaries.

    Args:
        text: Raw text to scan.
        min_len: Smallest n-gram size. Values below 1 are treated as 1,
            because an n-gram of zero tokens would be the empty string.
        max_len: Largest n-gram size (inclusive).

    Returns:
        A sorted list of the distinct n-gram strings.
    """
    # Keep a word-internal apostrophe (ASCII ' or U+2019) inside the token:
    # it is part of the spelling in Ukrainian and in English contractions,
    # and a plain \w+ would cut such words in two.
    tokens = re.findall(r"\w+(?:['\u2019]\w+)*", text.lower())
    smallest = max(min_len, 1)
    mwus = {
        ' '.join(tokens[start:start + size])
        for size in range(smallest, max_len + 1)
        for start in range(len(tokens) - size + 1)
    }
    return sorted(mwus)

# Extract candidate MWUs from each full text (English first, then Ukrainian).
eng_mwus, ukr_mwus = extract_mwus(eng_text), extract_mwus(ukr_text)

print(
    f"Extracted {len(eng_mwus)} English MWUs",
    f"Extracted {len(ukr_mwus)} Ukrainian MWUs",
    sep="\n",
)

# Wrap each list in a one-column table and write it out as CSV without the
# row index.
eng_mwus_df, ukr_mwus_df = (
    pd.DataFrame(eng_mwus, columns=['mwu']),
    pd.DataFrame(ukr_mwus, columns=['mwu']),
)
eng_mwus_df.to_csv("english_mwus.csv", index=False)
ukr_mwus_df.to_csv("ukrainian_mwus.csv", index=False)

print("MWUs saved as english_mwus.csv and ukrainian_mwus.csv")

# --- Step 4: Align sentences as pairs (simple 1-to-1 by order) ---

# zip() stops at the shorter list, so any extra sentences on the longer side
# are dropped and later pairs may be misaligned. Make that visible instead of
# truncating silently.
if len(eng_sents) != len(ukr_sents):
    print(
        f"Warning: sentence counts differ ({len(eng_sents)} English vs "
        f"{len(ukr_sents)} Ukrainian); only the first "
        f"{min(len(eng_sents), len(ukr_sents))} are paired by position."
    )

pairs = list(zip(eng_sents, ukr_sents))

print(f"Aligned {len(pairs)} sentence pairs")

# --- Step 5: Export aligned sentence pairs to Lexonomy XML format ---

def prettify_xml(elem):
    """Return *elem* rendered as an indented XML string.

    The element is serialized to UTF-8 bytes, re-parsed with minidom, and
    pretty-printed with a two-space indent per nesting level.
    """
    document = minidom.parseString(ET.tostring(elem, encoding='utf-8'))
    return document.toprettyxml(indent="  ")

def create_lexonomy_xml(pairs):
    """Build the pretty-printed XML export for aligned sentence pairs.

    Every pair becomes one <entry> under a <dictionary> root. Entries get
    ids "e1", "e2", ... in input order; the first item of the pair is stored
    in <headword> and the second in <translation>.

    Args:
        pairs: Iterable of two-item (English, Ukrainian) sentence pairs.

    Returns:
        The XML document as a string, formatted by prettify_xml.
    """
    dictionary = ET.Element("dictionary")

    for number, pair in enumerate(pairs, start=1):
        source_sentence, target_sentence = pair
        entry = ET.SubElement(dictionary, "entry", {"id": f"e{number}"})

        # Children are appended in a fixed order: headword, then translation.
        ET.SubElement(entry, "headword").text = source_sentence
        ET.SubElement(entry, "translation").text = target_sentence

    return prettify_xml(dictionary)

# Render every aligned pair into a single XML document string.
lexonomy_xml = create_lexonomy_xml(pairs)

# Write the document to disk as UTF-8 text, replacing any earlier export.
with open("lexonomy_aligned.xml", mode="w", encoding="utf-8") as f:
    f.write(lexonomy_xml)

print("Lexonomy XML saved as lexonomy_aligned.xml")

# --- Step 6: Provide files for download (Colab only) ---

# Request a download for each generated file, in the order they were created.
for output_name in (
    "english_mwus.csv",
    "ukrainian_mwus.csv",
    "lexonomy_aligned.xml",
):
    files.download(output_name)

print("Files are ready to download.")
