<a href="https://colab.research.google.com/github/anokhina-rgb/Multilingual-Corpus-for-EU-Studies/blob/main/runnable_code_mwus.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install stanza quietly
!pip install stanza -q

import stanza
import sys
import os
from google.colab import files
from xml.sax.saxutils import escape

# Download stanza models quietly (only needed the first time).
# stanza reports progress through its logger and progress bars rather than
# plain stdout, so `verbose=False` is what actually silences it; the stdout
# swap is kept as a belt-and-braces measure for any stray prints.
with open(os.devnull, 'w') as fnull:
    old_stdout = sys.stdout
    sys.stdout = fnull
    try:
        stanza.download('en', verbose=False)
        stanza.download('uk', verbose=False)
    finally:
        # Always restore stdout, even if a download fails; otherwise every
        # later print would go to a closed file.
        sys.stdout = old_stdout

# Initialize pipelines (tokenizer only, CPU only)
nlp_en = stanza.Pipeline('en', processors='tokenize', use_gpu=False, verbose=False)
nlp_uk = stanza.Pipeline('uk', processors='tokenize', use_gpu=False, verbose=False)

print("Models downloaded and pipelines initialized.")

# Upload the two parallel files; you will be prompted twice.
# Select exactly ONE file in each dialog.
print("Upload your English text file (any name):")
uploaded_en = files.upload()

print("Upload your Ukrainian text file (any name):")
uploaded_uk = files.upload()

# Picking both files in one dialog (or cancelling a dialog) would otherwise
# make the script use an arbitrary file or crash with an IndexError.
for _label, _uploaded in (("English", uploaded_en), ("Ukrainian", uploaded_uk)):
    if len(_uploaded) != 1:
        raise ValueError(
            f"Expected exactly one {_label} file, but {len(_uploaded)} were uploaded: {sorted(_uploaded)}"
        )

# Get filenames
en_filename = next(iter(uploaded_en))
uk_filename = next(iter(uploaded_uk))


def _uploaded_lines(data):
    """Decode uploaded bytes and return the stripped, non-empty lines.

    The bytes returned by ``files.upload()`` are decoded directly instead of
    re-opening the file by name: Colab appears to save a re-uploaded file
    under a de-duplicated name, so opening the original name could read a
    stale copy from an earlier run. ``utf-8-sig`` drops a leading BOM, which
    ``str.strip()`` would not remove.
    """
    text = data.decode('utf-8-sig')
    # Same line boundaries as reading a text-mode file: \r\n, \r and \n.
    text = text.replace('\r\n', '\n').replace('\r', '\n')
    return [line.strip() for line in text.split('\n') if line.strip()]


# Read lines (strip to avoid whitespace issues)
en_lines = _uploaded_lines(uploaded_en[en_filename])
uk_lines = _uploaded_lines(uploaded_uk[uk_filename])

# Basic validation: equal number of lines
if len(en_lines) != len(uk_lines):
    raise ValueError(f"Number of lines differ: English file has {len(en_lines)}, Ukrainian file has {len(uk_lines)}")

print(f"Loaded {len(en_lines)} aligned lines.")

# Tokenize sentences with stanza (optional: you could extract MWUs or phrases here)
def tokenize_sentences(nlp, lines):
    """Split every input line into sentences.

    Args:
        nlp: Callable applied to each line; its result must expose a
            ``sentences`` iterable whose items have a ``text`` attribute.
        lines: Iterable of text lines.

    Returns:
        A list with one inner list per input line, holding the ``text`` of
        each sentence found in that line (the result is nested, not flat).
    """
    return [
        [sentence.text for sentence in nlp(raw_line).sentences]
        for raw_line in lines
    ]

en_sents = tokenize_sentences(nlp_en, en_lines)
uk_sents = tokenize_sentences(nlp_uk, uk_lines)

# Collapse each line back into a single string so the two lists stay aligned
# line-by-line. All sentences of a line are kept: taking only the first one
# would silently drop the rest of the line whenever the segmenter finds more
# than one sentence (for example after an abbreviation). A line with no
# sentences becomes "".
en_sents_flat = [" ".join(sents) for sents in en_sents]
uk_sents_flat = [" ".join(sents) for sents in uk_sents]

# Create Lexonomy XML format
def create_lexonomy_xml(en_sents, uk_sents):
    """Build a bilingual XML document from paired sentences.

    Sentences are paired positionally with ``zip``, so any surplus items in
    the longer list are ignored. Entry ids are numbered from 1.

    Args:
        en_sents: English sentence strings.
        uk_sents: Ukrainian sentence strings, in the same order.

    Returns:
        The whole document as one string: an XML declaration, then
        ``<dictionary><entries>`` containing one ``<entry>`` per pair with a
        ``<form lang="en">`` and a ``<form lang="uk">`` child.
    """
    import re  # local import so the block does not depend on file-level imports

    # Characters outside the XML 1.0 "Char" production (most C0 controls,
    # lone surrogates, U+FFFE/U+FFFF) cannot appear in a well-formed document
    # even when escaped, so they are removed before escaping.
    illegal_xml_chars = re.compile(
        r'[^\x09\x0A\x0D\x20-\uD7FF\uE000-\uFFFD\U00010000-\U0010FFFF]'
    )

    header = '''<?xml version="1.0" encoding="UTF-8"?>
<dictionary>
  <entries>'''
    footer = '''
  </entries>
</dictionary>'''

    entries_xml = []
    for i, (en, uk) in enumerate(zip(en_sents, uk_sents), 1):
        # Drop illegal characters, then escape &, < and > for element text
        en_esc = escape(illegal_xml_chars.sub('', en))
        uk_esc = escape(illegal_xml_chars.sub('', uk))
        entry = f'''
    <entry id="{i}">
      <form lang="en">{en_esc}</form>
      <form lang="uk">{uk_esc}</form>
    </entry>'''
        entries_xml.append(entry)

    return header + "\n".join(entries_xml) + footer

# Assemble the XML document from the per-line sentence pairs
lexonomy_xml = create_lexonomy_xml(en_sents_flat, uk_sents_flat)

# Write the document to disk as UTF-8
output_filename = "lexonomy_output.xml"
with open(output_filename, mode="w", encoding="utf-8") as out_file:
    out_file.write(lexonomy_xml)

print(f"Lexonomy XML saved to {output_filename}")

# Ask Colab to send the saved file to the browser as a download
files.download(output_filename)


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m35.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m29.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m22.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Downloading default packages for language: en (English) ...


Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.10.0/models/default.zip:   0%|          | …

INFO:stanza:Downloaded file to /root/stanza_resources/en/default.zip
INFO:stanza:Finished downloading models and saved to /root/stanza_resources


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Downloading default packages for language: uk (Ukrainian) ...


Downloading https://huggingface.co/stanfordnlp/stanza-uk/resolve/v1.10.0/models/default.zip:   0%|          | …

INFO:stanza:Downloaded file to /root/stanza_resources/uk/default.zip
INFO:stanza:Finished downloading models and saved to /root/stanza_resources


Models downloaded and pipelines initialized.
Upload your English text file (any name):


Saving ukr_1.txt to ukr_1.txt
Saving en_1.txt to en_1.txt
Upload your Ukrainian text file (any name):
