<a href="https://colab.research.google.com/github/anokhina-rgb/Multilingual-Corpus-for-EU-Studies/blob/main/mwus_run.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install required packages
!pip install stanza lxml

import stanza
import xml.etree.ElementTree as ET
import pandas as pd
import re
from IPython.display import display
import os
from google.colab import files

# Fetch the Stanza models for both languages (English first, then Ukrainian)
for _lang_code in ('en', 'uk'):
    stanza.download(_lang_code)

# Build one pipeline per language; both run the same processor chain
# (tokenization, multi-word-token expansion, POS tagging, lemmatization).
_PROCESSORS = 'tokenize,mwt,pos,lemma'
nlp_en = stanza.Pipeline('en', processors=_PROCESSORS)
nlp_uk = stanza.Pipeline('uk', processors=_PROCESSORS)

# Upload two .txt files (with en/uk in names)
def _find_language_file(names, tags):
    """Return the first filename in *names* that carries one of *tags*.

    A tag only counts when it is not embedded in a longer run of letters in
    the file stem, so ``document_uk.txt`` is not mistaken for the English
    file because of the "en" inside "document". When no filename has a
    standalone tag, the loose substring test on ``tags[0]`` is used as a
    fallback so that names such as ``texten.txt`` keep working.

    Args:
        names: Uploaded filenames, in upload order.
        tags: Lower-case language markers; ``tags[0]`` is the fallback
            substring.

    Returns:
        The first matching filename.

    Raises:
        ValueError: If no filename matches any tag.
    """
    standalone = re.compile(
        r'(?<![a-z])(?:' + '|'.join(re.escape(tag) for tag in tags) + r')(?![a-z])'
    )
    matches = [
        name for name in names
        if standalone.search(os.path.splitext(os.path.basename(name))[0].lower())
    ]
    if not matches:
        # Loose fallback: plain substring test on the primary tag.
        matches = [name for name in names if tags[0] in name.lower()]
    if not matches:
        raise ValueError(
            f"No uploaded file name contains any of {list(tags)}: {list(names)}"
        )
    return matches[0]


print("📤 Upload your English and Ukrainian .txt files")
uploaded = files.upload()
filepaths = list(uploaded.keys())

# Identify files
en_file = _find_language_file(filepaths, ('en', 'eng', 'english'))
uk_file = _find_language_file(filepaths, ('uk', 'ukr', 'ua', 'ukrainian'))
if en_file == uk_file:
    # Both lookups landing on one file means the names are ambiguous;
    # pairing a file with itself would silently produce a bogus corpus.
    raise ValueError(
        f"Cannot tell the English and Ukrainian files apart: {filepaths}"
    )

# Read both files, dropping blank lines and surrounding whitespace
with open(en_file, encoding='utf-8') as f:
    en_lines = [line.strip() for line in f if line.strip()]

with open(uk_file, encoding='utf-8') as f:
    uk_lines = [line.strip() for line in f if line.strip()]

# Align
if len(en_lines) != len(uk_lines):
    # Pairing is purely positional, so a length mismatch may mean the
    # texts drift apart somewhere; make the truncation visible.
    print(
        f"⚠️ Line counts differ (EN: {len(en_lines)}, UK: {len(uk_lines)}); "
        "extra lines are dropped"
    )
min_len = min(len(en_lines), len(uk_lines))
en_lines = en_lines[:min_len]
uk_lines = uk_lines[:min_len]

print(f"✅ Aligned {min_len} sentence pairs")

# MWU extractor: noun/adjective phrases
def extract_mwus(doc):
    """Collect lower-cased two-word candidates from an annotated document.

    For every sentence in ``doc.sentences``, each pair of adjacent words is
    kept when the second word is tagged ``NOUN`` and the first is tagged
    ``ADJ`` or ``NOUN``. Pairs never span a sentence boundary.

    Args:
        doc: Object exposing ``sentences``; each sentence exposes ``words``,
            and each word exposes ``text`` and ``upos``.

    Returns:
        A list of ``"first second"`` strings in document order, lower-cased.
        Repeated phrases are kept.
    """
    phrases = []
    for sentence in doc.sentences:
        words = sentence.words
        # Pair each word with its right-hand neighbour.
        phrases.extend(
            f"{left.text} {right.text}".lower()
            for left, right in zip(words, words[1:])
            if right.upos == 'NOUN' and left.upos in ('ADJ', 'NOUN')
        )
    return phrases

# Annotate each aligned pair with both pipelines and gather one row per pair;
# the MWUs of a sentence are stored as a single '; '-separated string.
records = [
    {
        "EN": en_text,
        "UK": uk_text,
        "EN_MWUs": '; '.join(extract_mwus(nlp_en(en_text))),
        "UK_MWUs": '; '.join(extract_mwus(nlp_uk(uk_text))),
    }
    for en_text, uk_text in zip(en_lines, uk_lines)
]

# Tabulate and preview the first rows
df = pd.DataFrame(records)
display(df.head())

# Tab-separated dump of the full table (for debugging)
df.to_csv("aligned_mwus.tsv", sep="\t", index=False)

# Build Lexonomy XML
def _add_text_child(parent, tag, text):
    """Append a ``<tag>`` child holding *text* to *parent* and return it."""
    child = ET.SubElement(parent, tag)
    child.text = text
    return child


# One <entry> per aligned pair:
#   <entry><head>EN</head><body><translation>UK</translation>
#   <mwus><en>…</en><uk>…</uk></mwus></body></entry>
root = ET.Element("dictionary")
for pair in df.itertuples(index=False):
    entry_el = ET.SubElement(root, "entry")
    _add_text_child(entry_el, "head", f"{pair.EN}")

    body_el = ET.SubElement(entry_el, "body")
    _add_text_child(body_el, "translation", pair.UK)

    mwus_el = ET.SubElement(body_el, "mwus")
    _add_text_child(mwus_el, "en", pair.EN_MWUs)
    _add_text_child(mwus_el, "uk", pair.UK_MWUs)

# Serialize the tree as UTF-8 with an XML declaration
tree = ET.ElementTree(root)
tree.write("lexonomy_mwus.xml", encoding="utf-8", xml_declaration=True)

# Hand the result to the browser as a download
files.download("lexonomy_mwus.xml")


Collecting stanza
  Downloading stanza-1.10.1-py3-none-any.whl.metadata (13 kB)
Collecting emoji (from stanza)
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.3.0->stanza)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.3.0->stanza)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.3.0->stanza)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.3.0->stanza)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.3.0->stanza)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata 

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Downloading default packages for language: en (English) ...


Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.10.0/models/default.zip:   0%|          | …

INFO:stanza:Downloaded file to /root/stanza_resources/en/default.zip
INFO:stanza:Finished downloading models and saved to /root/stanza_resources


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Downloading default packages for language: uk (Ukrainian) ...


Downloading https://huggingface.co/stanfordnlp/stanza-uk/resolve/v1.10.0/models/default.zip:   0%|          | …

INFO:stanza:Downloaded file to /root/stanza_resources/uk/default.zip
INFO:stanza:Finished downloading models and saved to /root/stanza_resources
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: en (English):
| Processor | Package           |
---------------------------------
| tokenize  | combined          |
| mwt       | combined          |
| pos       | combined_charlm   |
| lemma     | combined_nocharlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Loading: lemma
INFO:stanza:Done loading processors!
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: uk (Ukrainian):
| Processor | Package     |
---------------------------
| tokenize  | iu          |
| mwt       | iu          |
| pos       | iu_charlm   |
| lemma     | iu_nocharlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Loading: lemma
INFO:stanza:Done loading processors!


📤 Upload your English and Ukrainian .txt files


Saving en_1.txt to en_1.txt
Saving ukr_1.txt to ukr_1.txt
✅ Aligned 10 sentence pairs


Unnamed: 0,EN,UK,EN_MWUs,UK_MWUs
0,The United States Supreme Courtis holding a he...,"У четвер Верховний суд США розглядає справу, щ...",biggest controversy; executive orders; birthri...,верховний суд; скасування права
1,"On his first day as president, the Republican ...",У свій перший день на посаді президента Трамп ...,first day; undocumented immigrant; immigrant p...,перший день; посаді президента; громадянство д...
2,The Supreme Court does not have to rule direct...,Хоча Верховний суд не розглядатиме безпосередн...,court rulings; executive order; entire country,верховний суд; конституційність указу; судові ...
3,The Trump administration asked the Supreme Cou...,Адміністрація Трампа звернулася до Верховного ...,federal judges; executive order,верховного суду; федеральні судді; виконавчого...
4,The administration is asking judges to limit t...,"Адміністрація наполягає на тому, щоб судді обм...",court rulings; lower courts; presidential order,судових рішень; нижчі суди; президентський указ


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>