<a href="https://colab.research.google.com/github/anokhina-rgb/Multilingual-Corpus-for-EU-Studies/blob/main/MWU_Extractor_Colab_revised.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# MWU Extractor for English and Ukrainian
---
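This notebook extracts Multi-Word Units (MWUs) from a pair of English and Ukrainian texts and writes the aligned results to an XML file. Upload two plain-text UTF-8 files, one per language, with one segment per line; line *n* of the English file is paired with line *n* of the Ukrainian file.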

In [1]:
# Install necessary libraries
!pip install stanza lxml

Collecting stanza
  Downloading stanza-1.10.1-py3-none-any.whl.metadata (13 kB)
Collecting emoji (from stanza)
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.3.0->stanza)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.3.0->stanza)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.3.0->stanza)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.3.0->stanza)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.3.0->stanza)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata 

In [2]:
import stanza
import lxml.etree as ET
from google.colab import files

# Download models if not already downloaded
stanza.download('uk')
stanza.download('en')

# Processor list shared by both pipelines.
STANZA_PROCESSORS = 'tokenize,pos,lemma,depparse'

# Initialize pipelines.
# The models were fetched just above, so each Pipeline is told to reuse the
# local resources instead of checking resources.json over the network again
# on every construction.
nlp_en = stanza.Pipeline(
    lang='en',
    processors=STANZA_PROCESSORS,
    download_method=stanza.DownloadMethod.REUSE_RESOURCES,
)
nlp_uk = stanza.Pipeline(
    lang='uk',
    processors=STANZA_PROCESSORS,
    download_method=stanza.DownloadMethod.REUSE_RESOURCES,
)

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Downloading default packages for language: uk (Ukrainian) ...


Downloading https://huggingface.co/stanfordnlp/stanza-uk/resolve/v1.10.0/models/default.zip:   0%|          | …

INFO:stanza:Downloaded file to /root/stanza_resources/uk/default.zip
INFO:stanza:Finished downloading models and saved to /root/stanza_resources


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Downloading default packages for language: en (English) ...


Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.10.0/models/default.zip:   0%|          | …

INFO:stanza:Downloaded file to /root/stanza_resources/en/default.zip
INFO:stanza:Finished downloading models and saved to /root/stanza_resources
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: en (English):
| Processor | Package           |
---------------------------------
| tokenize  | combined          |
| mwt       | combined          |
| pos       | combined_charlm   |
| lemma     | combined_nocharlm |
| depparse  | combined_charlm   |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Loading: lemma
INFO:stanza:Loading: depparse
INFO:stanza:Done loading processors!
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: uk (Ukrainian):
| Processor | Package     |
---------------------------
| tokenize  | iu          |
| mwt       | iu          |
| pos       | iu_charlm   |
| lemma     | iu_nocharlm |
| depparse  | iu_charlm   |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Loading: lemma
INFO:stanza:Loading: depparse
INFO:stanza:Done loading processors!


In [3]:
def extract_mwus_stanza(doc, deprels=('nsubj', 'obj', 'iobj'), upos='NOUN'):
    """Collect the surface forms of words that fill selected syntactic roles.

    Every word of every sentence in ``doc`` is inspected; a word is kept when
    its dependency relation is one of ``deprels`` and its universal POS tag
    equals ``upos``.

    Note: each match contributes a single token (``word.text``). No
    multi-word span is assembled around the matched word.

    Args:
        doc: Parsed document exposing ``sentences``; each sentence exposes
            ``words`` whose items carry ``text``, ``deprel`` and ``upos``.
        deprels: Dependency relation label(s) to keep. A single string is
            treated as one label. Labels are compared exactly, so subtyped
            relations such as ``'nsubj:pass'`` match only if listed.
        upos: Universal POS tag a word must carry to be kept.

    Returns:
        list[str]: Matching word texts in document order; empty if none match.
    """
    # A bare string would otherwise be tested with substring semantics.
    if isinstance(deprels, str):
        deprels = (deprels,)
    wanted = tuple(deprels)

    return [
        word.text
        for sentence in doc.sentences
        for word in sentence.words
        if word.deprel in wanted and word.upos == upos
    ]

In [4]:
# Read the uploaded files.
# Two plain-text files are expected: one English and one Ukrainian
# (for example en.txt and uk.txt).

# Whole-token markers recognised in a file name. A plain substring test is
# not enough: "en" also occurs inside words such as "geneva" or "convention",
# so a Ukrainian file could be taken for the English one.
EN_NAME_TOKENS = {'en', 'eng', 'english'}
UK_NAME_TOKENS = {'uk', 'ukr', 'ua', 'ukrainian'}


def _guess_language(filename, text):
    """Return 'en' or 'uk' for an uploaded file, or None if undecidable.

    The file name is split into lowercase alphanumeric tokens and compared
    with the marker sets above. If the name points to neither or to both
    languages, the decision falls back to the content: Cyrillic letters are
    counted against ASCII letters and the larger count wins.
    """
    tokens = set(
        ''.join(ch if ch.isalnum() else ' ' for ch in filename.lower()).split()
    )
    looks_en = bool(tokens & EN_NAME_TOKENS)
    looks_uk = bool(tokens & UK_NAME_TOKENS)
    if looks_en != looks_uk:
        return 'en' if looks_en else 'uk'

    cyrillic = sum('\u0400' <= ch <= '\u04ff' for ch in text)
    latin = sum(ch.isascii() and ch.isalpha() for ch in text)
    if cyrillic == latin:
        return None
    return 'uk' if cyrillic > latin else 'en'


uploaded = files.upload()

en_lines = []
uk_lines = []

# Decode the uploaded bytes directly instead of reopening the file by its
# key: Colab may store a re-uploaded file under a different local name
# (e.g. "name (1).txt"), in which case the key would point at a stale file.
# 'utf-8-sig' also drops a leading byte-order mark if the file has one.
for fn, raw in uploaded.items():
    text = raw.decode('utf-8-sig')
    lang = _guess_language(fn, text)
    if lang == 'en':
        en_lines = text.splitlines()
    elif lang == 'uk':
        uk_lines = text.splitlines()

if not en_lines or not uk_lines:
    raise ValueError('Please upload two files: one English (en.txt) and one Ukrainian (uk.txt)')

Saving geneva_convention_english_expanded.txt to geneva_convention_english_expanded.txt
Saving geneva_convention_ukrainian_expanded.txt to geneva_convention_ukrainian_expanded.txt


In [5]:
# Extract MWUs for the English and the Ukrainian text, line by line.
def _mwus_per_line(lines, pipeline):
    """Run ``pipeline`` on each line and return one result string per line.

    The units extracted from a line are joined with ' ; '. A line that
    yields no units is kept verbatim, so the output always has exactly one
    entry per input line.
    """
    results = []
    for line in lines:
        # A blank line has nothing to parse; keep it as-is so that the line
        # numbering of both languages stays in step.
        if not line.strip():
            results.append(line)
            continue
        units = extract_mwus_stanza(pipeline(line))
        results.append(' ; '.join(units) if units else line)
    return results


en_mwus = _mwus_per_line(en_lines, nlp_en)  # one entry per English line
uk_mwus = _mwus_per_line(uk_lines, nlp_uk)  # one entry per Ukrainian line

# zip() stops at the shorter list, so a difference in line counts would drop
# the trailing lines of the longer file without any notice. Report it.
if len(en_mwus) != len(uk_mwus):
    print(
        f'Warning: line counts differ (EN: {len(en_mwus)}, UK: {len(uk_mwus)}); '
        f'only the first {min(len(en_mwus), len(uk_mwus))} line pairs are aligned.'
    )

# Align the English and Ukrainian results by line position.
aligned = list(zip(en_mwus, uk_mwus))

In [6]:
# Save the results as XML: a <MWUs> root with one <Entry> per aligned pair,
# each holding an <EN> and a <UKR> child.
OUTPUT_XML = 'aligned_mwus.xml'

root = ET.Element('MWUs')
for en_text, uk_text in aligned:
    entry = ET.SubElement(root, 'Entry')
    for tag, content in (('EN', en_text), ('UKR', uk_text)):
        ET.SubElement(entry, tag).text = content

tree = ET.ElementTree(root)
tree.write(OUTPUT_XML, encoding='utf-8', xml_declaration=True)

# Hand the written file to the browser as a download.
files.download(OUTPUT_XML)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>