<a href="https://colab.research.google.com/github/anokhina-rgb/Multilingual-Corpus-for-EU-Studies/blob/main/dic19052025working_extraction_of_mwus.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# 🟦 CELL 1: Install and Load spaCy Models
!pip install -q spacy
!python -m spacy download en_core_web_sm
!python -m spacy download uk_core_news_sm

import spacy

# Load spaCy models
nlp_en = spacy.load("en_core_web_sm")
nlp_uk = spacy.load("uk_core_news_sm")


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m97.1 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Collecting uk-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/uk_core_news_sm-3.8.0/uk_core_news_sm-3.8.0-py3-none-any.whl (14.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.9/14.9 MB[0m [31m39.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pymorphy3>=1.0.0 (

In [2]:
# 🟦 CELL 2: Define sentence splitting functions

def split_english_sentences(text):
    doc = nlp_en(text)
    return [sent.text.strip() for sent in doc.sents]

def split_ukrainian_sentences(text):
    doc = nlp_uk(text)
    sents = [sent.text.strip() for sent in doc.sents]
    if len(sents) == 1:
        import re
        sents = re.split(r'(?<=[.!?])\s+', text.strip())
    return sents


In [3]:
# 🟦 CELL 3: Upload Parallel English and Ukrainian Text Files
from google.colab import files

print("Upload English file:")
uploaded_en = files.upload()
en_filename = next(iter(uploaded_en))

print("Upload Ukrainian file:")
uploaded_uk = files.upload()
uk_filename = next(iter(uploaded_uk))

with open(en_filename, 'r', encoding='utf-8') as f:
    english_text = f.read()

with open(uk_filename, 'r', encoding='utf-8') as f:
    ukrainian_text = f.read()

# Sentence splitting
english_sentences = split_english_sentences(english_text)
ukrainian_sentences = split_ukrainian_sentences(ukrainian_text)

# Align by shortest
min_len = min(len(english_sentences), len(ukrainian_sentences))
english_sentences = english_sentences[:min_len]
ukrainian_sentences = ukrainian_sentences[:min_len]

print(f"✅ Aligned sentence pairs: {min_len}")


Upload English file:


Saving en.txt to en.txt
Upload Ukrainian file:


Saving ukr.txt to ukr.txt
✅ Aligned sentence pairs: 32


In [4]:
# 🟦 CELL 4: Define MWU Extraction Function (Fixed for Ukrainian)

def extract_mwus(text, lang='en'):
    if lang == 'en':
        doc = nlp_en(text)
        return [chunk.text.strip() for chunk in doc.noun_chunks]

    elif lang == 'uk':
        doc = nlp_uk(text)
        mwus = []
        tokens = list(doc)

        for i in range(len(tokens) - 1):
            if tokens[i].pos_ in ('ADJ', 'NOUN') and tokens[i+1].pos_ == 'NOUN':
                phrase = tokens[i].text + ' ' + tokens[i+1].text
                mwus.append(phrase)
        return mwus

    else:
        raise ValueError("Unsupported language. Use 'en' or 'uk'.")


In [5]:
# 🟦 CELL 5: Extract MWUs and Align Sentence Pairs

aligned_pairs = list(zip(english_sentences, ukrainian_sentences))
dictionary_entries = []

for en_sent, uk_sent in aligned_pairs:
    mwu_en = extract_mwus(en_sent, lang='en')
    mwu_uk = extract_mwus(uk_sent, lang='uk')

    entry = {
        'EN': en_sent.strip(),
        'UK': uk_sent.strip(),
        'MWU_EN': mwu_en,
        'MWU_UK': mwu_uk
    }
    dictionary_entries.append(entry)

print(f"✅ Extracted MWUs for {len(dictionary_entries)} sentence pairs.")


✅ Extracted MWUs for 32 sentence pairs.


In [6]:
# 🟦 CELL 6: Print a Sample of Results
import random

for sample in random.sample(dictionary_entries, min(5, len(dictionary_entries))):
    print("\n🔹 EN:", sample['EN'])
    print("  ➤ MWUs EN:", sample['MWU_EN'])
    print("🔹 UK:", sample['UK'])
    print("  ➤ MWUs UK:", sample['MWU_UK'])



🔹 EN: The United States Supreme Courtis holding a hearing Thursday to address what is likely the biggest controversy sparked by Donald Trump’s executive orders: theright to birthright citizenship.
  ➤ MWUs EN: ['The United States Supreme Courtis', 'a hearing', 'what', 'the biggest controversy', 'Donald Trump’s executive orders', 'citizenship']
🔹 UK: У четвер Верховний суд США розглядає справу, що стосується, ймовірно, найгучнішого з указів Дональда Трампа — скасування права на громадянство за народженням.
  ➤ MWUs UK: ['Верховний суд', 'скасування права']

🔹 EN: The president and his supporters argue that stricter standards for becoming a U.S. citizen should exist, calling the executive order “a priceless and profound gift.”
  ➤ MWUs EN: ['The president', 'his supporters', 'stricter standards', 'a U.S. citizen', 'the executive order', '“a priceless and profound gift']
🔹 UK: Президент і його прихильники стверджують, що повинні існувати суворіші стандарти для набуття громадянства США, н

In [9]:
# 🟦 CELL 7: Export Clean XML with MWUs as Entry Elements, No Numeric IDs
import xml.etree.ElementTree as ET
from xml.dom import minidom
from google.colab import files

root = ET.Element("dictionary")

for item in dictionary_entries:
    # Only add if there are MWUs on both sides
    if item['MWU_EN'] and item['MWU_UK']:
        entry = ET.SubElement(root, "entry")

        mwu_en_elem = ET.SubElement(entry, "mwu_en")
        for mwu in item['MWU_EN']:
            ET.SubElement(mwu_en_elem, "item").text = mwu

        mwu_uk_elem = ET.SubElement(entry, "mwu_uk")
        for mwu in item['MWU_UK']:
            ET.SubElement(mwu_uk_elem, "item").text = mwu

        ET.SubElement(entry, "sentence_en").text = item['EN']
        ET.SubElement(entry, "sentence_uk").text = item['UK']

# Prettify the XML
xml_str = minidom.parseString(ET.tostring(root, encoding='utf-8')).toprettyxml(indent="  ")
filename = "mwu_clean_dictionary.xml"

with open(filename, "w", encoding='utf-8') as f:
    f.write(xml_str)

files.download(filename)



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>