<a href="https://colab.research.google.com/github/anokhina-rgb/Multilingual-Corpus-for-EU-Studies/blob/main/MWU_Extraction_EN_UA_as_csv.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install necessary packages
# NOTE: lines starting with "!" are notebook shell escapes (Jupyter/Colab),
# not Python statements; they will not run in a plain Python interpreter.
!pip install -q spacy pandas nltk
# Download the small English and Ukrainian spaCy pipelines that are loaded
# later with spacy.load(). spaCy's installer output recommends restarting the
# runtime if a freshly downloaded model cannot be loaded.
!python -m spacy download en_core_web_sm
!python -m spacy download uk_core_news_sm

import spacy
import pandas as pd
import nltk
from nltk.util import ngrams
from nltk import word_tokenize
import os
from google.colab import files

# Load spaCy models
nlp_en = spacy.load('en_core_web_sm')
nlp_ua = spacy.load('uk_core_news_sm')

# Language tags recognised as whole "words" of an uploaded file name,
# e.g. "en_1.txt", "corpus-ukr.txt", "sentences_uk.txt".
_EN_NAME_TAGS = {"en", "eng", "english"}
_UA_NAME_TAGS = {"ua", "uk", "ukr", "ukrainian"}


def _detect_language(filename):
    """Return 'en', 'ua' or None for an uploaded file name.

    The name (without directory and extension) is split on every
    non-alphanumeric character and the pieces are compared with the tag sets
    above. Whole-tag matching avoids the false positives of plain substring
    tests: "sentences_uk.txt" and "document_ua.txt" both contain the letters
    "en" but are Ukrainian files. When the tags are absent or ambiguous, the
    original substring heuristic is used as a fallback so names that used to
    be accepted still are.
    """
    stem = os.path.splitext(os.path.basename(filename))[0].lower()
    parts = set("".join(ch if ch.isalnum() else " " for ch in stem).split())
    tagged_en = bool(parts & _EN_NAME_TAGS)
    tagged_ua = bool(parts & _UA_NAME_TAGS)
    if tagged_en != tagged_ua:
        return 'en' if tagged_en else 'ua'
    # Legacy heuristic (English is checked first, as before).
    if 'en' in filename:
        return 'en'
    if 'ua' in filename or 'uk' in filename:
        return 'ua'
    return None


# Upload text files
print("Upload your aligned English and Ukrainian text files (one sentence per line each).")
uploaded = files.upload()

# Read files
en_lines, ua_lines = [], []
for filename in uploaded:
    language = _detect_language(filename)
    if language is None:
        print(f"Skipping '{filename}': cannot tell whether it is English or Ukrainian.")
        continue
    # 'utf-8-sig' reads plain UTF-8 as well, but also drops a leading BOM so
    # it does not end up glued to the first word of the first sentence.
    with open(filename, encoding='utf-8-sig') as f:
        # Blank lines are dropped per file; if only one of the two files has
        # a blank line in the middle, later sentences shift out of alignment.
        stripped = (line.strip() for line in f)
        lines = [line for line in stripped if line]
    if language == 'en':
        en_lines = lines
    else:
        ua_lines = lines

if not en_lines or not ua_lines:
    print("Warning: an English and a Ukrainian file are both required; "
          f"got {len(en_lines)} English and {len(ua_lines)} Ukrainian lines.")
elif len(en_lines) != len(ua_lines):
    print(f"Warning: line counts differ (English: {len(en_lines)}, "
          f"Ukrainian: {len(ua_lines)}); extra lines of the longer file are ignored. "
          "Check that the files are really sentence-aligned.")

# Ensure equal length
min_len = min(len(en_lines), len(ua_lines))
en_lines, ua_lines = en_lines[:min_len], ua_lines[:min_len]

print(f"Loaded {len(en_lines)} aligned sentence pairs.")

# MWU Extraction function
# (first POS, second POS) pairs accepted as a two-word MWU candidate:
# Adj + Noun, Noun + Noun, Verb + Noun.
_MWU_POS_PATTERNS = frozenset({
    ("ADJ", "NOUN"),
    ("NOUN", "NOUN"),
    ("VERB", "NOUN"),
})


def extract_mwus(texts, lang_model, lang='en', *, contiguous=False):
    """Collect two-word multi-word-unit (MWU) candidates from ``texts``.

    Every text is run through ``lang_model.pipe`` (with the "ner" component
    disabled). Punctuation and stop-word tokens are removed, and each pair of
    neighbouring remaining tokens is kept when its POS tags match one of
    ``_MWU_POS_PATTERNS``.

    Because pairs are formed *after* filtering, two words separated in the
    text by a comma or a stop word (e.g. "growth of trade") are paired by
    default. Pass ``contiguous=True`` to keep only pairs that are directly
    adjacent in the original text.

    Args:
        texts: Iterable of strings handed to ``lang_model.pipe``.
        lang_model: Pipeline object providing ``pipe(texts, disable=[...])``
            that yields token sequences; the tokens must offer ``is_punct``,
            ``is_stop``, ``pos_`` and ``text`` (and ``i``, the token's
            position in its document, when ``contiguous=True``).
        lang: Currently unused; kept so existing calls remain valid.
        contiguous: Keyword-only. When true, drop pairs whose tokens are not
            next to each other in the source text. Defaults to False, which
            reproduces the historical behaviour.

    Returns:
        A list of "first second" strings in order of occurrence; duplicates
        are kept.
    """
    mwus = []
    for doc in lang_model.pipe(texts, disable=["ner"]):
        content = [tok for tok in doc if not (tok.is_punct or tok.is_stop)]
        for first, second in zip(content, content[1:]):
            if (first.pos_, second.pos_) not in _MWU_POS_PATTERNS:
                continue
            # A gap in token positions means a filtered token sat in between.
            if contiguous and second.i != first.i + 1:
                continue
            mwus.append(f"{first.text} {second.text}")
    return mwus

# Extract MWUs
# Each sentence is parsed exactly once. The per-sentence lists are reused for
# both the totals printed here and the sentence-level alignment below, instead
# of running every sentence through spaCy a second time in the alignment loop.
print("Extracting MWUs from English...")
en_mwus_by_sentence = [extract_mwus([sentence], nlp_en) for sentence in en_lines]
mwus_en = [mwu for sentence_mwus in en_mwus_by_sentence for mwu in sentence_mwus]
print(f"Found {len(mwus_en)} English MWUs.")

print("Extracting MWUs from Ukrainian...")
ua_mwus_by_sentence = [extract_mwus([sentence], nlp_ua) for sentence in ua_lines]
mwus_ua = [mwu for sentence_mwus in ua_mwus_by_sentence for mwu in sentence_mwus]
print(f"Found {len(mwus_ua)} Ukrainian MWUs.")

# Optional: Align MWUs (naive alignment based on sentence index and MWU occurrence)
# Every English MWU of a sentence is paired with every Ukrainian MWU of the
# sentence at the same index, so the rows are candidate pairs, not verified
# translations.
aligned_mwus = []
for en_mwus, ua_mwus in zip(en_mwus_by_sentence, ua_mwus_by_sentence):
    aligned_mwus.extend((e, u) for e in en_mwus for u in ua_mwus)

# Save MWUs to CSV
mwu_df = pd.DataFrame(aligned_mwus, columns=['English_MWU', 'Ukrainian_MWU'])
mwu_df = mwu_df.drop_duplicates()
mwu_df.to_csv("extracted_mwus.csv", index=False)

# Download the result
files.download("extracted_mwus.csv")


Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m55.6 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Collecting uk-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/uk_core_news_sm-3.7.0/uk_core_news_sm-3.7.0-py3-none-any.whl (14.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.9/14.9 MB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
Collecting pymorphy3>=1.0.0 (from uk-core

Saving ukr_1.txt to ukr_1.txt
Saving en_1.txt to en_1.txt
Loaded 10 aligned sentence pairs.
Extracting MWUs from English...
Found 52 English MWUs.
Extracting MWUs from Ukrainian...
Found 62 Ukrainian MWUs.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>