<a href="https://colab.research.google.com/github/anokhina-rgb/Multilingual-Corpus-for-EU-Studies/blob/main/mwu_extractor_colab_(1).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# MWU Extractor Notebook
This notebook provides a small widget-based GUI for Google Colab that extracts multi-word units (MWUs) from raw English text by matching two part-of-speech patterns — **ADJ + NOUN** and **NOUN + VERB + ADVERB** — and exports the matches as Lexonomy-compatible XML.

In [None]:
!pip install -q spacy lxml ipywidgets
!python -m spacy download en_core_web_sm


In [None]:
import spacy
from lxml import etree
import ipywidgets as widgets
from IPython.display import display, clear_output
from google.colab import files

# Load the small English spaCy pipeline once at module level; extract_mwus()
# reuses this shared `nlp` object on every call. The model name must match the
# one fetched by `spacy download` in the install cell.
nlp = spacy.load("en_core_web_sm")


In [None]:
def extract_mwus(text):
    """Find multi-word units in ``text`` by matching runs of POS-tag prefixes.

    The text is run through the module-level ``nlp`` pipeline and every window
    of adjacent tokens is compared against two templates, using the prefix of
    each token's fine-grained ``tag_``:

    * ``ADJ+NOUN``          -- ``JJ*`` followed by ``NN*``
    * ``NOUN+VERB+ADVERB``  -- ``NN*`` followed by ``VB*`` followed by ``RB*``

    Args:
        text: Raw text to analyse.

    Returns:
        A list of ``(phrase, pattern_label)`` tuples, where ``phrase`` is the
        matched token texts joined by single spaces. All ``ADJ+NOUN`` matches
        come first (in document order), followed by all ``NOUN+VERB+ADVERB``
        matches. Duplicates are kept.
    """
    tokens = list(nlp(text))

    # (label, tag prefix required at each consecutive position); the order of
    # this table fixes the order of the groups in the result.
    templates = (
        ("ADJ+NOUN", ("JJ", "NN")),
        ("NOUN+VERB+ADVERB", ("NN", "VB", "RB")),
    )

    found = []
    for label, prefixes in templates:
        width = len(prefixes)
        for start in range(len(tokens) - width + 1):
            window = tokens[start:start + width]
            if all(tok.tag_.startswith(pfx) for tok, pfx in zip(window, prefixes)):
                found.append((" ".join(tok.text for tok in window), label))
    return found

def make_xml(mwus):
    """Serialise ``(text, pattern)`` pairs into a ``<lexonomy>`` XML document.

    Every pair becomes one entry of the form::

        <entry>
          <form><orth>TEXT</orth></form>
          <gramGrp><gram type="pattern">PATTERN</gram></gramGrp>
        </entry>

    Args:
        mwus: Iterable of ``(text, pattern)`` tuples, such as the list returned
            by ``extract_mwus``.

    Returns:
        The result of ``etree.tostring`` for the document: pretty-printed,
        UTF-8 encoded, with an XML declaration.
    """
    document = etree.Element("lexonomy")

    for surface, label in mwus:
        entry = etree.SubElement(document, "entry")

        # <form><orth>surface</orth></form>
        orth_node = etree.SubElement(etree.SubElement(entry, "form"), "orth")
        orth_node.text = surface

        # <gramGrp><gram type="pattern">label</gram></gramGrp>
        gram_node = etree.SubElement(
            etree.SubElement(entry, "gramGrp"), "gram", type="pattern"
        )
        gram_node.text = label

    return etree.tostring(
        document,
        encoding="UTF-8",
        xml_declaration=True,
        pretty_print=True,
    )


In [None]:
# Build the GUI controls; they are connected to their handlers and laid out
# further down in this cell.
upload_widget = widgets.FileUpload(
    accept='.txt',
    multiple=False,
    description="Upload .txt",
)
text_area = widgets.Textarea(
    value='',
    placeholder='Your text will appear here...',
    description='Text:',
    layout=widgets.Layout(width='100%', height='200px'),
)
# The download button starts disabled; on_process enables it once a run has
# produced at least one MWU.
download_button = widgets.Button(
    description="Download XML",
    button_style='success',
    disabled=True,
)
process_button = widgets.Button(
    description="Process Text",
    button_style='primary',
)
# Captures everything the handlers print.
output_area = widgets.Output()

# Handlers
def on_upload(change):
    """Copy the uploaded file's text into the text area and reset stale results.

    Registered as an observer on ``upload_widget.value``. Works with both
    shapes of ``FileUpload.value``:

    * ipywidgets 7.x -- ``{filename: {"metadata": ..., "content": bytes}}``
    * ipywidgets 8.x -- ``({"name": ..., "content": memoryview, ...},)``

    The file is decoded as UTF-8; a leading byte-order mark is dropped. If the
    bytes are not valid UTF-8, a message is printed to ``output_area`` and the
    current text and download state are left untouched.

    Args:
        change: Change notification from the widget. Unused; the current value
            is read from ``upload_widget`` directly.
    """
    uploaded = upload_widget.value
    if not uploaded:
        return

    # Only one file is accepted (multiple=False), so take the first entry
    # regardless of whether the container is a dict (7.x) or a tuple (8.x).
    if isinstance(uploaded, dict):
        first_file = next(iter(uploaded.values()))
    else:
        first_file = uploaded[0]

    # bytes() is a no-op copy for bytes and materialises a memoryview, which
    # has no .decode() of its own.
    raw = bytes(first_file['content'])

    try:
        # 'utf-8-sig' reads plain UTF-8 too, and strips a BOM when present.
        content = raw.decode('utf-8-sig')
    except UnicodeDecodeError as err:
        # Surface the problem in the GUI instead of failing inside a callback.
        with output_area:
            print(f"Could not read the uploaded file as UTF-8 text: {err}")
        return

    text_area.value = content
    # Earlier results belong to the previous text: hide them and block download
    # until the new text has been processed.
    output_area.clear_output()
    download_button.disabled = True

def on_process(_):
    """Extract MWUs from the text area, list them, and update download state.

    Click handler for ``process_button``. The result of ``extract_mwus`` is
    stored on ``download_button.mwus`` for ``on_download`` to pick up, and the
    download button is enabled only when that result is non-empty. Updating
    the button on every run (not just successful ones) prevents a run with no
    matches from leaving an earlier run's results downloadable.

    Args:
        _: The clicked button (unused).
    """
    output_area.clear_output()
    mwus = extract_mwus(text_area.value)

    # Keep the download button in step with *this* run's result.
    download_button.mwus = mwus
    download_button.disabled = not mwus

    with output_area:
        if not mwus:
            print("No MWUs found.")
            return
        print("Extracted MWUs:")
        for phrase, pattern in mwus:
            print(f" • {phrase}  ({pattern})")

def on_download(_):
    """Write the stored MWUs to an XML file and hand it to the browser.

    Click handler for ``download_button``. Reads the results that
    ``on_process`` stored on ``download_button.mwus``, serialises them with
    ``make_xml``, writes the bytes to ``mwus_output.xml`` in the working
    directory, and triggers a Colab download of that file.

    Args:
        _: The clicked button (unused).
    """
    out_name = "mwus_output.xml"
    payload = make_xml(download_button.mwus)
    # make_xml output is written unmodified in binary mode.
    with open(out_name, "wb") as handle:
        handle.write(payload)
    files.download(out_name)

# Connect each control to its callback
process_button.on_click(on_process)
download_button.on_click(on_download)
upload_widget.observe(on_upload, names='value')

# Render the interface top to bottom: upload, editable text, action buttons,
# then the area where results are printed
display(
    widgets.VBox(children=[
        upload_widget,
        text_area,
        widgets.HBox(children=[process_button, download_button]),
        output_area,
    ])
)
