<a href="https://colab.research.google.com/github/anokhina-rgb/Multilingual-Corpus-for-EU-Studies/blob/main/MWU_Extractor_Translator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Step 1: Install required packages
# `-q` keeps pip's output quiet. spaCy is used by the cells below;
# NOTE(review): lxml is installed here but is not imported in the cells
# visible in this notebook (the XML is built with xml.etree) -- confirm it is needed.
!pip install -q spacy lxml
# Fetch the small English spaCy model that is loaded by name in Step 2.
!python -m spacy download en_core_web_sm


In [None]:
# Step 2: Import libraries and define helper functions
import spacy
import requests
import xml.etree.ElementTree as ET
from google.colab import files
import io

# Load the English spaCy pipeline once at module level; extract_mwus_and_sentences()
# reads this global, so this cell must run before any extraction.
nlp = spacy.load("en_core_web_sm")

def get_translation_mymemory(text, langpair='en|uk', timeout=10):
    """Translate ``text`` through the public MyMemory REST endpoint.

    Args:
        text: Phrase to translate.
        langpair: Language pair sent as the ``langpair`` query parameter, in
            ``"source|target"`` form. Defaults to ``'en|uk'``, the value that
            was previously hard-coded.
        timeout: Seconds to wait for the HTTP request. Without a timeout
            ``requests`` can block forever on a stalled connection, which
            would freeze the whole notebook cell.

    Returns:
        The ``responseData.translatedText`` value of the JSON reply; an empty
        string when ``text`` is empty or only whitespace (no request is made);
        or the placeholder ``"(error: <details>)"`` when the request fails,
        the server answers with an HTTP error status, or the reply does not
        have the expected shape. Failures are reported in-band on purpose so
        that one bad phrase does not abort a long batch.
    """
    # Nothing to translate: skip the network round-trip entirely.
    if not text or not text.strip():
        return ""

    try:
        response = requests.get(
            "https://api.mymemory.translated.net/get",
            params={'q': text, 'langpair': langpair},
            timeout=timeout,
        )
        # Surface 4xx/5xx answers as errors instead of trying to parse them.
        response.raise_for_status()
        data = response.json()
        return data['responseData']['translatedText']
    except (requests.RequestException, ValueError, KeyError, TypeError) as e:
        # RequestException: network/HTTP failure; ValueError: body is not JSON;
        # KeyError/TypeError: JSON is missing the expected fields.
        return f"(error: {e})"

def extract_mwus_and_sentences(text, pipeline=None):
    """Collect multi-word noun chunks together with the sentence they occur in.

    Args:
        text: Raw text to analyse.
        pipeline: Optional callable that turns ``text`` into a document
            exposing ``.sents``, where each sentence exposes ``.text`` and
            ``.noun_chunks``. When omitted, the module-level ``nlp`` pipeline
            is used.

    Returns:
        A list of ``(mwu, sentence)`` tuples, both whitespace-stripped. A noun
        chunk qualifies when its text splits into more than one
        whitespace-separated word. Within a sentence each distinct chunk text
        appears once, in order of first occurrence, so the result is the same
        on every run.
    """
    if pipeline is None:
        pipeline = nlp

    doc = pipeline(text)
    results = []

    for sent in doc.sents:
        sentence = sent.text.strip()
        # dict.fromkeys de-duplicates while keeping first-seen order; a set
        # would make the order depend on string hashing, which changes from
        # one interpreter session to the next.
        mwus = dict.fromkeys(
            chunk.text.strip()
            for chunk in sent.noun_chunks
            if len(chunk.text.split()) > 1
        )
        results.extend((mwu, sentence) for mwu in mwus)

    return results

def build_xml(entries, translate=None):
    """Build a ``<dictionary>`` XML tree from ``(mwu, sentence)`` pairs.

    Each pair becomes one ``<entry>`` with three children: ``<headword>``
    (the MWU), ``<definition>`` (its translation) and ``<illustration>``
    (the sentence it was found in).

    Args:
        entries: Iterable of ``(mwu, sentence)`` tuples.
        translate: Optional callable mapping an MWU string to its translation.
            When omitted, ``get_translation_mymemory`` is used.

    Returns:
        An ``xml.etree.ElementTree.ElementTree`` rooted at ``<dictionary>``.
    """
    if translate is None:
        translate = get_translation_mymemory

    # The same MWU often occurs in several sentences; remember its translation
    # so each distinct headword is translated only once per build.
    translations = {}

    root = ET.Element("dictionary")
    for mwu, sentence in entries:
        if mwu not in translations:
            translations[mwu] = translate(mwu)

        entry = ET.SubElement(root, "entry")
        ET.SubElement(entry, "headword").text = mwu
        ET.SubElement(entry, "definition").text = translations[mwu]
        ET.SubElement(entry, "illustration").text = sentence

    return ET.ElementTree(root)


In [None]:
# Step 3: Upload your .txt file
uploaded = files.upload()
if not uploaded:
    # Without this guard, next(iter(...)) below would fail with a bare
    # StopIteration that gives the user no hint about what went wrong.
    raise ValueError("No file was uploaded; re-run this cell and choose a .txt file.")

# Only the first uploaded file is processed.
filename = next(iter(uploaded))
# 'utf-8-sig' decodes plain UTF-8 and also drops a leading byte-order mark,
# which some editors write and which would otherwise end up inside the text.
text = uploaded[filename].decode('utf-8-sig')

# Extract MWUs and build XML
entries = extract_mwus_and_sentences(text)
xml_tree = build_xml(entries)

# Save to XML file
output_name = "mwu_dictionary.xml"
xml_tree.write(output_name, encoding='utf-8', xml_declaration=True)
print("✅ XML created:", output_name)


In [None]:
# Step 4: Download the generated XML file
# `output_name` is assigned in the Step 3 cell, so that cell has to run first.
files.download(output_name)
