<a href="https://colab.research.google.com/github/anokhina-rgb/Multilingual-Corpus-for-EU-Studies/blob/main/Untitled24.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import json

# Skeleton of an empty nbformat-4 notebook: Colab provenance, a Python 3
# kernelspec, and no cells yet.
notebook_content = dict(
    nbformat=4,
    nbformat_minor=0,
    metadata=dict(
        colab=dict(provenance=[]),
        kernelspec=dict(name="python3", display_name="Python 3"),
        language_info=dict(name="python"),
    ),
    cells=[
        # Add your cells here
    ],
)

# Save the notebook as a .ipynb file (serialised as a single JSON document)
with open('your_notebook.ipynb', 'w') as f:
    json.dump(notebook_content, f)


In [7]:
# Cell 1 - Install and import necessary packages
!pip install nltk ipywidgets --quiet

import nltk
import re
from collections import Counter, defaultdict
from ipywidgets import FileUpload, Button, Output, VBox, Layout
from IPython.display import display, clear_output
import xml.etree.ElementTree as ET
from xml.dom import minidom

# Fetch the sentence-tokenizer models used by `sent_tokenize`.
# Older NLTK releases read the pickled 'punkt' package, while NLTK >= 3.8.2
# looks for 'punkt_tab' instead; requesting both keeps the notebook working
# whichever NLTK version the runtime ships. `nltk.download` reports failure
# via its return value rather than raising, so a resource the installed
# version does not use is harmless.
for _punkt_resource in ('punkt', 'punkt_tab'):
    nltk.download(_punkt_resource)

# Cell 2 - Define sentence splitting functions

def split_sentences_en(text):
    """Split English ``text`` into sentences using NLTK's ``sent_tokenize``.

    Args:
        text: The English text to segment.

    Returns:
        Whatever ``sent_tokenize(text, language='english')`` returns
        (a list of sentence strings).
    """
    # Imported lazily so that merely defining this function does not need NLTK.
    from nltk import tokenize as nltk_tokenize

    english_sentences = nltk_tokenize.sent_tokenize(text, language='english')
    return english_sentences

def split_sentences_uk(text):
    """Split Ukrainian ``text`` into sentences with a punctuation heuristic.

    A boundary is any run of whitespace that directly follows ``.``, ``!`` or
    ``?``; the punctuation mark stays attached to the sentence it ends.

    Args:
        text: The text to segment.

    Returns:
        A list of stripped, non-empty sentence strings (empty for blank input).
    """
    chunks = re.split(r'(?<=[.!?])\s+', text.strip())
    return [piece for piece in map(str.strip, chunks) if piece]

# Cell 3 - Define function to extract n-grams (2-4) from English text

def extract_ngrams(sentences, min_n=2, max_n=4, min_freq=2):
    """Count word n-grams inside each sentence and return the frequent ones.

    Tokens are produced by lower-casing each sentence and splitting on
    whitespace, so punctuation stays attached to its word. N-grams are built
    per sentence: a candidate never spans the end of one sentence and the
    start of the next, because such a sequence is not a unit of the text.

    Args:
        sentences: Iterable of sentence strings.
        min_n: Smallest n-gram length to extract (inclusive).
        max_n: Largest n-gram length to extract (inclusive).
        min_freq: Minimum total occurrence count for an n-gram to be kept.

    Returns:
        A list of ``(ngram_tuple, count)`` pairs sorted by count, highest
        first. Ties keep first-seen order (shorter n first, then text order).
    """
    # Tokenize every sentence once; the token lists are reused for each n.
    tokenized = [sent.lower().split() for sent in sentences]

    ngram_counts = Counter()
    for n in range(min_n, max_n + 1):
        for words in tokenized:
            # range() is empty when the sentence has fewer than n tokens.
            for i in range(len(words) - n + 1):
                ngram_counts[tuple(words[i:i + n])] += 1

    # Filter by min frequency
    frequent = [(ngram, count) for ngram, count in ngram_counts.items() if count >= min_freq]
    # Sort by frequency descending (list.sort is stable, so ties keep insertion order)
    frequent.sort(key=lambda item: item[1], reverse=True)
    return frequent

# Cell 4 - Align sentences roughly 1-to-1

def align_sentences(en_sents, uk_sents):
    """Pair English and Ukrainian sentences by position.

    The i-th English sentence is paired with the i-th Ukrainian sentence;
    surplus sentences in the longer sequence are dropped.

    Args:
        en_sents: Sequence of English sentences.
        uk_sents: Sequence of Ukrainian sentences.

    Returns:
        A list of ``(english_sentence, ukrainian_sentence)`` tuples.
    """
    pair_count = min(len(en_sents), len(uk_sents))
    return [(en_sents[idx], uk_sents[idx]) for idx in range(pair_count)]

# Cell 5 - Build MWU dictionary aligned with Ukrainian

def build_mwu_dict(ngrams, aligned_sents):
    """Map each English n-gram to the Ukrainian sentences aligned with its occurrences.

    An n-gram is considered present in an English sentence when its words
    occur there consecutively as whole words, compared in lower case:

    * whitespace in the sentence is collapsed first, so an n-gram whose words
      are separated by a line break or several spaces is still found;
    * the match must not start or end in the middle of a word, so
      ``"the eu"`` does not match ``"the european"``, while
      ``"european union"`` still matches ``"european union,"``.

    Args:
        ngrams: Iterable of ``(ngram_tuple, frequency)`` pairs; the tuple
            items are joined with single spaces. The frequency is not used.
        aligned_sents: Iterable of ``(english_sentence, ukrainian_sentence)``
            pairs.

    Returns:
        A dict ``{ngram_string: [ukrainian_sentence, ...]}``. Each list holds
        distinct sentences in the order they appear in ``aligned_sents``.
        N-grams without any matching sentence are omitted.
    """
    # Lower-case and whitespace-normalise every English sentence once instead
    # of once per n-gram.
    normalised_pairs = [
        (' '.join(en_sent.lower().split()), uk_sent)
        for en_sent, uk_sent in aligned_sents
    ]

    mwu_dict = {}
    for ngram, _freq in ngrams:
        ngram_str = ' '.join(ngram)
        # (?<!\w) / (?!\w): the n-gram may touch punctuation or whitespace,
        # but must not be glued to further word characters on either side.
        whole_phrase = re.compile(r'(?<!\w)' + re.escape(ngram_str) + r'(?!\w)')
        # A dict used as an ordered set: removes duplicates while keeping a
        # deterministic (corpus) order, unlike set -> list conversion.
        matched = mwu_dict.setdefault(ngram_str, {})
        for en_norm, uk_sent in normalised_pairs:
            if whole_phrase.search(en_norm):
                matched[uk_sent] = None

    # Drop n-grams that matched nothing and convert to lists for XML export
    return {phrase: list(found) for phrase, found in mwu_dict.items() if found}

# Cell 6 - Export dictionary as Lexonomy XML

def dict_to_lexonomy_xml(mwu_dict):
    """Serialise the MWU dictionary as a pretty-printed XML string.

    Layout produced: a ``<dictionary>`` root (with ``xmlns`` and ``version``
    attributes) holding one ``<entry>`` per multi-word unit. Each entry has a
    ``<headword>`` and a ``<senses>`` element with one
    ``<sense><definition>`` per Ukrainian sentence.

    Characters that XML 1.0 does not allow (most control characters, e.g. a
    form feed left over from text extraction) are removed from headwords and
    definitions. Left in place they would be written out raw and make the
    pretty-printing re-parse fail with an ``ExpatError``.

    Args:
        mwu_dict: Mapping ``{mwu_string: [ukrainian_sentence, ...]}``.

    Returns:
        The XML document as a string, indented by two spaces and starting
        with an XML declaration.
    """
    # Complement of the XML 1.0 "Char" production.
    illegal_xml_chars = re.compile(
        r'[^\x09\x0A\x0D\x20-\uD7FF\uE000-\uFFFD\U00010000-\U0010FFFF]'
    )

    def _xml_safe(text):
        # Strip characters that cannot appear in an XML 1.0 document.
        return illegal_xml_chars.sub('', text)

    # Lexonomy dictionary root
    root = ET.Element('dictionary', attrib={
        'xmlns': "https://www.lexonomy.eu",
        'version': "1.0"
    })

    # Iterate over MWUs
    for mwu, uk_sents in mwu_dict.items():
        entry = ET.SubElement(root, 'entry')

        # headword
        headword = ET.SubElement(entry, 'headword')
        headword.text = _xml_safe(mwu)

        # senses (one sense per uk sentence)
        senses = ET.SubElement(entry, 'senses')
        for uk_s in uk_sents:
            sense = ET.SubElement(senses, 'sense')
            definition = ET.SubElement(sense, 'definition')
            definition.text = _xml_safe(uk_s)

    # Pretty print XML string (re-parsed with minidom for indentation)
    xml_str = ET.tostring(root, encoding='unicode')
    parsed = minidom.parseString(xml_str)
    pretty_xml = parsed.toprettyxml(indent="  ")
    return pretty_xml

# Cell 7 - UI for uploading, processing and exporting

# One single-file .txt upload widget per language.
upload_en = FileUpload(
    accept='.txt',
    multiple=False,
    description='Upload English text',
)
upload_uk = FileUpload(
    accept='.txt',
    multiple=False,
    description='Upload Ukrainian text',
)

# Action buttons; export starts out disabled.
process_btn = Button(
    description='Process & Extract MWUs',
    button_style='success',
)
export_btn = Button(
    description='Export Lexonomy XML',
    button_style='info',
    disabled=True,
)

# Bordered area that receives the messages printed by the button callbacks.
output = Output(layout={'border': '1px solid black'})

# Holds the most recently built MWU dictionary so the export callback can read it.
mwu_dict_global = None

def _read_uploaded_text(upload_widget):
    """Return ``(filename, text)`` for the first file held by a FileUpload widget.

    Handles both shapes of ``FileUpload.value``:

    * ipywidgets 7.x: a dict ``{filename: {'content': bytes, ...}}``;
    * ipywidgets 8.x: a tuple of dicts, each with ``'name'`` and ``'content'``
      keys, where the content is a ``memoryview``.

    Raises:
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    uploaded = upload_widget.value
    if isinstance(uploaded, dict):
        filename, file_info = next(iter(uploaded.items()))
    else:
        file_info = uploaded[0]
        filename = file_info['name']
    # bytes() accepts both bytes and memoryview; 'utf-8-sig' also drops a
    # leading BOM so it cannot end up glued to the first word of the text.
    return filename, bytes(file_info['content']).decode('utf-8-sig')


def process_files(b):
    """Button callback: read both uploads and build the MWU dictionary.

    Steps: validate that both files were uploaded, decode them as UTF-8,
    split each text into sentences, align the sentences by position, extract
    frequent English n-grams, and map them to Ukrainian sentences. Progress
    is printed into ``output`` and the result is stored in the module-level
    ``mwu_dict_global``. The export button is enabled only when the resulting
    dictionary has at least one entry.

    Args:
        b: The clicked button (passed by ipywidgets; unused).
    """
    global mwu_dict_global
    with output:
        clear_output()
        # Validate uploads
        if not upload_en.value or not upload_uk.value:
            print("Please upload both English and Ukrainian text files.")
            return

        # Read uploaded files content
        try:
            en_filename, en_text = _read_uploaded_text(upload_en)
            uk_filename, uk_text = _read_uploaded_text(upload_uk)
        except UnicodeDecodeError as err:
            # Report the problem in the output area instead of leaving an
            # unhandled exception in the widget callback.
            print(f"Could not decode an uploaded file as UTF-8: {err}")
            print("Please re-save the file with UTF-8 encoding and upload it again.")
            return

        print(f"Loaded English file '{en_filename}' ({len(en_text)} chars)")
        print(f"Loaded Ukrainian file '{uk_filename}' ({len(uk_text)} chars)")

        # Split sentences
        en_sents = split_sentences_en(en_text)
        uk_sents = split_sentences_uk(uk_text)
        print(f"\nEnglish sentences: {len(en_sents)}")
        print(f"Ukrainian sentences: {len(uk_sents)}")

        # Align sentences
        aligned = align_sentences(en_sents, uk_sents)
        print(f"Aligned sentence pairs: {len(aligned)}")

        # Extract n-grams (MWUs)
        print("\nExtracting English n-grams (2-4 words) with frequency >= 2 ...")
        ngrams = extract_ngrams(en_sents, 2, 4, min_freq=2)
        print(f"Found {len(ngrams)} MWU candidates")

        # Build MWU dictionary aligned with Ukrainian
        print("\nBuilding MWU dictionary aligned with Ukrainian sentences ...")
        mwu_dict_global = build_mwu_dict(ngrams, aligned)
        print(f"Dictionary entries: {len(mwu_dict_global)}")

        # Offer the export only when there is something to export.
        export_btn.disabled = not mwu_dict_global

def export_xml(b):
    """Button callback: write the current MWU dictionary to an XML file.

    Clears ``output`` and then either reports that there is nothing to
    export (``mwu_dict_global`` is ``None`` or empty), or converts the
    dictionary with ``dict_to_lexonomy_xml`` and saves the result as
    ``mwu_dictionary.xml`` (UTF-8) in the current directory.

    Args:
        b: The clicked button (passed by ipywidgets; unused).
    """
    filename = "mwu_dictionary.xml"
    with output:
        clear_output()
        if mwu_dict_global:
            lexonomy_xml = dict_to_lexonomy_xml(mwu_dict_global)
            print("Lexonomy XML generated successfully.\n")

            # Persist the document next to the notebook.
            with open(filename, 'w', encoding='utf-8') as xml_file:
                xml_file.write(lexonomy_xml)
            print(f"XML saved to file '{filename}' in the current directory.")
        else:
            print("No dictionary to export. Please process files first.")

# Hook each button up to its callback.
process_btn.on_click(process_files)
export_btn.on_click(export_xml)

# Stack the controls vertically: uploads, buttons, then the message area.
upload_box = VBox([
    upload_en,
    upload_uk,
    process_btn,
    export_btn,
    output,
])

display(upload_box)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


VBox(children=(FileUpload(value={}, accept='.txt', description='Upload English text'), FileUpload(value={}, ac…

In [4]:
import json

# NOTE(review): this cell repeats the notebook-skeleton cell at the top of
# this notebook line for line; one of the two can probably be deleted.

# Empty nbformat-4 notebook: metadata only, no cells yet.
notebook_content = {
    "nbformat": 4,
    "nbformat_minor": 0,
    "metadata": {
        "colab": {"provenance": []},
        "kernelspec": {"name": "python3", "display_name": "Python 3"},
        "language_info": {"name": "python"},
    },
    "cells": [
        # Add your cells here
    ],
}

# Save the notebook as a .ipynb file
with open('your_notebook.ipynb', 'w') as f:
    f.write(json.dumps(notebook_content))
