In [28]:
import string
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from tqdm import tqdm
from collections import defaultdict


def read_base_forms(filename, encoding='utf-8'):
    """Load a mapping from inflected word forms to their base forms.

    Every line of ``filename`` is split on ``;``.  Column 0 is taken as the
    base form and column 1 as the inflected form; any further columns are
    ignored.  Both values are lower-cased before being stored.

    Args:
        filename: path to the ``;``-separated dictionary file.
        encoding: text encoding used to read the file.  Defaults to UTF-8 so
            that the result does not depend on the platform's locale.

    Returns:
        ``defaultdict(set)`` mapping each lower-cased word form to the set of
        its lower-cased base forms.
    """
    base_forms = defaultdict(set)
    with open(filename, encoding=encoding) as file:
        for line in tqdm(file):
            # Drop the line terminator first; otherwise a line with exactly
            # two columns would keep a trailing newline on the word form.
            columns = line.rstrip('\r\n').split(';')
            if len(columns) < 2:
                # Blank or malformed line: there is no form to map.
                continue
            base_form = columns[0].lower()
            word = columns[1].lower()
            base_forms[word].add(base_form)
    return base_forms


def read_embeddings(filename, encoding='utf-8'):
    """Read whitespace-separated word vectors from a text file.

    Each non-blank line is split on whitespace: the first token is the word
    (stored lower-cased) and the remaining tokens are parsed as the
    components of a ``float32`` vector.  When several lines lower-case to the
    same word, the last one read wins.

    NOTE(review): a word2vec-style header line ("<count> <dim>") would be
    stored as an ordinary entry - confirm the input files have no header.

    Args:
        filename: path to the embeddings text file.
        encoding: text encoding used to read the file (UTF-8 by default, so
            the result does not depend on the platform's locale).

    Returns:
        dict mapping lower-cased word -> 1-D ``numpy.float32`` array.
    """
    embeddings = {}
    # The context manager guarantees the handle is closed even when a line
    # fails to parse; a bare open() in the loop header would leak it.
    with open(filename, encoding=encoding) as file:
        for line in tqdm(file):
            fields = line.split()
            if not fields:
                # Blank line (e.g. at the end of the file): nothing to parse.
                continue
            word = fields[0].lower()
            embeddings[word] = np.array(fields[1:], dtype=np.float32)
    return embeddings


def clear_text(text):
    """Tokenize ``text`` and return its lower-cased content words.

    The text is lower-cased and tokenized with ``word_tokenize``; every
    character from ``string.punctuation`` (plus the en dash) is then deleted
    from each token.  Tokens that end up empty or that are Polish NLTK
    stop words are dropped.

    Returns:
        list of the remaining tokens, in their original order.
    """
    tokens = word_tokenize(text.lower())

    punctuation_remover = str.maketrans('', '', string.punctuation + '–')
    polish_stopwords = set(nltk.corpus.stopwords.words('polish'))

    # Lazily strip punctuation, then keep only non-empty, non-stop-word tokens.
    stripped_tokens = (token.translate(punctuation_remover) for token in tokens)
    return [token for token in stripped_tokens
            if token and token not in polish_stopwords]


def read_synsets(synsets_filename):
    """Return the lines of ``synsets_filename``, whitespace-stripped, in file order."""
    synsets = []
    with open(synsets_filename) as synsets_file:
        for raw_line in synsets_file:
            synsets.append(raw_line.strip())
    return synsets


def read_definitions(filename):
    """Collect the cleaned definition tokens of every synset.

    Each line is stripped and split once on whitespace into a synset
    identifier and the rest of the line; the rest is passed through
    ``clear_text`` and the resulting tokens are appended to that synset's
    list.  Several lines for the same synset accumulate into one list.

    Returns:
        ``defaultdict(list)`` mapping synset identifier -> list of tokens.
    """
    definitions = defaultdict(list)
    with open(filename) as file:
        for raw_line in file:
            synset, definition = raw_line.strip().split(maxsplit=1)
            definitions[synset].extend(clear_text(definition))
    return definitions


def read_relations(filename, synsets):
    """Build a symmetric synset -> related synsets lookup.

    Each line must split into three whitespace-separated parts: two 1-based
    positions into ``synsets`` and a relation label (which may itself contain
    spaces and is not used).  Both synsets are recorded as related to each
    other.

    Returns:
        ``defaultdict(set)`` mapping synset -> set of related synsets.
    """
    neighbours = defaultdict(set)
    with open(filename) as file:
        for raw_line in file:
            first_id, second_id, _relation = raw_line.strip().split(maxsplit=2)
            # Identifiers in the file are 1-based; the list is 0-based.
            first = synsets[int(first_id) - 1]
            second = synsets[int(second_id) - 1]
            neighbours[first].add(second)
            neighbours[second].add(first)
    return neighbours
    
    
def create_lemma_synset_mappings(lemmas_filename, synsets_filename, lexicalunits_filename):
    """Build the lemma -> synsets and synset -> lemmas lookups.

    * ``lemmas_filename``: one lemma per line; only the text before the first
      comma is kept.
    * ``synsets_filename``: one synset identifier per line.
    * ``lexicalunits_filename``: two whitespace-separated 1-based numbers per
      line - a lemma position and a synset position.

    Returns:
        A pair ``(lemma_synsets_mapping, synset_lemmas_mapping)`` of
        ``defaultdict(list)`` objects.  The first maps a lemma to the 0-based
        *positions* of its synsets; the second maps a synset *identifier* to
        its lemmas.
    """
    with open(lemmas_filename) as lemmas_file:
        lemmas = [row.strip().split(',')[0] for row in lemmas_file]
    with open(synsets_filename) as synsets_file:
        synsets = [row.strip() for row in synsets_file]
    with open(lexicalunits_filename) as lexicalunits_file:
        lexicalunits = [row.strip().split() for row in lexicalunits_file]

    lemma_synsets_mapping = defaultdict(list)
    synset_lemmas_mapping = defaultdict(list)
    # A single pass fills both directions; the file uses 1-based positions.
    for lemma_id, synset_id in lexicalunits:
        lemma = lemmas[int(lemma_id) - 1]
        synset_index = int(synset_id) - 1
        lemma_synsets_mapping[lemma].append(synset_index)
        synset_lemmas_mapping[synsets[synset_index]].append(lemma)
    return lemma_synsets_mapping, synset_lemmas_mapping

    
def calculate_idf(definitions, embeddings, synset_lemmas_mapping):
    """Compute inverse document frequencies over the synset "documents".

    A document is a synset's definition tokens plus the lemmas looked up in
    ``synset_lemmas_mapping``.  Only words present in ``embeddings`` are
    counted, and each word counts at most once per document.

    Returns:
        dict mapping word -> ``log(N / df)`` where ``N`` is the number of
        entries in ``definitions`` and ``df`` the number of documents that
        contain the word.
    """
    document_frequency = defaultdict(int)
    n_documents = len(definitions)
    for synset, definition in tqdm(definitions.items()):
        document = definition + synset_lemmas_mapping[synset]
        # dict.fromkeys de-duplicates while keeping first-seen order.
        known_terms = dict.fromkeys(term for term in document if term in embeddings)
        for term in known_terms:
            document_frequency[term] += 1
    return {term: np.log(n_documents / count)
            for term, count in document_frequency.items()}

    
def calculate_lemma_idf(definitions, lemma_embeddings, synset_lemmas_mapping, base_forms):
    """Compute inverse document frequencies at the lemma level.

    Works like ``calculate_idf`` but every word of a document is first
    replaced by its base forms from ``base_forms`` (or by itself when it has
    no entry).  Only lemmas present in ``lemma_embeddings`` are counted, at
    most once per document.

    Returns:
        dict mapping lemma -> ``log(N / df)`` with ``N = len(definitions)``.
    """
    document_frequency = defaultdict(int)
    n_documents = len(definitions)
    for synset, definition in tqdm(definitions.items()):
        counted = set()
        for word in definition + synset_lemmas_mapping[synset]:
            candidates = base_forms.get(word, [word])
            for lemma in candidates:
                if lemma in lemma_embeddings and lemma not in counted:
                    counted.add(lemma)
                    document_frequency[lemma] += 1
    return {lemma: np.log(n_documents / count)
            for lemma, count in document_frequency.items()}

In [2]:
# Inflected form -> base forms lookup, read from the Polimorfologik dump.
# NOTE(review): this path starts with '../data' while the other inputs use 'data/' - confirm.
base_forms = read_base_forms('../data/polimorfologik/polimorfologik-2.1.txt')

4811854it [00:16, 297051.99it/s]


In [3]:
# Per-synset lists of cleaned tokens (clear_text output), keyed by the first field of each line.
definitions = read_definitions('data/synset_defs_examples.txt')

In [4]:
# Vectors from the "forms" embedding file, keyed by lower-cased token.
embeddings = read_embeddings('data/nkjp+wiki-forms-all-100-cbow-hs.txt')

2123133it [01:28, 24075.97it/s]


In [5]:
# Vectors from the "lemmas" embedding file; used as a fallback when a word has no form vector.
lemma_embeddings = read_embeddings('data/nkjp+wiki-lemmas-all-100-cbow-hs.txt')

1549323it [01:08, 22638.07it/s]


In [None]:
# Both lookup directions: lemma -> 0-based synset positions, and synset id -> lemmas.
lemma_synsets_mapping, synset_lemmas_mapping = create_lemma_synset_mappings('data/lemmas.txt', 
                                                                            'data/synsets.txt', 
                                                                            'data/lexicalunits.txt')

In [29]:
# IDF weights for word forms, computed over synset definitions plus lemmas.
idf = calculate_idf(definitions, embeddings, synset_lemmas_mapping)

100%|██████████| 346566/346566 [00:17<00:00, 19866.55it/s]


In [30]:
# IDF weights for lemmas (words are mapped through base_forms before counting).
lemma_idf = calculate_lemma_idf(definitions, lemma_embeddings, synset_lemmas_mapping, base_forms)

100%|██████████| 346566/346566 [00:31<00:00, 10873.62it/s]


In [8]:
# Ordered synset identifiers; list position is what the 1-based ids in the other files refer to.
synsets = read_synsets('data/synsets.txt')

In [33]:
# Symmetric synset -> related synsets lookup (the relation label itself is not kept).
relative_synsets = read_relations('data/synset_rels.txt', synsets)

In [38]:
def add_word_embedding(matrix, i, word, embeddings, lemma_embeddings, idf, lemma_idf, base_forms, multiplier=1.0):
    """Add the IDF-weighted vector of ``word`` to row ``i`` of ``matrix`` in place.

    If ``word`` has a vector in ``embeddings`` that vector (times
    ``idf[word]``) is used.  Otherwise each base form of the word from
    ``base_forms`` (or the word itself when it has no entry) that has a
    vector in ``lemma_embeddings`` contributes its vector times
    ``lemma_idf[lemma]``.  Every contribution is scaled by ``multiplier``.

    Raises:
        KeyError: when a vector is found but the matching IDF entry is missing.
    """
    if word in embeddings:
        matrix[i] += multiplier * embeddings[word] * idf[word]
        return

    # No form-level vector: fall back to the vectors of the word's lemmas.
    for lemma in base_forms.get(word, [word]):
        if lemma not in lemma_embeddings:
            continue
        matrix[i] += multiplier * lemma_embeddings[lemma] * lemma_idf[lemma]

In [56]:
from numpy import savez_compressed

# Build one context vector per synset and save the row-normalised matrix.
# NOTE: `counter` is only ever incremented inside the commented-out variant
# below, so with the active loops the final print always shows 0.
counter = 0
# One row per synset; the width is the length of an arbitrary embedding vector.
synsets_matrix = np.zeros((len(synsets), len(next(iter(embeddings.values())))))
for i, synset in enumerate(tqdm(synsets)):
    # Disabled earlier variant (kept as-is): it also added the definition
    # words of related synsets and counted KeyErrors instead of raising.
#     for s in [synset] + list(relative_synsets[synset]):
#         for word in definitions[s] + synset_lemmas_mapping[s]:
#             try:
#                 add_word_embedding(synsets_matrix, i, word, embeddings, lemma_embeddings, 
#                                    idf, lemma_idf, base_forms)
#             except KeyError:
#                 counter += 1
    # The synset's own definition tokens and lemmas ...
    for word in definitions[synset] + synset_lemmas_mapping[synset]:
        add_word_embedding(synsets_matrix, i, word, embeddings, lemma_embeddings, 
                           idf, lemma_idf, base_forms)
    # ... plus only the lemmas (not the definitions) of every related synset.
    for s in relative_synsets[synset]:
        for word in synset_lemmas_mapping[s]:
            add_word_embedding(synsets_matrix, i, word, embeddings, lemma_embeddings, 
                               idf, lemma_idf, base_forms)
print(counter)
# Scale every row to unit L2 length; rows that stayed all-zero get a tiny
# substitute norm so the division is defined (they remain all-zero).
synset_norms = np.linalg.norm(synsets_matrix, axis=1)[:, None]
synset_norms[synset_norms == 0] = 0.0000001
synsets_matrix = synsets_matrix / synset_norms
# The array is passed positionally, so it is stored under the default key 'arr_0'.
savez_compressed('synsets_matrix.npz', synsets_matrix)

100%|██████████| 346537/346537 [00:39<00:00, 8847.90it/s] 


0


In [44]:
def clear_text(text):
    """Return the content words of ``text`` together with their token positions.

    The text is lower-cased and split on whitespace.  Leading and trailing
    characters from ``string.punctuation`` (plus the en dash) are stripped
    from each token; tokens that become empty or that are Polish NLTK
    stop words are dropped.

    NOTE: this definition reuses the name of the earlier ``clear_text`` that
    returns a single list; once this cell has run, ``read_definitions`` would
    call this tuple-returning version instead.

    Returns:
        ``(important_words, important_word_ids)`` - the kept tokens and, for
        each, its index in the whitespace-split text.
    """
    raw_tokens = text.lower().split()
    strip_chars = string.punctuation + '–'
    polish_stopwords = set(nltk.corpus.stopwords.words('polish'))

    kept_words, kept_positions = [], []
    for position, raw_token in enumerate(raw_tokens):
        token = raw_token.strip(strip_chars)
        if not token or token in polish_stopwords:
            continue
        kept_words.append(token)
        kept_positions.append(position)
    return kept_words, kept_positions


def calculate_embedding(i, important_words, embeddings, lemma_embeddings, idf, lemma_idf, k=5):
    """Sum the IDF-weighted vectors of the words surrounding position ``i``.

    The context is up to ``k`` words before and up to ``k`` words after
    ``important_words[i]``; the word at ``i`` itself is excluded.  For each
    context word the form-level vector (``embeddings`` * ``idf``) is used
    when the word has one; otherwise the vectors of its base forms
    (``lemma_embeddings`` * ``lemma_idf``) are added.  Words or lemmas with
    no vector or no IDF weight are skipped.

    NOTE: base forms are looked up in the module-level ``base_forms`` - it is
    not a parameter of this function.

    Args:
        i: index of the target word within ``important_words``.
        important_words: list of content words of the text.
        embeddings, lemma_embeddings: word/lemma -> vector lookups.
        idf, lemma_idf: word/lemma -> weight lookups.
        k: number of context words taken on each side.

    Returns:
        1-D float array with the same length as the vectors in ``embeddings``
        (all zeros when no context word contributes).
    """
    start = max(i - k, 0)
    # Slice ends are exclusive, so "+ 1" is needed for the right-hand side to
    # hold k words just like the left-hand side does.
    end = i + k + 1
    context = important_words[start:i] + important_words[i + 1:end]
    embedding = np.zeros(len(next(iter(embeddings.values()))))
    for word in context:
        if word in embeddings:
            # A form vector without an IDF weight is skipped; there is
            # deliberately no fall-back to lemmas in that case.
            if word in idf:
                embedding += embeddings[word] * idf[word]
            continue
        for lemma in base_forms.get(word, [word]):
            # Check each lemma on its own so that one missing weight does not
            # discard the remaining lemmas of the word.
            if lemma in lemma_embeddings and lemma in lemma_idf:
                embedding += lemma_embeddings[lemma] * lemma_idf[lemma]
    return embedding


def disambiguate(text, lemma_synsets_mapping, synsets, base_forms, synsets_matrix, 
                 embeddings, lemma_embeddings, idf, lemma_idf, k=5):
    """Annotate the ambiguous words of ``text`` with their best-matching synset.

    The text is reduced to content words with ``clear_text``.  For every
    content word the candidate senses are the synset positions listed in
    ``lemma_synsets_mapping`` for each of its base forms (or for the word
    itself when ``base_forms`` has no entry).  When there is more than one
    candidate, a context vector is computed with ``calculate_embedding`` and
    ``find_best_sense`` picks the candidate to use.

    Args:
        text: the input text.
        lemma_synsets_mapping: lemma -> list of positions into ``synsets``.
        synsets: list of synset identifiers.
        base_forms: word form -> collection of base forms.
        synsets_matrix: matrix with one row per synset position.
        embeddings, lemma_embeddings, idf, lemma_idf, k: forwarded to
            ``calculate_embedding``.

    Returns:
        ``(annotated_text, used_synsets)`` - the whitespace-split tokens
        re-joined with single spaces, each disambiguated token suffixed with
        ``/<synset>``, and the set of synsets that were assigned.
    """
    annotated_tokens = text.split()
    important_words, important_word_indices = clear_text(text)
    disambiguation_results = {}
    # `position` indexes the filtered list of content words, `token_index`
    # the whitespace-split text.  The context window is cut out of the
    # filtered list, so it must be addressed with `position`.
    for position, (word, token_index) in enumerate(zip(important_words, important_word_indices)):
        senses = []
        for lemma in base_forms.get(word, [word]):
            # .get() avoids inserting an empty entry for every unknown lemma
            # when the mapping is a defaultdict.
            senses += lemma_synsets_mapping.get(lemma, [])
        if len(senses) > 1:
            embedding = calculate_embedding(position, important_words, embeddings, lemma_embeddings,
                                            idf, lemma_idf, k)
            best_sense = find_best_sense(synsets_matrix, senses, embedding)
            disambiguation_results[token_index] = synsets[best_sense]

    for token_index, synset in disambiguation_results.items():
        annotated_tokens[token_index] += '/{}'.format(synset)
    return ' '.join(annotated_tokens), set(disambiguation_results.values())

    
def find_best_sense(synsets_matrix, senses, embedding):
    """Return the entry of ``senses`` whose matrix row scores highest.

    ``senses`` holds row positions into ``synsets_matrix``; the selected rows
    are scored against ``embedding`` with ``cosine_similarity`` and the sense
    with the largest score is returned (the first one on ties).
    """
    candidate_rows = synsets_matrix[senses]
    similarities = cosine_similarity(candidate_rows, embedding)
    return senses[np.argmax(similarities)]


def cosine_similarity(matrix, embedding):
    """Score every row of ``matrix`` against ``embedding``.

    Each row's dot product with ``embedding`` is divided by the norm of
    ``embedding`` only; the rows themselves are not normalised here, so the
    result is a true cosine only when the rows already have unit length.

    A zero ``embedding`` (no direction) yields a score of 0 for every row
    instead of dividing by zero.

    Returns:
        flat 1-D array with one score per row of ``matrix``.
    """
    projections = matrix.dot(embedding.T)
    norm = np.linalg.norm(embedding)
    if norm == 0:
        # Dividing would give NaN for every row and a runtime warning.
        return np.zeros(projections.size)
    return (projections / norm).flatten()


def read_human_readable_definitions(filename):
    """Read the raw (uncleaned) definition lines of every synset.

    Each line is stripped and split once on whitespace into a synset
    identifier and the remaining text, which is stored unchanged.

    Returns:
        ``defaultdict(list)`` mapping synset identifier -> list of definition
        strings, in file order.
    """
    definitions = defaultdict(list)
    with open(filename) as file:
        for raw_line in file:
            fields = raw_line.strip().split(maxsplit=1)
            synset, definition = fields
            definitions[synset].append(definition)
    return definitions


def display_result(text, used_synsets, original_definitions, synset_lemmas_mapping, relative_synsets):
    """Print the annotated text followed by a summary of every synset used.

    For each synset in ``used_synsets`` the summary shows its identifier and
    lemmas, one ``- `` line per entry of ``original_definitions``, and the
    lemma groups of its related synsets, followed by a blank line.
    """
    print(text)
    print()
    for synset in used_synsets:
        lemmas = ', '.join(synset_lemmas_mapping[synset])
        print('{}({}):'.format(synset, lemmas))

        for definition in original_definitions[synset]:
            print('- {}'.format(definition))

        # One parenthesised, comma-separated lemma group per related synset.
        related_groups = []
        for related in relative_synsets[synset]:
            related_groups.append('({})'.format(', '.join(synset_lemmas_mapping[related])))
        print('Relative synsets: {}'.format(', '.join(related_groups)))
        print()

In [20]:
# Unprocessed definition strings per synset, used only for display.
original_definitions = read_human_readable_definitions('data/synset_defs_examples.txt')

In [25]:
from numpy import load, savez_compressed
# Reload the matrix written to 'synsets_matrix.npz'; it was saved as a
# positional array, hence the default key 'arr_0'.
synsets_matrix = load('synsets_matrix.npz')['arr_0']

In [58]:
from highlight import yellow_line

# Interactive session: prompt for a text, print its disambiguation, repeat.
# An empty input line ends the session.
while True:
    print('Wpisz tekst do zdezambiguowania:')
    x = input()
    if not x:
        break
    text, used_synsets = disambiguate(x, lemma_synsets_mapping, synsets, base_forms, synsets_matrix,
                                      embeddings, lemma_embeddings, idf, lemma_idf)
    display_result(text, used_synsets, original_definitions, synset_lemmas_mapping, relative_synsets)
    yellow_line()

Wpisz tekst do zdezambiguowania:
pociąg towarowy
pociąg/s7294 towarowy/s422837

s422837(towarowy):
- "taki, który służy do transportu towarów, jest tak skonstruowany, by móc uczestniczyć w transporcie towarów; np. dworzec **towarowy**, rampa **towarowa**."
Relative synsets: (budowa, struktura), (conveyance, transferral, transfer, transport, transportation), (functional), (ciężarowy), (cel, przeznaczenie)

s7294(kolej, pociąg):
- "środek lokomocji, połączone lokomotywa i wagony."
- "**pociąg**, lokomotywa i wagony."
- "Wyszła na dworzec po brata, który przyjechał do niej **koleją**."
Relative synsets: (pociąg szpitalny), (zespół trakcyjny), (pociąg pocztowy), (tender), (kibel), (ekspresowy, expressowy), (pojazd kolejowy), (lokomotywa), (pociąg towarowy), (wąskotorówka, kolejka wąskotorowa), (pociąg pasażerski), (metro), (pociąg szybki), (skomunikować się), (cug), (kolej ogumiona), (pociąg marszrutowy), (kolej żelazna, kolej, droga żelazna), (autobus szynowy, szynobus), (kolejka metra, p

Wszystkie nowoczesne przeglądarki pozwalają na włączenie bądź wyłączenie mechanizmu ciasteczek.
Wszystkie nowoczesne/s103589 przeglądarki/s26212 pozwalają/s2567 na włączenie/s58421 bądź/s250925 wyłączenie/s100072 mechanizmu/s45692 ciasteczek./s425556

s2567(pozwalać):
Relative synsets: (fotografować się, zdejmować się), (dopuszczać), (dopuszczać), (narażać), (dozwalać), (omamiać się), (móc, być w stanie, być w mocy), (zanieczyszczać), (clear, pass, authorise, authorize), (umożliwiać)

s58421(włączyć):
- "Na nowym osiedlu **włączono** wreszcie wodę i prąd."
- "zainicjować podłączenie jakiegoś obiektu, miejsca do mediów użytkowych, do elektryki."
Relative synsets: (przyłączyć), (zaświecić, zapalić), (join), (odkręcić kurek), (począć, jąć, zacząć, rozpocząć, wszcząć, zainicjować), (incorporate)

s100072(wyłączenie):
Relative synsets: (przerwanie), (włączenie), (cutoff)

s425556(ciasteczko):
- "**Ciasteczka** różnych rodzajów są stosowane najczęściej po logowaniu do utrzymywania sesji."
- 

KeyboardInterrupt: 