In [2]:
from gensim.models.keyedvectors import KeyedVectors
import estnltk
import pandas as pd
import numpy as np
from tqdm import tqdm
import itertools
from sklearn.metrics.pairwise import cosine_similarity, cosine_distances
import operator
import matplotlib.pyplot as plt

In [6]:
# Load two pre-trained embedding sets from files in the binary word2vec format
# (binary=True) under the relative directory 'word2vec-models/'.
# NOTE(review): judging by the file names these are lemma-based CBOW and
# skip-gram models with 100-dimensional vectors -- confirm against the model source.
# NOTE(review): code further down indexes a global named `model`; neither line
# here binds that name, so it has to be assigned elsewhere before that code runs.
model_cbow = KeyedVectors.load_word2vec_format('word2vec-models/lemmas.cbow.s100.w2v.bin', binary=True)
model_sg = KeyedVectors.load_word2vec_format('word2vec-models/lemmas.sg.s100.w2v.bin', binary=True)

In [None]:
def sentence_to_contexts(sentence, symmetric, window_size, target_lemma='tee'):
    """Collect the context window around each occurrence of a lemma in a sentence.

    The sentence is analysed with ``estnltk.Text``. Tokens whose POS tag is
    'Z' or 'J' are removed before the windows are cut, so they neither fill
    window slots nor appear in a context (presumably punctuation and
    conjunctions in estnltk's tag set -- confirm). For every remaining token
    whose lemma equals ``target_lemma``, up to ``window_size`` tokens on each
    side are joined into a lower-cased, space-separated string: once from the
    surface forms and once from the lemmas. The target token itself is left out.

    A context is kept only if the module-level ``model`` can be indexed with
    the list of its lemmas. A ``KeyError`` from that lookup is reported with
    ``print`` and the context is dropped.

    NOTE(review): ``model`` is read as a global and is not bound by this
    function; it has to be assigned elsewhere in the notebook before calling.

    Parameters
    ----------
    sentence : str
        Raw sentence text handed to ``estnltk.Text``.
    symmetric : bool
        If true, keep only occurrences that have exactly ``window_size`` words
        on BOTH sides; windows clipped by the start or the end of the sentence
        are skipped.
    window_size : int
        Maximum number of tokens taken from each side of the target token.
    target_lemma : str, optional
        Lemma whose occurrences anchor the windows. Defaults to ``'tee'``.

    Returns
    -------
    list of dict
        One ``{'word_context': str, 'lemma_context': str}`` entry per kept
        occurrence, in sentence order.
    """
    analysed = estnltk.Text(sentence)
    tokens = analysed.get.word_texts.lemmas.postags.postag_descriptions.as_dataframe
    # Renumber the surviving rows 0..n-1 so row labels equal positions;
    # drop=True avoids carrying the old index along as an extra column.
    tokens = tokens[(tokens.postags != 'Z') & (tokens.postags != 'J')].reset_index(drop=True)

    results = []
    for position in tokens.loc[tokens.lemmas == target_lemma].index:
        # Clamp the left edge at the start of the sentence; the right edge is
        # clipped automatically by slicing.
        before = slice(max(position - window_size, 0), position)
        after = slice(position + 1, position + window_size + 1)

        left_context = " ".join(tokens.word_texts.iloc[before])
        right_context = " ".join(tokens.word_texts.iloc[after])
        word_context = "{} {}".format(left_context, right_context).strip().lower()

        lemma_left_context = " ".join(tokens.lemmas.iloc[before])
        lemma_right_context = " ".join(tokens.lemmas.iloc[after])
        lemma_context = "{} {}".format(lemma_left_context, lemma_right_context).strip().lower()

        # A symmetric window needs a full left side AND a full right side.
        # (The earlier version compared the left side twice, so windows cut
        # short by the end of the sentence slipped through.)
        if symmetric and (len(left_context.split()) != window_size
                          or len(right_context.split()) != window_size):
            continue

        context_lemmas = lemma_context.split()
        if not context_lemmas:
            # No neighbours at all (e.g. a one-token sentence or window_size 0):
            # skip rather than hand an empty token list to the model lookup.
            continue

        try:
            # The lookup is only a vocabulary check; its result is not used.
            model[context_lemmas]
        except KeyError:
            print('no key for', context_lemmas)
        else:
            results.append({'word_context': word_context, 'lemma_context': lemma_context})

    return results


def sentences_to_contexts(sentences, symmetric, window_size):
    """Run ``sentence_to_contexts`` over many sentences and pool the results.

    Parameters
    ----------
    sentences : iterable of str
        Sentences to process, in order.
    symmetric, window_size
        Passed through unchanged (by keyword) to ``sentence_to_contexts``.

    Returns
    -------
    dict
        ``{'word_contexts': [...], 'lemma_contexts': [...]}`` -- two parallel
        lists holding the ``'word_context'`` and ``'lemma_context'`` values of
        every context found, in sentence order.
    """
    # Flatten the per-sentence result lists into one sequence of context dicts.
    found = [
        entry
        for text in sentences
        for entry in sentence_to_contexts(text, symmetric=symmetric, window_size=window_size)
    ]
    return {
        'word_contexts': [entry['word_context'] for entry in found],
        'lemma_contexts': [entry['lemma_context'] for entry in found],
    }

def generate_datasets(symmetric, window_size):
    """Build one combined context dataset from the two global sentence sets.

    Extracts contexts from the module-level ``tee_soidu`` and ``tee_jook``
    collections (in that order) via ``sentences_to_contexts`` and concatenates
    them, ``tee_soidu`` entries first. Progress counts are printed along the way.

    NOTE(review): ``tee_soidu`` and ``tee_jook`` are read as globals and are
    not defined in this function; they must be bound before it is called.

    Parameters
    ----------
    symmetric, window_size
        Passed through unchanged (by keyword) to ``sentences_to_contexts``.

    Returns
    -------
    dict
        ``{'tee_word_contexts': [...], 'tee_lemma_contexts': [...]}`` with the
        pooled word and lemma contexts of both sets.
    """
    soidu = sentences_to_contexts(tee_soidu, symmetric=symmetric, window_size=window_size)
    jook = sentences_to_contexts(tee_jook, symmetric=symmetric, window_size=window_size)

    soidu_lemmas = soidu['lemma_contexts']
    print('tee_soidu_lemma_contexts', len(soidu_lemmas))
    soidu_words = soidu['word_contexts']

    jook_lemmas = jook['lemma_contexts']
    print('tee_jook_lemma_contexts', len(jook_lemmas))
    jook_words = jook['word_contexts']

    # Fresh lists, so the per-set results are never mutated.
    pooled_words = [*soidu_words, *jook_words]

    pooled_lemmas = list(soidu_lemmas)
    print(len(pooled_lemmas))      # size before the second set is appended
    pooled_lemmas.extend(jook_lemmas)
    print(len(pooled_lemmas))      # size after pooling both sets

    return {'tee_word_contexts': pooled_words, 'tee_lemma_contexts': pooled_lemmas}