In [3]:
%matplotlib inline
from gensim.models import KeyedVectors
import estnltk
import pandas as pd
import numpy as np
from tqdm import tqdm
import itertools
from sklearn.metrics.pairwise import cosine_similarity, cosine_distances, linear_kernel, euclidean_distances
import operator
from joblib import Parallel, delayed
from sklearn.feature_extraction.text import TfidfVectorizer
import numba
import glob
# import pyemd
import seaborn as sns
import matplotlib.pyplot as plt
import logging


In [13]:
logger = logging.getLogger('generate_wmd')
logger.setLevel(logging.DEBUG)
# create file handler which logs even debug messages
fh = logging.FileHandler('sg_wmd.log')
fh.setLevel(logging.DEBUG)
# create console handler with a higher log level
ch = logging.StreamHandler()
ch.setLevel(logging.ERROR)
# create formatter and add it to the handlers
formatter = logging.Formatter('%(asctime)s - %(message)s')
fh.setFormatter(formatter)
ch.setFormatter(formatter)
# add the handlers to the logger
logger.addHandler(fh)
logger.addHandler(ch)

def log(txt):
    print(txt)
    logger.info(txt)

In [7]:
model = KeyedVectors.load_word2vec_format('word2vec-models/lemmas.cbow.s100.w2v.bin', binary=True)

In [15]:
model = 123

In [17]:
model_sg = KeyedVectors.load_word2vec_format('word2vec-models/lemmas.sg.s100.w2v.bin', binary=True)

In [6]:
contexts1 = open('../datasets/tee_sõidu_contexts_s_True_w_3.txt').read().splitlines()
contexts2 = open('../datasets/tee_jook_contexts_s_True_w_3.txt').read().splitlines()
contexts = contexts1 + contexts2
true_labels = len(contexts1)*[0] + len(contexts2)*[1]

In [18]:
def alignement_matrix_row_dist(s1, s1_index, contexts, row_length):
    row = np.zeros(row_length)
    for j in range(s1_index+1):
        s2 = contexts[j]
        row[j] = model_sg.wmdistance(s1.split(), s2.split())
    return row

In [9]:
model

<gensim.models.keyedvectors.KeyedVectors at 0x7f4fa5d3eb00>

In [11]:
for word1, word2 in words:
    print((word1, word2))
    for window in [2]:
        for symmetric in [False]:
            print((window, symmetric))
            with open('../datasets/contexts/{}_s_{}_w_{}.txt'.format(word1, symmetric, window)) as f:
                contexts1 = f.read().splitlines()
            with open('../datasets/contexts/{}_s_{}_w_{}.txt'.format(word2, symmetric, window)) as f:
                contexts2 = f.read().splitlines()
            contexts = contexts1 + contexts2
            print(len(contexts))

('joogitee', 'sõidutee')
(2, False)
1954
('õun', 'banaan')
(2, False)
3958
('õun', 'puder')
(2, False)
3602
('õun', 'kivi')
(2, False)
5568
('hea', 'halb')
(2, False)
20000
('countries', 'cities')
(2, False)
20000
('Eesti', 'TallinnTartu')
(2, False)
20000
('ämber', 'pang')
(2, False)
926


In [None]:
# print(window, symmetric, func, name)
# apple_contexts = open('../datasets/apple_contexts_s_{}_w_{}.txt'.format(symmetric, window)).read().splitlines()
# rock_contexts = open('../datasets/rock_contexts_s_{}_w_{}.txt'.format(symmetric, window)).read().splitlines()
# pear_contexts = open('../datasets/pear_contexts_s_{}_w_{}.txt'.format(symmetric, window)).read().splitlines()
# contexts = apple_contexts + rock_contexts + pear_contexts
# labels = [0]*len(apple_contexts) + [1]*len(rock_contexts) + [2]*len(pear_contexts)

#23:55

words = [('joogitee', 'sõidutee'),
        ('ämber', 'pang'),
        ('õun', 'banaan'),
        ('õun', 'puder'),
        ('õun', 'kivi'),
        ('hea', 'halb'),
        ('countries', 'cities'),
        ('Eesti', 'TallinnTartu')]

for word1, word2 in words:
    log((word1, word2))
    for window in [2,3,4]:
        for symmetric in [True, False]:
            log((window, symmetric))
            with open('datasets/contexts/{}_s_{}_w_{}.txt'.format(word1, symmetric, window)) as f:
                contexts1 = f.read().splitlines()
            with open('datasets/contexts/{}_s_{}_w_{}.txt'.format(word2, symmetric, window)) as f:
                contexts2 = f.read().splitlines()
            contexts = contexts1 + contexts2

            n = len(contexts)

            distance_matrix_rows = Parallel(n_jobs=8)(delayed(alignement_matrix_row_dist)(s1, s1_index, contexts, n) 
                                                                    for s1_index, s1 in tqdm(enumerate(contexts)))
            distance_matrix = np.array(distance_matrix_rows)

            distance_matrix_partial = np.array(distance_matrix_rows)
            distance_matrix = distance_matrix_partial + distance_matrix_partial.T
            
            filename = 'datasets/sg/wmd/{}_{}_w_{}_s_{}.npy'.format(word1, word2, window, symmetric)
            log(filename)
            np.save(filename, distance_matrix)

INFO:generate_wmd:('joogitee', 'sõidutee')
INFO:generate_wmd:(2, True)

0it [00:00, ?it/s]

('joogitee', 'sõidutee')
(2, True)


[A
1it [00:01,  1.69s/it][A
624it [00:03,  1.19s/it][A
662it [00:05,  1.18it/s][A
681it [00:07,  1.61it/s][A
700it [00:09,  2.15it/s][A
719it [00:10,  2.87it/s][A
738it [00:12,  3.68it/s][A
757it [00:14,  4.46it/s][A
776it [00:16,  5.24it/s][A
795it [00:17,  6.98it/s][A
814it [00:23,  4.86it/s][A
833it [00:24,  6.49it/s][A
852it [00:31,  4.69it/s][A
871it [00:31,  6.26it/s][A
890it [00:35,  5.98it/s][A
909it [00:39,  5.57it/s][A
918it [00:39,  7.17it/s][A
927it [00:40,  7.75it/s][A
936it [00:41,  7.98it/s][A
945it [00:44,  5.60it/s][A
954it [00:47,  4.71it/s][A
963it [00:48,  4.97it/s][A
972it [00:52,  3.94it/s][A
981it [00:56,  3.13it/s][A
990it [00:57,  3.89it/s][A
999it [00:59,  4.22it/s][A
1008it [00:59,  5.20it/s][A
1017it [01:03,  3.81it/s][A
1026it [01:05,  3.93it/s][A
1030it [01:06,  5.17it/s][A
1034it [01:07,  4.65it/s][A
1038it [01:08,  3.47it/s][A
1046it [01:09,  4.41it/s][A
1050it [01:12,  2.53it/s][A
1054it [01:13,  2.74it/s][A
1058it [0

datasets/sg/wmd/joogitee_sõidutee_w_2_s_True.npy
(2, False)


[A
1it [00:01,  1.47s/it][A
436it [00:01,  1.03s/it][A
552it [00:03,  1.37it/s][A
635it [00:08,  1.90it/s][A
694it [00:13,  2.51it/s][A
736it [00:19,  3.14it/s][A
766it [00:22,  3.90it/s][A
787it [00:24,  4.98it/s][A
802it [00:28,  4.53it/s][A
813it [00:28,  5.93it/s][A
821it [00:29,  6.94it/s][A
827it [00:31,  5.49it/s][A
832it [00:32,  4.90it/s][A
837it [00:33,  4.72it/s][A
844it [00:33,  6.27it/s][A
851it [00:36,  4.14it/s][A
858it [00:38,  4.13it/s][A
865it [00:38,  5.47it/s][A
868it [00:39,  5.90it/s][A
871it [00:39,  6.24it/s][A
874it [00:40,  6.22it/s][A
877it [00:42,  3.26it/s][A
880it [00:42,  3.72it/s][A
886it [00:43,  4.34it/s][A
889it [00:44,  3.98it/s][A
892it [00:45,  4.37it/s][A
895it [00:45,  5.12it/s][A
898it [00:45,  5.14it/s][A
901it [00:46,  4.63it/s][A
902it [00:47,  2.90it/s][A
903it [00:47,  2.88it/s][A
904it [00:48,  3.15it/s][A
905it [00:48,  3.64it/s][A
906it [00:48,  2.77it/s][A
908it [00:49,  2.97it/s][A
909it [00:49,  3.7

TypeError: log() takes 1 positional argument but 2 were given

In [9]:
model.wmdistance(contexts[1].split(), contexts[0].split())

19.682990621389386