In [1]:

import numpy as np
from joblib import Parallel, delayed
from gensim.models.keyedvectors import KeyedVectors

# from numba import jit, autojit
from sklearn.manifold import MDS, TSNE
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity, cosine_distances, linear_kernel
from sklearn.feature_extraction.text import TfidfVectorizer

In [49]:
model = KeyedVectors.load_word2vec_format('word2vec-models/lemmas.cbow.s100.w2v.bin', binary=True)

In [8]:
def n_similarity(s1, s2):
    vec1 = np.mean(model[s1.split()], axis=0)
    vec2 = np.mean(model[s2.split()], axis=0)
    return cosine_similarity([vec1], [vec2])[0][0]

def n_distance(s1, s2):
    vec1 = np.mean(model[s1.split()], axis=0)
    vec2 = np.mean(model[s2.split()], axis=0)
    return cosine_distances([vec1], [vec2])[0][0]

def matrix_row_sim(s1, contexts, row_length):
    row = np.empty(row_length)
    for j, s2 in enumerate(contexts):
#         row[j] = model.n_similarity(s1.split(), s2.split())
        row[j] = n_similarity(s1, s2)
        
    return row

def matrix_row_dist(s1, contexts, row_length):
    row = np.empty(row_length)
    for j, s2 in enumerate(contexts):
        row[j] = n_distance(s1, s2)
    return row

In [None]:
#alustas 6:40
for window in [2,3,4]:
    for symmetric in [True, False]:
#         symmetric = True
#         window = 4
        print(window, symmetric)
        apple_contexts = open('../datasets/apple_contexts_s_{}_w_{}.txt'.format(symmetric, window)).read().splitlines()
        rock_contexts = open('../datasets/rock_contexts_s_{}_w_{}.txt'.format(symmetric, window)).read().splitlines()
        pear_contexts = open('../datasets/pear_contexts_s_{}_w_{}.txt'.format(symmetric, window)).read().splitlines()
        contexts = apple_contexts + rock_contexts + pear_contexts
        labels = [0]*len(apple_contexts) + [1]*len(rock_contexts) + [2]*len(pear_contexts)
        n = len(contexts)

        distance_matrix_rows = Parallel(n_jobs=12)(delayed(matrix_row_dist)(s1, contexts, n) for s1 in contexts)
        distance_matrix = np.array(distance_matrix_rows)
        filename = '../datasets/apple-rock-pear/cos_dist_w_{}_s_{}.npy'.format(window, symmetric)

        np.save(filename, distance_matrix)


2 True


In [50]:
window = 2
symmetric = False

apple_contexts = open('../datasets/apple_contexts_s_{}_w_{}.txt'.format(symmetric, window)).read().splitlines()
rock_contexts = open('../datasets/rock_contexts_s_{}_w_{}.txt'.format(symmetric, window)).read().splitlines()
pear_contexts = open('../datasets/pear_contexts_s_{}_w_{}.txt'.format(symmetric, window)).read().splitlines()
contexts = apple_contexts + rock_contexts + pear_contexts
labels = [0]*len(apple_contexts) + [1]*len(rock_contexts) + [2]*len(pear_contexts)
n = len(contexts)



In [43]:
dist = np.load('../datasets/apple-pear/cos_dist_w_2_s_False.npy')
sim = np.load('../datasets/apple-pear/cos_sim_w_2_s_False.npy')

In [27]:
np.allclose(sim, 1-dist)

False

In [35]:
np.where((sim - 1-dist) > 0.0000001)

(array([  29,   29,   29, ..., 3492, 3492, 3492]),
 array([ 151,  219, 1399, ...,  310, 1095, 2683]))

In [36]:
sim[29,152], (1-dist)[29,152]

(0.22283592820167542, 0.22283592820167542)

In [47]:
dist[dist>1]

array([ 1.03143096,  1.11698997,  1.37149608, ...,  1.06773734,
        1.08519888,  1.00470614])

In [40]:
dist[dist<0], dist[dist>1]

(array([ -5.96046448e-08,  -5.96046448e-08,  -5.96046448e-08, ...,
         -1.19209290e-07,  -1.19209290e-07,  -1.19209290e-07]),
 array([ 1.08329114,  1.02585201,  1.02585201, ...,  1.03437838,
         1.11062   ,  1.00657483]))

In [57]:
model[" ".join(apple_contexts).split()]
# " ".join(apple_contexts)

array([[-0.93079835, -3.13616061, -1.06315136, ...,  2.46277642,
         1.91486752, -1.6647681 ],
       [ 1.52429068, -0.30287835, -0.50285649, ...,  1.07115805,
        -1.51329362,  2.2414    ],
       [-1.29780161, -1.31244445,  1.6517936 , ...,  1.54376769,
         0.62512565,  2.93584681],
       ..., 
       [ 1.72746611,  0.45528936,  0.93981308, ...,  0.19209468,
         2.41504908,  0.81338692],
       [ 1.94154191, -1.95882583,  0.03309294, ..., -1.367046  ,
         0.60458416,  3.0778532 ],
       [-0.70207834,  1.34139383, -0.23653167, ...,  1.5368619 ,
        -0.07259645,  2.47928333]], dtype=float32)

In [42]:
np.all(np.where(dist>1)[1] == np.where(sim != 1-dist)[1]) #, np.all(np.where(dist>1)[0] == np.where(sim != 1-dist)[0])

  if __name__ == '__main__':


False

In [17]:
## TODO: kui postgres ei käi, siis on kiire? prg oli 1-4 it/s
distance_matrix_rows = Parallel(n_jobs=12)(delayed(matrix_row_dist)(s1, contexts, n) for s1 in contexts)
distance_matrix = np.array(distance_matrix_rows)

KeyboardInterrupt: 

In [18]:
similarity_matrix_rows = Parallel(n_jobs=12)(delayed(matrix_row_sim)(s1, contexts, n) for s1 in tqdm(contexts))
similarity_matrix = np.array(similarity_matrix_rows)



  0%|          | 0/12097 [00:00<?, ?it/s][A[A
[A

  0%|          | 1/12097 [00:02<8:45:18,  2.61s/it][A[A

  0%|          | 24/12097 [00:07<6:20:07,  1.89s/it][A[A

  0%|          | 26/12097 [00:07<4:29:56,  1.34s/it][A[A

  0%|          | 29/12097 [00:07<3:11:45,  1.05it/s][A[A

  0%|          | 32/12097 [00:08<2:17:45,  1.46it/s][A[A

  0%|          | 34/12097 [00:09<2:10:45,  1.54it/s][A[A

  0%|          | 36/12097 [00:12<3:11:28,  1.05it/s][A[A

  0%|          | 40/12097 [00:12<2:16:03,  1.48it/s][A[A

  0%|          | 43/12097 [00:12<1:37:49,  2.05it/s][A[A

  0%|          | 45/12097 [00:13<1:22:31,  2.43it/s][A[A

  0%|          | 47/12097 [00:15<2:14:28,  1.49it/s][A[A

  0%|          | 48/12097 [00:17<3:04:38,  1.09it/s][A[A

  0%|          | 51/12097 [00:17<2:11:48,  1.52it/s][A[A

  0%|          | 53/12097 [00:17<1:38:11,  2.04it/s][A[A

  0%|          | 55/12097 [00:17<1:14:03,  2.71it/s][A[A

  0%|          | 57/12097 [00:18<1:01:16,  3.2

KeyboardInterrupt: 