In [3]:
%matplotlib inline
import numpy as np
from joblib import Parallel, delayed
from gensim.models.keyedvectors import KeyedVectors
# from numba import jit, autojit
from sklearn.manifold import MDS, TSNE
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity, cosine_distances, linear_kernel, euclidean_distances
from sklearn.feature_extraction.text import TfidfVectorizer
from glob import glob
import matplotlib.pyplot as plt
import seaborn as sns
import os
import scipy.sparse as sps

In [7]:
# model = KeyedVectors.load_word2vec_format('word2vec-models/lemmas.cbow.s100.w2v.bin', binary=True)

In [4]:
model_sg = KeyedVectors.load_word2vec_format('word2vec-models/lemmas.sg.s100.w2v.bin', binary=True)

In [6]:
model_sg

<gensim.models.keyedvectors.KeyedVectors at 0x7ff60bf21898>

In [8]:
def n_similarity(s1, s2):
    vec1 = np.mean(model[s1.split()], axis=0)
    vec2 = np.mean(model[s2.split()], axis=0)
    return cosine_similarity([vec1], [vec2])[0][0]

def n_distance(s1, s2):
    vec1 = np.mean(model[s1.split()], axis=0)
    vec2 = np.mean(model[s2.split()], axis=0)
    return cosine_distances([vec1], [vec2])[0][0]

def matrix_row_sim(s1, contexts, row_length):
    row = np.empty(row_length)
    for j, s2 in enumerate(contexts):
#         row[j] = model.n_similarity(s1.split(), s2.split())
        row[j] = n_similarity(s1, s2)
        
    return row

def matrix_row_dist(s1, contexts, row_length):
    row = np.empty(row_length)
    for j, s2 in enumerate(contexts):
        row[j] = n_distance(s1, s2)
    return row

# tf-idf

In [13]:
words = [('joogitee', 'sõidutee'),
        ('õun', 'banaan'),
        ('õun', 'puder'),
        ('õun', 'kivi'),
        ('ämber', 'pang'),
        ('hea', 'halb'),
        ('countries', 'cities'),
        ('Eesti', 'TallinnTartu')]

# words = [('hea', 'halb'),
#             ('countries', 'cities'),
#             ('Eesti', 'TallinnTartu')]

In [15]:
words

[('joogitee', 'sõidutee'),
 ('õun', 'banaan'),
 ('õun', 'puder'),
 ('õun', 'kivi'),
 ('ämber', 'pang'),
 ('hea', 'halb'),
 ('countries', 'cities'),
 ('Eesti', 'TallinnTartu')]

In [215]:
for word1, word2 in words:
    print(word1, word2)
    for window in [2,3,4]:
        for symmetric in [True, False]:
            print(window, symmetric)
            with open('../datasets/contexts/{}_s_{}_w_{}.txt'.format(word1, symmetric, window)) as f:
                contexts1 = f.read().splitlines()
            with open('../datasets/contexts/{}_s_{}_w_{}.txt'.format(word2, symmetric, window)) as f:
                contexts2 = f.read().splitlines()
            contexts = contexts1 + contexts2
#             labels = [0]*len(contexts1) + [1]*len(contexts2)
            
            print(len(contexts))
            tfidf_vectorizer = TfidfVectorizer()
            tfidf = tfidf_vectorizer.fit_transform(contexts)
            print('saving')
            print()
            filename = '../datasets/tfidf-features/{}_{}_w_{}_s_{}.npy'.format(word1, word2, window, symmetric)
            print(filename)
#             break
            
#         break
#     break
            np.save(filename, tfidf)


hea halb
2 True
10000
saving

../datasets/tfidf-features/hea_halb_w_2_s_True.npy
2 False
10000
saving

../datasets/tfidf-features/hea_halb_w_2_s_False.npy
3 True
10000
saving

../datasets/tfidf-features/hea_halb_w_3_s_True.npy
3 False
10000
saving

../datasets/tfidf-features/hea_halb_w_3_s_False.npy
4 True
8068
saving

../datasets/tfidf-features/hea_halb_w_4_s_True.npy
4 False
10000
saving

../datasets/tfidf-features/hea_halb_w_4_s_False.npy
countries cities
2 True
7486
saving

../datasets/tfidf-features/countries_cities_w_2_s_True.npy
2 False
10000
saving

../datasets/tfidf-features/countries_cities_w_2_s_False.npy
3 True
5980
saving

../datasets/tfidf-features/countries_cities_w_3_s_True.npy
3 False
10000
saving

../datasets/tfidf-features/countries_cities_w_3_s_False.npy
4 True
4614
saving

../datasets/tfidf-features/countries_cities_w_4_s_True.npy
4 False
10000
saving

../datasets/tfidf-features/countries_cities_w_4_s_False.npy
Eesti TallinnTartu
2 True
6928
saving

../datasets/tfi

# mean-vec

In [16]:
for word1, word2 in words:
    print(word1, word2)
    for window in [2,3,4]:
        for symmetric in [True, False]:
            print(window, symmetric)
            with open('datasets/contexts/{}_s_{}_w_{}.txt'.format(word1, symmetric, window)) as f:
                contexts1 = f.read().splitlines()
            with open('datasets/contexts/{}_s_{}_w_{}.txt'.format(word2, symmetric, window)) as f:
                contexts2 = f.read().splitlines()
            contexts_len = min(len(contexts1), len(contexts2))
            contexts = contexts1[:contexts_len] + contexts2[:contexts_len]
#             labels = [0]*len(contexts1) + [1]*len(contexts2)
            print(len(contexts1), len(contexts2), contexts_len, len(contexts))
            n = len(contexts)
            mean_vectors = np.zeros((n, 100))
            for i in range(n):
                mean_vectors[i] = np.mean(model_sg[contexts[i].split()], axis=0)
            print('saving')
            print()
            filename = 'datasets/sg/mean-vec/vectors/{}_{}_w_{}_s_{}.npy'.format(word1, word2, window, symmetric)
            print(filename)
#             break
#         break
#     break
            np.save(filename, mean_vectors)


joogitee sõidutee
2 True
627 627 627 1254
saving

datasets/sg/mean-vec/vectors/joogitee_sõidutee_w_2_s_True.npy
2 False
977 977 977 1954
saving

datasets/sg/mean-vec/vectors/joogitee_sõidutee_w_2_s_False.npy
3 True
480 480 480 960
saving

datasets/sg/mean-vec/vectors/joogitee_sõidutee_w_3_s_True.npy
3 False
939 939 939 1878
saving

datasets/sg/mean-vec/vectors/joogitee_sõidutee_w_3_s_False.npy
4 True
354 354 354 708
saving

datasets/sg/mean-vec/vectors/joogitee_sõidutee_w_4_s_True.npy
4 False
919 919 919 1838
saving

datasets/sg/mean-vec/vectors/joogitee_sõidutee_w_4_s_False.npy
õun banaan
2 True
1747 797 797 1594
saving

datasets/sg/mean-vec/vectors/õun_banaan_w_2_s_True.npy
2 False
2784 1174 1174 2348
saving

datasets/sg/mean-vec/vectors/õun_banaan_w_2_s_False.npy
3 True
1281 611 611 1222
saving

datasets/sg/mean-vec/vectors/õun_banaan_w_3_s_True.npy
3 False
2706 1136 1136 2272
saving

datasets/sg/mean-vec/vectors/õun_banaan_w_3_s_False.npy
4 True
942 494 494 988
saving

datasets/sg/

# {angular, euclidean}_distance

In [248]:
filename

'../datasets/mean-vec/Eesti_TallinnTartu_w_4_s_False.npy'

In [120]:
t = np.load(filename).item()

In [128]:
t.shape

(900, 2281)

In [137]:
cosine_similarity(features[i], features)

array([[ 1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.04124013,  0.07124831,  0.12497102,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.04488199,  0.03823299,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.04913427,  0.        ,  0.        ,  0.        ,
         0.        ,  0.10260547,  0.        ,  0.        ,  0.        ,
         0.        ,  0.05200036,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.08418861,
         0.        ,  0.04037107,  0.06486868,  0.        ,  0.        ,
         0.        ,  0.03878343,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.18640086,  0.16808128,  0.        ,
         0.038121  ,  0.03977867,  0.        ,  0.14769402,  0.        ,
         0.        ,  0.08296978,  0.        ,  0. 

In [151]:
a = glob(os.path.join('../datasets', feature, '*'))[0]

In [207]:
filename

'../datasets/mean-vec/hea_halb_w_4_s_False.npy'

In [None]:
features = np.load(filename)
features = features.item()

In [231]:
feature = 'mean-vec'
feature = 'tfidf-features'
a = glob(os.path.join('../datasets', feature, '*'))[0]
features_to_pairwise(files_5k[0], feature)

../datasets/tfidf-features/hea_halb_w_4_s_False.npy tfidf-features
(10000, 11439)
<class 'scipy.sparse.csr.csr_matrix'>
../datasets/euclidean-distance/tfidf-features/hea_halb_w_4_s_False.npy


In [229]:
files_5k[0]

'../datasets/tfidf-features/hea_halb_w_4_s_False.npy'

In [24]:
# doing both angular and euclidean here. change contentds to modify.

def features_to_pairwise(filename):
    print(filename)
    features = np.load(filename)
#     if feature == 'tfidf-features':
#         features = features.
#         print(features.item().shape)
#         features = features.item()
#         print(type(features))
    n = features.shape[0]
    matrix = np.zeros((n,n))
    for i in range(n):
        row = [features[i]]
#         if feature == 'tfidf-features':
#             row = row[0]
        matrix[i,:] = cosine_similarity(row, features).flatten()
#         matrix[i,:] = euclidean_distances(row, features).flatten()

    matrix[matrix>1] = 1
    matrix = np.arccos(matrix)/np.pi
    basename = os.path.basename(filename)
    new_path = os.path.join('datasets/sg/mean-vec/angular-distance/', basename)
#     new_path = os.path.join('datasets/sg/mean-vec/euclidean-distance/', basename)
    
    print(new_path)
    np.save(new_path, matrix)

In [34]:
# LSI COS DISTANCE

def features_to_pairwise_lsi(filename):
    print(filename)
    features = np.load(filename)
#     if feature == 'tfidf-features':
#         features = features.
#         print(features.item().shape)
#         features = features.item()
#         print(type(features))
    n = features.shape[0]
    matrix = np.zeros((n,n))
    for i in range(n):
        row = [features[i]]
#         if feature == 'tfidf-features':
#             row = row[0]
        matrix[i,:] = cosine_distances(row, features).flatten()
#         matrix[i,:] = euclidean_distances(row, features).flatten()

#     matrix[matrix>1] = 1
#     matrix = np.arccos(matrix)/np.pi
    basename = os.path.basename(filename)
    new_path = os.path.join('datasets/tfidf/lsi-cos-dist/', basename)
#     new_path = os.path.join('datasets/sg/mean-vec/euclidean-distance/', basename)
    
    print(new_path)
    np.save(new_path, matrix)

features_to_pairwise(fname)

datasets/tfidf/lsi/tfidf_features_õun_banaan_w_2_s_False_n_10.npy
datasets/tfidf/lsi-cos-dist/tfidf_features_õun_banaan_w_2_s_False_n_10.npy


In [29]:
fname = glob('datasets/tfidf/lsi/*')[0]

In [33]:
all_files = glob(os.path.join('datasets/tfidf/lsi/*'))
len(all_files)/48

3.0

In [35]:
# all_files = glob(os.path.join('../datasets', feature, '*'))
# files_5k = [filename for filename in all_files if ('hea' in filename or 'countries' in filename or 'Eesti' in filename)]

Parallel(n_jobs=25)(delayed(features_to_pairwise_lsi)(filename) for filename in all_files)


datasets/tfidf/lsi/tfidf_features_õun_banaan_w_2_s_False_n_10.npy
datasets/tfidf/lsi/tfidf_features_õun_banaan_w_2_s_True_n_40.npy
datasets/tfidf/lsi/tfidf_features_õun_banaan_w_3_s_True_n_2.npy
datasets/tfidf/lsi/tfidf_features_countries_cities_w_2_s_False_n_2.npy
datasets/tfidf/lsi/tfidf_features_õun_puder_w_2_s_False_n_40.npy
datasets/tfidf/lsi/tfidf_features_õun_banaan_w_3_s_False_n_2.npy
datasets/tfidf/lsi/tfidf_features_ämber_pang_w_3_s_True_n_2.npy
datasets/tfidf/lsi/tfidf_features_õun_banaan_w_2_s_False_n_2.npy
datasets/tfidf/lsi/tfidf_features_Eesti_TallinnTartu_w_4_s_False_n_40.npy
datasets/tfidf/lsi/tfidf_features_õun_kivi_w_4_s_True_n_40.npy
datasets/tfidf/lsi/tfidf_features_õun_puder_w_3_s_False_n_40.npy
datasets/tfidf/lsi/tfidf_features_õun_puder_w_3_s_True_n_10.npy
datasets/tfidf/lsi/tfidf_features_hea_halb_w_3_s_False_n_2.npy
datasets/tfidf/lsi/tfidf_features_countries_cities_w_4_s_False_n_40.npy
datasets/tfidf/lsi/tfidf_features_countries_cities_w_3_s_False_n_10.npy
da

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

In [98]:
basename = os.path.basename(glob('../datasets/mean-vec/*')[0])

In [100]:
new_path = os.path.join('../datasets/angular-distance/mean-vec/', basename)

'../datasets/angular-distance/mean-vec/õun_kivi_w_4_s_True.npy'

In [61]:
np.arccos(1.1)

  if __name__ == '__main__':


nan

In [69]:
matrix[matrix>1] = 1


In [71]:
matrix[matrix>1] = 1
angular_sim = 1-np.arccos(matrix)/np.pi

# WMD

# Old code

In [None]:
for window in [2,3,4]:
    for symmetric in [True, False]:
        print(metric.__name__, window, symmetric)
        apple_contexts = open('../datasets/apple_contexts_s_{}_w_{}.txt'.format(symmetric, window)).read().splitlines()
        rock_contexts = open('../datasets/rock_contexts_s_{}_w_{}.txt'.format(symmetric, window)).read().splitlines()
        pear_contexts = open('../datasets/pear_contexts_s_{}_w_{}.txt'.format(symmetric, window)).read().splitlines()
        contexts = apple_contexts + rock_contexts + pear_contexts
        labels = [0]*len(apple_contexts) + [1]*len(rock_contexts) + [2]*len(pear_contexts)
        n = len(contexts)
        matrix = np.empty((n,n))
        matrix[:] = np.NAN

        print('constructing matrix')
        tfidf_vectorizer = TfidfVectorizer()
        tfidf = tfidf_vectorizer.fit_transform(contexts)
        for i in tqdm(range(n)):
            matrix[i,:] = metric(tfidf[i], tfidf).flatten()
        print('saving')
        filename = '../datasets/apple-rock-pear/tfidf_{}_w_{}_s_{}.npy'.format(metric.__name__, window, symmetric)

        np.save(filename, matrix)

In [None]:
for metric in [cosine_similarity, cosine_distances]:
    for window in [2,3,4]:
        for symmetric in [True, False]:
            print(metric.__name__, window, symmetric)
            apple_contexts = open('../datasets/apple_contexts_s_{}_w_{}.txt'.format(symmetric, window)).read().splitlines()
            rock_contexts = open('../datasets/rock_contexts_s_{}_w_{}.txt'.format(symmetric, window)).read().splitlines()
            pear_contexts = open('../datasets/pear_contexts_s_{}_w_{}.txt'.format(symmetric, window)).read().splitlines()
            contexts = apple_contexts + rock_contexts + pear_contexts
            labels = [0]*len(apple_contexts) + [1]*len(rock_contexts) + [2]*len(pear_contexts)
            n = len(contexts)
            matrix = np.zeros((n,n))

            print('constructing matrix')
            tfidf_vectorizer = TfidfVectorizer()
            tfidf = tfidf_vectorizer.fit_transform(contexts)
            for i in tqdm(range(n)):
                matrix[i,:] = metric(tfidf[i], tfidf).flatten()
            print('saving')
            filename = '../datasets/apple-rock-pear/tfidf_{}_w_{}_s_{}.npy'.format(metric.__name__, window, symmetric)

            np.save(filename, matrix)

cosine_similarity 2 True


  0%|          | 0/10726 [00:00<?, ?it/s]

constructing matrix


100%|██████████| 10726/10726 [00:32<00:00, 330.76it/s]


saving
cosine_similarity 2 False
constructing matrix


100%|██████████| 12097/12097 [00:38<00:00, 317.42it/s]


saving
cosine_similarity 3 True
constructing matrix


100%|██████████| 9620/9620 [00:32<00:00, 294.53it/s]


saving
cosine_similarity 3 False
constructing matrix


100%|██████████| 11801/11801 [00:44<00:00, 265.30it/s]


saving
cosine_similarity 4 True
constructing matrix


100%|██████████| 8470/8470 [00:31<00:00, 270.79it/s]


saving
cosine_similarity 4 False
constructing matrix


100%|██████████| 11534/11534 [00:50<00:00, 229.57it/s]


saving
cosine_distances 2 True


  0%|          | 0/10726 [00:00<?, ?it/s]

constructing matrix


 40%|███▉      | 4257/10726 [00:13<00:19, 324.96it/s]