# setup

In [1]:
import os
import argparse
import numpy as np

In [2]:
# Path of the embedding text file, taken from the VECTOR_FILE environment variable.
# The value is None when the variable is not set.
VECTOR_FILE = os.environ.get("VECTOR_FILE")
print(f"VECTOR_FILE: {VECTOR_FILE}")

VECTOR_FILE: /veld/input/data/m1/vectors.txt


In [3]:
# Build the word -> vector lookup table from the embedding file.
# Each non-blank line is parsed as "<word> <float> <float> ...", split on single spaces.
# NOTE(review): a "<count> <dim>" header line, if the file has one, would be stored like
# any other entry — confirm the file format has none.
VECTORS = {}
# Decode explicitly as UTF-8 so that non-ASCII words (e.g. "großmutter") are read the
# same way on every platform instead of depending on the locale's default encoding.
with open(VECTOR_FILE, 'r', encoding='utf-8') as f:
    for line in f:
        line = line.rstrip()
        if not line:
            # A blank line would otherwise register the key '' with a zero-length
            # vector, which breaks the dot product against every real vector later on.
            continue
        vals = line.split(' ')
        VECTORS[vals[0]] = np.array([float(x) for x in vals[1:]])

# functions

In [4]:
def get_cosine_similarity_of_vectors(v1, v2):
    """Return the cosine similarity of two vectors.

    The result is ``np.dot(v1, v2)`` divided by the product of the two
    ``np.linalg.norm`` values (default norm settings).

    There is no guard for a zero-norm input: the denominator is then 0 and
    the division is carried out as-is.
    """
    dot_product = np.dot(v1, v2)
    norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
    return dot_product / norm_product

In [5]:
def get_cosine_similarity_of_words(w1, w2):
    """Return the cosine similarity between the vectors of two words.

    Both words are lower-cased before being looked up in ``VECTORS``;
    a word that is not a key of ``VECTORS`` raises ``KeyError``.
    """
    first_vec, second_vec = (VECTORS[word.lower()] for word in (w1, w2))
    return get_cosine_similarity_of_vectors(first_vec, second_vec)

In [6]:
def get_nearest_words_of_vector(v1, limit_results=None):
    """Rank every word in ``VECTORS`` by cosine similarity to ``v1``.

    Args:
        v1: query vector, compared against each stored vector.
        limit_results: if not None, the ranked list is sliced to
            ``[:limit_results]``; None returns the full ranking.

    Returns:
        A list of ``(word, similarity)`` tuples, highest similarity first.
    """
    ranked = [
        (word, get_cosine_similarity_of_vectors(v1, vec))
        for word, vec in VECTORS.items()
    ]
    # Negated key gives descending order; the sort is stable, so equal scores
    # keep the iteration order of VECTORS.
    ranked.sort(key=lambda pair: -pair[1])
    if limit_results is None:
        return ranked
    return ranked[:limit_results]

In [7]:
def get_nearest_words_of_word(w1, limit_results=None):
    """Rank the words in ``VECTORS`` by cosine similarity to the word ``w1``.

    ``w1`` is lower-cased before lookup (``KeyError`` if it is not a key of
    ``VECTORS``); the ranking itself, including the ``limit_results`` cut-off,
    is delegated to ``get_nearest_words_of_vector``.
    """
    return get_nearest_words_of_vector(VECTORS[w1.lower()], limit_results)

# testing

In [8]:
# Word pair used by the checks in the cells below.
w1, w2 = "frau", "mann"

In [9]:
# Show the shape and raw values of each test word's vector.
# Direct lookup here: the words are used as given, without lower-casing.
v1 = VECTORS[w1]
print(v1.shape, v1, sep="\n")
v2 = VECTORS[w2]
print(v2.shape, v2, sep="\n")

(50,)
[-0.02465  -0.682123  0.339394  0.852501 -0.38473  -0.400563  2.528018
  0.547663 -0.364443  0.302127  0.289975  1.39431  -0.742984 -0.523161
 -0.166067  0.196042 -0.81867   1.820342  0.301298 -0.045174  1.099013
 -1.042619 -1.981761  0.179432  0.832562  0.182172  0.185834  1.716998
 -0.029499  0.131582 -1.881815 -0.618635 -0.102642 -0.314189 -0.415502
  0.854686  0.703377 -1.313839  0.021766 -1.247788  0.2234   -0.484226
  0.308503 -2.286786 -0.44694  -0.556265  0.29222   0.270662  0.009794
 -0.197155]
(50,)
[-0.698241  0.213898  0.394859  0.333442 -0.00285   0.104044  2.417434
  0.751385 -0.112971  0.143915 -0.469072  0.291726 -0.360177 -1.662466
 -0.279943  0.788703 -0.422262  1.738358 -0.145795  0.268173  0.932194
 -0.498397 -2.315247  0.070193  0.600768  0.011443 -0.357019  0.700192
 -0.134333  0.361401 -0.904582 -0.68162   0.982092 -0.225737 -0.606999
  0.494298  0.543297 -1.532362 -0.393751 -0.822406 -0.118827 -0.280617
 -0.25247  -1.791621  0.256596 -0.611498  0.388082  0

In [10]:
# Cosine similarity between the two test words.
print(get_cosine_similarity_of_words(w1=w1, w2=w2))

0.8298255797830086


In [11]:
# 20 highest-ranked (word, similarity) pairs for w1; the ranking scans all of VECTORS,
# so the query word itself is part of the result.
get_nearest_words_of_word(w1, limit_results=20)

[('frau', 0.9999999999999998),
 ('mutter', 0.9614790913240593),
 ('schwester', 0.9276123982363114),
 ('kind', 0.8916281056124479),
 ('ehefrau', 0.8833048119519473),
 ('tochter', 0.8781051351396821),
 ('vater', 0.8731896960239562),
 ('tante', 0.8727194721726078),
 ('großmutter', 0.8671961833438162),
 ('ehemann', 0.8654313443757703),
 ('eltern', 0.8599873403880165),
 ('freundin', 0.8549514881071417),
 ('ihr', 0.8529273540051311),
 ('mannes', 0.8499030612682443),
 ('geliebte', 0.8432935628288324),
 ('kinder', 0.8428390040786509),
 ('verheiratet', 0.8388101449279932),
 ('junge', 0.8381482521479995),
 ('tod', 0.8339307610198841),
 ('ihrem', 0.8322310678388922)]

In [12]:
# 20 highest-ranked (word, similarity) pairs for w2; the ranking scans all of VECTORS,
# so the query word itself is part of the result.
get_nearest_words_of_word(w2, limit_results=20)

[('mann', 1.0),
 ('frau', 0.8298255797830086),
 ('soldat', 0.823139433436152),
 ('ihr', 0.8229130278788662),
 ('leben', 0.8189180771228854),
 ('sein', 0.811323180844243),
 ('kind', 0.7984683648906311),
 ('eigentlich', 0.798228172745085),
 ('vater', 0.7977366591056769),
 ('junge', 0.7975937693043107),
 ('ihm', 0.7864064825134575),
 ('seine', 0.7805807865573718),
 ('ein', 0.7805703001912596),
 ('person', 0.7802008228184328),
 ('ihn', 0.7787931154834025),
 ('gefallen', 0.7771855154466557),
 ('amerikaner', 0.7766901603034483),
 ('hand', 0.7742602271320155),
 ('ihrem', 0.7738789614815925),
 ('held', 0.7735517926568717)]