In [1]:
# importing dependencies
import numpy as np


In [2]:
# for loading the glove word embedding matrix values
def load_glove_vectors(glove_file):
    """Read a GloVe-style text embedding file into memory.

    Every non-empty line is expected to contain a token followed by its
    whitespace-separated feature values. Blank or whitespace-only lines
    are skipped.

    Args:
        glove_file: Path of the embedding text file; it is opened as UTF-8.

    Returns:
        A tuple ``(words, word_to_vec)``: ``words`` is the set of tokens
        found in the file and ``word_to_vec`` maps each token to a
        ``np.float64`` array built from the remaining fields on its line.
    """
    # unique words
    words = set()
    word_to_vec = {}

    with open(glove_file, 'r', encoding="utf-8") as file:
        # each line starts with a word then the values for the different features
        for line in file:
            fields = line.split()
            # An empty or whitespace-only line (e.g. a trailing newline at the
            # end of the file) has no token; skip it instead of failing on
            # fields[0] with an IndexError.
            if not fields:
                continue
            curr_word = fields[0]
            words.add(curr_word)
            # rest of the fields are the feature values for the word
            word_to_vec[curr_word] = np.array(fields[1:], dtype=np.float64)

    return words, word_to_vec

In [4]:
# Parse the pre-trained embedding file into the vocabulary set and the
# word -> vector lookup used by the cells below.
glove_path = 'data/glove.6B.300d.txt'
words, word_to_vec = load_glove_vectors(glove_path)

In [5]:
def find_cosine_similarity(u, v):
    """Return the cosine similarity of the vectors ``u`` and ``v``.

    The value is the dot product of ``u`` and ``v`` divided first by the
    L2 norm of ``u`` and then by the L2 norm of ``v``.
    """
    # Inner product of the two inputs
    inner = np.dot(u, v)

    # Euclidean (L2) length of each input
    length_u = np.sqrt(np.sum(u ** 2))
    length_v = np.sqrt(np.sum(v ** 2))

    # Scale the inner product by both lengths, one division at a time
    return inner / length_u / length_v

In [6]:
# Look up the embedding of every word used in the comparisons below
father, mother, king, queen, bat, crow, india, italy, delhi, rome = (
    word_to_vec[token]
    for token in (
        "father", "mother", "king", "queen", "bat",
        "crow", "india", "italy", "delhi", "rome",
    )
)

# (label, first vector, second vector) for each comparison, in print order
comparisons = [
    ("cosine_similarity(king, queen) = ", king, queen),
    ("cosine_similarity(father, mother) = ", father, mother),
    ("cosine_similarity(king - queen, father - mother) = ", king - queen, father - mother),
    ("cosine_similarity(bat, crow) = ", bat, crow),
    ("cosine_similarity(india - delhi, rome - italy) = ", india - delhi, rome - italy),
]

for label, left, right in comparisons:
    print(label, find_cosine_similarity(left, right))

cosine_similarity(king, queen) =  0.633646870148
cosine_similarity(father, mother) =  0.756821737365
cosine_similarity(king - queen, father - mother) =  0.471706450424
cosine_similarity(bat, crow) =  0.151650597086
cosine_similarity(india - delhi, rome - italy) =  -0.519746805621


In [9]:
# Show the raw embedding values stored for a single word
sample_vector = word_to_vec["sex"]
print(sample_vector)

[ -1.96120000e-01   8.96550000e-02  -2.45030000e-01   1.09190000e-01
  -3.44290000e-01   4.12290000e-01  -1.05900000e-01  -4.55270000e-01
   4.16800000e-01  -1.49590000e+00   1.73130000e-01  -2.60010000e-02
  -3.35560000e-01   2.19070000e-01   2.82250000e-01  -7.09060000e-01
  -3.87100000e-02  -1.78590000e-01  -1.27960000e-01  -1.77640000e-01
  -5.96070000e-01   8.03380000e-01  -7.87810000e-02   3.40040000e-01
  -6.36520000e-01   1.48170000e-01   2.31730000e-02  -8.32600000e-01
   3.63180000e-01  -2.68650000e-01  -5.30430000e-01   2.27170000e-01
  -4.21860000e-01  -6.85690000e-01  -6.27680000e-01   4.20060000e-01
  -2.78880000e-01  -3.94230000e-01   2.18470000e-02  -3.48120000e-01
   7.09260000e-01  -1.54630000e-01  -5.15630000e-01  -4.33750000e-01
   2.72090000e-01  -1.42950000e-01  -2.76280000e-02  -4.05370000e-01
   1.58260000e-01   2.14940000e-02   1.72340000e-01  -1.31980000e-02
   1.26080000e-01   3.24380000e-01   1.03060000e-02  -3.02810000e-01
   9.43380000e-02   3.05010000e-01