In [3]:
import numpy as np

def get_vectors(glove_path):
    with open(glove_path, 'r') as f:
        words = set()
        word_to_vec = {}
        for line in f:
            line = line.strip().split()
            current_word = line[0]
            words.add(current_word)
            word_to_vec[current_word] = np.array(line[1:], dtype=np.float64)

    return words,word_to_vec

words , word_map = get_vectors('/home/yash/DeepLearning/glove.6B.50d.txt')

## Loading the Word Vectors

Here we use a 6B word count corpus with each word having a vector of 50 dimensions.

In [6]:
print(word_map['the']) # Embeddings for the word
print(len(word_map['the'])) # Dimensions of each word embedding

[ 4.1800e-01  2.4968e-01 -4.1242e-01  1.2170e-01  3.4527e-01 -4.4457e-02
 -4.9688e-01 -1.7862e-01 -6.6023e-04 -6.5660e-01  2.7843e-01 -1.4767e-01
 -5.5677e-01  1.4658e-01 -9.5095e-03  1.1658e-02  1.0204e-01 -1.2792e-01
 -8.4430e-01 -1.2181e-01 -1.6801e-02 -3.3279e-01 -1.5520e-01 -2.3131e-01
 -1.9181e-01 -1.8823e+00 -7.6746e-01  9.9051e-02 -4.2125e-01 -1.9526e-01
  4.0071e+00 -1.8594e-01 -5.2287e-01 -3.1681e-01  5.9213e-04  7.4449e-03
  1.7778e-01 -1.5897e-01  1.2041e-02 -5.4223e-02 -2.9871e-01 -1.5749e-01
 -3.4758e-01 -4.5637e-02 -4.4251e-01  1.8785e-01  2.7849e-03 -1.8411e-01
 -1.1514e-01 -7.8581e-01]
50


In [11]:
def cosine_similarity(u,v):
    if np.all(u == v):
        return 1
    
    dot = np.dot(u,v)

    norm_u = np.sqrt(np.sum(u ** 2))
    norm_v = np.sqrt(np.sum(v ** 2))

    if np.isclose(norm_u * norm_v,0,atol=1e-32):
        return 0 
    
    return dot / (norm_u * norm_v)


print(cosine_similarity(word_map['father'],word_map['mother']))
print(cosine_similarity(word_map['dog'],word_map['cat']))
print(cosine_similarity(word_map['ball'],word_map['crocodile']))

0.8909038442893615
0.9218005273769252
0.27439246261379424


As you can see here that the analogy for some words is not correct. This is due to the lack of learn't embeddings and also the small dimension of the word embeddings. Which restricts the model to only learn the most common words.

In [20]:
def get_analogy(a: str,b: str,c: str,word_map: dict) -> str:
    a , b , c = a.lower(),b.lower(), c.lower()

    e_a, e_b , e_c = word_map[a],word_map[b],word_map[c]

    words = word_map.keys()
    max_cosine = -100
    best_word = None

    for w in words:
        if w == c:
            continue

        similarity = cosine_similarity(e_b-e_a,word_map[w]-e_c)

        if similarity > max_cosine:
            max_cosine = similarity
            best_word = w

    return best_word


print(get_analogy('small','smaller','large',word_map))
print(get_analogy('italy','italian','spain',word_map))

smaller
spanish


## Debiasing the word embeddings (Mostly gender bias)

In [21]:
# Some biases in the embeddings
g = word_map['woman'] - word_map['man']
print(g)

[-0.087144    0.2182     -0.40986    -0.03922    -0.1032      0.94165
 -0.06042     0.32988     0.46144    -0.35962     0.31102    -0.86824
  0.96006     0.01073     0.24337     0.08193    -1.02722    -0.21122
  0.695044   -0.00222     0.29106     0.5053     -0.099454    0.40445
  0.30181     0.1355     -0.0606     -0.07131    -0.19245    -0.06115
 -0.3204      0.07165    -0.13337    -0.25068714 -0.14293    -0.224957
 -0.149       0.048882    0.12191    -0.27362    -0.165476   -0.20426
  0.54376    -0.271425   -0.10245    -0.32108     0.2516     -0.33455
 -0.04371     0.01258   ]


In [22]:
# No bias
name_list = ['john', 'marie', 'sophie', 'ronaldo', 'priya', 'rahul', 'danielle', 'reza', 'katy', 'yasmin']

for w in name_list:
    print (w, cosine_similarity(word_map[w], g))

john -0.23163356145973724
marie 0.31559793539607295
sophie 0.31868789859418784
ronaldo -0.3124479685032943
priya 0.17632041839009407
rahul -0.16915471039231722
danielle 0.24393299216283892
reza -0.0793042967219955
katy 0.2831068659572615
yasmin 0.23313857767928753


In [23]:
# Gender bias
words = ['computer','scientist','engineer','doctor','lawyer']
for word in words:
    print(word, cosine_similarity(word_map[word],g))

computer -0.10330358873850494
scientist -0.0519303528131346
engineer -0.08039280494524072
doctor 0.1189528941093504
lawyer 0.019827378154494146


Here you can see that the occupations have a negative value which are also close to the male embeddings found above. 
This proves  that the embeddings have a bias. 
We need to fix this bias.