In [1]:
import io, sys
import numpy as np
from heapq import * # data structure to store data

In [2]:
def load_vectors(filename):
    """Load word vectors from a text file into a dict.

    The file is expected to start with a header line holding two integers
    (read here as ``n`` and ``d``), followed by one line per word: the word,
    then its vector components, all separated by single spaces.

    Args:
        filename: path of the UTF-8 encoded vector file.

    Returns:
        dict mapping each word (str) to a 1-D NumPy float array.

    Raises:
        ValueError: if the header is not exactly two integers or a vector
            component cannot be parsed as a float.
    """
    data = {}
    # The context manager guarantees the handle is closed, even if parsing fails.
    with io.open(filename, 'r', encoding='utf-8', newline='\n') as fin:
        # The header values are not needed afterwards; parsing them still
        # rejects files that do not start with the expected "<n> <d>" line.
        _n, _d = map(int, fin.readline().split())
        for line in fin:
            stripped = line.rstrip()
            if not stripped:
                # A blank (e.g. trailing) line would otherwise create a ''
                # entry with an empty vector.
                continue
            tokens = stripped.split(' ')
            data[tokens[0]] = np.asarray(list(map(float, tokens[1:])))
    return data

In [3]:
# Loading word vectors
# Reads 'wiki.en.vec' (path relative to the working directory) with
# load_vectors and stores the result in `word_vectors`, which the cells
# below use for every lookup.

print('')
print(' ** Word vectors ** ')
print('')

word_vectors = load_vectors('wiki.en.vec')
# Uncomment to inspect a single vector:
#word_vectors['a']


 ** Word vectors ** 



In [4]:
## This function computes the cosine similarity between vectors u and v

def cosine(u, v):
    """Return the cosine similarity between vectors ``u`` and ``v``.

    This is the dot product of the two vectors divided by the product of
    their Euclidean norms. No special handling is done for a zero-norm
    vector, so the division is then by zero.
    """
    norm_product = np.linalg.norm(u) * np.linalg.norm(v)
    return np.dot(u, v) / norm_product

## The nearest_neighbor function (defined in a later cell) returns the word
## whose vector is the nearest neighbor of x.
## Its exclude_words list can be used to exclude some
## words from the nearest-neighbor search.

In [5]:
# Sanity check: the cosine similarity of a vector with itself.
cosine(word_vectors['the'],word_vectors['the'])

1.0

In [6]:
# compute similarity between words
# Each line prints the cosine similarity between 'apple' and another word,
# formatted to three decimals.

print('similarity(apple, apples) = %.3f' %
      cosine(word_vectors['apple'], word_vectors['apples']))
print('similarity(apple, banana) = %.3f' %
      cosine(word_vectors['apple'], word_vectors['banana']))
print('similarity(apple, tiger) = %.3f' %
      cosine(word_vectors['apple'], word_vectors['tiger']))

similarity(apple, apples) = 0.637
similarity(apple, banana) = 0.431
similarity(apple, tiger) = 0.212


In [7]:
# Cosine similarity between 'woman' and 'queen', formatted to three decimals.
print('similarity(woman, queen) = %.3f' %
      cosine(word_vectors['woman'], word_vectors['queen']))

similarity(woman, queen) = 0.361


In [27]:
## Functions for nearest neighbors
# exclude_words lists the words to skip during the search (e.g. the query word itself)
def nearest_neighbor(x, word_vectors, exclude_words=[]):
    """Return the word whose vector has the highest cosine similarity to ``x``.

    Args:
        x: query vector.
        word_vectors: dict mapping each word to its vector.
        exclude_words: words that must not be returned.

    Returns:
        The best-scoring word, or '' if no candidate scores above -1.0.
        Words in ``exclude_words`` and words whose vector is element-wise
        equal to ``x`` are never considered.
    """
    top_word, top_similarity = '', -1.0
    for candidate, vector in word_vectors.items():
        if candidate in exclude_words:
            continue
        # A vector identical to the query is the query itself: skip it.
        if (x == vector).all():
            continue
        similarity = cosine(x, vector)
        if similarity > top_similarity:
            top_word, top_similarity = candidate, similarity
    return top_word

## This function returns the words corresponding to the
## K nearest neighbors of vector x.
## You can use the functions heappush and heappop.

def knn(x, vectors, k):
    """Return the k nearest neighbors of vector ``x`` by cosine similarity.

    Args:
        x: query vector.
        vectors: dict mapping each word to its vector.
        k: number of neighbors to return.

    Returns:
        A list of at most ``k`` ``(similarity, word)`` tuples, most similar
        first. Entries whose vector is identical to ``x`` (normally the query
        word itself) are left out, as in ``nearest_neighbor``.
    """
    if k <= 0:
        return []
    # Skip vectors equal to x rather than blindly dropping the top hit:
    # dropping the best match unconditionally loses the true nearest neighbor
    # whenever x is not itself one of the stored vectors.
    scored = (
        (cosine(x, vector), word)
        for word, vector in vectors.items()
        if not np.array_equal(x, vector)
    )
    # nlargest (in scope via the file's `from heapq import *`) returns the k
    # largest tuples in descending order.
    return nlargest(k, scored)

In [28]:
# Nearest neighbor of 'cat', with 'cat' and 'cats' themselves excluded.
print('The nearest neighbor of cat is: ' +
      nearest_neighbor(word_vectors['cat'], word_vectors, exclude_words = ['cat', 'cats']))

The nearest neighbor of cat is: dog


In [29]:
# looking at nearest neighbors of a word

print('The nearest neighbor of cat is: ' +
      nearest_neighbor(word_vectors['cat'], word_vectors, exclude_words = ['cat', 'cats']))

# Compute the neighbors once and reuse them below: every knn call scans the
# whole vocabulary.
knn_cat = knn(word_vectors['cat'], word_vectors, 5)
print('')
print('cat')
print('--------------')
for score, word in knn_cat:
    print(word + '\t%.3f' % score)

The nearest neighbor of cat is: dog

cat
--------------
cats	0.732
dog	0.638
pet	0.573
rabbit	0.549
dogs	0.538


In [30]:
# Neighbors of 'woman': computed once, stored under a name that matches the
# query word, and reused for printing.
knn_woman = knn(word_vectors['woman'], word_vectors, 6)
print('')
print('woman')
print('--------------')
for score, word in knn_woman:
    print(word + '\t%.3f' % score)


woman
--------------
girl	0.703
man	0.651
women	0.617
maid	0.603
herself	0.588
lover	0.572


In [49]:
## This function returns the word d such that a:b and c:d
## verify the same relation

def analogy(a, b, c, word_vectors):
    """Return the word d such that a:b and c:d satisfy the same relation.

    The three input words are lower-cased and their vectors normalised to
    unit length. Every other word is scored by the dot product of its
    unit-length vector with ``b - a + c``; the highest-scoring word wins.

    Args:
        a, b, c: the words of the analogy (case-insensitive).
        word_vectors: dict mapping each word to its vector.

    Returns:
        The best-scoring word other than a, b and c, or '' if there is none.
    """
    query_words = [word.lower() for word in (a, b, c)]
    unit_a, unit_b, unit_c = (
        word_vectors[word] / np.linalg.norm(word_vectors[word])
        for word in query_words
    )
    # The target direction does not depend on the candidate, so build it once.
    target = unit_b - unit_a + unit_c

    winner, winner_score = '', float('-inf')
    for candidate, vector in word_vectors.items():
        if candidate in query_words:
            continue
        score = target.dot(vector / np.linalg.norm(vector))
        if score > winner_score:
            winner, winner_score = candidate, score
    return winner

In [50]:
# Word analogies
# analogy(a, b, c, ...) looks for d such that a:b and c:d share the same
# relation; here paris:france and rome:?

print('')
print('france - paris + rome = ' + analogy('paris', 'france', 'rome', word_vectors))


france - paris + rome = italy


In [51]:
# Analogy query: man:king and woman:?
print('')
print('king - man + woman = ' + analogy('man', 'king', 'woman', word_vectors))


king - man + woman = queen


In [17]:
## A word about biases in word vectors:
## compare the cosine similarity of 'genius' with 'man' and with 'woman'.

print('')
print('similarity(genius, man) = %.3f' %
      cosine(word_vectors['man'], word_vectors['genius']))
print('similarity(genius, woman) = %.3f' %
      cosine(word_vectors['woman'], word_vectors['genius']))


similarity(genius, man) = 0.445
similarity(genius, woman) = 0.325


In [18]:
## Compute the association strength between:
##   - a word w
##   - two sets of attributes A and B

def association_strength(w, A, B, vectors):
    """Return how much more strongly word ``w`` associates with A than with B.

    The result is the mean cosine similarity between ``w`` and the words of
    ``A`` minus the mean cosine similarity between ``w`` and the words of ``B``.

    Args:
        w: the word being measured.
        A, B: the two attribute word lists (both must be non-empty).
        vectors: dict mapping each word to its vector.
    """
    def summed_similarity(attributes):
        # Sum of cosine similarities between w and each attribute word.
        total = 0.0
        for attribute in attributes:
            total += cosine(vectors[w], vectors[attribute])
        return total

    sum_a = summed_similarity(A)
    sum_b = summed_similarity(B)
    return (1 / len(A)) * sum_a - (1 / len(B) * sum_b)

## Perform the word embedding association test between:
##   - two sets of words X and Y
##   - two sets of attributes A and B

def weat(X, Y, A, B, vectors):
    """Return the word embedding association test score.

    The score is the summed association strength (with respect to the
    attribute sets A and B) of the words in ``X`` minus that of the words
    in ``Y``.

    Args:
        X, Y: the two target word lists.
        A, B: the two attribute word lists.
        vectors: dict mapping each word to its vector.
    """
    def total_association(targets):
        # Sum of association_strength over one target set.
        total = 0.0
        for target in targets:
            total += association_strength(target, A, B, vectors)
        return total

    return total_association(X) - total_association(Y)

In [19]:
## Replicate one of the experiments from:
##
## Semantics derived automatically from language corpora contain human-like biases
## Caliskan, Bryson, Narayanan (2017)

# Target word sets (passed to weat as X and Y).
career = ['executive', 'management', 'professional', 'corporation', 
          'salary', 'office', 'business', 'career']
family = ['home', 'parents', 'children', 'family',
          'cousins', 'marriage', 'wedding', 'relatives']
# Attribute word sets (passed to weat as A and B): first names.
male = ['john', 'paul', 'mike', 'kevin', 'steve', 'greg', 'jeff', 'bill']
female = ['amy', 'joan', 'lisa', 'sarah', 'diana', 'kate', 'ann', 'donna']

print('')
print('Word embedding association test: %.3f' %
      weat(career, family, male, female, word_vectors))


Word embedding association test: 0.847


In [20]:
# Association strength of a single word, 'cat', with the career list (as A)
# versus the family list (as B).
association_strength('cat', career, family, word_vectors)

-0.06923520712962554