In [12]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn.preprocessing import normalize
%matplotlib inline

In [13]:
def load_embeddings(filename):
    
    labels = []
    rows = []
    with open(filename, encoding='utf-8') as infile:
        for i, line in enumerate(infile):
            if i % 100000 == 0:
                print(i)
            items = line.rstrip().split(' ')
            if len(items) == 2:
                continue
            labels.append(items[0])
            
            values = np.array([float(x) for x in items[1:]], 'f')
            # Normalize the values
            ßvalues = normalize(values.reshape(1, -1))
            
            rows.append(values)
        
        arr = np.vstack(rows)
        return pd.DataFrame(arr, index=labels, dtype='f')
    
def show_analogy(embeds, a, b, c, k=5):
    """Compute and print a vector analogy."""
    a, b, c = a.lower(), b.lower(), c.lower()
    va = embeds[embeds.index == a].to_numpy()[0]
    vb = embeds[embeds.index == b].to_numpy()[0]
    vc = embeds[embeds.index == c].to_numpy()[0]
    print("'{a:s}' is to '{b:s}' as '{c:s}' is to ___".format(**locals()))
    
    v = vb - va + vc
    dot_product = np.dot(glove.to_numpy(), v)
    
    other_words_norms = np.linalg.norm(glove.to_numpy(), axis=1)
    this_word_norm = np.linalg.norm(v)

    cos_similarity = np.divide(dot_product, other_words_norms*this_word_norm)

    nns = np.argsort(cos_similarity)[-k:][::-1]
    ds = cos_similarity[nns]
    
    for i, sim in zip(nns, ds):
        target_word = embeds.iloc[i].name
        print("{:.03f} : '{:s}'".format(sim, target_word))
    print("")

In [14]:
glove = load_embeddings('data/glove.6B.100d.txt')

0
100000
200000
300000


In [15]:
show_analogy(glove, "king", "man", "queen")

'king' is to 'man' as 'queen' is to ___
0.804 : 'woman'
0.779 : 'man'
0.735 : 'girl'
0.682 : 'she'
0.659 : 'her'



In [16]:
show_analogy(glove, "man", "shopkeeper", "woman")

'man' is to 'shopkeeper' as 'woman' is to ___
0.801 : 'shopkeeper'
0.642 : 'homemaker'
0.634 : 'schoolteacher'
0.620 : 'housewife'
0.597 : 'passerby'



In [101]:
# Profession words to check
professions = ['businessman','manager','legislator','maid','waiter','waitress','janitor'
               ,'doorman','custodian','gardener','landscaper','stonemason'
               ,'governor','doctor','nurse','attorney','lawyer','dentist','astronaut'
               ,'plumber','barber','hairdresser','cashier','dishwasher','nanny','manicurist'
               ,'bartender','carpenter','programmer','ceo','vp','executive','accountant']

In [104]:
for profession in professions:
    show_analogy(glove, "american", profession, "mexican")

'american' is to 'businessman' as 'mexican' is to ___
0.749 : 'businessman'
0.566 : 'mexican'
0.559 : 'banker'
0.547 : 'financier'
0.546 : 'salinas'

'american' is to 'manager' as 'mexican' is to ___
0.702 : 'manager'
0.571 : 'boss'
0.547 : 'luis'
0.541 : 'roberto'
0.522 : 'manuel'

'american' is to 'legislator' as 'mexican' is to ___
0.746 : 'legislator'
0.629 : 'lawmaker'
0.623 : 'councilor'
0.547 : 'pri'
0.523 : 'dpp'

'american' is to 'maid' as 'mexican' is to ___
0.789 : 'maid'
0.629 : 'housekeeper'
0.575 : 'flor'
0.535 : 'homolka'
0.515 : 'waitress'

'american' is to 'waiter' as 'mexican' is to ___
0.802 : 'waiter'
0.668 : 'bartender'
0.637 : 'waitress'
0.575 : 'busboy'
0.568 : 'housekeeper'

'american' is to 'waitress' as 'mexican' is to ___
0.796 : 'waitress'
0.698 : 'bartender'
0.657 : 'waiter'
0.621 : 'housekeeper'
0.576 : 'barmaid'

'american' is to 'janitor' as 'mexican' is to ___
0.767 : 'janitor'
0.617 : 'bookkeeper'
0.612 : 'housekeeper'
0.611 : 'bartender'
0.580 : 'busb