In [9]:
import numpy as np

In [10]:
def parse_glove_file(file_path, ndim):
    """Read a GloVe text file into a word list and a dense float32 matrix.

    Each non-blank line is expected to hold a word followed by ``ndim``
    whitespace-separated numbers. Blank lines (e.g. a trailing empty line)
    are ignored. The file is read twice: once to count rows so the matrix
    can be allocated up front, once to fill it.

    Args:
        file_path: path to the UTF-8 encoded vectors file.
        ndim: number of components expected on every line.

    Returns:
        (words, W): ``words`` is the list of words in file order and ``W`` is
        a ``(len(words), ndim)`` float32 array whose row ``i`` belongs to
        ``words[i]``.

    Raises:
        ValueError: if a line has no numbers after the word, or does not
            contain exactly ``ndim`` parseable numbers.
    """
    # Count non-blank lines to pre-allocate memory. The encoding is given
    # explicitly so non-ASCII words decode the same on every platform.
    with open(file_path, 'r', encoding='utf-8') as f:
        line_count = sum(1 for line in f if line.strip())
    print("Found {:,} words.".format(line_count))

    # Pre-allocate vectors as a contiguous array
    W = np.zeros((line_count, ndim), dtype=np.float32)
    words = []

    print("Parsing vectors... ", end="")
    with open(file_path, 'r', encoding='utf-8') as f:
        for line_no, line in enumerate(f, start=1):
            if not line.strip():
                continue
            parts = line.split(maxsplit=1)
            if len(parts) != 2:
                raise ValueError(
                    "line {}: expected a word followed by {} numbers".format(line_no, ndim))
            word, numbers = parts
            vector = np.fromstring(numbers, dtype=np.float32, sep=" ")
            if vector.shape[0] != ndim:
                raise ValueError(
                    "line {}: expected {} numbers for word {!r}, found {}".format(
                        line_no, ndim, word, vector.shape[0]))
            # The row index is the number of words stored so far, so skipped
            # blank lines never leave gaps in W.
            W[len(words)] = vector
            words.append(word)

    print("Done! (W.shape = {:s})".format(str(W.shape)))
    return words, W

def find_nn_cos(v, Wv, k=10):
    """Find the ``k`` rows of ``Wv`` most cosine-similar to ``v``.

    Args:
        v: query vector of length ``Wv.shape[1]``.
        Wv: 2-D array with one candidate vector per row.
        k: number of neighbours to return; capped at the number of rows.

    Returns:
        (nns, sims): ``nns`` holds the row indices of the neighbours in
        ascending order of similarity (the best match is last) and ``sims``
        holds the matching cosine similarities. Rows whose similarity is
        undefined (zero-norm row or zero-norm query) are scored ``-inf``.

    Raises:
        ValueError: if ``k`` is negative.
    """
    if k < 0:
        raise ValueError("k must be non-negative, got {}".format(k))

    denom = np.linalg.norm(v) * np.linalg.norm(Wv, axis=1)
    with np.errstate(divide="ignore", invalid="ignore"):
        cos_similarity = np.dot(Wv, v) / denom
    # argsort places NaN last, which would rank undefined similarities as the
    # best matches; push them to the bottom instead.
    cos_similarity = np.where(np.isnan(cos_similarity), -np.inf, cos_similarity)

    order = np.argsort(cos_similarity)
    # Slice from an explicit start index: order[-k:] with k == 0 is order[0:],
    # i.e. every row rather than none.
    nns = order[order.size - min(k, order.size):]
    return (nns, cos_similarity[nns])

def analogy(vA, vB, vC, Wv, k=5):
    """Answer "A is to B as C is to ?" by vector offset.

    Builds the target vector ``vB - vA + vC`` and hands it to
    ``find_nn_cos`` together with ``Wv`` and ``k``; the result of that call
    is returned unchanged.
    """
    target = vB - vA + vC
    return find_nn_cos(target, Wv, k)

class Glove:
    """GloVe vectors loaded from a text file, with word lookup and analogies.

    Attributes:
        words: list of words in file order.
        W: array with one row per word, each scaled to unit L2 norm
            (all-zero rows are left as zeros).
        word_index: dict mapping each word to its row index in ``W``.
    """

    def __init__(self, file_path, ndim):
        """Parse ``file_path`` (``ndim`` numbers per word) and normalise the rows."""
        self.words, W = parse_glove_file(file_path, ndim)
        norms = np.linalg.norm(W, axis=1, keepdims=True)
        # A zero-norm row would divide 0 by 0 and become NaN, which then
        # contaminates every similarity computed against it; dividing by 1
        # keeps such a row at zero.
        norms[norms == 0] = 1
        self.W = W / norms
        self.word_index = {word: i for i, word in enumerate(self.words)}

    def vector(self, word):
        """Return the normalised vector for ``word``; raises KeyError if unknown."""
        return self.W[self.word_index[word]]

    def analogy(self, a, b, c, k=5):
        """Return ``k`` words answering "a is to b as c is to ?".

        Words come back in the order produced by the module-level
        ``analogy`` helper (ascending similarity, so the best match is last).
        The query words themselves are not filtered out of the result.
        Raises KeyError if ``a``, ``b`` or ``c`` is not in the vocabulary.
        """
        return [self.words[i] for i in analogy(self.vector(a), self.vector(b), self.vector(c), self.W, k)[0]]


In [12]:
# Load the 200-dimensional vectors; Glove.__init__ parses the file and normalises each row.
# NOTE(review): this is an absolute, machine-specific path — point it at your own vectors.txt before re-running.
glove = Glove("/home/madereth/Projects/w210-final/data/glove/vectors.txt", 200)

Found 100,001 words.
Parsing vectors... Done! (W.shape = (100001, 200))


In [14]:
# he : she :: king : ?  — neighbours are listed in ascending similarity, so the closest word is last.
glove.analogy("he", "she", "king")

['prince', 'luther', 'mary', 'queen', 'king']

In [16]:
# he : she :: clinton : ?
glove.analogy("he", "she", "clinton")

['bush', 'clintons', 'rodham', 'hillary', 'clinton']

In [18]:
# democrat : republican :: clinton : ?
glove.analogy("democrat", "republican", "clinton")

['aides', 'bushs', 'clintons', 'bush', 'clinton']

In [21]:
# democrat : republican :: tax : ?
glove.analogy("democrat", "republican", "tax")

['budget', 'spending', 'income', 'taxes', 'tax']

In [23]:
# democrat : republican :: budget : ?
glove.analogy("democrat", "republican", "budget")

['deficit', 'budgets', 'spending', 'fiscal', 'budget']

In [25]:
# he : she :: president : ?
glove.analogy("he", "she", "president")

['mrs', 'chairwoman', 'pres', 'vice', 'president']

In [27]:
# republican : democrat :: reagan : ?
glove.analogy("republican", "democrat", "reagan")

['administration', 'nixon', 'ronald', 'reagans', 'reagan']

In [35]:
# president : executive :: congress : ?
glove.analogy("president", "executive", "congress")

['legislature', 'committee', 'lawmakers', 'congressional', 'congress']