In [1]:
import numpy as np

In [2]:
def load_embedding(filepath):
    '''
    Load word/entity embeddings from a word2vec-style text file.

    File layout:
        line 1      : "<n_rows> <dim>" (two integers)
        other lines : "<token> <v_1> ... <v_dim>", whitespace separated.
    A token may itself contain whitespace; its pieces are re-joined with "_".
    Lines that are empty after stripping are skipped.

    Args:
        filepath: path of the embedding text file (read as UTF-8).

    Returns:
        (w2i, i2w, mat)
        w2i: dict mapping token -> row index (if two tokens collapse to the
             same key, the last row wins)
        i2w: dict mapping row index -> token (one entry per row)
        mat: float array of shape (n_rows, dim); every row is divided by its
             L2-norm so that dot(v1, v2) equals cos(v1, v2). All-zero rows
             are left as zeros instead of turning into NaN.

    Raises:
        ValueError: a line holds fewer than <dim> fields, or a vector
            component is not a valid float.
        IndexError: the file holds more rows than the header declares.
        AssertionError: the file holds fewer rows than the header declares.
    '''

    # utf-8 is given explicitly so that reading does not depend on the
    # platform's default locale encoding.
    with open(filepath, encoding="utf-8") as f:
        # first line declares the matrix shape
        n_word, m_dim = [int(i) for i in f.readline().strip().split()]
        # word vector matrix
        mat = np.zeros((n_word, m_dim))
        # word2index / index2word
        w2i = {}
        i2w = {}
        i_row = 0

        for line in f:
            ss = line.strip().split()
            if not ss:
                # tolerate blank lines (e.g. a trailing newline at EOF)
                continue
            if len(ss) < m_dim:
                raise ValueError(
                    "row %d: expected at least %d fields, found %d"
                    % (i_row, m_dim, len(ss))
                )

            # some words have spaces (different length after splitting)
            # use backward matching (match vectors first, then match words).
            # The split point is computed from the front so that it is also
            # correct when m_dim == 0 (ss[-0:] would be the whole list).
            n_word_fields = len(ss) - m_dim
            word = "_".join(ss[:n_word_fields])
            row = np.array([float(i) for i in ss[n_word_fields:]])

            # normalize the row by its L2-norm; skip all-zero rows so they do
            # not become NaN through a division by zero
            norm = np.linalg.norm(row)
            if norm > 0:
                row /= norm

            # <word, id> and <id, word> dictionaries
            w2i[word] = i_row
            i2w[i_row] = word
            # raises IndexError if the file has more rows than declared
            mat[i_row, :] = row
            i_row += 1

        assert i_row == n_word, (
            "header declares %d rows but %d were read" % (n_word, i_row)
        )

        return w2i, i2w, mat

In [3]:
# Load the enwiki embedding file into (word->index, index->word, matrix);
# the %time line magic reports how long this single load takes.
%time w2i, i2w, m = load_embedding("../entity_embedding/enwiki_20180420_300d.txt")

CPU times: user 6min 26s, sys: 31.5 s, total: 6min 57s
Wall time: 7min 27s


In [33]:
# Row of the embedding matrix for the "ENTITY/Donald_Trump" key.
q1 = m[w2i["ENTITY/Donald_Trump"]]

In [34]:
# Row of the embedding matrix for the "ENTITY/Hillary_Clinton" key.
q2 = m[w2i["ENTITY/Hillary_Clinton"]]

In [35]:
# Row of the embedding matrix for the "ENTITY/Kobe_Bryant" key.
q3 = m[w2i["ENTITY/Kobe_Bryant"]]

In [None]:
# Fetch the pretrained 'word2vec-google-news-300' model through gensim's
# downloader API. NOTE(review): presumably this downloads the model on first
# use and reads a local cache afterwards — confirm against the gensim docs.
import gensim.downloader as api
model = api.load('word2vec-google-news-300')

In [None]:
# Normalizes the vectors in the word2vec class.
model.init_sims(replace=True) 

In [None]:
# Display the model's vector for the word 'king'.
model['king']

In [None]:
# Display the model's vector for the word 'queen'.
model['queen']