In [1]:
import numpy as np

In [2]:
def load_embedding(filepath):

    '''
    Load word/entity embeddings from a word2vec-style text file.

    The first line holds two integers: the number of rows and the vector
    dimension. Every following non-blank line holds a word (which may itself
    contain spaces) followed by that many floats.

    Args:
        filepath: path to the embedding text file (read as UTF-8)
    Returns:
        w2i: dict mapping word -> row index (spaces inside a word become "_")
        i2w: dict mapping row index -> word (inverse of w2i)
        mat: (n_word, m_dim) float array; every non-zero row has unit L2-norm
    Raises:
        ValueError: a row has fewer than m_dim fields, or a field is not a float
        IndexError: the file contains more rows than the header declares
        AssertionError: the file contains fewer rows than the header declares
    '''

    # Explicit encoding: entity names contain non-ASCII characters and the
    # locale default is platform dependent.
    with open(filepath, encoding="utf-8") as f:
        # first line implies the dimension
        n_word, m_dim = [int(i) for i in f.readline().strip().split()]
        # word vector matrix
        mat = np.zeros((n_word, m_dim))
        # word2index
        w2i = {}
        i_row = 0

        for line in f:
            ss = line.strip().split()
            # tolerate blank lines (e.g. a trailing newline at end of file)
            if not ss:
                continue
            if len(ss) < m_dim:
                raise ValueError(
                    "row %d has %d fields, expected at least %d"
                    % (i_row, len(ss), m_dim)
                )

            # some words have spaces (different length after splitting)
            # use backward matching (match vectors first, then match words);
            # the split point comes from the header, not a hard-coded width
            n_name = len(ss) - m_dim
            word = "_".join(ss[:n_name])
            row = np.array([float(x) for x in ss[n_name:]], dtype=float)

            # normalize each row by its L2-norm while loading.
            # This makes dot(v1,v2) and cos(v1,v2) equivalent for any v1, v2
            # (rows) in this matrix. All-zero rows are left as zeros rather
            # than being turned into NaNs by a division by zero.
            norm = np.linalg.norm(row)
            if norm > 0:
                row /= norm

            # <word, id> dictionary
            w2i[word] = i_row
            # word vector matrix
            mat[i_row, :] = row
            i_row += 1

        assert i_row == n_word, (
            "header declares %d rows but file contains %d" % (n_word, i_row)
        )

        # index2word
        i2w = {i: w for w, i in w2i.items()}

        return w2i, i2w, mat

In [3]:
# Load the enwiki embeddings (300-d per the file name) and time the call.
# The recorded run below took roughly 7.5 minutes of wall time, so avoid
# re-running this cell unnecessarily.
%time w2i, i2w, m = load_embedding("../entity_embedding/enwiki_20180420_300d.txt")

CPU times: user 6min 26s, sys: 31.5 s, total: 6min 57s
Wall time: 7min 27s


In [33]:
# Embedding row for the Donald Trump entity (rows of `m` are L2-normalized by load_embedding).
q1 = m[w2i["ENTITY/Donald_Trump"]]

In [34]:
# Embedding row for the Hillary Clinton entity (rows of `m` are L2-normalized by load_embedding).
q2 = m[w2i["ENTITY/Hillary_Clinton"]]

In [35]:
# Embedding row for the Kobe Bryant entity (rows of `m` are L2-normalized by load_embedding).
q3 = m[w2i["ENTITY/Kobe_Bryant"]]

In [1]:
# gensim's downloader API loads pretrained models by name.
import gensim.downloader as api
# Load the pretrained 'word2vec-google-news-300' word vectors.
# NOTE(review): presumably this downloads the model on first use if it is not
# already available locally — confirm against the gensim downloader docs.
model = api.load('word2vec-google-news-300')

In [3]:
# Normalizes the vectors in the word2vec class.
# replace=True is passed so that only the normalized vectors are kept.
# NOTE(review): init_sims appears to be deprecated in gensim 4.x — confirm the
# installed version and the replacement recommended by the migration guide.
model.init_sims(replace=True) 

In [6]:
# Sanity check: number of components in a single word vector (recorded output: 300).
model['king'].size

300

In [7]:
# Display the full vector stored for 'king' (normalized if the init_sims cell has been run).
model['king']

array([ 4.34063822e-02,  1.02627501e-02,  2.96526169e-03,  4.81171533e-02,
       -8.83269403e-03, -1.24498932e-02,  3.85273732e-02, -6.83061704e-02,
        1.76653881e-02,  1.25171900e-01, -8.34479332e-02, -1.04309916e-01,
       -6.12400137e-02, -8.58033169e-03, -5.78751788e-02, -5.85481450e-02,
        1.19451676e-02,  1.79808424e-03,  1.59829706e-02,  4.44158353e-02,
        4.71077040e-02,  3.88638563e-02,  2.05255002e-02,  4.71077040e-02,
        3.48260514e-02, -6.09035306e-02, -8.68127644e-02,  2.06096210e-02,
        1.17769256e-01, -1.07254144e-02,  3.60037461e-02,  2.12825872e-02,
        4.29016575e-02,  1.37958273e-01, -1.11039586e-01,  2.89375894e-02,
        1.34593435e-02,  2.01890152e-03,  2.42268182e-02,  5.95575981e-02,
        4.77806702e-02, -7.97466114e-02,  9.75802392e-02,  4.91266064e-02,
        1.17769256e-01, -8.24384764e-03, -3.78544033e-02,  1.14404419e-02,
       -1.88430808e-02,  5.27858641e-03, -5.58562763e-02,  5.45103438e-02,
       -8.95046368e-02,  