In [6]:
import heapq
from os import path

import numpy as np
from scipy import spatial

In [1]:
def ugly_normalize(vecs):
    """Scale every row of ``vecs`` to unit Euclidean length.

    Rows whose norm is exactly zero are divided by 1 instead of 0, so they
    come back unchanged (all zeros) rather than as NaN.

    Args:
        vecs: Array whose rows are the vectors to normalize; the norm is
            taken along axis 1.

    Returns:
        A new array with each row divided by its own norm.
    """
    row_norms = np.sqrt((vecs * vecs).sum(axis=1))
    # Replace zero norms with 1 so the division below is a no-op for those rows.
    zero_rows = row_norms == 0
    row_norms[zero_rows] = 1
    # Dividing the transpose broadcasts one norm per original row.
    scaled_transposed = vecs.T / row_norms
    return scaled_transposed.T


class Embeddings:
    """Embedding matrix paired with its vocabulary.

    Row ``i`` of the array loaded from ``vecsfile`` is treated as the vector
    of the ``i``-th whitespace-separated token read from ``vocabfile``.
    """

    def __init__(self, vecsfile, vocabfile=None, normalize=True):
        """Load vectors and vocabulary from disk.

        Args:
            vecsfile: Path handed to ``np.load``.
            vocabfile: Path of the whitespace-separated vocabulary file. When
                omitted, it is derived from ``vecsfile`` by swapping a
                trailing ``.npy`` extension for ``.vocab``.
            normalize: When true, the rows are passed through
                ``ugly_normalize`` (each row divided by its L2 norm).
        """
        if vocabfile is None:
            if vecsfile.endswith(".npy"):
                # Swap only the extension; a blanket str.replace would also
                # rewrite "npy" inside directory or file names.
                vocabfile = vecsfile[:-len(".npy")] + ".vocab"
            else:
                # No .npy suffix: keep the historical substitution rule.
                vocabfile = vecsfile.replace("npy", "vocab")
        self._vecs = np.load(vecsfile)
        # Explicit encoding keeps non-ASCII vocabularies readable regardless
        # of the platform locale; the context manager closes the handle.
        with open(vocabfile, encoding="utf-8") as vocab_fh:
            self._vocab = vocab_fh.read().split()
        if normalize:
            self._vecs = ugly_normalize(self._vecs)
        # word -> row index; for a repeated word the last occurrence wins.
        self._w2v = {w: i for i, w in enumerate(self._vocab)}

    @classmethod
    def load(cls, vecsfile, vocabfile=None):
        """Alternative constructor; builds ``cls`` so subclasses get their own type."""
        return cls(vecsfile, vocabfile)

    def word2vec(self, w):
        """Return the stored vector for word ``w``.

        Raises:
            KeyError: if ``w`` is not in the vocabulary.
        """
        return self._vecs[self._w2v[w]]

    def similar_to_vec(self, v, N=10):
        """Return the ``N`` entries whose vectors have the largest dot product with ``v``.

        Returns:
            A list of ``(score, word, vector)`` tuples, best first.
        """
        sims = self._vecs.dot(v)
        # Rank on (score, word) only. Letting tuple comparison fall through
        # to the ndarray element raises ValueError when score and word tie.
        return heapq.nlargest(
            N,
            zip(sims, self._vocab, self._vecs),
            key=lambda entry: (entry[0], entry[1]),
        )

    def most_similar(self, word, N=10):
        """Return the ``N`` vocabulary entries closest to ``word`` by dot product.

        The query word itself is not filtered out of the result.

        Returns:
            A list of ``(score, word)`` tuples, best first.

        Raises:
            ValueError: if ``word`` is not in the vocabulary.
        """
        # list.index is kept deliberately: it preserves first-occurrence
        # lookup and the ValueError that existing callers may rely on.
        w = self._vocab.index(word)
        sims = self._vecs.dot(self._vecs[w])
        return heapq.nlargest(N, zip(sims, self._vocab))

In [2]:
def cosine_sim(word1, word2):
    """Return the cosine similarity between the vectors of two words.

    Both words are looked up with ``wv``; the result is one minus the cosine
    distance reported by ``spatial.distance.cosine``.
    """
    first_vec = wv(word1)
    second_vec = wv(word2)
    distance = spatial.distance.cosine(first_vec, second_vec)
    return 1 - distance

def wv(word):
    """Return the vector stored for ``word`` in the global ``glove`` object.

    The word is mapped to a row index through ``glove.dictionary`` and that
    index is used to pick a row of ``glove.word_vectors``.
    """
    # NOTE(review): `glove` is not defined in the visible cells; it must
    # already exist in the kernel before this is called.
    vectors = glove.word_vectors
    row = glove.dictionary[word]
    return vectors[row]

In [7]:
# Directory holding the embedding files (vecs.npy and vecs.vocab).
# NOTE(review): absolute, machine-specific path; consider a configurable data dir.
w2vf_path = '/home/defeater/Documents/NLP/word2vecf'
w2vf = Embeddings.load(
    path.join(w2vf_path, 'vecs.npy'),
    path.join(w2vf_path, 'vecs.vocab'),
)

In [13]:
# Alias the vocabulary list of the loaded embeddings.
# Note: this reads the private `_vocab` attribute of Embeddings directly.
f_vocab = w2vf._vocab

In [14]:
# Show the vocabulary list as the cell output.
f_vocab

['можно',
 'есть',
 'что',
 'так',
 'и',
 'нужно',
 'у',
 'это',
 'не',
 'меня',
 'как',
 'нет',
 'тебя',
 'может',
 'же',
 'то',
 'надо',
 'сделать']