In [1]:
from mxnet import nd
from mxnet.contrib import text

In [2]:
# Ask mxnet.contrib.text for the names of the pretrained GloVe embedding files.
glove_vec = text.embedding.get_pretrained_file_names("glove")

In [3]:
# Show the available pretrained file names.
print(glove_vec)

['glove.42B.300d.txt', 'glove.6B.50d.txt', 'glove.6B.100d.txt', 'glove.6B.200d.txt', 'glove.6B.300d.txt', 'glove.840B.300d.txt', 'glove.twitter.27B.25d.txt', 'glove.twitter.27B.50d.txt', 'glove.twitter.27B.100d.txt', 'glove.twitter.27B.200d.txt']


In [4]:
# Build the embedding from the 50-dimensional "glove.6B.50d.txt" file.
# In the recorded run this cell downloaded glove.6B.zip (see the cell output).
glove_6b50d = text.embedding.create('glove', pretrained_file_name="glove.6B.50d.txt")

Downloading /Users/zhoujianyao/.mxnet/embeddings/glove/glove.6B.zip from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/embeddings/glove/glove.6B.zip...


In [5]:
# Number of indexed tokens in the loaded embedding.
word_size = len(glove_6b50d)
print(word_size)

400001


In [6]:
# Token -> index lookup.
index = glove_6b50d.token_to_idx['happy']
print(index)

1752


In [7]:
# Index -> token lookup. Reuse `index` from the token lookup cell instead of
# hard-coding 1752, which is only valid for this particular embedding file.
word = glove_6b50d.idx_to_token[index]
print(word)

happy


In [8]:
# Word vector for the token at `index` ('happy'); using the variable avoids a
# hard-coded 1752 that would silently point at another word in a different file.
print(glove_6b50d.idx_to_vec[index])


[ 0.092086    0.25709999 -0.58692998 -0.37029001  1.08280003 -0.55466002
 -0.78141999  0.58696002 -0.58714002  0.46318001 -0.11267     0.2606
 -0.26927999 -0.072466    1.24699998  0.30570999  0.56730998  0.30509001
 -0.050312   -0.64442998 -0.54513001  0.86429     0.20914     0.56334001
  1.12279999 -1.05159998 -0.78105003  0.29655999  0.72610003 -0.61391997
  2.4224999   1.01419997 -0.17753001  0.4147     -0.12966    -0.47064
  0.38069999  0.16309001 -0.32300001 -0.77898997 -0.42473    -0.30825999
 -0.42242     0.055069    0.38266999  0.037415   -0.43020001 -0.39442
  0.10511     0.87286001]
<NDArray 50 @cpu(0)>


# GloVe 应用

In [13]:
# Cosine similarity
def cos_sim(x, y):
    """Return the cosine similarity of ``x`` and ``y``.

    Computed as ``dot(x, y) / (x.norm() * y.norm())``. No epsilon is added to
    the denominator, so a zero-norm input is not guarded against.
    """
    inner = nd.dot(x, y)
    scale = x.norm() * y.norm()
    return inner / scale

In [14]:
# Sanity check: b is 100 * a, so both point in the same direction and the
# cosine similarity is expected to be 1.
a = nd.array([4,5])
b = nd.array([400,500])
print(cos_sim(a,b))


[ 1.]
<NDArray 1 @cpu(0)>


In [9]:
# Nearest-neighbour (synonym) search
def norm_vecs_by_row(x):
    """Divide every row of ``x`` by that row's L2 norm."""
    # The 1e-10 added under the square root is for numerical stability
    # (it keeps the divisor non-zero for an all-zero row).
    squared_norms = nd.sum(x * x, axis=1) + 1e-10
    row_norms = nd.sqrt(squared_norms).reshape((-1, 1))
    return x / row_norms

def get_knn(token_embedding, k, word):
    """Return the ``k`` tokens whose embeddings are most similar to ``word``.

    Similarity is the dot product between the row-normalised vocabulary
    vectors and the query vector, which ranks tokens by cosine similarity.

    Args:
        token_embedding: embedding object providing ``get_vecs_by_tokens``,
            ``idx_to_vec``, ``to_tokens`` and ``len()``.
        k: number of neighbours to return.
        word: query token.

    Returns:
        A list of up to ``k`` tokens, best match first, never containing
        ``word`` itself.
    """
    query_vec = token_embedding.get_vecs_by_tokens([word]).reshape((-1, 1))
    vocab_vecs = norm_vecs_by_row(token_embedding.idx_to_vec)
    # The query vector is left unnormalised: its length scales every score by
    # the same non-negative factor, so the ranking is unaffected.
    scores = nd.dot(vocab_vecs, query_vec).reshape((len(token_embedding), ))
    # Request one extra candidate so that k remain once the query is removed.
    top_idx = nd.topk(scores, k=k + 1, ret_typ='indices')
    # One device-to-host copy instead of one asscalar() sync per element.
    candidates = token_embedding.to_tokens([int(i) for i in top_idx.asnumpy()])
    # Drop the query word explicitly rather than assuming it is ranked first;
    # with ties or an unknown query the first hit may be a different token.
    neighbours = [tok for tok in candidates if tok != word]
    return neighbours[:k]

In [16]:
# The 10 nearest neighbours of 'baby' in the embedding space.
sim_list = get_knn(glove_6b50d, 10, 'baby')
print(sim_list)

['babies', 'boy', 'girl', 'newborn', 'pregnant', 'mom', 'child', 'toddler', 'mother', 'cat']


In [18]:
# Cosine similarity between 'baby' and 'babies', the top-ranked neighbour
# in the recorded output of the previous cell.
sim_val = cos_sim(glove_6b50d.get_vecs_by_tokens('baby'), glove_6b50d.get_vecs_by_tokens('babies'))
print(sim_val)


[ 0.83871293]
<NDArray 1 @cpu(0)>


In [20]:
# Nearest neighbours of 'computer'.
print(get_knn(glove_6b50d, 10, 'computer'))

['computers', 'software', 'technology', 'electronic', 'internet', 'computing', 'devices', 'digital', 'applications', 'pc']


In [21]:
# Nearest neighbours of 'run'.
print(get_knn(glove_6b50d, 10, 'run'))

['running', 'runs', 'went', 'start', 'ran', 'out', 'third', 'home', 'off', 'got']


In [22]:
# Nearest neighbours of 'love'.
print(get_knn(glove_6b50d, 10, 'love'))

['dream', 'life', 'dreams', 'loves', 'me', 'my', 'mind', 'loving', 'wonder', 'soul']


In [26]:
# Word analogy: for "a : b :: c : ?", search near vec(c) + vec(b) - vec(a)
def get_top_k_by_analogy(token_embedding, k, word1, word2, word3):
    """Return the ``k`` tokens that best complete ``word1 : word2 :: word3 : ?``.

    The query vector is ``vec(word2) - vec(word1) + vec(word3)``; tokens are
    ranked by its dot product with the row-normalised vocabulary vectors.
    The three input words are not excluded from the candidates.
    """
    embedded = token_embedding.get_vecs_by_tokens([word1, word2, word3])
    vec_a, vec_b, vec_c = embedded[0], embedded[1], embedded[2]
    target = (vec_b - vec_a + vec_c).reshape((-1, 1))
    unit_vocab = norm_vecs_by_row(token_embedding.idx_to_vec)
    scores = nd.dot(unit_vocab, target).reshape((len(token_embedding), ))
    best = nd.topk(scores, k=k, ret_typ='indices')
    return token_embedding.to_tokens([int(idx.asscalar()) for idx in best])

In [27]:
# Check how close vec(c) + vec(b) - vec(a) is to vec(d),
# e.g. vec(son) + vec(woman) - vec(man) versus vec(daughter).
def cos_sim_word_analogy(token_embedding, word1, word2, word3, word4):
    """Cosine similarity between the analogy vector and ``word4``.

    The analogy vector is ``vec(word2) - vec(word1) + vec(word3)``.
    """
    vecs = token_embedding.get_vecs_by_tokens([word1, word2, word3, word4])
    predicted = vecs[1] - vecs[0] + vecs[2]
    return cos_sim(predicted, vecs[3])

In [28]:
# man : woman :: son : ?
word_list = get_top_k_by_analogy(glove_6b50d,1, 'man', 'woman', 'son')

In [29]:
# Show the top-ranked completion of the analogy.
print(word_list)

['daughter']


In [30]:
# Same analogy with word2 and word3 swapped. Both are added to the query
# vector (only word1 is subtracted), so the query vector is unchanged.
word_list = get_top_k_by_analogy(glove_6b50d, 1, 'man', 'son', 'woman')
print(word_list)

['daughter']


In [31]:
# Cosine similarity between vec(woman) - vec(man) + vec(son) and vec(daughter).
sim_val = cos_sim_word_analogy(glove_6b50d, 'man', 'woman', 'son', 'daughter')
print(sim_val)


[ 0.96583432]
<NDArray 1 @cpu(0)>


In [34]:
# beijing : china :: tokyo : ?
word_list = get_top_k_by_analogy(glove_6b50d, 1, 'beijing', 'china', 'tokyo')

In [35]:
# Show the top-ranked completion of the analogy.
print(word_list)

['japan']


In [38]:
# bad : worst :: big : ?
word_list = get_top_k_by_analogy(glove_6b50d, 1, 'bad', 'worst', 'big')

In [39]:
# Show the top-ranked completion of the analogy.
print(word_list)

['biggest']
