In [1]:
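# Subword embedding
# word2vec does not make direct use of morphological information.
# For example, "dog" and "dogs" are represented by two separate word vectors, and the model does not directly express the relationship between those two vectors.
# fastText proposes subword embedding, which brings morphological information into the skip-gram model of word2vec.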

import torch


# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")

print(device)


cuda


In [10]:
# Practice with torchtext
import torchtext.vocab as vocab

# List the names of the pretrained vector sets whose alias mentions "glove".
list(filter(lambda alias: "glove" in alias, vocab.pretrained_aliases.keys()))

['glove.42B.300d',
 'glove.840B.300d',
 'glove.twitter.27B.25d',
 'glove.twitter.27B.50d',
 'glove.twitter.27B.100d',
 'glove.twitter.27B.200d',
 'glove.6B.50d',
 'glove.6B.100d',
 'glove.6B.200d',
 'glove.6B.300d']

In [15]:
# Load the 50-dimensional GloVe "6B" vectors, using cache_dir as the local cache
# (the recorded output shows the archive being downloaded there on first use).
cache_dir = "../Datasets/Glove"
glove = vocab.GloVe(cache=cache_dir, name="6B", dim=50)

../Datasets/Glove\glove.6B.zip: 862MB [16:21, 878kB/s]                                 
100%|█████████▉| 399999/400000 [00:20<00:00, 19775.77it/s]


In [18]:
print("all words len %d " % len(glove.stoi))

# Round-trip one token through the vocabulary (token -> index -> token).
# The index is kept in a variable so the forward lookup is not silently
# discarded and the reverse lookup does not rely on a hard-coded index.
beautiful_idx = glove.stoi["beautiful"]
glove.itos[beautiful_idx]

all words len 400000 


'beautiful'

In [19]:
def knn(W, x, k):
  """Return the k rows of W most similar to x by cosine similarity.

  Args:
    W: 2-D tensor whose rows are the candidate vectors.
    x: query tensor; it is flattened to 1-D before the dot product.
    k: number of nearest neighbours to return.

  Returns:
    A tuple (indices, sims). indices is a NumPy array of row indices into W,
    ordered from most to least similar; sims is a list with the matching
    cosine similarities as Python floats.
  """
  eps = 1e-9  # keeps both norms strictly positive
  query = x.view((-1,))
  row_norms = (torch.sum(W * W, dim=1) + eps).sqrt()
  # The query norm gets the same epsilon as the row norms: without it an
  # all-zero query divides 0 by 0 and every similarity becomes NaN.
  query_norm = (torch.sum(query * query) + eps).sqrt()
  cos = torch.matmul(W, query) / (row_norms * query_norm)
  # topk already returns the similarity values, so reuse them instead of
  # indexing back into cos one element at a time.
  top_vals, top_idx = torch.topk(cos, k=k)
  return top_idx.cpu().numpy(), top_vals.tolist()


In [20]:
def get_similar_tokens(query_token, k, embed):
  """Print k tokens from embed that are close to query_token by cosine similarity.

  Args:
    query_token: token to look up via embed.stoi.
    k: number of neighbours to print.
    embed: object exposing stoi, itos and vectors.
  """
  query_vec = embed.vectors[embed.stoi[query_token]]
  # Ask for one extra hit and skip the top one (presumably the query token itself).
  neighbours, scores = knn(embed.vectors, query_vec, k + 1)
  for rank in range(1, len(neighbours)):
    print("cosine sim = %.3f: %s " % (scores[rank], (embed.itos[neighbours[rank]])))


In [21]:
# Print the 3 tokens whose GloVe vectors are closest to "chip" by cosine similarity.
get_similar_tokens("chip", 3, glove)

cosine sim = 0.856: chips 
cosine sim = 0.749: intel 
cosine sim = 0.749: electronics 


In [22]:
# Print the 3 tokens whose GloVe vectors are closest to "baby" by cosine similarity.
get_similar_tokens("baby", 3, glove)

cosine sim = 0.839: babies 
cosine sim = 0.800: boy 
cosine sim = 0.792: girl 


In [23]:
def get_analogy(token_a, token_b, token_c, embed):
  """Answer "token_a is to token_b as token_c is to ?" with vector arithmetic.

  Looks up the three tokens in embed, forms b - a + c, and returns the token
  whose vector has the highest cosine similarity to that result.
  """
  vec_a, vec_b, vec_c = (
    embed.vectors[embed.stoi[tok]] for tok in (token_a, token_b, token_c)
  )
  target = vec_b - vec_a + vec_c
  # Only the index of the single best match is needed; the similarity is ignored.
  best, _ = knn(embed.vectors, target, 1)
  return embed.itos[best[0]]

In [25]:
# Analogy query: "man" is to "woman" as "son" is to ?
get_analogy("man", "woman", "son", glove)

'daughter'

In [26]:
# Analogy query: "beijing" is to "china" as "tokyo" is to ?
get_analogy("beijing", "china", "tokyo", glove)

'japan'

In [27]:
# Analogy query: "bad" is to "worst" as "big" is to ?
get_analogy("bad", "worst", "big", glove)

'biggest'

In [30]:
# Analogy query: "do" is to "did" as "go" is to ?
get_analogy("do", "did", "go", glove)

'went'