In [3]:
# Word2Vec 的实现

import collections
import math
import random
import sys
import time
import os
import numpy as np

import torch
from torch import nn
import torch.utils.data as Data

sys.path.append("..")
import d2l_pytorch.d2l as d2l
print(torch.__version__)

2.2.1+cu121


In [4]:

# Load the PTB training split: one sentence per line, tokens separated by whitespace.
assert "ptb.train.txt" in os.listdir("../Datasets/PTB")

# An explicit encoding keeps the read independent of the platform's locale default.
with open("../Datasets/PTB/ptb.train.txt", "r", encoding="utf-8") as f:
  lines = f.readlines()
  # Each sentence becomes a list of whitespace-separated tokens.
  raw_dataset = [st.split() for st in lines]

print("# sentences: %d" % len(raw_dataset))

# Peek at the first three sentences: token count and the first five tokens.
for st in raw_dataset[:3]:
  print("# tokens: ", len(st), st[:5])


# sentences: 42068
# tokens:  24 ['aer', 'banknote', 'berlitz', 'calloway', 'centrust']
# tokens:  15 ['pierre', '<unk>', 'N', 'years', 'old']
# tokens:  11 ['mr.', '<unk>', 'is', 'chairman', 'of']


In [5]:
# Build the token index.
# To keep the computation simple, only tokens that occur at least 5 times
# in the dataset are kept.
counter = collections.Counter(tk for st in raw_dataset for tk in st)
counter = {tk: cnt for tk, cnt in counter.items() if cnt >= 5}

# Then map every kept token to an integer index (and back).
idx_to_token = list(counter)
token_to_idx = {tk: idx for idx, tk in enumerate(idx_to_token)}
# Re-encode each sentence as indices, dropping tokens outside the vocabulary.
dataset = [
  [token_to_idx[tk] for tk in st if tk in token_to_idx]
  for st in raw_dataset
]
num_tokens = sum(len(st) for st in dataset)
print("# tokens : %d " % num_tokens)


# tokens : 887100 


In [6]:
def discard(idx):
  """Randomly decide whether one occurrence of token index `idx` is dropped.

  The drop probability is 1 - sqrt(1e-4 / f), where f is the token's count in
  `counter` divided by `num_tokens`; tokens with f <= 1e-4 are never dropped.

  Returns True when the occurrence should be discarded.
  """
  keep_ratio = math.sqrt(1e-4 / counter[idx_to_token[idx]] * num_tokens)
  return random.uniform(0, 1) < 1 - keep_ratio

# Subsample: keep only the token occurrences that `discard` does not drop.
subsampled_dataset = [[tk for tk in st if not discard(tk)] for st in dataset]

# Total number of tokens that remain after subsampling.
print("# tokens: %d" % sum([len(st) for st in subsampled_dataset]))


# tokens: 375665


In [7]:
def compare_counts(token):
  """Report how often `token` occurs before and after subsampling.

  The token is looked up in `token_to_idx`, and its index is counted in every
  sentence of `dataset` and of `subsampled_dataset`.

  Returns the formatted summary string.
  """
  token_idx = token_to_idx[token]
  before = sum(st.count(token_idx) for st in dataset)
  after = sum(st.count(token_idx) for st in subsampled_dataset)
  return "# %s: before =%d, after = %d" % (token, before, after)

# Show the effect of subsampling on the token "the".
compare_counts("the")

'# the: before =50770, after = 2144'

In [8]:
def get_centers_and_contexts(dataset, max_window_size):
  """Extract every center word and its context words from `dataset`.

  Sentences with fewer than 2 tokens are skipped. For each position in a
  sentence a window size is drawn uniformly from 1..max_window_size, and the
  context is every token within that distance of the center, clipped to the
  sentence boundaries and excluding the center position itself.

  Returns:
    (centers, contexts): `centers` is a flat list of center tokens and
    `contexts[i]` is the list of context tokens for `centers[i]`.
  """
  centers, contexts = [], []
  for sentence in dataset:
    length = len(sentence)
    if length < 2:
      # A lone token has no context, so it cannot form a training pair.
      continue
    centers.extend(sentence)
    for pos in range(length):
      radius = random.randint(1, max_window_size)
      lo = max(0, pos - radius)
      hi = min(length, pos + 1 + radius)
      contexts.append([sentence[j] for j in range(lo, hi) if j != pos])

  return centers, contexts


In [9]:
# Small artificial dataset: one sentence of 7 tokens and one of 3 tokens.
tiny_dataset = [list(range(7)), list(range(7,10))]
print("dataset", tiny_dataset)
# Print every center word with its sampled context (maximum window size 2).
for center, context in zip(*get_centers_and_contexts(tiny_dataset, 2)):
  print("center", center, "has contexts", context)

dataset [[0, 1, 2, 3, 4, 5, 6], [7, 8, 9]]
center 0 has contexts [1]
center 1 has contexts [0, 2, 3]
center 2 has contexts [0, 1, 3, 4]
center 3 has contexts [1, 2, 4, 5]
center 4 has contexts [3, 5]
center 5 has contexts [3, 4, 6]
center 6 has contexts [4, 5]
center 7 has contexts [8]
center 8 has contexts [7, 9]
center 9 has contexts [7, 8]


In [10]:
# Extract all center/context pairs from the subsampled data with a maximum window size of 5.
all_centers, all_contexts = get_centers_and_contexts(subsampled_dataset, 5)

In [11]:
# Negative sampling
# Negative sampling is used for approximate training.
def get_negatives(all_contexts, sampling_weights, K):
  """Draw K noise words per context word for every center word.

  Args:
    all_contexts: list of context-index lists, one list per center word.
    sampling_weights: relative sampling weight per token index; entry i is
      the weight of token index i.
    K: number of noise words to draw for each context word.

  Returns:
    A list parallel to `all_contexts`; entry j holds len(all_contexts[j]) * K
    token indices, none of which appears in all_contexts[j].
  """
  all_negatives, neg_candidates, i = [], [], 0
  population = list(range(len(sampling_weights)))
  for contexts in all_contexts:
    # Build the exclusion set once per center word rather than once per draw.
    excluded = set(contexts)
    num_wanted = len(contexts) * K
    negatives = []
    while len(negatives) < num_wanted:
      if i == len(neg_candidates):
        # Candidates are drawn 1e5 at a time and consumed through the cursor
        # `i`; a fresh pool is drawn whenever the current one is used up.
        i, neg_candidates = 0, random.choices(population, sampling_weights, k=int(1e5))
      neg, i = neg_candidates[i], i + 1
      # A noise word must not be one of this center word's context words.
      if neg not in excluded:
        negatives.append(neg)
    all_negatives.append(negatives)

  return all_negatives


In [12]:
# Sampling weight of each token: its count raised to the power 0.75.
sampling_weights = [counter[w] ** 0.75 for w in idx_to_token]
# Draw 5 noise words for every context word.
all_negatives = get_negatives(all_contexts, sampling_weights, 5)


In [13]:
class MyDataset(torch.utils.data.Dataset):
  """Dataset of (center, contexts, negatives) triples held in three parallel sequences."""

  def __init__(self, centers, contexts, negatives):
    # The three sequences are indexed together, so they must have equal length.
    assert len(centers) == len(contexts) == len(negatives)
    self.centers, self.contexts, self.negatives = centers, contexts, negatives

  def __len__(self):
    """Number of examples (one per center word)."""
    return len(self.centers)

  def __getitem__(self, index):
    """Return the (center, contexts, negatives) triple stored at `index`."""
    triple = (self.centers[index], self.contexts[index], self.negatives[index])
    return triple


In [14]:
def batchify(data):
  """
  Used as the collate_fn argument of DataLoader: the input is a list whose
  length is the batch size, and each element is the result of calling the
  Dataset's __getitem__, i.e. a (center, context, negative) triple.

  Every example's context + negative list is right-padded with 0 to the
  longest such list in the batch.

  Returns a tuple of four tensors:
    centers            - center words, reshaped to (batch, 1)
    contexts_negatives - padded context + negative indices
    masks              - 1 for real entries, 0 for padding
    labels             - 1 for context entries, 0 for negatives and padding
  """
  max_len = max(len(context) + len(negative) for _, context, negative in data)
  centers, contexts_negatives, masks, labels = [], [], [], []
  for center, context, negative in data:
    num_ctx = len(context)
    num_valid = num_ctx + len(negative)
    num_pad = max_len - num_valid
    centers.append(center)
    contexts_negatives.append(context + negative + [0] * num_pad)
    masks.append([1] * num_valid + [0] * num_pad)
    labels.append([1] * num_ctx + [0] * (max_len - num_ctx))

  return (
    torch.tensor(centers).view(-1, 1),
    torch.tensor(contexts_negatives),
    torch.tensor(masks),
    torch.tensor(labels),
  )


In [16]:
batch_size = 512
# No DataLoader worker processes on Windows ("win32"); 4 workers elsewhere.
num_workers = 0 if sys.platform.startswith("win32") else 4
# NOTE(review): this rebinds `dataset`, which the earlier subsampling and
# compare_counts cells read as the list of token-index sentences. Re-running
# those cells after this one would see the MyDataset instead — consider a
# distinct name.
dataset = MyDataset(all_centers, all_contexts, all_negatives)
data_iter = Data.DataLoader(dataset, batch_size, shuffle=True, collate_fn=batchify, num_workers=num_workers)

# Print the shapes of the four tensors in the first batch only.
for batch in data_iter:
  for name, data in zip(["centers", "contexts_negatives", "masks", "labels"], batch):
    print(name, "shape: ", data.shape)
  break

centers shape:  torch.Size([512, 1])
contexts_negatives shape:  torch.Size([512, 60])
masks shape:  torch.Size([512, 60])
labels shape:  torch.Size([512, 60])


In [17]:
# Embedding layer
# The layer that looks up word embeddings is called an embedding layer.
# Its weight matrix has one row per index (20) and one column per embedding dimension (4).
embed = nn.Embedding(num_embeddings=20, embedding_dim=4)
embed.weight


Parameter containing:
tensor([[-1.4418,  0.0980, -0.3250, -0.7391],
        [-1.7709,  0.9966, -1.5428,  0.1804],
        [-0.2097, -0.9861,  1.6934,  0.8230],
        [ 0.9468, -0.7926,  0.0564,  1.4849],
        [-0.2669, -1.4841, -1.5706,  0.3875],
        [ 0.1888,  0.4935,  0.0440,  0.4638],
        [ 1.9455, -0.0733, -1.2296,  0.3174],
        [-1.4716,  0.9452, -0.4368,  1.0484],
        [-0.3472,  0.6737,  1.6569,  0.1197],
        [ 1.0856,  0.7494, -0.6048, -0.1343],
        [-0.5114, -1.0079, -1.1517,  0.7051],
        [ 0.7341, -0.0736,  2.3829,  0.0430],
        [ 0.0382,  1.2946, -0.6021,  0.9613],
        [ 0.0298,  0.5747,  0.5826, -0.7626],
        [-0.3054, -1.0021,  1.3428,  1.1644],
        [-1.3099,  0.8449,  1.2868, -0.7026],
        [ 0.0469, -1.1069,  0.3855,  0.0371],
        [-0.6478, -0.7745, -1.0327, -2.1436],
        [-0.0846,  0.9328, -1.8904,  0.1263],
        [ 1.3697, -0.8446,  0.2677, -1.5738]], requires_grad=True)

In [18]:
# Looking up a (2, 3) index tensor returns the matching weight rows, giving a (2, 3, 4) tensor.
x = torch.tensor([[1, 2, 3], [4, 5, 6]], dtype=torch.long)
embed(x)

tensor([[[-1.7709,  0.9966, -1.5428,  0.1804],
         [-0.2097, -0.9861,  1.6934,  0.8230],
         [ 0.9468, -0.7926,  0.0564,  1.4849]],

        [[-0.2669, -1.4841, -1.5706,  0.3875],
         [ 0.1888,  0.4935,  0.0440,  0.4638],
         [ 1.9455, -0.0733, -1.2296,  0.3174]]], grad_fn=<EmbeddingBackward0>)

In [19]:
# Batched matrix multiplication: (2, 1, 4) x (2, 4, 6) -> (2, 1, 6).
X = torch.ones((2, 1, 4))
Y = torch.ones((2, 4, 6))
torch.bmm(X, Y).shape

torch.Size([2, 1, 6])

In [20]:
# Skip-gram forward computation
# The skip-gram model takes the center word indices `center` and the
# concatenated context and noise word indices `contexts_and_negatives`.
# `center` has shape (batch size, 1) and `contexts_and_negatives` has shape
# (batch size, max_len).
def skip_gram(center, contexts_and_negatives, embed_v, embed_u):
  """Score every center word against its context and noise words.

  `embed_v` embeds the center indices and `embed_u` embeds the context/noise
  indices; the result is the batched matrix product of the center vectors
  with the transposed context/noise vectors. Given the input shapes described
  above, the output has shape (batch size, 1, max_len).
  """
  center_vecs = embed_v(center)
  other_vecs = embed_u(contexts_and_negatives)
  # Swap the last two axes so each batch entry multiplies (1, d) by (d, max_len).
  return torch.bmm(center_vecs, other_vecs.transpose(1, 2))


In [21]:
# Binary cross-entropy loss
class SigmoidBinaryCrossEntropyLoss(nn.Module):
  """Per-example sigmoid binary cross-entropy with an optional element-wise mask."""

  def __init__(self):
    super().__init__()

  def forward(self, inputs, targets, mask=None):
    """
    inputs - Tensor of logits, shape: (batch_size, len)
    targets - Tensor of the same shape as inputs
    mask - optional Tensor of the same shape used as a per-element weight
           (0 removes an element's contribution); None weights every
           element equally

    Returns a Tensor of shape (batch_size,): the weighted element losses
    averaged over dim 1. Masked-out elements still count in that average's
    denominator, so callers rescale if they want a mean over unmasked
    positions only.
    """
    inputs, targets = inputs.float(), targets.float()
    # The signature advertises mask=None, so only convert a mask that was
    # actually supplied; calling .float() on None would raise AttributeError.
    if mask is not None:
      mask = mask.float()
    res = nn.functional.binary_cross_entropy_with_logits(inputs, targets, reduction="none", weight=mask)
    return res.mean(dim=1)

In [22]:
loss = SigmoidBinaryCrossEntropyLoss()
pred = torch.tensor([[1.5, 0.3, -1, 2], [1.1, -0.6, 2.2, 0.4]])
label = torch.tensor([[1, 0, 0, 0], [1, 1, 0, 0]])
# The last element of the second row is masked out.
mask = torch.tensor([[1, 1, 1, 1], [1, 1, 1, 0]])
# Rescale each row's mean by len / (number of unmasked elements) so the
# average is taken over the unmasked positions only.
loss(pred, label, mask) * mask.shape[1] / mask.float().sum(dim=1)


tensor([0.8740, 1.2100])

In [23]:
def sigmd(x):
  """Return -log(sigmoid(x)), i.e. log(1 + exp(-x)), for a scalar x.

  The value is computed in a form that only ever exponentiates a
  non-positive number, so large negative x returns a finite result instead
  of raising OverflowError from math.exp.
  """
  if x >= 0:
    return math.log1p(math.exp(-x))
  # For x < 0: log(1 + exp(-x)) = -x + log(1 + exp(x)).
  return -x + math.log1p(math.exp(x))


# Hand check: a label-1 position contributes sigmd(logit), a label-0 position
# contributes sigmd(-logit); each row is averaged over its unmasked positions.
print("%.4f" % ((sigmd(1.5) + sigmd(-0.3) + sigmd(1) + sigmd(-2)) / 4))
print("%.4f" % ((sigmd(1.1) + sigmd(-0.6) + sigmd(-2.2)) / 3))


0.8740
1.2100


In [24]:
embed_size = 100
# Two embedding tables over the same vocabulary, wrapped in a Sequential
# container so they can be indexed as net[0] and net[1].
net = nn.Sequential(
  nn.Embedding(num_embeddings=len(idx_to_token), embedding_dim=embed_size),
  nn.Embedding(num_embeddings=len(idx_to_token), embedding_dim=embed_size),
)


In [27]:
# Define the training function
def train(net, lr, num_epochs):
  """Train the two embedding layers in `net` with Adam on the batches of `data_iter`.

  Args:
    net: indexable module; net[0] embeds center words and net[1] embeds
      context/noise words (they are passed to skip_gram in that order).
    lr: learning rate for the Adam optimizer.
    num_epochs: number of passes over `data_iter`.

  Prints the device once, then the mean batch loss and elapsed time per epoch.
  """
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  print("train on", device)
  net = net.to(device)
  optimizer = torch.optim.Adam(net.parameters(), lr=lr)
  for epoch in range(num_epochs):
    start = time.time()
    l_sum, n = 0.0, 0
    for batch in data_iter:
      center, context_negative, mask, label = (d.to(device) for d in batch)
      pred = skip_gram(center, context_negative, net[0], net[1])

      per_example = loss(pred.view(label.shape), label, mask)
      # Rescale each row's mean so it averages over unmasked positions only,
      # then average over the batch.
      l1 = (per_example * mask.shape[1] / mask.float().sum(dim=1)).mean()
      optimizer.zero_grad()
      l1.backward()
      optimizer.step()
      l_sum += l1.item()
      n += 1
    mean_loss = l_sum / n
    print("epoch %d, loss %.2f, time %.2fs" % (epoch + 1, mean_loss, time.time() - start))


In [28]:
# Train for 10 epochs with a learning rate of 0.01.
train(net, 0.01, 10)

train on cuda
epoch 1, loss 1.97, time 21.07s
epoch 2, loss 0.62, time 18.71s
epoch 3, loss 0.45, time 18.63s
epoch 4, loss 0.40, time 18.79s
epoch 5, loss 0.37, time 18.50s
epoch 6, loss 0.35, time 18.79s
epoch 7, loss 0.34, time 18.78s
epoch 8, loss 0.33, time 18.46s
epoch 9, loss 0.32, time 18.86s
epoch 10, loss 0.32, time 19.16s


In [29]:
def get_similar_tokens(query_token, k, embed):
  """Print k tokens whose embeddings are most cosine-similar to `query_token`.

  Args:
    query_token: token string, looked up in `token_to_idx`.
    k: number of neighbours to print.
    embed: embedding layer whose weight rows are the token vectors.
  """
  weights = embed.weight.data
  query_vec = weights[token_to_idx[query_token]]
  # Product of squared norms; the 1e-9 under the square root avoids dividing by zero.
  sq_norms = torch.sum(weights * weights, dim=1) * torch.sum(query_vec * query_vec)
  cos = torch.matmul(weights, query_vec) / (sq_norms + 1e-9).sqrt()
  _, best = torch.topk(cos, k=k + 1)
  # k + 1 hits are fetched and the top one is skipped — presumably the query
  # token itself, which has the highest similarity with its own vector.
  for i in best.cpu().numpy()[1:]:
    print("cosine sim=%.3f: %s" % (cos[i], (idx_to_token[i])))

# Print the 3 nearest neighbours of "chip" in the center-word embedding table net[0].
get_similar_tokens("chip", 3, net[0])

cosine sim=0.433: doyle
cosine sim=0.424: engineering
cosine sim=0.419: microprocessor
