# 10.3 word2vec的实现

In [1]:
import collections
import math
import random
import sys
import time
import os
import numpy as np
import torch
from torch import nn
import torch.utils.data as Data

sys.path.append("..") 
import d2lzh_pytorch as d2l
print(torch.__version__)

1.6.0+cpu


## 10.3.1 处理数据集

In [2]:
assert 'ptb.train.txt' in os.listdir("../../data/ptb")

In [3]:
# Read every line of the PTB training split into memory.
with open('../../data/ptb/ptb.train.txt', 'r') as f:
    lines = f.readlines()

# Split each line (one sentence per line) on whitespace into a list of tokens.
raw_dataset = list(map(str.split, lines))

'# sentences: %d' % len(raw_dataset)

'# sentences: 42068'

In [4]:
for st in raw_dataset[:3]:
    print('# tokens:', len(st), st[:5])

# tokens: 24 ['aer', 'banknote', 'berlitz', 'calloway', 'centrust']
# tokens: 15 ['pierre', '<unk>', 'N', 'years', 'old']
# tokens: 11 ['mr.', '<unk>', 'is', 'chairman', 'of']


### 10.3.1.1 建立词语索引

In [5]:
# Count how often each token (tk) occurs across all sentences (st).
counter = collections.Counter(tk for st in raw_dataset for tk in st)
# To keep the computation manageable, retain only tokens seen at least 5 times.
counter = {tk: freq for tk, freq in counter.items() if freq >= 5}

In [6]:
# Map every retained token to an integer index (its position in idx_to_token).
idx_to_token = list(counter)
print('idx_to_token[1]:'+str(idx_to_token[1]))
token_to_idx = {tk: idx for idx, tk in enumerate(idx_to_token)}
# Report only the vocabulary size: printing the whole mapping would dump one
# entry per vocabulary word into the cell output.
print('vocabulary size: %d' % len(token_to_idx))
# Re-encode each sentence as a list of indices, dropping tokens that were
# filtered out of the vocabulary.
dataset = [[token_to_idx[tk] for tk in st if tk in token_to_idx]
           for st in raw_dataset]
num_tokens = sum(len(st) for st in dataset)
'# tokens: %d' % num_tokens # expected output: '# tokens: 887100'

idx_to_token[1]:<unk>


'# tokens: 887100'

### 10.3.1.2 二次采样

In [7]:
# 如果直接套用公式的max 会导致全部被丢弃
def discard(idx):
    """Decide at random whether one occurrence of token `idx` is dropped.

    Reads the module-level `counter`, `idx_to_token` and `num_tokens`.
    The token is dropped when a uniform draw from [0, 1] falls below
    1 - sqrt(1e-4 * num_tokens / count(token)). For rare tokens that
    threshold is negative, so they are never dropped.

    Args:
        idx: integer index of the token in `idx_to_token`.

    Returns:
        True if this occurrence should be discarded, False otherwise.
    """
    draw = random.uniform(0, 1)
    token_count = counter[idx_to_token[idx]]
    keep_prob = math.sqrt(1e-4 / token_count * num_tokens)
    return draw < 1 - keep_prob

subsampled_dataset = [[tk for tk in st if not discard(tk)] for st in dataset]
'# tokens: %d' % sum([len(st) for st in subsampled_dataset])

'# tokens: 375471'

In [8]:
def compare_counts(token):
    """Summarise how often `token` occurs before and after subsampling.

    Reads the module-level `token_to_idx`, `dataset` (sentences as index
    lists) and `subsampled_dataset`.
    NOTE: a later cell rebinds `dataset` to a MyDataset instance, so this
    helper only gives meaningful counts when called before that cell runs.

    Args:
        token: the word (string) to look up.

    Returns:
        A string of the form '# <token>: before=<n>, after=<m>'.
    """
    idx = token_to_idx[token]
    before = sum(st.count(idx) for st in dataset)
    after = sum(st.count(idx) for st in subsampled_dataset)
    return '# %s: before=%d, after=%d' % (token, before, after)

compare_counts('the')

'# the: before=50770, after=2050'

In [9]:
# 低频词
compare_counts('join')

'# join: before=45, after=45'

### 10.3.1.3 提取中心词和背景词

In [10]:
def get_centers_and_contexts(dataset, max_window_size):
    """Extract every center word together with its context words.

    For each position in each sentence a window radius is drawn uniformly
    from 1..max_window_size; the context is every word within that radius
    (clipped to the sentence boundaries), excluding the center itself.

    Args:
        dataset: iterable of sentences, each a list of token indices.
        max_window_size: largest window radius that may be drawn (>= 1).

    Returns:
        (centers, contexts): `centers` is a flat list of center words and
        `contexts[i]` is the list of context words for `centers[i]`.
    """
    centers, contexts = [], []
    for sentence in dataset:
        length = len(sentence)
        # A sentence needs at least two words to form a center/context pair.
        if length < 2:
            continue
        centers.extend(sentence)
        for pos, _ in enumerate(sentence):
            # Draw a fresh window radius for every center position.
            radius = random.randint(1, max_window_size)
            lo = max(0, pos - radius)
            hi = min(length, pos + 1 + radius)
            # Collect the window while leaving out the center word itself.
            contexts.append([sentence[j] for j in range(lo, hi) if j != pos])
    return centers, contexts

In [11]:
# 人工创造数据集进行测试
tiny_dataset = [list(range(7)), list(range(7, 10))]
print('dataset', tiny_dataset)
for center, context in zip(*get_centers_and_contexts(tiny_dataset, 2)):
    print('center', center, 'has contexts', context)

dataset [[0, 1, 2, 3, 4, 5, 6], [7, 8, 9]]
center 0 has contexts [1, 2]
center 1 has contexts [0, 2]
center 2 has contexts [0, 1, 3, 4]
center 3 has contexts [1, 2, 4, 5]
center 4 has contexts [2, 3, 5, 6]
center 5 has contexts [4, 6]
center 6 has contexts [4, 5]
center 7 has contexts [8, 9]
center 8 has contexts [7, 9]
center 9 has contexts [7, 8]


In [12]:
# 实验中，我们设置最大的背景窗口大小为5
all_centers, all_contexts = get_centers_and_contexts(subsampled_dataset, 5)

## 10.3.2 负采样

In [13]:
def get_negatives(all_contexts, sampling_weights, K):
    """Draw K noise (negative) words for every context word.

    Candidate indices are drawn in bulk with `random.choices`, weighted by
    `sampling_weights`, and consumed one at a time; a candidate is rejected
    when it is one of the context words of the current center word.

    Args:
        all_contexts: list of context-word lists, one per center word.
        sampling_weights: per-index sampling weight; index i refers to word i.
        K: number of noise words to draw per context word.

    Returns:
        A list parallel to `all_contexts`; entry i holds
        len(all_contexts[i]) * K noise-word indices.

    Note:
        The rejection loop does not terminate if every index that can be
        sampled is also a context word of some entry.
    """
    all_negatives, neg_candidates, i = [], [], 0
    population = list(range(len(sampling_weights)))
    for contexts in all_contexts:
        # Build the exclusion set once per center word instead of once per
        # candidate; it does not change inside the rejection loop.
        excluded = set(contexts)
        num_needed = len(contexts) * K
        negatives = []
        while len(negatives) < num_needed:
            if i == len(neg_candidates):
                # Refill the candidate buffer. Sampling a large batch at once
                # is much cheaper than calling random.choices per noise word.
                i, neg_candidates = 0, random.choices(
                    population, sampling_weights, k=int(1e5))
            neg, i = neg_candidates[i], i + 1
            # A noise word must not be one of the context words.
            if neg not in excluded:
                negatives.append(neg)
        all_negatives.append(negatives)
    return all_negatives

sampling_weights = [counter[w]**0.75 for w in idx_to_token]
all_negatives = get_negatives(all_contexts, sampling_weights, 5)

## 10.3.3 读取数据

In [14]:
def batchify(data):
    """Collate (center, context, negative) samples into padded tensors.

    Intended as the `collate_fn` of a DataLoader: `data` is a list with one
    element per sample, each element being what `__getitem__` returned.

    Args:
        data: list of (center, context, negative) tuples, where `context`
            and `negative` are lists of word indices.

    Returns:
        A 4-tuple of tensors:
        - centers, reshaped to (len(data), 1);
        - contexts_negatives: context followed by negative words, padded
          with 0 up to the longest sample in the batch;
        - masks: 1 for real entries, 0 for padding;
        - labels: 1 for context words, 0 for negatives and padding.
    """
    max_len = max(len(context) + len(negative) for _, context, negative in data)
    centers, contexts_negatives, masks, labels = [], [], [], []
    for center, context, negative in data:
        num_context = len(context)
        num_real = num_context + len(negative)
        num_pad = max_len - num_real
        centers.append(center)
        contexts_negatives.append(context + negative + [0] * num_pad)
        masks.append([1] * num_real + [0] * num_pad)
        labels.append([1] * num_context + [0] * (max_len - num_context))
    return (torch.tensor(centers).view(-1, 1), torch.tensor(contexts_negatives),
            torch.tensor(masks), torch.tensor(labels))

In [15]:
class MyDataset(torch.utils.data.Dataset):
    """Dataset of (center, contexts, negatives) triples held in parallel lists."""

    def __init__(self, centers, contexts, negatives):
        # The three sequences are indexed in parallel, so they must be
        # equally long.
        assert len(centers) == len(contexts) == len(negatives)
        self.centers, self.contexts, self.negatives = centers, contexts, negatives

    def __len__(self):
        """Number of samples (one per center word)."""
        return len(self.centers)

    def __getitem__(self, index):
        """Return the (center, contexts, negatives) triple at `index`."""
        sample = (self.centers[index], self.contexts[index], self.negatives[index])
        return sample

# Minibatch size and number of DataLoader worker processes (worker processes
# are disabled when sys.platform starts with 'win32').
batch_size = 512
num_workers = 0 if sys.platform.startswith('win32') else 4

# NOTE(review): this rebinds `dataset`, which an earlier cell defined as the
# list of index-encoded sentences; the subsampling and compare_counts cells
# read that earlier value, so re-running them after this cell will not work
# as intended without re-running the indexing cell first.
dataset = MyDataset(all_centers, 
                    all_contexts, 
                    all_negatives)
# Data loading: samples are read in shuffled minibatches.
# collate_fn controls how individual samples are assembled into a batch; a
# custom function (batchify) is supplied to get exactly the layout needed.
data_iter = Data.DataLoader(dataset, batch_size, shuffle=True,
                            collate_fn=batchify, 
                            num_workers=num_workers)
# masks is the mask variable: where an element of contexts_negatives is
# padding, the element of masks at the same position is 0, otherwise 1.
# labels separates the positive and negative classes: elements that
# correspond to context words (positive class) are 1, all others are 0.
# Fetch only the first batch and print the shape of each of its tensors.
for batch in data_iter:
    for name, data in zip(['centers', 'contexts_negatives', 'masks',
                           'labels'], batch):
        print(name, 'shape:', data.shape)
    break

centers shape: torch.Size([512, 1])
contexts_negatives shape: torch.Size([512, 60])
masks shape: torch.Size([512, 60])
labels shape: torch.Size([512, 60])


## 10.3.4 跳字模型
### 10.3.4.1 嵌入层

In [16]:
# Embedding层的输入形状为N×M（N是batch size，M是序列的长度），则输出的形状是N×M×embedding_dim
embed = nn.Embedding(num_embeddings=20, embedding_dim=4)
embed.weight

Parameter containing:
tensor([[-1.6770, -1.1819, -1.1333,  0.2844],
        [ 0.2734, -1.4217,  1.6144,  2.9778],
        [-0.6329, -0.0366,  0.5247,  0.6242],
        [ 1.3503,  1.6916, -1.0920, -0.2742],
        [ 1.4962,  0.3332, -0.3864, -0.9550],
        [ 2.5103, -0.8343,  1.2988, -0.5021],
        [-0.8823, -0.4087, -2.3586, -0.4057],
        [-1.5472, -1.2864,  0.0407, -0.0411],
        [ 1.2784, -0.7039, -1.1151,  2.0858],
        [-0.4984,  0.0649, -0.9711,  0.8626],
        [ 1.4216,  1.4594, -2.7379, -0.0952],
        [ 0.4801, -2.1646, -1.2175, -1.5018],
        [ 1.1297,  1.3696,  0.2834,  0.8801],
        [ 0.9566, -0.2099, -1.2655, -1.0015],
        [ 0.4876, -0.8442,  2.6937,  0.8997],
        [-1.4209,  1.3570, -2.4463, -0.4427],
        [ 1.0600, -0.3643, -0.6034, -0.6555],
        [ 0.1348,  0.2052,  1.1426, -1.0324],
        [-0.7283,  0.2093, -0.2198,  0.3258],
        [ 0.3281,  1.9519,  1.0173,  0.2180]], requires_grad=True)

In [17]:
x = torch.tensor([[1, 2, 3], [4, 5, 6]], dtype=torch.long)
embed(x)

tensor([[[ 0.2734, -1.4217,  1.6144,  2.9778],
         [-0.6329, -0.0366,  0.5247,  0.6242],
         [ 1.3503,  1.6916, -1.0920, -0.2742]],

        [[ 1.4962,  0.3332, -0.3864, -0.9550],
         [ 2.5103, -0.8343,  1.2988, -0.5021],
         [-0.8823, -0.4087, -2.3586, -0.4057]]], grad_fn=<EmbeddingBackward>)

### 10.3.4.2 小批量乘法

In [18]:
# 这里演示的目的是说明 bmm 可以对两个小批量中的矩阵逐对做矩阵乘法
X = torch.ones((2, 1, 4))
Y = torch.ones((2, 4, 6))
torch.bmm(X, Y).shape

torch.Size([2, 1, 6])

### 10.3.4.3 跳字模型前向计算

In [19]:
# contexts_and_negatives 背景词和噪声词 变量的形状为（batch_size,max_len）
def skip_gram(center, contexts_and_negatives, embed_v, embed_u):
    """Forward pass of the skip-gram model.

    Args:
        center: integer tensor of center-word indices; it must be 2-D so
            that its embedding is 3-D as torch.bmm requires (the data
            pipeline in this notebook supplies shape (batch_size, 1)).
        contexts_and_negatives: integer tensor of context and noise word
            indices, shape (batch_size, max_len).
        embed_v: embedding layer applied to the center words.
        embed_u: embedding layer applied to the context/noise words.

    Returns:
        Tensor of dot products between each center vector and each of its
        context/noise vectors; (batch_size, 1, max_len) for the shapes above.
    """
    # Look up word vectors for the indices.
    center_vecs = embed_v(center)
    other_vecs = embed_u(contexts_and_negatives)
    # Swap the last two axes so the batched product contracts over the
    # embedding dimension.
    return torch.bmm(center_vecs, other_vecs.transpose(1, 2))

## 10.3.5 训练模型
### 10.3.5.1 二元交叉熵损失函数

In [20]:
class SigmoidBinaryCrossEntropyLoss(nn.Module):
    """Binary cross-entropy on logits with an optional per-element mask.

    The element-wise loss is computed without reduction and then averaged
    over dim 1, so the result holds one value per row of the input.
    """

    def __init__(self):
        super(SigmoidBinaryCrossEntropyLoss, self).__init__()

    def forward(self, inputs, targets, mask=None):
        """Compute the per-row mean of the (masked) element-wise loss.

        Args:
            inputs: logits, tensor of shape (batch_size, len).
            targets: tensor of the same shape as `inputs` (1 = positive,
                0 = negative).
            mask: optional tensor of the same shape used as the element-wise
                weight; a 0 removes that element's contribution so padding
                does not affect the loss. When omitted, every element is
                weighted equally.

        Returns:
            Tensor of shape (batch_size,). The mean is taken over all `len`
            positions, including masked ones; callers who want the mean over
            unmasked positions only must rescale by len / mask.sum(dim=1).
        """
        inputs, targets = inputs.float(), targets.float()
        # The signature advertises mask=None, so only convert it when given.
        if mask is not None:
            mask = mask.float()
        res = nn.functional.binary_cross_entropy_with_logits(
            inputs, targets, weight=mask, reduction="none")
        return res.mean(dim=1)

loss = SigmoidBinaryCrossEntropyLoss()

In [21]:
pred = torch.tensor([[1.5, 0.3, -1, 2], [1.1, -0.6, 2.2, 0.4]])
# 标签变量label中的1和0分别代表背景词和噪声词
label = torch.tensor([[1, 0, 0, 0], [1, 1, 0, 0]])
mask = torch.tensor([[1, 1, 1, 1], [1, 1, 1, 0]])  # 掩码变量
loss(pred, label, mask) * mask.shape[1] / mask.float().sum(dim=1)

tensor([0.8740, 1.2100])

In [22]:
# Negative log-sigmoid, -log(sigmoid(x)); used to check the loss values by hand.
def sigmd(x):
    """Return -log(sigmoid(x)) for a scalar x.

    Despite the name this is not the sigmoid itself but its negative
    logarithm, i.e. the binary cross-entropy of logit `x` against label 1.

    Args:
        x: a real number (the logit).

    Returns:
        -log(1 / (1 + exp(-x))) as a float.
    """
    # -log(sigmoid(x)) == log(1 + exp(-x)) == max(-x, 0) + log1p(exp(-|x|)).
    # The last form never exponentiates a positive number, so it does not
    # overflow for large negative x the way exp(-x) does.
    return max(-x, 0.0) + math.log1p(math.exp(-abs(x)))

print('%.4f' % ((sigmd(1.5) + sigmd(-0.3) + sigmd(1) + sigmd(-2)) / 4)) # 注意1-sigmoid(x) = sigmoid(-x)
print('%.4f' % ((sigmd(1.1) + sigmd(-0.6) + sigmd(-2.2)) / 3))

0.8740
1.2100


### 10.3.5.2 初始化模型参数

In [23]:
# Embedding width; a hyperparameter that has to be tuned by hand.
embed_size = 100
# Two embedding tables of identical shape, one row per vocabulary word.
# nn.Sequential serves only as a container here: the training loop passes
# net[0] and net[1] to skip_gram as embed_v and embed_u respectively.
net = nn.Sequential(*(
    nn.Embedding(num_embeddings=len(idx_to_token), embedding_dim=embed_size)
    for _ in range(2)
))

### 10.3.5.3 定义训练函数

In [24]:
def train(net, lr, num_epochs):
    """Train the skip-gram embeddings with Adam and print per-epoch stats.

    Reads the module-level `data_iter`, `loss` and `skip_gram`.

    Args:
        net: container whose items 0 and 1 are the two embedding layers
            passed to `skip_gram` as embed_v and embed_u.
        lr: learning rate for the Adam optimizer.
        num_epochs: number of full passes over `data_iter`.
    """
    # Use CUDA when it is available, otherwise fall back to the CPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print("train on", device)
    net = net.to(device)
    optimizer = torch.optim.Adam(net.parameters(), lr=lr)
    for epoch in range(num_epochs):
        epoch_start = time.time()
        loss_total, num_batches = 0.0, 0
        for batch in data_iter:
            center, context_negative, mask, label = (t.to(device) for t in batch)

            pred = skip_gram(center, context_negative, net[0], net[1])

            # The loss averages over all positions, padded ones included;
            # rescale so each row becomes a mean over its unmasked positions,
            # then average across the batch.
            per_example = loss(pred.view(label.shape), label, mask)
            valid_counts = mask.float().sum(dim=1)
            batch_loss = (per_example * mask.shape[1] / valid_counts).mean()

            # Reset accumulated gradients before back-propagating this batch.
            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()

            loss_total += batch_loss.item()
            num_batches += 1
        print('epoch %d, loss %.2f, time %.2fs'
              % (epoch + 1, loss_total / num_batches, time.time() - epoch_start))

In [25]:
train(net, 0.01, 10)

train on cpu
epoch 1, loss 1.96, time 97.10s
epoch 2, loss 0.63, time 95.22s
epoch 3, loss 0.45, time 94.96s
epoch 4, loss 0.40, time 95.70s
epoch 5, loss 0.37, time 94.96s
epoch 6, loss 0.35, time 95.28s
epoch 7, loss 0.34, time 95.37s
epoch 8, loss 0.33, time 95.97s
epoch 9, loss 0.32, time 95.07s
epoch 10, loss 0.32, time 95.19s


## 10.3.6 应用词嵌入模型

In [26]:
# 我们可以根据两个词向量的余弦相似度表示词与词之间在语义上的相似度
def get_similar_tokens(query_token, k, embed):
    """Print the k words whose embeddings are most cosine-similar to a query.

    Reads the module-level `token_to_idx` and `idx_to_token`.

    Args:
        query_token: the query word; must be a key of `token_to_idx`.
        k: number of neighbours to print.
        embed: embedding layer whose weight rows are the word vectors.
    """
    weights = embed.weight.data
    query_vec = weights[token_to_idx[query_token]]
    # Cosine similarity of the query against every row; 1e-9 is added under
    # the square root for numerical stability.
    dots = torch.matmul(weights, query_vec)
    sq_norms = torch.sum(weights * weights, dim=1) * torch.sum(query_vec * query_vec)
    cos = dots / (sq_norms + 1e-9).sqrt()
    # Ask for k + 1 hits because the first one is dropped below as the
    # query word itself.
    _, topk = torch.topk(cos, k=k+1)
    neighbours = topk.cpu().numpy()[1:]
    for i in neighbours:
        print('cosine sim=%.3f: %s' % (cos[i], (idx_to_token[i])))
        
get_similar_tokens('chip', 3, net[0])

cosine sim=0.459: sluggish
cosine sim=0.423: products
cosine sim=0.420: turmoil
