5.1.3.2 实现前馈神经网络语言模型

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset
from tqdm.auto import tqdm
from utils import BOS_TOKEN,EOS_TOKEN
from utils import load_reuters,save_pretrained,get_loader,init_weights

In [2]:
# Data-handling class for the feed-forward neural network language model.
# It builds and stores the (context, target) training examples.
class NGramDatasets(Dataset):
    """N-gram training examples for a feed-forward language model.

    Every sentence of ``corpus`` is wrapped as ``[bos] + sentence + [eos]``
    and then cut into ``(context, target)`` pairs, where ``context`` is the
    ``context_size`` items that directly precede ``target``.

    Only one begin-of-sentence id is prepended, so the first
    ``context_size - 1`` real tokens of a sentence appear in contexts but are
    never used as targets.

    Args:
        corpus: iterable of sentences; each sentence must be a ``list`` (it is
            concatenated with the ``[bos]`` / ``[eos]`` lists).
        vocab: mapping indexed with ``BOS_TOKEN`` and ``EOS_TOKEN`` to obtain
            the ids of the two boundary symbols.
        context_size: number of preceding items that form the context.
    """

    def __init__(self, corpus, vocab, context_size=2):
        self.data = []
        self.bos = vocab[BOS_TOKEN]
        self.eos = vocab[EOS_TOKEN]

        for sentence in tqdm(corpus, desc="Dataset Construction"):
            # Add the sentence-boundary markers on both sides.
            padded = [self.bos] + sentence + [self.eos]
            if len(padded) < context_size:
                continue
            # One example per position that has a full-length left context.
            self.data.extend(
                (padded[end - context_size:end], padded[end])
                for end in range(context_size, len(padded))
            )

    def __len__(self):
        """Return the number of stored (context, target) examples."""
        return len(self.data)

    def __getitem__(self, i):
        """Return the ``i``-th ``(context, target)`` pair."""
        return self.data[i]

    def collate_fn(self, examples):
        """Stack individual examples into a pair of ``torch.long`` tensors.

        Returns:
            ``(inputs, targets)`` where ``inputs`` holds the contexts and
            ``targets`` holds the corresponding next items.
        """
        contexts = [example[0] for example in examples]
        next_items = [example[1] for example in examples]
        return (
            torch.tensor(contexts, dtype=torch.long),
            torch.tensor(next_items, dtype=torch.long),
        )

In [3]:
class FeedForwardNNLM(nn.Module):
    """Feed-forward neural network language model.

    The embeddings of the ``context_size`` context items are concatenated,
    passed through one ReLU hidden layer and projected onto the vocabulary.

    Args:
        vocab_size: number of rows of the embedding table and width of the
            output layer.
        embedding_dim: size of each embedding vector.
        context_size: number of context items per example.
        hidden_dim: width of the hidden layer.
    """

    def __init__(self, vocab_size, embedding_dim, context_size, hidden_dim):
        super().__init__()
        # Word embedding layer.
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        # Embedding layer -> hidden layer; the input is the concatenation of
        # all context embeddings, hence context_size * embedding_dim features.
        concat_dim = context_size * embedding_dim
        self.linear1 = nn.Linear(concat_dim, hidden_dim)
        # Hidden layer -> output layer (one score per vocabulary entry).
        self.linear2 = nn.Linear(hidden_dim, vocab_size)
        # Activation applied to the hidden layer.
        self.activate = F.relu
        # Project helper imported from utils; NOTE(review): presumably
        # re-initialises the parameters of this module — confirm in utils.
        init_weights(self)

    def forward(self, inputs):
        """Return log-probabilities over the vocabulary for each example.

        ``inputs`` is indexed into the embedding table and the result is
        flattened to one row per example (first dimension of ``inputs``).
        """
        n_examples = inputs.shape[0]
        embeds = self.embeddings(inputs).view(n_examples, -1)
        hidden = self.activate(self.linear1(embeds))
        logits = self.linear2(hidden)
        # Log-softmax over the vocabulary dimension.
        return F.log_softmax(logits, dim=1)

In [4]:
# Seed PyTorch's random number generators before any model or loader is built,
# so that a fresh "Restart & Run All" starts from the same random state
# (the layers created with torch.nn draw their initial weights from this RNG).
seed = 42
torch.manual_seed(seed)

# Hyperparameters of the feed-forward neural network language model.
embedding_dim = 64   # size of each word embedding vector
context_size = 2     # number of preceding tokens used as context
hidden_dim = 128     # width of the hidden layer
batch_size = 1024    # number of examples per mini-batch
num_epoch = 10       # number of passes over the training data

In [5]:
# Load the corpus and its vocabulary with the project helper `load_reuters`.
# Later cells iterate `corpus` sentence by sentence, index `vocab` with the
# BOS/EOS token strings and use len(vocab) as the vocabulary size.
# NOTE(review): presumably each sentence is already a list of token ids — confirm in utils.
corpus, vocab = load_reuters()

In [6]:
# Read the text data and build the FFNNLM training set (n-grams):
# one (context, target) example per position with a full context window.
dataset = NGramDatasets(corpus,vocab,context_size)

Dataset Construction:   0%|          | 0/54716 [00:00<?, ?it/s]

In [13]:
# Build the mini-batch iterator with the project helper `get_loader`.
# NOTE(review): presumably a torch DataLoader that batches with
# dataset.collate_fn (and shuffles) — confirm in utils.
data_loader = get_loader(dataset,batch_size)

In [15]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# Negative log-likelihood loss; it pairs with the log-probabilities that
# FeedForwardNNLM.forward returns.
nll_loss = nn.NLLLoss()
# Build the FFNNLM and move it to `device` (Module.to returns the module itself).
model = FeedForwardNNLM(
    len(vocab),
    embedding_dim,
    context_size,
    hidden_dim,
).to(device)
device

device(type='cuda')

In [19]:
# Sum of batch losses per epoch, appended to by the training loop.
total_losses = []
# Put the model in training mode before optimisation starts.
model.train()
# Adam over all model parameters with a learning rate of 1e-3.
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [20]:
# Train for `num_epoch` passes over `data_loader`.
for epoch in range(num_epoch):
    # Running sum of the per-batch loss values for this epoch.
    total_loss = 0
    for batch in tqdm(data_loader, desc = f"Training Epoch {epoch}"):
        # Move both tensors of the batch (contexts, targets) to the model's device.
        inputs, targets = [x.to(device) for x in batch]
        # Clear gradients left over from the previous step before the new backward pass.
        optimizer.zero_grad()
        log_probs = model(inputs)
        loss = nll_loss(log_probs,targets)
        loss.backward()
        optimizer.step()
        # .item() extracts a Python number, so no autograd graph is kept alive.
        total_loss += loss.item()
    # The reported value is the sum over batches, not an average, so it scales
    # with the number of batches in the loader.
    print(f"Loss: {total_loss:.2f}")
    total_losses.append(total_loss)

Training Epoch 0:   0%|          | 0/1681 [00:00<?, ?it/s]

Loss: 9152.42


Training Epoch 1:   0%|          | 0/1681 [00:00<?, ?it/s]

Loss: 7814.06


Training Epoch 2:   0%|          | 0/1681 [00:00<?, ?it/s]

Loss: 7334.58


Training Epoch 3:   0%|          | 0/1681 [00:00<?, ?it/s]

Loss: 7025.26


Training Epoch 4:   0%|          | 0/1681 [00:00<?, ?it/s]

Loss: 6807.09


Training Epoch 5:   0%|          | 0/1681 [00:00<?, ?it/s]

Loss: 6647.92


Training Epoch 6:   0%|          | 0/1681 [00:00<?, ?it/s]

Loss: 6529.04


Training Epoch 7:   0%|          | 0/1681 [00:00<?, ?it/s]

Loss: 6435.18


Training Epoch 8:   0%|          | 0/1681 [00:00<?, ?it/s]

Loss: 6357.48


Training Epoch 9:   0%|          | 0/1681 [00:00<?, ?it/s]

Loss: 6290.65


In [22]:
# The learned word vectors are the weights of model.embeddings;
# save them to the file ffnnlm.vec via the project helper `save_pretrained`.
save_pretrained(vocab, model.embeddings.weight.data, "ffnnlm.vec")

Pretrained embeddings saved to: ffnnlm.vec
