## Linear Language Model with Data Loader

Status of Notebook: Work in Progress

Difference from `loglin-lm.ipynb` is that we use a data loader to load the data.

In [1]:
import torch  # 导入torch模块，用于深度学习相关操作
import random  # 导入random模块，用于生成随机数
import torch.nn as nn  # 导入torch.nn模块，用于定义神经网络模型
import math  # 导入math模块，用于数学运算
import time  # 导入time模块，用于计时
import numpy as np  # 导入numpy模块，用于数值计算和数组操作

### Download the Data

In [6]:
# uncomment to download the datasets
#!wget https://raw.githubusercontent.com/neubig/nn4nlp-code/master/data/ptb/test.txt
#!wget https://raw.githubusercontent.com/neubig/nn4nlp-code/master/data/ptb/train.txt
#!wget https://raw.githubusercontent.com/neubig/nn4nlp-code/master/data/ptb/valid.txt

### Process the Data

In [2]:
def read_data(filename):
    """读取数据文件并返回数据列表"""
    data = []
    with open(filename, "r") as f:  # 打开文件
        for line in f:  # 逐行读取文件内容
            line = line.strip().split(" ")  # 去掉首尾空格并按空格分割成列表
            data.append(line)  # 将每行数据加入到数据列表中
    return data

train_data = read_data('data/ptb/train.txt')  # 读取训练数据文件，返回训练数据列表
val_data = read_data('data/ptb/valid.txt')  # 读取验证数据文件，返回验证数据列表

word_to_index = {}  # 创建空字典，用于存储单词到索引的映射关系
index_to_word = {}  # 创建空字典，用于存储索引到单词的映射关系

word_to_index["<s>"] = len(word_to_index)  # 将特殊符号"<s>"添加到字典中，并给它分配一个索引
index_to_word[len(word_to_index)-1] = "<s>"  # 将索引与"<s>"的映射关系存储到字典中

word_to_index["<unk>"] = len(word_to_index)  # 将特殊符号"<unk>"添加到字典中，并给它分配一个索引
index_to_word[len(word_to_index)-1] = "<unk>"  # 将索引与"<unk>"的映射关系存储到字典中

def create_dict(data, check_unk=False):
    """根据数据列表创建字典，记录词汇和索引的映射关系"""
    for line in data:  # 遍历数据列表的每一行
        for word in line:  # 遍历每一行中的每个单词
            if check_unk == False:  # 如果不需要检查未知单词
                if word not in word_to_index:  # 如果单词不在字典中
                    word_to_index[word] = len(word_to_index)  # 将单词添加到字典中，并分配一个索引
                    index_to_word[len(word_to_index)-1] = word  # 将索引和单词的映射关系存储到字典中
            else:  # 如果需要检查未知单词
                if word not in word_to_index:  # 如果单词不在字典中
                    word_to_index[word] = word_to_index["<unk>"]  # 将单词的索引设为未知单词的索引
                    index_to_word[len(word_to_index)-1] = word  # 将索引和单词的映射关系存储到字典中

create_dict(train_data)  # 根据训练数据创建字典
create_dict(val_data, check_unk=True)  # 根据验证数据创建字典，并检查未知单词情况

# create word and tag tensors from data
def create_tensor(data):
    """
    将数据列表转换为张量的生成器函数
    参数:
        - data: 数据列表
    返回值:
        - 生成器对象，每次迭代产生一个张量
    """
    for line in data:  # 遍历数据列表的每一行
        yield [word_to_index[word] for word in line]  # 生成一个张量，将每个单词根据字典映射为对应的索引值

train_data = [*create_tensor(train_data)]  # 使用生成器函数将训练数据转换为张量列表
val_data = [*create_tensor(val_data)]  # 使用生成器函数将验证数据转换为张量列表
number_of_words = len(word_to_index)  # 计算词汇表的长度，即单词的总数

### Convert data to PyTorch Dataset

In [3]:
from torch.utils.data import Dataset, DataLoader

class PTB(Dataset):
    def __init__(self, data):
        """
        PTB数据集的构造函数
        参数:
            - data: 数据列表
        """
        self.data = data

    def __len__(self):
        """
        返回数据集的长度
        """
        return len(self.data)

    def __getitem__(self, idx):
        """
        根据索引获取数据项
        参数:
            - idx: 索引值
        返回值:
            - 数据项的张量表示
        """
        return torch.as_tensor(self.data[idx])

train_dataset = PTB(train_data)  # 创建训练数据集对象
val_dataset = PTB(val_data)  # 创建验证数据集对象

train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)  # 创建训练数据集的数据加载器
val_loader = DataLoader(val_dataset, batch_size=1, shuffle=True)  # 创建验证数据集的数据加载器

In our implementation we are using batched training. There are a few differences from the original implementation found [here](https://github.com/neubig/nn4nlp-code/blob/master/02-lm/loglin-lm.py). 

### Define the Model

In [6]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'  # 检查CUDA是否可用并设置设备
N = 2  # N-gram的长度

class LogLinear(nn.Module):
    def __init__(self, number_of_words, ngram_length):
        super(LogLinear, self).__init__()
        self.embeddings = nn.ModuleList([nn.Embedding(number_of_words, number_of_words) for _ in range(ngram_length)])
        # 创建嵌入层的 ModuleList，长度为 ngram_length
        # 每个嵌入层都是一个 nn.Embedding 对象，用于将索引映射为词向量
        # number_of_words 是词汇表的大小，嵌入层的维度与之相同
        # 每个嵌入层都初始化为相同的权重矩阵

        self.bias = torch.zeros(number_of_words, requires_grad=True).type(torch.FloatTensor).to(device)
        # 创建偏置参数bias，维度为 number_of_words
        # requires_grad=True 表示需要计算梯度
        # 偏置参数初始化为全零

        for i in range(N):
            nn.init.xavier_uniform_(self.embeddings[i].weight)
            # 使用 Xavier 初始化方法初始化每个嵌入层的权重矩阵

    def forward(self, x):
        embs = torch.cat([lookup(x) for x, lookup in zip(x.T, self.embeddings)]).view(N, x.shape[0], -1)
        # 将输入数据 x 依次传入每个嵌入层，并将结果拼接在一起
        # 这里使用了列表推导式和 zip 函数，将 x.T（转置后的 x）和嵌入层一一对应
        # 得到的 embs 是一个形状为 N x batch_size x embedding_size 的张量
        # 其中 N 是 ngram_length，batch_size 是输入数据的 batch 大小，embedding_size 是嵌入向量的维度

        embs = torch.sum(embs, dim=0)  # 将 ngram 形式的嵌入向量相加，得到 batch_size x embedding_size 的张量
        scores = embs + self.bias
        # 将嵌入向量与偏置参数相加，得到表示得分的张量
        # scores 的形状为 batch_size x number_of_words

        return scores

### Model Settings and Functions

In [7]:
model = LogLinear(number_of_words, N)  # 创建LogLinear模型，输入参数为单词数量和N
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)  # 使用Adam优化器来优化模型参数，学习率为0.1
criterion = torch.nn.CrossEntropyLoss()  # 使用交叉熵损失函数

if torch.cuda.is_available():  # 如果CUDA可用，则将模型移动到CUDA设备上进行加速

def calc_sent_loss(sent):
    S = word_to_index["<s>"]  # 特殊标记<s>对应的索引
    hist = [S] * N  # 初始历史记录，全部为<S>
    all_targets = []  # 存储所有目标单词的列表
    all_histories = []  # 存储所有历史记录的列表
    for next_word in sent + torch.Tensor([S]):  # 遍历输入的句子和结束标记<S>
        all_histories.append(list(hist))  # 将当前历史记录添加到列表中
        all_targets.append(next_word)  # 将当前目标单词添加到列表中
        hist = hist[1:] + [next_word]  # 更新历史记录，将最旧的单词移除，添加当前单词

    logits = model(torch.LongTensor(all_histories).to(device))  # 将历史记录转换为张量，并通过模型获取logits
    loss = criterion(logits, torch.LongTensor(all_targets).to(device))  # 计算损失
    return loss

MAX_LEN = 100  # 生成句子的最大长度

def generate_sent():
    S = word_to_index["<s>"]  # 特殊标记<s>对应的索引
    hist = [S] * N  # 初始历史记录，全部为<S>
    sent = []  # 存储生成的句子的列表
    while True:
        logits = model(torch.LongTensor([hist]).to(device))  # 输入当前的历史记录，通过模型获取logits
        p = torch.nn.functional.softmax(logits)  # 对logits进行softmax激活，得到概率分布（1 x 单词数量）
        next_word = p.multinomial(num_samples=1).item()  # 从概率分布中根据多项式分布采样一个单词作为下一个单词
        if next_word == S or len(sent) == MAX_LEN:  # 如果采样到的单词为结束标记<S>或者句子已达到最大长度，则结束生成过程
            break
        sent.append(next_word)  # 将当前采样的单词添加到句子中
        hist = hist[1:] + [next_word]  # 更新历史记录，将最旧的单词移除，添加当前单词
    return sent  # 返回生成的句子列表想·

### Train the Model

In [9]:
for ITER in range(10):  # 迭代训练循环，共进行10次迭代（可以根据需要更改为100次）
    model.train()  # 设置模型为训练模式
    train_words, train_loss = 0, 0.0  # 初始化训练单词数和训练损失为0
    for sent_id, sent in enumerate(train_loader):  # 遍历训练数据加载器中的每个句子
        my_loss = calc_sent_loss(sent[0])  # 计算当前句子的损失
        train_loss += my_loss.item()  # 累加损失值
        train_words += len(sent)  # 累加训练单词数
        optimizer.zero_grad()  # 清空梯度
        my_loss.backward()  # 反向传播，计算梯度
        optimizer.step()  # 更新模型参数
        if (sent_id+1) % 5000 == 0:  # 每处理5000个句子显示一次进度
            print("--finished %r sentences" % (sent_id+1))
    print("iter %r: train loss/word=%.4f, ppl=%.4f" % (ITER, train_loss/train_words, math.exp(train_loss/train_words)))
    # 输出当前迭代的训练损失和困惑度

    model.eval()  # 设置模型为评估模式
    dev_words, dev_loss = 0, 0.0  # 初始化验证单词数和验证损失为0
    start = time.time()  # 记录当前时间
    for sent_id, sent in enumerate(val_loader):  # 遍历验证数据加载器中的每个句子
        my_loss = calc_sent_loss(sent[0])  # 计算当前句子的损失
        dev_loss += my_loss.item()  # 累加损失值
        dev_words += len(sent)  # 累加验证单词数
    print("iter %r: dev loss/word=%.4f, ppl=%.4f, time=%.2fs" % (ITER, dev_loss/dev_words, math.exp(dev_loss/dev_words), time.time()-start))
    # 输出当前迭代的验证损失、困惑度和耗时

    for _ in range(5):  # 生成5个句子进行展示
        sent = generate_sent()  # 生成句子
        print(" ".join([index_to_word[x] for x in sent]))  # 将索引转换为单词，并以空格分隔打印句子中的单词

--finished 5000 sentences
--finished 10000 sentences
--finished 15000 sentences
--finished 20000 sentences
--finished 25000 sentences
--finished 30000 sentences
--finished 35000 sentences
--finished 40000 sentences
iter 0: train loss/word=9.0947, ppl=8907.6500
iter 0: dev loss/word=9.7668, ppl=17444.9221, time=1.76s
in this case of the trade deficit of the globe weeks columnist months <unk> from a <unk> character succeed reflects as an effort will teaching mr. chestman was essentially flat to deal with the board is this time the <unk> an international <unk> machines are n't being any at this time you were n't disclosed this week it to take over a company said it will introduce a new york <unk> that since friday 's sharp swings in the field sales were down on N at a <unk> company said it will invest in quarterly profit by the new securities
on monday at N yen $ N million navy contract for advanced there were <unk> when he 's no decision has been done by the bush administration has of ne



advertisers and advertising rates for the s&p N issue of the issues <unk> pace with rival very small amounts to veto the constitution <unk> sen coordinator of the big three <unk> the las vegas 's increased <unk> activity is only one or for one thing is important as of as many as N million navy contract for the government is <unk> by mr. <unk> has <unk> business conditions and the earnings or N on the firm of that this is that mr. gorbachev 's economic activity and only half of the proposal to reduce interest rates in the <unk> he
--finished 5000 sentences


KeyboardInterrupt: 