# Continuous Bag of Words (CBOW) Text Classifier

The code below implements a continuous bag of words text classifier.
- We tokenize the text, create a vocabulary and encode each piece of text in the dataset
- The lookup allows for extracting embeddings for each tokenized input
- The embedding vectors are added together
- The resulting vector is multiplied by a weight matrix and a bias vector is added; this results in scores
- A softmax is applied to the scores to generate probabilities, which are used for the final classification

The code used in this notebook was inspired by code from the [official repo](https://github.com/neubig/nn4nlp-code) used in the [CMU Neural Networks for NLP class](http://www.phontron.com/class/nn4nlp2021/schedule.html) by [Graham Neubig](http://www.phontron.com/index.php). 

![img txt](../img/cbow.png?raw=true)

In [1]:
import torch
import random
import torch.nn as nn

In [None]:
%%capture
''' uncomment to download the data
# download the files
!wget https://raw.githubusercontent.com/neubig/nn4nlp-code/master/data/classes/dev.txt
!wget https://raw.githubusercontent.com/neubig/nn4nlp-code/master/data/classes/test.txt
!wget https://raw.githubusercontent.com/neubig/nn4nlp-code/master/data/classes/train.txt

# create the data folders
!mkdir data data/classes
!cp dev.txt data/classes
!cp test.txt data/classes
!cp train.txt data/classes
'''

## Read and Process Data

In [2]:
def read_data(filename):
    """Read a labelled text file into a list of split, lower-cased records.

    Every non-blank line is lower-cased, stripped of surrounding whitespace
    and split on the ' ||| ' separator, so a line such as
    ``3 ||| Some Text`` becomes ``['3', 'some text']``.

    Args:
        filename: Path of the file to read.

    Returns:
        A list holding one list of string fields per non-blank line, in
        file order.
    """
    data = []
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.lower().strip()
            # Skip blank lines: they would otherwise produce [''] records
            # that have no text field at index 1.
            if not line:
                continue
            data.append(line.split(' ||| '))
    return data

# Load the training split first, then the test split.
train_data, test_data = map(
    read_data,
    ('data/classes/train.txt', 'data/classes/test.txt'),
)

# Vocabulary: maps each word to an integer index. The special "<unk>" token
# is inserted first, so it receives index 0.
word_to_index = {}
word_to_index["<unk>"] = len(word_to_index)
# Label set: maps each tag string to an integer index.
tag_to_index = {}


def create_dict(data, check_unk=False):
    """Update the module-level ``word_to_index`` and ``tag_to_index`` in place.

    Args:
        data: Iterable of records where ``record[0]`` is the tag and
            ``record[1]`` is the text, with words separated by single spaces.
        check_unk: When false, every unseen word is assigned a fresh index
            equal to the current size of ``word_to_index``. When true, every
            unseen word is stored under the index of "<unk>" instead. Such
            words still become keys, so ``len(word_to_index)`` may exceed
            the number of distinct indices.

    Returns:
        None. Unseen tags always receive a fresh index, regardless of
        ``check_unk``.
    """
    for line in data:
        for word in line[1].split(" "):
            if word not in word_to_index:
                word_to_index[word] = (
                    word_to_index["<unk>"] if check_unk else len(word_to_index)
                )
        # setdefault leaves an existing tag untouched; the default index is
        # computed before the insertion, so new tags are numbered 0, 1, 2...
        tag_to_index.setdefault(line[0], len(tag_to_index))

# Build the word and tag mappings from the training data.
create_dict(train_data)
# Process the test data with check_unk=True: any word that is not already in
# word_to_index is stored under the index of "<unk>"; unseen tags are still
# added to tag_to_index.
create_dict(test_data, check_unk=True)

def create_tensor(data):
    """Yield ``(word_indices, tag_index)`` pairs for the records in ``data``.

    Args:
        data: Iterable of records where ``record[0]`` is the tag and
            ``record[1]`` is the text, with words separated by single spaces.

    Yields:
        A tuple of (list of integer word indices, integer tag index) per
        record. Words are looked up in the module-level ``word_to_index``;
        a word that is missing from it falls back to the "<unk>" index
        instead of raising ``KeyError``.

    Raises:
        KeyError: If a record's tag is not present in ``tag_to_index``.
    """
    for line in data:
        indices = [
            word_to_index[word] if word in word_to_index else word_to_index["<unk>"]
            for word in line[1].split(" ")
        ]
        yield (indices, tag_to_index[line[0]])

# Encode both splits as lists of (word_indices, tag_index) pairs.
train_data = list(create_tensor(train_data))
test_data = list(create_tensor(test_data))
# Vocabulary size = number of distinct word indices. len(word_to_index) would
# overcount here: words seen only in the test data are stored as extra keys
# that all share the "<unk>" index, so counting keys would size the embedding
# table with rows that are never looked up.
number_of_words = max(word_to_index.values()) + 1
# Number of distinct tags.
number_of_tags = len(tag_to_index)

## Model

In [1]:
# Pick the compute device: "cuda" when a GPU is available, otherwise "cpu".
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

class CBoW(torch.nn.Module):
    """Continuous bag-of-words classifier.

    Word indices are embedded, the embeddings of a sequence are summed, and
    the summed vector is projected to one score per tag by a linear layer.

    Args:
        nwords: Number of rows in the embedding table.
        ntags: Number of output scores (one per tag).
        emb_size: Dimensionality of each word embedding.
    """

    def __init__(self, nwords, ntags, emb_size):
        super().__init__()

        # Maps each word index to an emb_size-dimensional vector.
        self.embedding = torch.nn.Embedding(nwords, emb_size)
        # Projects the summed embedding to tag scores.
        self.linear = torch.nn.Linear(emb_size, ntags)

        # Xavier-uniform initialisation for both weight matrices.
        torch.nn.init.xavier_uniform_(self.embedding.weight)
        torch.nn.init.xavier_uniform_(self.linear.weight)

    def forward(self, x):
        """Compute tag scores for one sequence or a batch of sequences.

        Args:
            x: Integer tensor of word indices, either 1-D ``(seq,)`` or
                2-D ``(batch, seq)``.

        Returns:
            Score tensor of shape ``(1, ntags)`` for 1-D input, or
            ``(batch, ntags)`` for 2-D input.
        """
        emb = self.embedding(x)  # (seq, emb_size) or (batch, seq, emb_size)
        # Sum over the sequence axis. It is always second-to-last, which is
        # axis 0 for a single sequence.
        out = torch.sum(emb, dim=-2)
        # Guarantee a leading batch axis: (emb_size,) -> (1, emb_size).
        out = out.reshape(-1, emb.shape[-1])
        return self.linear(out)

# Dimensionality of the word embeddings.
EMB_SIZE = 64
# Instantiate the classifier from the vocabulary size, tag count and
# embedding size.
model = CBoW(nwords=number_of_words, ntags=number_of_tags, emb_size=EMB_SIZE)
# Cross-entropy loss over the tag scores.
criterion = torch.nn.CrossEntropyLoss()
# Adam optimizer over all model parameters, with default hyper-parameters.
optimizer = torch.optim.Adam(model.parameters())
# Tensor type used to cast inputs in the training loop. NOTE: the name `type`
# shadows the builtin, but it is kept because the training loop refers to it.
if not torch.cuda.is_available():
    type = torch.LongTensor
else:
    # A GPU is available: move the model there and use the CUDA tensor type.
    model.to(device)
    type = torch.cuda.LongTensor

NameError: name 'torch' is not defined

## Train the Model

In [4]:
# Train for 10 epochs. After every epoch, evaluate on the test set and print
# the average training loss plus the train and test accuracy.
for epoch in range(10):
    # Training mode (enables training-only behaviour in layers that have it).
    model.train()
    # Visit the training examples in a new random order each epoch.
    random.shuffle(train_data)
    total_loss = 0.0
    train_correct = 0
    for sentence, tag in train_data:
        # Build the input and target directly with the right dtype on the
        # device the model lives on.
        sentence = torch.tensor(sentence, dtype=torch.long, device=device)
        target = torch.tensor([tag], dtype=torch.long, device=device)
        output = model(sentence)
        # Predicted tag = index of the highest score. detach() keeps this
        # bookkeeping out of the autograd graph.
        predicted = torch.argmax(output.detach()).item()
        loss = criterion(output, target)
        total_loss += loss.item()
        # Standard update: clear old gradients, backpropagate, step.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # Compare plain ints; `tag` is left as the original integer label.
        if predicted == tag:
            train_correct += 1

    # Evaluation mode (disables training-only behaviour such as dropout).
    model.eval()
    test_correct = 0
    # No gradients are needed for evaluation, so skip building the autograd
    # graph to save time and memory.
    with torch.no_grad():
        for sentence, tag in test_data:
            sentence = torch.tensor(sentence, dtype=torch.long, device=device)
            output = model(sentence)
            predicted = torch.argmax(output).item()
            if predicted == tag:
                test_correct += 1

    # Assemble and print the per-epoch log line.
    log = f'epoch: {epoch+1} | ' \
        f'train loss/sent: {total_loss/len(train_data):.4f} | ' \
        f'train accuracy: {train_correct/len(train_data):.4f} | ' \
        f'test accuracy: {test_correct/len(test_data):.4f}'
    print(log)

epoch: 1 | train loss/sent: 1.4111 | train accuracy: 0.3841 | test accuracy: 0.3982
epoch: 2 | train loss/sent: 0.8886 | train accuracy: 0.6522 | test accuracy: 0.3991
epoch: 3 | train loss/sent: 0.5147 | train accuracy: 0.8148 | test accuracy: 0.3950
epoch: 4 | train loss/sent: 0.3200 | train accuracy: 0.8878 | test accuracy: 0.3796
epoch: 5 | train loss/sent: 0.2148 | train accuracy: 0.9247 | test accuracy: 0.3738
epoch: 6 | train loss/sent: 0.1536 | train accuracy: 0.9466 | test accuracy: 0.3783
epoch: 7 | train loss/sent: 0.1097 | train accuracy: 0.9618 | test accuracy: 0.3638
epoch: 8 | train loss/sent: 0.0797 | train accuracy: 0.9716 | test accuracy: 0.3692
epoch: 9 | train loss/sent: 0.0568 | train accuracy: 0.9805 | test accuracy: 0.3661
epoch: 10 | train loss/sent: 0.0435 | train accuracy: 0.9853 | test accuracy: 0.3529
