# Bag of Words Text Classifier

The code below implements a simple bag of words text classifier.
- We tokenize the text, create a vocabulary and encode each piece of text in the dataset
- The lookup allows for extracting an embedding for each tokenized input
- The embedding vectors are added together with a bias vector
- The resulting vector is referred to as the scores
- A softmax is applied to the scores to generate probabilities, which are used for the classification task

The code used in this notebook was inspired by code from the [official repo](https://github.com/neubig/nn4nlp-code) used in the [CMU Neural Networks for NLP class](http://www.phontron.com/class/nn4nlp2021/schedule.html) by [Graham Neubig](http://www.phontron.com/index.php). 

We are also adding a PyTorch data loader to this notebook which is how it differs from `bow.ipynb`.

![img txt](../img/bow.png?raw=true)

In [1]:
import torch
# 导入torch库，用于进行深度学习相关的操作

import random
# 导入random库，用于生成随机数

import torch.nn as nn
# 导入torch.nn模块，用于定义神经网络模型的基类

### Download the Data

In [2]:
%%capture

# download the files into the current working directory
!wget https://raw.githubusercontent.com/neubig/nn4nlp-code/master/data/classes/dev.txt
!wget https://raw.githubusercontent.com/neubig/nn4nlp-code/master/data/classes/test.txt
!wget https://raw.githubusercontent.com/neubig/nn4nlp-code/master/data/classes/train.txt

# create the data folders; -p creates the parent "data" directory as well and
# does not fail when the folders already exist, so the cell can be re-run
!mkdir -p data/classes
!cp dev.txt data/classes
!cp test.txt data/classes
!cp train.txt data/classes

### Read the Data

In [79]:
# function to read in data, process each line and split columns by " ||| "
def read_data(filename):
    """Read a delimited text file into a list of column lists.

    Every line is lower-cased, stripped of leading/trailing whitespace and
    split on the ``' ||| '`` separator.

    Args:
        filename (str): Path of the file to read.

    Returns:
        list[list[str]]: One list of columns per line, in file order.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(filename, 'r', encoding='utf-8') as f:
        return [line.lower().strip().split(' ||| ') for line in f]

# Load the raw training and test splits with read_data
train_data, test_data = (
    read_data('data/classes/train.txt'),
    read_data('data/classes/test.txt'),
)

### Construct the Vocab and Datasets

In [80]:
# Word and tag vocabularies: each maps a string to an integer index.
# "<unk>" is the placeholder for unknown words; as the first entry of the
# word vocabulary it receives index 0.
word_to_index = {"<unk>": 0}

# The tag vocabulary starts out empty.
tag_to_index = {}

def create_dict(data, check_unk=False):
    """Populate the module-level ``word_to_index`` and ``tag_to_index`` maps.

    Args:
        data (list): Rows in which ``line[0]`` is the tag and ``line[1]`` is a
            sentence whose words are separated by single spaces.
        check_unk (bool): When false, every word not yet in ``word_to_index``
            receives the next free index. When true, every such word is
            mapped to the index of ``"<unk>"`` instead.

    Returns:
        None. Both dictionaries are updated in place.
    """
    for line in data:
        for word in line[1].split(" "):
            if word in word_to_index:
                continue  # already known, keep its existing index
            if check_unk:
                # Unseen word: alias it to the shared "<unk>" index.
                word_to_index[word] = word_to_index["<unk>"]
            else:
                # Unseen word: give it the next free index.
                word_to_index[word] = len(word_to_index)

        # Unseen tags always receive a fresh index, whatever check_unk is.
        if line[0] not in tag_to_index:
            tag_to_index[line[0]] = len(tag_to_index)

# Run create_dict over the training split (new words get fresh indices),
# then over the test split with check_unk=True (new words map to "<unk>").
create_dict(train_data, check_unk=False)
create_dict(test_data, check_unk=True)


def create_tensor(data):
    """Lazily encode rows as ``[word_indices, tag_index]`` pairs.

    Args:
        data (list): Rows in which ``row[0]`` is the tag and ``row[1]`` is a
            sentence whose words are separated by single spaces.

    Yields:
        list: A two-element list holding the list of ``word_to_index`` values
        for the sentence, followed by the ``tag_to_index`` value of the tag.
    """
    for row in data:
        tokens = row[1].split(" ")
        word_ids = [word_to_index[token] for token in tokens]
        tag_id = tag_to_index[row[0]]
        yield [word_ids, tag_id]

# Exhaust the create_tensor generators so each split becomes a plain list
# of [word_indices, tag_index] entries.
train_data = list(create_tensor(train_data))
test_data = list(create_tensor(test_data))

# Sizes of the word and tag vocabularies
number_of_words = len(word_to_index)
number_of_tags = len(tag_to_index)

### Convert data to PyTorch Dataset

In [89]:
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

# 导入数据并创建数据集（Dataset）和数据加载器（DataLoader），同时确保数据是以 X、y 分隔的

class TextDataset(Dataset):
    """Dataset wrapper that serves ``(input, label)`` tensor pairs.

    Args:
        data (list): Indexable collection whose items hold the input at
            position 0 and the label at position 1.
    """

    def __init__(self, data):
        self.data = data

    def __len__(self):
        """Return the number of samples held by the dataset."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the sample stored at position ``idx``.

        Args:
            idx (int): Position of the sample in ``self.data``.

        Returns:
            tuple: ``(input_tensor, label_tensor)``, both built with
            ``torch.as_tensor``.
        """
        sample = self.data[idx]
        features = torch.as_tensor(sample[0])
        label = torch.as_tensor(sample[1])
        return features, label

# Training split: wrap the encoded rows in a dataset and serve them one
# sample at a time, in shuffled order.
train_dataset = TextDataset(train_data)
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)

# Test split: same configuration as the training loader.
test_dataset = TextDataset(test_data)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=True)

### Model

In [95]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

class BoW(torch.nn.Module):
    """Bag-of-words classifier: sums per-word score vectors and adds a bias.

    Args:
        nwords (int): Number of rows in the embedding table.
        ntags (int): Number of output scores; also the embedding width.
    """

    def __init__(self, nwords, ntags):
        super().__init__()

        # Each word index looks up a vector of ntags scores directly.
        self.embedding = nn.Embedding(nwords, ntags)
        nn.init.xavier_uniform_(self.embedding.weight)

        # Register the bias as a Parameter. A plain tensor attribute is not
        # returned by model.parameters() (so an optimizer built from it would
        # never update the bias), is not moved by model.to(device) and is not
        # saved in the state dict.
        self.bias = nn.Parameter(torch.zeros(ntags))

    def forward(self, x):
        """Compute the score vector for one sentence.

        Args:
            x (tensor): Word indices; the lookup result is summed over
                dimension 0.

        Returns:
            tensor: Scores reshaped to ``(1, -1)``.
        """
        emb = self.embedding(x)
        # Collapse dimension 0 (the words) and add the bias.
        out = torch.sum(emb, dim=0) + self.bias
        # Add a leading dimension of size 1.
        return out.view(1, -1)

### Train the Model

In [101]:
# Create the model, the loss function and the optimizer.
# .to(device) already places the model on the selected device, so no second
# transfer is needed. The former module-level `type` alias was never used
# and shadowed the builtin type(), so it is no longer defined.
model = BoW(number_of_words, number_of_tags).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())

# 定义训练函数，接收模型、优化器、损失函数和训练数据作为参数
def train_bow(model, optimizer, criterion, train_data):
    """Train ``model`` for 10 epochs and print metrics after each one.

    Batches come from the module-level ``train_loader`` and ``test_loader``,
    and tensors are moved to the module-level ``device``. Test accuracy is
    divided by the length of the module-level ``test_data``.

    Args:
        model: Module called on one sentence; its output is passed to
            ``criterion`` together with the tag.
        optimizer: Optimizer stepped once per training sample.
        criterion: Loss function called as ``criterion(output, tag)``.
        train_data: Only its length is used, as the denominator of the
            average training loss and the training accuracy.

    Returns:
        None. One log line is printed per epoch.
    """
    for epoch in range(10):
        model.train()
        total_loss = 0.0
        train_correct = 0
        for sentence, tag in train_loader:
            # Keep only the first item along the leading (batch) dimension.
            sentence = sentence[0].to(device)
            tag = tag.to(device)

            output = model(sentence)

            # Predicted class, taken before the parameters are updated.
            predicted = torch.argmax(output.detach()).item()

            loss = criterion(output, tag)
            total_loss += loss.item()

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if predicted == tag.item():
                train_correct += 1

        model.eval()
        test_correct = 0
        # No gradients are needed for evaluation.
        with torch.no_grad():
            for sentence, tag in test_loader:
                sentence = sentence[0].to(device)
                output = model(sentence)
                predicted = torch.argmax(output).item()
                if predicted == tag.item():
                    test_correct += 1

        # Build the log line once per epoch, after evaluation has finished.
        # The parentheses keep all four pieces in a single string.
        log = (
            f'ITER: {epoch+1} | '
            f'train loss/sent: {total_loss/len(train_data):.4f} | '
            f'train accuracy: {train_correct/len(train_data):.4f} | '
            f'test accuracy: {test_correct/len(test_data):.4f}'
        )
        print(log)

# Start training with the model, optimizer and loss created above
train_bow(
    model=model,
    optimizer=optimizer,
    criterion=criterion,
    train_data=train_data,
)

ITER: 1 | train loss/sent: 1.4731 | train accuracy: 0.3668 | test accuracy: 0.3778
ITER: 2 | train loss/sent: 1.1223 | train accuracy: 0.6056 | test accuracy: 0.4118
ITER: 3 | train loss/sent: 0.9106 | train accuracy: 0.7155 | test accuracy: 0.4186
ITER: 4 | train loss/sent: 0.7685 | train accuracy: 0.7687 | test accuracy: 0.4032
ITER: 5 | train loss/sent: 0.6635 | train accuracy: 0.8070 | test accuracy: 0.4054
ITER: 6 | train loss/sent: 0.5814 | train accuracy: 0.8346 | test accuracy: 0.4113
ITER: 7 | train loss/sent: 0.5157 | train accuracy: 0.8558 | test accuracy: 0.3991
ITER: 8 | train loss/sent: 0.4631 | train accuracy: 0.8722 | test accuracy: 0.3946
ITER: 9 | train loss/sent: 0.4183 | train accuracy: 0.8839 | test accuracy: 0.4014
ITER: 10 | train loss/sent: 0.3807 | train accuracy: 0.8969 | test accuracy: 0.3928


### Exercises

To keep on practising, you can try the following exercises:

- Try to use different batch sizes and see how it affects the training.
- Try to use [`torchtext`](https://pytorch.org/text/stable/index.html#) to load other datasets and create tokenizer and vocabularies for them. This [example](https://pytorch.org/tutorials/beginner/transformer_tutorial.html) on Transformer could be useful to help guide you.
- Write a mini Python library to help you easily train and evaluate models. You can use the code from this notebook as a starting point.