## CS310 Natural Language Processing
## Assignment 1. Neural Text Classification

**Total points**: 50

You should roughtly follow the structure of the notebook. Add additional cells if you feel needed. 

You can (and you should) re-use the code from Lab 2. 

Make sure your code is readable and well-structured.

### 0. Import Necessary Libraries

Install datasets

Url: https://huggingface.co/docs/datasets/en/installation
```bash
conda install -c huggingface -c conda-forge datasets
```

In [469]:
import json
import torch
import re
from torch.utils.data.dataset import random_split
from data_utils import DatasetIterator, get_tokenizer, build_vocab_from_iter, to_map_style_dataset

Load data

In [470]:
test_sentences = []
train_sentences = []
test_url = "test.jsonl"
train_url = "train.jsonl"
with open(test_url, 'r', encoding='utf-8') as file:
    for line in file:
        data = json.loads(line)
        test_sentences.append((data["sentence"], data["label"][0]))

with open(train_url, 'r', encoding='utf-8') as file:
    for line in file:
        data = json.loads(line)
        train_sentences.append((data["sentence"], data["label"][0]))

print(len(test_sentences))
print(len(train_sentences))

651
12677


Apply Tokenizer

In [471]:
# Basic tokenizer
def basic_tokenizer(s):
    tokens = re.findall(r'[\u4e00-\u9fff]', s)
    return tokens


# Improved tokenizer
def improved_tokenizer(s):
    tokens = re.findall(r'[\u4e00-\u9fff]|[0-9]+|[a-zA-Z]+|[^\u4e00-\u9fff\da-zA-Z\s]', s)
    return tokens


# yeild tokenizer
def yield_tokens(data_iter):
    for text, _ in data_iter:
        yield improved_tokenizer(text)


# Check the output of yield_tokens()
count = 0
for tokens in yield_tokens(iter(train_sentences)):
    print(tokens)
    count += 1
    if count > 3:
        break

['卖', '油', '条', '小', '刘', '说', '：', '我', '说']
['保', '姆', '小', '张', '说', '：', '干', '啥', '子', '嘛', '？']
['卖', '油', '条', '小', '刘', '说', '：', '你', '看', '你', '往', '星', '空', '看', '月', '朦', '胧', '，', '鸟', '朦', '胧']
['卖', '油', '条', '小', '刘', '说', '：', '咱', '是', '不', '是', '歇', '一', '下', '这', '双', '，', '疲', '惫', '的', '双', '腿', '？']


Build Vocabulary

In [472]:
train_dataset = iter(train_sentences)
test_dataset = iter(test_sentences)
vocab = build_vocab_from_iter(yield_tokens(train_dataset), specials=["<unk>"])
print('vocab size:', len(vocab))

vocab size: 2823


In [473]:
# Check the vocab
print(vocab(['卖', '油', '条', '这']))
print(vocab(['11', '5']))

print(vocab(['！', '。', '，', '@#$@!#$%']))

[473, 457, 283, 10]
[1738, 1547]
[69, 302, 6, 0]


In [474]:
text_pipeline = lambda x: vocab(improved_tokenizer(x))
label_pipeline = lambda x: int(x)

# Test text_pipeline()
tokens = text_pipeline('卖油条')
print(tokens)

# Test label_pipeline()
lbl = label_pipeline('1')
print(lbl)

[473, 457, 283]
1


Batchify Data 

In [475]:
from torch.utils.data import DataLoader

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def collate_batch(batch):
    label_list, token_ids_list, offsets = [], [], [0]
    for _text, _label in batch:
        label_list.append(label_pipeline(_label))
        token_ids = torch.tensor(text_pipeline(_text), dtype=torch.int64)
        token_ids_list.append(token_ids)
        offsets.append(token_ids.size(0))  # Note that offsets contains the length (number of tokens) of each example

    # 将标签列表转换为张量
    labels = torch.tensor(label_list, dtype=torch.int64)
    # 将所有token_ids拼接成一个大的tensor
    token_ids = torch.cat(token_ids_list, dim=0)
    # 计算偏移量的累积值
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)  # 用cumsum计算累积偏移量

    return labels.to(device), token_ids.to(device), offsets.to(device)

In [476]:
# Use collate_batch to generate the dataloader
train_iter = train_sentences

dataloader = DataLoader(
    train_iter, batch_size=8, shuffle=False, collate_fn=collate_batch
)

In [477]:
# Test the dataloader
for i, (labels, token_ids, offsets) in enumerate(dataloader):
    if i == 0:
        break

# What does offsets mean?
print('Number of tokens in this batch: ', token_ids.size(0))
print('Number of examples in one batch: ', labels.size(0))
print('Example 0: ', token_ids[offsets[0]:offsets[1]])
print('Example 7: ', token_ids[offsets[7]:])

Number of tokens in this batch:  121
Number of examples in one batch:  8
Example 0:  tensor([473, 457, 283,  23, 424,   1,   2,   3,   1])
Example 7:  tensor([ 473,  457,  283,   23,  424,    1,    2, 1188,  222])


### 2. Build the Model

In [478]:
from torch import nn


class TextClassificationModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class, hidden_dim1, hidden_dim2):
        super(TextClassificationModel, self).__init__()

        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=False)  # 初始化EmbeddingBag层
        self.hidden_layers = nn.Sequential(
            nn.Linear(embed_dim, hidden_dim1),
            nn.ReLU(),
            nn.Linear(hidden_dim1, hidden_dim2),
            nn.ReLU()
        )
        self.fc = nn.Linear(hidden_dim2, num_class)
        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        for layer in self.hidden_layers:
            if isinstance(layer, nn.Linear):
                layer.weight.data.uniform_(-initrange, initrange)
                layer.bias.data.zero_()
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, token_ids, offsets):
        embedded = self.embedding(token_ids, offsets)  # 使用embedding层
        out = self.hidden_layers(embedded)
        return self.fc(out)

In [479]:
# Build the model
train_iter = train_sentences
test_iter = test_sentences
num_class = len(set([label for (_, label) in train_iter]))
vocab_size = len(vocab)
emsize = 64  # embedding size
hidden_dim1 = 16
hidden_dim2 = 8
model = TextClassificationModel(vocab_size, emsize, num_class, hidden_dim1, hidden_dim2).to(device)

In [480]:
# Test the model
model.eval()
with torch.no_grad():
    for i, (labels, token_ids, offsets) in enumerate(dataloader):
        output = model(token_ids, offsets)
        if i == 0:
            break

print('output size:', output.size())

output size: torch.Size([8, 2])


### 3. Train and Evaluate

In [481]:
import time
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


def train(model, dataloader, optimizer, criterion, epoch: int):
    model.train()
    total_acc, total_count = 0, 0
    log_interval = 500
    start_time = time.time()

    for idx, (labels, token_ids, offsets) in enumerate(dataloader):
        optimizer.zero_grad()
        # Forward pass
        output = model(token_ids, offsets)
        try:
            # Compute loss
            loss = criterion(output, labels)
        except Exception:
            print('Error in loss calculation')
            print('output: ', output.size())
            print('labels: ', labels.size())
            print('token_ids: ', token_ids)
            print('offsets: ', offsets)
            raise
        # Backward propagation, grad clipping, and optimization
        loss.backward()  # 反向传播
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)  # 梯度裁剪，防止梯度爆炸
        optimizer.step()  # 更新参数

        # Calculate correct prediction in current batch
        _, predicted = output.max(1)  # 获取预测的类别
        total_acc += (predicted == labels).sum().item()  # 计算正确的预测数量并累加

        total_count += labels.size(0)
        if idx % log_interval == 0 and idx > 0:
            elapsed = time.time() - start_time
            print(
                "| epoch {:3d} | {:5d}/{:5d} batches "
                "| accuracy {:8.3f}".format(
                    epoch, idx, len(dataloader), total_acc / total_count
                )
            )
            total_acc, total_count = 0, 0
            start_time = time.time()


@torch.no_grad()
def evaluate(model, dataloader, criterion):
    model.eval()
    total_acc, total_count = 0, 0
    y_labels = []
    y_preds = []

    with torch.no_grad():
        for idx, (label, text, offsets) in enumerate(dataloader):
            output = model(text, offsets)
            predictions = output.argmax(1)

            total_acc += (predictions == label).sum().item()
            total_count += label.size(0)

            y_labels.extend(label.tolist())
            y_preds.extend(predictions.tolist())

    accuracy = total_acc / total_count
    precision, recall, f1Score, _ = precision_recall_fscore_support(y_labels, y_preds, average='weighted')

    return accuracy, precision, recall, f1Score

In [482]:
# Hyperparameters
EPOCHS = 10  # epoch
LR = 5  # learning rate
BATCH_SIZE = 8  # batch size for training

criterion = torch.nn.CrossEntropyLoss()

optimizer = torch.optim.SGD(model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)

In [483]:
# First, obtain some output and labels
model.eval()
with torch.no_grad():
    for i, (labels, token_ids, offsets) in enumerate(dataloader):
        output = model(token_ids, offsets)
        print(f"batch {i} output: {output}")
        if i == 0:
            break
print('output shape:', output.shape)

loss = criterion(output, labels)
print('loss:', loss)

# Manually calculate the loss
loss_manual = []
for i in range(output.shape[0]):
    probs = torch.nn.functional.softmax(output[i], dim=-1)  # 使用softmax计算预测的概率
    # 获取对应标签的概率
    correct_prob = probs[labels[i]]
    # 使用交叉熵公式计算损失
    l = -torch.log(correct_prob)
    loss_manual.append(l)
loss_manual = torch.stack(loss_manual)
print('loss_manual mean:', loss_manual.mean())


batch 0 output: tensor([[-0.1171,  0.0866],
        [-0.2377,  0.0669],
        [-0.0183,  0.0112],
        [-0.0592,  0.0150],
        [-0.1017,  0.0010],
        [-0.0892,  0.0736],
        [-0.2351,  0.0953],
        [-0.1376,  0.0119]])
output shape: torch.Size([8, 2])
loss: tensor(0.7588)
loss_manual mean: tensor(0.7588)


Load train, valid, and test data

In [484]:
# Prepare train, valid, and test data

train_dataset = to_map_style_dataset(train_iter)
test_dataset = to_map_style_dataset(test_iter)

num_train = int(len(train_dataset) * 0.95)
split_train_, split_valid_ = random_split(
    train_dataset, [num_train, len(train_dataset) - num_train]
)

train_dataloader = DataLoader(
    split_train_, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch
)
valid_dataloader = DataLoader(
    split_valid_, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch
)
test_dataloader = DataLoader(
    test_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch
)

In [485]:
# Run the training loop
total_accu = None
for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()

    train(model, train_dataloader, optimizer, criterion, epoch)
    accu_val, precision, recall, f1 = evaluate(model, valid_dataloader, criterion)

    if total_accu is not None and total_accu > accu_val:
        scheduler.step()
    else:
        total_accu = accu_val

    print("-" * 59)
    print(
        "| end of epoch {:3d} | time: {:5.2f}s | "
        "valid accuracy {:8.3f} ".format(
            epoch, time.time() - epoch_start_time, accu_val
        )
    )
    print("-" * 59)

| epoch   1 |   500/ 1506 batches | accuracy    0.693
| epoch   1 |  1000/ 1506 batches | accuracy    0.691
| epoch   1 |  1500/ 1506 batches | accuracy    0.685
-----------------------------------------------------------
| end of epoch   1 | time:  3.51s | valid accuracy    0.702 
-----------------------------------------------------------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


| epoch   2 |   500/ 1506 batches | accuracy    0.695
| epoch   2 |  1000/ 1506 batches | accuracy    0.659
| epoch   2 |  1500/ 1506 batches | accuracy    0.704
-----------------------------------------------------------
| end of epoch   2 | time:  2.84s | valid accuracy    0.702 
-----------------------------------------------------------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


| epoch   3 |   500/ 1506 batches | accuracy    0.704
| epoch   3 |  1000/ 1506 batches | accuracy    0.691
| epoch   3 |  1500/ 1506 batches | accuracy    0.683
-----------------------------------------------------------
| end of epoch   3 | time:  2.88s | valid accuracy    0.702 
-----------------------------------------------------------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


| epoch   4 |   500/ 1506 batches | accuracy    0.688
| epoch   4 |  1000/ 1506 batches | accuracy    0.698
| epoch   4 |  1500/ 1506 batches | accuracy    0.698
-----------------------------------------------------------
| end of epoch   4 | time:  3.03s | valid accuracy    0.702 
-----------------------------------------------------------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


| epoch   5 |   500/ 1506 batches | accuracy    0.701
| epoch   5 |  1000/ 1506 batches | accuracy    0.684
| epoch   5 |  1500/ 1506 batches | accuracy    0.699
-----------------------------------------------------------
| end of epoch   5 | time:  3.07s | valid accuracy    0.702 
-----------------------------------------------------------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


| epoch   6 |   500/ 1506 batches | accuracy    0.688
| epoch   6 |  1000/ 1506 batches | accuracy    0.683
| epoch   6 |  1500/ 1506 batches | accuracy    0.711
-----------------------------------------------------------
| end of epoch   6 | time:  2.92s | valid accuracy    0.702 
-----------------------------------------------------------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


| epoch   7 |   500/ 1506 batches | accuracy    0.686
| epoch   7 |  1000/ 1506 batches | accuracy    0.701
| epoch   7 |  1500/ 1506 batches | accuracy    0.699
-----------------------------------------------------------
| end of epoch   7 | time:  2.75s | valid accuracy    0.702 
-----------------------------------------------------------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


| epoch   8 |   500/ 1506 batches | accuracy    0.686
| epoch   8 |  1000/ 1506 batches | accuracy    0.703
| epoch   8 |  1500/ 1506 batches | accuracy    0.690
-----------------------------------------------------------
| end of epoch   8 | time:  2.87s | valid accuracy    0.527 
-----------------------------------------------------------
| epoch   9 |   500/ 1506 batches | accuracy    0.715
| epoch   9 |  1000/ 1506 batches | accuracy    0.723
| epoch   9 |  1500/ 1506 batches | accuracy    0.719
-----------------------------------------------------------
| end of epoch   9 | time:  2.76s | valid accuracy    0.707 
-----------------------------------------------------------
| epoch  10 |   500/ 1506 batches | accuracy    0.730
| epoch  10 |  1000/ 1506 batches | accuracy    0.721
| epoch  10 |  1500/ 1506 batches | accuracy    0.719
-----------------------------------------------------------
| end of epoch  10 | time:  2.83s | valid accuracy    0.705 
-------------------------------

In [486]:
# Save the model
torch.save(model.state_dict(), "A1_out.pth")

## Evaluate with Test Data

This is a necessary step. But since the `test` split of SST2 is not annotated, we will use the `dev` split here to pretend it is the test data.

In [487]:
accu_val, precision, recall, f1_score = evaluate(model, test_dataloader, criterion)
print("test accuracy {:8.3f}, precision {:8.3f}, recall {:8.3f}, f1_score {:8.3f}".format(accu_val, precision, recall,
                                                                                          f1_score))

test accuracy    0.757, precision    0.751, recall    0.757, f1_score    0.684


### 4. Explore Word Segmentation

In [488]:
import jieba

In [489]:
def jieba_tokenizer(s):
    tokens = jieba.lcut(s)
    return tokens


def yield_tokens(data_iter):
    for text, _ in data_iter:
        tokens = jieba_tokenizer(text)
        yield tokens

In [490]:
vocab = build_vocab_from_iter(yield_tokens(train_dataset), specials=["<unk>"])
print('vocab size:', len(vocab))

vocab size: 13847


In [491]:
# Check the vocab
print(vocab(['卖', '油', '条', '这']))
print(vocab(['11', '5']))

print(vocab(['！', '。', '，', '@#$@!#$%']))

[385, 2871, 1968, 16]
[2278, 1977]
[43, 153, 4, 0]


In [492]:
text_pipeline = lambda x: vocab(improved_tokenizer(x))
label_pipeline = lambda x: int(x)

# Test text_pipeline()
tokens = text_pipeline('卖油条')
print(tokens)

# Test label_pipeline()
lbl = label_pipeline('1')
print(lbl)

[385, 2871, 1968]
1


In [493]:
# Use collate_batch to generate the dataloader
dataloader = DataLoader(
    train_iter, batch_size=8, shuffle=False, collate_fn=collate_batch
)

In [494]:
# Build the model
num_class = len(set([label for (_, label) in train_iter]))
vocab_size = len(vocab)
emsize = 64  # embedding size
hidden_dim1 = 16
hidden_dim2 = 8
model1 = TextClassificationModel(vocab_size, emsize, num_class, hidden_dim1, hidden_dim2).to(device)

In [495]:
# Hyperparameters
EPOCHS = 10  # epoch
LR = 5  # learning rate
BATCH_SIZE = 8  # batch size for training

criterion = torch.nn.CrossEntropyLoss()

optimizer = torch.optim.SGD(model1.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)

In [496]:
# Prepare train, valid, and test data
train_iter = iter(train_sentences)
test_iter = iter(test_sentences)
train_dataset = to_map_style_dataset(train_iter)
test_dataset = to_map_style_dataset(test_iter)

num_train = int(len(train_dataset) * 0.95)
split_train_, split_valid_ = random_split(
    train_dataset, [num_train, len(train_dataset) - num_train]
)

train_dataloader = DataLoader(
    split_train_, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch
)
valid_dataloader = DataLoader(
    split_valid_, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch
)
test_dataloader = DataLoader(
    test_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch
)

In [497]:
# Run the training loop
total_accu = None
for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()

    train(model1, train_dataloader, optimizer, criterion, epoch)
    accu_val, precision, recall, f1 = evaluate(model1, valid_dataloader, criterion)

    if total_accu is not None and total_accu > accu_val:
        scheduler.step()
    else:
        total_accu = accu_val

    print("-" * 59)
    print(
        "| end of epoch {:3d} | time: {:5.2f}s | "
        "valid accuracy {:8.3f} ".format(
            epoch, time.time() - epoch_start_time, accu_val
        )
    )
    print("-" * 59)

| epoch   1 |   500/ 1506 batches | accuracy    0.686
| epoch   1 |  1000/ 1506 batches | accuracy    0.697
| epoch   1 |  1500/ 1506 batches | accuracy    0.696
-----------------------------------------------------------
| end of epoch   1 | time:  5.40s | valid accuracy    0.721 
-----------------------------------------------------------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


| epoch   2 |   500/ 1506 batches | accuracy    0.700
| epoch   2 |  1000/ 1506 batches | accuracy    0.686
| epoch   2 |  1500/ 1506 batches | accuracy    0.685
-----------------------------------------------------------
| end of epoch   2 | time:  5.76s | valid accuracy    0.721 
-----------------------------------------------------------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


| epoch   3 |   500/ 1506 batches | accuracy    0.680
| epoch   3 |  1000/ 1506 batches | accuracy    0.687
| epoch   3 |  1500/ 1506 batches | accuracy    0.705
-----------------------------------------------------------
| end of epoch   3 | time:  5.95s | valid accuracy    0.721 
-----------------------------------------------------------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


| epoch   4 |   500/ 1506 batches | accuracy    0.689
| epoch   4 |  1000/ 1506 batches | accuracy    0.673
| epoch   4 |  1500/ 1506 batches | accuracy    0.701
-----------------------------------------------------------
| end of epoch   4 | time:  5.37s | valid accuracy    0.721 
-----------------------------------------------------------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


| epoch   5 |   500/ 1506 batches | accuracy    0.702
| epoch   5 |  1000/ 1506 batches | accuracy    0.684
| epoch   5 |  1500/ 1506 batches | accuracy    0.667
-----------------------------------------------------------
| end of epoch   5 | time:  5.05s | valid accuracy    0.721 
-----------------------------------------------------------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


| epoch   6 |   500/ 1506 batches | accuracy    0.691
| epoch   6 |  1000/ 1506 batches | accuracy    0.673
| epoch   6 |  1500/ 1506 batches | accuracy    0.693
-----------------------------------------------------------
| end of epoch   6 | time:  5.12s | valid accuracy    0.721 
-----------------------------------------------------------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


| epoch   7 |   500/ 1506 batches | accuracy    0.707
| epoch   7 |  1000/ 1506 batches | accuracy    0.665
| epoch   7 |  1500/ 1506 batches | accuracy    0.682
-----------------------------------------------------------
| end of epoch   7 | time:  5.07s | valid accuracy    0.721 
-----------------------------------------------------------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


| epoch   8 |   500/ 1506 batches | accuracy    0.693
| epoch   8 |  1000/ 1506 batches | accuracy    0.691
| epoch   8 |  1500/ 1506 batches | accuracy    0.685
-----------------------------------------------------------
| end of epoch   8 | time:  5.62s | valid accuracy    0.721 
-----------------------------------------------------------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


| epoch   9 |   500/ 1506 batches | accuracy    0.687
| epoch   9 |  1000/ 1506 batches | accuracy    0.697
| epoch   9 |  1500/ 1506 batches | accuracy    0.684
-----------------------------------------------------------
| end of epoch   9 | time:  5.35s | valid accuracy    0.721 
-----------------------------------------------------------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


| epoch  10 |   500/ 1506 batches | accuracy    0.692
| epoch  10 |  1000/ 1506 batches | accuracy    0.701
| epoch  10 |  1500/ 1506 batches | accuracy    0.687
-----------------------------------------------------------
| end of epoch  10 | time:  5.29s | valid accuracy    0.721 
-----------------------------------------------------------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [498]:
# Save the model
torch.save(model1.state_dict(), "A1_out_jieba.pth")

In [499]:
accu_val, precision, recall, f1_score = evaluate(model1, test_dataloader, criterion)
print("test accuracy {:8.3f}, precision {:8.3f}, recall {:8.3f}, f1_score {:8.3f}".format(accu_val, precision, recall,
                                                                                          f1_score))

test accuracy    0.739, precision    0.546, recall    0.739, f1_score    0.628


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
