# 数据预处理

In [1]:
from tqdm import tqdm
import json
import numpy as np
import spacy

nlp = spacy.load('en_core_web_sm')

## 构造全局词典，并将词语转为数字生成不同于MLP处理方法的词向量
该词向量仅以最长的text词语个数为最大维度，将每个单词以在vocab中的索引做替换，以0填充

In [2]:
with open('./exp1data/train_data.txt') as f:
    train_data_raw = f.read()
    f.close()

train_data_raw = [json.loads(data) for data in train_data_raw.strip().split('\n')]

word_set = set()
for data in tqdm(train_data_raw):
    data['nlp']=nlp(data['raw'])
    for token in data['nlp']:
        if (not token.is_stop) and (not token.is_punct):
            word_set.add(token.lemma_)

vocab = list(word_set)
word2idx = {w: i for i, w in enumerate(vocab)}

max_word_size = 0
for data in tqdm(train_data_raw):
    temp_size = 0
    for token in data['nlp']:
        if token.lemma_ in word_set:
            temp_size += 1
    if temp_size > max_word_size:
        max_word_size = temp_size

for data in tqdm(train_data_raw):
    vec = np.zeros(max_word_size)
    idx = 0
    for token in data['nlp']:
        try:
            vec[idx] = word2idx[token.lemma_]
            idx += 1
        except:
            pass
    data['vec'] = vec

100%|██████████| 8000/8000 [03:01<00:00, 44.17it/s]
100%|██████████| 8000/8000 [00:00<00:00, 12269.26it/s]
100%|██████████| 8000/8000 [00:01<00:00, 7746.34it/s]


In [3]:
train_data = []
train_label = []

for data in tqdm(train_data_raw):
    train_data.append(list(data['vec']))
    train_label.append(data['label'])

all_data = np.array(train_data)
all_label = np.array(train_label)

100%|██████████| 8000/8000 [00:00<00:00, 36123.34it/s]


## 分割训练/验证集

In [4]:
from sklearn.model_selection import train_test_split

train_data, val_data, train_label, val_label = train_test_split(all_data, all_label, test_size=0.2)

# 模型训练

## 导入Pytorch包

In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

## Config

In [6]:
BATCH_SIZE = 64
PRINT = 50

## 将ndarray转为tensor格式

In [7]:
train_dataset = TensorDataset(
    torch.LongTensor(train_data),
    torch.LongTensor(train_label)
    )

loader_train = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

val_dataset = TensorDataset(
    torch.LongTensor(val_data),
    torch.LongTensor(val_label)
    )
loader_val = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=True)

## 构造TextCNN

In [8]:
class TextCNN(nn.Module):
    def __init__(self, vocab, embedding_size, max_word, class_num):
        super(TextCNN, self).__init__()

        # Embedding
        self.ebd = nn.Embedding(len(vocab), embedding_size)

        # CNN
        output_channel = 100          # 这里设置多通道防止过拟合，但是在原论文的实际实验中效果不大
        self.cnn = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=output_channel, kernel_size=(2, embedding_size), stride=1),    # 经过这一卷积层后feature map大小变为(maxword-1) * embedding_size
            nn.ReLU(),
            nn.MaxPool2d((2, 1)),   # 经过maxpooling后变为(maxword-1)/2 * embedding_size
            nn.Dropout(0.5)         # 同样为了防止过拟合
        )

        # Fully Connected
        self.fc = nn.Linear(int((max_word - 1) / 2) * output_channel, class_num)

    def forward(self, x):
        embedding = self.ebd(x).unsqueeze(1)
        conved = self.cnn(embedding)
        flatten = conved.view(x.shape[0], -1)
        output = self.fc(flatten)
        return output

## 构造训练及验证函数（同MLP）

In [9]:
def val(loader_val, model):
    model.eval()
    cor, all = 0, 0
    for (x, y) in loader_val:
        all += len(y)
        scores = model(x)
        for idx, each in enumerate(scores):
            if y[idx] == np.argmax(each.detach().numpy()): 
                cor += 1

    acc = cor / all
    print('val acc: ', acc)

def train(model, loss_func, optim, loader_train, loader_val, epoch=1):
    for e in range(epoch):
        for idx, (x, y) in enumerate(loader_train):
            # switch to train mode
            model.train()
            scores = model(x)
            loss = loss_func(scores, y)

            optim.zero_grad()
            loss.backward()
            optim.step()
            
            if idx % PRINT == 0:
                print('Epoch %d, Iteration %d, loss = %.4f' % (e, idx, loss.item()))
                if loader_val:
                    val(loader_val, model)
                print()

## 训练模型
此处调参过程已省略，具体调参过程参照实验报告

In [10]:
lr = 3e-3
wd = 5e-4

textcnn_model = TextCNN(vocab, 100, max_word_size, 10)
loss_func = torch.nn.CrossEntropyLoss()
optimizer = optim.Adam(textcnn_model.parameters(), lr=lr, weight_decay=wd)

train(textcnn_model, loss_func, optimizer, loader_train, loader_val, 40)

Epoch 0, Iteration 0, loss = 2.3675
val acc:  0.09

Epoch 0, Iteration 50, loss = 2.0775
val acc:  0.355

Epoch 1, Iteration 0, loss = 0.9228
val acc:  0.645625

Epoch 1, Iteration 50, loss = 0.8427
val acc:  0.72875

Epoch 2, Iteration 0, loss = 0.5205
val acc:  0.773125

Epoch 2, Iteration 50, loss = 0.4548
val acc:  0.79125

Epoch 3, Iteration 0, loss = 0.2167
val acc:  0.81625

Epoch 3, Iteration 50, loss = 0.3069
val acc:  0.82125

Epoch 4, Iteration 0, loss = 0.1416
val acc:  0.82875

Epoch 4, Iteration 50, loss = 0.1170
val acc:  0.823125

Epoch 5, Iteration 0, loss = 0.0812
val acc:  0.83875

Epoch 5, Iteration 50, loss = 0.1363
val acc:  0.8425

Epoch 6, Iteration 0, loss = 0.0447
val acc:  0.84875

Epoch 6, Iteration 50, loss = 0.1050
val acc:  0.8575

Epoch 7, Iteration 0, loss = 0.1041
val acc:  0.86375

Epoch 7, Iteration 50, loss = 0.0665
val acc:  0.865625

Epoch 8, Iteration 0, loss = 0.0678
val acc:  0.87625

Epoch 8, Iteration 50, loss = 0.0793
val acc:  0.87625

Epoc

## 使用全部数据训练模型

In [12]:
all_dataset = TensorDataset(
    torch.LongTensor(all_data),
    torch.LongTensor(all_label)
    )
loader_all = DataLoader(all_dataset, batch_size=BATCH_SIZE, shuffle=True)

lr = 3e-3
wd = 5e-4

model = TextCNN(vocab, 100, max_word_size, 10)
loss_func = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=wd)

train(model, loss_func, optimizer, loader_all, [], 10)

Epoch 0, Iteration 0, loss = 2.3931

Epoch 0, Iteration 50, loss = 2.1543

Epoch 0, Iteration 100, loss = 1.4001

Epoch 1, Iteration 0, loss = 1.0104

Epoch 1, Iteration 50, loss = 0.8289

Epoch 1, Iteration 100, loss = 0.6506

Epoch 2, Iteration 0, loss = 0.3662

Epoch 2, Iteration 50, loss = 0.4268

Epoch 2, Iteration 100, loss = 0.4036

Epoch 3, Iteration 0, loss = 0.2713

Epoch 3, Iteration 50, loss = 0.1748

Epoch 3, Iteration 100, loss = 0.2229

Epoch 4, Iteration 0, loss = 0.1819

Epoch 4, Iteration 50, loss = 0.1523

Epoch 4, Iteration 100, loss = 0.2118

Epoch 5, Iteration 0, loss = 0.1285

Epoch 5, Iteration 50, loss = 0.1560

Epoch 5, Iteration 100, loss = 0.1303

Epoch 6, Iteration 0, loss = 0.1034

Epoch 6, Iteration 50, loss = 0.0682

Epoch 6, Iteration 100, loss = 0.1010

Epoch 7, Iteration 0, loss = 0.0897

Epoch 7, Iteration 50, loss = 0.0798

Epoch 7, Iteration 100, loss = 0.1899

Epoch 8, Iteration 0, loss = 0.0669

Epoch 8, Iteration 50, loss = 0.1137

Epoch 8, Iter

# 预测测试集

In [13]:
test_data = []
test_label = []
textcnn_model.eval()

with open('./exp1data/test.txt') as testf:
    testf.readline()
    for line in testf.readlines():
        id, text = line.split(',', 1)
        
        vec = np.zeros(max_word_size)
        idx = 0
        for token in nlp(text):
            try:
                vec[idx] = word2idx[token.lemma_]
                idx += 1
            except:
                pass

        test_data.append(list(vec))

In [14]:
scores = textcnn_model(torch.LongTensor(test_data))

for s in scores:
    test_label.append(np.argmax(s.detach().numpy()))

with open('./exp1data/textcnnoutput.txt', 'w') as outputf:
    outputf.write('id, pred\n')
    for id, pred in enumerate(test_label):
        outputf.write('%d, %d\n' % (int(id), pred))

  scores = textcnn_model(torch.LongTensor(test_data))
