# 数据预处理

In [1]:
from tqdm import tqdm
import json
import numpy as np

## 引入sklearn

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

countVectorizer = CountVectorizer(stop_words='english')
tfidfVectorizer = TfidfVectorizer(stop_words='english')

## 生成词向量矩阵和label向量

In [3]:
with open('./exp1data/train_data.txt') as f:
    train_data_raw = f.read()
    f.close()

train_data_raw = [json.loads(data) for data in train_data_raw.strip().split('\n')]

text_num = len(train_data_raw)
texts, label = [], []

for data in tqdm(train_data_raw):
    texts.append(data['raw'])
    label.append(data['label'])

matrix = countVectorizer.fit_transform(texts)
vocab = countVectorizer.get_feature_names_out()

matrix = tfidfVectorizer.fit_transform(texts)
vocab = tfidfVectorizer.get_feature_names_out()

all_data = matrix.toarray()
all_label = np.array(label)

100%|██████████| 8000/8000 [00:00<00:00, 1486946.38it/s]


## 分割训练集和验证集

In [4]:
from sklearn.model_selection import train_test_split
train_data, val_data, train_label, val_label = train_test_split(all_data, all_label, test_size=0.2)

# 模型训练

## 导入Pytorch包

In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

## Config

In [6]:
BATCH_SIZE = 64
PRINT = 10

## 将ndarray数组转为tensor格式

In [7]:
train_dataset = TensorDataset(
    torch.FloatTensor(train_data),
    torch.LongTensor(train_label)
    )
loader_train = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

val_dataset = TensorDataset(
    torch.FloatTensor(val_data),
    torch.LongTensor(val_label)
    )
loader_val = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=True)

## 构造全连接神经网络FC(MLP)

In [8]:
class FC(nn.Module):
    def __init__(self, input_size, hidden_size, classes_num):
        super(FC, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, classes_num)
    
    # 不加激活默认RELU激活
    def forward(self, x):
        scores = self.fc1(x)
        scores = self.fc2(scores)
        return scores

## 构造训练及验证函数

In [9]:
def val(loader_val, model):
    model.eval()
    cor, all = 0, 0
    for (x, y) in loader_val:
        all += len(y)
        scores = model(x)
        for idx, each in enumerate(scores):
            if y[idx] == np.argmax(each.detach().numpy()): 
                cor += 1

    acc = cor / all
    print('val acc: ', acc)

def train(model, loss_func, optim, loader_train, loader_val, epoch=1):
    for e in range(epoch):
        for idx, (x, y) in enumerate(loader_train):
            # switch to train mode
            model.train()
            scores = model(x)
            loss = loss_func(scores, y)

            optim.zero_grad()
            loss.backward()
            optim.step()
            
            if idx % PRINT == 0:
                print('Epoch %d, Iteration %d, loss = %.4f' % (e, idx, loss.item()))
                if loader_val:
                    val(loader_val, model)
                print()

## 训练模型
此处调参过程已省略，具体调参过程参照实验报告

In [10]:
lr = 9e-4
wd = 1e-4

model = FC(train_data.shape[1], int(np.sqrt(train_data.shape[1])), 10)
loss_func = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=wd)

train(model, loss_func, optimizer, loader_train, loader_val, 10)

Epoch 0, Iteration 0, loss = 2.3200
val acc:  0.14

Epoch 0, Iteration 10, loss = 2.2670
val acc:  0.37375

Epoch 0, Iteration 20, loss = 2.1944
val acc:  0.679375

Epoch 0, Iteration 30, loss = 2.1430
val acc:  0.80625

Epoch 0, Iteration 40, loss = 1.9872
val acc:  0.861875

Epoch 0, Iteration 50, loss = 1.8822
val acc:  0.90125

Epoch 0, Iteration 60, loss = 1.7530
val acc:  0.921875

Epoch 0, Iteration 70, loss = 1.5712
val acc:  0.924375

Epoch 0, Iteration 80, loss = 1.3943
val acc:  0.92625

Epoch 0, Iteration 90, loss = 1.2467
val acc:  0.928125

Epoch 1, Iteration 0, loss = 0.8322
val acc:  0.931875

Epoch 1, Iteration 10, loss = 0.6923
val acc:  0.93375

Epoch 1, Iteration 20, loss = 0.6645
val acc:  0.943125

Epoch 1, Iteration 30, loss = 0.5022
val acc:  0.944375

Epoch 1, Iteration 40, loss = 0.4547
val acc:  0.94625

Epoch 1, Iteration 50, loss = 0.3851
val acc:  0.94875

Epoch 1, Iteration 60, loss = 0.4530
val acc:  0.95125

Epoch 1, Iteration 70, loss = 0.3129
val acc:

## 使用全部数据训练模型

In [14]:
all_dataset = TensorDataset(
    torch.FloatTensor(all_data),
    torch.LongTensor(all_label)
    )
loader_all = DataLoader(all_dataset, batch_size=BATCH_SIZE, shuffle=True)

lr = 9e-4
wd = 1e-4

model = FC(train_data.shape[1], int(np.sqrt(train_data.shape[1])), 10)
loss_func = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=wd)

train(model, loss_func, optimizer, loader_all, [], 10)

Epoch 0, Iteration 0, loss = 2.3059

Epoch 0, Iteration 10, loss = 2.2644

Epoch 0, Iteration 20, loss = 2.2097

Epoch 0, Iteration 30, loss = 2.1194

Epoch 0, Iteration 40, loss = 1.9980

Epoch 0, Iteration 50, loss = 1.9124

Epoch 0, Iteration 60, loss = 1.7552

Epoch 0, Iteration 70, loss = 1.5678

Epoch 0, Iteration 80, loss = 1.3716

Epoch 0, Iteration 90, loss = 1.2110

Epoch 0, Iteration 100, loss = 0.9546

Epoch 0, Iteration 110, loss = 0.8670

Epoch 0, Iteration 120, loss = 0.7794

Epoch 1, Iteration 0, loss = 0.5907

Epoch 1, Iteration 10, loss = 0.5250

Epoch 1, Iteration 20, loss = 0.4688

Epoch 1, Iteration 30, loss = 0.3731

Epoch 1, Iteration 40, loss = 0.3380

Epoch 1, Iteration 50, loss = 0.3269

Epoch 1, Iteration 60, loss = 0.3091

Epoch 1, Iteration 70, loss = 0.3081

Epoch 1, Iteration 80, loss = 0.2519

Epoch 1, Iteration 90, loss = 0.2425

Epoch 1, Iteration 100, loss = 0.2441

Epoch 1, Iteration 110, loss = 0.2137

Epoch 1, Iteration 120, loss = 0.1687

Epoch 2,

# 预测测试集

In [15]:
test_data = []
test_label = []
model.eval()

with open('./exp1data/test.txt') as testf:
    testf.readline()
    for line in testf.readlines():
        id, text = line.split(',', 1)
        test_data.append(text)
    
    # test_data = countVectorizer.transform(test_data)
    test_data = tfidfVectorizer.transform(test_data)

scores = model(torch.FloatTensor(test_data.toarray()))

for s in scores:
    test_label.append(np.argmax(s.detach().numpy()))

with open('./exp1data/mlpoutput.txt', 'w') as outputf:
    outputf.write('id, pred\n')
    for id, pred in enumerate(test_label):
        outputf.write('%d, %d\n' % (int(id), pred))