In [1]:
from tqdm import tqdm
import json
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import spacy

nlp = spacy.load('en_core_web_sm')

In [2]:
BATCH_SIZE = 64
PRINT = 50

In [3]:
def val(loader_val, model):
    model.eval()
    cor, all = 0, 0
    for (x, y) in loader_val:
        all += len(y)
        scores = model(x)
        for idx, each in enumerate(scores):
            if y[idx] == np.argmax(each.detach().numpy()): 
                cor += 1

    acc = cor / all
    print('val acc: ', acc)

def train(model, loss_func, optim, loader_train, loader_val, epoch=1):
    for e in range(epoch):
        for idx, (x, y) in enumerate(loader_train):
            # switch to train mode
            model.train()
            scores = model(x)
            loss = loss_func(scores, y)

            optim.zero_grad()
            loss.backward()
            optim.step()
            
            if idx % PRINT == 0:
                print('Epoch %d, Iteration %d, loss = %.4f' % (e, idx, loss.item()))
                if loader_val:
                    val(loader_val, model)
                print()

In [4]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# countVectorizer = CountVectorizer(stop_words='english')
tfidfVectorizer = TfidfVectorizer(stop_words='english')

In [5]:
with open('./exp1data/train_data.txt') as f:
    train_data_raw = f.read()
    f.close()

train_data_raw = [json.loads(data) for data in train_data_raw.strip().split('\n')]

text_num = len(train_data_raw)
texts, label = [], []

for data in tqdm(train_data_raw):
    texts.append(data['raw'])
    label.append(data['label'])

# matrix = countVectorizer.fit_transform(texts)
# vocab = countVectorizer.get_feature_names_out()

matrix = tfidfVectorizer.fit_transform(texts)
vocab = tfidfVectorizer.get_feature_names_out()

all_data = matrix.toarray()
all_label = np.array(label)

100%|██████████| 8000/8000 [00:00<00:00, 2123429.44it/s]


In [14]:
class FC(nn.Module):
    def __init__(self, input_size, hidden_size, classes_num):
        super(FC, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, classes_num)

        # softmax将输出转为概率
        self.sf = nn.Softmax(dim=1)
    
    # 不加激活默认RELU激活
    def forward(self, x):
        scores = self.fc1(x)
        scores = self.fc2(scores)
        return self.sf(scores)  # 加入softmax

In [15]:
all_dataset = TensorDataset(
    torch.FloatTensor(all_data),
    torch.LongTensor(all_label)
    )
loader_all = DataLoader(all_dataset, batch_size=BATCH_SIZE, shuffle=True)

lr = 9e-4
wd = 1e-4

mlp_model = FC(all_data.shape[1], int(np.sqrt(all_data.shape[1])), 10)
loss_func = nn.CrossEntropyLoss()
optimizer = optim.Adam(mlp_model.parameters(), lr=lr, weight_decay=wd)

train(mlp_model, loss_func, optimizer, loader_all, [], 10)

Epoch 0, Iteration 0, loss = 2.3028

Epoch 0, Iteration 50, loss = 2.2484

Epoch 0, Iteration 100, loss = 2.0415

Epoch 1, Iteration 0, loss = 1.7720

Epoch 1, Iteration 50, loss = 1.6621

Epoch 1, Iteration 100, loss = 1.5934

Epoch 2, Iteration 0, loss = 1.5256

Epoch 2, Iteration 50, loss = 1.5370

Epoch 2, Iteration 100, loss = 1.5341

Epoch 3, Iteration 0, loss = 1.4923

Epoch 3, Iteration 50, loss = 1.5001

Epoch 3, Iteration 100, loss = 1.5005

Epoch 4, Iteration 0, loss = 1.4819

Epoch 4, Iteration 50, loss = 1.4897

Epoch 4, Iteration 100, loss = 1.5444

Epoch 5, Iteration 0, loss = 1.4787

Epoch 5, Iteration 50, loss = 1.5084

Epoch 5, Iteration 100, loss = 1.5051

Epoch 6, Iteration 0, loss = 1.4766

Epoch 6, Iteration 50, loss = 1.4788

Epoch 6, Iteration 100, loss = 1.4878

Epoch 7, Iteration 0, loss = 1.4776

Epoch 7, Iteration 50, loss = 1.4798

Epoch 7, Iteration 100, loss = 1.4962

Epoch 8, Iteration 0, loss = 1.4732

Epoch 8, Iteration 50, loss = 1.4855

Epoch 8, Iter

In [16]:
with open('./exp1data/train_data.txt') as f:
    train_data_raw = f.read()
    f.close()

train_data_raw = [json.loads(data) for data in train_data_raw.strip().split('\n')]

word_set = set()
for data in tqdm(train_data_raw):
    data['nlp']=nlp(data['raw'])
    for token in data['nlp']:
        if (not token.is_stop) and (not token.is_punct):
            word_set.add(token.lemma_)

vocab = list(word_set)
word2idx = {w: i for i, w in enumerate(vocab)}

max_word_size = 0
for data in tqdm(train_data_raw):
    temp_size = 0
    for token in data['nlp']:
        if token.lemma_ in word_set:
            temp_size += 1
    if temp_size > max_word_size:
        max_word_size = temp_size

for data in tqdm(train_data_raw):
    vec = np.zeros(max_word_size)
    idx = 0
    for token in data['nlp']:
        try:
            vec[idx] = word2idx[token.lemma_]
            idx += 1
        except:
            pass
    data['vec'] = vec

100%|██████████| 8000/8000 [05:06<00:00, 26.12it/s]
100%|██████████| 8000/8000 [00:00<00:00, 8118.83it/s]
100%|██████████| 8000/8000 [00:01<00:00, 5474.27it/s]


In [17]:
train_data = []
train_label = []

for data in tqdm(train_data_raw):
    train_data.append(list(data['vec']))
    train_label.append(data['label'])

all_data = np.array(train_data)
all_label = np.array(train_label)

100%|██████████| 8000/8000 [00:00<00:00, 64317.12it/s]


In [27]:
class TextCNN(nn.Module):
    def __init__(self, vocab, embedding_size, max_word, class_num):
        super(TextCNN, self).__init__()

        # Embedding
        self.ebd = nn.Embedding(len(vocab), embedding_size)

        # CNN
        output_channel = 100          # 这里设置多通道防止过拟合，但是在原论文的实际实验中效果不大
        self.cnn = nn.Sequential(
            nn.Conv2d(
                in_channels=1, 
                out_channels=output_channel, 
                kernel_size=(2, embedding_size), 
                stride=1
                ),                  # 经过这一卷积层后feature map大小变为(maxword-1) * embedding_size
            nn.ReLU(),
            nn.MaxPool2d((2, 1)),   # 经过maxpooling后变为(maxword-1)/2 * embedding_size
            nn.Dropout(0.5)         # 同样为了防止过拟合
        )

        # Fully Connected
        self.fc = nn.Linear(int((max_word - 1) / 2) * output_channel, class_num)

        # softmax 将输出转为概率
        self.sf = nn.Softmax(dim=1)

    def forward(self, x):
        embedding = self.ebd(x).unsqueeze(1)
        conved = self.cnn(embedding)
        flatten = conved.view(x.shape[0], -1)
        output = self.fc(flatten) 
        return self.sf(output) # 加入softmax

In [28]:
all_dataset = TensorDataset(
    torch.LongTensor(all_data),
    torch.LongTensor(all_label)
    )
loader_all = DataLoader(all_dataset, batch_size=BATCH_SIZE, shuffle=True)

lr = 3e-3
wd = 5e-4

textcnn_model = TextCNN(vocab, 100, max_word_size, 10)
loss_func = nn.CrossEntropyLoss()
optimizer = optim.Adam(textcnn_model.parameters(), lr=lr, weight_decay=wd)

train(textcnn_model, loss_func, optimizer, loader_all, [], 40)

Epoch 0, Iteration 0, loss = 2.3016

Epoch 0, Iteration 50, loss = 2.3362

Epoch 0, Iteration 100, loss = 2.3205

Epoch 1, Iteration 0, loss = 2.2737

Epoch 1, Iteration 50, loss = 2.3049

Epoch 1, Iteration 100, loss = 2.3242

Epoch 2, Iteration 0, loss = 2.2827

Epoch 2, Iteration 50, loss = 2.1090

Epoch 2, Iteration 100, loss = 2.0623

Epoch 3, Iteration 0, loss = 2.0638

Epoch 3, Iteration 50, loss = 2.0732

Epoch 3, Iteration 100, loss = 1.9181

Epoch 4, Iteration 0, loss = 1.7971

Epoch 4, Iteration 50, loss = 1.8716

Epoch 4, Iteration 100, loss = 1.7549

Epoch 5, Iteration 0, loss = 1.6283

Epoch 5, Iteration 50, loss = 1.6055

Epoch 5, Iteration 100, loss = 1.6427

Epoch 6, Iteration 0, loss = 1.6619

Epoch 6, Iteration 50, loss = 1.5553

Epoch 6, Iteration 100, loss = 1.5630

Epoch 7, Iteration 0, loss = 1.5397

Epoch 7, Iteration 50, loss = 1.5666

Epoch 7, Iteration 100, loss = 1.6229

Epoch 8, Iteration 0, loss = 1.5494

Epoch 8, Iteration 50, loss = 1.5157

Epoch 8, Iter

In [29]:
mlp_test_data = []
textcnn_test_data = []

test_label = []
mlp_model.eval()
textcnn_model.eval()

with open('./exp1data/test.txt') as testf:
    testf.readline()
    for line in testf.readlines():
        id, text = line.split(',', 1)

        mlp_test_data.append(text)
        
        vec = np.zeros(max_word_size)
        idx = 0
        for token in nlp(text):
            try:
                vec[idx] = word2idx[token.lemma_]
                idx += 1
            except:
                pass

        textcnn_test_data.append(list(vec))

    mlp_test_data = tfidfVectorizer.transform(mlp_test_data)

In [30]:
mlp_scores = mlp_model(torch.FloatTensor(mlp_test_data.toarray()))
textcnn_scores = textcnn_model(torch.LongTensor(textcnn_test_data))

for s1, s2 in zip(mlp_scores, textcnn_scores):
    s = s1 + s2
    test_label.append(np.argmax(s.detach().numpy()))

with open('./exp1data/ensembleoutput.txt', 'w') as outputf:
    outputf.write('id, pred\n')
    for id, pred in enumerate(test_label):
        outputf.write('%d, %d\n' % (int(id), pred))

  textcnn_scores = textcnn_model(torch.LongTensor(textcnn_test_data))
