In [6]:
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import pandas as pd
import tqdm
#from tqdm import trange,tqdm
from tqdm.notebook import tqdm
from torchtext import data
from torchtext.data import Iterator, BucketIterator
from torchtext.vocab import Vectors
import torch
import torch.nn as nn
import torch.nn.functional as F

epochs=2
num_classes = 5
batch_size=32
data_path='./data/'
vectors = Vectors('glove.twitter.27B.200d.txt', 'C:/Users/YYH/Desktop/nlp-beginner/Task 2/embedding/')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#device='cpu'

In [2]:
# 数据加载器

def load_iters(batch_size=32, device="cpu", data_path='data', vectors=None):
    # 加载Field 已包含数据预处理
    TEXT = data.Field(lower=True, batch_first=True, include_lengths=True)
    LABEL = data.LabelField(batch_first=True)
    # 标明数据文件中的栏位
    train_fields = [(None, None), (None, None), ('text', TEXT), ('label', LABEL)]
    test_fields = [(None, None), (None, None), ('text', TEXT)]
    train_data = data.TabularDataset.splits(
        path=data_path,
        train='train.tsv',
        format='tsv',
        fields=train_fields,
        skip_header=True
    )[0]

    test_data = data.TabularDataset.splits(
        path='data',
        train='test.tsv',
        format='tsv',
        fields=test_fields,
        skip_header=True
    )[0]
    TEXT.build_vocab(train_data.text, vectors=vectors)
    LABEL.build_vocab(train_data.label)
    train_data, dev_data = train_data.split([0.8, 0.2])

    train_iter, dev_iter = BucketIterator.splits(
        (train_data, dev_data),
        batch_sizes=(batch_size, batch_size),
        device=device,
        sort_key=lambda x: len(x.text),
        sort_within_batch=True,
        repeat=False,
        shuffle=True
    )

    test_iter = Iterator(
        test_data,
        batch_size=batch_size,
        device=device,
        sort=False,
        sort_within_batch=False,
        repeat=False,
        shuffle=False
    )
    return train_iter, dev_iter, test_iter, TEXT, LABEL
train_iter, dev_iter, test_iter, TEXT, LABEL = load_iters(batch_size, device, data_path, vectors)
vocab_size = len(TEXT.vocab.itos)

In [3]:
# 定义LSTM模型

class LSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, n_layers, output_dim,  
                 bidirectional, dropout):
        super().__init__()          
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           bidirectional=bidirectional, 
                           dropout=dropout,
                           batch_first=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.act = nn.ReLU()

    def forward(self, text, text_lengths):
        embedded = self.embedding(text)
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths,batch_first=True)
        packed_output, (hidden, cell) = self.lstm(packed_embedded)
        hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)
        dense_outputs=self.fc(hidden)
        outputs=self.act(dense_outputs)

        return outputs
    
embed_size = 200
hidden_size = 256
num_layers = 1
bidirectional = True
dropout_rate = 0.1

lstm_model = LSTM(vocab_size, embed_size, hidden_size, num_layers, num_classes, bidirectional, dropout_rate)



In [4]:
# 定义textCNN模型

class TextCNN(nn.Module):
    def __init__(
        self,
        vocab_size,
        embedding_dim, 
        kernel_sizes, 
        num_filters, 
        num_classes, dropout_rate):
        super(TextCNN, self).__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.kernel_sizes = kernel_sizes
        self.num_filters = num_filters
        self.num_classes = num_classes
        self.dropout_rate = dropout_rate
        self.embedding = nn.Embedding(
            vocab_size, embedding_dim)
        self.convs = nn.ModuleList([
            nn.Conv2d(1, num_filters, (k, embed_size), padding=(k - 1, 0))
            for k in kernel_sizes
        ])
        self.dropout = nn.Dropout(dropout_rate)
        self.fc = nn.Linear(len(kernel_sizes) * num_filters, num_classes)
        
    def conv_and_pool(self, x, conv):
        x = F.relu(conv(x).squeeze(3))
        x_max = F.max_pool1d(x, x.size(2)).squeeze(2)
        return x_max

    def forward(self, x,lens):
        embed = self.embedding(x).unsqueeze(1)
        conv_results = [self.conv_and_pool(embed, conv) for conv in self.convs]
        out = torch.cat(conv_results, 1)
        return self.fc(self.dropout(out))

embed_size = 200
kernel_sizes=[3, 4, 5]
num_filters=100
dropout_rate = 0.1

cnn_model=TextCNN(vocab_size,embed_size,kernel_sizes,num_filters,num_classes,dropout_rate)

In [11]:
# 训练模型
def train(model, loss_fn, optimizer, train_generator, dev_generator,epochs):
    model.to(device)
    loss_history=[]
    for epoch in range(epochs):
        for step, batch in enumerate(tqdm(train_generator)):
            model.train()
            (inputs, lens), labels = batch.text, batch.label
            if 0 in lens:
                continue
            optimizer.zero_grad()
            forward_output = model(inputs,lens)
            loss = loss_fn(forward_output, labels)
            loss.backward()
            optimizer.step()
            if step % 10 == 0:
                loss_history.append(loss.item())
        model.eval()
        with torch.no_grad():
            corr_num = 0
            err_num = 0
            for batch in dev_iter:
                (inputs, lens), labels = batch.text, batch.label
                if 0 in lens:
                    continue
                outputs = model(inputs, lens)
                corr_num += (outputs.argmax(1) == labels).sum().item()
                err_num += (outputs.argmax(1) != labels).sum().item()
            tqdm.write('Epoch {}, Accuracy {}'.format(epoch, corr_num / (corr_num + err_num))) 
        torch.save(model, './model/model_'+model_name+'_epoch_{}.pkl'.format(epoch))
    return model

In [12]:
model=lstm_model
optimizer = torch.optim.Adam(model.parameters())
loss_fn = nn.CrossEntropyLoss()
train(model,loss_fn,optimizer,train_iter,dev_iter,epochs)

  0%|          | 0/3902 [00:00<?, ?it/s]

Epoch 0, Accuracy 0.646802511854415


  0%|          | 0/3902 [00:00<?, ?it/s]

Epoch 1, Accuracy 0.6678200692041523


LSTM(
  (embedding): Embedding(16533, 200)
  (lstm): LSTM(200, 256, batch_first=True, dropout=0.1, bidirectional=True)
  (fc): Linear(in_features=512, out_features=5, bias=True)
  (act): ReLU()
)

In [13]:
model=cnn_model
optimizer = torch.optim.Adam(model.parameters())
loss_fn = nn.CrossEntropyLoss()
train(model,loss_fn,optimizer,train_iter,dev_iter,epochs)

  0%|          | 0/3902 [00:00<?, ?it/s]

Epoch 0, Accuracy 0.6266500064077919


  0%|          | 0/3902 [00:00<?, ?it/s]

Epoch 1, Accuracy 0.6491093169293861


TextCNN(
  (embedding): Embedding(16533, 200)
  (convs): ModuleList(
    (0): Conv2d(1, 100, kernel_size=(3, 200), stride=(1, 1), padding=(2, 0))
    (1): Conv2d(1, 100, kernel_size=(4, 200), stride=(1, 1), padding=(3, 0))
    (2): Conv2d(1, 100, kernel_size=(5, 200), stride=(1, 1), padding=(4, 0))
  )
  (dropout): Dropout(p=0.1, inplace=False)
  (fc): Linear(in_features=300, out_features=5, bias=True)
)