In [13]:
from dataloader import dataset2dataloader
from models import BiLSTM_CRF_NER
from torch.optim import Adam
import torch
import numpy as np
import os

处理数据

In [14]:
# Unpack the four objects returned by dataset2dataloader: the training and
# validation batch iterators plus the sentence and tag vocabularies
# (roles presumed from how they are used further down in the notebook).
(
    train_iter,
    val_iter,
    sent_vocab,
    tag_vocab,
) = dataset2dataloader(batch_size=128)

# Vectors attached to the sentence vocabulary; handed to the model constructor.
word_vectors = sent_vocab.vectors

# Pinned to CPU on purpose: the notebook notes that running on CUDA raised
# "RuntimeError: 'lengths' argument should be a 1D CPU int64 tensor".
# Automatic selection (currently disabled) would be:
#     torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = torch.device("cpu")

这里由于BiLSTM的数据处理问题，出现了如下报错  
`RuntimeError: 'lengths' argument should be a 1D CPU int64 tensor, but got 1D cuda:0 Long tensor`  
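这条报错信息其实来自`torch.nn.utils.rnn.pack_padded_sequence`，而不是`nn.Embedding`：较新版本的 PyTorch 要求传入的`lengths`参数必须是位于 CPU 上的 int64 张量，即使输入数据在 GPU 上也是如此；如果模型内部确实调用了该函数，把长度张量改为`lengths.cpu()`后再传入应当可以解决  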
由于**时间**关系我暂时只能先使`device = torch.device('cpu')`  
按照老师所说，后面的NLP专选课实验也会使用BiLSTM+CRF实现NER，届时可能会解决这一问题

训练

In [15]:
# Instantiate the BiLSTM-CRF tagger. Vocabulary and tag-set sizes are taken
# from the vocabularies built during data loading.
model = BiLSTM_CRF_NER(
    vocab_size=len(sent_vocab.stoi),
    embedding_dim=50,  # presumably the width of `word_vectors` -- TODO confirm
    hidden_size=128,
    num_tags=len(tag_vocab.stoi),
    word_vectors=word_vectors,
    device=device,
)

In [16]:
# Training configuration shared by the cells below.
model_path: str = "model.pkl"  # checkpoint file: loaded if present, written after training
learning_rate: float = 0.01    # step size passed to the optimizer
epoch: int = 10                # number of full passes over the training iterator

In [17]:
# Adam over every parameter the model exposes, using the configured learning rate.
optimizer = Adam(params=model.parameters(), lr=learning_rate)

In [18]:
def _mean_batch_accuracy(model, data_iter, pad_id):
    """Return the mean of the per-batch token accuracies of ``model`` on ``data_iter``.

    For every batch the predicted tag ids are compared with the gold tag ids
    over the first ``len(pred)`` positions of each sentence, giving one
    accuracy value per batch; the unweighted mean of those values is returned.

    Args:
        model: object exposing ``predict(x, mask)``; presumably returns one
            tag-id sequence per sentence in the batch.
        data_iter: iterable of batches exposing ``.sent`` and ``.tag`` tensors.
        pad_id: vocabulary index of the padding token, used to build the mask.

    Returns:
        The mean of the per-batch accuracies (NaN if ``data_iter`` is empty).
    """
    batch_accs = []
    for batch in data_iter:
        # Transposed so that the first dimension indexes sentences, matching
        # the per-sentence iteration in zip(preds, y) below.
        x, y = batch.sent.t(), batch.tag.t()
        mask = (x != pad_id)
        with torch.no_grad():
            preds = model.predict(x, mask)
        right, total = 0, 0
        for pred, gold in zip(preds, y):
            # .cpu() is a no-op on CPU tensors and keeps the comparison
            # working if the batches ever live on a GPU.
            right += np.sum(np.array(pred) == gold[:len(pred)].cpu().numpy())
            total += len(pred)
        batch_accs.append(right * 1.0 / total)
    return np.array(batch_accs).mean()


if os.path.exists(model_path):
    # Reuse the cached checkpoint instead of retraining.
    # NOTE(security): torch.load unpickles the file, which can execute
    # arbitrary code -- only load checkpoints you created yourself.
    model = torch.load(model_path)
else:
    pad_id = sent_vocab.stoi["<pad>"]
    for ep in range(epoch):
        # ---- optimisation pass over the training data ----
        model.train()
        for i, batch in enumerate(train_iter):
            x, y = batch.sent.t(), batch.tag.t()
            mask = (x != pad_id)
            optimizer.zero_grad()
            loss = model(x, y, mask)  # the forward pass returns the training loss
            loss.backward()
            optimizer.step()
            if i % 100 == 0:
                print(f"epoch:{ep}, iter:{i}, loss:{loss.item()}", end=" ")

        # ---- evaluation on both splits ----
        model.eval()
        train_acc = _mean_batch_accuracy(model, train_iter, pad_id)
        val_acc = _mean_batch_accuracy(model, val_iter, pad_id)
        # Report the current epoch index `ep`; the original printed the
        # constant `epoch` (total number of epochs) on every line.
        print("epoch %d train acc:%.2f, val acc:%.2f" % (ep, train_acc, val_acc))

    # Persist the freshly trained model only; a model that was just loaded
    # from `model_path` does not need to be written back.
    torch.save(model, model_path)

epoch:0, iter:0, loss:5228.29833984375 epoch:0, iter:100, loss:342.710205078125 epoch 10 train acc:0.97, val acc:0.94
epoch:1, iter:0, loss:211.81451416015625 epoch:1, iter:100, loss:70.90070343017578 epoch 10 train acc:0.99, val acc:0.96
epoch:2, iter:0, loss:72.41127014160156 epoch:2, iter:100, loss:44.5744743347168 epoch 10 train acc:1.00, val acc:0.96
epoch:3, iter:0, loss:19.108728408813477 epoch:3, iter:100, loss:35.543296813964844 epoch 10 train acc:1.00, val acc:0.97
epoch:4, iter:0, loss:16.934940338134766 epoch:4, iter:100, loss:11.126664161682129 epoch 10 train acc:1.00, val acc:0.96
epoch:5, iter:0, loss:4.5881476402282715 epoch:5, iter:100, loss:5.764562606811523 epoch 10 train acc:1.00, val acc:0.96
epoch:6, iter:0, loss:1.6615972518920898 epoch:6, iter:100, loss:1.5588173866271973 epoch 10 train acc:1.00, val acc:0.95
epoch:7, iter:0, loss:1.6595497131347656 epoch:7, iter:100, loss:0.116668701171875 epoch 10 train acc:1.00, val acc:0.96
epoch:8, iter:0, loss:0.4976882934

测试

In [43]:
#test_sents = ["My name is Yufei Wang , I am from Jinzhou , Hubei , China ."]
#test_sents = ["HUST is the abbreviation of Huazhong University of Science and Technology . It is located in Hongshan , Wuhan , China ."]
test_sents = ["Sufjan Stevens released his thirteenth album Carrie and Lowell on March 31 , 2015 in America and received a high score of 9 . 3 from Pitchfork ."]

在这里修改文本以获得输出

In [44]:
# Run the tagger on each test sentence without tracking gradients.
# `sent` and `pred` are reassigned on every iteration, so after the loop they
# refer to the last sentence only; the next cell prints exactly those.
with torch.no_grad():
    for sent in test_sents:
        # Sentences are pre-tokenised: tokens are separated by single spaces.
        ids = [sent_vocab.stoi[token] for token in sent.split(" ")]
        # Wrap in a list to form a batch containing this one sentence.
        input_tensor = torch.tensor([ids])
        mask = input_tensor != sent_vocab.stoi["<pad>"]
        pred = model.predict(input_tensor, mask)

In [45]:
# Map the predicted tag ids of the (single) sentence in the batch back to
# their string labels and show them under the input sentence.
predicted_tags = [tag_vocab.itos[tag_id] for tag_id in pred[0]]
print(sent, "\n", predicted_tags)

Sufjan Stevens released his thirteenth album Carrie and Lowell on March 31 , 2015 in America and received a high score of 9 . 3 from Pitchfork . 
 ['B-PER', 'E-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'S-PER', 'O', 'S-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'S-ORG', 'O']


简要地测试了几条文本，可以看出大部分`PER`、`LOC`和`ORG`都可以成功识别，但是对于日期等`MISC`识别效果较差