In [1]:
#from dataloader import dataset2dataloader
from torch.optim import Adam
import torch
import torch.nn as nn
from torchcrf import CRF
import numpy as np
import os
import pandas as pd
import spacy
from torch.nn import init
from torchtext.legacy import data

模型

In [2]:
class BiLSTM_CRF(nn.Module):
    """BiLSTM encoder followed by a linear projection and a CRF layer for sequence tagging.

    Token ids are embedded, encoded by a single-layer bidirectional LSTM,
    projected to one emission score per tag, and scored/decoded by a
    ``torchcrf.CRF`` layer.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        hidden_size: Hidden size of the LSTM per direction.
        num_tags: Number of distinct tag ids.
        word_vectors: Optional ``(vocab_size, embedding_dim)`` tensor used to
            initialise the embedding table. The embeddings stay trainable.
        device: Device every sub-module is moved to.

    Raises:
        ValueError: If ``word_vectors`` is given with a shape other than
            ``(vocab_size, embedding_dim)``.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_size, num_tags, word_vectors=None, device="cpu"):
        super(BiLSTM_CRF, self).__init__()
        self.device = device
        self.hidden_size = hidden_size

        if word_vectors is None:
            self.embed = nn.Embedding(vocab_size, embedding_dim)
        else:
            if tuple(word_vectors.shape) != (vocab_size, embedding_dim):
                raise ValueError(
                    "word_vectors has shape %s, expected %s"
                    % (tuple(word_vectors.shape), (vocab_size, embedding_dim))
                )
            # Public equivalent of the private `_weight=` keyword; freeze=False
            # keeps the pretrained vectors trainable, as before.
            self.embed = nn.Embedding.from_pretrained(word_vectors, freeze=False)
        self.lstm = nn.LSTM(embedding_dim, hidden_size, bidirectional=True, batch_first=True)
        # Forward and backward LSTM outputs are concatenated, hence hidden_size * 2.
        self.hidden2tag = nn.Linear(hidden_size * 2, num_tags)
        self.crf = CRF(num_tags=num_tags, batch_first=True)

        # Move every sub-module in one place. Previously `hidden2tag` was the
        # only layer left on the CPU, which breaks on any other device.
        self.to(device)

    def get_emissions(self, x):
        """Return per-token tag scores for a batch of token ids.

        Args:
            x: Integer tensor of token ids, shape ``(batch, seq_len)``.

        Returns:
            Tensor of emission scores, shape ``(batch, seq_len, num_tags)``.
        """
        embedded = self.embed(x)
        # nn.LSTM starts from zero hidden/cell states when none are passed,
        # which is what the explicit zero tensors used to provide.
        lstm_out, _ = self.lstm(embedded)
        emissions = self.hidden2tag(lstm_out)
        return emissions

    def forward(self, x, y, mask):
        """Return the training loss for a batch.

        Args:
            x: Token ids, shape ``(batch, seq_len)``.
            y: Gold tag ids, same shape as ``x``.
            mask: Boolean tensor, same shape as ``x``; False marks padding.

        Returns:
            The negated value returned by the CRF layer (in pytorch-crf that
            value is the sequence log-likelihood, so this is a loss to minimise).
        """
        emissions = self.get_emissions(x)
        loss = -self.crf(emissions=emissions, tags=y, mask=mask)
        return loss

    def predict(self, x, mask=None):
        """Decode the best tag sequence for each input sequence.

        Args:
            x: Token ids, shape ``(batch, seq_len)``.
            mask: Optional boolean tensor, same shape as ``x``; False marks padding.

        Returns:
            Whatever ``CRF.decode`` returns (in pytorch-crf, one list of tag
            ids per sequence, covering only the unmasked positions).
        """
        emissions = self.get_emissions(x)
        preds = self.crf.decode(emissions, mask)
        return preds

处理数据

In [3]:
def prepare_data(dataset_path, debug=False):
    """Convert ``train.txt`` / ``dev.txt`` / ``test.txt`` into two-column CSV files.

    Each input file holds one token per line (token in the first
    whitespace-separated field, tag in the last) with blank lines between
    sentences. Each output CSV has a ``sent`` column (space-joined tokens) and
    a ``tag`` column (space-joined tags), one row per sentence.

    Args:
        dataset_path: Directory containing the three ``.txt`` files; the CSV
            files are written to the same directory.
        debug: When True, keep only the first 100 sentences of each split and
            write to separate debug file names.

    Returns:
        Tuple ``(train_csv, dev_csv, test_csv)`` of CSV file paths.
    """

    def process_file(file_path, target_file_path):
        """Parse one token-per-line file and write it as a sentence-per-row CSV."""
        sents, tags = [], []
        sent, tag = [], []
        # Explicit encoding so the result does not depend on the platform locale.
        with open(file_path, "r", encoding="utf-8") as f:
            for line in f:
                fields = line.split()
                if fields:
                    sent.append(fields[0])
                    tag.append(fields[-1])
                elif sent:
                    # Blank line closes the current sentence. Repeated blank
                    # lines are ignored so no empty rows are written.
                    sents.append(" ".join(sent))
                    tags.append(" ".join(tag))
                    sent, tag = [], []
        # The file may not end with a blank line.
        if sent:
            sents.append(" ".join(sent))
            tags.append(" ".join(tag))
        if debug:
            sents, tags = sents[:100], tags[:100]
        pd.DataFrame({"sent": sents, "tag": tags}).to_csv(target_file_path, index=False)

    # Debug file names are kept exactly as before so existing caches stay valid.
    if debug:
        csv_names = ("train_small.csv", "train_dev.csv", "train_test.csv")
    else:
        csv_names = ("train.csv", "dev.csv", "test.csv")
    txt_names = ("train.txt", "dev.txt", "test.txt")

    train_csv, dev_csv, test_csv = (os.path.join(dataset_path, name) for name in csv_names)

    # Regenerate each missing CSV on its own; checking only the test CSV left
    # a missing train/dev CSV undetected.
    for txt_name, csv_path in zip(txt_names, (train_csv, dev_csv, test_csv)):
        if not os.path.exists(csv_path):
            process_file(os.path.join(dataset_path, txt_name), csv_path)

    return train_csv, dev_csv, test_csv

In [4]:
def dataset2dataloader(dataset_path="/data/wyf/InformationRetrievalProject/data/", batch_size=3, debug=False,
                       device="cpu", test_batch_size=128):
    """Build torchtext iterators and vocabularies for the tagging dataset.

    Args:
        dataset_path: Directory passed to ``prepare_data``.
        batch_size: Batch size of the train and validation iterators.
        debug: Passed through to ``prepare_data``.
        device: Device the iterators place their batches on.
        test_batch_size: Batch size of the test iterator.

    Returns:
        Tuple ``(train_iter, val_iter, test_iter, TEXT.vocab, TAG.vocab)``.
    """
    train_csv, dev_csv, test_csv = prepare_data(dataset_path, debug=debug)

    def tokenizer(text):
        """Split on single spaces, matching how the CSV cells were joined."""
        return text.split(" ")

    # These only declare how each column is processed; no data is read yet.
    TEXT = data.Field(sequential=True, tokenize=tokenizer, lower=False)
    TAG = data.Field(sequential=True, tokenize=tokenizer, lower=False)
    train, val, test = data.TabularDataset.splits(
        path='', train=train_csv, validation=dev_csv, test=test_csv, format='csv', skip_header=True,
        fields=[('sent', TEXT), ('tag', TAG)])

    TEXT.build_vocab(train, vectors='glove.6B.50d')  # , max_size=30000)
    # NOTE(review): the tag vocabulary is built from the validation split only,
    # so a tag that appears only in train/test maps to the unknown index.
    # Building it from `train` would change tag indices and invalidate any
    # previously saved model, so it is left unchanged here.
    # Author's observation: additionally calling TAG.build_vocab(test) gave
    # acc 0.89 versus 0.95 without it.
    TAG.build_vocab(val)

    # The former `TEXT.vocab.vectors.unk_init = init.xavier_uniform` line was
    # dropped: it only set an attribute on the already-filled vectors tensor
    # and had no effect on how out-of-vocabulary tokens are initialised.

    train_iter = data.BucketIterator(train, batch_size=batch_size, sort_key=lambda x: len(x.sent), device=device)
    val_iter = data.BucketIterator(val, batch_size=batch_size, sort_key=lambda x: len(x.sent), device=device)

    # For the test iterator sort must be False, otherwise torchtext reorders
    # the samples.
    test_iter = data.Iterator(dataset=test, batch_size=test_batch_size, train=False, sort=False, device=device)

    return train_iter, val_iter, test_iter, TEXT.vocab, TAG.vocab

In [5]:
# Build the train/validation/test iterators together with the word and tag vocabularies.
train_iter, val_iter, test_iter, sent_vocab, tag_vocab = dataset2dataloader(batch_size=128)
# Vectors attached to the word vocabulary.
word_vectors = sent_vocab.vectors

# GPU selection is disabled for now; everything runs on the CPU.
#device = torch.device("cuda") if torch.cuda.is_available() else "cpu"
device = torch.device('cpu')

这里由于BiLSTM的数据处理问题，出现了如下报错  
`RuntimeError: 'lengths' argument should be a 1D CPU int64 tensor, but got 1D cuda:0 Long tensor`  
从报错信息看，问题并不在`nn.Embedding`：该报错是`torch.nn.utils.rnn.pack_padded_sequence`在检查`lengths`参数时抛出的，它要求`lengths`是位于CPU上的int64张量，而传入的长度张量位于GPU上  
由于**时间**关系我暂时只能先使`device = torch.device('cpu')`  
按照老师所说，后面的NLP专选课实验也会使用BiLSTM+CRF实现NER，届时可能会解决这一问题

训练

In [6]:
# Instantiate the tagger. Table sizes come from the two vocabularies, and
# embedding_dim=50 matches the 50-dimensional 'glove.6B.50d' vectors.
model = BiLSTM_CRF(
    vocab_size=len(sent_vocab.stoi),
    embedding_dim=50,
    hidden_size=128,
    num_tags=len(tag_vocab.stoi),
    word_vectors=word_vectors,
    device=device,
)

In [7]:
epoch = 10  # number of passes over the training iterator
learning_rate = 0.01  # learning rate given to the Adam optimizer
model_path = "model_BC.pkl"  # file the trained model is saved to and loaded from

In [8]:
# Adam over all parameters of the model created above.
optimizer = Adam(model.parameters(), lr=learning_rate)

In [9]:
def _mean_batch_accuracy(model, data_iter, pad_idx):
    """Return the mean, over batches, of per-batch token accuracy.

    Args:
        model: Object whose ``predict(x, mask)`` yields one tag-id list per sequence.
        data_iter: Iterator yielding batches with ``sent`` and ``tag`` tensors.
        pad_idx: Token id marking padding in ``batch.sent``.

    Returns:
        Mean of the per-batch accuracies (each batch weighs the same,
        regardless of how many tokens it holds).
    """
    batch_accs = []
    for batch in data_iter:
        x, y = batch.sent.t(), batch.tag.t()
        mask = (x != pad_idx)
        with torch.no_grad():
            preds = model.predict(x, mask)
        right, total = 0, 0
        for pred, gold in zip(preds, y):
            # `pred` covers only the unmasked prefix, so compare against the
            # same-length prefix of the gold tags. `.cpu()` keeps this working
            # when the batches live on another device.
            right += np.sum(np.array(pred) == gold[:len(pred)].cpu().numpy())
            total += len(pred)
        batch_accs.append(right * 1.0 / total)
    return np.array(batch_accs).mean()


if os.path.exists(model_path):
    # NOTE: torch.load unpickles the file, which can execute arbitrary code.
    # Only load checkpoints that were produced by this notebook.
    model = torch.load(model_path)
else:
    pad_idx = sent_vocab.stoi["<pad>"]
    for ep in range(epoch):
        model.train()
        for i, batch in enumerate(train_iter):
            # torchtext yields (seq_len, batch); the model expects batch first.
            x, y = batch.sent.t(), batch.tag.t()
            mask = (x != pad_idx)
            optimizer.zero_grad()
            loss = model(x, y, mask)
            loss.backward()
            optimizer.step()
            if i % 100 == 0:
                print(f"epoch:{ep}, iter:{i}, loss:{loss.item()}", end=" ")

        model.eval()
        train_acc = _mean_batch_accuracy(model, train_iter, pad_idx)
        val_acc = _mean_batch_accuracy(model, val_iter, pad_idx)
        print("epoch %d train acc:%.2f, val acc:%.2f" % (ep, train_acc, val_acc))
    # Save only after training; a model that was just loaded is already on disk.
    torch.save(model, model_path)

测试

In [10]:
# Evaluate on the test iterator: token accuracy per batch, then the mean over batches.
model.eval()
pad_idx = sent_vocab.stoi["<pad>"]
test_accs = []
for batch in test_iter:
    # torchtext yields (seq_len, batch); transpose to batch-first.
    x, y = batch.sent.t(), batch.tag.t()
    mask = x != pad_idx
    with torch.no_grad():
        preds = model.predict(x, mask)
    # Each prediction covers only the unmasked prefix of its sequence.
    right = sum(
        np.sum(np.array(pred) == gold[:len(pred)].numpy())
        for pred, gold in zip(preds, y)
    )
    total = sum(len(pred) for pred, _ in zip(preds, y))
    test_accs.append(right * 1.0 / total)
test_acc = np.array(test_accs).mean()
print("test acc:%.2f" % (test_acc))

test acc:0.94


In [11]:
# Sentences to tag. Tokens (including punctuation) must be separated by single
# spaces. Swap in one of the commented-out alternatives to try other inputs.
#test_sents = ["My name is Yufei Wang , I am from Jinzhou , Hubei , China ."]
#test_sents = ["HUST is the abbreviation of Huazhong University of Science and Technology . It is located in Hongshan , Wuhan , China ."]
test_sents = ["Sufjan Stevens released his thirteenth album Carrie and Lowell on March 31 , 2015 in America and received a high score of 9 . 3 from Pitchfork ."]

在这里修改文本以获得输出

In [12]:
# Tag every sentence with the trained model. Only the most recent `sent` and
# `pred` stay bound once the loop finishes.
pad_idx = sent_vocab.stoi["<pad>"]
for sent in test_sents:
    token_ids = [sent_vocab.stoi[token] for token in sent.split(" ")]
    # Wrap in a list to form a batch of one sequence.
    input_tensor = torch.tensor([token_ids])
    mask = input_tensor != pad_idx
    with torch.no_grad():
        pred = model.predict(input_tensor, mask)

In [13]:
# Map the predicted tag ids back to tag strings and show them under the sentence.
pred_tags = [tag_vocab.itos[tag_id] for tag_id in pred[0]]
print(sent, "\n", pred_tags)

Sufjan Stevens released his thirteenth album Carrie and Lowell on March 31 , 2015 in America and received a high score of 9 . 3 from Pitchfork . 
 ['B-PER', 'E-PER', 'O', 'O', 'O', 'B-PER', 'E-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'S-PER', 'O', 'S-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'S-ORG', 'O']


简要地测试了几条文本，可以看出大部分`PER`、`LOC`和`ORG`都可以成功识别，但是对于日期等`MISC`识别效果有待提升