In [13]:
from tqdm import tqdm
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

import random
random.seed(1)

import numpy as np
np.random.seed(1)

In [14]:
START_TAG = "<START>"
STOP_TAG = "<STOP>"

#载入测试数据
training_data = []
with open("NER/train.txt",'r') as f:
    word,tags = [],[]
    for line in f.readlines():
        line = line.strip().split()
        if len(line)==2:
            word.append(line[0])
            tags.append(line[1])
        else:
            training_data.append((word,tags))
            word,tags = [],[]
print("training data:",len(training_data))        

dev_data = []
with open("NER/dev.txt",'r') as f:
    word,tags = [],[]
    for line in f.readlines():
        line = line.strip().split()
        if len(line)==2:
            word.append(line[0])
            tags.append(line[1])
        else:
            dev_data.append((word,tags))
            word,tags = [],[]
print("dev data:",len(dev_data))  


test_data = []
with open("NER/test.txt",'r') as f:
    word=[]
    for line in f.readlines():
        line = line.strip().split()
        if len(line)>0:
            word.append(line[0])
        else:
            test_data.append(word)
            word=[]
print("test data:",len(test_data))  

training data: 14041
dev data: 3250
test data: 3453


In [15]:
word_to_ix = {"UNK":0}
tag_to_ix = {START_TAG: 0, STOP_TAG: 1}
for words, tags in training_data:
    for word in words:
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)
    for t in tags:
        if t not in tag_to_ix:
            tag_to_ix[t] = len(tag_to_ix)
ix_to_tag = {j:i for i,j in tag_to_ix.items()}
print(ix_to_tag)

training_tensors = []
for words,tags in training_data:
    words = torch.tensor([word_to_ix[w] for w in words], dtype=torch.long)
    targets = torch.tensor([tag_to_ix[t] for t in tags], dtype=torch.long)
    training_tensors.append((words,targets))

dev_tensors = []
for words,tags in dev_data:
    idxs = [word_to_ix[w] if w in word_to_ix else word_to_ix['UNK'] for w in words]
    words_in = torch.tensor(idxs, dtype=torch.long)
    targets = torch.tensor([tag_to_ix[t] for t in tags], dtype=torch.long)
    dev_tensors.append((words_in,targets))

test_tensors = []
for words in test_data:
    idxs = [word_to_ix[w] if w in word_to_ix else word_to_ix['UNK'] for w in words]
    words_in = torch.tensor(idxs, dtype=torch.long)
    test_tensors.append(words_in)

{0: '<START>', 1: '<STOP>', 2: 'B-ORG', 3: 'O', 4: 'B-MISC', 5: 'B-PER', 6: 'I-PER', 7: 'B-LOC', 8: 'I-ORG', 9: 'I-MISC', 10: 'I-LOC'}


In [16]:
def load_word_embed(file,word_to_ix):
    row = 0
    words_embed = {}
    with open(file, mode='rb')as f:
        for line in f.readlines():
            line_list = line.split()
            word = line_list[0]
            if word in word_to_ix:
                embed = [float(v) for v in line_list[1:]]
                words_embed[word] = embed

    ix2word = {ix: w for w, ix in word_to_ix.items()}
    id2emb = {}
    nonzero=0
    for ix in range(len(word_to_ix)):
        if ix2word[ix].lower() in words_embed:
            id2emb[ix] = words_embed[ix2word[ix].lower()]
            nonzero+=1
        else:
            id2emb[ix] = np.random.uniform(-1,1,300) #[0.0] * 300
    return [id2emb[ix] for ix in range(len(word_to_ix))]

file = "glove.6B.300d.txt"
glove_embeds = load_word_embed(file,word_to_ix)

# Define functions

In [17]:
# refer to https://pytorch.org/tutorials/beginner/nlp/advanced_tutorial.html
def argmax(vec):
    # return the argmax as a python int
    _, idx = torch.max(vec, 1)
    return idx.item()

# Compute log sum exp in a numerically stable way for the forward algorithm
def log_sum_exp(vec):
    max_score = vec[0, argmax(vec)]
    max_score_broadcast = max_score.view(1, -1).expand(1, vec.size()[1])
    return max_score + \
        torch.log(torch.sum(torch.exp(vec - max_score_broadcast)))

In [18]:
class BiLSTM_CRF(nn.Module):
    def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim, pretrain=True):
        super(BiLSTM_CRF, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)

        if pretrain:
            self.word_embeds = nn.Embedding.from_pretrained(torch.FloatTensor(glove_embeds)).to(device)
        else:
            self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
                            num_layers=1, bidirectional=True)

        # Maps the output of the LSTM into tag space.
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)

        # Matrix of transition parameters.  Entry i,j is the score of
        # transitioning *to* i *from* j.
        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size).to(device))

        # These two statements enforce the constraint that we never transfer
        # to the start tag and we never transfer from the stop tag
        self.transitions.data[tag_to_ix[START_TAG], :] = -10000
        self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000

        self.hidden = self.init_hidden()

    def init_hidden(self):
        return (torch.randn(2, 1, self.hidden_dim // 2).to(device),
                torch.randn(2, 1, self.hidden_dim // 2).to(device))

    def _forward_alg(self, feats):
        # Do the forward algorithm to compute the partition function
        init_alphas = torch.full((1, self.tagset_size), -10000.)
        # START_TAG has all of the score.
        init_alphas[0][self.tag_to_ix[START_TAG]] = 0.
        init_alphas = init_alphas.to(device)
        
        # Wrap in a variable so that we will get automatic backprop
        forward_var = init_alphas

        # Iterate through the sentence
        for feat in feats:
            alphas_t = []  # The forward tensors at this timestep
            for next_tag in range(self.tagset_size):
                # broadcast the emission score: it is the same regardless of
                # the previous tag
                emit_score = feat[next_tag].view(1, -1).expand(1, self.tagset_size)
                # the ith entry of trans_score is the score of transitioning to
                # next_tag from i
                trans_score = self.transitions[next_tag].view(1, -1)
                # The ith entry of next_tag_var is the value for the
                # edge (i -> next_tag) before we do log-sum-exp
                next_tag_var = forward_var + trans_score + emit_score
                # The forward variable for this tag is log-sum-exp of all the
                # scores.
                alphas_t.append(log_sum_exp(next_tag_var).view(1))
            forward_var = torch.cat(alphas_t).view(1, -1)
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        alpha = log_sum_exp(terminal_var)
        return alpha

    def _get_lstm_features(self, sentence):
        self.hidden = self.init_hidden()
        embeds = self.word_embeds(sentence).view(len(sentence), 1, -1)
        lstm_out, self.hidden = self.lstm(embeds, self.hidden)
        lstm_out = lstm_out.view(len(sentence), self.hidden_dim)
        lstm_feats = self.hidden2tag(lstm_out)
        return lstm_feats

    def _score_sentence(self, feats, tags):
        # Gives the score of a provided tag sequence
        score = torch.zeros(1).to(device)
        tags = torch.cat([torch.tensor([self.tag_to_ix[START_TAG]], dtype=torch.long).to(device), tags])
        for i, feat in enumerate(feats):
            score = score + \
                self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
        score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
        return score

    def _viterbi_decode(self, feats):
        backpointers = []

        # Initialize the viterbi variables in log space
        init_vvars = torch.full((1, self.tagset_size), -10000.).to(device)
        init_vvars[0][self.tag_to_ix[START_TAG]] = 0

        # forward_var at step i holds the viterbi variables for step i-1
        forward_var = init_vvars
        for feat in feats:
            bptrs_t = []  # holds the backpointers for this step
            viterbivars_t = []  # holds the viterbi variables for this step

            for next_tag in range(self.tagset_size):
                # next_tag_var[i] holds the viterbi variable for tag i at the
                # previous step, plus the score of transitioning
                # from tag i to next_tag.
                # We don't include the emission scores here because the max
                # does not depend on them (we add them in below)
                next_tag_var = forward_var + self.transitions[next_tag]
                best_tag_id = argmax(next_tag_var)
                bptrs_t.append(best_tag_id)
                viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
            # Now add in the emission scores, and assign forward_var to the set
            # of viterbi variables we just computed
            forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
            backpointers.append(bptrs_t)

        # Transition to STOP_TAG
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        best_tag_id = argmax(terminal_var)
        path_score = terminal_var[0][best_tag_id]

        # Follow the back pointers to decode the best path.
        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        # Pop off the start tag (we dont want to return that to the caller)
        start = best_path.pop()
        assert start == self.tag_to_ix[START_TAG]  # Sanity check
        best_path.reverse()
        return path_score, best_path

    def neg_log_likelihood(self, sentence, tags):
        feats = self._get_lstm_features(sentence)
        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags)
        return forward_score - gold_score

    def forward(self, sentence):  # dont confuse this with _forward_alg above.
        # Get the emission scores from the BiLSTM
        lstm_feats = self._get_lstm_features(sentence)

        # Find the best path, given the features.
        score, tag_seq = self._viterbi_decode(lstm_feats)
        return score, tag_seq

In [19]:
class BiLSTM(nn.Module):
    def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim):
        super(BiLSTM, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)

        #self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
        self.word_embeds = nn.Embedding.from_pretrained(torch.FloatTensor(glove_embeds)).to(device)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
                            num_layers=1, bidirectional=True)

        # Maps the output of the LSTM into tag space.
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)

        self.hidden = self.init_hidden()

    def init_hidden(self):
        return (torch.randn(2, 1, self.hidden_dim // 2).to(device),
                torch.randn(2, 1, self.hidden_dim // 2).to(device))
    
    def _get_lstm_features(self, sentence):
        self.hidden = self.init_hidden()
        embeds = self.word_embeds(sentence).view(len(sentence), 1, -1)
        lstm_out, self.hidden = self.lstm(embeds, self.hidden)
        lstm_out = lstm_out.view(len(sentence), self.hidden_dim)
        lstm_feats = self.hidden2tag(lstm_out)
        return lstm_feats
    
    def forward(self, sentence):
        # Get the emission scores from the BiLSTM
        lstm_feats = self._get_lstm_features(sentence)
        score = F.log_softmax(lstm_feats,dim=1)
        return score

In [20]:
def extract_ner(tag_list):
    res = []
    tmp = []
    for i,tag in enumerate(tag_list):
        if tag in ["B-ORG","B-MISC",'B-PER','B-LOC']:
            tmp.append(i)
            if i<len(tag_list)-1:
                if tag_list[i+1]=='O':
                    res.append("_".join([str(j) for j in tmp]))
                    tmp = []
                elif tag_list[i+1]==tag:
                    continue
                elif tag_list[i+1] in ["B-ORG","B-MISC",'B-PER','B-LOC'] and tag_list[i+1]!=tag:
                    res.append("_".join([str(j) for j in tmp]))
                    tmp = []
                else:
                    continue
            else:
                res.append("_".join([str(j) for j in tmp]))
                tmp = []
        elif tag in ["I-ORG","I-MISC",'I-PER','I-LOC']:
            if i>0 and tag_list[i-1] in ["B-ORG","B-MISC",'B-PER','B-LOC']:
                if tag[-3:]==tag_list[i-1][-3:]:
                    tmp.append(i)
                    res.append("_".join([str(j) for j in tmp]))
                    tmp = []
                else:
                    res.append("_".join([str(j) for j in tmp]))
                    tmp = []
    return res
        
ix_to_tag = {0: '<START>', 1: '<STOP>', 2: 'B-ORG', 3: 'O', 4: 'B-MISC', 5: 'B-PER', 6: 'I-PER', 7: 'B-LOC', 8: 'I-ORG', 9: 'I-MISC', 10: 'I-LOC'}
def evaluate(yture,ypred):
    """return f1-score"""
    N = 0 #正确标签数
    n_correct = 0 #正确预测标签数
    n_predict = 0 #预测数
    for tags,targets in zip(yture,ypred):
        tags = [ix_to_tag[i] for i in tags]
        targets = [ix_to_tag[i] for i in targets]
        ner_true = extract_ner(tags)
        ner_pred = extract_ner(targets)
        n_predict+=len(ner_pred)
        N+=len(ner_true)
        for ner in ner_pred:
            if ner in ner_true:
                n_correct+=1
    p = n_correct/n_predict if n_predict>0 else 0 #precision
    r = n_correct/N if N>0 else 0 #recall
    return p,r,2 * p * r / (p + r) if p + r else 0

# Eval

In [21]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

## BiLSTM_CRF

In [23]:
#BiLstmCrf model
EMBEDDING_DIM = 300
param_grid = [{'HIDDEN_DIM':128,'learning_rate':0.01,'num_epochs':100}]

for params in param_grid:
    print(params)
    HIDDEN_DIM = params['HIDDEN_DIM']
    learning_rate = params['learning_rate']
    num_epochs = params['num_epochs']

    model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM)
    model.to(device) #使用GPU

    optimizer = optim.SGD(model.parameters(), lr=learning_rate, weight_decay=1e-4)

    for epoch in range(num_epochs):
        ave_loss = 0
        rand_training = random.sample(training_tensors,100) #随机选择150例子训练
        for sentence, tags in tqdm(rand_training):
            # Step 1. Remember that Pytorch accumulates gradients.
            # We need to clear them out before each instance
            model.zero_grad()

            # Step 2. Get our inputs ready for the network, that is,
            # turn them into Tensors of word indices.
            sentence_in = sentence.to(device)
            targets = tags.to(device)

            # Step 3. Run our forward pass.
            loss = model.neg_log_likelihood(sentence_in, targets)
            ave_loss+=loss.cpu().detach().numpy().item()
            # Step 4. Compute the loss, gradients, and update the parameters by
            # calling optimizer.step()
            loss.backward()
            optimizer.step()
        ave_loss /= len(training_data)

    #validation
    with torch.no_grad():
        ypred = []
        ytrue = []
        for sentence, tags in dev_tensors:
            sentence_in = sentence.to(device)
            targets = model(sentence_in)[1]
            ypred.append(targets)
            ytrue.append(tags.numpy())
        p,r,f1 = evaluate(ytrue,ypred)
        print(f"loss={ave_loss}, dev_precision={p}, dev_recall={r}, dev_f1={f1}")
    torch.save(model, f'BiLstmCrf_model.pkl')

{'HIDDEN_DIM': 128, 'learning_rate': 0.01, 'num_epochs': 100}


100%|██████████| 100/100 [00:04<00:00, 22.61it/s]
100%|██████████| 100/100 [00:04<00:00, 21.09it/s]
100%|██████████| 100/100 [00:04<00:00, 23.02it/s]
100%|██████████| 100/100 [00:04<00:00, 22.57it/s]
100%|██████████| 100/100 [00:04<00:00, 20.63it/s]
100%|██████████| 100/100 [00:04<00:00, 20.30it/s]
100%|██████████| 100/100 [00:04<00:00, 24.18it/s]
100%|██████████| 100/100 [00:03<00:00, 26.86it/s]
100%|██████████| 100/100 [00:04<00:00, 23.02it/s]
100%|██████████| 100/100 [00:04<00:00, 23.78it/s]
100%|██████████| 100/100 [00:04<00:00, 22.51it/s]
100%|██████████| 100/100 [00:04<00:00, 21.73it/s]
100%|██████████| 100/100 [00:04<00:00, 22.56it/s]
100%|██████████| 100/100 [00:04<00:00, 23.10it/s]
100%|██████████| 100/100 [00:04<00:00, 24.06it/s]
100%|██████████| 100/100 [00:03<00:00, 26.32it/s]
100%|██████████| 100/100 [00:04<00:00, 24.11it/s]
100%|██████████| 100/100 [00:04<00:00, 23.96it/s]
100%|██████████| 100/100 [00:04<00:00, 21.45it/s]
100%|██████████| 100/100 [00:04<00:00, 22.97it/s]


loss=0.019490292571343364, dev_precision=0.8320111992533831, dev_recall=0.6004377841387439, dev_f1=0.6975061124694377


## BiLSTM

In [10]:
#BiLstm
EMBEDDING_DIM = 300
param_grid = [{'HIDDEN_DIM':128,'learning_rate':0.01,'num_epochs':300}]

for params in param_grid:
    print(params)
    HIDDEN_DIM = params['HIDDEN_DIM']
    learning_rate = params['learning_rate']
    num_epochs = params['num_epochs']

    model = BiLSTM(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM)
    model.to(device) #use GPU
    
    loss_function = nn.NLLLoss()
    optimizer = optim.SGD(model.parameters(), lr=learning_rate, weight_decay=1e-4)

    #train
    for epoch in range(num_epochs):
        ave_loss = 0
        rand_training = random.sample(training_tensors,150) #随机选择150例子训练
        for sentence, tags in tqdm(rand_training):
            # Step 1. Remember that Pytorch accumulates gradients.
            # We need to clear them out before each instance
            model.zero_grad()

            # Step 2. Get our inputs ready for the network, that is,
            # turn them into Tensors of word indices.
            sentence_in = sentence.to(device)
            targets = tags.to(device)
            
            # Step 3. Run our forward pass.
            tag_scores = model(sentence_in)

            # Step 4. Compute the loss, gradients, and update the parameters by
            # calling optimizer.step()
            loss = loss_function(tag_scores, targets)
            ave_loss+=loss.cpu().detach().numpy().item()
            
            loss.backward()
            optimizer.step()
        ave_loss /= len(training_data)

    with torch.no_grad():
        ypred = []
        ytrue = []
        for sentence, tags in dev_tensors:
            sentence_in = sentence.to(device)
            targets = model(sentence_in).cpu().detach().numpy()
            targets = np.argmax(targets,axis=1)
            ypred.append(targets)
            ytrue.append(tags.numpy())
        p,r,f1 = evaluate(ytrue,ypred)
        print(f"loss={ave_loss}, dev_precision={p}, dev_recall={r}, dev_f1={f1}")
    torch.save(model, f'BiLstm_model.pkl')

  self.word_embeds = nn.Embedding.from_pretrained(torch.FloatTensor(glove_embeds)).to(device)


{'HIDDEN_DIM': 128, 'learning_rate': 0.01, 'num_epochs': 300}


100%|██████████| 150/150 [00:01<00:00, 106.02it/s]
100%|██████████| 150/150 [00:01<00:00, 122.37it/s]
100%|██████████| 150/150 [00:01<00:00, 112.80it/s]
100%|██████████| 150/150 [00:01<00:00, 126.33it/s]
100%|██████████| 150/150 [00:01<00:00, 122.20it/s]
100%|██████████| 150/150 [00:01<00:00, 130.27it/s]
100%|██████████| 150/150 [00:01<00:00, 126.50it/s]
100%|██████████| 150/150 [00:01<00:00, 118.37it/s]
100%|██████████| 150/150 [00:01<00:00, 125.59it/s]
100%|██████████| 150/150 [00:01<00:00, 115.56it/s]
100%|██████████| 150/150 [00:01<00:00, 127.07it/s]
100%|██████████| 150/150 [00:01<00:00, 139.19it/s]
100%|██████████| 150/150 [00:01<00:00, 113.29it/s]
100%|██████████| 150/150 [00:01<00:00, 116.86it/s]
100%|██████████| 150/150 [00:01<00:00, 126.96it/s]
100%|██████████| 150/150 [00:01<00:00, 117.34it/s]
100%|██████████| 150/150 [00:01<00:00, 104.97it/s]
100%|██████████| 150/150 [00:01<00:00, 104.93it/s]
100%|██████████| 150/150 [00:01<00:00, 118.56it/s]
100%|██████████| 150/150 [00:01

loss=0.004547394324984289, dev_precision=0.76602658788774, dev_recall=0.4366054891395858, dev_f1=0.5561990561990563


# test

In [11]:
model = torch.load('BiLstmCrf_model.pkl')
model.to(device)

BiLSTM_CRF(
  (word_embeds): Embedding(23624, 300)
  (lstm): LSTM(300, 64, bidirectional=True)
  (hidden2tag): Linear(in_features=128, out_features=11, bias=True)
)

In [12]:
# 检查precision
with torch.no_grad():
    with open("result.txt",'w') as f:
        for sentence,sentence_list in zip(test_tensors,test_data):
            sentence_in = sentence.to(device)
            targets = model(sentence_in)[1]
            targets = [ix_to_tag[i] for i in targets]
            
            for w,t in zip(sentence_list,targets):
                f.write(w+"\t"+t+"\n")
            f.write("\n")