# 数据准备+预处理

In [1]:
import torch
import pickle as pk
import numpy as np
from tqdm import tqdm
import random
import gensim
import os

# Pick the first CUDA device when one is available, otherwise run on the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
if device.type == 'cuda':
    # Make GPU 0 the default device for subsequent CUDA allocations.
    torch.cuda.set_device(0)

# Load the pickled dataset; only the NER sentences (x_ner) and their tag
# sequences (y_ner) are used below.
# NOTE(review): pickle can execute arbitrary code on load -- only open
# trusted files here.
with open('data/data-intent1-ner.pkl', 'rb') as data_file:
    x_intent, y_intent, x_ner, y_ner = pk.load(data_file)

# Pair every token sequence with its tag sequence.
train_data = list(zip(x_ner, y_ner))

# Token -> id map: ids 1..N for the observed tokens, 0 for padding and
# N+1 for unknown tokens.
tokens = {token for sentence in x_ner for token in sentence}
vocab = {token: idx for idx, token in enumerate(tokens, start=1)}
vocab['<PAD>'] = 0
vocab['<UNK>'] = len(vocab)

# Tag -> id map with 'O' pinned to id 0 (the value batches are padded with).
# NOTE(review): the remaining ids follow set iteration order, which for
# strings can differ between interpreter runs; the mapping that was actually
# used is the one saved in parameter.pkl.
tags = {tag for sentence_tags in y_ner for tag in sentence_tags}
tag2label = ['O'] + list(tags - {'O'})
tag2label = {tag: idx for idx, tag in enumerate(tag2label)}

# Output locations: checkpoints and the parameter dump go to model/ner/.
out_path = 'model/'
model_path = os.path.join(out_path, "ner/")
# makedirs creates both levels and tolerates directories that already exist.
os.makedirs(model_path, exist_ok=True)

# Hyper-parameters and lookup tables shared by batching, the model and training.
parameter = {
    'min_count_word':1,
    'batch_size':32,
    'epoch':10,
    'hid_dim':300,
    'dropout':0.5,
    'lr':0.001,
    'tag2label':tag2label,
    'num_tags':len(tag2label),
    'd_model':768,
    'shuffle':True,
    'vocab':vocab,
    'model_path':model_path,
    'num_unknow':0,
    'n_layers':2,
    'device':device,
    'vocab_size':len(vocab),
}

def batch_yield(data,parameter,shuffle = True):
    """Yield padded mini-batches of token ids and label ids.

    Args:
        data: iterable of ``(tokens, tags)`` pairs.
        parameter: dict providing ``'vocab'`` (token -> id; needs a
            ``'<UNK>'`` entry when out-of-vocabulary tokens occur),
            ``'tag2label'`` (tag -> id) and ``'batch_size'``.
        shuffle: when true, batches are drawn from a shuffled copy of
            ``data``; the caller's sequence is left untouched.

    Yields:
        ``(seqs, labels, is_last)`` where ``seqs`` and ``labels`` are int64
        tensors right-padded with 0 to the longest sequence in the batch,
        and ``is_last`` is True only for the final batch. Nothing is yielded
        when ``data`` is empty.
    """
    vocab = parameter['vocab']
    tag2label = parameter['tag2label']
    batch_size = parameter['batch_size']

    def seq2id(seq):
        # Out-of-vocabulary tokens share the '<UNK>' id.
        return [vocab[word] if word in vocab else vocab['<UNK>'] for word in seq]

    def pad_batch(rows):
        # Right-pad every row with 0 up to the longest row in this batch.
        max_len = max(len(row) for row in rows)
        padded = [row + [0] * (max_len - len(row)) for row in rows]
        # Explicit int64 so the dtype does not depend on the platform's
        # default NumPy integer width.
        return torch.tensor(padded, dtype=torch.long)

    # Work on a copy so shuffling never reorders the caller's data.
    samples = list(data)
    if shuffle:
        random.shuffle(samples)

    seqs, labels = [], []
    for tokens, tags in tqdm(samples):
        # Flush a full batch before adding the next sample; this keeps at
        # least one sample for the final batch, which carries is_last=True.
        if len(seqs) == batch_size:
            yield pad_batch(seqs), pad_batch(labels), False
            seqs, labels = [], []
        seqs.append(seq2id(tokens))
        labels.append([tag2label[tag] for tag in tags])
    if seqs:
        yield pad_batch(seqs), pad_batch(labels), True

# Persist the parameter dict (vocab, tag mapping, hyper-parameters) in the
# model directory; the context manager guarantees the file is flushed and closed.
with open(os.path.join(parameter['model_path'], 'parameter.pkl'), 'wb') as param_file:
    pk.dump(parameter, param_file)
            
# data = batch_yield(train_data,parameter)
# seqs, labels,keys = next(data)
# seqs.shape,labels.shape,seqs[:2],labels[:2]


In [2]:
# Display the tag -> label-id mapping stored in the parameter dict.
parameter['tag2label']

{'O': 0,
 'I-km1': 1,
 'E-tag': 2,
 'I-kg': 3,
 'E-title': 4,
 'E-kg': 5,
 'B-tag': 6,
 'E-class': 7,
 'E-author': 8,
 'B-km2': 9,
 'I-class': 10,
 'I-tag': 11,
 'B-km1': 12,
 'S-author': 13,
 'S-class': 14,
 'S-kg': 15,
 'E-km2': 16,
 'B-author': 17,
 'B-kg': 18,
 'B-class': 19,
 'I-km2': 20,
 'I-title': 21,
 'S-title': 22,
 'I-author': 23,
 'E-km1': 24,
 'B-title': 25}

# 模型构建及模型训练

In [3]:
import torch.nn.functional as F # pytorch 激活函数的类
from torch import nn,optim # 构建模型和优化器
from torchcrf import CRF


# BiLSTM + CRF model for NER
class bilstm_crf(nn.Module):
    """Embedding -> bidirectional LSTM -> linear projection to per-tag scores.

    A ``CRF`` layer is created as ``self.crf`` but is not applied inside
    ``forward``; ``forward`` returns the raw per-tag scores.
    """

    def __init__(self, parameter):
        """Build the layers from the hyper-parameter dict.

        Reads the keys ``vocab_size``, ``d_model``, ``hid_dim``,
        ``n_layers``, ``dropout`` and ``num_tags`` from ``parameter``.
        """
        super().__init__()
        emb_dim = parameter['d_model']
        lstm_units = parameter['hid_dim']
        n_tags = parameter['num_tags']

        # Index 0 is the padding id, so its embedding stays fixed at zero.
        self.embedding = nn.Embedding(parameter['vocab_size'], emb_dim, padding_idx=0)
        self.lstm = nn.LSTM(
            input_size=emb_dim,
            hidden_size=lstm_units,
            num_layers=parameter['n_layers'],
            bidirectional=True,
            batch_first=True,
            dropout=parameter['dropout'],
        )
        # Forward and backward hidden states are concatenated, hence 2x.
        self.fc = nn.Linear(2 * lstm_units, n_tags)
        self.crf = CRF(n_tags, batch_first=True)

    def forward(self, x):
        """Return per-token tag scores for a batch of token ids.

        ``x`` is assumed to be a (batch, seq_len) integer tensor, matching
        ``batch_first=True`` -- TODO confirm against callers.
        """
        embedded = self.embedding(x)
        # Only the per-step outputs are needed; final (h, c) states are dropped.
        lstm_out, _ = self.lstm(embedded)
        return self.fc(lstm_out)
    
import os
import shutil
import pickle as pk
from torch.utils.tensorboard import SummaryWriter


# Build the model and move it to the configured device.
model = bilstm_crf(parameter).to(parameter['device'])

# Switch to training mode.
model.train()

# Optimizer (the CRF negative log-likelihood below is the loss).
optimizer = torch.optim.SGD(model.parameters(),lr=0.00005, momentum=0.95, nesterov=True)
# Alternative optimizer kept for reference:
# optimizer = torch.optim.Adam(model.parameters(),lr = parameter['lr'], \
#                              weight_decay = 0.01)

# Learning-rate schedule: multiply the lr by 0.9 every 10 scheduler steps.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.9)


# Training loop
min_loss = float('inf')
for epoch in range(parameter['epoch']):
    # Start every epoch with an empty list so the reported mean only covers
    # this epoch's batches.
    loss_cal = []
    # A fresh generator per epoch; iterating it to exhaustion visits every
    # batch, including the final one flagged as last.
    for inputs, targets, _ in batch_yield(train_data,parameter):
        inputs = inputs.long().to(parameter['device'])
        targets = targets.long().to(parameter['device'])
        out = model(inputs)
        # The CRF layer returns a log-likelihood; negate it to get a loss.
        # NOTE(review): no mask is passed, so padded positions (label id 0)
        # are scored like real tokens -- consider mask=(inputs != 0) once it
        # is confirmed that no sentence is empty.
        loss = -model.crf(out,targets)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        loss_cal.append(loss.item())
    if not loss_cal:
        raise RuntimeError('batch_yield produced no batches; train_data is empty')
    epoch_loss = sum(loss_cal)/len(loss_cal)
    # Checkpoint (and log) only when the mean epoch loss improves.
    if epoch_loss < min_loss:
        min_loss = epoch_loss
        torch.save(model.state_dict(), parameter['model_path']+'/bilstm_crf.h5')
        print('epoch [{}/{}], Loss: {:.4f}'.format(epoch+1, \
                                                       parameter['epoch'],epoch_loss))
    scheduler.step()

  score = torch.where(mask[i].unsqueeze(1), next_score, score)
100%|████████████████████████████████████████████████████████████████████████████| 42534/42534 [11:33<00:00, 61.35it/s]


epoch [1/10], Loss: 73.6042


100%|████████████████████████████████████████████████████████████████████████████| 42534/42534 [10:51<00:00, 65.26it/s]


epoch [2/10], Loss: 5.3350


100%|████████████████████████████████████████████████████████████████████████████| 42534/42534 [10:57<00:00, 64.72it/s]


epoch [3/10], Loss: 2.3925


100%|████████████████████████████████████████████████████████████████████████████| 42534/42534 [12:36<00:00, 56.19it/s]


epoch [4/10], Loss: 1.3998


100%|████████████████████████████████████████████████████████████████████████████| 42534/42534 [12:51<00:00, 55.12it/s]


epoch [5/10], Loss: 0.9673


100%|████████████████████████████████████████████████████████████████████████████| 42534/42534 [11:41<00:00, 60.60it/s]


epoch [6/10], Loss: 0.7036


100%|████████████████████████████████████████████████████████████████████████████| 42534/42534 [12:03<00:00, 58.79it/s]


epoch [7/10], Loss: 0.5708


100%|████████████████████████████████████████████████████████████████████████████| 42534/42534 [11:17<00:00, 62.78it/s]


epoch [8/10], Loss: 0.4540


100%|████████████████████████████████████████████████████████████████████████████| 42534/42534 [10:57<00:00, 64.67it/s]


epoch [9/10], Loss: 0.3770


100%|████████████████████████████████████████████████████████████████████████████| 42534/42534 [10:42<00:00, 66.15it/s]


epoch [10/10], Loss: 0.3374
