In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

In [2]:
import copy
import time

# 定义模型

In [3]:
from DataProcess.model_utils import tensorized, sort_by_lengths, cal_loss, cal_lstm_crf_loss
from models import TrainingConfig, LSTMConfig
from models import BiLSTM, BiLSTM_CRF

In [4]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [5]:
class BiLSTM_Model:
    def __init__(self, wor2id, tag2id, vocab_size, out_size, crf=True):
        '''
        vocab_size: 词典的大小
        out_size: 标注的种类
        '''
        # 加载模型参数
        self.emb_size = LSTMConfig.emb_size
        self.hidden_size = LSTMConfig.hidden_size
        self.crf = crf
        
        self.word2id = word2id
        self.tag2id = tag2id
        
        # 根据是否添加crf初始化不同的模型，选择不一样的损失计算函数
        if not crf:
            self.model = BiLSTM(vocab_size, self.emb_size,
                               self.hidden_size, out_size, tag2id).to(device)
            self.cal_loss_func = cal_loss
        else:
            self.model = BiLSTM_CRF(vocab_size, self.emb_size,
                                   self.hidden_size, out_size, tag2id).to(device)
            self.cal_loss_func = cal_lstm_crf_loss
            
        # 加载训练参数
        self.epochs = TrainingConfig.epochs
        self.print_step = TrainingConfig.print_step
        self.lr = TrainingConfig.lr 
        self.batch_size = TrainingConfig.batch_size
        
        # 初始化优化器
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.lr)
        
        # 初始化其他指标
        self.step = 0
        self._best_val_loss = 1e18
        self.best_model = None
        
    def train(self, word_lists, tag_lists, dev_word_lists, dev_tag_lists):
        # 对数据集按照长度进行排序
        word_lists, tag_lists, _ = sort_by_lengths(word_lists, tag_lists)
        dev_word_lists, dev_tag_lists, _ = sort_by_lengths(dev_word_lists, dev_tag_lists)
        
        B = self.batch_size
        total_step = (len(word_lists) // B + 1)
        for e in range(1, self.epochs+1):
            step = 0
            losses = 0. 
            for ind in range(0, len(word_lists), B):
                step += 1 
                batch_sents = word_lists[ind:ind+B]
                batch_tags = tag_lists[ind:ind+B]
                losses += self.train_step(batch_sents, batch_tags)
                if step % self.print_step == 0:
                    print("Epoch {} | step/total_step: {}/{} {:.2f}% | Loss: {:.4f}".format(
                            e, step, total_step, 
                            100. * step / total_step,
                            losses / self.print_step))
                losses = 0.
        # 每轮结束测试在验证集上的性能，保存最好的一个
        val_loss = self.validate(dev_word_lists, dev_tag_lists)
        print("Epoch: {} | Val Loss: {:.4f}".format(e, val_loss))
        
    def train_step(self, batch_sents, batch_tags):
        self.model.train()
        # 准备数据
        ## 将索引进行向量化，按照第0个索引的位置为最大长度，获取每个单词的index
        tensorized_sents, lengths = tensorized(batch_sents, self.word2id)
        tensorized_sents = tensorized_sents.to(device)
        
        ## 同上，只不过是将tag转换为索引
        targets, lengths = tensorized(batch_tags, self.tag2id)
        targets = targets.to(device)
        
        # 前向传播
        scores = self.model(tensorized_sents, lengths)
        
        # 计算损失，更新参数
        self.optimizer.zero_grad()
        loss = self.cal_loss_func(scores, targets, self.tag2id).to(device)
        loss.backward()
        self.optimizer.step()
        
        return loss.item()
    
    def validate(self, dev_word_lists, dev_tag_lists):
        self.model.eval()
        with torch.no_grad():
            val_losses = 0.
            val_step = 0 
            for ind in range(0, len(dev_word_lists), self.batch_size):
                val_step += 1 
                # 准备batch数据
                batch_sents = dev_word_lists[ind:ind+self.batch_size]
                batch_tags = dev_tag_lists[ind:ind+self.batch_size]
                tensorized_sents, lengths = tensorized(batch_sents, self.word2id)
                tensorized_sents = tensorized_sents.to(device)
                targets, lengths = tensorized(batch_tags, self.tag2id)
                targets = targets.to(device)
                
                # 前向传播
                scores = self.model(tensorized_sents, lengths)
                # 计算损失
                loss = self.cal_loss_func(
                    scores, targets, self.tag2id
                ).to(device)
                val_losses += loss.item()
            val_loss = val_losses / val_step
            if val_loss < self._best_val_loss:
                print("保存模型...")
                self.best_model = copy.deepcopy(self.model)
                self._best_val_loss = val_loss
            return val_loss
        
    def test(self, word_lists, tag_lists):
        '''
        返回最佳模型在测试集上的结果
        '''
        ## 将要预测的单词列表和tag列表按照长度进行排序，并进行向量化
        word_lists, tag_lists, indices = sort_by_lengths(word_lists, tag_lists)
        tensorized_sents, lengths = tensorized(word_lists, self.word2id)
        tensorized_sents = tensorized_sents.to(device)
#         lengths = lengths.to(device)
        
#         self.best_model.to(device)
        self.best_model.eval()
        ## 得到预测的tag数据
        with torch.no_grad():
            batch_tagids = self.best_model.test(tensorized_sents, lengths)
            
        # 将id转化为标注
        pred_tag_lists = []
        id2tag = dict((id_, tag) for tag, id_ in self.tag2id.items())
        for i, ids in enumerate(batch_tagids):
            tag_list = []
            if self.crf:
                ## 对于crf要去掉最后的<end>标记
                for j in range(lengths[i] - 1):
                    tag_list.append(id2tag[ids[j].item()])
            else:
                for j in range(lengths[i]):
                    tag_list.append(id2tag[ids[j].item()])
            pred_tag_lists.append(tag_list)
        
        # indices存有根据长度排序的顺序信息
        ind_maps = sorted(list(enumerate(indices)), key=lambda e: e[1])
        ## 获取原始的顺序
        indices, _ = list(zip(*ind_maps))
        pred_tag_lists = [pred_tag_lists[i] for i in indices]
        tag_lists = [tag_lists[i] for i in indices]
        
        return pred_tag_lists, tag_lists

# 读取数据

In [6]:
from DataProcess.data import build_corpus

In [7]:
train_word_lists, train_tag_lists, word2id, tag2id = build_corpus("train")
dev_word_lists, dev_tag_lists = build_corpus("dev", make_vocab=False)
test_word_lists, test_tag_lists = build_corpus("test", make_vocab=False)

# 训练模型

In [8]:
from DataProcess.utils import extend_maps, preprocess_data_for_lstmcrf
from DataProcess.utils import save_model, flatten_lists
from DataProcess.evaluating import Metrics

In [9]:
# 定义bilstm+crf的训练函数
def bilstm_train_and_eval(train_data, dev_data, test_data, word2id, tag2id,
                         crf=True, remove_O=False):
    train_word_lists, train_tag_lists = train_data
    dev_word_lists, dev_tag_lists = dev_data
    test_word_lists, test_tag_lists = test_data
    start = time.time()
    vocab_size = len(word2id)
    out_size = len(tag2id)
    bilstm_model = BiLSTM_Model(word2id, tag2id, vocab_size, out_size, crf=crf)
    bilstm_model.train(train_word_lists, train_tag_lists, 
                      dev_word_lists, dev_tag_lists)
    model_name = "bilstm_crf" if crf else "bilstm"
    save_model(bilstm_model, "./ckpts/"+model_name+".pkl")
    
    print("训练完毕，共用时{}秒".format(int(time.time()-start)))
    print("评估 {} 模型中...".format(model_name))
    pred_tag_lists, test_tag_lists = bilstm_model.test(test_word_lists,
                                                      test_tag_lists)
    
    metrics = Metrics(test_tag_lists, pred_tag_lists, remove_O=remove_O)
    metrics.report_scores()
    metrics.report_confusion_matrix()
    
    return pred_tag_lists

## 训练并测试bilstm

In [10]:
print("正在训练评估BiLSTM模型....")
lstm_pred = bilstm_train_and_eval((train_word_lists, train_tag_lists),
                                 (dev_word_lists, dev_tag_lists), 
                                 (test_word_lists, test_tag_lists),
                                 word2id, tag2id,
                                 crf=False)

正在训练评估BiLSTM模型....
Epoch 1 | step/total_step: 15/60 25.00% | Loss: 0.1432
Epoch 1 | step/total_step: 30/60 50.00% | Loss: 0.0839
Epoch 1 | step/total_step: 45/60 75.00% | Loss: 0.0584
Epoch 1 | step/total_step: 60/60 100.00% | Loss: 0.0859
Epoch 2 | step/total_step: 15/60 25.00% | Loss: 0.0459
Epoch 2 | step/total_step: 30/60 50.00% | Loss: 0.0400
Epoch 2 | step/total_step: 45/60 75.00% | Loss: 0.0292
Epoch 2 | step/total_step: 60/60 100.00% | Loss: 0.0395
Epoch 3 | step/total_step: 15/60 25.00% | Loss: 0.0242
Epoch 3 | step/total_step: 30/60 50.00% | Loss: 0.0222
Epoch 3 | step/total_step: 45/60 75.00% | Loss: 0.0184
Epoch 3 | step/total_step: 60/60 100.00% | Loss: 0.0262
Epoch 4 | step/total_step: 15/60 25.00% | Loss: 0.0156
Epoch 4 | step/total_step: 30/60 50.00% | Loss: 0.0138
Epoch 4 | step/total_step: 45/60 75.00% | Loss: 0.0126
Epoch 4 | step/total_step: 60/60 100.00% | Loss: 0.0183
Epoch 5 | step/total_step: 15/60 25.00% | Loss: 0.0118
Epoch 5 | step/total_step: 30/60 50.00% | 



           precision    recall  f1-score   support
   M-CONT     1.0000    1.0000    1.0000        53
  M-TITLE     0.8824    0.8866    0.8845      1922
    M-EDU     0.9253    0.8994    0.9122       179
    E-PRO     0.7895    0.9091    0.8451        33
    M-LOC     0.8667    0.6190    0.7222        21
   E-NAME     0.9725    0.9464    0.9593       112
    B-PRO     1.0000    0.6667    0.8000        33
    E-ORG     0.9343    0.8481    0.8891       553
   E-CONT     1.0000    1.0000    1.0000        28
   B-RACE     1.0000    1.0000    1.0000        14
    M-ORG     0.9359    0.9683    0.9518      4325
    E-EDU     0.9623    0.9107    0.9358       112
  B-TITLE     0.9394    0.8834    0.9105       772
    B-LOC     1.0000    0.5000    0.6667         6
    M-PRO     0.7375    0.8676    0.7973        68
   E-RACE     1.0000    1.0000    1.0000        14
   B-CONT     0.9655    1.0000    0.9825        28
   B-NAME     0.9341    0.7589    0.8374       112
   M-NAME     0.7660    0.8780 

## 训练并评估BiLSTM+CRF模型

In [10]:
crf_word2id, crf_tag2id = extend_maps(word2id, tag2id)
# 还要对数据进行一些额外处理
train_word_lists, train_tag_lists = preprocess_data_for_lstmcrf(train_word_lists, train_tag_lists)
dev_word_lists, dev_tag_lists = preprocess_data_for_lstmcrf(dev_word_lists, dev_tag_lists)
test_word_lists, test_tag_lists = preprocess_data_for_lstmcrf(test_word_lists, test_tag_lists, test=True)


lstmcrf_pred = bilstm_train_and_eval(
    (train_word_lists, train_tag_lists),
    (dev_word_lists, dev_tag_lists),
    (test_word_lists, test_tag_lists),
    crf_word2id, crf_tag2id
)

Epoch 1 | step/total_step: 15/60 25.00% | Loss: 5.4931
Epoch 1 | step/total_step: 30/60 50.00% | Loss: 2.2889
Epoch 1 | step/total_step: 45/60 75.00% | Loss: 1.0327
Epoch 1 | step/total_step: 60/60 100.00% | Loss: 0.4880
Epoch 2 | step/total_step: 15/60 25.00% | Loss: 1.8321
Epoch 2 | step/total_step: 30/60 50.00% | Loss: 1.1308
Epoch 2 | step/total_step: 45/60 75.00% | Loss: 0.5760
Epoch 2 | step/total_step: 60/60 100.00% | Loss: 0.3126
Epoch 3 | step/total_step: 15/60 25.00% | Loss: 0.9648
Epoch 3 | step/total_step: 30/60 50.00% | Loss: 0.6490
Epoch 3 | step/total_step: 45/60 75.00% | Loss: 0.3618
Epoch 3 | step/total_step: 60/60 100.00% | Loss: 0.1841
Epoch 4 | step/total_step: 15/60 25.00% | Loss: 0.6024
Epoch 4 | step/total_step: 30/60 50.00% | Loss: 0.4230
Epoch 4 | step/total_step: 45/60 75.00% | Loss: 0.2552
Epoch 4 | step/total_step: 60/60 100.00% | Loss: 0.1357
Epoch 5 | step/total_step: 15/60 25.00% | Loss: 0.4383
Epoch 5 | step/total_step: 30/60 50.00% | Loss: 0.2991
Epoch 



           precision    recall  f1-score   support
   E-NAME     0.9358    0.9107    0.9231       112
   B-CONT     0.9032    1.0000    0.9492        28
   M-CONT     0.9138    1.0000    0.9550        53
   E-CONT     1.0000    1.0000    1.0000        28
    B-EDU     0.9505    0.8571    0.9014       112
    M-LOC     0.9333    0.6667    0.7778        21
  E-TITLE     0.9754    0.9741    0.9747       772
    E-PRO     0.9118    0.9394    0.9254        33
    B-LOC     0.0000    0.0000    0.0000         6
   E-RACE     1.0000    1.0000    1.0000        14
    M-PRO     0.6742    0.8824    0.7643        68
    B-PRO     0.8333    0.6061    0.7018        33
  B-TITLE     0.9296    0.9067    0.9180       772
    E-LOC     1.0000    0.1667    0.2857         6
    E-ORG     0.9268    0.8698    0.8974       553
    E-EDU     0.9541    0.9286    0.9412       112
        O     0.9682    0.9509    0.9595      5190
   B-NAME     0.9694    0.8482    0.9048       112
  M-TITLE     0.9196    0.8866 