In [1]:
import gc
from tqdm import tqdm
import pickle
import multiprocessing
from collections import defaultdict
from nltk.corpus import conll2000
from nltk.chunk import tree2conlltags
from sklearn.preprocessing import LabelEncoder
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchcrf import CRF

In [3]:
def str_encoder(raw_list, max_pad=99999):
    """Integer-encode tag sequences and pad them into one rectangular array.

    A ``LabelEncoder`` is fitted on every tag that occurs in ``raw_list``.
    Each sentence is then transformed, shifted by +1 so that the value 0 is
    left free for padding, and padded/truncated to a common width.

    Args:
        raw_list: list of sentences, each sentence being a list of tag strings.
        max_pad: upper bound on the row width. It is clamped to the length of
            the longest sentence; sentences longer than ``max_pad`` are
            truncated.

    Returns:
        tuple ``(encoded_list, label_encoder)`` where ``encoded_list`` is a
        float array with one row per sentence and ``max_pad`` (after
        clamping) columns, and ``label_encoder`` is the fitted encoder.
    """
    all_tag_list = [tag for raw_in_sent in raw_list for tag in raw_in_sent]
    max_sent_lenth = max((len(raw_in_sent) for raw_in_sent in raw_list), default=0)

    # Never pad beyond the longest sentence actually present.
    max_pad = min(max_pad, max_sent_lenth)

    label_encoder = LabelEncoder()
    label_encoder.fit(all_tag_list)

    # The width must be max_pad, not the longest sentence length: text_padding
    # returns rows of exactly max_pad entries, so a wider buffer makes the row
    # assignment fail as soon as max_pad is smaller than the longest sentence.
    encoded_list = np.zeros((len(raw_list), max_pad))
    for pos, raw_in_sent in enumerate(raw_list):
        encoded_list[pos] = text_padding(label_encoder.transform(raw_in_sent) + 1, max_pad)

    return encoded_list, label_encoder

def text_padding(real_value_sequence, max_pad):
    """Bring a sequence to exactly ``max_pad`` entries.

    Sequences longer than ``max_pad`` are cut to their first ``max_pad``
    entries and returned as a slice of the input. All other sequences are
    extended with trailing zeros via ``np.concatenate``.
    """
    missing = max_pad - len(real_value_sequence)

    # Too long: keep only the leading max_pad entries.
    if missing < 0:
        return real_value_sequence[:max_pad]

    # Short enough (or exact fit): append `missing` zeros.
    return np.concatenate((real_value_sequence, np.zeros(missing)))

In [4]:
class ConllDataset(Dataset):
    """Dataset that pairs the part-of-speech entry and the IOB entry at the same index."""

    def __init__(self, part_speech, iob):
        # Both containers are stored as given and indexed in lockstep.
        self.part_speech, self.iob = part_speech, iob

    def __len__(self):
        # The part-of-speech container defines the number of samples.
        return len(self.part_speech)

    def __getitem__(self, idx):
        features = self.part_speech[idx]
        labels = self.iob[idx]
        return features, labels
    
class TagNet(nn.Module):
    """Sequence tagger: embedding -> 2-layer bidirectional LSTM -> ReLU -> linear -> CRF."""

    def __init__(self, n_words, n_tags, embedding_dim=50, lstm_dim=32):
        """Create the layers.

        Args:
            n_words: number of rows of the embedding table.
            n_tags: number of tags handled by the linear and CRF layers.
            embedding_dim: width of each embedding vector.
            lstm_dim: hidden size of each LSTM direction.
        """
        super().__init__()

        self.embedding_layer = nn.Embedding(n_words, embedding_dim=embedding_dim)
        self.lstm_layer = nn.LSTM(
            embedding_dim,
            lstm_dim,
            num_layers=2,
            dropout=0.2,
            bidirectional=True,
            batch_first=True,
        )
        # The LSTM is bidirectional, so the linear layer takes 2 * lstm_dim features.
        self.linear_layer = nn.Linear(2 * lstm_dim, n_tags)
        self.crf_layer = CRF(n_tags, batch_first=True)

    def forward(self, x, y, mask, decode=False):
        """Run the network.

        With ``decode`` equal to ``False`` the negated, mean-reduced output of
        the CRF layer for the tags ``y`` is returned (used as the training
        loss). Otherwise ``y`` is ignored and the decoded tag sequences are
        returned.
        """
        embedded = self.embedding_layer(x)
        lstm_out, _ = self.lstm_layer(embedded)
        emissions = self.linear_layer(F.relu(lstm_out))

        # Kept as an equality test so non-bool values of `decode` behave as before.
        if decode == False:
            crf_score = self.crf_layer(emissions, y, mask=mask, reduction='mean')
            # Negate the CRF output to obtain the quantity that is minimised.
            return -crf_score

        return self.crf_decoder(emissions, mask)

    def crf_decoder(self, x, mask):
        """Decode tag sequences from the emission scores ``x`` under ``mask``."""
        return self.crf_layer.decode(x, mask)

In [5]:
# Pull the part-of-speech tag and the IOB tag of every token out of the
# conll2000 corpus, keeping one inner list per sentence.
train_part_speech_list = []
train_iob_list = []
for tree in conll2000.chunked_sents():
    # tree2conlltags yields one tuple per token; index 1 is stored as the
    # part-of-speech tag and index 2 as the IOB tag.
    tags = tree2conlltags(tree)
    train_part_speech_list.append([tg[1] for tg in tags])
    train_iob_list.append([tg[2] for tg in tags])

In [6]:
# Encode the part-of-speech tags and the IOB tags separately.
encoded_part_speech, part_speech_encoder = str_encoder(train_part_speech_list)
encoded_iob, iob_encoder = str_encoder(train_iob_list)

# Wrap the encoded arrays so they can be fed to the network in shuffled batches.
conll_corpus_dataset = ConllDataset(encoded_part_speech, encoded_iob)
conll_loader = DataLoader(conll_corpus_dataset, batch_size=64, shuffle=True, num_workers=multiprocessing.cpu_count())

# Number of samples and number of distinct ids per vocabulary.
# str_encoder shifts every encoded class by +1 and uses 0 for padding, so the
# ids run from 0 to len(classes_). Deriving the sizes from the encoders (rather
# than counting unique values in the padded arrays) stays correct even when no
# padding value happens to occur in the data.
sample_num = encoded_part_speech.shape[0]
n_words = len(part_speech_encoder.classes_) + 1
n_tags = len(iob_encoder.classes_) + 1

In [107]:
# Initialise the network and the optimiser (the loss is produced by the model's CRF layer).
# Use the GPU when one is present, otherwise fall back to the CPU so the cell still runs.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
net = TagNet(n_words, n_tags).to(device)
optimizer = torch.optim.Adam(net.parameters())

# Training
EPOCHS = 4
for epk in range(EPOCHS):
    mean_crf_loss = 0.0
    n_batches = 0

    # Wrap the loader itself (not enumerate) so tqdm can read its length and
    # display a real progress bar with a total.
    for x, y in tqdm(conll_loader):
        x = x.long().to(device)
        y = y.long().to(device)
        # Padding is encoded as 0 and real tags start at 1.
        mask = y > 0

        optimizer.zero_grad()  # reset accumulated gradients
        output = net(x, y, mask)  # forward pass, returns the loss
        output.backward()  # backward pass
        optimizer.step()  # parameter update

        # Accumulate a plain Python float: adding the tensor itself would keep
        # every batch's autograd graph alive until the end of the epoch.
        mean_crf_loss += output.item()
        n_batches += 1

    # max(..., 1) avoids a division by zero if the loader yielded nothing.
    mean_crf_loss /= max(n_batches, 1)
    print("Loss on epoch %d: %.5f" % (epk, mean_crf_loss))

    if device.type == 'cuda':
        torch.cuda.empty_cache()

172it [00:18,  9.32it/s]

Loss on epoch 0: 25.18171



172it [00:18,  9.40it/s]

Loss on epoch 1: 7.06643



172it [00:19,  8.99it/s]

Loss on epoch 2: 5.48121



172it [00:23,  7.38it/s]

Loss on epoch 3: 4.93338





In [108]:
# Save the model: only the parameters (state_dict) are written to disk.
torch.save(obj=net.state_dict(), f="../models/lstmcrf")

In [7]:
# Load the saved parameters into a freshly built network.
# The checkpoint is written from a model living on CUDA; map_location='cpu'
# lets it be deserialised without a GPU and matches the CPU-resident `net`
# created below.
# NOTE(review): torch.load unpickles the file - only load checkpoints from a trusted source.
model = torch.load("../models/lstmcrf", map_location='cpu')
net = TagNet(n_words, n_tags)
net.load_state_dict(model)
net.eval()

TagNet(
  (embedding_layer): Embedding(45, 50)
  (lstm_layer): LSTM(50, 32, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
  (linear_layer): Linear(in_features=64, out_features=8, bias=True)
  (crf_layer): CRF(num_tags=8)
)

In [8]:
def wikidata_dataset(dic_datas, encoder):
    """Encode the tag sequence of every sentence stored in ``dic_datas``.

    For each key, every sentence's ``sentence[1]`` is read as a sequence of
    pairs. The first elements and the second elements are collected into two
    lists, and the second list is passed through ``encoder.transform`` and
    shifted by +1.

    Sentences for which ``encoder.transform`` raises ``ValueError`` are
    skipped.

    Returns:
        ``defaultdict(list)`` mapping each key to a list of
        ``(first_elements, second_elements, encoded_second_elements)`` tuples.
    """
    data_dict = defaultdict(list)

    for key, value in tqdm(dic_datas.items()):
        for sentence in value:
            pairs = list(sentence[1])
            sent_raw_container = [pair[0] for pair in pairs]
            sent_tag_container = [pair[1] for pair in pairs]

            # Look the bucket up before encoding: this registers the key in
            # the defaultdict even when the sentence ends up being skipped.
            bucket = data_dict[key]
            try:
                encoded_tags = encoder.transform(sent_tag_container) + 1
            except ValueError:
                continue
            bucket.append((sent_raw_container, sent_tag_container, encoded_tags))

    return data_dict

def wiki_tagger(token_content, max_len=78):
    """Stack encoded sentences into one zero-padded 2-D array.

    Args:
        token_content: sequence of encoded sentences (each a 1-D sequence of
            numbers).
        max_len: number of columns of the result. Shorter sentences are
            right-padded with zeros, longer ones are truncated.

    Returns:
        float array of shape ``(len(token_content), max_len)``.
    """
    # The width follows max_len; a fixed width of 78 would break every call
    # that passes a different max_len.
    padded = np.zeros((len(token_content), max_len))

    for pos, sentence in enumerate(token_content):
        keep = min(len(sentence), max_len)
        # Copy the leading `keep` values; the rest of the row stays zero.
        padded[pos, :keep] = sentence[:keep]

    return padded

In [9]:
gc.collect()

# Load the word-token pickles for the animal and plant texts.
# NOTE(review): pickle.load can execute code embedded in the file - only use trusted files.
with open("../datas/token/animal_text_word_token.pkl", 'rb') as animal_file:
    animal_tokens = pickle.load(animal_file)
with open("../datas/token/plant_text_word_token.pkl", 'rb') as plant_file:
    plant_tokens = pickle.load(plant_file)

In [10]:
# Encode both token collections with the encoder fitted on the CoNLL part-of-speech tags.
animal_tokens_dict = wikidata_dataset(dic_datas=animal_tokens, encoder=part_speech_encoder)
plant_tokens_dict = wikidata_dataset(dic_datas=plant_tokens, encoder=part_speech_encoder)

100%|██████████| 2419/2419 [00:21<00:00, 114.77it/s]
100%|██████████| 14510/14510 [03:00<00:00, 80.21it/s] 


In [11]:
species_collector = defaultdict(list)  # key -> list of (sentence items, decoded IOB labels)

# Predict the IOB labels.
# Use the GPU when one is present, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
net = net.to(device)
# Make sure dropout is disabled even if the training cell was the last one to touch `net`.
net.eval()
with torch.no_grad():
    for text_dict in [animal_tokens_dict, plant_tokens_dict]:
        for key in tqdm(text_dict.keys()):
            sentences = text_dict[key]
            if not sentences:
                # Nothing to tag for this key; skip instead of running the
                # network on an empty batch.
                continue

            text = [t[0] for t in sentences]
            tag_label_encoded = [t[2] for t in sentences]

            in_tensor = torch.from_numpy(wiki_tagger(tag_label_encoded)).long().to(device)
            # Padding is encoded as 0 and real tags start at 1.
            mask = in_tensor > 0
            output = net(in_tensor, None, mask, decode=True)

            # Decode the label ids (undoing the +1 shift) and store each result
            # as "(first elements of the sentence's pairs, IOB labels)".
            for t, iob in zip(text, output):
                species_collector[key].append((t, iob_encoder.inverse_transform(np.asarray(iob) - 1)))

        gc.collect()

100%|██████████| 2398/2398 [03:14<00:00, 12.31it/s]
100%|██████████| 14497/14497 [27:54<00:00,  8.65it/s] 


In [12]:
# Drop the large intermediates and reclaim memory before serialising the results.
del animal_tokens, plant_tokens, animal_tokens_dict, plant_tokens_dict
gc.collect()

# Write the collected results with pickle protocol 4.
with open('../datas/tuple/data_tagger_after_lstmcrf.pkl', 'wb') as out_file:
    pickle.dump(species_collector, out_file, protocol=4)