# 训练char2vec

In [4]:
# 输出每个cell的运行时间
%load_ext autotime
# https://github.com/cpcloud/ipython-autotime
import warnings
warnings.filterwarnings("ignore")

The autotime extension is already loaded. To reload it, use:
  %reload_ext autotime
time: 981 µs


In [5]:
root_path = "data/round1/train/"
w2v_input_path = "model_file/char2vec_prepareData.txt"
w2v_output_path = "model_file/char2vec.model"

from Model import Char2VecTrainer
char2vec = Char2VecTrainer(root=root_path,w2v_file_path=w2v_output_path)

Using TensorFlow backend.


time: 7.17 s


In [6]:
char2vec.prepare_data()
char2vec.train(w2v_output_path,emb_size=256)

time: 7.7 s


In [7]:
char2vec_model = char2vec.load()

w2v的模型维度是：256
w2v的模型的词表总长是：2301
time: 50 ms


# 数据预处理

## 创建word2idx

In [8]:
from common.Entity import Document
from common.Utils import scan_files
from Data import DataSet

from sklearn.model_selection import ShuffleSplit

file_names = scan_files(root_path)

rs = ShuffleSplit(n_splits=1, test_size=.15, random_state=2019)
train_idx,test_idx = next(rs.split(file_names))

train_file_names = [file_names[idx] for idx in train_idx]
test_file_names = [file_names[idx] for idx in test_idx]

whole_set = DataSet(root_path,file_names,vocab_size=-1)
char2idx = whole_set.char2idx
del whole_set

time: 2.4 s


In [11]:
import pickle
pickle.dump(char2idx,open('test_data_model/word2idx.pkl','wb'))
char2idx = pickle.load(open('test_data_model/word2idx.pkl','rb'))
len(char2idx)

3246

time: 10 ms


## 创建emb_matrix

In [8]:
import numpy as np

vec_size = char2vec_model.wv.vector_size
emb_matrix = np.zeros(vec_size)

def random_vec(vec_size):
    vec = np.random.random(size=vec_size)
    vec = vec - vec.mean()
    return vec

for c in char2idx.keys():
    if c is "_padding":
        char2idx[c] = 0
    elif c is "_unk":
        emb = random_vec(vec_size)
        emb_matrix = np.vstack((emb_matrix,emb))
        char2idx[c] = 1
    else:
        if c in [" ","\n"]:
            idx = emb_matrix.shape[0]
            emb = random_vec(vec_size)
            emb_matrix = np.vstack((emb_matrix,emb))
            char2idx[c] = idx
        elif c not in char2vec_model.wv.vocab.keys():
            idx = char2idx["_unk"]
            char2idx[c] = idx
        else:
            idx = emb_matrix.shape[0]
            emb = char2vec_model.wv[c]
            emb_matrix = np.vstack((emb_matrix,emb))
            char2idx[c] = idx

time: 3.37 s


In [9]:
len(char2vec_model.wv.vocab)

2301

time: 2.99 ms


In [12]:
import pickle
pickle.dump(char2vec_model,open('test_data_model/emb_matrix.pkl','wb'))
emb_matrix = pickle.load(open('test_data_model/emb_matrix.pkl','rb'))

time: 29 ms


In [13]:
emb_matrix

<gensim.models.word2vec.Word2Vec at 0x2a29e9ee7f0>

time: 2 ms


## 读取并切分数据集

In [10]:
import pickle

rs = ShuffleSplit(n_splits=1, test_size=.20, random_state=2019)
train_idx,val_idx = next(rs.split(train_file_names))

train_file_names = [file_names[idx] for idx in train_idx]
val_file_names = [file_names[idx] for idx in val_idx]

trainset = DataSet(root_path,train_file_names,char2idx)
valset = DataSet(root_path,val_file_names,char2idx)
testset = DataSet(root_path,test_file_names,char2idx)

# 持久化
pickle.dump(trainset,open('pickle_file/trainset.pkl','wb'))
pickle.dump(valset,open('pickle_file/valset.pkl','wb'))
pickle.dump(testset,open('pickle_file/testset.pkl','wb'))

time: 1.1 s


### 向量化 + 滑动窗切分句子

In [11]:
from Data import DataProcessor
import pickle
data_processors = []

for dataset in [trainset,valset,testset]:
    processor = DataProcessor(dataset).data4NER(window=70,pad=10)
    data_processors.append(processor)

# 持久化
pickle.dump(data_processors,open('pickle_file/data_processors.pkl','wb'))

time: 6.52 s


### 创建X-Y

In [12]:
from Data import DataProcessor
from typing import List
import numpy as np
import pickle

data_processors = pickle.load(open('pickle_file/data_processors.pkl','rb')) #type:List[DataProcessor]

train_X,train_Y = data_processors[0].get_ner_data()
train_Y = np.expand_dims(train_Y,-1)

val_X,val_Y = data_processors[1].get_ner_data()
val_Y = np.expand_dims(val_Y,-1)

test_X,_ = data_processors[2].get_ner_data()

print(train_X.shape,train_Y.shape)
print(val_X.shape,val_Y.shape)
print(test_X.shape)

(30780, 90) (30780, 90, 1)
(7647, 90) (7647, 90, 1)
(7161, 90)
time: 4.97 s


In [13]:
train_X

array([[   0.,    0.,    0., ...,  754.,    2.,    2.],
       [ 529.,  437.,  511., ...,  529.,  437.,  511.],
       [ 104.,   94.,  309., ...,  437.,  511.,   68.],
       ...,
       [ 996.,   24.,  220., ..., 1045.,   24.,  861.],
       [  98.,  169.,   21., ...,   24.,  454., 1335.],
       [   3., 1841.,   24., ...,    0.,    0.,    0.]])

time: 2.99 ms


# biLSTM-CRF实现

In [16]:
from Model import BiLstmCrfTrainer
from Data import CATEGORY
from keras.callbacks import EarlyStopping

BATCH_SIZE = 64
EPOCH = 20

model = BiLstmCrfTrainer(category_count = len(CATEGORY)+1,
                         seq_len = train_X.shape[1],
                         lstm_units=256,
                         vocab_size = emb_matrix.shape[0],
                         emb_matrix = emb_matrix).build()

early_stopping = EarlyStopping(monitor='val_crf_viterbi_accuracy', patience=2, mode='max')

print('开始训练啦！！')
print(20*"===")
history = model.fit(train_X,train_Y,batch_size=BATCH_SIZE,
                    epochs = EPOCH,
                    class_weight="auto",
                    callbacks = [early_stopping],
                    validation_data = (val_X,val_Y,)
                    )

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 90)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 90, 256)           590080    
_________________________________________________________________
dropout_1 (Dropout)          (None, 90, 256)           0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 90, 512)           1050624   
_________________________________________________________________
dropout_2 (Dropout)          (None, 90, 512)           0         
_________________________________________________________________
crf_1 (CRF)                  (None, 90, 16)            8496      
Total params: 1,649,200
Trainable params: 1,059,120
Non-trainable params: 590,080
___________________________________________

In [17]:
import datetime
time = datetime.datetime.now()
model.save(filepath="model_file/bi_lstm_crf_{}_{}_{}_{}.h5".format(str(time.month),str(time.day),str(time.hour),str(time.minute)),overwrite=True)

time: 382 ms


## 预测结果

In [19]:
import keras
import keras_contrib
import pickle

model = keras.models.load_model("model_file/bi_lstm_crf_12_3_0_42.h5",
                                custom_objects={"CRF": keras_contrib.layers.CRF, "crf_loss": keras_contrib.losses.crf_loss,
                                                "crf_viterbi_accuracy": keras_contrib.metrics.crf_viterbi_accuracy})

time: 4.26 s


In [20]:
data_processors = pickle.load(open('pickle_file/data_processors.pkl','rb')) #type:List[DataProcessor]
test_X,_ = data_processors[2].get_ner_data()

preds = model.predict(test_X, batch_size=16, verbose=True)

time: 1min 10s


In [21]:
from Evaluator import *
from common.Entity import Document
from Data import DataSet
from typing import List

testset = pickle.load(open('pickle_file/testset.pkl','rb')) # type:DataSet
pre_docs = merge_preds4ner(testset,data_processors[2],preds) # type:List[Document]
source_docs = testset.docs

f1,prediction,recall = f1_score4ner(pre_docs,source_docs,'all')
print("【严格相交】F1:{:.4f}  -  Predicition:{:.4f}  -  Recall:{:.4f}".format(f1,prediction,recall))

f1,prediction,recall = f1_score4ner(pre_docs,source_docs,'others')
print("【不严格相交】F1:{:.4f}  -  Predicition:{:.4f}  -  Recall:{:.4f}".format(f1,prediction,recall))

【严格相交】F1:0.7676  -  Predicition:0.7571  -  Recall:0.7784
【不严格相交】F1:0.8120  -  Predicition:0.8009  -  Recall:0.8234
time: 5.09 s


# biLSTM-LAN模型来实现

In [15]:
from Model import BiLstm_Lan_Trainer
from Data import CATEGORY
from keras.callbacks import EarlyStopping

BATCH_SIZE = 16
EPOCH = 50

model = BiLstm_Lan_Trainer(category_count = len(CATEGORY)+1,
                         seq_len = train_X.shape[1],
                         lstm_units=[256,256],
                         vocab_size = emb_matrix.shape[0],
                         emb_matrix = emb_matrix).build()

early_stopping = EarlyStopping(monitor='val_acc', patience=2, mode='max')

print('开始训练啦！！')
print(20*"===")
history = model.fit(train_X,train_Y,batch_size=BATCH_SIZE,
                    epochs = EPOCH,
                    class_weight="auto",
                    callbacks = [early_stopping],
                    validation_data = (val_X,val_Y)
                    )

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 90)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 90, 256)      590080      input_1[0][0]                    
__________________________________________________________________________________________________
dropout_1 (Dropout)             (None, 90, 256)      0           embedding_1[0][0]                
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) (None, 90, 512)      1050624     dropout_1[0][0]                  
____________________________________________________________________________________________

KeyboardInterrupt: 

time: 17.3 s


## 预测结果

In [9]:
preds = model.predict(test_X, batch_size=16, verbose=True)
from Evaluator import merge_preds,f1_score
from Prepare_sents import Sentences
testset = pickle.load(open('pickle_data/testset.pkl','rb'))
pre_docs = merge_preds(testset,preds,70,10)
source_docs = testset.docs
f1,prediction,recall = f1_score(pre_docs,source_docs,'all')
print("【严格相交】F1:{:.4f}  -  Predicition:{:.4f}  -  Recall:{:.4f}".format(f1,prediction,recall))

f1,prediction,recall = f1_score(pre_docs,source_docs,'others')
print("【不严格相交】F1:{:.4f}  -  Predicition:{:.4f}  -  Recall:{:.4f}".format(f1,prediction,recall))

【严格相交】F1:0.7369  -  Predicition:0.6846  -  Recall:0.7979
【不严格相交】F1:0.8043  -  Predicition:0.7472  -  Recall:0.8710
time: 1min 5s


# 换bert-bilstm（Kashgari）实现

## 数据转换

这里 train_x和 train_y都是一个list，

train_x: [[char_seq1],[char_seq2],[char_seq3],..... ]

train_y:[[label_seq1],[label_seq2],[label_seq3],..... ]

其中 char_seq1:["我"，"爱"，"荆"，"州"]

对应的的label_seq1:["O"，"O"，"B_LOC"，"I_LOC"]

数据预处理成一个字对应一个label就可以了，是不是很方便

In [1]:
from Model import BertTrainer
from Data import DataProcessor
from keras.callbacks import EarlyStopping,TensorBoard

import pickle

trainset = pickle.load(open('pickle_file/trainset.pkl','rb'))
valtset = pickle.load(open('pickle_file/valset.pkl','rb'))

train_x,train_y = DataProcessor(trainset).data4NER_Bert()
val_x,val_y = DataProcessor(valtset).data4NER_Bert()

Using TensorFlow backend.


In [4]:
label = []
for line in train_y:
    k = set(line)
    label.extend(k)
len(set(label))

31

In [8]:
len(val_x)

4460

In [5]:
import numpy as np
train_len = [len(x) for x in train_x]
val_len = [len(x) for x in val_x]
print(np.percentile(train_len,95),np.percentile(val_len,95))

252.0 250.0


In [6]:
bert_model_folder = "bert_model/wwm/chinese_wwm_ext_L-12_H-768_A-12"
seq_len = 260
fine_tune = False

model = BertTrainer(folder=bert_model_folder,fine_tune = fine_tune,seq_len="auto").build()
tf_board_callback = TensorBoard(log_dir='BLSTMModel_tf_dir', update_freq=10)
early_stopping = EarlyStopping(
    monitor='val_acc',
    min_delta=0,
    patience=4,
    verbose=1,
    mode='auto'
)
model.fit(train_x,train_y,val_x,val_y,epochs = 50,batch_size=16,callbacks=[tf_board_callback, early_stopping])



Model: "model_4"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Input-Token (InputLayer)        [(None, 260)]        0                                            
__________________________________________________________________________________________________
Input-Segment (InputLayer)      [(None, 260)]        0                                            
__________________________________________________________________________________________________
Embedding-Token (TokenEmbedding [(None, 260, 768), ( 16226304    Input-Token[0][0]                
__________________________________________________________________________________________________
Embedding-Segment (Embedding)   (None, 260, 768)     1536        Input-Segment[0][0]              
____________________________________________________________________________________________

<tensorflow.python.keras.callbacks.History at 0x2013b046c88>

## 模型保存

In [9]:
model.save("model_file/bert-bilstm")

## 模型加载并测试

In [4]:
from Data import DataProcessor
import pickle

testset = pickle.load(open('pickle_file/testset.pkl','rb'))
test_x,test_y = DataProcessor(testset).data4NER_Bert()

In [10]:
print(test_x[10][:50])

['不', '同', '组', '织', '的', ' ', 'S', 'U', 'R', '\n', '存', ' ', '在', ' ', '差', ' ', '异', ',', ' ', '不', ' ', '同', ' ', '胰', ' ', '岛', ' ', '素', ' ', '促', ' ', '泌', ' ', '剂', ' ', '会', ' ', '与', ' ', 'β', ' ', '细', ' ', '胞', ' ', '不', ' ', '同', ' ', '分']


In [9]:
print(test_y[10][:50])

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B_Anatomy', 'I_Anatomy', 'I_Anatomy', 'I_Anatomy', 'I_Anatomy', 'O', 'O', 'O', 'O', 'O', 'O']


In [22]:
import kashgari
model = kashgari.utils.load_model("model_file/bert-bilstm")
test_0 = model.predict(test_x[0:2])



In [23]:
len(test_0[0])

122

In [26]:
print(test_0[0][:50])

['O', 'O', 'O', 'O', 'O', 'B_Disease', 'I_Disease', 'I_Disease', 'I_Disease', 'I_Disease', 'B_Drug', 'I_Drug', 'I_Drug', 'O', 'I_Drug', 'I_Drug', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


# bert-biGRU-CRF

In [1]:
from Model import BertTrainer
from Data import DataProcessor
from keras.callbacks import EarlyStopping,TensorBoard

import pickle

trainset = pickle.load(open('pickle_file/trainset.pkl','rb'))
valtset = pickle.load(open('pickle_file/valset.pkl','rb'))

train_x,train_y = DataProcessor(trainset).data4NER_Bert()
val_x,val_y = DataProcessor(valtset).data4NER_Bert()

Using TensorFlow backend.


In [2]:
bert_model_folder = "bert_model/wwm/chinese_wwm_ext_L-12_H-768_A-12"
seq_len = 260
fine_tune = False

model = BertTrainer(folder=bert_model_folder,fine_tune = fine_tune,seq_len="auto").build()
tf_board_callback = TensorBoard(log_dir='BLSTMModel_tf_dir', update_freq=10)
early_stopping = EarlyStopping(
    monitor='val_acc',
    min_delta=0,
    patience=4,
    verbose=1,
    mode='auto'
)
model.fit(train_x,train_y,val_x,val_y,epochs = 50,batch_size=16,callbacks=[tf_board_callback, early_stopping])



Model: "model_4"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Input-Token (InputLayer)        [(None, 252)]        0                                            
__________________________________________________________________________________________________
Input-Segment (InputLayer)      [(None, 252)]        0                                            
__________________________________________________________________________________________________
Embedding-Token (TokenEmbedding [(None, 252, 768), ( 16226304    Input-Token[0][0]                
__________________________________________________________________________________________________
Embedding-Segment (Embedding)   (None, 252, 768)     1536        Input-Segment[0][0]              
____________________________________________________________________________________________



Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
 145/1092 [==>...........................] - ETA: 14:14 - loss: 5.0608 - accuracy: 0.9722

KeyboardInterrupt: 