### 基于Deep LSTM的中文分词
    - 步骤1：读入有标注的训练语料库，处理成keras需要的数据格式。
    - 步骤2：根据训练数据建模，使用deep LSTM方法
    - 步骤3：读入无标注的检验语料库，用两层LSTM模型进行分词标注，用更多层效果会更好一点
    - 步骤4：检查最终的效果 F值0.949

- 步骤1：训练数据读取和转换

In [1]:
import sys
import os
import nltk 
import codecs
import pandas as pd
import numpy as np
from os import path
from nltk.probability import FreqDist 
from gensim.models import word2vec
from  cPickle import load, dump

Using gpu device 0: GeForce GTX TITAN X (CNMeM is disabled)


In [2]:
# 根据微软语料库计算词向量

# Read a single text file.
def load_file(input_file):
    """Return the full content of a UTF-8 encoded text file.

    Args:
        input_file: path of the file to read.

    Returns:
        The decoded content of the file as one string.
    """
    # The context manager closes the handle even when read() raises; the
    # previous version never closed the file.
    with codecs.open(input_file, 'r', 'utf-8') as input_data:
        return input_data.read()

# Read every file of a directory into one string.
def load_dir(input_dir):
    """Concatenate the content of all entries of *input_dir*.

    Args:
        input_dir: directory whose entries are read with load_file().

    Returns:
        The file contents joined with a newline between consecutive files.
    """
    # The previous version called `list_dir`, a name that is not defined in
    # this file; os.listdir is the standard-library call. Its order is
    # arbitrary, so the names are sorted to make the result reproducible.
    file_names = sorted(os.listdir(input_dir))
    return '\n'.join(load_file(path.join(input_dir, name))
                     for name in file_names)

# Build a frequency table from a token list (uses nltk's FreqDist).
def freq_func(input_txt):
    """Return a frequency table of the tokens in *input_txt*.

    Args:
        input_txt: iterable of tokens to count.

    Returns:
        A pandas DataFrame with the columns 'word', 'freq' and 'idx', sorted
        by descending 'freq'; 'idx' is the 0-based rank after sorting.
    """
    # FreqDist counts any iterable of tokens, so the former nltk.Text wrapper
    # was not needed.
    fdist = FreqDist(input_txt)
    # list(...) keeps the constructor input a plain list on Python 3 as well,
    # where keys()/values() are views.
    freqdf = pd.DataFrame({'word': list(fdist.keys()),
                           'freq': list(fdist.values())})
    # DataFrame.sort no longer exists in current pandas; fall back to it only
    # on installations that predate sort_values.
    if hasattr(freqdf, 'sort_values'):
        freqdf.sort_values('freq', ascending=False, inplace=True)
    else:
        freqdf.sort('freq', ascending=False, inplace=True)
    # Positional assignment: row k of the sorted frame receives idx k.
    freqdf['idx'] = np.arange(len(freqdf))
    return freqdf

# Train a word2vec model.
def trainW2V(corpus, epochs=20, num_features = 100,sg=1,\
             min_word_count = 1, num_workers = 4,\
             context = 4, sample = 1e-5, negative = 5):
    """Train a gensim Word2Vec model on *corpus* and return it.

    Args:
        corpus: sequence of sentences, each a list of tokens.
        epochs: number of passes over the corpus.
        num_features: passed to Word2Vec as `size`.
        sg: passed to Word2Vec as `sg`.
        min_word_count: passed to Word2Vec as `min_count`.
        num_workers: passed to Word2Vec as `workers`.
        context: passed to Word2Vec as `window`.
        sample: passed to Word2Vec as `sample`.
        negative: passed to Word2Vec as `negative`.

    Returns:
        The trained Word2Vec model.
    """
    # `sg` and `negative` used to be accepted but silently ignored; they are
    # now forwarded so the arguments actually take effect.
    w2v = word2vec.Word2Vec(workers=num_workers,
                            sample=sample,
                            size=num_features,
                            min_count=min_word_count,
                            window=context,
                            sg=sg,
                            negative=negative)
    # Shuffle a shallow copy so the caller's list keeps its original order.
    sentences = list(corpus)
    np.random.shuffle(sentences)
    w2v.build_vocab(sentences)
    # NOTE(review): train() is called with the sentences only, as in the
    # original code; confirm this matches the installed gensim version.
    for epoch in range(epochs):
        print('epoch' + str(epoch))
        np.random.shuffle(sentences)
        w2v.train(sentences)
        # Manual learning-rate decay: shrink alpha by 10% after every pass
        # and pin min_alpha to the same value.
        w2v.alpha *= 0.9
        w2v.min_alpha = w2v.alpha
    print("word2vec DONE.")
    return w2v
    

def save_w2v(w2v, idx2word):
    """Collect the vectors of all indexed tokens, ordered by index.

    Args:
        w2v: object that returns a vector when indexed with a token.
        idx2word: mapping from the indices 0..len-1 to tokens.

    Returns:
        A list whose element i is w2v[idx2word[i]].
    """
    vocab_size = len(idx2word)
    return [w2v[idx2word[position]] for position in range(vocab_size)]



In [3]:
input_file = 'icwb2-data/training/msr_training.utf8'
# Whole training corpus as one string.
input_text = load_file(input_file)
# One token list per non-empty line: the sentence format fed to word2vec.
txtwv = [sentence.split() for sentence in input_text.split('\n') if sentence]
# Flat token list over the whole corpus, used for the frequency table.
txtnltk = input_text.split()
freqdf = freq_func(txtnltk)
# Number of distinct tokens.
maxfeatures = len(freqdf)
# Two lookup tables: token -> index and index -> token.
# NOTE(review): the tokens here are the whitespace-separated units of the
# corpus, while sent2num later looks up single characters in word2idx;
# confirm that this is intended.
word2idx = dict(zip(freqdf.word, freqdf.idx))
idx2word = dict(zip(freqdf.idx, freqdf.word))
# Train word2vec on the sentences.
w2v = trainW2V(txtwv)
# Vector list ordered by token index.
init_weight_wv = save_w2v(w2v, idx2word)


epoch0
epoch1
epoch2
epoch3
epoch4
epoch5
epoch6
epoch7
epoch8
epoch9
epoch10
epoch11
epoch12
epoch13
epoch14
epoch15
epoch16
epoch17
epoch18
epoch19
word2vec DONE.


In [4]:
# Register two special tokens after the existing vocabulary: 'U' stands for
# an unknown (out-of-vocabulary) character and 'P' is the padding symbol used
# at both ends of a sentence. Each one also gets a vector appended below.
char_num = len(init_weight_wv)
idx2word[char_num] = u'U'
word2idx[u'U'] = char_num
idx2word[char_num+1] = u'P'
word2idx[u'P'] = char_num+1

# 'U' gets a random vector and 'P' an all-zero vector; the length 100 is
# hard-coded and equals the num_features default of trainW2V.
init_weight_wv.append(np.random.randn(100,))
init_weight_wv.append(np.zeros(100,))

In [5]:
# Path of the file that will hold the training corpus converted to
# per-character tags (the four tags S, B, M and E).
output_file = 'icwb2-data/training/msr_training.tagging.utf8'

In [6]:
import codecs
import sys

def character_tagging(input_file, output_file):
    """Convert a word-segmented corpus into per-character tags.

    Words are the whitespace-separated tokens of each input line. A
    one-character word is written as 'c/S'; a longer word as 'c/B' for its
    first character, 'c/M' for every inner character and 'c/E' for its last.
    Every tagged character is followed by one space and every input line
    yields exactly one output line.

    Args:
        input_file: path of the UTF-8 segmented corpus to read.
        output_file: path of the UTF-8 tagged file to write.
    """
    # `with` closes both files even if an error occurs part-way through;
    # iterating the reader avoids loading the whole corpus with readlines().
    with codecs.open(input_file, 'r', 'utf-8') as input_data, \
            codecs.open(output_file, 'w', 'utf-8') as output_data:
        for line in input_data:
            pieces = []
            for word in line.split():
                if len(word) == 1:
                    pieces.append(word + "/S ")
                else:
                    pieces.append(word[0] + "/B ")
                    pieces.extend(ch + "/M " for ch in word[1:-1])
                    pieces.append(word[-1] + "/E ")
            # One write per line instead of one per character.
            output_data.write("".join(pieces) + "\n")

# Write the per-character tagged version of the training corpus.
character_tagging(input_file, output_file)

In [7]:
# Separate the characters from their tags.
# codecs.open (used for the other corpus files in this notebook) decodes
# while reading, so each line is decoded and tokenised once instead of twice
# and no Python-2-only str.decode call is needed.
with codecs.open(output_file, 'r', 'utf-8') as f:
    lines = f.readlines()
# Every token has the form '<char>/<tag>': index 0 is the character and
# index 2 the tag.
_tokens_per_line = [line.split() for line in lines]
# One list of characters per sentence.
train_line = [[w[0] for w in tokens] for tokens in _tokens_per_line]
# One flat list of tags over the whole corpus.
train_label = [w[2] for tokens in _tokens_per_line for w in tokens]
del _tokens_per_line

In [8]:
# 文档转数字list
import numpy as np
def sent2num(sentence, word2idx = word2idx, context = 7):
    """Turn a sentence into one fixed-width window of ids per character.

    Args:
        sentence: sequence of characters.
        word2idx: mapping from character to integer id; it must contain
            u'U' (used for characters missing from the mapping) and u'P'
            (padding). The default is the module-level dictionary as bound
            when the function is defined.
        context: window width.

    Returns:
        A list with one entry per character of *sentence*; each entry is a
        list of `context` ids made of the character and its neighbours, with
        the padding id beyond the sentence boundaries.
    """
    # Characters absent from the dictionary are mapped to the id of 'U'.
    char_ids = [word2idx[w] if w in word2idx else word2idx[u'U']
                for w in sentence]
    # Pad so that every window has exactly `context` entries. For an odd
    # context both sides get the same amount, as before; for an even context
    # the extra padding goes to the right, where the previous version
    # produced windows that were too short.
    left_pad = (context - 1) // 2
    right_pad = context - 1 - left_pad
    if left_pad > 0 or right_pad > 0:
        pad_id = word2idx[u'P']
        padded = [pad_id] * left_pad + char_ids + [pad_id] * right_pad
    else:
        padded = char_ids
    return [padded[i:i + context] for i in range(len(char_ids))]

In [9]:
# 输入字符list，输出数字list
# Example: the id windows produced for the first training sentence.
sent2num(train_line[0])

[[88120, 88120, 88120, 9, 21, 107, 1976],
 [88120, 88120, 9, 21, 107, 1976, 26],
 [88120, 9, 21, 107, 1976, 26, 1116],
 [9, 21, 107, 1976, 26, 1116, 1397],
 [21, 107, 1976, 26, 1116, 1397, 7],
 [107, 1976, 26, 1116, 1397, 7, 10],
 [1976, 26, 1116, 1397, 7, 10, 538],
 [26, 1116, 1397, 7, 10, 538, 2300],
 [1116, 1397, 7, 10, 538, 2300, 2353],
 [1397, 7, 10, 538, 2300, 2353, 378],
 [7, 10, 538, 2300, 2353, 378, 0],
 [10, 538, 2300, 2353, 378, 0, 46],
 [538, 2300, 2353, 378, 0, 46, 2118],
 [2300, 2353, 378, 0, 46, 2118, 18],
 [2353, 378, 0, 46, 2118, 18, 2610],
 [378, 0, 46, 2118, 18, 2610, 1],
 [0, 46, 2118, 18, 2610, 1, 1172],
 [46, 2118, 18, 2610, 1, 1172, 2183],
 [2118, 18, 2610, 1, 1172, 2183, 78],
 [18, 2610, 1, 1172, 2183, 78, 7],
 [2610, 1, 1172, 2183, 78, 7, 16],
 [1, 1172, 2183, 78, 7, 16, 141],
 [1172, 2183, 78, 7, 16, 141, 70],
 [2183, 78, 7, 16, 141, 70, 110],
 [78, 7, 16, 141, 70, 110, 1],
 [7, 16, 141, 70, 110, 1, 2300],
 [16, 141, 70, 110, 1, 2300, 2353],
 [141, 70, 110, 1,

In [10]:
# Convert every training sentence to id windows and flatten them into one
# list, so that there is one window per character of the corpus.
train_word_num = [window
                  for sentence in train_line
                  for window in sent2num(sentence)]

In [11]:
# Number of windows and number of tags.
# Written with call syntax so the cell also runs on Python 3 and matches the
# print() calls used in the later cells; the output is unchanged.
print(len(train_word_num))
print(len(train_label))

4050469
4050469


In [2]:
from  cPickle import load
#dump(train_word_num, open('train_word_num.pickle', 'wb'))
#train_word_num = load(open('train_word_num.pickle','rb'))

In [12]:
# Number of distinct tags found in the training labels.
nb_classes = len(np.unique(train_label))

- 步骤2：训练模型

In [56]:
from __future__ import absolute_import
from __future__ import print_function

from keras.preprocessing import sequence
from keras.optimizers import SGD, RMSprop, Adagrad
from keras.utils import np_utils
from keras.models import Sequential,Graph
from keras.layers.core import Dense, Dropout, Activation, TimeDistributedDense
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM, GRU,SimpleRNN
from keras.layers.core import Reshape, Flatten ,Dropout
from keras.regularizers import l1,l2
from keras.layers.convolutional import Convolution2D, MaxPooling2D,MaxPooling1D

In [14]:
# Build the two label lookup tables: tag -> id and id -> tag.
# The ids come from enumerating the distinct tags, so the table no longer
# depends on a hard-coded range(4) that would silently drop any extra tag.
label_dict = {label: index
              for index, label in enumerate(np.unique(train_label))}
# .items() works on Python 2 and 3, unlike .iteritems().
num_dict = {index: label for label, index in label_dict.items()}
print(label_dict)
print(num_dict)
# Replace every tag by its integer id. This rebinds train_label, so running
# the cell a second time would rebuild label_dict from integers instead of
# tag strings.
train_label = [label_dict[y] for y in train_label]

{u'M': 2, u'S': 3, u'B': 0, u'E': 1}
{0: u'B', 1: u'E', 2: u'M', 3: u'S'}


In [15]:
# Split the data into a training part and a held-out part.
# sklearn.cross_validation has been removed from current scikit-learn
# releases; prefer model_selection and fall back for old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
train_word_num = np.array(train_word_num)
# 90% of the samples go to the training part; the seed is fixed.
train_X, test_X, train_y, test_y = train_test_split(train_word_num, train_label , train_size=0.9, random_state=1)

In [16]:
# Convert the integer labels of both parts with Keras' to_categorical,
# using nb_classes classes.
Y_train = np_utils.to_categorical(train_y, nb_classes)
Y_test = np_utils.to_categorical(test_y, nb_classes)

In [17]:
# Report the number of samples in each part.
print(len(train_X), 'train sequences')
print(len(test_X), 'test sequences')

3645422 train sequences
405047 test sequences


In [18]:
# Put the vector list into a one-element list holding a single array.
# NOTE(review): presumably meant as initial Embedding weights, but no
# Embedding call in this notebook receives it; only its shape is read when
# maxfeatures is set. Confirm whether that is intended.
init_weight = [np.array(init_weight_wv)]

In [19]:
# Batch size passed to the fit() calls below.
batch_size = 128

In [20]:
# Model dimensions.
maxfeatures = init_weight[0].shape[0] # vocabulary size (rows of the vector array)
# Embedding width.
word_dim = 100
# Input length; equal to the default window width (context=7) of sent2num.
maxlen = 7
# Output size of each LSTM layer.
hidden_units = 100

In [None]:
# stacking LSTM

In [35]:
# Stacked LSTM classifier: embedding -> LSTM (returns the whole sequence)
# -> LSTM (returns only its last output) -> dropout 0.5 -> dense layer with
# nb_classes units -> softmax.
# NOTE(review): the Embedding layer is created without initial weights, so
# the word2vec vectors computed earlier are not used here.
print('stacking  LSTM...')
model = Sequential()
model.add(Embedding(maxfeatures, word_dim,input_length=maxlen))
model.add(LSTM(output_dim=hidden_units, return_sequences =True))
model.add(LSTM(output_dim=hidden_units, return_sequences =False))
model.add(Dropout(0.5))
model.add(Dense(nb_classes))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

stacking  LSTM...


In [36]:
# Fit the stacked model on (train_X, Y_train) for 20 epochs, using
# (test_X, Y_test) as validation data. The returned object is kept in
# `result`.
print("Train...")
result = model.fit(train_X, Y_train, batch_size=batch_size, 
                   nb_epoch=20, validation_data = (test_X,Y_test), show_accuracy=True)

Train...
Train on 3645422 samples, validate on 405047 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
# bidirectional_lstm

In [31]:
# Bidirectional LSTM built with the Graph API: one integer input of length
# maxlen feeds an embedding, which feeds two LSTM nodes ('forward' and
# 'backward', the latter created with go_backwards=True). Both LSTM nodes
# feed a dropout node, followed by a softmax dense node with nb_classes
# units that is exposed as the output named 'output'.
graph = Graph()
graph.add_input(name='input', input_shape=(maxlen,), dtype=int)
graph.add_node(Embedding(maxfeatures, word_dim, input_length=maxlen),
               name='embedding', input='input')
graph.add_node(LSTM(output_dim=hidden_units), name='forward', input='embedding')
graph.add_node(LSTM(output_dim=hidden_units, go_backwards =True), name='backward', input='embedding')
graph.add_node(Dropout(0.5), name='dropout', inputs=['forward', 'backward'])
graph.add_node(Dense(nb_classes, activation='softmax'), name='softmax', input='dropout')
graph.add_output(name='output', input='softmax')
graph.compile(loss = {'output': 'categorical_crossentropy'}, optimizer='adam')

In [34]:
# Fit the graph model for 20 epochs; inputs and targets are passed in one
# dictionary keyed by the node names 'input' and 'output', and the held-out
# part is passed the same way as validation data.
result2 = graph.fit({'input':train_X, 'output':Y_train}, 
                  batch_size=batch_size, 
                   nb_epoch=20, validation_data = ({'input':test_X,'output':Y_test}))

Train on 3645422 samples, validate on 405047 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [37]:
import theano
# Print the shape of every parameter array of the stacked model.
# get_weights() is called once here; the previous loop fetched the complete
# weight list again on every iteration.
_weights = model.get_weights()
weight_len = len(_weights)
for _w in _weights:
    print(_w.shape)
# Author's note: in order, these are the embedding matrix, the parameters of
# the four different LSTM gates, then the dense layer weight and bias.

(88121, 100)
(100, 100)
(100, 100)
(100,)
(100, 100)
(100, 100)
(100,)
(100, 100)
(100, 100)
(100,)
(100, 100)
(100, 100)
(100,)
(100, 100)
(100, 100)
(100,)
(100, 100)
(100, 100)
(100,)
(100, 100)
(100, 100)
(100,)
(100, 100)
(100, 100)
(100,)
(100, 4)
(4,)


In [90]:
# Compile a Theano function from the input of model.layers[0] to the output
# of model.layers[3] (requested with train=False) and evaluate it on the
# first 10 rows of test_X.
layer = theano.function([model.layers[0].input],model.layers[3].get_output(train=False),allow_input_downcast=True) 
layer_out = layer(test_X[:10]) 
# NOTE(review): the original comment described this as the output of layer 0
# after a ReLU for the first 10 documents, but the code reads layers[3] and
# passes rows of test_X; confirm which layer was intended.
layer_out.shape

(10, 4)

- 步骤3、步骤4：用test文本进行预测，并评估效果

In [38]:
# A sample sentence, split into a list of single characters.
temp_txt = u'国家食药监总局发布通知称，酮康唑口服制剂因存在严重肝毒性不良反应，即日起停止生产销售使用。'
temp_txt = list(temp_txt)

In [39]:
# Id windows of the sample sentence; show the first five.
temp_num = sent2num(temp_txt)
temp_num[:5]

[[88120, 88120, 88120, 309, 223, 5082, 1522],
 [88120, 88120, 309, 223, 5082, 1522, 6778],
 [88120, 309, 223, 5082, 1522, 6778, 396],
 [309, 223, 5082, 1522, 6778, 396, 1773],
 [223, 5082, 1522, 6778, 396, 1773, 1053]]

In [49]:
# Infer the tag sequence for one sentence.
def predict_num(input_num, input_txt,
                model,
                label_dict=label_dict,
                num_dict=num_dict):
    """Tag every character of one sentence with B, M, E or S.

    The model's class probabilities are decoded greedily from left to right:
    tags that cannot follow the tag chosen for the previous character get
    probability 0 before the arg-max of the current character is taken.

    Args:
        input_num: list of id windows for the sentence, one per character.
        input_txt: the characters of the sentence, aligned with input_num.
        model: object whose predict_proba(array, verbose=False) returns one
            row of class probabilities per window.
        label_dict: mapping from the tags u'B', u'E', u'M', u'S' to class
            ids. The default is the module-level dictionary as bound when
            the function is defined.
        num_dict: inverse mapping from class id to tag (same binding rule).

    Returns:
        A string of space-separated 'char/tag' items followed by a newline.
    """
    # An empty sentence has nothing to predict; return the empty output line
    # instead of handing an empty array to the model.
    if len(input_num) == 0:
        return '\n'
    input_num = np.array(input_num)
    predict_prob = model.predict_proba(input_num, verbose=False)
    # The starting labels are the arg-max of the probabilities, the same rule
    # the loop below uses, so a second model pass via predict_classes is not
    # needed.
    predict_lable = predict_prob.argmax(axis=-1)

    begin, end = label_dict[u'B'], label_dict[u'E']
    middle, single = label_dict[u'M'], label_dict[u'S']
    # Tags that are not allowed directly after the key tag.
    forbidden_next = {
        begin: (begin, single),
        middle: (begin, single),
        end: (middle, end),
        single: (middle, end),
    }

    # The first character cannot be E or M. The label is recomputed here; the
    # previous version zeroed the probabilities but kept the stale label, and
    # skipped this step entirely for one-character input.
    predict_prob[0, end] = 0
    predict_prob[0, middle] = 0
    predict_lable[0] = predict_prob[0].argmax()

    for i in range(len(predict_lable) - 1):
        for banned in forbidden_next.get(int(predict_lable[i]), ()):
            predict_prob[i + 1, banned] = 0
        predict_lable[i + 1] = predict_prob[i + 1].argmax()

    predict_lable_new = [num_dict[x] for x in predict_lable]
    result = [w + '/' + l for w, l in zip(input_txt, predict_lable_new)]
    return ' '.join(result) + '\n'

In [50]:
# Tag the sample sentence with the stacked model and show the result.
temp = predict_num(temp_num,temp_txt, model = model)
print(temp)

国/B 家/M 食/M 药/M 监/M 总/M 局/E 发/B 布/E 通/B 知/E 称/S ，/S 酮/B 康/E 唑/B 口/E 服/S 制/B 剂/E 因/S 存/B 在/E 严/B 重/E 肝/S 毒/B 性/E 不/B 良/E 反/B 应/E ，/S 即/B 日/E 起/S 停/B 止/E 生/B 产/E 销/B 售/E 使/B 用/E 。/S



In [51]:
# Read the test file; every line is decoded from UTF-8, stripped of
# surrounding whitespace and turned into a list of characters.
# (str.decode is Python 2 usage, like the other cells of this notebook.)
test_file = 'icwb2-data/testing/msr_test.utf8'
with open(test_file,'r') as f:
    lines = f.readlines()
    test_texts = [list(line.decode('utf-8').strip()) for line in lines]

In [52]:
# Tag every test sentence with the stacked model and collect the UTF-8
# encoded output lines.
# NOTE(review): a blank line in the test file gives an empty character list,
# so sent2num returns [] for it; confirm that predict_num copes with that.
test_output = []
for line in test_texts:
    test_num = sent2num(line)
    output_line = predict_num(test_num,input_txt=line,model = model)
    test_output.append(output_line.encode('utf-8'))

In [45]:
# Write the tagged test sentences; every collected entry already ends with a
# newline.
with open('icwb2-data/testing/msr_test_output.utf8','w') as f:
    f.writelines(test_output)

In [46]:
# Paths for converting the tags back to words. This rebinds the names
# input_file and output_file that earlier held the training-corpus paths.
input_file = 'icwb2-data/testing/msr_test_output.utf8'
output_file = 'icwb2-data/testing/msr_test.split.tag2word.utf8'

In [47]:
import codecs
import sys

def character_2_word(input_file, output_file):
    """Rebuild space-separated words from a per-character tagged file.

    Every input line consists of whitespace-separated 'char/tag' items with
    the four tags B (begin), M (middle), E (end) and S (single). A space is
    written before a B character, after an E character and on both sides of
    a character with any other tag; M characters are written without spaces.
    Every input line yields exactly one output line.

    Args:
        input_file: path of the UTF-8 tagged file to read.
        output_file: path of the UTF-8 segmented file to write.

    Raises:
        IndexError: if an item contains no '/' separator.
    """
    # `with` closes both files even if an error occurs part-way through;
    # iterating the reader avoids loading the whole file with readlines().
    with codecs.open(input_file, 'r', 'utf-8') as input_data, \
            codecs.open(output_file, 'w', 'utf-8') as output_data:
        for line in input_data:
            pieces = []
            for char_tag in line.split():
                # Split on the LAST slash only, so that a literal '/'
                # character (tagged as '//S') is kept instead of being
                # lost as it was with split('/').
                char_tag_pair = char_tag.rsplit('/', 1)
                char = char_tag_pair[0]
                tag = char_tag_pair[1]
                if tag == 'B':
                    pieces.append(' ' + char)
                elif tag == 'M':
                    pieces.append(char)
                elif tag == 'E':
                    pieces.append(char + ' ')
                else:  # tag == 'S'
                    pieces.append(' ' + char + ' ')
            output_data.write(''.join(pieces) + "\n")

# Convert the tagged test output back into space-separated words.
character_2_word(input_file, output_file)

#### - 最终使用perl脚本检验的F值为0.949

In [48]:
! ./icwb2-data/scripts/score ./icwb2-data/gold/msr_training_words.utf8 ./icwb2-data/gold/msr_test_gold.utf8 ./icwb2-data/testing/msr_test.split.tag2word.utf8 > deep.score