In [1]:
from keras.models import Model
from keras.layers import Input, Embedding, Bidirectional, LSTM, Dropout, ZeroPadding1D, Conv1D, Dense, TimeDistributed, concatenate, Flatten
from keras.layers import AveragePooling1D
from keras_contrib.layers import CRF
import keras.backend as K
from keras.callbacks import ModelCheckpoint,Callback
from keras.preprocessing.sequence import pad_sequences
import numpy as np
from keras.utils import to_categorical


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:

class LossHistory(Callback):
    """Keras callback that records the loss reported at the end of every batch.

    After training, ``self.losses`` holds one entry per processed batch, in
    the order the batches were seen.
    """

    def on_train_begin(self, logs=None):
        """Reset the recorded losses when a new training run starts."""
        self.losses = []

    def on_batch_end(self, batch, logs=None):
        """Append the batch loss found in ``logs`` (``None`` when it is absent).

        Args:
            batch: Index of the batch that just finished (unused).
            logs: Metrics dict supplied by Keras; may be ``None``.
        """
        # ``logs=None`` avoids sharing one mutable default dict between calls.
        logs = logs or {}
        self.losses.append(logs.get('loss'))
        

class NERModel:
    """Embedding -> BiLSTM -> Dropout -> CRF sequence tagger.

    Args:
        maxlen: Fixed sequence length; inputs are post-padded/truncated to it.
        word_dict_size: Number of rows in the embedding table.
        word_vec_size: Embedding dimension.
        class_label_count: Number of tag classes produced by the CRF layer.
        weights_path: File used both to checkpoint the best weights during
            training and to restore them before prediction.
    """

    DEFAULT_WEIGHTS_PATH = "../model/bilstm_1102_k205_tf130.w"

    def __init__(self, maxlen, word_dict_size, word_vec_size, class_label_count,
                 weights_path=DEFAULT_WEIGHTS_PATH):
        self.maxlen = maxlen
        self.word_dict_size = word_dict_size
        self.word_vec_size = word_vec_size
        self.class_label_count = class_label_count
        self.weights_path = weights_path
        # Populated by train(); stays None until a training run has started.
        self.history = None
        self.model = self._build_model()

    def _build_model(self):
        """Build and compile the tagger, print its summary and return it."""
        input_layer = Input(shape=(self.maxlen,), dtype='int32', name='input_layer')
        embedding_layer = Embedding(self.word_dict_size, self.word_vec_size, name='embedding_layer')(input_layer)
        bilstm = Bidirectional(LSTM(32, return_sequences=True))(embedding_layer)
        bilstm_d = Dropout(0.1)(bilstm)
        crf = CRF(self.class_label_count, sparse_target=True)
        crf_output = crf(bilstm_d)
        # Keras 2 spells these keywords ``inputs``/``outputs``; the singular
        # forms are Keras 1 legacy names.
        model = Model(inputs=[input_layer], outputs=[crf_output])
        model.compile(loss=crf.loss_function, optimizer='adam', metrics=[crf.accuracy])
        model.summary()
        return model

    def _pad(self, sequences):
        """Post-pad / post-truncate ``sequences`` to ``self.maxlen``."""
        return pad_sequences(sequences, self.maxlen, padding='post', truncating='post')

    def train(self, data, label, batch_size=32, epochs=500, validation_split=0.1):
        """Fit the model, checkpointing the best weights to ``self.weights_path``.

        Args:
            data: Iterable of token-id sequences.
            label: Iterable of tag-id sequences aligned with ``data``.
            batch_size: Mini-batch size passed to ``fit``.
            epochs: Number of epochs passed to ``fit``.
            validation_split: Fraction of the samples held out for validation.

        The per-batch loss recorder is kept on ``self.history`` afterwards.
        """
        checkpointer = ModelCheckpoint(filepath=self.weights_path, verbose=0,
                                       save_best_only=True, save_weights_only=True)
        self.history = LossHistory()
        data = self._pad(data)
        # sparse_target=True: labels are integer ids with a trailing axis of 1.
        label = np.expand_dims(self._pad(label), 2)
        print('---->', data.shape)
        print('---->', label.shape)
        self.model.fit(data, label,
                       batch_size=batch_size, epochs=epochs,
                       callbacks=[checkpointer, self.history],
                       verbose=1,
                       validation_split=validation_split,
                       )

    def predict(self, data, id2chunk):
        """Tag every sequence in ``data`` using the checkpointed weights.

        Args:
            data: Iterable of token-id sequences.
            id2chunk: Mapping from tag id to tag string.

        Returns:
            One list of tag strings per input sequence. Each list has
            ``self.maxlen`` entries, i.e. padding positions are included.
        """
        self.model.load_weights(self.weights_path)
        scores = self.model.predict(self._pad(data))
        # argmax over the class axis yields exactly one id per time step.
        # NOTE(review): the CRF output is presumably one-hot here, in which
        # case this matches the previous np.argwhere decoding — confirm.
        tag_ids = np.argmax(scores, axis=-1)
        return [[id2chunk.get(int(tag_id)) for tag_id in row] for row in tag_ids]
    

In [3]:
class CategoriesClassifier:
    """Placeholder for a category classifier; no model is implemented yet."""

    def __init__(self):
        # Emits a single empty line, nothing else is initialised.
        print("")

    def _build_model(self):
        """Not implemented; always returns None."""
        return None
    
    

In [4]:
class PolaritiesClassifier:
    """Three-input CNN classifier over (content, aspect, option) sequences.

    All three inputs share one embedding table and one convolution; the pooled
    feature maps are concatenated along the time axis, flattened and fed to a
    softmax layer with ``polarity_num`` units.

    Args:
        content_max_len: Fixed length of the content input.
        aspect_max_len: Fixed length of the aspect input.
        option_max_len: Fixed length of the option input.
        polarity_num: Number of output classes.
        word_dict_size: Number of rows in the embedding table.
        word_vec_size: Embedding dimension.
        weights_path: File used both to checkpoint the best weights during
            training and to restore them before prediction.
    """

    DEFAULT_WEIGHTS_PATH = "../model/polarity.w"

    def __init__(self, content_max_len, aspect_max_len, option_max_len, polarity_num,
                 word_dict_size, word_vec_size, weights_path=DEFAULT_WEIGHTS_PATH):
        self.content_max_len = content_max_len
        self.aspect_max_len = aspect_max_len
        self.option_max_len = option_max_len
        self.polarity_num = polarity_num
        self.word_vec_size = word_vec_size
        self.word_dict_size = word_dict_size
        self.weights_path = weights_path
        # Populated by train(); stays None until a training run has started.
        self.history = None
        self.model = self._build_model()

    def _build_model(self):
        """Build and compile the classifier, print its summary and return it."""
        content_input_layer = Input(shape=(self.content_max_len,), dtype='int32', name='content_input_layer')
        aspect_input_layer = Input(shape=(self.aspect_max_len,), dtype='int32', name='aspect_input_layer')
        option_input_layer = Input(shape=(self.option_max_len,), dtype='int32', name='option_input_layer')
        inputs = [content_input_layer, aspect_input_layer, option_input_layer]

        shared_embedding_layer = Embedding(self.word_dict_size, self.word_vec_size, name='embedding_layer')

        # Padding by half the window on both sides followed by a 'valid'
        # convolution of width 2 * half + 1 keeps the sequence length unchanged.
        half_window_size = 2
        zero_pad = ZeroPadding1D(padding=half_window_size)
        # Keras 2 argument names (Keras 1: nb_filter / filter_length / border_mode).
        conv_layer = Conv1D(filters=50, kernel_size=(2 * half_window_size + 1), padding='valid')
        pooling_layer = AveragePooling1D(2, 2, name='pooling')

        # Apply each shared layer to all three branches stage by stage.
        embedded = [shared_embedding_layer(branch) for branch in inputs]
        padded = [zero_pad(branch) for branch in embedded]
        convolved = [conv_layer(branch) for branch in padded]
        pooled = [pooling_layer(branch) for branch in convolved]

        cnn_merge = concatenate(pooled, axis=1)
        dense = Flatten()(cnn_merge)
        output = Dense(self.polarity_num, activation='softmax')(dense)
        # Keras 2 spells these keywords ``inputs``/``outputs``.
        model = Model(inputs=inputs, outputs=output)

        model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
        model.summary()
        return model

    def _pad_inputs(self, content_data, aspect_data, option_data):
        """Post-pad / post-truncate each input to its configured length."""
        return [
            pad_sequences(content_data, self.content_max_len, padding='post', truncating='post'),
            pad_sequences(aspect_data, self.aspect_max_len, padding='post', truncating='post'),
            pad_sequences(option_data, self.option_max_len, padding='post', truncating='post'),
        ]

    def train(self, content_data, aspect_data, option_data, label,
              batch_size=32, epochs=500, validation_split=0.1):
        """Fit the model, checkpointing the best weights to ``self.weights_path``.

        Args:
            content_data, aspect_data, option_data: Iterables of token-id
                sequences, one entry per sample.
            label: Integer class ids, one per sample, each in
                ``[0, polarity_num)``.
            batch_size: Mini-batch size passed to ``fit``.
            epochs: Number of epochs passed to ``fit``.
            validation_split: Fraction of the samples held out for validation.

        The per-batch loss recorder is kept on ``self.history`` afterwards.
        """
        checkpointer = ModelCheckpoint(filepath=self.weights_path, verbose=0,
                                       save_best_only=True, save_weights_only=True)
        self.history = LossHistory()
        padded_inputs = self._pad_inputs(content_data, aspect_data, option_data)
        # Pin the one-hot width to the output layer size; otherwise it is
        # inferred as max(label) + 1 and may not match the model.
        encoded = to_categorical(label, num_classes=self.polarity_num)
        self.model.fit(padded_inputs, encoded,
                       batch_size=batch_size, epochs=epochs,
                       callbacks=[checkpointer, self.history],
                       verbose=1,
                       validation_split=validation_split,
                       )

    def predict(self, content_data, aspect_data, option_data):
        """Restore the checkpointed weights and return the model's raw output.

        Returns:
            Whatever ``self.model.predict`` returns for the padded inputs
            (the softmax layer's activations, one row per sample).
        """
        self.model.load_weights(self.weights_path)
        return self.model.predict(self._pad_inputs(content_data, aspect_data, option_data))

In [5]:
import json
import numpy as np
class GetData:
    """Loads the JSON training data and holds the tag / label vocabularies."""

    def __init__(self):
        # Directory prefix; file names are appended by plain string concatenation.
        self.data_save_path_root = '../data/'
        self.chunk_tags = ['O', 'B_AT', 'I_AT', 'B_OT', 'I_OT']
        # NOTE(review): despite the attribute name these look like aspect
        # categories rather than polarities — confirm.
        self.polarities = ['价格', '使用体验', '其他', '功效', '包装', '尺寸', '成分', '整体', '新鲜度', '服务', '气味', '物流', '真伪']
        self.chunk2id, self.id2chunk, self.polarities2id, self.id2polarities = self._reshape_data('all')
        # The two polarity maps derived above are replaced by this fixed
        # binary mapping.
        self.polarities2id = {'正面': 1, '负面': 0}
        self.id2polarities = {1: '正面', 0: '负面'}

    def _reshape_data(self, mode):
        """Build the tag and label lookup tables.

        Args:
            mode: ``'all'`` -> aspect and opinion tags in one tag set,
                  ``'at'``  -> AspectTerm tags only (``O``, ``B_AT``, ``I_AT``),
                  ``'ot'``  -> OpinionTerms tags only (``O``, ``B_OT``, ``I_OT``).

        Returns:
            ``(chunk2id, id2chunk, polarities2id, id2polarities)``.

        Raises:
            ValueError: if ``mode`` is not one of the values above.
        """
        if mode == 'all':
            chunk_tags = self.chunk_tags
        elif mode == 'at':
            chunk_tags = self.chunk_tags[:3]
        elif mode == 'ot':
            # Keep the outside tag and the two opinion tags; a plain [2:]
            # slice would wrongly yield I_AT instead of O.
            chunk_tags = self.chunk_tags[:1] + self.chunk_tags[3:]
        else:
            raise ValueError("mode must be 'all', 'at' or 'ot', got %r" % (mode,))

        chunk2id = {item: _id for _id, item in enumerate(chunk_tags)}
        id2chunk = {v: k for k, v in chunk2id.items()}
        polarities2id = {item: _id for _id, item in enumerate(self.polarities)}
        id2polarities = {v: k for k, v in polarities2id.items()}
        return chunk2id, id2chunk, polarities2id, id2polarities

    def _load_json(self, file_name):
        """Read and parse a UTF-8 JSON file located under ``data_save_path_root``."""
        with open(self.data_save_path_root + file_name, 'r', encoding='utf8') as fr:
            return json.load(fr)

    @staticmethod
    def _as_object_array(sequences):
        """Pack variable-length sequences into a 1-D object array.

        ``np.asarray`` on ragged input is rejected by recent numpy releases,
        so the container is allocated explicitly and filled element-wise.
        """
        packed = np.empty(len(sequences), dtype=object)
        for idx, seq in enumerate(sequences):
            packed[idx] = seq
        return packed

    def get_ner_train_data(self, granularity):
        """Load the NER vocabulary, token sequences and tag sequences.

        Args:
            granularity: Name used to build the file names and vocabulary
                keys, e.g. ``'char'`` reads ``char_level.json``,
                ``ner_char_data.json`` and ``ner_char_label.json``.

        Returns:
            ``(chunk2id, id2chunk, vocab2id, id2vocab, ner_data, ner_label)``
            where ``ner_data`` / ``ner_label`` are 1-D object arrays holding
            one integer id array per sentence.
        """
        vocab_dict = self._load_json(granularity + '_level.json')
        raw_tokens = self._load_json('ner_' + granularity + '_data.json')
        raw_tags = self._load_json('ner_' + granularity + '_label.json')

        vocab2id = vocab_dict.get(granularity + '2id')
        id2vocab = vocab_dict.get('id2' + granularity)

        # Tokens missing from the vocabulary map to id 1.
        # NOTE(review): presumably the unknown-token id — confirm against the vocab file.
        ner_data = self._as_object_array(
            [np.asarray([vocab2id.get(token, 1) for token in sentence]) for sentence in raw_tokens]
        )

        # Unrecognised tags fall back to the outside tag rather than to id 1,
        # which is B_AT and would fabricate entity starts.
        outside_id = self.chunk2id['O']
        ner_label = self._as_object_array(
            [np.asarray([self.chunk2id.get(tag, outside_id) for tag in sentence]) for sentence in raw_tags]
        )

        return self.chunk2id, self.id2chunk, vocab2id, id2vocab, ner_data, ner_label

    def get_polarities_train_data(self, granu):
        """Load ``aspect_opinion_<granu>_data.json`` from the data directory.

        Returns:
            The values stored under the ``'content'``, ``'aspect'``,
            ``'opinion'`` and ``'category'`` keys, in that order (``None`` for
            a missing key).
        """
        polarities_train_data = self._load_json('aspect_opinion_' + granu + '_data.json')
        return (
            polarities_train_data.get('content'),
            polarities_train_data.get('aspect'),
            polarities_train_data.get('opinion'),
            polarities_train_data.get('category'),
        )

In [6]:
# Load the character-level NER data and the polarity-classifier data.
get_data_obj = GetData()
(chunk2id, id2chunk,
 vocab2id, id2vocab,
 ner_data, ner_label) = get_data_obj.get_ner_train_data('char')
(content_data, aspect_data,
 opinion_data, category_label) = get_data_obj.get_polarities_train_data('char')

In [7]:
# Length of the longest opinion sequence (0 when there are none).
max_len = max((len(item) for item in opinion_data), default=0)
print(max_len)

12


In [8]:
# Build the polarity classifier; the three *_max_len values fix the padded
# input lengths and the vocabulary size comes from the NER vocabulary.
pc = PolaritiesClassifier(
    content_max_len=69,
    aspect_max_len=8,
    option_max_len=12,
    polarity_num=len(set(category_label)),
    word_dict_size=len(vocab2id),
    word_vec_size=128,
)

W0821 18:42:08.885962 4501763520 deprecation_wrapper.py:119] From /Users/zhaoliang/anaconda3/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0821 18:42:08.901056 4501763520 deprecation_wrapper.py:119] From /Users/zhaoliang/anaconda3/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0821 18:42:08.915845 4501763520 deprecation_wrapper.py:119] From /Users/zhaoliang/anaconda3/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0821 18:42:08.949307 4501763520 deprecation_wrapper.py:119] From /Users/zhaoliang/anaconda3/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:3980: The name tf.nn.avg_pool is deprecated. Please use tf.nn.avg_pool2d instead.

W0821 18:42

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
content_input_layer (InputLayer (None, 69)           0                                            
__________________________________________________________________________________________________
aspect_input_layer (InputLayer) (None, 8)            0                                            
__________________________________________________________________________________________________
option_input_layer (InputLayer) (None, 12)           0                                            
__________________________________________________________________________________________________
embedding_layer (Embedding)     multiple             165120      content_input_layer[0][0]        
                                                                 aspect_input_layer[0][0]         
          

In [9]:
# Train the polarity classifier; the opinion sequences feed the "option" input.
pc.train(
    content_data=content_data,
    aspect_data=aspect_data,
    option_data=opinion_data,
    label=category_label,
)

W0821 18:42:11.718528 4501763520 deprecation.py:323] From /Users/zhaoliang/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
W0821 18:42:11.902522 4501763520 deprecation_wrapper.py:119] From /Users/zhaoliang/anaconda3/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:986: The name tf.assign_add is deprecated. Please use tf.compat.v1.assign_add instead.



Train on 5969 samples, validate on 664 samples
Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
 960/5969 [===>..........................] - ETA: 2s - loss: 0.0151 - acc: 0.9969

KeyboardInterrupt: 

In [30]:
# Build the BiLSTM-CRF tagger for 30-step sequences and train it.
ner = NERModel(
    maxlen=30,
    word_dict_size=len(vocab2id),
    word_vec_size=128,
    class_label_count=len(chunk2id),
)
ner.train(data=ner_data, label=ner_label)



_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_layer (InputLayer)     (None, 30)                0         
_________________________________________________________________
embedding_layer (Embedding)  (None, 30, 128)           197888    
_________________________________________________________________
bidirectional_5 (Bidirection (None, 30, 64)            41216     
_________________________________________________________________
dropout_5 (Dropout)          (None, 30, 64)            0         
_________________________________________________________________
crf_5 (CRF)                  (None, 30, 5)             360       
Total params: 239,464
Trainable params: 239,464
Non-trainable params: 0
_________________________________________________________________
----> (3229, 30)
----> (3229, 30, 1)
Train on 2906 samples, validate on 323 samples
Epoch 1/500


KeyboardInterrupt: 

In [31]:
# Tag the training sentences with the checkpointed weights.
result = ner.predict(data=ner_data, id2chunk=id2chunk)

In [33]:
# Show the predicted tag sequences of the first four sentences.
for tags in result[:4]:
    print(tags)

['B_OT', 'I_OT', 'O', 'B_OT', 'O', 'B_OT', 'I_OT', 'I_OT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['B_OT', 'I_OT', 'O', 'B_AT', 'I_AT', 'B_OT', 'I_OT', 'O', 'O', 'B_OT', 'I_OT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['B_AT', 'B_OT', 'I_OT', 'I_OT', 'O', 'O', 'O', 'B_AT', 'O', 'O', 'O', 'O', 'O', 'B_OT', 'I_OT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['O', 'O', 'O', 'O', 'O', 'B_OT', 'I_OT', 'I_OT', 'O', 'O', 'O', 'O', 'B_OT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [21]:
# Print the tag id predicted at every position of the first sentence.
# `result` holds tag strings (see ner.predict), so map them back through
# chunk2id instead of calling np.argwhere on raw model output, which only
# worked with the stale `result` from an earlier kernel state.
for tag in result[0]:
    print(chunk2id[tag])

3
4
0
3
0
3
4
4
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0


In [36]:
# Inspect the unpadded token ids of the first NER training sentence.
ner_data[0]

array([2, 3, 4, 5, 4, 2, 3, 6])