In [7]:
from keras.models import Model
from keras.layers import Input, Embedding, Bidirectional, LSTM, Dropout, ZeroPadding1D, Conv1D, Dense, TimeDistributed, concatenate, Flatten
from keras.layers import AveragePooling1D
from keras_contrib.layers import CRF
import keras.backend as K
from keras.callbacks import ModelCheckpoint,Callback,EarlyStopping
from keras.preprocessing.sequence import pad_sequences
import numpy as np
from keras.utils import to_categorical


In [40]:

class LossHistory(Callback):
    def on_train_begin(self, logs={}):
        self.losses = []

    def on_batch_end(self, batch, logs={}):
        self.losses.append(logs.get('loss'))
        

class NERModel:
    def __init__(self, maxlen, word_dict_size, word_vec_size, class_label_count):
        self.maxlen = maxlen
        self.word_dict_size = word_dict_size
        self.word_vec_size = word_vec_size
        self.class_label_count = class_label_count
        self.model = self._build_model()
        
    def _build_model(self):
        input_layer = Input(shape=(self.maxlen,), dtype='int32', name='input_layer')
        embedding_layer = Embedding(self.word_dict_size, self.word_vec_size, name='embedding_layer')(input_layer)
        bilstm = Bidirectional(LSTM(32, return_sequences=True))(embedding_layer)
        bilstm_d = Dropout(0.1)(bilstm)
#         half_window_size = 2
#         paddinglayer = ZeroPadding1D(padding=half_window_size)(embedding_layer)

#         conv = Conv1D(nb_filter=50, filter_length=(2 * half_window_size + 1), border_mode='valid')(paddinglayer)
#         conv_d = Dropout(0.1)(conv)
#         dense_conv = TimeDistributed(Dense(50))(conv_d)
#         rnn_cnn_merge = concatenate([bilstm_d, dense_conv], axis=2)
#         dense = TimeDistributed(Dense(self.class_label_count))(rnn_cnn_merge)
        crf = CRF(self.class_label_count, sparse_target=True)
#         crf_output = crf(dense)
        crf_output = crf(bilstm_d)
        model = Model(input=[input_layer], output=[crf_output])
        model.compile(loss=crf.loss_function, optimizer='adam', metrics=[crf.accuracy])
        model.summary()
    
        return model
    
    def train(self, data, label):
        checkpointer = ModelCheckpoint(filepath="../model/bilstm_1102_k205_tf130.w", verbose=0, save_best_only=True, save_weights_only=True) #save_weights_only=True
        history = LossHistory()
        earlystop = EarlyStopping(monitor='val_loss', patience=10, verbose=2, mode='min')
        
        data = pad_sequences(data, self.maxlen, padding = 'post', truncating='post')
        label = pad_sequences(label, self.maxlen, padding = 'post', truncating='post')
        label = np.expand_dims(label,2)
        print('---->',data.shape)
        print('---->',label.shape)
        self.model.fit(data, label,
                       batch_size=32, epochs=500,#validation_data = ([x_test, seq_lens_test], y_test),
                       callbacks=[checkpointer, history,earlystop],
                       verbose=1,
                       validation_split=0.1,
                      )
        
    def predict(self, data, id2chunk):
        output_result = []
        self.model.load_weights("../model/bilstm_1102_k205_tf130.w")
        data = pad_sequences(data, self.maxlen, padding = 'post', truncating='post')
        result = self.model.predict(data)
        for i in range(len(result)):
            output_result.append([id2chunk.get(item[1]) for item in np.argwhere(result[i])])
        return output_result
    

In [41]:
class PolaritiesClassifier:
    def __init__(self, content_max_len, aspect_max_len, option_max_len, word_dict_size, word_vec_size):
        self.content_max_len = content_max_len
        self.aspect_max_len = aspect_max_len
        self.option_max_len = option_max_len
        self.word_vec_size = word_vec_size
        self.word_dict_size = word_dict_size
        self.model = self._build_model()
        
    def _build_model(self):
        content_input_layer = Input(shape=(self.content_max_len,), dtype='int32', name='content_input_layer')
        aspect_input_layer = Input(shape=(self.aspect_max_len,), dtype='int32', name='aspect_input_layer')
        option_input_layer = Input(shape=(self.option_max_len,), dtype='int32', name='option_input_layer')
        shared_embedding_layer = Embedding(self.word_dict_size, self.word_vec_size, name='embedding_layer')
        content_vec = shared_embedding_layer(content_input_layer)
        aspect_vec = shared_embedding_layer(aspect_input_layer)
        option_vec = shared_embedding_layer(option_input_layer)
        
        half_window_size = 2
        padding = ZeroPadding1D(padding=half_window_size)
        
        content_padding_layer = padding(content_vec)
        aspect_padding_layer = padding(aspect_vec)
        option_padding_layer = padding(option_vec)
        
        conv_layer = Conv1D(nb_filter=50, filter_length=(2 * half_window_size + 1), border_mode='valid')
        pooling_layer = AveragePooling1D(2, 2,name='pooling')
        content_conv = conv_layer(content_padding_layer)
        aspect_conv = conv_layer(aspect_padding_layer)
        option_conv = conv_layer(option_padding_layer)
        
        content_dense_conv = pooling_layer(content_conv)
        aspect_dense_conv = pooling_layer(aspect_conv)
        option_dense_conv = pooling_layer(option_conv)

        cnn_merge = concatenate([content_dense_conv, aspect_dense_conv, option_dense_conv], axis=1)
        dense = Flatten()(cnn_merge)
        output = Dense(1, activation='sigmoid')(dense)
        model = Model(input=[content_input_layer, aspect_input_layer, option_input_layer], output=output)
        
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        model.summary()
        return model
    
    def train(self, content_data, aspect_data, option_data, label):
        checkpointer = ModelCheckpoint(filepath="../model/polarity.w", verbose=0, save_best_only=True, save_weights_only=True) #save_weights_only=True
        history = LossHistory()
        earlystop = EarlyStopping(monitor='val_loss', patience=10, verbose=2, mode='min')
        
        content_data = pad_sequences(content_data, self.content_max_len, padding = 'post', truncating='post')
        aspect_data = pad_sequences(aspect_data, self.aspect_max_len, padding = 'post', truncating='post')
        option_data = pad_sequences(option_data, self.option_max_len, padding = 'post', truncating='post')
        self.model.fit([content_data, aspect_data, option_data], label,
                       batch_size=32, epochs=500,#validation_data = ([x_test, seq_lens_test], y_test),
                       callbacks=[checkpointer, history,earlystop],
                       verbose=1,
                       validation_split=0.1,
                      )
            
    def predict(self, content_data, aspect_data, option_data, id2polarities):
        output_result = []
        self.model.load_weights("../model/polarity.w")
        content_data = pad_sequences(content_data, self.content_max_len, padding = 'post', truncating='post')
        aspect_data = pad_sequences(aspect_data, self.aspect_max_len, padding = 'post', truncating='post')
        option_data = pad_sequences(option_data, self.option_max_len, padding = 'post', truncating='post')
        data = [content_data, aspect_data, option_data]
        result = self.model.predict(data)
        
        for item in result:
            if item[0] > 0.5:
                output_result.append('正面')
            else:
                output_result.append('负面')
        return output_result


In [42]:
class CategoriesClassifier:
    def __init__(self, content_max_len, aspect_max_len, option_max_len, categories_num, word_dict_size, word_vec_size):
        self.content_max_len = content_max_len
        self.aspect_max_len = aspect_max_len
        self.option_max_len = option_max_len
        self.categories_num = categories_num
        self.word_vec_size = word_vec_size
        self.word_dict_size = word_dict_size
        self.model = self._build_model()
        
    def _build_model(self):
        content_input_layer = Input(shape=(self.content_max_len,), dtype='int32', name='content_input_layer')
        aspect_input_layer = Input(shape=(self.aspect_max_len,), dtype='int32', name='aspect_input_layer')
        option_input_layer = Input(shape=(self.option_max_len,), dtype='int32', name='option_input_layer')
        shared_embedding_layer = Embedding(self.word_dict_size, self.word_vec_size, name='embedding_layer')
        content_vec = shared_embedding_layer(content_input_layer)
        aspect_vec = shared_embedding_layer(aspect_input_layer)
        option_vec = shared_embedding_layer(option_input_layer)
        
        half_window_size = 2
        padding = ZeroPadding1D(padding=half_window_size)
        
        content_padding_layer = padding(content_vec)
        aspect_padding_layer = padding(aspect_vec)
        option_padding_layer = padding(option_vec)
        
        conv_layer = Conv1D(nb_filter=50, filter_length=(2 * half_window_size + 1), border_mode='valid')
        pooling_layer = AveragePooling1D(2, 2,name='pooling')
        content_conv = conv_layer(content_padding_layer)
        aspect_conv = conv_layer(aspect_padding_layer)
        option_conv = conv_layer(option_padding_layer)
        
        content_dense_conv = pooling_layer(content_conv)
        aspect_dense_conv = pooling_layer(aspect_conv)
        option_dense_conv = pooling_layer(option_conv)

        cnn_merge = concatenate([content_dense_conv, aspect_dense_conv, option_dense_conv], axis=1)
        dense = Flatten()(cnn_merge)
        output = Dense(self.categories_num, activation='softmax')(dense)
        model = Model(input=[content_input_layer, aspect_input_layer, option_input_layer], output=output)
        
        model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
        model.summary()
        return model
    
    def train(self, content_data, aspect_data, option_data, label):
        checkpointer = ModelCheckpoint(filepath="../model/categories.w", verbose=0, save_best_only=True, save_weights_only=True) #save_weights_only=True
        history = LossHistory()
        earlystop = EarlyStopping(monitor='val_loss', patience=20, verbose=2, mode='min')
        
        content_data = pad_sequences(content_data, self.content_max_len, padding = 'post', truncating='post')
        aspect_data = pad_sequences(aspect_data, self.aspect_max_len, padding = 'post', truncating='post')
        option_data = pad_sequences(option_data, self.option_max_len, padding = 'post', truncating='post')
        encoded=to_categorical(label)
        self.model.fit([content_data, aspect_data, option_data], encoded,
                       batch_size=32, epochs=500,#validation_data = ([x_test, seq_lens_test], y_test),
                       callbacks=[checkpointer, history,earlystop],
                       verbose=1,
                       validation_split=0.1,
                      )
            
    def predict(self, content_data, aspect_data, option_data, id2categories):
        output_result = []
        self.model.load_weights("../model/categories.w")
        content_data = pad_sequences(content_data, self.content_max_len, padding = 'post', truncating='post')
        aspect_data = pad_sequences(aspect_data, self.aspect_max_len, padding = 'post', truncating='post')
        option_data = pad_sequences(option_data, self.option_max_len, padding = 'post', truncating='post')
        data = [content_data, aspect_data, option_data]
        result = self.model.predict(data)

        for i in range(len(result)):
            output_result.append(id2categories.get(str(np.argmax(result[i]))))
        return output_result

In [54]:
import json
import numpy as np
import jieba
import pandas as pd
class GetData:
    def __init__(self, granu):
        self.data_save_path_root = '../data/'
        self.chunk_tags = ['O', 'B_AT', 'I_AT', 'B_OT', 'I_OT']
        self.categories = ['价格', '使用体验', '其他', '功效', '包装', '尺寸', '成分', '整体', '新鲜度', '服务', '气味', '物流', '真伪']
        self.chunk2id, self.id2chunk, self.categories2id, self.id2categories = self._reshape_data('all')
        self.polarities2id = {'正面': 1, '负面': 0}
        self.id2polarities = {1: '正面', 0: '负面'}
        fr = open('../data/aspect_opinion_' + granu + '_data.json', 'r')
        self.polarities_categories_train_data = json.load(fr)
        self.granu = granu
    def _reshape_data(self, mode):
        """
            mode ->  'all' : 标注在一起
                     ->  'at' : AspectTerm only
                     ->  'ot' : OpinionTerms only
        """
        if mode == 'all':
            chunk_tags = self.chunk_tags
        elif mode == 'at':
            chunk_tags = ['O', 'B_AT', 'I_AT']
        elif mode == 'ot':
            chunk_tags = ['O', 'B_OT', 'I_OT']
            
        chunk2id = {item: _id for _id, item in enumerate(chunk_tags)}
        id2chunk = {v: k for k, v in chunk2id.items()}
        categories2id = {item: _id for _id, item in enumerate(self.categories)}
        id2categories = {v: k for k, v in categories2id.items()}
        return chunk2id, id2chunk, categories2id, id2categories
    
    def get_ner_train_data(self):
        with open(self.data_save_path_root + self.granu + '_level.json', 'r', encoding='utf8') as fr:
            vocab_dict = json.load(fr)
        with open(self.data_save_path_root + 'ner_' + self.granu + '_data.json', 'r', encoding='utf8') as fr:
            ner_data = json.load(fr)
        with open(self.data_save_path_root + 'ner_' + self.granu + '_label.json', 'r', encoding='utf8') as fr:
            ner_label = json.load(fr)
            
            
        vocab2id = vocab_dict.get(self.granu + '2id')
        id2vocab = vocab_dict.get('id2' + self.granu)
        
        for i in range(len(ner_data)):
            ner_data[i] = np.asarray([vocab2id.get(item, 1) for item in ner_data[i]])
            
        ner_data = np.asarray(ner_data)
        
        for i in range(len(ner_label)):
            ner_label[i] = np.asarray([self.chunk2id.get(item, 1) for item in ner_label[i]])
            
        ner_data = np.asarray(ner_data)
        ner_label = np.asarray(ner_label)
        
        
        return self.chunk2id, self.id2chunk, vocab2id, id2vocab, ner_data, ner_label
    
    def get_categories_train_data(self):
        return self.polarities_categories_train_data.get('content'), self.polarities_categories_train_data.get('aspect'), self.polarities_categories_train_data.get('opinion'), self.polarities_categories_train_data.get('category'), self.polarities_categories_train_data.get('id2categories')
    def get_polarities_train_data(self):
        return self.polarities_categories_train_data.get('content'), self.polarities_categories_train_data.get('aspect'), self.polarities_categories_train_data.get('opinion'), self.polarities_categories_train_data.get('polarity'), self.id2polarities
    
    def get_test_data(self, test_file_path, vocab2id):
        test_id = []
        test_data = []
        table = pd.read_csv(test_file_path)
        for index, row in table.iterrows():
            _id = row['id']
            content = row['Reviews']
            if self.granu == 'char':
                words = list(content)
            else:
                words = jieba.lcut(content)
            test_id.append(_id)
            test_data.append([vocab2id.get(w, 1) for w in words])
            
        return test_id, test_data

In [55]:
get_data_obj = GetData('char')
ids, data = get_data_obj.get_test_data('../data/Train_reviews.csv',vocab2id)

In [44]:
get_data_obj = GetData('char')
chunk2id, id2chunk, vocab2id, id2vocab, ner_data, ner_label = get_data_obj.get_ner_train_data()
content_data, aspect_data, opinion_data, category_label, id2categories = get_data_obj.get_categories_train_data()
content_data, aspect_data, opinion_data, polarity_label, id2polarities = get_data_obj.get_polarities_train_data()

In [45]:
cc = CategoriesClassifier(content_max_len = 69, aspect_max_len=8, option_max_len=12, categories_num=len(id2categories), word_dict_size=len(vocab2id), word_vec_size=128)
cc.train(content_data, aspect_data, opinion_data, category_label)
result = cc.predict(content_data, aspect_data, opinion_data, id2categories)
result

In [46]:
pc = PolaritiesClassifier(content_max_len = 69, aspect_max_len=8, option_max_len=12, word_dict_size=len(vocab2id), word_vec_size=128)
pc.train(content_data, aspect_data, opinion_data, polarity_label)
result = pc.predict(content_data, aspect_data, opinion_data, id2categories)
result



__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
content_input_layer (InputLayer (None, 69)           0                                            
__________________________________________________________________________________________________
aspect_input_layer (InputLayer) (None, 8)            0                                            
__________________________________________________________________________________________________
option_input_layer (InputLayer) (None, 12)           0                                            
__________________________________________________________________________________________________
embedding_layer (Embedding)     multiple             165120      content_input_layer[0][0]        
                                                                 aspect_input_layer[0][0]         
          

['正面',
 '正面',
 '正面',
 '正面',
 '负面',
 '正面',
 '负面',
 '负面',
 '负面',
 '负面',
 '负面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '负面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '负面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '负面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '负面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '负面',
 '负面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '负面',
 '负面',
 '正面',
 '负面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',
 '正面',

In [47]:
ner = NERModel(maxlen = 30, word_dict_size = len(vocab2id), word_vec_size = 128, class_label_count = len(chunk2id))
ner.train(ner_data, ner_label )
result = ner.predict(ner_data, id2chunk)



_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_layer (InputLayer)     (None, 30)                0         
_________________________________________________________________
embedding_layer (Embedding)  (None, 30, 128)           165120    
_________________________________________________________________
bidirectional_2 (Bidirection (None, 30, 64)            41216     
_________________________________________________________________
dropout_2 (Dropout)          (None, 30, 64)            0         
_________________________________________________________________
crf_2 (CRF)                  (None, 30, 5)             360       
Total params: 206,696
Trainable params: 206,696
Non-trainable params: 0
_________________________________________________________________
----> (3229, 30)
----> (3229, 30, 1)
Train on 2906 samples, validate on 323 samples
Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoc

In [58]:
result

[['B_OT',
  'I_OT',
  'O',
  'B_OT',
  'I_OT',
  'O',
  'B_OT',
  'I_OT',
  'I_OT',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['B_OT',
  'I_OT',
  'O',
  'B_AT',
  'I_AT',
  'I_AT',
  'I_AT',
  'B_OT',
  'I_OT',
  'I_OT',
  'O',
  'O',
  'O',
  'B_OT',
  'I_OT',
  'I_OT',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['B_AT',
  'I_AT',
  'B_OT',
  'I_OT',
  'I_OT',
  'I_OT',
  'O',
  'O',
  'O',
  'B_AT',
  'I_AT',
  'I_AT',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B_OT',
  'I_OT',
  'I_OT',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B_OT',
  'I_OT',
  'I_OT',
  'B_OT',
  'I_OT',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B_OT',
  'I_OT',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['B_AT',
  'I_AT',
  'I_AT',
  'B_OT',
  