In [1]:
import os
import re
import yaml
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from keras.models import model_from_yaml

Using TensorFlow backend.


In [2]:
# set parameters:
batch_size = 1024
epochs = 30

In [3]:
def rm_tags(text):
    re_tag = re.compile(r'<[^>]+>')
    return re_tag.sub('', text)

In [4]:
def read_files(filetype):
    path = "dataset/"
    file_list = []

    postive_path = path + filetype + "/pos/"
    for f in os.listdir(postive_path):
        file_list += [postive_path + f]
    
    negative_path = path + filetype + "/neg/"
    for f in os.listdir(negative_path):
        file_list += [negative_path + f]

    print('read', filetype, 'files:', len(file_list))
    
    if filetype == 'train':
        all_labels = ([1] * 12000 + [0] * 12000)
    else:
        all_labels = ([1] * 500 + [0] * 500)
        
    all_texts = []
    for fi in file_list:
        with open(fi, encoding = 'utf8') as file_input:
            all_texts += [rm_tags(" ".join(file_input.readlines()))]

    return all_labels, all_texts

In [5]:
def model_training(x_train, y_train, x_test, y_test):
    model = Sequential()
    model.add(Embedding(output_dim = 64,
                        input_dim = 1000,
                        input_length = 300))
    model.add(Dropout(0.4))
    model.add(Dense(units = 64,
                    activation = 'relu'))      
    model.add(LSTM(32))
    model.add(Dense(units = 128,
                    activation = 'relu'))                    
    model.add(Dropout(0.4))
    model.add(Dense(units = 64,
                    activation = 'relu'))
    model.add(Dropout(0.4))
    model.add(Dense(units = 32,
                    activation = 'relu'))    
    model.add(Dropout(0.4))    
    model.add(Dense(units = 1,
                    activation = 'sigmoid'))
    model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])               
    model.summary()
    model.fit(x_train, y_train, batch_size = batch_size, epochs = epochs, verbose = 2, validation_split = 0.2)

    score = model.evaluate(x_test, y_test, verbose = 1, batch_size = batch_size)
    print(score[1])

    yaml_string = model.to_yaml()
    with open('lstm_data/lstm.yml', 'w') as outfile:
        outfile.write(yaml.dump(yaml_string, default_flow_style=True))
    model.save_weights('lstm_data/lstm.h5')

In [6]:
def sensitive_dict(postive):
    if int(postive) == '1':
        return '正向'
    else:
        return '負向'

In [7]:
y_train, train_text = read_files("train")
y_test, test_text = read_files("test")
token = Tokenizer(num_words = 1000)
token.fit_on_texts(train_text)

x_train_seq = token.texts_to_sequences(train_text)
x_test_seq = token.texts_to_sequences(test_text)

x_train = sequence.pad_sequences(x_train_seq, maxlen = 300)
x_test = sequence.pad_sequences(x_test_seq, maxlen = 300)

print('-------------------------------------------------------------')
print('Start Training................................................')
model_training(x_train, y_train, x_test, y_test)
print('-------------------------------------------------------------')
print('All Finish!................................................')

read train files: 24000
read test files: 1000
-------------------------------------------------------------
Start Training................................................
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 300, 64)           64000     
_________________________________________________________________
dropout_1 (Dropout)          (None, 300, 64)           0         
_________________________________________________________________
dense_1 (Dense)              (None, 300, 64)           4160      
_________________________________________________________________
lstm_1 (LSTM)                (None, 32)                12416     
_________________________________________________________________
dense_2 (Dense)              (None, 128)               4224      
_________________________________________________________________
dropout_2 (Drop

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 19200 samples, validate on 4800 samples
Epoch 1/30
 - 27s - loss: 0.6827 - accuracy: 0.6027 - val_loss: 0.9479 - val_accuracy: 0.0000e+00
Epoch 2/30
 - 27s - loss: 0.6629 - accuracy: 0.6243 - val_loss: 0.9567 - val_accuracy: 0.0000e+00
Epoch 3/30
 - 28s - loss: 0.6304 - accuracy: 0.6248 - val_loss: 0.9508 - val_accuracy: 0.0000e+00
Epoch 4/30
 - 29s - loss: 0.5683 - accuracy: 0.6812 - val_loss: 0.7698 - val_accuracy: 0.6452
Epoch 5/30
 - 30s - loss: 0.4749 - accuracy: 0.7857 - val_loss: 0.7312 - val_accuracy: 0.6727
Epoch 6/30
 - 34s - loss: 0.4028 - accuracy: 0.8314 - val_loss: 0.4560 - val_accuracy: 0.8062
Epoch 7/30
 - 36s - loss: 0.3716 - accuracy: 0.8478 - val_loss: 0.3532 - val_accuracy: 0.8544
Epoch 8/30
 - 36s - loss: 0.3570 - accuracy: 0.8566 - val_loss: 0.4446 - val_accuracy: 0.8175
Epoch 9/30
 - 37s - loss: 0.3385 - accuracy: 0.8682 - val_loss: 0.4045 - val_accuracy: 0.8429
Epoch 10/30
 - 38s - loss: 0.3159 - accuracy: 0.8784 - val_loss: 0.3516 - val_accuracy: 0.858