In [1]:
import os
import re
import yaml
import pandas as pd
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from keras.models import model_from_yaml

Using TensorFlow backend.


In [2]:
def rm_tags(text):
    """Return ``text`` with every ``<...>`` markup tag removed.

    A tag is a ``<`` followed by one or more characters that are not ``>``
    and then a closing ``>``. Each match is replaced by the empty string, so
    the text on either side of a tag is joined without a separator.
    """
    return re.sub(r'<[^>]+>', '', text)

In [3]:
def read_files(filetype):
    """Load the labelled texts of one dataset split from disk.

    Every file in ``dataset/<filetype>/pos/`` is read first, followed by
    every file in ``dataset/<filetype>/neg/``. Each file's lines are joined
    with a single space and passed through ``rm_tags``.

    Args:
        filetype: Name of the split sub-directory under ``dataset/``
            (the notebook uses ``'train'`` and ``'test'``).

    Returns:
        A tuple ``(all_labels, all_texts)``. ``all_labels`` holds ``1`` for
        each file found in ``pos/`` followed by ``0`` for each file found in
        ``neg/``; ``all_texts`` holds the cleaned file contents in the same
        order, so ``all_labels[i]`` always belongs to ``all_texts[i]``.

    Raises:
        OSError: If either directory cannot be listed or a file cannot be
            opened.
    """
    path = "dataset/"
    positive_path = path + filetype + "/pos/"
    negative_path = path + filetype + "/neg/"

    # Files are kept in the order os.listdir reports them.
    positive_files = [positive_path + f for f in os.listdir(positive_path)]
    negative_files = [negative_path + f for f in os.listdir(negative_path)]
    file_list = positive_files + negative_files

    print('read', filetype, 'files:', len(file_list))

    # Build the labels from the files actually present rather than from fixed
    # counts, so labels and texts stay aligned for any dataset size.
    all_labels = [1] * len(positive_files) + [0] * len(negative_files)

    all_texts = []
    for fi in file_list:
        with open(fi, encoding = 'utf8') as file_input:
            all_texts.append(rm_tags(" ".join(file_input.readlines())))

    return all_labels, all_texts

In [4]:
def lstm_predict(input_text, input_List, x_test, y_test):
    """Evaluate the saved LSTM model and write its predictions to ``output.csv``.

    The model architecture is read from ``lstm_data/lstm.yml`` and its
    weights from ``lstm_data/lstm.h5``. The accuracy on ``(x_test, y_test)``
    is printed, then every element of ``input_List`` is classified and the
    results are written to ``output.csv`` with one row per element.

    Args:
        input_text: Not used by this function; kept so existing callers
            keep working.
        input_List: Iterable of model inputs. Each element is passed to
            ``model.predict_classes`` on its own and the value at ``[0][0]``
            of the result is taken as the predicted class.
        x_test: Evaluation inputs forwarded to ``model.evaluate``.
        y_test: Evaluation labels forwarded to ``model.evaluate``.

    Returns:
        None. The output is the printed accuracy and the ``output.csv`` file
        with columns ``Id`` (1-based position in ``input_List``) and
        ``Label`` (``'pos'`` or ``'neg'``).
    """
    with open('lstm_data/lstm.yml', 'r') as f:
        # yaml.load() without a Loader warns on PyYAML 5.x and fails on
        # PyYAML >= 6; safe_load also avoids constructing arbitrary objects.
        # NOTE(review): assumes the file holds a plain YAML string, since the
        # loaded value is handed straight to model_from_yaml — confirm.
        yaml_string = yaml.safe_load(f)
    model = model_from_yaml(yaml_string)
    model.load_weights('lstm_data/lstm.h5')
    model.compile(loss = 'binary_crossentropy',optimizer = 'adam',metrics = ['accuracy'])
    score = model.evaluate(x_test, y_test, verbose = 1, batch_size = 1024)
    print('準確度 : ' + str(score[1]))

    csv_lst = []
    for i, item in enumerate(input_List, start=1):
        predict_result = model.predict_classes(item)
        csv_lst.append([i, sensitive_dict(predict_result[0][0])])

    # One row per input. The frame used to be appended to itself before
    # saving, which wrote every prediction twice. to_csv creates or overwrites
    # the file, so no separate truncation step is needed.
    df = pd.DataFrame(data=csv_lst, columns=['Id', 'Label'])
    df.to_csv('output.csv', sep=',', encoding='utf_8_sig', index=False)
    

In [5]:
def sensitive_dict(postive):
    """Translate a predicted class value into its sentiment label.

    ``postive`` is converted with ``int()``; a result of ``1`` gives
    ``'pos'`` and any other integer gives ``'neg'``.
    """
    return 'pos' if int(postive) == 1 else 'neg'

In [6]:
# --- Load both labelled splits ------------------------------------------
y_train, train_text = read_files("train")
y_test, test_text = read_files("test")

# --- Fit the tokenizer on the training texts only -----------------------
token = Tokenizer(num_words = 1000)
token.fit_on_texts(train_text)

# --- Encode each split as integer sequences, then pad to length 300 -----
x_train_seq = token.texts_to_sequences(train_text)
x_train = sequence.pad_sequences(x_train_seq, maxlen = 300)

x_test_seq = token.texts_to_sequences(test_text)
x_test = sequence.pad_sequences(x_test_seq, maxlen = 300)

# --- Read the texts to classify: column 1 of a header-less CSV ----------
raw_csv = pd.read_csv('test_dataset.csv', header = None)
df = pd.DataFrame(raw_csv[1].astype(str))

# Each row becomes its own padded one-sample batch; the literal
# '[換行字元]' placeholder is turned back into a newline before tokenising.
input_lst = [
    sequence.pad_sequences(
        token.texts_to_sequences([row_text.replace('[換行字元]', '\n')]),
        maxlen = 300,
    )
    for row_text in df[1]
]

# --- Evaluate the saved model and write the predictions -----------------
print('-------------------------------------------------------------')
print('Start Predicting................................................')
lstm_predict(df[1], input_lst, x_test, y_test)
print('-------------------------------------------------------------')
print('All Finish!................................................')


read train files: 24000
read test files: 1000
-------------------------------------------------------------
Start Predicting................................................


  This is separate from the ipykernel package so we can avoid doing imports until


準確度 : 0.8389999866485596
-------------------------------------------------------------
All Finish!................................................
