# In [None]:
"""
2019 08 07
RNN 실습하기 - 영화리뷰 실습하기
"""

#### 데이터 전처리
import re
import konlpy
import pandas as pd
from konlpy.tag import Okt

# Load the tab-separated training and test review files.
train_data = pd.read_table('./data/ratings_train.txt')
test_data = pd.read_table('./data/ratings_test.txt')

# Drop every row that has a missing value in any column.
train_data = train_data.dropna(how='any')
test_data = test_data.dropna(how='any')

# Keep only Hangul (jamo and syllables) and spaces; strip everything else.
# regex=True is passed explicitly: pandas 2.0 changed the default of
# Series.str.replace to literal matching, which would silently leave the
# text untouched.
rgexp = '[^ㄱ-ㅎㅏ-ㅣ가-힣 ]'
train_data['document'] = train_data['document'].str.replace(
    rgexp, '', regex=True)
test_data['document'] = test_data['document'].str.replace(
    rgexp, '', regex=True)

print(train_data['document'])
# Stopwords (particles, endings, conjunctions) removed from every review.
stopwords = ['의','가','이','은','들','는','좀','과','도','를','으로','에','와']

okt = Okt()


def _tokenize_reviews(documents):
    """Split each review into morphemes and drop stopwords.

    Args:
        documents: iterable of review strings.

    Returns:
        A list with one token list per review, produced by
        ``okt.morphs(..., stem=True)`` with every token found in
        ``stopwords`` removed.
    """
    # Set membership avoids rescanning the stopword list for every token.
    excluded = set(stopwords)
    return [
        [word for word in okt.morphs(sentence, stem=True)
         if word not in excluded]
        for sentence in documents
    ]


# Tokenize the training reviews (one morphological analysis per review).
x_train = _tokenize_reviews(train_data['document'])

# Preview the first five tokenized training reviews; slicing keeps this safe
# when fewer than five reviews are present.
for preview in x_train[:5]:
    print(preview)

# Tokenize the test reviews with exactly the same procedure.
x_test = _tokenize_reviews(test_data['document'])

# Preview the first five tokenized test reviews.
for preview in x_test[:5]:
    print(preview)

# Integer-encode the tokenized reviews.
from keras.preprocessing.text import Tokenizer

# The word index is fitted on the training tokens only; num_words=30000 is
# the vocabulary cap handed to the Keras Tokenizer.
tokenizer = Tokenizer(num_words=30000)
tokenizer.fit_on_texts(x_train)

# Replace both token-list collections with their integer sequences, using
# the index learned from the training data.
x_train, x_test = (
    tokenizer.texts_to_sequences(x_train),
    tokenizer.texts_to_sequences(x_test),
)


#### Deep learning
from keras.layers import Embedding, Dense, LSTM
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences

# Prepare the labels as plain NumPy arrays: some Keras versions reject pandas
# Series when `validation_split` is used, and after dropna() the Series index
# is no longer contiguous.
train_label = train_data['label'].values
test_label = test_data['label'].values

# Bring every integer sequence to the same fixed length via pad_sequences
# (this is padding/truncation, not normalization).
max_len = 30
x_train = pad_sequences(x_train, maxlen=max_len)
x_test = pad_sequences(x_test, maxlen=max_len)

# Build the model: embedding -> LSTM -> single sigmoid output.
model = Sequential()

# The embedding input size is read from the tokenizer so it cannot drift
# away from the vocabulary cap used during integer encoding.
model.add(Embedding(tokenizer.num_words, 100))
model.add(LSTM(128))
model.add(Dense(1, activation='sigmoid'))

# Configure training for binary classification.
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# Train, holding out 20% of the training data for validation.
model.fit(x_train,
          train_label,
          epochs=3,
          batch_size=60,
          validation_split=0.2)

# Evaluate on the test set; result holds the loss followed by the accuracy
# metric requested in compile().
result = model.evaluate(x_test, test_label)
print('loss =', result[0])
print('accuracy=', result[1])