In [62]:
!pip install konlpy
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import re
import urllib.request
from konlpy.tag import Okt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import *
from tensorflow.keras.losses import *
from tensorflow.nn import *


from tensorflow.keras.layers import *
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt", filename="ratings_train.txt")
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt", filename="ratings_test.txt")





('ratings_test.txt', <http.client.HTTPMessage at 0x7fa1e43cf0b8>)

In [63]:
def preprocessing(data):
  data = data.drop_duplicates(subset=['document'])
  data = data.dropna(how = 'any')
  data['document'] = data['document'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]","")
  data['document'] = data['document'].replace('', np.nan)
  data = data.dropna(how = 'any')
  return data

In [64]:
def Token(data, okt, stopwords):
  now = 0
  res = list()
  print("start token")
  for sentence in data['document']:
    if now%10000 == 0:
      print(f"token : {now}/{len(data)}")
    now = now +1
    temp = list()
    temp = okt.morphs(sentence, stem=True)
    temp = [word for word in temp if not word in stopwords]
    res.append(temp)
  print("end token")
  return res

In [65]:
def Tokenizing(data, tokenizer):
  return tokenizer.texts_to_sequences(data)

In [66]:
def rmEmpty(data, label):
  drop_data = [index for index, sentence in enumerate(X_train) if len(sentence) < 1]
  data = np.delete(data, drop_data, axis=0)
  label = np.delete(label, drop_data, axis=0)
  return data, label

In [67]:
def sentiment_predict(sentence, okt, stopwords, tokenizer):
  sentence = okt.morphs(sentence, stem=True)
  sentence = [word for word in sentence if not word in stopwords]
  encoded = tokenizer.texts_to_sequences([sentence])
  padding_sentence = pad_sequences(encoded, maxlen = 30)
  score = float(loaded_model.predict(padding_sentence))
  if score > 0.5:
    print(f"긍정 / Score : {score}")
  else:
    print(f"부정 / Score : {score}")


In [68]:
# 데이터 tokenizing 하기
train_data = pd.read_table('ratings_train.txt')
test_data = pd.read_table('ratings_test.txt')
X_train = list()
X_test = list()
okt = Okt()
stopwords = ['의','가','이','은','들','는','좀','잘','걍','과','도','를','으로','자','에','와','한','하다']

train_data = preprocessing(train_data)
test_data = preprocessing(test_data)

X_train = Token(train_data, okt, stopwords)
X_test = Token(test_data, okt, stopwords)

tokenizer = Tokenizer(19417, oov_token = 'OOV')
tokenizer.fit_on_texts(X_train)

X_train = Tokenizing(X_train, tokenizer)
X_test = Tokenizing(X_test, tokenizer)


start token
token : 0/145791
token : 10000/145791
token : 20000/145791
token : 30000/145791
token : 40000/145791
token : 50000/145791
token : 60000/145791
token : 70000/145791
token : 80000/145791
token : 90000/145791
token : 100000/145791
token : 110000/145791
token : 120000/145791
token : 130000/145791
token : 140000/145791
end token
start token
token : 0/48995
token : 10000/48995
token : 20000/48995
token : 30000/48995
token : 40000/48995
end token


In [69]:
# label 데이터 생성
Y_train = np.array(train_data['label'])
Y_test = np.array(test_data['label'])

X_train, Y_train = rmEmpty(X_train, Y_train)

# padding
X_train = pad_sequences(X_train, maxlen = 30, padding='post')
X_test = pad_sequences(X_test, maxlen = 30, padding='post')

In [70]:
print(len(X_train))

145380


In [71]:
earlyStop = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=4)
modelCP = ModelCheckpoint('best_model.h5', monitor='val_acc', mode='max', verbose=1, save_best_only=True)


In [72]:
# Model
input = Input(shape=(30,))
x = Embedding(19417, 128)(input)
x = LeakyReLU()(x)
x = Dropout(0.5)(x)
x = Bidirectional(LSTM(32, return_sequences=True))(x)
x = LeakyReLU()(x)
output = Dense(1, activation='sigmoid')(x)

model = Model(inputs = [input], outputs=output)

optimizer = Adam(learning_rate=0.0001)
loss_function = BinaryCrossentropy()
model.compile(optimizer=optimizer, loss=loss_function, metrics=['acc'])
model.summary()

Model: "functional_53"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_31 (InputLayer)        [(None, 30)]              0         
_________________________________________________________________
embedding_30 (Embedding)     (None, 30, 128)           2485376   
_________________________________________________________________
leaky_re_lu_27 (LeakyReLU)   (None, 30, 128)           0         
_________________________________________________________________
dropout_31 (Dropout)         (None, 30, 128)           0         
_________________________________________________________________
bidirectional_7 (Bidirection (None, 30, 64)            41216     
_________________________________________________________________
leaky_re_lu_28 (LeakyReLU)   (None, 30, 64)            0         
_________________________________________________________________
dense_27 (Dense)             (None, 30, 1)           

In [73]:
history = model.fit(X_train, Y_train, epochs=15, callbacks=[earlyStop, modelCP], batch_size=128, validation_split=0.2)

Epoch 1/15
Epoch 00001: val_acc improved from -inf to 0.81383, saving model to best_model.h5
Epoch 2/15
Epoch 00002: val_acc improved from 0.81383 to 0.83337, saving model to best_model.h5
Epoch 3/15
Epoch 00003: val_acc improved from 0.83337 to 0.83809, saving model to best_model.h5
Epoch 4/15
Epoch 00004: val_acc improved from 0.83809 to 0.84154, saving model to best_model.h5
Epoch 5/15
Epoch 00005: val_acc improved from 0.84154 to 0.84248, saving model to best_model.h5
Epoch 6/15
Epoch 00006: val_acc improved from 0.84248 to 0.84260, saving model to best_model.h5
Epoch 7/15
Epoch 00007: val_acc improved from 0.84260 to 0.84351, saving model to best_model.h5
Epoch 8/15
Epoch 00008: val_acc improved from 0.84351 to 0.84362, saving model to best_model.h5
Epoch 9/15
Epoch 00009: val_acc did not improve from 0.84362
Epoch 10/15
Epoch 00010: val_acc did not improve from 0.84362
Epoch 11/15
Epoch 00011: val_acc did not improve from 0.84362
Epoch 00011: early stopping


Embedding 128, LSTM 128, Dense 1 sigmoid   
Score : 0.8499

Embedding 128, LSTM 128, Dense 1 sigmoid, dropout 0.5
Score : 0.8505

Embedding 128, LSTM 128, Dense 1 sigmoid, dropout 0.5, resnet dropout 0.5
Score : 0.8531

0.8451

padding post 

# Model
Embedding 100, LSTM 128, Dense 1 sigmoid, batch_size = 60
val_acc : 0.8605

Embedding 256, LSTM 256, Dense 1 sigmoid, batch_size = 60
val_acc : 0.8591

Embedding 256, LSTM 256, Dense 1 sigmoid, batch_size = 128
val_acc : 0.8550

Embedding 256, LSTM 256, Dense 1 sigmoid, batch_size = 128, adam
val_acc : 0.8481

Embedding 256, LSTM 256, Dense 1 sigmoid, batch_size = 128, adam, dropout 0.2
val_acc : 0.8510

Embedding 256, LSTM 256, Dense 1 sigmoid, batch_size = 128, adam, dropout 0.5
val_acc : 0.8490

Embedding 256, LSTM 256, Dense 1 sigmoid, batch_size = 128, adam, dropout 0.5
val_acc : 0.8525

Embedding 256, GRU 256, Dense 1 sigmoid, batch_size = 128, adam, dropout 0.5
val_acc : 0.8537

Embedding 256, GRU 256, Dense 1 sigmoid, batch_size = 128, adam, change dropout 0.5
val_acc : 0.8552

Embedding 256, GRU 256, Dense 1 sigmoid, batch_size = 128, adam, change dropout 0.5, amsgrad
val_acc : 0.8556

In [None]:
loaded_model = load_model('best_model.h5')

sentiment_predict("이 영화 재미없어요...", okt, stopwords, tokenizer)

부정 / Score : 0.028914857655763626
