In [2]:
!pip install konlpy
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import re
import urllib.request
from konlpy.tag import Okt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import *
from tensorflow.keras.losses import *
from tensorflow.nn import *
from nltk.util import ngrams
from sklearn.model_selection import *
from tensorflow.keras.utils import plot_model

from tensorflow.keras.activations import *
from tensorflow.keras.layers import *
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Download the train and test rating files of the e9t/nsmc GitHub repository
# into the current working directory. The files are fetched again on every run.
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt", filename="ratings_train.txt")
# As the last expression of the cell, this call's return value is what the cell displays.
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt", filename="ratings_test.txt")





('ratings_test.txt', <http.client.HTTPMessage at 0x7f9079ad1198>)

In [2]:
def preprocessing(data):
  """Clean a review DataFrame before tokenization.

  Steps, in order: drop rows whose 'document' duplicates an earlier row,
  drop rows containing any missing value, delete every character of
  'document' that is not a Hangul jamo/syllable or a space, and finally drop
  rows whose 'document' became the empty string.

  Args:
    data: DataFrame with a 'document' text column.

  Returns:
    A new, cleaned DataFrame. The frame passed in is not modified.
  """
  data = data.drop_duplicates(subset=['document']).dropna(how='any')
  # regex=True is required: pandas >= 2.0 treats the pattern as a literal
  # string by default, which would silently leave the text untouched.
  cleaned = data['document'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "", regex=True)
  # assign() builds a new frame instead of writing a column into a frame that
  # pandas may consider a slice (avoids chained-assignment warnings).
  data = data.assign(document=cleaned.replace('', np.nan))
  return data.dropna(how='any')

In [4]:
def Token(data, okt, stopwords):
  """Split every review in data['document'] into a list of its characters.

  A progress line is printed before each 10000th review. `okt` and
  `stopwords` are accepted but not used by this character-level tokenizer.

  Returns:
    A list holding, for each review, the list of its single characters.
  """
  total = len(data)
  chars_per_review = []

  print("start token")
  for idx, review in enumerate(data['document']):
    if idx % 10000 == 0:
      print(f"token : {idx}/{total}")
    chars_per_review.append([review[pos] for pos in range(len(review))])
  print("end token")
  return chars_per_review

In [5]:
def Tokenizing(data, tokenizer):
  """Encode tokenized texts as integer sequences.

  Delegates to `tokenizer.texts_to_sequences(data)` and returns its result
  unchanged.
  """
  sequences = tokenizer.texts_to_sequences(data)
  return sequences

In [6]:
def rmEmpty(data, label):
  """Drop empty sequences from `data` together with their labels.

  Args:
    data: sequence of token-id sequences; rows may differ in length.
    label: array-like of labels aligned with `data`.

  Returns:
    (data, label): `data` as a 1-D object array holding the non-empty
    sequences, and `label` as an array of the matching labels.

  Raises:
    ValueError: if `data` and `label` do not have the same length.
  """
  label = np.asarray(label)
  if len(data) != len(label):
    raise ValueError(f"data has {len(data)} rows but label has {len(label)}")
  # Inspect the `data` argument itself; the previous version scanned the
  # notebook-level X_train regardless of what was passed in.
  keep = np.array([len(sentence) >= 1 for sentence in data], dtype=bool)
  # Fill an object array element by element so rows of different lengths are
  # stored as-is; NumPy >= 1.24 refuses to build an array from ragged lists,
  # which is what np.delete on the raw list attempted.
  kept = np.empty(int(keep.sum()), dtype=object)
  position = 0
  for sentence, wanted in zip(data, keep):
    if wanted:
      kept[position] = sentence
      position += 1
  return kept, label[keep]

In [None]:
def sentiment_predict(sentence, okt, stopwords, tokenizer, model=None, maxlen=100):
  """Print the predicted sentiment of one raw review sentence.

  The sentence is prepared the same way as the training data in this
  notebook: characters other than Hangul and spaces are removed, the text is
  split into single characters, encoded with `tokenizer`, and post-padded to
  `maxlen`. The earlier version fed Okt morphemes padded to 30 into a model
  trained on characters padded to 100, so most tokens were unknown to the
  tokenizer.

  Args:
    sentence: raw review text.
    okt: unused; kept so existing calls keep working.
    stopwords: unused; kept so existing calls keep working.
    tokenizer: fitted tokenizer exposing `texts_to_sequences`.
    model: model used for prediction. When None, the notebook-level
      `loaded_model` is used (it is not defined in the cells shown here).
    maxlen: padded sequence length; must equal the model's input length.

  Returns:
    None. The label and score are printed.
  """
  if model is None:
    model = loaded_model
  # Same character filter as preprocessing() applies to the training text
  cleaned = re.sub("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "", sentence)
  encoded = tokenizer.texts_to_sequences([list(cleaned)])
  padding_sentence = pad_sequences(encoded, maxlen=maxlen, padding='post')
  # predict() returns a (1, 1) array; take the scalar explicitly instead of
  # calling float() on the array, which NumPy has deprecated.
  score = float(np.ravel(model.predict(padding_sentence))[0])
  if score > 0.5:
    print(f"긍정 / Score : {score}")
  else:
    print(f"부정 / Score : {score}")


In [7]:
# Load both rating files, clean them, and tokenize the reviews
train_data, test_data = (
    pd.read_table(path) for path in ('ratings_train.txt', 'ratings_test.txt')
)
X_train, X_test = [], []
okt = Okt()
stopwords = ['의','가','이','은','들','는','좀','잘','걍','과','도','를','으로','자','에','와','한','하다']

# Clean the train split first, then the test split
train_data, test_data = map(preprocessing, (train_data, test_data))

# Tokenize in the same order so the progress output stays train-then-test
X_train, X_test = (
    Token(frame, okt, stopwords) for frame in (train_data, test_data)
)




start token
token : 0/145791
token : 10000/145791
token : 20000/145791
token : 30000/145791
token : 40000/145791
token : 50000/145791
token : 60000/145791
token : 70000/145791
token : 80000/145791
token : 90000/145791
token : 100000/145791
token : 110000/145791
token : 120000/145791
token : 130000/145791
token : 140000/145791
end token
start token
token : 0/48995
token : 10000/48995
token : 20000/48995
token : 30000/48995
token : 40000/48995
end token


In [8]:
# Fit the vocabulary on the training tokens only, then encode both splits
# with that same tokenizer.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

X_train, X_test = (
    Tokenizing(tokens, tokenizer) for tokens in (X_train, X_test)
)

In [9]:
# Build the label arrays from the 'label' column of each split
Y_train, Y_test = (
    np.array(frame['label']) for frame in (train_data, test_data)
)

# Only the training split is passed through rmEmpty; the test split is left as-is
X_train, Y_train = rmEmpty(X_train, Y_train)

In [10]:
# Bring every sequence of both splits to length 100, padding at the end
X_train, X_test = (
    pad_sequences(sequences, maxlen = 100, padding='post')
    for sequences in (X_train, X_test)
)

In [11]:
# Early stopping watches val_loss with a patience of 4 epochs
earlyStop = EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=4,
    verbose=1,
)
# The checkpoint keeps only the weights with the highest val_acc seen so far
modelCP = ModelCheckpoint(
    'best_model.h5',
    monitor='val_acc',
    mode='max',
    save_best_only=True,
    verbose=1,
)


In [3]:
# Hyperparameters of the classifier defined below
embedding_dim = 256
hidden_dim = 512
dropout_rate = 0.6

# Model: embedding -> Conv1D -> BiLSTM with a residual add -> LSTM ->
# global max pooling -> single sigmoid output.
# NOTE(review): `input` shadows the Python builtin of the same name for the
# rest of the notebook.
input = Input(shape=(100,))
# NOTE(review): 2542 is a hard-coded vocabulary size — presumably
# len(tokenizer.word_index) + 1 for the fitted tokenizer; confirm, because a
# larger vocabulary would produce ids outside the embedding table.
x = Embedding(2542, embedding_dim)(input)
x = Dropout(dropout_rate)(x)
x = Conv1D(hidden_dim, 5, padding="same")(x)
x = BatchNormalization()(x)
x = LeakyReLU()(x)
# Keep the convolution output for the residual connection below
x_res = x
# Each direction has hidden_dim/2 units; with Bidirectional's default
# concatenation the output should again have hidden_dim features, which is
# what the addition with x_res needs.
x = Bidirectional(LSTM(int(hidden_dim/2), return_sequences=True))(x)
x= x+x_res
x = LeakyReLU()(x)
x = LSTM(hidden_dim, return_sequences=True)(x)
# Collapse the time axis by taking the maximum over the sequence
x = GlobalMaxPool1D()(x)
x = Dropout(dropout_rate)(x)
output = Dense(1, activation='sigmoid')(x)

model = Model(inputs = [input], outputs=output)

optimizer = Adam(learning_rate=0.001)
loss_function = BinaryCrossentropy()
# The metric is registered as 'acc', which is the name the ModelCheckpoint
# monitor 'val_acc' refers to.
model.compile(optimizer=optimizer, loss=loss_function, metrics=['acc'])
model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 100)]        0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 100, 256)     650752      input_1[0][0]                    
__________________________________________________________________________________________________
dropout (Dropout)               (None, 100, 256)     0           embedding[0][0]                  
__________________________________________________________________________________________________
conv1d (Conv1D)                 (None, 100, 512)     655872      dropout[0][0]                    
_______________________________________________________________________________________

In [30]:
# Train for at most 100 epochs with the early-stopping and checkpoint
# callbacks; 20% of the training data is held out for validation.
# (A batch size of 128 was tried previously.)
history = model.fit(
    X_train,
    Y_train,
    batch_size=256,
    epochs=100,
    validation_split=0.2,
    callbacks=[earlyStop, modelCP],
)

Epoch 1/100
Epoch 00001: val_acc did not improve from 0.86337
Epoch 2/100
Epoch 00002: val_acc did not improve from 0.86337
Epoch 3/100
Epoch 00003: val_acc did not improve from 0.86337
Epoch 4/100
Epoch 00004: val_acc did not improve from 0.86337
Epoch 5/100
Epoch 00005: val_acc did not improve from 0.86337
Epoch 6/100
Epoch 00006: val_acc did not improve from 0.86337
Epoch 7/100
Epoch 00007: val_acc did not improve from 0.86337
Epoch 8/100
Epoch 00008: val_acc did not improve from 0.86337
Epoch 9/100
Epoch 00009: val_acc did not improve from 0.86337
Epoch 10/100
Epoch 00010: val_acc did not improve from 0.86337
Epoch 11/100
Epoch 00011: val_acc did not improve from 0.86337
Epoch 12/100
Epoch 00012: val_acc did not improve from 0.86337
Epoch 13/100
Epoch 00013: val_acc did not improve from 0.86337
Epoch 14/100
Epoch 00014: val_acc did not improve from 0.86337
Epoch 15/100
Epoch 00015: val_acc improved from 0.86337 to 0.86622, saving model to best_model.h5
Epoch 16/100
Epoch 00016: val