In [1]:
# !pip install konlpy

In [2]:
import konlpy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import re
from konlpy.tag import Okt
import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [3]:
# Pin the NumPy and TensorFlow random generators to one shared seed so that
# repeated runs of the notebook start from the same random state.
seed = 2021
tf.random.set_seed(seed)
np.random.seed(seed)

### 파일 업로드
- ratings_train.txt
- ratings_test.txt

In [22]:
# from google.colab import files
# uploaded, uploaded2 = files.upload(), files.upload()
# filename = list(uploaded.keys())[0]
# testfilename = list(uploaded2.keys())[0]

In [5]:
# Paths of the training and test rating files, relative to the notebook.
filename = 'ratings_train.txt'
testfilename = 'ratings_test.txt'

### 데이터 전처리

In [6]:
# Both files are tab-separated; load each one into its own DataFrame.
train_df = pd.read_csv(filename, sep='\t')
test_df = pd.read_csv(testfilename, sep='\t')

# Show (rows, columns) of each frame as the cell output.
train_df.shape, test_df.shape

((150000, 3), (50000, 3))

In [10]:
# Remove duplicated samples: rows repeating an already-seen `document` text
# are dropped, keeping the first occurrence.
train_df = train_df.drop_duplicates(subset=['document'])

In [12]:
# Remove every row that has a null value in any column
# (how='any' is the dropna default, so it is left implicit).
train_df = train_df.dropna()

테스트 데이터에도 적용

In [14]:
# Same cleaning as for the training frame: drop repeated `document` texts,
# then drop any row that contains a null value.
test_df = (
    test_df
    .drop_duplicates(subset=['document'])
    .dropna(how='any')
)

### 한글 텍스트 전처리

In [16]:
# Keep only Hangul (consonant jamo, vowel jamo, complete syllables) and spaces;
# every other character is deleted.
# regex=True must be explicit: the pattern is a character class, and
# pandas >= 2.0 treats the pattern as a literal string by default, which would
# silently leave the text unchanged.
train_df['document'] = train_df['document'].str.replace(
    "[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "", regex=True
)

Unnamed: 0,id,document,label
0,9976970,아 더빙 진짜 짜증나네요 목소리,0
1,3819312,흠포스터보고 초딩영화줄오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0


In [17]:
# Keep only Hangul (consonant jamo, vowel jamo, complete syllables) and spaces;
# every other character is deleted.
# regex=True must be explicit: the pattern is a character class, and
# pandas >= 2.0 treats the pattern as a literal string by default, which would
# silently leave the text unchanged.
test_df['document'] = test_df['document'].str.replace(
    "[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "", regex=True
)

Unnamed: 0,id,document,label
0,6270596,굳 ㅋ,1
1,9274899,,0
2,8544678,뭐야 이 평점들은 나쁘진 않지만 점 짜리는 더더욱 아니잖아,0


In [18]:
# Reviews that became empty strings after character filtering are marked as
# NaN and then removed together with any other row containing a null.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on `train_df.document`: the in-place form mutates
# an intermediate Series and, with pandas Copy-on-Write enabled, never reaches
# the DataFrame, so the empty rows would survive the dropna below.
train_df['document'] = train_df['document'].replace('', np.nan)
train_df = train_df.dropna(how='any')

In [19]:
# Reviews that became empty strings after character filtering are marked as
# NaN and then removed together with any other row containing a null.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on `test_df.document`: the in-place form mutates
# an intermediate Series and, with pandas Copy-on-Write enabled, never reaches
# the DataFrame, so the empty rows would survive the dropna below.
test_df['document'] = test_df['document'].replace('', np.nan)
test_df = test_df.dropna(how='any')

### 한글 토큰화와 불용어 제거

In [21]:
import tqdm.notebook as tn

# Tokens that are filtered out of every review after tokenization.
stopwords=['의','가','이','은','들','는','좀','잘','걍','과','도','를','으로','자','에','와','한','하다']
okt = Okt()

# Build one token list per training review, with a progress bar over the rows.
X_train = []
for sentence in tn.tqdm(train_df['document']):
    morphemes = okt.morphs(sentence, stem=True)  # tokenization (stem=True)
    # Stopword removal
    X_train.append([token for token in morphemes if token not in stopwords])

HBox(children=(FloatProgress(value=0.0, max=145791.0), HTML(value='')))




In [23]:
# Build one token list per test review, using the tokenizer object and the
# stopword list created in the training tokenization cell.
X_test = []
for sentence in tn.tqdm(test_df['document']):
    morphemes = okt.morphs(sentence, stem=True)  # tokenization (stem=True)
    # Stopword removal
    X_test.append([token for token in morphemes if token not in stopwords])

HBox(children=(FloatProgress(value=0.0, max=48995.0), HTML(value='')))




In [25]:
# 정수 인코딩
max_words = 35000
tokenizer = Tokenizer(num_words=max_words) # 상위 35,000개의 단어만 보존
tokenizer.fit_on_texts(X_train)
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

In [29]:
# Give every sample in X_train and X_test the same length of 30.
max_len = 30
X_train, X_test = (
    pad_sequences(sequences, maxlen=max_len)
    for sequences in (X_train, X_test)
)

In [30]:
# Target arrays: the `label` column of each cleaned frame, as NumPy arrays.
y_train = train_df['label'].to_numpy()
y_test = test_df['label'].to_numpy()

### LSTM 모델 정의/설정/학습/평가

#### 기본

In [31]:
from keras.layers import Embedding, Dense, LSTM
from keras.models import Sequential

In [132]:
# Baseline classifier: embedding -> one LSTM layer -> single sigmoid unit.
model = Sequential()
model.add(Embedding(max_words, 100))        # 100-dimensional vector per vocabulary index
model.add(LSTM(128))                        # 128 LSTM units
model.add(Dense(1, activation='sigmoid'))   # single output in [0, 1]
model.summary()

Model: "sequential_26"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_24 (Embedding)     (None, None, 100)         3500000   
_________________________________________________________________
lstm_29 (LSTM)               (None, 128)               117248    
_________________________________________________________________
dense_25 (Dense)             (None, 1)                 129       
Total params: 3,617,377
Trainable params: 3,617,377
Non-trainable params: 0
_________________________________________________________________


In [133]:
# Binary cross-entropy matches the single sigmoid output; accuracy is tracked
# as the reported metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [134]:
from keras.callbacks import ModelCheckpoint,EarlyStopping

# Both callbacks monitor the validation loss.
# The checkpoint overwrites `modelpath` only when val_loss improves.
modelpath = "model/naver-lstm-best-model.hdf5"
checkpointer = ModelCheckpoint(
    filepath=modelpath,
    monitor='val_loss',
    save_best_only=True,
    verbose=1,
)
# Training stops once val_loss has not improved for 4 consecutive epochs.
earlyStopping = EarlyStopping(monitor='val_loss', patience=4, verbose=0)

In [135]:
# Train for at most 10 epochs with 20% of the training data held out for
# validation; checkpointing and early stopping are driven by the callbacks.
history = model.fit(
    X_train,
    y_train,
    batch_size=60,
    epochs=10,
    validation_split=0.2,
    callbacks=[checkpointer, earlyStopping],
    verbose=1,
)

Epoch 1/10

Epoch 00001: val_loss improved from inf to 0.34964, saving model to model/naver-lstm-best-model.hdf5
Epoch 2/10

Epoch 00002: val_loss improved from 0.34964 to 0.34372, saving model to model/naver-lstm-best-model.hdf5
Epoch 3/10

Epoch 00003: val_loss did not improve from 0.34372
Epoch 4/10

Epoch 00004: val_loss did not improve from 0.34372
Epoch 5/10

Epoch 00005: val_loss did not improve from 0.34372
Epoch 6/10

Epoch 00006: val_loss did not improve from 0.34372


In [136]:
# Select the best model: discard the in-memory model (its weights are from the
# last epoch run) and reload the checkpoint written at the lowest val_loss.
from keras.models import load_model

del model
model = load_model(modelpath)

In [137]:
# evaluate() returns [loss, accuracy] for the metrics configured at compile
# time; only the accuracy is printed.
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=2)
print("\n 테스트 정확도: %.4f" % test_accuracy)

1532/1532 - 4s - loss: 0.3517 - accuracy: 0.8467

 테스트 정확도: 0.8467


#### 수정된 모델

In [155]:
# Conv1D, Dropout and MaxPooling1D are used below but are not imported by any
# earlier cell (only Embedding, Dense and LSTM are), so on a fresh kernel this
# cell failed with NameError. Import them here to keep the cell runnable.
from keras.layers import Conv1D, Dropout, MaxPooling1D

# Modified classifier: embedding -> max-pooling -> dropout -> 1D convolution
# -> LSTM -> dropout -> single sigmoid unit.
# NOTE(review): pooling is applied BEFORE the convolution. With sequences
# padded to length 30, pool_size=4 followed by a 'valid' kernel of 5 presumably
# leaves only ~3 time steps for the LSTM, which may explain why the recorded
# test accuracy is lower than the baseline's. The usual order is
# Conv1D -> MaxPooling1D; confirm whether the current order is intentional.
model2 = Sequential([
    Embedding(max_words, 100),
    MaxPooling1D(pool_size=4),
    Dropout(0.25),
    Conv1D(64, 5, padding='valid', activation='relu', strides=1),
    LSTM(128),
    Dropout(0.25),
    Dense(1, activation='sigmoid'),
])
model2.summary()

Model: "sequential_31"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_29 (Embedding)     (None, None, 100)         3500000   
_________________________________________________________________
max_pooling1d_13 (MaxPooling (None, None, 100)         0         
_________________________________________________________________
dropout_28 (Dropout)         (None, None, 100)         0         
_________________________________________________________________
conv1d_14 (Conv1D)           (None, None, 64)          32064     
_________________________________________________________________
lstm_34 (LSTM)               (None, 128)               98816     
_________________________________________________________________
dropout_29 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_30 (Dense)             (None, 1)               

In [156]:
# Same training configuration as the baseline: Adam optimizer, binary
# cross-entropy loss, accuracy as the reported metric.
model2.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [157]:
from keras.callbacks import EarlyStopping, ModelCheckpoint

# Fresh callbacks for the modified model, again monitoring validation loss.
# `modelpath` is rebound so the best weights go to a separate file.
modelpath = "model/naver-best-model.hdf5"
checkpointer = ModelCheckpoint(
    filepath=modelpath,
    monitor='val_loss',
    save_best_only=True,
    verbose=1,
)
# Training stops once val_loss has not improved for 4 consecutive epochs.
earlyStopping = EarlyStopping(monitor='val_loss', patience=4, verbose=0)

In [158]:
# Train for at most 30 epochs with 20% of the training data held out for
# validation; checkpointing and early stopping are driven by the callbacks.
history2 = model2.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=30,
    validation_split=0.2,
    callbacks=[checkpointer, earlyStopping],
    verbose=1,
)

Epoch 1/30

Epoch 00001: val_loss improved from inf to 0.44511, saving model to model/naver-best-model.hdf5
Epoch 2/30

Epoch 00002: val_loss improved from 0.44511 to 0.44369, saving model to model/naver-best-model.hdf5
Epoch 3/30

Epoch 00003: val_loss did not improve from 0.44369
Epoch 4/30

Epoch 00004: val_loss did not improve from 0.44369
Epoch 5/30

Epoch 00005: val_loss did not improve from 0.44369
Epoch 6/30

Epoch 00006: val_loss did not improve from 0.44369


In [159]:
# Select the best model: discard the in-memory model (its weights are from the
# last epoch run) and reload the checkpoint written at the lowest val_loss.
from keras.models import load_model

del model2
model2 = load_model(modelpath)

In [160]:
# evaluate() returns [loss, accuracy] for the metrics configured at compile
# time; only the accuracy is printed.
test_loss2, test_accuracy2 = model2.evaluate(X_test, y_test, verbose=2)
print("\n 테스트 정확도: %.4f" % test_accuracy2)

1532/1532 - 3s - loss: 0.4509 - accuracy: 0.7752

 테스트 정확도: 0.7752
