# Sentiment Analysis in Korean (using Naver Sentiment Movie Corpus)
- Sentiment Analysis with RNN (GRU)
- Dataset source: https://github.com/e9t/nsmc
- To install and use the `Mecab` tagger on Windows, refer to https://groups.google.com/forum/#!topic/konlpy/SuMc8EkCT_M

In [112]:
import os

import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from konlpy.tag import Mecab
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from keras.preprocessing.sequence import pad_sequences
from keras.models import *
from keras.layers import *

### 1. Import & process dataset

In [113]:
# Read the tab-separated NSMC ratings file and preview the first rows.
df = pd.read_csv("ratings.txt", sep="\t")
df.head()

Unnamed: 0,id,document,label
0,8112052,어릴때보고 지금다시봐도 재밌어요ㅋㅋ,1
1,8132799,"디자인을 배우는 학생으로, 외국디자이너와 그들이 일군 전통을 통해 발전해가는 문화산...",1
2,4655635,폴리스스토리 시리즈는 1부터 뉴까지 버릴께 하나도 없음.. 최고.,1
3,9251303,와.. 연기가 진짜 개쩔구나.. 지루할거라고 생각했는데 몰입해서 봤다.. 그래 이런...,1
4,10067386,안개 자욱한 밤하늘에 떠 있는 초승달 같은 영화.,1


In [114]:
# Location of the mecab-ko-dic dictionary. The original Windows path stays the
# default; set the MECAB_DIC_PATH environment variable to use another location
# without editing the notebook.
MECAB_DIC_PATH = os.environ.get("MECAB_DIC_PATH", "C:\\mecab\\mecab-ko-dic")
mecab = Mecab(MECAB_DIC_PATH)

In [115]:
# Sanity check: split the first review into morphemes.
first_review = df.loc[0, "document"]
mecab.morphs(first_review)

['어릴', '때', '보', '고', '지금', '다시', '봐도', '재밌', '어요', 'ㅋㅋ']

In [116]:
%%time
# Tokenize every review with Mecab and collect corpus statistics.
#   reviews       : one list of tokens per successfully tokenized review
#   labels        : label of each kept review (parallel to `reviews`)
#   all_tokens    : every token of the corpus, in order
#   unique_tokens : token -> number of occurrences
#   skipped_rows  : index labels of the rows that could not be tokenized
reviews = []
labels = []
all_tokens = []
unique_tokens = dict()
skipped_rows = []

for row, document, label in zip(df.index, df["document"], df["label"]):
    # Rows without review text are presumably read as NaN (a float) rather
    # than str -- TODO confirm. Skip them explicitly instead of relying on the
    # tokenizer to raise.
    if not isinstance(document, str):
        skipped_rows.append(row)
        continue

    try:
        tokens = mecab.morphs(document)
    except Exception:
        # Best effort: a review the tagger rejects is dropped. Unlike a bare
        # `except:`, this does not swallow KeyboardInterrupt / SystemExit, so
        # the loop can still be interrupted from the notebook.
        skipped_rows.append(row)
        continue

    reviews.append(tokens)
    labels.append(label)

    all_tokens.extend(tokens)
    for t in tokens:
        unique_tokens[t] = unique_tokens.get(t, 0) + 1

print("Number of Reviews: ", len(reviews), len(labels))
print("Number of tokens: ", len(all_tokens))
print("Number of unique tokens: ", len(unique_tokens))

Number of Reviews:  199992 199992
Number of tokens:  3669567
Number of unique tokens:  61039
Wall time: 24.4 s


In [117]:
def create_dictionary(unique_tokens, threshold, start_index=0):
    """Build token <-> index lookup tables for sufficiently frequent tokens.

    Parameters
    ----------
    unique_tokens : dict
        Maps each token to its number of occurrences.
    threshold : int
        Only tokens whose count is strictly greater than this value are kept.
    start_index : int, optional
        Index given to the first kept token. The default 0 reproduces the
        original behaviour. Pass 1 to keep index 0 free (e.g. for a padding
        value); an embedding layer then needs
        ``start_index + len(token_to_idx)`` rows.

    Returns
    -------
    tuple of (dict, dict)
        ``token_to_idx`` and its inverse ``idx_to_token``. Indices are
        consecutive and follow the iteration order of ``unique_tokens``.
    """
    frequent_tokens = [tok for tok, count in unique_tokens.items() if count > threshold]
    token_to_idx = {tok: idx for idx, tok in enumerate(frequent_tokens, start_index)}
    idx_to_token = {idx: tok for tok, idx in token_to_idx.items()}
    return token_to_idx, idx_to_token

In [118]:
# Keep only the tokens that occur more than 100 times in the corpus.
token_to_idx, idx_to_token = create_dictionary(unique_tokens, threshold=100)

print(len(token_to_idx), len(idx_to_token))

2437 2437


In [119]:
%%time
# Encode every review as a list of vocabulary indices; tokens that are not in
# token_to_idx are dropped.
# NOTE: `reviews` changes from token strings to integers here, so this cell is
# not idempotent -- re-run the tokenization cell before running it again.
reviews = [
    [token_to_idx[token] for token in review if token in token_to_idx]
    for review in reviews
]

Wall time: 2.12 s


In [120]:
# Count the reviews that still contain at least one in-vocabulary token.
non_empty_count = sum(1 for encoded in reviews if len(encoded) != 0)

print("Number of non-empty reviews: ", non_empty_count)

Number of non-empty reviews:  198701


In [121]:
# Drop the reviews that became empty after encoding, together with their
# labels, so that `reviews` and `labels` stay aligned.
non_empty_pairs = [
    (review, label) for review, label in zip(reviews, labels) if len(review) != 0
]
reviews = [review for review, _ in non_empty_pairs]
labels = [label for _, label in non_empty_pairs]

print(len(reviews), len(labels))

198701 198701


In [129]:
# Bring every encoded review to a fixed length of 30 indices (pad_sequences
# pads short reviews and truncates long ones) and turn the labels into an array.
# NOTE(review): pad_sequences pads with 0 by default (per the Keras docs --
# confirm for the installed version), while create_dictionary numbers tokens
# from 0, so padding looks identical to the token with index 0.
MAX_REVIEW_LEN = 30
X_data = pad_sequences(reviews, maxlen=MAX_REVIEW_LEN)
y_data = np.asarray(labels)

print(len(X_data), len(y_data))

198701 198701


### 2. Create model
- Model with two GRU layers (50 units each), followed by a 50-unit ReLU dense layer and a sigmoid output unit

In [130]:
# Hold out 20% of the reviews for testing. A fixed random_state makes the
# split -- and therefore the reported test accuracy -- reproducible across
# kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    np.asarray(X_data),
    np.asarray(y_data),
    test_size=0.2,
    random_state=42,
)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(158960, 30) (39741, 30) (158960,) (39741,)


In [131]:
def simple_rnn_model(gpu=False, vocab_size=None, input_length=None):
    """Build and compile a two-layer GRU binary classifier.

    Architecture: Embedding (50 dimensions) -> GRU(50, returning sequences)
    -> GRU(50) -> Dense(50, relu) -> Dense(1, sigmoid). The model is compiled
    with binary cross-entropy, the Adam optimizer and the "acc" metric.

    Parameters
    ----------
    gpu : bool
        If True the recurrent layers are CuDNNGRU, otherwise GRU.
    vocab_size : int, optional
        Number of rows of the embedding matrix. Defaults to the largest index
        in the notebook-level ``token_to_idx`` plus one, which equals
        ``len(token_to_idx)`` when indices run from 0 without gaps and stays
        correct if the indices start at 1.
    input_length : int, optional
        Length of the input sequences. Defaults to ``X_train.shape[1]``.

    Returns
    -------
    The compiled ``Sequential`` model.
    """
    # Fall back to the notebook globals only when the caller gives no sizes,
    # so existing calls such as simple_rnn_model(True) behave as before.
    if vocab_size is None:
        vocab_size = max(token_to_idx.values()) + 1
    if input_length is None:
        input_length = X_train.shape[1]

    # Only the selected class name is evaluated, so CuDNNGRU is not touched
    # when gpu is False.
    recurrent_layer = CuDNNGRU if gpu else GRU

    model = Sequential()
    model.add(Embedding(vocab_size, 50, input_length=input_length))
    model.add(recurrent_layer(50, return_sequences=True))
    model.add(recurrent_layer(50))
    model.add(Dense(50, activation="relu"))
    model.add(Dense(1, activation="sigmoid"))
    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["acc"])
    return model

In [134]:
# gpu=True selects the CuDNNGRU layers; if you do not have a GPU, pass
# gpu=False instead.
model = simple_rnn_model(gpu=True)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 30, 50)            121850    
_________________________________________________________________
cu_dnngru_23 (CuDNNGRU)      (None, 30, 50)            15300     
_________________________________________________________________
cu_dnngru_24 (CuDNNGRU)      (None, 50)                15300     
_________________________________________________________________
dense_37 (Dense)             (None, 50)                2550      
_________________________________________________________________
dense_38 (Dense)             (None, 1)                 51        
Total params: 155,051
Trainable params: 155,051
Non-trainable params: 0
_________________________________________________________________


### 3. Model Training & Evaluation

In [135]:
# Train for 10 epochs in batches of 1000, with validation_split=0.1 of the
# training data used for validation.
hist = model.fit(
    X_train,
    y_train,
    batch_size=1000,
    epochs=10,
    validation_split=0.1,
)

Train on 143064 samples, validate on 15896 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [137]:
# The model was compiled with the single metric "acc", so element 1 of the
# evaluate() result is the accuracy (element 0 is presumably the loss).
test_metrics = model.evaluate(X_test, y_test)
print("Test Accuracy: ", test_metrics[1])

