# 네이버 영화 리뷰 감성 분류하기

모델은 고정해두고   
(1) 데이터 양을 다르게 하고 (절반/전체)   
(2) 전처리를 다르게 해서 (불용어 처리 유무, oov를 전체/일부/최소)  
전처리 조건을 바꿔 가며 결과(정확도)가 어떻게 달라지는지 분석한다.  

### 1. 네이버 영화 리뷰 데이터에 대한 이해와 전처리

In [1]:
!pip install konlpy

Defaulting to user installation because normal site-packages is not writeable


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import re # 정규 표현식 관련 모델
from konlpy.tag import Okt #Open Korean Text 형태소 분리 모듈
from tqdm import tqdm # 진행 상황을 보여주는 모듈

from tensorflow.keras.layers import Embedding, Dense, LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [3]:
# Pick a Korean-capable font so Hangul text in matplotlib figures renders.
import os
import platform

from matplotlib import font_manager

_system_name = platform.system()
if _system_name == 'Windows':
    path = r'c:\Windows\Fonts\gulim.ttf'
elif _system_name == 'Darwin':  # macOS
    path = r'/System/Library/Fonts/AppleGothic'
else:
    # NOTE(review): Nanum fonts are usually installed under
    # .../truetype/nanum/ - confirm this directory name.
    path = r'/usr/share/fonts/truetype/name/NanumMyeongjo.ttf'

# The path used to be computed and then never applied. Register the font when
# the file is present; otherwise matplotlib's defaults stay untouched.
if os.path.exists(path):
    font_manager.fontManager.addfont(path)
    plt.rc('font', family=font_manager.FontProperties(fname=path).get_name())
    plt.rc('axes', unicode_minus=False)

##### [1] 데이터 로드하기

In [4]:
# Load the tab-separated review file into a DataFrame.
data_path = '/content/drive/MyDrive/data.txt'
data = pd.read_table(data_path)

FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/data.txt'

In [None]:
data.head(20)

##### [2] 데이터 정제하기

[2 - 1] 정규식을 통한 한글과 공백을 제외한 문자 제거

In [None]:
# Keep only Hangul (jamo and syllables) and spaces.
# `regex=True` is required: since pandas 2.0 `Series.str.replace` treats the
# pattern as a literal string by default, which would leave the text unchanged.
data['document'] = data['document'].str.replace(r"[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "", regex=True)

[2 - 2] 중복 유무 확인

In [None]:
# Compare the row count with the number of distinct review texts.
n_rows = len(data)
n_unique_documents = data['document'].nunique()
print('전체 데이터 :', n_rows)
print('고유값 :', n_unique_documents)

전체 데이터에서 고유값과 차이가 난다는 것은 그만큼의 중복값이 있다는 것이니 제거한다

In [None]:
# Remove rows whose review text repeats, keeping the first occurrence.
data.drop_duplicates(subset='document', keep='first', inplace=True)

[2 - 3] 결측치 확인

In [None]:
data.isna().sum()

In [None]:
# Show the rows whose review text is missing.
missing_document = data['document'].isnull()
data.loc[missing_document]

In [None]:
# Drop every row that has a missing value in any column.
data = data.dropna(axis=0, how='any')

In [None]:
# Plot the class balance.
# `value_counts()` orders its result by frequency, not by label, so the counts
# are re-ordered explicitly to line up with the fixed tick labels '1' and '0'.
# Assumes the labels are the integers 1 and 0 - TODO confirm against the data.
label_counts = data['label'].value_counts().reindex([1, 0], fill_value=0)

plt.figure(figsize=(8,6))
plt.bar(['1', '0'], label_counts.values, color = ['steelblue', 'tomato'])
plt.show()

print(data.groupby('label').size().reset_index(name = 'count'))

긍정(1)과 부정(0) 레이블의 샘플 수가 거의 같음 (클래스가 균형을 이루고 있음)

##### [3] 토큰화

[3 - 1] 불용어 제거

https://www.ranks.nl/stopwords/korean

참고) 기본적인 한국어 불용어 리스트 100개
- https://bab2min.tistory.com/544
    * 단, 이 리스트는 구어나 인터넷의 가벼운 글들을 반영하지 않아서 블로그/SNS 등에는 적합하지 않음

**불용어 지정 - 분기 (1)**

In [None]:
# Load the whitespace-separated stop-word list.
# `read().split()` consumes every line and drops the trailing newline, so the
# last word is no longer stored as "word\n" and later lines are not ignored.
with open("/content/drive/MyDrive/stopwords.txt", "r", encoding = "utf-8") as f:
    stopwords = f.read().split()

In [None]:
print(len(stopwords))
print(stopwords[:20])

불용어가 889개나 되는데, 조금 지나치게 많은 것 아닌가?

In [None]:
# Short hand-written stop-word list; used for the X_train_01 variant.
stopwords_01 = ['의','가','이','은','들','는','좀','잘','걍','과','도','를','으로','자','에','와','한','하다']

In [None]:
# Okt (Open Korean Text) morphological analyzer used for tokenisation.
okt = Okt()

In [None]:
# Tokenise every review once with Okt and derive all three variants from that
# single pass; the morphological analysis used to be run three times over the
# corpus. Sets make each stop-word membership test O(1) instead of a list scan.
stopword_set_small = set(stopwords_01)
stopword_set_large = set(stopwords)

X_train_00 = []  # morphemes only, no stop-word removal
X_train_01 = []  # morphemes with the short stop-word list removed
X_train_02 = []  # morphemes with the long stop-word list removed

for sentence in tqdm(data['document']):
    tokenized_sentence = okt.morphs(sentence, stem=True)
    X_train_00.append(tokenized_sentence)
    X_train_01.append([word for word in tokenized_sentence if word not in stopword_set_small])
    X_train_02.append([word for word in tokenized_sentence if word not in stopword_set_large])

##### [4] 정수 인코딩

In [None]:
# One unrestricted tokenizer per stop-word variant, each fitted on its corpus.
tokenizer_0, tokenizer_1, tokenizer_2 = Tokenizer(), Tokenizer(), Tokenizer()

for base_tokenizer, corpus in (
    (tokenizer_0, X_train_00),
    (tokenizer_1, X_train_01),
    (tokenizer_2, X_train_02),
):
    base_tokenizer.fit_on_texts(corpus)

In [None]:
# Report the vocabulary size learned by each tokenizer.
size_no_removal = len(tokenizer_0.word_index)
size_small_removal = len(tokenizer_1.word_index)
size_large_removal = len(tokenizer_2.word_index)

print('불용어를 제거하지 않는 경우  :', size_no_removal)
print('불용어를 일부만 제거한 경우  :', size_small_removal)
print('불용어를 많이 제거한 경우    :', size_large_removal)

In [None]:
def checkToken(tokenizer, threshold) :
    """Print rare-word statistics for a fitted tokenizer and return their share.

    A word counts as "rare" when its entry in ``tokenizer.word_counts`` is
    strictly below ``threshold``.

    Args:
        tokenizer: Object exposing ``word_index`` and ``word_counts`` mappings.
        threshold: Frequency cut-off; counts below it are treated as rare.

    Returns:
        Percentage (0-100) of all word occurrences that belong to rare words,
        or ``0.0`` when the tokenizer holds no words.
    """
    total_cnt = len(tokenizer.word_index)  # vocabulary size
    counts = list(tokenizer.word_counts.values())
    rare_counts = [count for count in counts if count < threshold]

    rare_cnt = len(rare_counts)    # number of distinct rare words
    total_freq = sum(counts)       # occurrences of all words
    rare_freq = sum(rare_counts)   # occurrences of rare words only

    # Guard both ratios so an empty tokenizer reports 0 instead of raising
    # ZeroDivisionError.
    rare_word_ratio = (rare_cnt / total_cnt) * 100 if total_cnt else 0.0
    rare_freq_ratio = (rare_freq / total_freq) * 100 if total_freq else 0.0

    print('단어 집합(vocabulary)의 크기 :',total_cnt)
    print('등장 빈도가 %s번 미만인 희귀 단어의 수: %s'%(threshold, rare_cnt))
    print("단어 집합에서 희귀 단어의 비율:", rare_word_ratio)
    print("전체 등장 빈도에서 희귀 단어 등장 빈도 비율:", rare_freq_ratio)
    return rare_freq_ratio

**분기점 2 - 희귀 단어 수 설정**

In [None]:
# Measure the rare-word share of every tokenizer at thresholds 30 and 3.
separator = '------------------------------'

num_pro_0_30 = checkToken(tokenizer_0, 30)
print(separator)
num_pro_0_3 = checkToken(tokenizer_0, 3)
print(separator)
num_pro_1_30 = checkToken(tokenizer_1, 30)
print(separator)
num_pro_1_3 = checkToken(tokenizer_1, 3)
print(separator)
num_pro_2_30 = checkToken(tokenizer_2, 30)
print(separator)
num_pro_2_3 = checkToken(tokenizer_2, 3)

# Group the ratios per threshold, in tokenizer order 0, 1, 2.
list_pro_30 = [num_pro_0_30, num_pro_1_30, num_pro_2_30]
list_pro_3 = [num_pro_0_3, num_pro_1_3, num_pro_2_3]

In [None]:
# Three bar charts of the rare-word share: threshold 30, threshold 3, and the
# threshold-3 values again on a narrower y-range.
stopword_labels = ['Stopword X', 'Stopword S', 'Stopword L']
panels = (
    (131, list_pro_30, None, (0, 13)),
    (132, list_pro_3, 'tomato', (0, 13)),
    (133, list_pro_3, 'tomato', (1, 2.5)),
)

plt.figure(figsize=(24,10))
for position, heights, bar_color, y_range in panels:
    plt.subplot(position)
    if bar_color is None:
        plt.bar(stopword_labels, heights)
    else:
        plt.bar(stopword_labels, heights, color = bar_color)
    plt.ylim(*y_range)
plt.show()

In [None]:
# Derive the vocabulary sizes from the fitted tokenizers instead of pasting
# numbers from an earlier run.
# The pasted version reused tokenizer 0's rare-word counts for tokenizer 1 and
# tokenizer 1's for tokenizer 2, and paired the `_03` names with the larger
# count (which, going by the order of the checkToken calls, appears to be the
# threshold-30 figure).
def _rare_word_count(tokenizer, threshold):
    """Return how many words occur fewer than ``threshold`` times."""
    return sum(1 for count in tokenizer.word_counts.values() if count < threshold)


total_cnt_0 = len(tokenizer_0.word_index)
total_cnt_1 = len(tokenizer_1.word_index)
total_cnt_2 = len(tokenizer_2.word_index)

# Naming: vocab_size_<variant>_<threshold>; `_00` keeps every word.
# The "+ 1" accounts for index 0, which Keras word indices never use.
vocab_size_0_00 = total_cnt_0 + 1
vocab_size_0_03 = total_cnt_0 - _rare_word_count(tokenizer_0, 3) + 1
vocab_size_0_30 = total_cnt_0 - _rare_word_count(tokenizer_0, 30) + 1

vocab_size_1_00 = total_cnt_1 + 1
vocab_size_1_03 = total_cnt_1 - _rare_word_count(tokenizer_1, 3) + 1
vocab_size_1_30 = total_cnt_1 - _rare_word_count(tokenizer_1, 30) + 1

vocab_size_2_00 = total_cnt_2 + 1
vocab_size_2_03 = total_cnt_2 - _rare_word_count(tokenizer_2, 3) + 1
vocab_size_2_30 = total_cnt_2 - _rare_word_count(tokenizer_2, 30) + 1

for vocab_size in (
    vocab_size_0_00, vocab_size_0_03, vocab_size_0_30,
    vocab_size_1_00, vocab_size_1_03, vocab_size_1_30,
    vocab_size_2_00, vocab_size_2_03, vocab_size_2_30,
):
    print(vocab_size)

텍스트 시퀀스를 정수 시퀀스로 변환

In [None]:
# Build one size-capped tokenizer per (stop-word variant, rarity threshold)
# pair, fit it on the matching corpus only, and encode that corpus with it.
# Fixes: the variant-2 tokenizers used to overwrite tokenizer_1_*, so
# tokenizer_2_* never existed and tokenizer_1_* were fitted on both corpora;
# the None returned by fit_on_texts was also assigned to the X_train_* names.
def _fit_and_encode(texts, vocab_size):
    """Fit a Tokenizer capped at ``vocab_size`` on ``texts`` and encode them.

    Returns:
        A ``(tokenizer, sequences)`` pair.
    """
    tokenizer = Tokenizer(num_words=vocab_size)
    tokenizer.fit_on_texts(texts)
    return tokenizer, tokenizer.texts_to_sequences(texts)


tokenizer_0_0, X_train_0_00 = _fit_and_encode(X_train_00, vocab_size_0_00)
tokenizer_0_3, X_train_0_03 = _fit_and_encode(X_train_00, vocab_size_0_03)
tokenizer_0_30, X_train_0_30 = _fit_and_encode(X_train_00, vocab_size_0_30)

tokenizer_1_0, X_train_1_00 = _fit_and_encode(X_train_01, vocab_size_1_00)
tokenizer_1_3, X_train_1_03 = _fit_and_encode(X_train_01, vocab_size_1_03)
tokenizer_1_30, X_train_1_30 = _fit_and_encode(X_train_01, vocab_size_1_30)

tokenizer_2_0, X_train_2_00 = _fit_and_encode(X_train_02, vocab_size_2_00)
tokenizer_2_3, X_train_2_03 = _fit_and_encode(X_train_02, vocab_size_2_03)
tokenizer_2_30, X_train_2_30 = _fit_and_encode(X_train_02, vocab_size_2_30)

In [None]:
# Copy the 'label' column into a NumPy array to use as the training target.
y_train = np.array(data['label'])

In [None]:
len(y_train)

[5] 빈 샘플 제거

In [None]:
# For every encoded variant, collect the positions of samples that contain
# no tokens at all.
def _empty_positions(sequences):
    """Return the positions of zero-length sequences."""
    return [position for position, encoded in enumerate(sequences) if len(encoded) == 0]


drop_train_0_00 = _empty_positions(X_train_0_00)
drop_train_0_03 = _empty_positions(X_train_0_03)
drop_train_0_30 = _empty_positions(X_train_0_30)

drop_train_1_00 = _empty_positions(X_train_1_00)
drop_train_1_03 = _empty_positions(X_train_1_03)
drop_train_1_30 = _empty_positions(X_train_1_30)

drop_train_2_00 = _empty_positions(X_train_2_00)
drop_train_2_03 = _empty_positions(X_train_2_03)
drop_train_2_30 = _empty_positions(X_train_2_30)

In [None]:
# Remove the empty samples.
# The encoded reviews are ragged lists of different lengths. `np.delete` first
# converts its input to an array, which NumPy >= 1.24 rejects for ragged
# nested sequences, so the filtering is done in plain Python and the data
# stays a list of lists.
def _without_positions(sequences, positions):
    """Return ``sequences`` as a list with the given positions left out."""
    skipped = set(positions)
    return [encoded for position, encoded in enumerate(sequences) if position not in skipped]


X_train_0_00 = _without_positions(X_train_0_00, drop_train_0_00)
X_train_0_03 = _without_positions(X_train_0_03, drop_train_0_03)
X_train_0_30 = _without_positions(X_train_0_30, drop_train_0_30)
X_train_1_00 = _without_positions(X_train_1_00, drop_train_1_00)
X_train_1_03 = _without_positions(X_train_1_03, drop_train_1_03)
X_train_1_30 = _without_positions(X_train_1_30, drop_train_1_30)
X_train_2_00 = _without_positions(X_train_2_00, drop_train_2_00)
X_train_2_03 = _without_positions(X_train_2_03, drop_train_2_03)
X_train_2_30 = _without_positions(X_train_2_30, drop_train_2_30)

print(len(X_train_0_00))
print(len(X_train_0_03))
print(len(X_train_0_30))
print(len(X_train_1_00))
print(len(X_train_1_03))
print(len(X_train_1_30))
print(len(X_train_2_00))
print(len(X_train_2_03))
print(len(X_train_2_30))

In [None]:
# Drop the labels at the same positions as the removed samples so that every
# X / y pair has matching length.
def _labels_without(positions):
    """Return a copy of ``y_train`` without the given positions."""
    return np.delete(y_train, positions, axis=0)


y_train_0_00 = _labels_without(drop_train_0_00)
y_train_0_03 = _labels_without(drop_train_0_03)
y_train_0_30 = _labels_without(drop_train_0_30)
y_train_1_00 = _labels_without(drop_train_1_00)
y_train_1_03 = _labels_without(drop_train_1_03)
y_train_1_30 = _labels_without(drop_train_1_30)
y_train_2_00 = _labels_without(drop_train_2_00)
y_train_2_03 = _labels_without(drop_train_2_03)
y_train_2_30 = _labels_without(drop_train_2_30)

for filtered_labels in (
    y_train_0_00, y_train_0_03, y_train_0_30,
    y_train_1_00, y_train_1_03, y_train_1_30,
    y_train_2_00, y_train_2_03, y_train_2_30,
):
    print(len(filtered_labels))

In [None]:
print(len(y_train))

[6] 패딩

In [None]:
# 데이터의 길이 확인
def checkLen(X_data) :
    """Print the max, mean and median sample length and return all lengths.

    Args:
        X_data: Indexable collection of sized samples.

    Returns:
        List with the length of every sample, in input order.
    """
    list_len = [len(X_data[position]) for position in range(len(X_data))]

    longest = max(list_len)
    average = np.mean(list_len)
    middle = np.median(list_len)

    print('리뷰의 최대 길이 :', longest)
    print('리뷰의 평균 길이 :', average)
    print('리뷰의 중간 값 길이 :', middle)
    return list_len

In [None]:
list_len_0_00 = checkLen(X_train_0_00)

In [None]:
list_len_2_30 = checkLen(X_train_2_30)

In [None]:
# 2x2 grid: one row per dataset variant, histogram on the left and box plot
# on the right.
plt.figure(figsize=(24,10))
for row, sample_lengths in enumerate((list_len_0_00, list_len_2_30)):
    plt.subplot(2, 2, 2 * row + 1)
    plt.hist(sample_lengths)
    plt.xlabel('length of samples')
    plt.ylabel('number of samples')

    plt.subplot(2, 2, 2 * row + 2)
    plt.boxplot(sample_lengths)
plt.show()

In [None]:
def below_threshold_len(max_len, nested_list):
    """Print the percentage of samples whose length is at most ``max_len``.

    Args:
        max_len: Inclusive length limit.
        nested_list: Sized collection of sized samples.
    """
    total = len(nested_list)
    count = sum(1 for sentence in nested_list if len(sentence) <= max_len)
    # An empty collection reports 0 instead of raising ZeroDivisionError.
    ratio = (count / total) * 100 if total else 0.0
    # Only the summary line is printed; passing `nested_list` itself to print()
    # dumped the entire dataset to the output.
    print('전체 샘플 중 길이가 %s 이하인 샘플의 비율: %s'%(max_len, ratio))

In [None]:
below_threshold_len(40, X_train_0_00)

In [None]:
below_threshold_len(40, X_train_2_30)

In [None]:
# Bring every encoded variant to the same fixed length.
max_len = 40


def _pad_to_max_len(sequences):
    """Pad/truncate ``sequences`` to ``max_len`` with pad_sequences defaults."""
    return pad_sequences(sequences, maxlen = max_len)


X_data_0_00 = _pad_to_max_len(X_train_0_00)
X_data_0_03 = _pad_to_max_len(X_train_0_03)
X_data_0_30 = _pad_to_max_len(X_train_0_30)
X_data_1_00 = _pad_to_max_len(X_train_1_00)
X_data_1_03 = _pad_to_max_len(X_train_1_03)
X_data_1_30 = _pad_to_max_len(X_train_1_30)
X_data_2_00 = _pad_to_max_len(X_train_2_00)
X_data_2_03 = _pad_to_max_len(X_train_2_03)
X_data_2_30 = _pad_to_max_len(X_train_2_30)

In [None]:
# Hyper-parameters shared by every model built below.
embedding_dim = 100  # second argument of the Embedding layer
hidden_units = 128  # argument of the LSTM layer

# Early-stopping callback watching val_loss (lower is better) with patience 4.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=4)

In [None]:
def checkModel(X_train, y_train, vocab_size, name):
    """Train the fixed Embedding -> LSTM -> sigmoid model on one data variant.

    Args:
        X_train: Padded integer sequences.
        y_train: Binary labels aligned with ``X_train``.
        vocab_size: Input dimension of the Embedding layer.
        name: Base name of the checkpoint file; ``.h5`` is appended.

    Returns:
        The History object returned by ``model.fit``.
    """
    model = Sequential()
    model.add(Embedding(vocab_size, embedding_dim))
    model.add(LSTM(hidden_units))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

    # Bug fix: the checkpoint used to be written to the literal path '.h5', so
    # `name` was ignored and every run overwrote the same file.
    checkpoint_path = f'{name}.h5'
    mc = ModelCheckpoint(checkpoint_path, monitor='val_acc', mode='max', verbose=1, save_best_only=True)

    # The last 20% of the data is held out for validation by Keras.
    history = model.fit(X_train, y_train, epochs=15, callbacks=[es, mc], batch_size=64, validation_split=0.2)
    return history

In [None]:
# Train one model per dataset variant and keep each training history.
# Naming: <stop-word variant 0/1/2>_<rare-word setting 00/03/30>.
history_0_00 = checkModel(X_data_0_00, y_train_0_00, vocab_size_0_00, 'X_data_0_00')
history_0_03 = checkModel(X_data_0_03, y_train_0_03, vocab_size_0_03, 'X_data_0_03')
history_0_30 = checkModel(X_data_0_30, y_train_0_30, vocab_size_0_30, 'X_data_0_30')

history_1_00 = checkModel(X_data_1_00, y_train_1_00, vocab_size_1_00, 'X_data_1_00')
history_1_03 = checkModel(X_data_1_03, y_train_1_03, vocab_size_1_03, 'X_data_1_03')
history_1_30 = checkModel(X_data_1_30, y_train_1_30, vocab_size_1_30, 'X_data_1_30')

history_2_00 = checkModel(X_data_2_00, y_train_2_00, vocab_size_2_00, 'X_data_2_00')
history_2_03 = checkModel(X_data_2_03, y_train_2_03, vocab_size_2_03, 'X_data_2_03')
history_2_30 = checkModel(X_data_2_30, y_train_2_30, vocab_size_2_30, 'X_data_2_30')

In [None]:
# One column per run after the transpose: the nine validation-accuracy curves
# first, then the nine training-accuracy curves; each row is an epoch.
all_histories = [
    history_0_00, history_0_03, history_0_30,
    history_1_00, history_1_03, history_1_30,
    history_2_00, history_2_03, history_2_30,
]
val_acc_curves = [run.history['val_acc'] for run in all_histories]
acc_curves = [run.history['acc'] for run in all_histories]

df_history = pd.DataFrame(val_acc_curves + acc_curves).T

In [None]:
# Bar labels for the nine runs, in the same order as the history columns.
list_index = ['X_0_00', 'X_0_03', 'X_0_30',
              'X_1_00', 'X_1_03', 'X_1_30',
              'X_2_00', 'X_2_03', 'X_2_30']

In [None]:
# Columns 0-8 hold the validation accuracies, columns 9 onward the training
# accuracies (label-based slices, so the end label 8 is included).
val_acc_columns = slice(None, 8)
acc_columns = slice(9, None)

list_val_acc = df_history.loc[:, val_acc_columns]
list_acc = df_history.loc[:, acc_columns]

In [None]:
def checkMax(data):
    """Return the maximum of every column of ``data`` as a list.

    NaN cells are skipped. The builtin ``max`` used previously returned NaN
    whenever a column started with NaN, because every comparison against NaN
    is False.

    Args:
        data: pandas DataFrame with numeric columns.

    Returns:
        List of per-column maxima, in column order.
    """
    return data.max(axis=0, skipna=True).tolist()

In [None]:
list_v_a = checkMax(list_val_acc)

In [None]:
list_a = checkMax(list_acc)

In [None]:
# Side-by-side horizontal bar charts of the best training accuracy and the
# best validation accuracy of each run.
accuracy_panels = (
    (121, list_a, (0.83, 1), 'Acc'),
    (122, list_v_a, (0.83, 0.93), 'Val_Acc'),
)

plt.figure(figsize=(24,8))
for position, best_values, x_range, panel_title in accuracy_panels:
    plt.subplot(position)
    plt.barh(list_index, best_values)
    plt.xlim(*x_range)
    plt.title(panel_title)

plt.show()

**알 수 있는 사실**  
(1) 불용어 제거 : 지나치면 안 하느니만 못하다.  
(2) 낮은 빈도 수 단어 제거 : 적당히 하면 과적합을 줄일 수 있다.

### 예측해보기

In [None]:
# Rebuild the same architecture for the 1_03 variant and train it for the
# prediction demo (early stopping only, no checkpoint).
model = Sequential([
    Embedding(vocab_size_1_03, embedding_dim),
    LSTM(hidden_units),
    Dense(1, activation='sigmoid'),
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
history = model.fit(
    X_data_1_03,
    y_train_1_03,
    epochs=15,
    callbacks=[es],
    batch_size=64,
    validation_split=0.2,
)

In [None]:
def sentiment_predict(new_sentence):
    """Print the predicted sentiment of a raw Korean review sentence.

    The sentence goes through the same steps as the training data of the
    final model (variant 1_03): Hangul-only filtering, Okt stemming, the short
    stop-word list ``stopwords_01``, ``tokenizer_1_3`` and padding to
    ``max_len``.

    Args:
        new_sentence: Review text to classify.
    """
    cleaned = re.sub(r'[^ㄱ-ㅎㅏ-ㅣ가-힣 ]','', new_sentence)
    tokens = okt.morphs(cleaned, stem=True)
    # Bug fix: the model is trained on X_data_1_03, which was built with
    # `stopwords_01`. Filtering with the long `stopwords` list at inference
    # time fed the model text preprocessed differently from its training data.
    tokens = [word for word in tokens if word not in stopwords_01]
    encoded = tokenizer_1_3.texts_to_sequences([tokens])
    pad_new = pad_sequences(encoded, maxlen = max_len)
    # One sample and one output unit: take the single element explicitly,
    # since float() on an array with ndim > 0 is deprecated in recent NumPy.
    score = float(model.predict(pad_new)[0][0])
    if(score > 0.5):
        print("{:.2f}% 확률로 긍정 리뷰입니다.\n".format(score * 100))
    else:
        print("{:.2f}% 확률로 부정 리뷰입니다.\n".format((1 - score) * 100))

In [None]:
# 일반적인 긍정 리뷰
sentiment_predict('너무 재밌어요^^')

In [None]:
# 일반적인 부정 리뷰
sentiment_predict('진짜 심각하네요...')

In [None]:
# 평론가 긍정 리뷰 : 이동진 - '그랜드 부다페스트 호텔'
sentiment_predict('지나온 적 없는 어제의 세계들에 대한 근원적 노스탤지어.')

In [None]:
# 평론가 부정 리뷰 : 이동진 - '7광구'
sentiment_predict('소재만 있었지, 할 이야기 자체가 없었던 영화.')

In [None]:
# 비꼬기 리뷰
sentiment_predict('감독님은 이 영화 덕분에 만수무강하시겠네요 ^^')