In [1]:
!pip install gensim==4.1.2



In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
%matplotlib inline

In [3]:
from google.colab import drive
# Mount Google Drive at /gdrive; later cells read the .npy inputs from, and write
# the model checkpoints to, paths under this mount point.
drive.mount('/gdrive')

Mounted at /gdrive


In [4]:
# Directory on the mounted Drive that holds the prepared input arrays.
DATA_DIR = "/gdrive/My Drive/Colab Notebooks"

# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary Python objects,
# so only load files from a trusted source.
X_morphs = np.load(f"{DATA_DIR}/final_X_morphs.npy", allow_pickle=True)
y_morphs = np.load(f"{DATA_DIR}/final_y_morphs.npy", allow_pickle=True)

In [5]:
from sklearn.preprocessing import LabelEncoder

# Turn the class labels into integer ids. `enc` is kept so that predicted ids can be
# mapped back to label names with enc.inverse_transform later on.
# NOTE: this rebinds `y_morphs`. Re-running this cell without reloading the data would
# fit the encoder on the already-encoded ids instead of the original labels.
enc = LabelEncoder()
enc.fit(y_morphs)
y_morphs = enc.transform(y_morphs)

In [6]:
from sklearn.model_selection import train_test_split

# 70/30 train/test split with a fixed seed, stratified on the encoded labels.
Xm_train, Xm_test, ym_train, ym_test = train_test_split(
    X_morphs,
    y_morphs,
    test_size=0.3,
    stratify=y_morphs,
    random_state=777,
)

In [7]:
from gensim.models import FastText, Word2Vec

# Both embedding models are trained on the training split only and share the same
# hyper-parameters, so the downstream comparison differs only in the algorithm.
embedding_kwargs = dict(vector_size=100, window=2, min_count=5, sg=1)

ft = FastText(sentences=Xm_train, **embedding_kwargs)
w2v = Word2Vec(sentences=Xm_train, **embedding_kwargs)

In [40]:
# Ten nearest neighbours of '눈물' ("tears") in the Word2Vec space.
w2v.wv.most_similar(positive=['눈물'], topn=10)

[('자꾸', 0.9706909656524658),
 ('그렇게', 0.9573221802711487),
 ('조금', 0.9549165964126587),
 ('맘', 0.9537326097488403),
 ('갑자기', 0.9529294967651367),
 ('이렇게', 0.9521529078483582),
 ('겁', 0.9513979554176331),
 ('본적', 0.9491633772850037),
 ('기억', 0.9474244713783264),
 ('처음', 0.9472329020500183)]

In [41]:
# Ten nearest neighbours of '눈물' ("tears") in the FastText space.
ft.wv.most_similar(positive=['눈물'], topn=10)

[('가슴', 0.9452393054962158),
 ('빠르다', 0.9356359839439392),
 ('괜히', 0.9344004392623901),
 ('자꾸', 0.9306739568710327),
 ('오르다', 0.928200364112854),
 ('생각나다', 0.9254056811332703),
 ('눈물나다', 0.9247233271598816),
 ('두렵다', 0.9235146045684814),
 ('조금', 0.9216457605361938),
 ('갑자기', 0.9202181696891785)]

In [42]:
# Ten nearest neighbours of '더럽다' ("to be dirty") in the Word2Vec space.
w2v.wv.most_similar(positive=['더럽다'], topn=10)

[('억울하다', 0.9859548211097717),
 ('선동', 0.9854364991188049),
 ('벌레', 0.984391450881958),
 ('권력', 0.9842827320098877),
 ('미개하다', 0.9842439889907837),
 ('공무원', 0.9841915965080261),
 ('국회의원', 0.9834848642349243),
 ('하나같이', 0.982538104057312),
 ('사기꾼', 0.9817013144493103),
 ('똑같이', 0.9816969037055969)]

In [43]:
# Ten nearest neighbours of '더럽다' ("to be dirty") in the FastText space.
ft.wv.most_similar(positive=['더럽다'], topn=10)

[('미개하다', 0.9711623787879944),
 ('도둑', 0.9690681099891663),
 ('설치다', 0.9689338207244873),
 ('끼리', 0.9660525321960449),
 ('넘치다', 0.9656282067298889),
 ('똑똑하다', 0.9646580219268799),
 ('뭉치다', 0.9635692834854126),
 ('진정', 0.9632050395011902),
 ('닥치다', 0.9620828032493591),
 ('망치다', 0.9615488052368164)]

In [44]:
# Imported from tensorflow.keras (not the standalone `keras` package) so that the
# preprocessing utilities come from the same Keras implementation as the model cells.
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Fixed sequence length fed to the models; any later padding of new text must use
# the same value.
MAX_LEN = 30

# The vocabulary is built from the training split only; the test split is merely
# converted with the fitted tokenizer.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(Xm_train)
sequences = tokenizer.texts_to_sequences(Xm_train)
Xm_test_tok = tokenizer.texts_to_sequences(Xm_test)

Xm_train_pad = pad_sequences(sequences, maxlen=MAX_LEN)
Xm_test_pad = pad_sequences(Xm_test_tok, maxlen=MAX_LEN)

In [54]:
# One row per tokenizer index plus one extra row, so row 0 (never assigned below)
# stays all-zero.
VOCAB_SIZE = len(tokenizer.index_word) + 1
# Must equal the vector_size the FastText model was trained with.
EMBEDDING_DIM = 100

embedding_matrix = np.zeros((VOCAB_SIZE, EMBEDDING_DIM))

# Copy the FastText vector of every tokenizer word into the row given by its index.
# NOTE(review): for gensim FastText, `word in ft.wv` may also be True for words outside
# the trained vocabulary (vectors built from subword n-grams) - confirm this is intended.
for word, idx in tokenizer.word_index.items():
    if word in ft.wv:
        embedding_matrix[idx] = ft.wv[word]

embedding_matrix.shape

(20043, 100)

In [56]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, InputLayer
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

hidden_units = 128
num_classes = 7

# Frozen FastText embeddings -> single LSTM -> softmax over the classes.
model = Sequential([
    Embedding(VOCAB_SIZE, EMBEDDING_DIM, weights=[embedding_matrix], trainable=False),
    LSTM(hidden_units),
    Dense(num_classes, activation='softmax'),
])

# NOTE: the two callbacks watch different metrics. Early stopping restores the weights
# with the lowest val_loss, while the checkpoint file keeps the weights with the highest
# val_acc, so the in-memory model and the saved file may come from different epochs.
es = EarlyStopping(
    monitor='val_loss',
    mode='min',
    verbose=1,
    patience=4,
    restore_best_weights=True,
)
mc = ModelCheckpoint(
    '/gdrive/My Drive/Colab Notebooks/ft_lstm.h5',
    monitor='val_acc',
    mode='max',
    verbose=1,
    save_best_only=True,
)

# Labels are integer ids, hence the sparse categorical loss.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['acc'])
history = model.fit(
    Xm_train_pad,
    ym_train,
    epochs=20,
    batch_size=128,
    validation_split=0.15,
    callbacks=[es, mc],
)

Epoch 1/20
Epoch 00001: val_acc improved from -inf to 0.39157, saving model to /gdrive/My Drive/Colab Notebooks/ft_lstm.h5
Epoch 2/20
Epoch 00002: val_acc improved from 0.39157 to 0.40818, saving model to /gdrive/My Drive/Colab Notebooks/ft_lstm.h5
Epoch 3/20
Epoch 00003: val_acc did not improve from 0.40818
Epoch 4/20
Epoch 00004: val_acc did not improve from 0.40818
Epoch 5/20
Epoch 00005: val_acc improved from 0.40818 to 0.41859, saving model to /gdrive/My Drive/Colab Notebooks/ft_lstm.h5
Epoch 6/20
Epoch 00006: val_acc did not improve from 0.41859
Epoch 7/20
Epoch 00007: val_acc did not improve from 0.41859
Epoch 8/20
Epoch 00008: val_acc improved from 0.41859 to 0.41884, saving model to /gdrive/My Drive/Colab Notebooks/ft_lstm.h5
Epoch 9/20
Epoch 00009: val_acc improved from 0.41884 to 0.42082, saving model to /gdrive/My Drive/Colab Notebooks/ft_lstm.h5
Epoch 10/20
Epoch 00010: val_acc improved from 0.42082 to 0.42751, saving model to /gdrive/My Drive/Colab Notebooks/ft_lstm.h5
Ep

In [57]:
# One row per tokenizer index plus one extra row, so row 0 (never assigned below)
# stays all-zero.
VOCAB_SIZE = len(tokenizer.index_word) + 1
# Must equal the vector_size the Word2Vec model was trained with.
EMBEDDING_DIM = 100

embedding_matrix2 = np.zeros((VOCAB_SIZE, EMBEDDING_DIM))

# Copy the Word2Vec vector of every tokenizer word the model knows into the row given
# by its index; words without a vector keep an all-zero row.
for word, idx in tokenizer.word_index.items():
    if word in w2v.wv:
        embedding_matrix2[idx] = w2v.wv[word]

embedding_matrix2.shape

(20043, 100)

In [58]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, InputLayer
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

hidden_units = 128
num_classes = 7

# Frozen Word2Vec embeddings -> single LSTM -> softmax over the classes.
model2 = Sequential([
    Embedding(VOCAB_SIZE, EMBEDDING_DIM, weights=[embedding_matrix2], trainable=False),
    LSTM(hidden_units),
    Dense(num_classes, activation='softmax'),
])

# NOTE: the two callbacks watch different metrics. Early stopping restores the weights
# with the lowest val_loss, while the checkpoint file keeps the weights with the highest
# val_acc, so the in-memory model and the saved file may come from different epochs.
es = EarlyStopping(
    monitor='val_loss',
    mode='min',
    verbose=1,
    patience=4,
    restore_best_weights=True,
)
mc = ModelCheckpoint(
    '/gdrive/My Drive/Colab Notebooks/w2v_lstm.h5',
    monitor='val_acc',
    mode='max',
    verbose=1,
    save_best_only=True,
)

# Labels are integer ids, hence the sparse categorical loss.
model2.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['acc'])
# NOTE: `history` is rebound here, so any history object previously stored under this
# name is no longer reachable through it.
history = model2.fit(
    Xm_train_pad,
    ym_train,
    epochs=20,
    batch_size=128,
    validation_split=0.15,
    callbacks=[es, mc],
)

Epoch 1/20
Epoch 00001: val_acc improved from -inf to 0.37348, saving model to /gdrive/My Drive/Colab Notebooks/w2v_lstm.h5
Epoch 2/20
Epoch 00002: val_acc improved from 0.37348 to 0.37546, saving model to /gdrive/My Drive/Colab Notebooks/w2v_lstm.h5
Epoch 3/20
Epoch 00003: val_acc improved from 0.37546 to 0.38786, saving model to /gdrive/My Drive/Colab Notebooks/w2v_lstm.h5
Epoch 4/20
Epoch 00004: val_acc improved from 0.38786 to 0.40620, saving model to /gdrive/My Drive/Colab Notebooks/w2v_lstm.h5
Epoch 5/20
Epoch 00005: val_acc did not improve from 0.40620
Epoch 6/20
Epoch 00006: val_acc did not improve from 0.40620
Epoch 7/20
Epoch 00007: val_acc did not improve from 0.40620
Epoch 8/20
Epoch 00008: val_acc improved from 0.40620 to 0.41165, saving model to /gdrive/My Drive/Colab Notebooks/w2v_lstm.h5
Epoch 9/20
Epoch 00009: val_acc did not improve from 0.41165
Epoch 10/20
Epoch 00010: val_acc did not improve from 0.41165
Epoch 11/20
Epoch 00011: val_acc did not improve from 0.41165


In [59]:
# Test-set result of the FastText-based model, returned as [loss, acc].
model.evaluate(x=Xm_test_pad, y=ym_test)



[1.4564201831817627, 0.4426613450050354]

In [60]:
# Test-set result of the Word2Vec-based model, returned as [loss, acc].
model2.evaluate(x=Xm_test_pad, y=ym_test)



[1.5001394748687744, 0.4192401170730591]

In [61]:
# Hand-written probe sentences used to compare the two classifiers qualitatively.

# Basic expressions
sent_list = ['정말 행복해',
             '정말 슬퍼',
             '정말 무서워',
             '정말 싫어',
             '정말 화가 나',
             '깜짝 놀랐어',
             '종이는 하얗다']

# Complex expressions
sent_list2 = ['ㅋㅋㅋ 유재석 김태호 조합은 믿고 보는거지~',
              '그저께 어머니가 돌아가시고 세상을 잃은 기분입니다..',
              '아이가 어제부터 토를 계속 하는데 어떻게 해야하죠??',
              '이놈이나 저놈이나 다 똑같은 놈들이야',
              '범죄자들 얼굴을 왜 가리나? 신상공개하라!!',
              '이런거보면 참 우주는 대단한 듯.. 37억이라는 세월.. 짐작도 안간다.',
              '저는 법학도이고 현재는 로스쿨에 재학중입니다.']

# Ungrammatical sentences and emoticons (the misspellings are part of the test input)
sent_list3 = ['넘넘 추카해요~~~^^',
              '하.... 나 시험 개망햇다...ㅠ..',
              '나 이번달에 생리를 안하는데?? 어떡하징 ㄷㄷ',
              '표절가수 얼굴 보기도 실타ㅋㅋ 나오지 마라',
              '아 징짜 ㅡㅡ 초딩은 사람도 아님?',
              '헐 마라탕 위생 문졔 있다구?? 나 마라탕 어제도 먹엇는데??',
              '이번네 삼성에서 새로운 핸드폰이 출시됩비다.']

In [62]:
# All probe sentences in one list: basic, then complex, then ungrammatical.
sentence = [*sent_list, *sent_list2, *sent_list3]

In [64]:
!pip install konlpy

Collecting konlpy
  Downloading konlpy-0.5.2-py2.py3-none-any.whl (19.4 MB)
[K     |████████████████████████████████| 19.4 MB 6.8 MB/s 
[?25hCollecting colorama
  Downloading colorama-0.4.4-py2.py3-none-any.whl (16 kB)
Collecting JPype1>=0.7.0
  Downloading JPype1-1.3.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (448 kB)
[K     |████████████████████████████████| 448 kB 37.9 MB/s 
[?25hCollecting beautifulsoup4==4.6.0
  Downloading beautifulsoup4-4.6.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 5.4 MB/s 
Installing collected packages: JPype1, colorama, beautifulsoup4, konlpy
  Attempting uninstall: beautifulsoup4
    Found existing installation: beautifulsoup4 4.6.3
    Uninstalling beautifulsoup4-4.6.3:
      Successfully uninstalled beautifulsoup4-4.6.3
Successfully installed JPype1-1.3.0 beautifulsoup4-4.6.0 colorama-0.4.4 konlpy-0.5.2


In [66]:
from konlpy.tag import Okt
# Shared Okt analyzer instance; sent_to_morphs calls okt.morphs on it.
okt = Okt()

def sent_to_morphs(sent, stopwords, norm):
    """Split a sentence into stemmed morphemes and normalise symbol/emoticon tokens.

    Args:
        sent: Raw sentence passed to ``okt.morphs(sent, stem=True)``.
        stopwords: Container of morphemes to drop from the result.
        norm: Iterable of symbol strings. A morpheme containing one of them is
            replaced by that symbol, so a run such as ``'ㅋㅋㅋ'`` collapses to ``'ㅋ'``.

    Returns:
        A list of tokens. For each kept morpheme, in order: every entry of ``norm``
        found in it (in ``norm`` order), then ``'ㅠ'`` if it contains ``'ㅜ'``;
        when none of these apply, the morpheme itself.
    """
    output = []
    for morph in okt.morphs(sent, stem=True):
        if morph in stopwords:
            continue

        # Collapse repeated emoticons / punctuation to the single matching symbol.
        replacements = [symbol for symbol in norm if symbol in morph]

        # 'ㅜ' is mapped to 'ㅠ'.
        # NOTE(review): a morpheme holding both 'ㅠ' and 'ㅜ' therefore yields 'ㅠ' twice.
        if 'ㅜ' in morph:
            replacements.append('ㅠ')

        output.extend(replacements if replacements else [morph])
    return output

In [67]:
# No stopwords are removed; `norm` lists the symbols / emoticon characters that
# sent_to_morphs collapses to a single token.
stopwords = []
norm = ['!', '?', 'ㅠ', 'ㅋ', 'ㅎ', '.', '~', ',', ';', '^', 'ㄷ', 'ㅡ', 'ㅉ']

sent_m = [sent_to_morphs(text, stopwords, norm) for text in sentence]

# Convert with the tokenizer fitted on the training split and pad to the same
# length (30) that was used for the training sequences.
sent_m_tok = tokenizer.texts_to_sequences(sent_m)
sent_m_pad = pad_sequences(sent_m_tok, maxlen=30)

In [68]:
# Class probabilities from the FastText-based and the Word2Vec-based model.
pred1 = model.predict(sent_m_pad)
pred2 = model2.predict(sent_m_pad)

# pred[0] / pred[1]: most probable class id per sentence for each model.
pred = [np.argmax(probs, axis=1) for probs in (pred1, pred2)]

# result[0] / result[1]: the same predictions decoded back to label names.
result = [enc.inverse_transform(class_ids) for class_ids in pred]

In [69]:
# Print each sentence, then (indented) the FastText-model and Word2Vec-model labels.
for text, label_ft, label_w2v in zip(sentence, result[0], result[1]):
    print(text)
    print('\t\t\t\t\t\t\t\t', label_ft, label_w2v)

정말 행복해
								 행복 행복
정말 슬퍼
								 슬픔 행복
정말 무서워
								 공포 슬픔
정말 싫어
								 행복 행복
정말 화가 나
								 놀람 혐오
깜짝 놀랐어
								 행복 행복
종이는 하얗다
								 중립 중립
ㅋㅋㅋ 유재석 김태호 조합은 믿고 보는거지~
								 행복 행복
그저께 어머니가 돌아가시고 세상을 잃은 기분입니다..
								 놀람 공포
아이가 어제부터 토를 계속 하는데 어떻게 해야하죠??
								 공포 공포
이놈이나 저놈이나 다 똑같은 놈들이야
								 분노 분노
범죄자들 얼굴을 왜 가리나? 신상공개하라!!
								 분노 분노
이런거보면 참 우주는 대단한 듯.. 37억이라는 세월.. 짐작도 안간다.
								 놀람 혐오
저는 법학도이고 현재는 로스쿨에 재학중입니다.
								 슬픔 공포
넘넘 추카해요~~~^^
								 행복 행복
하.... 나 시험 개망햇다...ㅠ..
								 슬픔 슬픔
나 이번달에 생리를 안하는데?? 어떡하징 ㄷㄷ
								 공포 공포
표절가수 얼굴 보기도 실타ㅋㅋ 나오지 마라
								 행복 중립
아 징짜 ㅡㅡ 초딩은 사람도 아님?
								 놀람 놀람
헐 마라탕 위생 문졔 있다구?? 나 마라탕 어제도 먹엇는데??
								 놀람 놀람
이번네 삼성에서 새로운 핸드폰이 출시됩비다.
								 놀람 놀람
