In [3]:
from konlpy.tag import Okt
from tensorflow.keras import models, metrics, losses, optimizers, layers
from tensorflow.keras.models import load_model
from tensorflow.keras.utils import plot_model
from google.colab import drive
from sklearn.model_selection import train_test_split as sspl
import numpy as np
import json
import os
import nltk

# 아래 경로안에 해당 파일들이 모두 들어있어야 합니다.
# 1. train_docs.json
# 2. test_docs.json
# 3. ssukzip_Model.h5

drive.mount('/content/drive')
baseURL = '/content/drive/MyDrive/ssukzip/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [10]:
def read_data(filename):
    with open(filename, 'r', encoding='utf-8') as f:
        data = [line.split('\t') for line in f.read().splitlines()]
        data = data[1:] # txt 파일의 헤더(id document label)는 제외하기
    print(data)
    return data

In [9]:
def tokenize(doc):
    # norm은 정규화, stem은 근어로 표시하기를 나타냄
    val = ['/'.join(t) for t in Okt().pos(doc, norm=True, stem=True)]
    return val

In [15]:
def term_frequency(doc):
    with open(baseURL + 'train_docs.json', encoding="utf-8") as f:
        train_docs = json.load(f)
    with open(baseURL + 'test_docs.json', encoding="utf-8") as f:
        test_docs = json.load(f)
    print(f"train_docs length: {len(train_docs)}")
    print(f"test_docs length: {len(test_docs)}")
    tokens = [t for d in train_docs for t in d[0]]
    print(f"tokens : {tokens}")
    text = nltk.Text(tokens, name='NMSC')
    selected_words = [f[0] for f in text.vocab().most_common(10000)]
    return [doc.count(word) for word in selected_words]

In [12]:
def predict_pos_neg(model, review):
    token = tokenize(review)
    print(f"token : {token}")
    tf = term_frequency(token)
    print(f"tf : {tf}")
    data = np.expand_dims(np.asarray(tf).astype('float32'), axis=0)
    print(f"data : {data}")
    score = float(model.predict(data))
    if(score > 0.5):
        print("[{}]는 {:.2f}% 확률로 긍정 리뷰이지 않을까 추측해봅니다.^^\n".format(review, score * 100))
    else:
        print("[{}]는 {:.2f}% 확률로 부정 리뷰이지 않을까 추측해봅니다.^^;\n".format(review, (1 - score) * 100))
    return score

In [13]:
def classifyReview(modelName, review):
    model = load_model(modelName)
    plot_model(model, to_file='./ssukzip_model_shapes.png', show_shapes=True)
    return predict_pos_neg(model, review)


In [None]:
!pwd

/content


In [16]:
if __name__=="__main__":
    score = classifyReview(baseURL + "ssukzip_Model.h5", input("Your Review : "))
    print(score)

Your Review : 개별로임
token : ['개별/Noun', '로/Josa', '임/Noun']
train_docs length: 150000
test_docs length: 50000


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



tf : [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0