## 1. Install KoNLPy

In [1]:
!apt-get update
!apt-get install g++ openjdk-8-jdk python-dev python3-dev
!pip3 install JPype1-py3
!pip3 install konlpy
!JAVA_HOME="/usr/lib/jvm/java-8-openjdk-amd64"

0% [Working]            Hit:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/ InRelease
0% [Connecting to archive.ubuntu.com (91.189.88.161)] [Waiting for headers] [Wa0% [1 InRelease gpgv 3,609 B] [Connecting to archive.ubuntu.com (91.189.88.161)                                                                               Hit:2 http://security.ubuntu.com/ubuntu bionic-security InRelease
0% [1 InRelease gpgv 3,609 B] [Connecting to archive.ubuntu.com (91.189.88.161)                                                                               Ign:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
0% [1 InRelease gpgv 3,609 B] [Waiting for headers] [Waiting for headers] [Wait                                                                               Hit:4 http://ppa.launchpad.net/graphics-drivers/ppa/ubuntu bionic InRelease
0% [1 InRelease gpgv 3,609 B] [Waiting for headers] [Waiting for headers] [Conn                   

## 2. Preprocessing Tools

In [0]:
import numpy as np
import re
from collections import Counter
from konlpy.tag import Okt
import gensim

# Single module-level Okt tagger instance (konlpy.tag.Okt); morphs_extractor
# below calls okt.morphs on it for every sentence.
okt = Okt()


def morphs_extractor(sentence):
    """
    Tokenize one sentence with the module-level Okt tagger.

    Both normalization and stemming are requested (norm=True, stem=True).
    The value returned by okt.morphs is passed back unchanged.
    """
    return okt.morphs(sentence, norm=True, stem=True)
        
    
def morphs_process(lines):
    """
    Tokenize every sentence in ``lines`` with morphs_extractor.

    Returns a list with one token sequence per input sentence, in input order.
    """
    return [morphs_extractor(sentence) for sentence in lines]

  
def sentence_to_index_morphs(lines, vocab, max_length=0, tokenizer=None):
    """
    Convert sentences into equal-length lists of vocabulary indexes.

    Every sentence is tokenized, truncated or right-padded with '<PAD>' to a
    common length, and each token is replaced by its index in ``vocab``.
    Tokens that are not in ``vocab`` are replaced by ``vocab['<UNK>']``.

    Args:
        lines: iterable of sentences (list, tuple, pandas Series, ...).
            A bare string is rejected, because iterating it would treat every
            character as a separate sentence.
        vocab: mapping from token to integer index.
        max_length: target length of every index list. 0 (the default) means
            "use the length of the longest tokenized sentence".
        tokenizer: optional callable mapping one sentence to a sequence of
            tokens. Defaults to morphs_extractor.

    Returns:
        A list with one list of indexes per input sentence.

    Raises:
        AssertionError: if ``lines`` is a str or bytes object.
        KeyError: if an unknown token is met and ``vocab`` has no '<UNK>' key.
    """
    # The previous check, `type(lines) is list or tuple`, was always true
    # (a bare `tuple` is truthy), so it never rejected anything. Only the
    # harmful case is rejected here so that other iterables keep working.
    assert not isinstance(lines, (str, bytes)), "Input type must be list or tuple."

    if tokenizer is None:
        tokenizer = morphs_extractor

    tokens = [list(tokenizer(line)) for line in lines]

    max_len = max_length
    if max_len == 0:
        max_len = max((len(token) for token in tokens), default=0)

    indexes = []
    for token in tokens:
        # Slicing truncates long sentences; the multiplication yields an empty
        # list when no padding is needed.
        padded = token[:max_len] + ['<PAD>'] * (max_len - len(token))
        indexes.append([vocab[word] if word in vocab else vocab['<UNK>']
                        for word in padded])

    return indexes

  
def batch_iter(data, batch_size, num_epochs, shuffle=True):
    """
    Generates a batch iterator for a dataset.

    Args:
        data: sequence of samples; it is converted to a NumPy array.
        batch_size: maximum number of samples per batch (positive integer).
        num_epochs: number of passes over the data.
        shuffle: when True, the samples are re-permuted at every epoch.

    Yields:
        Consecutive slices of the (possibly shuffled) array. The last batch of
        an epoch may be smaller than ``batch_size``. Nothing is yielded for an
        empty dataset.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer.")

    try:
        data = np.array(data)
    except ValueError:
        # Recent NumPy refuses to build an array from ragged rows such as
        # (index_list, label) pairs; store them as Python objects instead,
        # which is what older NumPy versions did implicitly.
        data = np.array(data, dtype=object)
    data_size = len(data)
    # Ceiling division. Unlike int((n - 1) / batch_size) + 1 it gives 0 for an
    # empty dataset, so no empty batch is produced.
    num_batches_per_epoch = (data_size + batch_size - 1) // batch_size
    for _ in range(num_epochs):
        # Shuffle the data at each epoch
        if shuffle:
            shuffle_indices = np.random.permutation(data_size)
            shuffled_data = data[shuffle_indices]
        else:
            shuffled_data = data
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min(start_index + batch_size, data_size)
            yield shuffled_data[start_index:end_index]
            
            
def make_embedding_vectors(data, embedding_size=300):
    """
    Train Word2Vec on the corpus and build the embedding matrix and vocabulary.

    Args:
        data: iterable of raw sentences; tokenized with morphs_process.
        embedding_size: dimensionality of every embedding vector.

    Returns:
        (embedding, vocab, vocab_size) where
        - embedding is an array whose row i is the vector of the token with
          index i; rows 0 and 1 are random vectors for '<PAD>' and '<UNK>',
        - vocab maps each token to its row index,
        - vocab_size is the number of rows of embedding.
    """
    tokens = morphs_process(data)
    # NOTE(review): `size=` and `wv.vocab` are the gensim 3.x spelling; newer
    # gensim releases renamed them - confirm the installed version.
    wv_model = gensim.models.Word2Vec(min_count=1, window=5, size=embedding_size)
    wv_model.build_vocab(tokens)
    wv_model.train(tokens, total_examples=wv_model.corpus_count, epochs=wv_model.epochs)
    word_vectors = wv_model.wv

    vocab = {'<PAD>': 0, '<UNK>': 1}
    # The special-token vectors must have the same width as the word vectors.
    # They used to be hard-coded to 300, which produced a ragged matrix for
    # any other embedding_size.
    embedding = [np.random.normal(size=embedding_size),
                 np.random.normal(size=embedding_size)]
    # A single pass keeps the index stored in vocab and the row position in
    # embedding aligned by construction.
    for idx, word in enumerate(word_vectors.vocab, start=2):
        vocab[word] = idx
        embedding.append(word_vectors[word])
    embedding = np.asarray(embedding)
    vocab_size = len(embedding)

    return embedding, vocab, vocab_size

## 3. Build Model

In [0]:
import tensorflow as tf

class TextCNN(object):
    """
    Convolutional sentence classifier built with the TensorFlow 1.x graph API.

    The implementation is based on the following:
    dennybritz: simplified implementation of Kim's Convolutional Neural Networks for Sentence Classification paper in Tensorflow.

    Constructing an instance builds the whole graph and runs the global
    variable initializer in the supplied session.
    """

    def __init__(
            self, sess, vocab_size, sequence_length=30, embedding_size=300,
            filter_sizes=(3, 4, 5), num_filters=128, n_class=2, lr=1e-2, trainable=True):
        """
        Store the hyper-parameters and build the graph.

        Args:
            sess: session used for initialization and for every run call.
            vocab_size: number of rows of the embedding matrix.
            sequence_length: number of token indexes per input sentence.
            embedding_size: width of each embedding vector.
            filter_sizes: heights of the convolution filters, one branch each.
            num_filters: number of filters per filter size.
            n_class: number of output classes.
            lr: initial learning rate fed to the exponential decay schedule.
            trainable: whether the embedding matrix is a trainable variable.
        """
        self.sess = sess
        self.vocab_size = vocab_size
        self.sequence_length = sequence_length
        self.embedding_size = embedding_size
        self.filter_sizes = filter_sizes
        self.num_filters = num_filters
        self.n_class = n_class
        self.lr = lr
        self.trainable = trainable
        self._build_net()

    def _build_net(self):
        """
        Build placeholders, embedding, conv/max-pool branches, output layer,
        loss, training op and accuracy, then initialize all global variables.
        """
        # Placeholders for input, output
        with tf.variable_scope("placeholder"):
            # Token indexes, shape (batch, sequence_length).
            self.input_x = tf.placeholder(tf.int32, (None, self.sequence_length))
            # Integer class labels, shape (batch,).
            self.input_y = tf.placeholder(tf.int32, (None,))
            # Receives an externally computed embedding matrix (see embedding_assign).
            self.embedding_placeholder = tf.placeholder(tf.float32, (self.vocab_size, self.embedding_size))

        # Embedding layer for input
        with tf.variable_scope("embedding", reuse=tf.AUTO_REUSE):
            W = tf.get_variable("W", dtype=tf.float32,
                                initializer=tf.random_uniform([self.vocab_size, self.embedding_size], -1.0, 1.0),
                                trainable=self.trainable)
            # Op that overwrites W with the contents of embedding_placeholder.
            self.embedding_init = W.assign(self.embedding_placeholder)
            embedded_chars = tf.nn.embedding_lookup(W, self.input_x)
            # Append a channel axis of size 1, as conv2d expects a 4-D input.
            embedded_chars_expanded = tf.expand_dims(embedded_chars, -1)

        # Create a convolution + max_pool layer for each filter size
        pooled_outputs = []
        for filter_size in self.filter_sizes:
            with tf.variable_scope("conv-maxpool-%s" % filter_size, reuse=tf.AUTO_REUSE):
                # Convolution Layer
                # Each filter spans filter_size tokens and the full embedding width.
                filter_shape = (filter_size, self.embedding_size, 1, self.num_filters)
                W = tf.get_variable("W", dtype=tf.float32,
                                    initializer=tf.truncated_normal(filter_shape, stddev=0.1))
                b = tf.get_variable("b", dtype=tf.float32,
                                    initializer=tf.constant(0.1, shape=(self.num_filters,)))
                conv = tf.nn.conv2d(
                    embedded_chars_expanded,
                    W,
                    strides=[1, 1, 1, 1],
                    padding="VALID",
                    name="conv")
                # Apply nonlinearity
                h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")
                # Maxpooling over the outputs
                # The window height equals the number of positions a VALID
                # convolution of this filter size produces over the sequence.
                pooled = tf.nn.max_pool(
                    h,
                    ksize=[1, self.sequence_length - filter_size + 1, 1, 1],
                    strides=[1, 1, 1, 1],
                    padding='VALID',
                    name="pool")
                pooled_outputs.append(pooled)

        # Combine all the pooled features
        num_filters_total = self.num_filters * len(self.filter_sizes)
        # Concatenate the branches along the filter axis, then flatten.
        h_pool = tf.concat(pooled_outputs, 3)
        h_pool_flat = tf.reshape(h_pool, (-1, num_filters_total))

        # Final (unnormalized) scores and predictions
        with tf.variable_scope("output", reuse=tf.AUTO_REUSE):
            W = tf.get_variable(
                "W",
                shape=(num_filters_total, self.n_class),
                initializer=tf.contrib.layers.xavier_initializer())
            b = tf.Variable(tf.constant(0.1, shape=(self.n_class,)), name="b")
            logits = tf.nn.xw_plus_b(h_pool_flat, W, b, name="logits")
            # Highest softmax probability per sample.
            self.prob = tf.reduce_max(tf.nn.softmax(logits), axis=1, name="prob")
            # Index of the highest-scoring class per sample.
            self.prediction = tf.cast(tf.argmax(logits, 1), tf.int32, name="predictions")

        # Calculate mean cross-entropy loss
        with tf.variable_scope("loss"):
            losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=self.input_y)
            self.loss = tf.reduce_mean(losses)

        with tf.variable_scope("train", reuse=tf.AUTO_REUSE):
            global_step = tf.Variable(0, trainable=False)
            # Staircase decay: the rate is multiplied by 0.9 every 1000 steps.
            learning_rate = tf.train.exponential_decay(self.lr,
                                                       global_step,
                                                       1e+3,
                                                       0.9,
                                                       staircase=True)
            optimizer = tf.train.AdamOptimizer(learning_rate)
            # minimize() also increments global_step on every run.
            self.train_op = optimizer.minimize(self.loss, global_step=global_step)

        # Accuracy
        with tf.variable_scope("accuracy"):
            correct = tf.equal(self.prediction, self.input_y)
            self.accuracy = tf.reduce_mean(tf.cast(correct, tf.float32), name="accuracy")

        # Initializes every global variable in the session.
        self.sess.run(tf.global_variables_initializer())

    def embedding_assign(self, embedding):
        """Overwrite the embedding variable with ``embedding`` and return the result of the assign op."""
        return self.sess.run(self.embedding_init, feed_dict={self.embedding_placeholder: embedding})

    def train(self, input_x, input_y):
        """Run one optimization step on a batch; returns [loss, train_op result]."""
        return self.sess.run([self.loss, self.train_op], feed_dict={self.input_x: input_x, self.input_y: input_y})

    def predict(self, input_x):
        """Return (predicted class indexes, highest softmax probabilities) for a batch."""
        return self.sess.run((self.prediction, self.prob), feed_dict={self.input_x: input_x})

    def get_accuracy(self, input_x, input_y):
        """Return the fraction of samples in the batch whose prediction equals the label."""
        return self.sess.run(self.accuracy, feed_dict={self.input_x: input_x, self.input_y: input_y})

## 4. Train Model

In [0]:
import os, json
import pandas as pd

tf.reset_default_graph()
DIR = "sentiment-model"
# The vocabulary file and the checkpoints are written into DIR, so it has to
# exist before anything is saved.
os.makedirs(DIR, exist_ok=True)

# build dataset
data = pd.read_csv('sentiment.txt', delimiter='\t')
x_input = data.document
y_input = data.label
max_length = 30
print('데이터로부터 정보를 얻는 중입니다.')
embedding, vocab, vocab_size = make_embedding_vectors(list(x_input))
print('완료되었습니다.')

# save vocab, vocab_size, max_length
# (the path used to be a broken string literal, which was a syntax error)
with open(os.path.join(DIR, 'vocab.json'), 'w') as fp:
    json.dump(vocab, fp)

# save configuration
with open('config.txt', 'w') as f:
    f.write(str(vocab_size) + '\n')
    f.write(str(max_length))

# open session
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
with tf.Session(config=config) as sess:
    # make model instance
    model = TextCNN(sess=sess, vocab_size=vocab_size, sequence_length=max_length, trainable=True)

    # assign pretrained embedding vectors
    model.embedding_assign(embedding)

    # make train batches
    x_input = sentence_to_index_morphs(x_input, vocab, max_length)
    batches = batch_iter(list(zip(x_input, y_input)), batch_size=64, num_epochs=10)

    # model saver
    saver = tf.train.Saver(max_to_keep=1, keep_checkpoint_every_n_hours=0.5)

    # train model
    print('모델 훈련을 시작합니다.')
    avgLoss = []
    # Pre-set so the final save below still works when no batch is produced.
    step = -1
    for step, batch in enumerate(batches):
        x_train, y_train = zip(*batch)
        l, _ = model.train(x_train, y_train)
        avgLoss.append(l)
        # Report the running mean loss and checkpoint every 100 batches.
        if (step + 1) % 100 == 0:
            print('batch:', '%03d' % (step + 1), 'loss:', '%05f' % np.mean(avgLoss))
            saver.save(sess, os.path.join(DIR, "model"), global_step=step+1)
            avgLoss = []

    # final checkpoint after the last batch
    saver.save(sess, os.path.join(DIR, "model"), global_step=step+1)

## 5. Enjoy Sentiment Analysis!

In [0]:
# Imported here as well so this cell also runs when the training cell was
# skipped and only a previously trained model is loaded.
import json
import os

tf.reset_default_graph()
DIR = "sentiment-model"

# load vocab, vocab_size, max_length
# (the path used to be a broken string literal, which was a syntax error)
with open(os.path.join(DIR, 'vocab.json'), 'r') as fp:
    vocab = json.load(fp)

# load configuration
with open('config.txt', 'r') as f:
    # int() ignores the trailing newline, so no explicit stripping is needed.
    vocab_size = int(f.readline())
    max_length = int(f.readline())

# open session
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
with tf.Session(config=config) as sess:
    # make model instance
    model = TextCNN(sess=sess, vocab_size=vocab_size, sequence_length=max_length, trainable=True)

    # load trained model
    checkpoint = tf.train.latest_checkpoint(DIR)
    if checkpoint is None:
        # latest_checkpoint returns None when DIR holds no checkpoint; fail
        # with a clear message instead of an obscure error inside restore().
        raise FileNotFoundError("No checkpoint found in '%s'. Train the model first." % DIR)
    saver = tf.train.Saver()
    saver.restore(sess, checkpoint)

    # inference
    while True:
        test = input("User >> ")
        if test == "exit":
            break
        speak = sentence_to_index_morphs([test], vocab, max_length)
        label, prob = model.predict(speak)
        # Below 0.6 confidence the bot gives a neutral answer instead of
        # committing to either label.
        if prob[0] < 0.6:
            response = '차분해 보이시네요 :)'
        elif label[0] == 0:
            response = '기분이 좋지 않아 보여요 :('
        else:
            response = '기분이 좋아 보이시네요!'
        print("Bot >> ", response, "\n")