In [None]:
import gensim
import numpy as np
import pandas as pd
import tensorflow as tf
from data_process import morphs_process, batch_iter, sentence_to_index_morphs
from word2vec import make_embedding_vectors
from models import LSTM, CNN

In [None]:
# Read the larger tab-separated train/test dumps (train first, then test).
train, test = (
    pd.read_csv(corpus_path, delimiter='\t')
    for corpus_path in ('./data/train-50T.txt', './data/test-10T.txt')
)

In [None]:
# Stack the train and test frames and keep only the `document` column; this
# combined series is the corpus used for the embedding cells below.
# `pd.concat` replaces `DataFrame.append`, which was deprecated in pandas 1.4
# and removed in 2.0. As with `append`, the original row indices are kept.
data = pd.concat([train, test]).document

# Show embedding vectors

In [None]:
# Exploratory word2vec run over the combined corpus.
# `morphs_process` is a project helper; presumably it splits each document
# into morpheme tokens -- TODO confirm against data_process.py.
tokens = morphs_process(data)

# 300-dimensional vectors, context window of 5, keep every token (min_count=1).
# NOTE(review): `size=` is the pre-4.0 gensim keyword (named `vector_size`
# from gensim 4.0 on) -- confirm the pinned gensim version.
wv_model = gensim.models.Word2Vec(size=300, window=5, min_count=1)
wv_model.build_vocab(tokens)
wv_model.train(tokens, epochs=wv_model.epochs, total_examples=wv_model.corpus_count)

# Keep a handle on the trained keyed vectors for the similarity query.
word_vectors = wv_model.wv

In [None]:
# Sanity-check the exploratory embedding by printing the nearest neighbours
# of one query token from the trained vectors.
print(word_vectors.most_similar('괜찮'))

In [None]:
# Unbind the exploratory model and its vectors; neither name is used again
# in the cells that follow.
del wv_model
del word_vectors

# Get embedding vectors using word2vec

In [None]:
# Build the embedding inputs for the classifiers from the combined corpus.
# `make_embedding_vectors` is a project helper (word2vec.py); it returns three
# values, bound here as `embedding`, `vocab` and `vocab_size`. `embedding` is
# later passed to `model.embedding_assign`, `vocab` to
# `sentence_to_index_morphs`, and `vocab_size` to the model constructors.
# NOTE(review): the exact types/shapes of the three values are not visible here.
embedding, vocab, vocab_size = make_embedding_vectors(list(data))

In [None]:
# Load the smaller tab-separated splits used for classifier training and
# evaluation. This rebinds `train`/`test`, replacing the larger frames that
# were loaded for the embedding corpus.
train = pd.read_csv('./data/train-5T.txt', delimiter='\t')
test = pd.read_csv('./data/test-1T.txt', delimiter='\t')

# Inputs come from the `document` column, targets from the `label` column.
X_train, Y_train = train.document, train.label
X_test, Y_test = test.document, test.label

# Sentiment Analysis with LSTM using morphs & word2vec

In [None]:
# Mini-batch source for the LSTM run: (document, label) pairs,
# batch size 64, 15 epochs.
batches = batch_iter(
    list(zip(X_train, Y_train)),
    num_epochs=15,
    batch_size=64,
)

In [None]:
# Train the LSTM sentiment classifier and report running train/test metrics
# every 100 steps.

# Close the session left behind by an earlier run of a training cell.
# Without this, every re-run creates a new tf.Session while the previous one
# stays open and keeps holding its resources.
try:
    sess.close()
except NameError:
    pass  # fresh kernel: no earlier session to release

config = tf.ConfigProto()
config.gpu_options.allow_growth = True  # grow GPU memory on demand
tf.reset_default_graph()
sess = tf.Session(config=config)
model = LSTM(sess=sess, vocab_size=vocab_size, lr=1e-2)
# Hand the precomputed embedding to the model (project API; presumably it
# initialises the embedding layer -- TODO confirm in models.py).
model.embedding_assign(embedding)

# Metric buffers; they are emptied after each report, so every printed mean
# covers only the steps since the previous report.
train_loss = []
train_acc = []
test_loss = []
test_acc = []

for step, batch in enumerate(batches):
    x_train, y_train = zip(*batch)
    x_train = sentence_to_index_morphs(x_train, vocab)
    # Accuracy is measured before the optimisation step on the same batch.
    acc = model.get_accuracy(x_train, y_train)
    l, _ = model.train(x_train, y_train)
    train_loss.append(l)
    train_acc.append(acc)

    if step % 100 == 0:
        # One pass over the test set in batches of 64; the per-batch metrics
        # are averaged for the report.
        test_batches = batch_iter(list(zip(X_test, Y_test)), batch_size=64, num_epochs=1)
        for test_batch in test_batches:
            x_test, y_test = zip(*test_batch)
            x_test = sentence_to_index_morphs(x_test, vocab)
            t_acc = model.get_accuracy(x_test, y_test)
            t_loss = model.get_loss(x_test, y_test)
            test_loss.append(t_loss)
            test_acc.append(t_acc)
        print('batch:', '%04d' % step, '\ntrain loss:', '%.5f' % np.mean(train_loss), '\ttest loss:', '%.5f' % np.mean(test_loss))
        print('train accuracy:', '%.3f' % np.mean(train_acc), '\ttest accuracy:', '%.3f' % np.mean(test_acc), '\n')
        train_loss = []
        train_acc = []
        test_loss = []
        test_acc = []

# Sentiment Analysis with CNN using morphs & word2vec

In [None]:
# Mini-batch source for the CNN run: (document, label) pairs,
# batch size 64, 15 epochs.
batches = batch_iter(
    list(zip(X_train, Y_train)),
    num_epochs=15,
    batch_size=64,
)

In [None]:
# Train the CNN sentiment classifier and report running train/test metrics
# every 100 steps.

# Close the session left behind by an earlier training cell (e.g. the LSTM
# run) before resetting the graph. Otherwise that session stays open, holding
# its resources, while a second one is created.
try:
    sess.close()
except NameError:
    pass  # fresh kernel: no earlier session to release

config = tf.ConfigProto()
config.gpu_options.allow_growth = True  # grow GPU memory on demand
tf.reset_default_graph()
sess = tf.Session(config=config)
# The same length is given to the model (`sequence_length`) and to
# `sentence_to_index_morphs`, so the indexed inputs match the model's input.
max_length = 30
model = CNN(sess=sess, vocab_size=vocab_size, sequence_length=max_length, lr=1e-2)
# Hand the precomputed embedding to the model (project API; presumably it
# initialises the embedding layer -- TODO confirm in models.py).
model.embedding_assign(embedding)

# Metric buffers; they are emptied after each report, so every printed mean
# covers only the steps since the previous report.
train_loss = []
train_acc = []
test_loss = []
test_acc = []

for step, batch in enumerate(batches):
    x_train, y_train = zip(*batch)
    x_train = sentence_to_index_morphs(x_train, vocab, max_length)
    # Accuracy is measured before the optimisation step on the same batch.
    acc = model.get_accuracy(x_train, y_train)
    l, _ = model.train(x_train, y_train)
    train_loss.append(l)
    train_acc.append(acc)

    if step % 100 == 0:
        # One pass over the test set in batches of 64; the per-batch metrics
        # are averaged for the report.
        test_batches = batch_iter(list(zip(X_test, Y_test)), batch_size=64, num_epochs=1)
        for test_batch in test_batches:
            x_test, y_test = zip(*test_batch)
            x_test = sentence_to_index_morphs(x_test, vocab, max_length)
            t_acc = model.get_accuracy(x_test, y_test)
            t_loss = model.get_loss(x_test, y_test)
            test_loss.append(t_loss)
            test_acc.append(t_acc)
        print('batch:', '%04d' % step, '\ntrain loss:', '%.5f' % np.mean(train_loss), '\ttest loss:', '%.5f' % np.mean(test_loss))
        print('train accuracy:', '%.3f' % np.mean(train_acc), '\ttest accuracy:', '%.3f' % np.mean(test_acc), '\n')
        train_loss = []
        train_acc = []
        test_loss = []
        test_acc = []