In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from data_process import *
from models import logistic_regression, LSTM, CNN

In [None]:
# Read the tab-delimited train and test files into DataFrames (train first,
# then test).
train, test = (
    pd.read_csv(path, delimiter='\t')
    for path in ('./data/train-50T.txt', './data/test-10T.txt')
)

In [None]:
# X_* hold the `document` column and Y_* the `label` column of each frame.
X_train, Y_train = train.document, train.label
X_test, Y_test = test.document, test.label

In [None]:
# Build the vocabulary from the training documents. `max_vocab` is passed
# straight to build_vocab_pos (presumably a cap on vocabulary size; confirm in
# data_process). The middle element of the returned triple is not used here.
max_vocab = 50000
vocab_result = build_vocab_pos(X_train, max_vocab)
vocab, _, vocab_size = vocab_result

In [None]:
# Pair each training document with its label, then ask batch_iter for batches
# with batch_size=64 and num_epochs=5.
# NOTE(review): if batch_iter returns a one-shot iterator, this cell must be
# re-run before every training pass; confirm in data_process.
train_pairs = list(zip(X_train, Y_train))
batches = batch_iter(train_pairs, batch_size=64, num_epochs=5)

In [None]:
############ logistic regression ############
# Start from an empty default graph and open a session configured with
# gpu_options.allow_growth enabled.
tf.reset_default_graph()
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
model = logistic_regression(sess=sess, vocab_size=vocab_size)

# Train on every batch in `batches`. Whenever the step index is a multiple of
# 500, print the mean of the losses collected since the previous report and
# start a new collection window.
avgLoss = []
for step, batch in enumerate(batches):
    docs, y_train = zip(*batch)
    x_train = sentence_to_onehot_pos(docs, vocab)
    l, _ = model.train(x_train, y_train)
    avgLoss.append(l)
    if step % 500 != 0:
        continue
    print('batch:', '%04d' % step, 'loss:', '%05f' % np.mean(avgLoss))
    avgLoss = []

In [None]:
# Pair each test document with its label and request a single pass
# (num_epochs=1) in batches of 64 for evaluation.
test_pairs = list(zip(X_test, Y_test))
batches = batch_iter(test_pairs, batch_size=64, num_epochs=1)

In [None]:
# Accumulate (batch accuracy * batch size) over all test batches, then divide
# by the number of test rows. Assumes get_accuracy returns the mean accuracy of
# the batch it is given (TODO confirm in models).
acc = 0
for batch in batches:
    docs, y_test = zip(*batch)
    x_test = sentence_to_onehot_pos(docs, vocab)
    batch_acc = model.get_accuracy(x_test, y_test)
    acc += batch_acc * len(x_test)
acc /= len(X_test)
print(acc)

In [None]:
# Pair each training document with its label, then ask batch_iter for batches
# with batch_size=64 and num_epochs=5 for the LSTM run.
# NOTE(review): if batch_iter returns a one-shot iterator, this cell must be
# re-run before every training pass; confirm in data_process.
train_pairs = list(zip(X_train, Y_train))
batches = batch_iter(train_pairs, batch_size=64, num_epochs=5)

In [None]:
################ LSTM ################
# Start from an empty default graph and open a session configured with
# gpu_options.allow_growth enabled.
tf.reset_default_graph()
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
model = LSTM(sess=sess, vocab_size=vocab_size)

# Train on every batch in `batches`; sentences are converted with
# sentence_to_index_pos using its default length handling. Whenever the step
# index is a multiple of 500, print the mean of the losses collected since the
# previous report and start a new collection window.
avgLoss = []
for step, batch in enumerate(batches):
    docs, y_train = zip(*batch)
    x_train = sentence_to_index_pos(docs, vocab)
    l, _ = model.train(x_train, y_train)
    avgLoss.append(l)
    if step % 500 != 0:
        continue
    print('batch:', '%04d' % step, 'loss:', '%05f' % np.mean(avgLoss))
    avgLoss = []

In [None]:
# Pair each test document with its label and request a single pass
# (num_epochs=1) in batches of 64 for the LSTM evaluation.
test_pairs = list(zip(X_test, Y_test))
batches = batch_iter(test_pairs, batch_size=64, num_epochs=1)

In [None]:
# Evaluate the LSTM on the test batches.
#
# The test sentences are converted with the same sentence_to_index_pos call
# used for LSTM training (no explicit length argument), so evaluation inputs
# are built the same way as training inputs rather than with a separate,
# hard-coded length.
#
# Accuracy is accumulated as (batch accuracy * batch size) and divided by the
# number of test rows. Assumes get_accuracy returns the mean accuracy of the
# batch it is given (TODO confirm in models).
acc = 0
for batch in batches:
    x_test, y_test = zip(*batch)
    x_test = sentence_to_index_pos(x_test, vocab)
    acc += model.get_accuracy(x_test, y_test) * len(x_test)
acc /= len(X_test)
print(acc)

In [None]:
# Spot-check the current model on two hand-written sentences.
sample_sentences = [
    '너무재밓없었다그래서보는것을추천하지않는다...!',
    '생각보다꿀잼',
]
tester = sentence_to_index_pos(sample_sentences, vocab)
predictions = model.predict(tester)
print(predictions)

In [None]:
# Pair each training document with its label, then ask batch_iter for batches
# with batch_size=64 and num_epochs=5 for the CNN run.
# NOTE(review): if batch_iter returns a one-shot iterator, this cell must be
# re-run before every training pass; confirm in data_process.
train_pairs = list(zip(X_train, Y_train))
batches = batch_iter(train_pairs, batch_size=64, num_epochs=5)

In [None]:
################ CNN ################
# Start from an empty default graph and open a session configured with
# gpu_options.allow_growth enabled.
tf.reset_default_graph()
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)

# The same max_length is given to the model (as sequence_length) and to the
# sentence-to-index conversion, so both agree on input length.
max_length = 30
model = CNN(sess=sess, vocab_size=vocab_size, sequence_length=max_length)

# Train on every batch in `batches`. Whenever the step index is a multiple of
# 500, print the mean of the losses collected since the previous report and
# start a new collection window.
avgLoss = []
for step, batch in enumerate(batches):
    docs, y_train = zip(*batch)
    x_train = sentence_to_index_pos(docs, vocab, max_length)
    l, _ = model.train(x_train, y_train)
    avgLoss.append(l)
    if step % 500 != 0:
        continue
    print('batch:', '%04d' % step, 'loss:', '%05f' % np.mean(avgLoss))
    avgLoss = []

In [None]:
# Pair each test document with its label and request a single pass
# (num_epochs=1) in batches of 64 for the CNN evaluation.
test_pairs = list(zip(X_test, Y_test))
batches = batch_iter(test_pairs, batch_size=64, num_epochs=1)

In [None]:
# Accumulate (batch accuracy * batch size) over all test batches, then divide
# by the number of test rows. Sentences are converted with the same max_length
# used when the CNN was built. Assumes get_accuracy returns the mean accuracy
# of the batch it is given (TODO confirm in models).
acc = 0
for batch in batches:
    docs, y_test = zip(*batch)
    x_test = sentence_to_index_pos(docs, vocab, max_length)
    batch_acc = model.get_accuracy(x_test, y_test)
    acc += batch_acc * len(x_test)
acc /= len(X_test)
print(acc)

In [None]:
# Spot-check the current model on four hand-written sentences, converted with
# the same max_length used for the CNN.
sample_sentences = [
    '너무재밓없었다그래서보는것을추천하지않는다...!',
    '생각보다꿀잼',
    '심오하면서잔잔하니감동적이다...',
    '앞부분 좀 졸림',
]
tester = sentence_to_index_pos(sample_sentences, vocab, max_length)
predictions = model.predict(tester)
print(predictions)