In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from data_process import build_vocab, batch_iter, sentence_to_onehot, cal_idf, sentence_to_tfidf
from models import logistic_regression

In [None]:
# Both splits are tab-separated text files; the first row supplies the column names.
train = pd.read_csv('./data/train-5T.txt', sep='\t')
test = pd.read_csv('./data/test-1T.txt', sep='\t')

In [None]:
# Pull the raw text column and its label column out of each split.
X_train, Y_train = train['document'], train['label']
X_test, Y_test = test['document'], test['label']

In [None]:
# Build the vocabulary from the training documents only.
# NOTE(review): max_vocab is presumably the maximum number of vocabulary
# entries kept -- confirm against data_process.build_vocab.
max_vocab = 50000
# The second value returned by build_vocab is not used in this notebook.
vocab, _, vocab_size = build_vocab(X_train, max_vocab)

# Sentiment analysis with logistic regression using one-hot encoding

In [None]:
# Pair every training document with its label, then ask batch_iter for
# batches of 64 pairs over 15 epochs.
train_pairs = list(zip(X_train, Y_train))
batches = batch_iter(train_pairs, batch_size=64, num_epochs=15)

In [None]:
# Train a logistic-regression classifier on one-hot encoded sentences and
# report train/test loss and accuracy every 100 batches, plus once more after
# the final batch.
# NOTE(review): `batches` is created in the previous cell; if batch_iter
# returns a one-shot iterator, re-run that cell before re-running this one.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True  # grow GPU memory on demand instead of pre-allocating it all
tf.reset_default_graph()  # discard any graph left over from a previous run of this notebook
# The session is not closed here so `model` stays usable after this cell;
# call sess.close() once the model is no longer needed.
sess = tf.Session(config=config)
model = logistic_regression(sess=sess, vocab_size=vocab_size, lr=1e-1)
train_acc = []
avgLoss = []
x_test = sentence_to_onehot(X_test, vocab)  # encoded once, reused by every report


def report_onehot(step, losses, accuracies):
    """Print mean train loss/accuracy for the current window next to test metrics.

    Reads this cell's `model`, `x_test` and `Y_test`.
    Returns the (test_loss, test_acc) pair that was printed.
    """
    test_loss = model.get_loss(x_test, Y_test)
    print('batch:', '%04d' % step, '\ntrain loss:', '%.5f' % np.mean(losses), '\ttest loss:', '%.5f' % test_loss)
    test_acc = model.get_accuracy(x_test, Y_test)
    print('train accuracy:', '%.3f' % np.mean(accuracies), '\ttest accuracy:', '%.3f' % test_acc, '\n')
    return test_loss, test_acc


for step, batch in enumerate(batches):
    x_train, y_train = zip(*batch)
    x_train = sentence_to_onehot(x_train, vocab)
    acc = model.get_accuracy(x_train, y_train)  # measured before this batch's training step
    loss, _ = model.train(x_train, y_train)
    train_acc.append(acc)
    avgLoss.append(loss)
    if step % 100 == 0:
        test_loss, test_acc = report_onehot(step, avgLoss, train_acc)
        # Start a fresh averaging window for the next 100 batches.
        avgLoss = []
        train_acc = []

# Batches after the last multiple of 100 would otherwise never be reported,
# leaving the fully trained model unevaluated. The guard skips the report when
# the last batch was already reported or when `batches` yielded nothing.
if avgLoss:
    test_loss, test_acc = report_onehot(step, avgLoss, train_acc)

# Sentiment analysis with logistic regression using TF-IDF encoding

In [None]:
# Computed from the training documents and the vocabulary; the result is passed
# to sentence_to_tfidf for both the training batches and the test set.
# NOTE(review): presumably per-term inverse document frequencies -- confirm
# against data_process.cal_idf.
IDF = cal_idf(X_train, vocab)

In [None]:
# Fresh batch stream for the TF-IDF experiment: batches of 64
# (document, label) pairs over 15 epochs.
train_pairs = list(zip(X_train, Y_train))
batches = batch_iter(train_pairs, batch_size=64, num_epochs=15)

In [None]:
# Train a logistic-regression classifier on TF-IDF encoded sentences and
# report train/test loss and accuracy every 100 batches, plus once more after
# the final batch.
# NOTE(review): `batches` is created in the previous cell; if batch_iter
# returns a one-shot iterator, re-run that cell before re-running this one.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True  # grow GPU memory on demand instead of pre-allocating it all
tf.reset_default_graph()  # discard any graph left over from a previous run of this notebook
# The session is not closed here so `model` stays usable after this cell;
# call sess.close() once the model is no longer needed.
sess = tf.Session(config=config)
model = logistic_regression(sess=sess, vocab_size=vocab_size, lr=1e-1)
train_acc = []
avgLoss = []
x_test = sentence_to_tfidf(X_test, vocab, IDF)  # encoded once, reused by every report


def report_tfidf(step, losses, accuracies):
    """Print mean train loss/accuracy for the current window next to test metrics.

    Reads this cell's `model`, `x_test` and `Y_test`.
    Returns the (test_loss, test_acc) pair that was printed.
    """
    test_loss = model.get_loss(x_test, Y_test)
    print('batch:', '%04d' % step, '\ntrain loss:', '%.5f' % np.mean(losses), '\ttest loss:', '%.5f' % test_loss)
    test_acc = model.get_accuracy(x_test, Y_test)
    print('train accuracy:', '%.3f' % np.mean(accuracies), '\ttest accuracy:', '%.3f' % test_acc, '\n')
    return test_loss, test_acc


for step, batch in enumerate(batches):
    x_train, y_train = zip(*batch)
    x_train = sentence_to_tfidf(x_train, vocab, IDF)
    acc = model.get_accuracy(x_train, y_train)  # measured before this batch's training step
    loss, _ = model.train(x_train, y_train)
    train_acc.append(acc)
    avgLoss.append(loss)
    if step % 100 == 0:
        test_loss, test_acc = report_tfidf(step, avgLoss, train_acc)
        # Start a fresh averaging window for the next 100 batches.
        avgLoss = []
        train_acc = []

# Batches after the last multiple of 100 would otherwise never be reported,
# leaving the fully trained model unevaluated. The guard skips the report when
# the last batch was already reported or when `batches` yielded nothing.
if avgLoss:
    test_loss, test_acc = report_tfidf(step, avgLoss, train_acc)