In [None]:
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
import pandas as pd
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from models import logistic_regression
from data_process import morphs_process, batch_iter
import tensorflow as tf
import numpy as np

In [None]:
# Read the tab-separated train/test files, then split each frame into its
# text column (`document`) and its target column (`label`).
train = pd.read_csv('./data/train-5T.txt', sep='\t')
test = pd.read_csv('./data/test-1T.txt', sep='\t')
X_train, Y_train = train['document'], train['label']
X_test, Y_test = test['document'], test['label']

In [None]:
# Pass both splits through the project's `morphs_process` helper (train first,
# then test), replacing the raw text in place.
# NOTE(review): presumably this tokenises every document into a list of
# morphemes -- confirm against data_process.morphs_process.
X_train, X_test = morphs_process(X_train), morphs_process(X_test)

In [None]:
# Wrap every processed training document in a TaggedDocument whose single tag
# is the document's position in X_train, rendered as a string.
tagged_data = [
    TaggedDocument(words=tokens, tags=[str(doc_idx)])
    for doc_idx, tokens in enumerate(X_train)
]

In [None]:
# Doc2Vec hyper-parameters. `vec_size`, `alpha` and `min_alpha` are read again
# further down (vector inference and classifier input size).
num_epochs_per_cycle, num_cycles = 5, 10
vec_size = 100
alpha, min_alpha = 1e-1, 1e-3

# dm=0 selects the PV-DBOW training algorithm according to the gensim docs.
dv_model = Doc2Vec(vector_size=vec_size, alpha=alpha, min_count=2, dm=0)
dv_model.build_vocab(tagged_data)

# Train in `num_cycles` rounds of `num_epochs_per_cycle` epochs each.
# NOTE(review): every round passes the same start_alpha/end_alpha, so the
# learning rate presumably restarts at `alpha` on each call instead of decaying
# once over all epochs -- confirm this restart schedule is intended.
for cycle in range(num_cycles):
    dv_model.train(
        tagged_data,
        total_examples=dv_model.corpus_count,
        epochs=num_epochs_per_cycle,
        start_alpha=alpha,
        end_alpha=min_alpha,
    )
    print('cycle:', '%02d' % (cycle+1))

In [None]:
# Training documents: look up the vector stored under each document's string
# tag (the same str(position) tags used when building `tagged_data`).
X_train_vector = [dv_model.docvecs[str(i)] for i in range(len(X_train))]

# Test documents carry no tag in the model, so a vector is inferred for each
# one from its tokens using the training learning-rate bounds.
X_test_vector = [
    dv_model.infer_vector(X_test[i], alpha=alpha, min_alpha=min_alpha, steps=5)
    for i in range(len(X_test))
]

In [None]:
# Pair every training vector with its label and hand the pairs to the
# project's `batch_iter`; the training loop iterates over the result and
# unzips each batch back into vectors and labels.
# NOTE(review): presumably yields batches of 64 pairs for 15 passes over the
# data -- confirm against data_process.batch_iter.
batches = batch_iter(
    [(doc_vec, label) for doc_vec, label in zip(X_train_vector, Y_train)],
    batch_size=64,
    num_epochs=15,
)

In [None]:
# Create a fresh TF1 graph/session (GPU memory growth enabled via
# `allow_growth`) and build the logistic-regression classifier on top of the
# Doc2Vec vectors, whose dimensionality is `vec_size`.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
tf.reset_default_graph()
sess = tf.Session(config=config)
model = logistic_regression(sess=sess, vocab_size=vec_size, lr=1e-2)

# Per-batch metrics collected since the last progress report.
train_acc = []
avgLoss = []


def _report_progress(step, losses, accuracies):
    """Print averaged train metrics next to a fresh evaluation on the test set.

    Args:
        step: Index of the most recent batch; used as the report label.
        losses: Per-batch training losses gathered since the previous report.
        accuracies: Per-batch training accuracies gathered since the previous report.

    Returns:
        Tuple ``(test_loss, test_acc)`` as returned by the model's
        ``get_loss`` / ``get_accuracy`` on the test vectors.
    """
    test_loss = model.get_loss(X_test_vector, Y_test)
    print('batch:', '%04d' % step, '\ntrain loss:', '%.5f' % np.mean(losses), '\ttest loss:', '%.5f' % test_loss)
    test_acc = model.get_accuracy(X_test_vector, Y_test)
    print('train accuracy:', '%.3f' % np.mean(accuracies), '\ttest accuracy:', '%.3f' % test_acc, '\n')
    return test_loss, test_acc


for step, batch in enumerate(batches):
    x_train, y_train = zip(*batch)
    # Accuracy is queried before the train call on the same batch.
    acc = model.get_accuracy(x_train, y_train)
    l, _ = model.train(x_train, y_train)
    train_acc.append(acc)
    avgLoss.append(l)
    if step % 100 == 0:
        test_loss, test_acc = _report_progress(step, avgLoss, train_acc)
        # Start a new averaging window after each report.
        avgLoss = []
        train_acc = []

# The loop reports only on multiples of 100, so any batches after the last such
# step would otherwise never be reported. A non-empty window means there are
# unreported batches (and that the loop ran, so `step` is defined); report them
# so the metrics of the final model are shown.
if avgLoss:
    test_loss, test_acc = _report_progress(step, avgLoss, train_acc)