In [1]:
import math
import pandas as pd
import numpy as np
import scipy as sp
import gensim.models.doc2vec as d2v
import multiprocessing as mp
import datetime as dt

from collections import OrderedDict
from random import shuffle

from sklearn.linear_model import LogisticRegression
from sklearn import model_selection
from sklearn.metrics import log_loss



### Import Data

In [2]:
# Load the two pre-computed artifacts in one pass: the questions table
# (presumably a DataFrame with token columns -- confirm against the cell that
# wrote it) and the tagged documents used to train Doc2Vec.
# NOTE: unpickling executes arbitrary code; only load pickles produced locally.
questions, tagged_doclist = map(
    pd.read_pickle,
    (
        './pickles.gi/questions_with_tokens_df.pkl',
        './pickles.gi/tagged_doclist.pkl',
    ),
)

### Train Model

In [11]:
# Train a PV-DM Doc2Vec model on the tagged documents and save it to disk.
#
# Reads:  tagged_doclist (loaded in the import-data cell)
# Writes: ./models.gi/one_iter.trained
# Leaves `dmm_model` / `train_model` (same object) in the notebook namespace.

# Work on a copy so the in-place shuffles below do not reorder the original
# sequence (presumably a list of tagged documents -- confirm).
doc_list = tagged_doclist[:]

# Parallelization: leave one core free for the rest of the system, but never
# go below one worker. On a single-core machine `cpu_count() - 1` is 0, which
# would hand gensim `workers=0`, i.e. no training threads at all.
cores = max(1, mp.cpu_count() - 1)
assert d2v.FAST_VERSION > -1, "Doc2Vec will run painfully slow otherwise"

# Build Model: PV-DM w/average
# (dm_mean is left at the library default -- confirm it averages rather than
# sums context vectors in the installed gensim version.)
dmm_model = d2v.Doc2Vec(dm=1, size=300, window=3, negative=5, hs=0, min_count=2, workers=cores)

# Alias kept so the training code below is independent of which model variant
# was built above.
train_model = dmm_model

# Build vocab table. The corpus is passed positionally because the keyword
# name of this argument differs between gensim releases.
train_model.build_vocab(doc_list)

# Train model with a manually managed learning-rate schedule: alpha is held
# constant inside each pass and lowered by `alpha_delta` between passes.
alpha, min_alpha, passes = (0.025, 0.001, 1)
alpha_delta = (alpha - min_alpha) / passes

print("START %s" % dt.datetime.now())

for epoch in range(passes):
    # shuffling gets best results
    shuffle(doc_list)

    print('Training epoch %s' % epoch)

    # Pin alpha == min_alpha so train() does not decay the rate on its own.
    train_model.alpha, train_model.min_alpha = alpha, alpha
    # NOTE(review): in older gensim releases a single train() call may itself
    # loop `model.iter` times (library default, not set above), so one "pass"
    # here may be more than one sweep over the data -- confirm for the
    # installed version. Newer releases instead require explicit
    # total_examples/epochs arguments.
    train_model.train(doc_list)

    print('completed pass %i at alpha %f' % (epoch + 1, alpha))
    print(str(dt.datetime.now()))

    alpha -= alpha_delta

train_model.save('./models.gi/one_iter.trained')

print("END %s" % str(dt.datetime.now()))

START 2017-05-02 19:46:25.703002
Training epoch 0
completed pass 1 at alpha 0.025000
2017-05-02 21:05:59.749771
END 2017-05-02 21:06:00.979732
