In [20]:
import math
import pandas as pd
import numpy as np
import scipy as sp
import gensim.models.doc2vec as d2v
import multiprocessing as mp
import datetime as dt

from collections import OrderedDict
from random import shuffle

from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

from sklearn.linear_model import LogisticRegression
from sklearn import model_selection
from sklearn.metrics import log_loss

### Parallelization

In [21]:
# Leave two cores free for the rest of the system, but never ask for fewer
# than one worker: on machines with two or fewer cores the plain subtraction
# would yield 0 or a negative worker count.
cores = max(1, mp.cpu_count() - 2)
# Fail early if gensim reports no optimized training routines.
assert d2v.FAST_VERSION > -1, "Doc2Vec will run painfully slow otherwise"

### Import Data

In [18]:
# Import training data into dataframe
# NOTE(review): unpickling can execute arbitrary code; only load pickles that
# were produced by this project.
questions = pd.read_pickle('./pickles.gi/all_questions_df.pkl')
# Pass the axis by keyword: the positional form drop('tokens', 1) is
# deprecated in newer pandas and rejected by pandas 2.x.
questions = questions.drop('tokens', axis=1)
# Cache the slimmed-down frame (without the 'tokens' column) for later reuse.
questions.to_pickle('./pickles.gi/questions.pkl')

In [28]:
# Preprocess text
def preprocessText(questions):
    """Lower-case, tokenize and stop-word-filter a collection of questions.

    Args:
        questions: Iterable of raw question values (for example a pandas
            Series). Non-string values are converted with ``str`` before
            tokenizing; missing values (``None`` or a float NaN) produce an
            empty token list instead of the literal tokens "none"/"nan".

    Returns:
        list: One list of tokens per input question, in input order. Tokens
        are runs of ``\\w`` characters from the lower-cased text with English
        stop words removed.
    """
    tokenizer = RegexpTokenizer(r'\w+')
    stopset = set(stopwords.words('english'))

    def _tokens(question):
        # str(None) / str(nan) would otherwise survive as a bogus token.
        # `question != question` is only true for NaN.
        if question is None or (isinstance(question, float) and question != question):
            return []
        return [token
                for token in tokenizer.tokenize(str(question).lower())
                if token not in stopset]

    return [_tokens(question) for question in questions]

In [29]:
# Tokenize every question, attach the result as a new 'token_list' column on
# a copy of the questions frame, cache that frame on disk and preview it.
token_lists = preprocessText(questions['question'])
preprocessed_tokens = questions.assign(token_list=token_lists)
preprocessed_tokens.to_pickle('./pickles.gi/preprocessed_questions.pkl')
preprocessed_tokens.head()

Unnamed: 0,test,pid,qid,question,token_list
0,0,0,1,What is the step by step guide to invest in sh...,"[step, step, guide, invest, share, market, india]"
1,0,0,2,What is the step by step guide to invest in sh...,"[step, step, guide, invest, share, market]"
2,0,1,3,What is the story of Kohinoor (Koh-i-Noor) Dia...,"[story, kohinoor, koh, noor, diamond]"
3,0,1,4,What would happen if the Indian government sto...,"[would, happen, indian, government, stole, koh..."
4,0,2,5,How can I increase the speed of my internet co...,"[increase, speed, internet, connection, using,..."


In [None]:
# Lemmatize every token of every question.
# `lemmatizer` and `filtered` only ever existed as locals inside
# preprocessText, so create the lemmatizer here and read the token lists from
# the dataframe. Each entry is a list of tokens, hence the nested
# comprehension.
lemmatizer = WordNetLemmatizer()
lemmatized = [
    [lemmatizer.lemmatize(token) for token in tokens]
    for tokens in preprocessed_tokens['token_list']
]

### Preprocess Data

In [None]:
# Get tagged documents
# itertuples() yields namedtuples, so columns are read as attributes
# (row['test'] raises TypeError) rather than by fragile positions.
# TaggedDocument expects `tags` to be a list; each document is tagged with its
# question id as a string, which is the form the final lookup cell ('1') uses.
# NOTE(review): assumes `qid` identifies a question consistently across the
# train (test == 0) and test (test == 1) rows - confirm against the data.
train_docs = [d2v.TaggedDocument(row.token_list, [str(row.qid)])
              for row in preprocessed_tokens.itertuples() if row.test == 0]
test_docs = [d2v.TaggedDocument(row.token_list, [str(row.qid)])
             for row in preprocessed_tokens.itertuples() if row.test == 1]
tagged_docs = train_docs + test_docs
doc_list = tagged_docs[:]  # for reshuffling per pass

### Build DBOW Model

In [None]:
# Build Model: PV-DBOW (dm=0) trained with negative sampling (5 noise words,
# hierarchical softmax disabled), 100-dimensional vectors.
# The class is reached through the `d2v` module alias, the only gensim name
# imported by this notebook. `vector_size` is the current name of the
# dimensionality argument (older gensim releases called it `size`).
dbow_model = d2v.Doc2Vec(dm=0, vector_size=100, negative=5, hs=0, min_count=2, workers=cores)

# Build vocab table
# The corpus is passed positionally because the keyword name of this argument
# differs between gensim releases.
dbow_model.build_vocab(doc_list)

### Train Model

In [None]:
# Train model
# Manual learning-rate schedule: alpha starts at 0.025 and is lowered by a
# fixed step after each of the `passes` passes over the shuffled corpus.
alpha, min_alpha, passes = (0.025, 0.001, 20)
alpha_delta = (alpha - min_alpha) / passes

train_model = dbow_model

print("START %s" % dt.datetime.now())

for epoch in range(passes):
    # shuffling gets best results
    shuffle(doc_list)

    print('Training epoch %s' % epoch)

    # train
    # Pin start and end learning rate to the same value so the rate stays
    # constant within this pass; the decay is applied between passes below.
    train_model.alpha, train_model.min_alpha = alpha, alpha
    # gensim >= 2.0 requires the corpus size and the epoch count explicitly;
    # one epoch per call keeps "one loop iteration == one pass".
    train_model.train(doc_list, total_examples=len(doc_list), epochs=1)

    print('completed pass %i at alpha %f' % (epoch + 1, alpha))
    print(str(dt.datetime.now()))

    alpha -= alpha_delta

train_model.save('./models.gi/dbow_model.trained')

print("END %s" % str(dt.datetime.now()))

In [None]:
# Inspect the learned vector for the document tagged '1'.
# Query the document vectors explicitly: indexing the model itself can
# resolve '1' to the word vector of the token "1" when that token is in the
# vocabulary.
train_model.docvecs['1']