In [26]:
import math
import pandas as pd
import numpy as np
import scipy as sp
import gensim.models.doc2vec as d2v
import multiprocessing as mp
import datetime as dt

from collections import OrderedDict
from random import shuffle
from nltk.tokenize import RegexpTokenizer
from nltk.stem import SnowballStemmer
from sklearn.linear_model import LogisticRegression
from sklearn import model_selection
from sklearn.metrics import log_loss

### Parallelization

In [33]:
# Leave two cores free for the rest of the system, but never go below one
# worker: on a machine with two or fewer cores the plain subtraction would
# hand the Doc2Vec constructors a zero or negative `workers` value.
cores = max(1, mp.cpu_count() - 2)
# Stop right away unless gensim reports a non-negative FAST_VERSION
# (presumably -1 means its optimized routines are unavailable -- TODO confirm).
assert -1 < d2v.FAST_VERSION, "Doc2Vec will run painfully slow otherwise"

### Import Data

In [6]:
# Read the raw train and test CSV files into dataframes
TRAIN = pd.read_csv(filepath_or_buffer='../data.gi/train.csv')
TEST = pd.read_csv(filepath_or_buffer='../data.gi/test.csv')

### Preprocessing

In [10]:
# Keep only the pair id, both question ids and the duplicate label as a
# lookup table, and persist it to disk as a pickle
train_lookup_df = TRAIN.loc[:, ['id', 'qid1', 'qid2', 'is_duplicate']]
train_lookup_df.to_pickle('./pickles.gi/train_lookup_df.pkl')

In [11]:
# Split the training pairs into one frame per question slot, giving both the
# same column names (pid = the pair's id, qid = the question's id)
train_q1_df = TRAIN[['id', 'qid1', 'question1']].rename(
    columns={'id': 'pid', 'qid1': 'qid', 'question1': 'question'})
train_q2_df = TRAIN[['id', 'qid2', 'question2']].rename(
    columns={'id': 'pid', 'qid2': 'qid', 'question2': 'question'})

# Stack both frames, order by pair id then question id, and renumber the index
train_questions_df = pd.concat([train_q1_df, train_q2_df], ignore_index=True)
train_questions_df = train_questions_df.sort_values(by=['pid', 'qid'])
train_questions_df = train_questions_df.reset_index(drop=True)

In [12]:
# Mark every training question row with test=0
values = [0 for _ in range(len(train_questions_df.index))]
train_questions_df = train_questions_df.assign(test=values)

In [13]:
# Number the test questions: question1 gets the odd ids 1, 3, 5, ...
# and question2 gets the even ids 2, 4, 6, ...
n_test_pairs = len(TEST.index)
odd_range = pd.Series(range(1, 2 * n_test_pairs + 1, 2))
even_range = pd.Series(range(2, 2 * n_test_pairs + 1, 2))
TEST = TEST.assign(qid1=odd_range, qid2=even_range)

In [14]:
# Split the test pairs into one frame per question slot, giving both the
# same column names (pid = the pair's test_id, qid = the question's id)
test_q1_df = TEST[['test_id', 'qid1', 'question1']].rename(
    columns={'test_id': 'pid', 'qid1': 'qid', 'question1': 'question'})
test_q2_df = TEST[['test_id', 'qid2', 'question2']].rename(
    columns={'test_id': 'pid', 'qid2': 'qid', 'question2': 'question'})

# Stack both frames, order by pair id then question id, and renumber the index
test_questions_df = pd.concat([test_q1_df, test_q2_df], ignore_index=True)
test_questions_df = test_questions_df.sort_values(by=['pid', 'qid'])
test_questions_df = test_questions_df.reset_index(drop=True)

In [15]:
# Mark every test question row with test=1
values = [1 for _ in range(len(test_questions_df.index))]
test_questions_df = test_questions_df.assign(test=values)

In [16]:
# Stack the train and test questions, ordered by test flag, then pair id,
# then question id, with a fresh 0..n-1 index
combined_questions_df = pd.concat([train_questions_df, test_questions_df], ignore_index=True)
combined_questions_df = combined_questions_df.sort_values(by=['test', 'pid', 'qid'])
combined_questions_df = combined_questions_df.reset_index(drop=True)

# Rotate the trailing column (the test flag) to the front
cols = list(combined_questions_df.columns)
cols = [cols[-1]] + cols[:-1]
combined_questions_df = combined_questions_df[cols]

In [17]:
# Parse to string, force lowercasing, tokenize, filter out stopwords, and stem
def preprocessText(questions):
    """Convert raw question texts into lists of stemmed tokens.

    Each question is converted to a lowercase string, split into runs of
    word characters, stripped of a small fixed stopword set, and stemmed
    with NLTK's English Snowball stemmer.

    Missing values (None or float NaN) produce an empty token list.
    Passing them through ``str()`` would otherwise inject the spurious
    token "nan" into the corpus.

    Args:
        questions: iterable of question texts (e.g. a dataframe column).

    Returns:
        A list with one entry per input question, in input order; each
        entry is the list of stemmed, non-stopword tokens of that question.
    """
    tokenizer = RegexpTokenizer(r'\w+')
    stemmer = SnowballStemmer('english')
    stopwords = set('for a of the and to in'.split(' '))

    stemmed = []
    for question in questions:
        # A missing question has no text; do not let str(nan) become a word.
        is_missing = question is None or (
            isinstance(question, float) and math.isnan(question))
        text = '' if is_missing else str(question).lower()
        # Stopwords are removed before stemming, so one filter is enough.
        stemmed.append([stemmer.stem(token)
                        for token in tokenizer.tokenize(text)
                        if token not in stopwords])
    return stemmed

# TODO: Research lemmatization

In [21]:
# Tokenize every question, attach the result as a `tokens` column,
# and persist the enriched frame to disk as a pickle
question_tokens = preprocessText(combined_questions_df['question'])
all_questions_df = combined_questions_df.assign(tokens=question_tokens)
all_questions_df.to_pickle('./pickles.gi/all_questions_df.pkl')

In [27]:
#all_questions_df = pd.read_pickle('./pickles.gi/all_questions_df.pkl')

In [28]:
# Wrap each question's token list in a TaggedDocument tagged with
# [test flag, qid]
# NOTE(review): the test qids assigned earlier start at 1, so they may
# coincide with training qids and with the 0/1 flag values used as the
# first tag -- confirm that sharing these tags is intended.
tagged_docs = [
    d2v.TaggedDocument(words=row.tokens, tags=[row.test, row.qid])
    for row in all_questions_df.itertuples()
]

# Partition by the test flag stored in the first tag
train_docs = [doc for doc in tagged_docs if doc.tags[0] == 0]
test_docs = [doc for doc in tagged_docs if doc.tags[0] == 1]

# Shallow copy, for reshuffling per pass
doc_list = list(tagged_docs)

### Build Doc2Vec Model Vocab

In [39]:
print("START %s" % str(dt.datetime.now()))

# Model configured with dm=1 and dm_concat=1; only the vocabulary is built
# here, and the model is saved in that state.
dmc_model = d2v.Doc2Vec(
    dm=1,
    dm_concat=1,
    size=100,
    window=5,
    negative=5,
    hs=0,
    min_count=2,
    workers=cores,
)
dmc_model.build_vocab(tagged_docs)
dmc_model.save('./models.gi/dmc_model.build_vocab')

print("END %s" % dt.datetime.now())

START 2017-04-25 13:11:55.605340
END 2017-04-25 13:14:58.076102


In [41]:
print("START %s" % str(dt.datetime.now()))

# Model configured with dm=0; only the vocabulary is built here, and the
# model is saved in that state.
dbow_model = d2v.Doc2Vec(
    dm=0,
    size=100,
    negative=5,
    hs=0,
    min_count=2,
    workers=cores,
)
dbow_model.build_vocab(tagged_docs)
dbow_model.save('./models.gi/dbow_model.build_vocab')

print("END %s" % dt.datetime.now())

START 2017-04-25 13:18:44.512104
END 2017-04-25 13:24:38.579115


In [42]:
print("START %s" % str(dt.datetime.now()))

# Model configured with dm=1 and dm_mean=1 over a window of 10; only the
# vocabulary is built here, and the model is saved in that state.
dmm_model = d2v.Doc2Vec(
    dm=1,
    dm_mean=1,
    size=100,
    window=10,
    negative=5,
    hs=0,
    min_count=2,
    workers=cores,
)
dmm_model.build_vocab(tagged_docs)
dmm_model.save('./models.gi/dmm_model.build_vocab')

print("END %s" % dt.datetime.now())

START 2017-04-25 13:30:00.605678
END 2017-04-25 13:43:42.225713
