In [1]:
# gensim modules
from __future__ import division
from gensim import utils
from gensim.models.doc2vec import LabeledSentence, TaggedDocument
from gensim.models import Doc2Vec

# numpy
import numpy as np
import re
# random
from random import shuffle

# classifier
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

In [2]:
import psycopg2
conn = psycopg2.connect("host=localhost port=5432 dbname=amazon user=postgres password=darkmatter")
cur = conn.cursor()
#connect to db and find q/a
cur.execute("SELECT question from qa;")
Qresults=cur.fetchall()
cur.execute("SELECT answer from qa;")
Aresults=cur.fetchall()

In [3]:
cur.execute("SELECT question from qa WHERE questiontype  = 'yes/no';")
Q_yn=cur.fetchall()
cur.execute("SELECT question from qa WHERE questiontype  = 'open-ended';")
Q_oe=cur.fetchall()

In [7]:
def process_line(sentence):
    #step 1 split if we need to
    sentences=re.split(r'[;:!?.-]\s*', sentence)
    result= [re.findall("[a-z']+", sent.lower()) for sent in sentences if \
            re.findall("[a-z']+", sent.lower())!=[]]
    if result==[]:
        result=['']
    return result
stoplist = set('for a of the and to in rt'.split())

<h1> Clean up the sentence </h1>

In [8]:
qs_yn= [[word for word in process_line(sentence[0])[0] if word not in stoplist] for sentence in Q_yn]
qs_yn_sample=qs_yn[:int(len(qs_yn)/2)]
qs_oe= [[word for word in process_line(sentence[0])[0] if word not in stoplist] for sentence in Q_oe]
qs_oe_sample=qs_oe[:int(len(qs_oe)/2)]

#here we only extract the first sentence for training. Only use 1/2 of the data

<h1> build labeled docs </h1>

In [15]:
labeledSent = [LabeledSentence(words= word[:5], tags=['YN_'+str(ii)]) for ii,word in enumerate(qs_yn_sample)]+\
    [LabeledSentence(words= word[:5], tags=['OE_'+str(ii)]) for ii,word in enumerate(qs_oe_sample)]
    
#only using first 4 words

<h1> Train </h1>

In [16]:
model = Doc2Vec(min_count=4, window=10, size=100, sample=1e-4, negative=5, workers=4)
model.build_vocab(labeledSent)
model.train(labeledSent)

9799914

<h1> Setup Training </h1>

In [17]:
all_yn_arrays = np.zeros((len(qs_yn_sample), 100))
all_oe_arrays = np.zeros((len(qs_oe_sample), 100))
all_yn_labels = np.zeros(len(qs_yn_sample))
all_oe_labels = np.zeros(len(qs_oe_sample))

#setup training for Y/N questions
for i in range(len(qs_yn_sample)):
    all_yn_arrays[i] = model.docvecs[i]
    all_yn_labels[i] = 1

#setup training for open-ended questions
for ii in range(len(qs_oe_sample)):
    all_oe_arrays[ii] = model.docvecs[i+ii]
    all_oe_labels[ii] = 0

<h1> Setup Testing </h1>

In [18]:
Nyn=len(all_yn_arrays)
Noe=len(all_oe_arrays)

train_arrays = np.vstack((all_yn_arrays[:Nyn/2],all_oe_arrays[:Noe/2]))
train_labels = np.hstack((all_yn_labels[:Nyn/2],all_oe_labels[:Noe/2]))

test_arrays = np.vstack((all_yn_arrays[Nyn/2:],all_oe_arrays[Noe/2:]))
test_labels = np.hstack((all_yn_labels[Nyn/2:],all_oe_labels[Noe/2:]))

<h1> logistic regresion </h1>

In [19]:
classifier = LogisticRegression()
classifier.fit(train_arrays, train_labels)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [20]:
classifier.score(test_arrays, test_labels)

0.62981293082684342

<h1> Random Forest </h1>

In [62]:
classifier =RandomForestClassifier()
classifier.fit(train_arrays, train_labels)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [63]:
classifier.score(test_arrays, test_labels)

0.61770335271220578

<h1> Test on data not used in traning </h1>

In [89]:
import warnings
warnings.filterwarnings('ignore')
prediction=[classifier.predict(model.infer_vector(line_arr)) for line_arr in qs_yn[-2000:]]

In [90]:
sum(prediction)/len(prediction)

array([ 0.5915])

In [91]:
prediction_oe=[classifier.predict(model.infer_vector(line_arr)) for line_arr in qs_oe[-2000:]]

In [92]:
sum([val==0 for val in prediction_oe])/len(prediction_oe)

array([ 0.4695])

<h1> user input </h1>

In [41]:
lin_arr="do I need to".split()
classifier.predict(model.infer_vector(lin_arr))

array([ 0.])