<h1> Doc2vec Y/N classification </h1>

In [40]:
# gensim modules
from __future__ import division
from gensim import utils
from gensim.models.doc2vec import LabeledSentence, TaggedDocument
from gensim.models import Doc2Vec

# numpy
import numpy as np
import re
# random
from random import shuffle

# classifier
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.cross_validation import train_test_split

%pylab inline
pylab.rcParams['figure.figsize'] = (12.0, 6.0) # set size of figures"
plt.rcParams.update({'font.size': 24})

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy


<h1> Get the data </h1>

In [10]:
import psycopg2
conn = psycopg2.connect("host=localhost dbname=qa user=attiladobi")
cur = conn.cursor()
#connect to db and find open ended and yes/no question
cur.execute("SELECT question from qa WHERE questiontype  = 'yes/no';")
Q_yn=cur.fetchall()
cur.execute("SELECT question from qa WHERE questiontype  = 'open-ended';")
Q_oe=cur.fetchall()

<h1>clean sentences and create tagged docs for training </h1>

In [11]:
def process_line(sentence):
    '''Splits sentence if punctuation is identified. Returns a list of list of words for each sentence'''
    sentences=re.split(r"(?<![0-9])[.?!;](?![0-9])", sentence)
    result= [re.findall("[a-z'.-0-9]+", sent.lower()) for sent in sentences if \
            re.findall("[a-z'.-0-9]+", sent.lower())!=[]]
    if result==[]:
        result=[['']]
    return result
stoplist = set('number for a an of or the and to in rt'.split())

<h3> two catigories.... yn (yes no) and oe (open ended). Split 50/50 for training and testing</h3>

In [13]:
qs_yn= [[word for word in process_line(sentence[0])[0] if word not in stoplist] for sentence in Q_yn]

qs_oe= [[word for word in process_line(sentence[0])[0] if word not in stoplist] for sentence in Q_oe]

In [99]:
data=qs_yn+qs_oe
labels=np.hstack((ones(len(qs_yn)),zeros(len(qs_oe)))) #Label yn=1 and oe=0

X_train, X_test, y_train, y_test = train_test_split(data,labels, test_size=0.5, random_state=0)

<h3> build docs and lebel the two categories </h3>

In [103]:
docs=[]
label_MAP={1:'YN_',0:'OE_'}

for i,(words,label) in enumerate(zip(X_train, y_train)):
    docs.append(TaggedDocument(words,[label_MAP[label]+str(i)]))
    
##probably a more elegent solution. but it gets the job done... each doc is tagged with "Label_index"

#label='OE_'
#for i,words in enumerate(qs_oe):
#    docs.append(TaggedDocument(words,[label+str(i)]))

<h1> train a doc2vec model </h1>

In [105]:
model = Doc2Vec(min_count=2, window=20, size=100, sample=1e-4, negative=2, workers=4)
#adjust the window sizze to match the typical number of words per doc
model.build_vocab(docs)
model.train(docs)

17454901

In [106]:
docs[:10]

[TaggedDocument(words=['what', 'is', 'length', 'cord'], tags=['OE_0']),
 TaggedDocument(words=['i', 'am', 'size', '0', 'every', 'pair', 'leggings', 'i', 'get', 'sag', 'behind', 'knee', 'under', 'my', 'butt', 'will', 'these', 'sag'], tags=['YN_1']),
 TaggedDocument(words=['this', 'product', 'was', '9', 'two', 'days', 'ago'], tags=['OE_2']),
 TaggedDocument(words=['are', 'two', 'loop', 'handles', 'any', 'advantage', 'over', 'one', 'long', 'handle'], tags=['OE_3']),
 TaggedDocument(words=['what', 'is', 'duameter', 'base'], tags=['OE_4']),
 TaggedDocument(words=['i', 'have', "'", 'ford', 'f-', '0.'], tags=['OE_5']),
 TaggedDocument(words=['what', 'type', 'seal', 'does', 'this', 'box', 'have'], tags=['OE_6']),
 TaggedDocument(words=['what', 'percentage', 'hy', 'peroxide'], tags=['OE_7']),
 TaggedDocument(words=['anti-aging', 'property', 'is', "denon's", 'akdl', 'dedicated', 'link', 'cable', 'right', 'me'], tags=['OE_8']),
 TaggedDocument(words=['this', 'product', 'is', 'supposed', 'have', '

<h3> save or load </h3>

In [107]:
#model.save('/home/ubuntu/TallLabs/models/Rmodel_Doc2vec_cell_asin_title')
#model=Doc2Vec.load('/home/ubuntu/TallLabs/models/Rmodel_Doc2vec_cell')

<h1> before doing classification... it might be easier to see which vecrots are the most similar </h1>

In [112]:
v=model.infer_vector('what color is it'.lower().split()) #other params: ,alpha=0,steps=1 (steps is the learning rate)
model.docvecs.most_similar([v],topn=10)
#you can also find the most similar onces with a given tag: model.docvecs.most_similar('YN_1')

[('OE_567923', 0.7573308944702148),
 ('OE_60314', 0.746245265007019),
 ('OE_610076', 0.7441090941429138),
 ('OE_557712', 0.7434462308883667),
 ('OE_414245', 0.7371196150779724),
 ('OE_491108', 0.7297719717025757),
 ('OE_371811', 0.7229807376861572),
 ('OE_312769', 0.7211353778839111),
 ('OE_619637', 0.7203280925750732),
 ('OE_12538', 0.7190041542053223)]

<h3> ... turns out the vast majoirty of the similar documents are OE (OE). I could just run a quick test with the teting set </h3>

<h1> Train logistic regresion classifer </h1>

In [123]:
#setup training for Y/N questions. we have to convert words to doc vector
train_arrays=[]

for i,label in enumerate(y_train):
    train_arrays.append(model.docvecs[label_MAP[label]+str(i)])

##can access each one with the index using: model.docvecs[i] and tags with odel.docvecs.doctags

In [125]:
classifier = LogisticRegression()
classifier.fit(train_arrays, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

<h1> Test (will have to convert the array of words into a vector)</h1>

In [126]:
test_arrays=[]

for words in X_test:
    test_arrays.append(model.infer_vector(words,alpha=0))

In [127]:
classifier.score(test_arrays, y_test)

0.51501874565409067

<h3> lol this sucks, might do better with only a few key words or a smaller window :)</h3> 

In [48]:
X_test[0]

['does', 'it', 'fit', 'satellite', 'p', 't-a']