In [16]:
import numpy as np
import pandas as pd
import gensim
from gensim.models.doc2vec import TaggedDocument
from collections import namedtuple
from gensim.models.doc2vec import LabeledSentence
import nltk
from nltk.corpus import stopwords
import sys
sys.path.append('/research/edubot/repo/edubot/utils/')
import utils

# Doc2Vec model

In this exercise we will create a Doc2Vec model based on our questions-and-answers dataset, and use it to find existing questions and answers that are similar to new questions.

In [7]:
class LabeledLineSentence(object):
    """Stream a text file as tagged documents, one document per line.

    Each line is split on whitespace and tagged ``SENT_<n>``, where ``<n>`` is
    the zero-based line number. The file is re-opened on every iteration, so
    the same instance can be iterated more than once.

    Parameters
    ----------
    filename : str
        File name relative to ``utils.data_path + 'doc2vec/'``.
    """

    def __init__(self, filename):
        self.filename = filename

    def __iter__(self):
        # `with` guarantees the handle is released once iteration finishes or
        # the generator is closed; the bare open() used before leaked one
        # handle per pass over the corpus.
        with open(utils.data_path + 'doc2vec/' + self.filename, 'r') as handle:
            for uid, line in enumerate(handle):
                # TaggedDocument is the supported name for the deprecated
                # LabeledSentence alias and carries the same words/tags fields.
                yield TaggedDocument(words=line.split(), tags=['SENT_%s' % uid])

In [8]:
# Wrap questions.txt in the streaming iterable defined above; the constructor
# only stores the file name, so nothing is read until the object is iterated.
question_sentense = LabeledLineSentence('questions.txt')
type(question_sentense)

__main__.LabeledLineSentence

# Train the model

In [115]:
# Create an untrained Doc2Vec model. alpha and min_alpha are set to the same
# value so the learning rate does not decay inside a single training pass.
# NOTE(review): `size=` is the keyword used by the gensim release this notebook
# was run with; newer releases appear to call it `vector_size` — confirm before upgrading.
model = gensim.models.Doc2Vec(size=100, window=10, min_count=5, workers=5,alpha=0.025, min_alpha=0.025) # use fixed learning rate

In [None]:
# Persist the model under utils.data_path + 'doc2vec/'.
# NOTE(review): the build_vocab / train cells further down are commented out,
# so running the notebook top to bottom would save an untrained model here —
# confirm the intended cell order.
model.save(utils.data_path+'doc2vec/doc2vec.model')

In [11]:
# Reload the saved model from disk, replacing the in-memory `model`.
model = gensim.models.Doc2Vec.load(utils.data_path+'doc2vec/doc2vec.model')
type(model)

gensim.models.doc2vec.Doc2Vec

In [None]:
#model.build_vocab(question_sentense)

In [None]:
#for epoch in range(10):
#    model.train(question_sentense)
#    model.alpha -= 0.002  # decrease the learning rate
#    model.min_alpha = model.alpha  # fix the learning rate, no decay

# Test the model on existing questions/words

Let's look at the sentences most similar to the one labeled SENT_0 (the first question):

In [13]:
# Document tags whose vectors are closest to the first question ('SENT_0'),
# printed as (tag, similarity) pairs.
print (model.docvecs.most_similar('SENT_0'))

[('SENT_1', 0.988574206829071), ('SENT_10406', 0.43401357531547546), ('SENT_19487', 0.3859889507293701), ('SENT_6306', 0.3819279074668884), ('SENT_5421', 0.3793574571609497), ('SENT_5426', 0.3752645254135132), ('SENT_5425', 0.3752085268497467), ('SENT_4804', 0.36838746070861816), ('SENT_201', 0.36440780758857727), ('SENT_20527', 0.35341769456863403)]


Now let's try with an existing question in the form of a sentence:

In [14]:
# Passing a list of tokens queries the *word* vectors: the tokens are used
# together as positive examples, and the result is a list of similar words,
# not similar sentences.
# NOTE(review): a token missing from the vocabulary presumably raises KeyError — confirm.
print (model.most_similar('how is the exam going to be graded'.split()))

[('why', 0.4441051185131073), ("I'll", 0.42746639251708984), ('Any', 0.4150758385658264), ('Hello!My', 0.4144468605518341), ('Anyone', 0.4095126688480377), ('yeh....would', 0.40084922313690186), ('please.', 0.3923271596431732), ('i', 0.38553935289382935), ('skewed', 0.37819305062294006), ('my', 0.3708469569683075)]


Let's try with a common question word:

In [15]:
# Single-word query against the word vectors.
print (model.most_similar('how'))

[('what', 0.5978665351867676), ('How', 0.5137122869491577), ('why', 0.49313122034072876), ('whether', 0.4372948408126831), ('if', 0.4111274480819702), ('15,531.00.The', 0.38539761304855347), ('boring.*I', 0.38303035497665405), ("'not", 0.3646056652069092), (':)<<FULLNAME>>', 0.3630005717277527), ('this,', 0.3546842336654663)]


The `most_similar` function only works for questions whose words are all in our vocabulary. It will not work for a newly created question that contains unseen words, so we'll have to try a different approach.

# WM distance

We will use the Word Mover's (WM) distance measurement to find similarities between completely new questions, ones that may contain words that are not in our trained vocabulary, for example:

In [21]:
# Two sentences with related meaning but no content words in common.
# Judging by the printed tokens, utils.tokenize_only lowercases the text and
# drops stop words — TODO confirm against the utils module.
sentence_obama = utils.tokenize_only('Obama speaks to the media in Illinois')
sentence_president = utils.tokenize_only('The president greets the press in Chicago')
print(sentence_obama)
print(sentence_president)

['obama', 'speaks', 'media', 'illinois']
['president', 'greets', 'press', 'chicago']


In [22]:
# Word Mover's Distance between the two token lists (smaller means closer).
model.wmdistance(sentence_obama, sentence_president)

2.4614712543673516

We will see if the distance improves with stemmed sentences:

In [26]:
# Same two sentences, this time tokenized and stemmed.
norm_obama = utils.tokenize_and_stem('Obama speaks to the media in Illinois')
norm_president = utils.tokenize_and_stem('The president greets the press in Chicago')

In [27]:
# Inspect the stemmed tokens (the output shows e.g. 'illinois' reduced to 'illinoi').
norm_obama

['obama', 'speak', 'media', 'illinoi']

In [28]:
# Distance between the stemmed token lists, for comparison with the unstemmed result.
model.wmdistance(norm_obama, norm_president)

2.261510692152023

We can see that the distance improves. The only issue is that in the process of stemming we could be losing valuable information: stemming can map words like _what_, _why_ and _where_ to the same root, or lose the difference between _big_ and _biggest_, which can change our related answers.

In [29]:
def get_min_wm_distance(model, new_text, old_texts):
    """Find the existing text closest to ``new_text`` by WM distance.

    Both the new text and every candidate are tokenized with
    ``utils.tokenize_only`` before ``model.wmdistance`` is called.

    Returns a ``(distance, text)`` tuple for the closest candidate; the first
    one wins on ties. If ``old_texts`` is empty, or no candidate scores below
    infinity, the result is ``(float('inf'), '')``.
    """
    query_tokens = utils.tokenize_only(new_text)
    best = (float('inf'), '')
    for candidate in old_texts:
        score = model.wmdistance(query_tokens, utils.tokenize_only(candidate))
        # Only a strictly smaller distance replaces the current best.
        if not score < best[0]:
            continue
        best = (score, candidate)
    return best

def get_min_wm_distance_list(model, new_text, old_texts):
    """Score every existing text against ``new_text`` by WM distance.

    Both sides are tokenized with ``utils.tokenize_only``. Returns a list of
    ``(distance, text)`` tuples sorted by ascending distance; texts with equal
    distance keep their input order.
    """
    query_tokens = utils.tokenize_only(new_text)
    scored = []
    for candidate in old_texts:
        candidate_tokens = utils.tokenize_only(candidate)
        scored.append((model.wmdistance(query_tokens, candidate_tokens), candidate))
    # Sort on the distance only, so ties are never broken by comparing texts.
    scored.sort(key=lambda pair: pair[0])
    return scored

In [32]:
# Read every stored question into memory, one list entry per line (trailing
# newlines are kept). The `with` block closes the handle even if reading fails;
# the previous explicit close() was skipped whenever readlines() raised.
with open(utils.data_path+'doc2vec/questions.txt', 'r') as file:
    questions = file.readlines()
questions[0]

'Hi, I am <<FULLNAME>> <<FULLNAME>>.  I am a retired medical worker living in East Tennessee near the Smokey Mountains.  I have an abiding interest in all things spiritual and psychological.  I am also an InterFaith minister, that never really practiced that profession except among my friends.  A close friend is also doing this course for her social worker credits which is how I found out about it.  I like doing online courses where the participants share their experiences in an online group like this.  Looking forward to getting to know you all. Love and blessings,  CRose\n'

Let's see which is the most similar question to _how is the exam going to be graded_:

In [33]:
# Closest stored question to the query, returned as a (distance, text) tuple.
result = get_min_wm_distance(model, 'how is the exam going to be graded', questions)
result

(1.3410407172882863,
 'actually i got lost with the way this course is going on,and i want to know how am going to be graded.\n')

Now let's take a look at the 3 most similar questions:

In [34]:
# Score every stored question against the query (sorted by ascending distance)
# and show the three closest.
distances = get_min_wm_distance_list(model, 'how is the exam going to be graded', questions)
distances[:3]

[(1.3410407172882863,
  'actually i got lost with the way this course is going on,and i want to know how am going to be graded.\n'),
 (1.4537153441390562, 'major echo going on and talking over each other.\n'),
 (1.4816835585840877, 'when are we going to start with week 3?\n')]

Now let's look at several questions ranging from course subjects to general ones.

In [27]:
# Course-subject query; note that `result` from the earlier cell is overwritten.
result = get_min_wm_distance(model, 'I am looking for English Grammar resources', questions)
result

(0.8805966358289719, 'Is the grammar important for English writing?\n')

In [28]:
# General query that is not about the course itself.
print(get_min_wm_distance(model, 'What date is it today?', questions))

(1.1093645755827917, 'What is the end date of this course?\n')


In [29]:
# Query about the course contents.
print(get_min_wm_distance(model, 'What topics are covered in this course', questions))

(1.157503132306017, 'Is this Course work?\n')


In [30]:
# Query about the number of assignments.
print(get_min_wm_distance(model, 'how many assignments does this course have?', questions))

(1.1739545934280706, 'i am forma spa in and this is my first course\n')


In [31]:
# Query about the course duration (the double space in the query is in the original input).
print(get_min_wm_distance(model, 'how  long is this course?', questions))

(0.7431855046505927, 'i am forma spa in and this is my first course\n')


In [35]:
# Social query rather than a question about course material.
print(get_min_wm_distance(model, 'Anyone want to form a study group', questions))

(1.0471347683698653, 'Does anyone have a citation for the study that showed humou is important?\n')


In [33]:
# Rephrasing of the earlier grading question, to compare which match comes back.
print(get_min_wm_distance(model, 'How is this course graded?', questions))

(1.0817537307739258, 'i am forma spa in and this is my first course\n')


# Finding the answers to the above questions

In [35]:
# Load the question/answer table through the project helper. Per the output
# below it reads 31 files and reports progress for each one.
qa_all = utils.get_qa_df()

reading file 1 of 31 
reading file 2 of 31 
reading file 3 of 31 
reading file 4 of 31 
reading file 5 of 31 
reading file 6 of 31 
reading file 7 of 31 
reading file 8 of 31 
reading file 9 of 31 
reading file 10 of 31 
reading file 11 of 31 
reading file 12 of 31 
reading file 13 of 31 
reading file 14 of 31 
reading file 15 of 31 
reading file 16 of 31 
reading file 17 of 31 
reading file 18 of 31 
reading file 19 of 31 
reading file 20 of 31 
reading file 21 of 31 
reading file 22 of 31 
reading file 23 of 31 
reading file 24 of 31 
reading file 25 of 31 
reading file 26 of 31 
reading file 27 of 31 
reading file 28 of 31 
reading file 29 of 31 
reading file 30 of 31 
reading file 31 of 31 
42754 question threads in the data
23838 answer threads in the data


  all_qa = all_qa.sort(['_id.$oid_q', 'comment_thread_id.$oid'])


In [36]:
# Preview the first rows; question columns end in _q and answer columns in _a.
qa_all.head()

Unnamed: 0,title_q,body_q,author_id_q,course_id_q,_id.$oid_q,parent_id.$oid,title_a,body_a,author_id_a,course_id_a,_id.$oid_a,comment_thread_id.$oid
52504,Hi from Spain,I'm pharmacist and I want to do this course to...,588608143,BerkeleyX/GG101x/1T2014,5408bf5b1df0baf15900000c,,,,,,,
52503,Hello,I have just registered for the course and am a...,570428032,BerkeleyX/GG101x/1T2014,5408c2f11df0bac5fb0000ce,,,,,,,
52501,Introduction,"Hi, I am <<FULLNAME>> <<FULLNAME>>. I am a re...",691432080,BerkeleyX/GG101x/1T2014,5408d3c89bb11c6fb5000019,,,"Cheryl Rose, I am living in Knoxville. I am a ...",704734742.0,BerkeleyX/GG101x/1T2014,540c6c949bb11c4b480000f1,5408d3c89bb11c6fb5000019
52502,Introduction,"Hi, I am <<FULLNAME>> <<FULLNAME>>. I am a re...",691432080,BerkeleyX/GG101x/1T2014,5408d3c89bb11c6fb5000019,,,Hello from Toronto,3439105.0,BerkeleyX/GG101x/1T2014,5408f8991df0babbb1000016,5408d3c89bb11c6fb5000019
52499,Greetings from the Northwest U.S.,Hello! I live in a small town near the U.S./Ca...,85357777,BerkeleyX/GG101x/1T2014,5408d89f1df0baaff0000015,,,"Greetings, Sister Northwesterner! I live over ...",538343067.0,BerkeleyX/GG101x/1T2014,5409ce0b9bb11ccfc2000007,5408d89f1df0baaff0000015


In [41]:
# Answers recorded for the stored question that matched the grammar query.
# The comparison is an exact string match on body_q.
qa_all.loc[qa_all['body_q'] == 'Is the grammar important for English writing?', 'body_a'].values

array([ 'The following link may help you to understand the importance of grammar.\n\nhttp://grammar.about.com/od/grammarfaq/f/grammarvalue.htm'], dtype=object)