In [1]:
# ANUJA TAYAL
#
# Problem Statement:
# Question answering system for a text article
# 
# Method of Solution
# - The application is designed to answer questions on an article, specifically a Wikipedia article, using Gensim's Doc2vec module.
# - The application is designed in the form of a chatbot which greets you, and you can ask as many questions 
#   as you like until you thank it for its services.
# - The application scrapes the required article using Beautiful Soup, preprocesses it to remove punctuations 
#   and other irrelevant texts. The processed text then trains the model using Doc2vec which converts every sentence of article to its corresponding vector. 
# - When the application is asked a question, most probably a fact-based question, the question is preprocessed and 
#   the trained model calculates the corresponding Doc2vec vector. 
# - And the application returns the most similar sentence from the model and also the similarity.  
#
# Future Improvement:
# - The Doc2vec model changes its output in every iteration 
# - The system works best for factoid questions
# - We can use biLSTM model to train the model to increase accuracy 
# - Application can be trained to output more accurately to give the required answer.
# - To increase the training set, we can use data from the embedded references.

In [2]:
#import required libraries
import random
import re
import string
import urllib.request

import nltk
from bs4 import BeautifulSoup
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from nltk.corpus import stopwords

In [3]:
#clean raw text: drop bracketed numeric markers, blank out disallowed characters, squeeze whitespace
def preprocess(text):
    """Return ``text`` cleaned up for sentence/word tokenization.

    The substitutions are applied in order:

    1. bracketed digit runs such as ``[12]`` (and an empty ``[]``) are removed;
    2. every character that is not an ASCII letter, a digit, ``-`` or ``.``
       is replaced by a single space;
    3. each run of whitespace is collapsed to one space.

    Leading/trailing spaces are not stripped.
    """
    substitutions = (
        (r'\[[0-9]*\]', ''),        # bracketed numeric markers
        (r'[^a-zA-Z-0-9.]', ' '),   # anything except letters, digits, '-', '.'
        (r'\s+', ' '),              # runs of whitespace
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

In [4]:
#scrape wikipedia page using Beautiful Soup, extract paragraphs and tokenize them to sentences
# Read the page inside a `with` block so the HTTP response is closed as soon as
# the body has been read, even if read() raises.
with urllib.request.urlopen('https://en.wikipedia.org/wiki/R2-D2') as response:
    html=response.read()
soup=BeautifulSoup(html,'lxml')
para=soup.find_all('p') #extracting paragraphs
# Concatenate all paragraph texts in one pass (no separator, as before) instead
# of growing a string with repeated `+=`.
text="".join(p.text for p in para)
text=preprocess(text)
# `sent_list` holds the cleaned sentences; give_reply() indexes into it to
# return the sentence matching a document tag.
sent_list=nltk.sent_tokenize(text) #tokenize text to sentences

In [5]:
#tokenize sentences to individual words of sentences and also remove stopwords
# Load the stopword list once into a set: the original re-evaluated
# stopwords.words('english') for every single token and did a linear list
# lookup each time.
english_stopwords=set(stopwords.words('english'))
# words1[k] is the lower-cased, stopword-free token list of sent_list[k].
words1=[
    [w for w in nltk.word_tokenize(sent.lower()) if w not in english_stopwords]
    for sent in sent_list
]

In [6]:
#use Doc2vec to create a vector for every sentence of the article
# Tag each token list with its position in words1 so that a tag returned by a
# similarity query can be used directly as an index.
documents = [
    TaggedDocument(tokens, [sent_id])
    for sent_id, tokens in enumerate(words1)
]
# Model hyper-parameters; min_count=1 is passed so no word is dropped for
# being rare.
model = Doc2Vec(
    vector_size=100,
    window=10,
    min_count=1,
    workers=4,
)
model.build_vocab(documents)

In [7]:
#train the model with 100 epochs
# A single train() call is used on purpose. The previous version called
# train() inside a 100-iteration loop with epochs=model.epochs, so the corpus
# was passed over (100 * model.epochs) times rather than 100, and it
# hand-edited model.alpha / model.min_alpha on every iteration, leaving those
# attributes overwritten on the trained model. Per the gensim documentation,
# train() performs `epochs` passes itself and manages the learning-rate decay
# across them.
max_epochs=100
model.train(documents,total_examples=model.corpus_count,epochs=max_epochs)

In [8]:
#greet the user
# Lower-case words that are treated as a greeting when found in the input.
greeting_input_texts = ("hi","hey", "heys", "hello", "morning", "evening","greetings",)
# Candidate replies; one is chosen at random for each recognised greeting.
greeting_replie_texts = ["hey", "hey hows you?", "*nods*", "hello there", "ello", "Welcome, how are you"]


def reply_greeting(text):
    """Return a random greeting reply if ``text`` contains a greeting word.

    ``text`` is split on whitespace; each word is stripped of surrounding
    punctuation and lower-cased before being compared against
    ``greeting_input_texts``, so inputs such as ``"Hello!"`` or ``"hi,"`` are
    recognised.

    Returns:
        A string picked from ``greeting_replie_texts``, or ``None`` when no
        word of ``text`` is a greeting (including when ``text`` is empty).
    """
    for word in text.split():
        # Strip leading/trailing punctuation only; inner characters (e.g. the
        # hyphen in "r2-d2") are left alone.
        if word.strip(string.punctuation).lower() in greeting_input_texts:
            return random.choice(greeting_replie_texts)
    return None

In [9]:
#give reply function that takes user input and returns required most similar sentence and the similarity number
def give_reply(user_input):
    """Return the article sentence most similar to ``user_input``.

    The input is cleaned with ``preprocess``, lower-cased and word-tokenized,
    a vector is inferred for it from the trained ``model``, and the single
    closest document vector is looked up.

    Args:
        user_input: the user's question as a string.

    Returns:
        A ``(sentence, similarity_percent)`` tuple: the matching entry of
        ``sent_list`` and the similarity score multiplied by 100.
    """
    user_input=preprocess(user_input) #preprocess user input
    user_sent=nltk.word_tokenize(user_input.lower()) #tokenize user sentence
    input_vector=model.infer_vector(user_sent) #infer the vector from the trained model
    # gensim 4 exposes the document vectors as `model.dv`; `model.docvecs` is
    # the older name, kept here as a fallback for gensim 3.x.
    doc_vectors=model.dv if hasattr(model,'dv') else model.docvecs
    # Only the best match is used, so ask for one result instead of ranking
    # every document.
    sent_index,similarity=doc_vectors.most_similar([input_vector], topn=1)[0]
    return (sent_list[sent_index],similarity*100)

In [10]:
#sample chat session: keeps answering until the user thanks the bot, says bye, or sends an empty line
# Inputs (after stripping and lower-casing) that end the conversation.
output_greetings=["thanks","thank you very much","thank you","bye"]
print("Hello, I will answer your queries regarding R2-D2:") #greet user
continue_discussion=True
while continue_discussion:
    # Strip surrounding whitespace so " bye " or a blank line of spaces is
    # handled the same as "bye" / an empty line.
    user_input=input().strip().lower() #get user input
    if user_input in output_greetings or user_input=='':
        continue_discussion=False
        print("Chatbot: Most welcome")
    else:
        # Call reply_greeting() once: it picks a random reply, so calling it
        # again for printing would draw a second, unrelated choice.
        greeting_reply=reply_greeting(user_input)
        if greeting_reply is not None:
            print("Chatbot: "+greeting_reply)
        else:
            print("Chatbot: ",end="")
            print(give_reply(user_input))
print("Chatbot: Take care, bye ..")

Hello, I will answer your queries regarding R2-D2:
hey
Chatbot: *nods*
When was R2-D2 inducted into the Robot Hall of Fame?
Chatbot: ('R2-D2 was inducted into the Robot Hall of Fame in 2003.', 49.33385252952576)
bye
Chatbot: Most welcome
Chatbot: Take care, bye ..
