In [1]:
# ANUJA TAYAL
#
# Problem Statement:
# Question answering system for a text article
# 
# Method of Solution
# - The application is designed to answer questions about an article (specifically a Wikipedia article) using Gensim's Doc2vec module.
# - The application takes the form of a chatbot that greets you; you can ask as many questions as you like
#   until you thank it for its services.
# - The application reads the article, preprocesses it to remove punctuations and other irrelevant texts. 
#   The processed text then trains the model using Doc2vec which converts every sentence of article to its corresponding vector. 
# - When the application is asked a question (most likely a fact-based one), the question is preprocessed and
#   the trained model infers the corresponding Doc2vec vector.
# - The application then returns the two sentences of the article most similar to that vector.

In [2]:
#import required libraries
import nltk
from nltk.corpus import stopwords
import re 
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
import random

In [3]:
#preprocessing text: keep only letters, digits, '-' and '.', then collapse runs of whitespace
def preprocess(text):
    """Return `text` with disallowed characters blanked out and whitespace collapsed.

    Every character other than an ASCII letter, a digit, '-' or '.' is replaced
    by a space; each run of whitespace is then squeezed into a single space.
    Leading and trailing spaces are not stripped.
    """
    blanked = re.sub(r'[^a-zA-Z-0-9.]', ' ', text)  # blank out everything outside the allowed set
    return re.sub(r'\s+', ' ', blanked)  # squeeze whitespace runs into one space

In [4]:
#Read the article, process it and tokenize it into sentences
with open('document.txt') as f:
    text=f.read()

text=preprocess(text)
# Drop bare '.' fragments right here. The tokenized sentences built from
# sent_list skip those fragments, and their positions are used as Doc2Vec tags
# that are later looked up in sent_list; keeping the fragments in sent_list
# would shift every lookup after the first skipped fragment.
sent_list=[sent for sent in nltk.sent_tokenize(text) if sent!='.'] #tokenize text to sentences

In [5]:
#tokenize sentences to individual lower-cased words and also remove stopwords
# Fetch the stopword list a single time and keep it as a set: evaluating
# stopwords.words('english') for every token repeats the same lookup, and
# membership tests on a list are linear in its length.
english_stopwords=set(stopwords.words('english'))
# NOTE(review): positions in words1 become the Doc2Vec tags and are later used
# to index sent_list, so the two only line up if no sentence is skipped here.
words1=[]
for sent in sent_list:
    if sent!='.':   #skip fragments that consist of a full stop only
        tokens=nltk.word_tokenize(sent.lower())
        words1.append([w for w in tokens if w not in english_stopwords])

In [6]:
#set up Doc2vec: tag every tokenized sentence and build the model vocabulary
# Each sentence is tagged with its position in words1.
documents = [
    TaggedDocument(words=tokens, tags=[position])
    for position, tokens in enumerate(words1)
]
model = Doc2Vec(
    vector_size=100,
    window=10,
    min_count=1,
    workers=4,
)
model.build_vocab(documents)  # the vocabulary has to exist before training starts

In [7]:
#train the model for 100 epochs
# train() is called exactly once and is told how many passes to make; gensim
# manages the learning-rate decay across those passes itself.
# The previous loop called train() 100 times with epochs=model.epochs (so far
# more than 100 passes in total) while overwriting alpha and min_alpha by hand,
# which left both values permanently lowered on the model afterwards.
max_epochs=100
model.train(documents,total_examples=model.corpus_count,epochs=max_epochs)

In [8]:
#greet the user
greeting_input_texts = ("hi","hey", "heys", "hello", "morning", "evening","greetings",)
greeting_replie_texts = ["hey", "hey hows you?", "*nods*", "hello there", "ello", "Welcome, how are you"]

def reply_greeting(text):
    """Return a random greeting reply if `text` contains a greeting word.

    The text is lower-cased and split into word tokens, so punctuation attached
    to a word ("Hi!", "hello,") does not prevent a match.

    Returns one entry of greeting_replie_texts, or None when no word of `text`
    is listed in greeting_input_texts.
    """
    # \w+ extracts the words without surrounding punctuation
    for word in re.findall(r'\w+', text.lower()):
        if word in greeting_input_texts:
            return random.choice(greeting_replie_texts)
    return None

In [9]:
#give reply function that takes user input and returns the two most similar sentences of the article
def give_reply(user_input):
    """Return the article sentences most similar to `user_input`.

    The input is cleaned with preprocess(), lower-cased and tokenized; a vector
    is inferred for it from the trained Doc2Vec model, and the tags of the
    closest document vectors are used as indices into sent_list.

    Returns a tuple of sentences, best match first: two sentences normally,
    fewer only if the model holds fewer than two documents.
    """
    user_input=preprocess(user_input) #preprocess user input
    user_sent=nltk.word_tokenize(user_input.lower()) #tokenize user sentence
    input_vector=model.infer_vector(user_sent) #infer the vector from the trained model
    # gensim 4 renamed `docvecs` to `dv`; use whichever attribute this version provides
    doc_vectors=model.dv if hasattr(model,'dv') else model.docvecs
    # only the two best matches are used, so there is no need to rank every document
    sims=doc_vectors.most_similar([input_vector], topn=2)
    # each entry of sims is (tag, similarity); the tag is the sentence index
    return tuple(sent_list[tag] for tag,_ in sims)

In [11]:
#sample chat session: runs until the user thanks the bot, says bye or sends an empty line
output_greetings=["thanks","thank you very much","thank you","bye"]
print("Hello, I am a chatbot and will answer your queries regarding the article:") #greet user
continue_discussion=True
while continue_discussion:
    # strip surrounding whitespace so that e.g. "thanks " still ends the chat;
    # a whitespace-only line is treated like an empty one
    user_input = input().strip().lower() #get user input
    if user_input in output_greetings or user_input =='':
        continue_discussion=False
        print("Chatbot: Most welcome")
        continue
    # call reply_greeting only once: every call makes a fresh random pick
    greeting_reply = reply_greeting(user_input)
    if greeting_reply is not None:
        print("Chatbot: "+greeting_reply)
    else:
        print("Chatbot: ",end="")
        print(give_reply(user_input))
print("Chatbot: Take care, bye ..")

Hello, I am a chatbot and will answer your queries regarding the article:
What is the difference between the approaches of Socrates and Aristotle?
Chatbot: ('Plato believed that talent and intelligence are not distributed genetically and thus is be found in children born to all classes although his proposed system of selective public education for an educated minority of the population does not really follow a democratic model.', 'Aristotle considered human nature habit and reason to be equally important forces to be cultivated in education the ultimate aim of which should be to produce good and virtuous citizens.')
Why do educationists consider philosophy a ‘weak and woolly’ field?
Chatbot: ('Many educationalists consider it a weak and woolly field too far removed from the practical applications of the real world to be useful.', 'Philosophy of Education is a label applied to the study of the purpose process nature and ideals of education.')
What do you understand by the term ‘Perennia