In [1]:
# ANUJA TAYAL
#
# Problem Statement:
# Question answering system for a text article
# 
# Method of Solution
# - The application is designed to answer questions on an artical specifically Wikipedia using Gensim module of Doc2vec.
# - The application is designed in form of a chatbot which greets with hi and you can answer as many questions 
#   till you thank him for his services.
# - The application scrapes the required article using Beautiful Soup, preprocesses it to remove punctuations 
#   and other irrelevant texts. The processed text then trains the model using Doc2vec which converts every sentence of article to its corresponding vector. 
# - When the application is asked a question most probably a fact based question, question is preprocessed and 
#   trained model calculates calculates the corresponding Doc2vec vector. 
# - And the application returns the most similar sentence from the model and also the similarity.  
#
# Future Improvement:
# - The Doc2vec model changes its output in every iteration 
# - The system works best for factoid questions
# - We can use biLSTM model to train the model to increase accuracy 
# - Application can be trained to output more accurately to give the required answer.
# - To increase the training set, we can use data from the embedded references.

In [17]:
#import required libraries
import nltk
from nltk.corpus import stopwords
import re 
from gensim.models import Word2Vec
from gensim.models.doc2vec import TaggedDocument
import random
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
#preprocessing text by removing links, removing anything except numbers,letters,-,. and remove extra white spaces
def preprocess(text):
#     text=re.sub(r'\[[0-9]*\]','',text) #remove links
    text=re.sub(r'[^a-zA-Z-0-9.]',' ',text) #remove anything except numbers,letters,-,.
    text=re.sub(r'\s+',' ',text)    #remove extra white spaces
    return text #return processed text

In [28]:
#scrape wikipedia page using Beautiful Soup, extract paragraphs and tokenize them to sentences
# response=urllib.request.urlopen('https://en.wikipedia.org/wiki/R2-D2')
# html=response.read()
# soup=BeautifulSoup(html,'lxml')
# para=soup.find_all('p') #extracting paragraphs
with open('document.txt') as f:
    text=f.read()
text=preprocess(text)
sent_list=nltk.sent_tokenize(text) #tokenize text to sentences
print(sent_list)

['Philosophy of Education is a label applied to the study of the purpose process nature and ideals of education.', 'It can be considered a branch of both philosophy and education.', 'Education can be defined as the teaching and learning of specific skills and the imparting of knowledge judgment and wisdom and is something broader than the societal institution of education we often speak of.', 'Many educationalists consider it a weak and woolly field too far removed from the practical applications of the real world to be useful.', 'But philosophers dating back to Plato and the Ancient Greeks have given the area much thought and emphasis and there is little doubt that their work has helped shape the practice of education over the millennia.', 'Plato is the earliest important educational thinker and education is an essential element in The Republic his most important work on philosophy and political theory written around 360 B.C.', '.', 'In it he advocates some rather extreme methods remo

In [7]:
#tokenize sentences to individual words of sentences and also remove stopwords
# words1=[nltk.word_tokenize(sent.lower()) for sent in sent_list]
# for i in range(len(words1)):  
#     words1[i] = [w for w in words1[i] if w not in stopwords.words('english')]
# word2vec=Word2Vec(words1,min_count=2)

# sent_scores={}
# # sent_list=nltk.sent_tokenize(text)
# for sent in sent_list:
#     for word in nltk.word_tokenize(sent.lower()):
#         if word in freq.keys():
#             if len(sent.split(' ')) < 30:
#                 if sent not in sent_scores.keys():
#                     sent_scores[sent] = freq[word]
#                 else:
#                     sent_scores[sent] += freq[word]



In [16]:
lemmatizer = nltk.stem.WordNetLemmatizer()
 
def LemmatizeWords(words):
    return [lemmatizer.lemmatize(word) for word in words]
 
remove_punctuation= dict((ord(punctuation), None) for punctuation in string.punctuation)
 
def RemovePunctuations(text):
    return LemmatizeWords(nltk.word_tokenize(text.lower().translate(remove_punctuation)))

In [11]:
#greet the user
greeting_input_texts = ("hi","hey", "heys", "hello", "morning", "evening","greetings",)
greeting_replie_texts = ["hey", "hey hows you?", "*nods*", "hello there", "ello", "Welcome, how are you"]
 
def reply_greeting(text):
    for word in text.split():
        if word.lower() in greeting_input_texts:
            return random.choice(greeting_replie_texts)

In [29]:
#give reply function that takes user input and returns required most similar sentence and the similarity number
def give_reply(user_input):
    chatbot_response=''
    user_input=preprocess(user_input)
    sent_list.append(user_input)
    print(sent_list)
#     user_input=preprocess(user_input) #preprocess user input
#     user_sent=nltk.word_tokenize(user_input.lower()) #tokenize user sentence
    word_vectors = TfidfVectorizer(tokenizer=RemovePunctuations, stop_words='english')
    vectorized_words = word_vectors.fit_transform(sent_list)
#     print(vectorized_words)
    similarity_values = cosine_similarity(vectorized_words[-1], vectorized_words)
    similar_sentence_number =similarity_values.argsort()[0][-2]
    matched_vector=similarity_values[0][similar_sentence_number]
    sent_list.remove(user_input)
    print(matched_vector)
    if(matched_vector ==0):
        chatbot_response=chatbot_response+"I am sorry! I don't understand you"
        return chatbot_response
    else:
        chatbot_response = chatbot_response +sent_list[similar_sentence_number]
        return chatbot_response

In [30]:
#sample 
output_greetings=["thanks","thank you very much","thank you","bye"]
print("Hello, I will answer your queries regarding R2-D2:") #greet user
continue_discussion=True
while continue_discussion==True:
    user_input = input() #get user input
    user_input = user_input .lower()
    if user_input in output_greetings or user_input =='':
            continue_discussion=False
            print("Chatbot: Most welcome")
    elif reply_greeting(user_input)!=None:
                print("Chatbot: "+reply_greeting(user_input))
    else:
        print("Chatbot: ",end="")
        print(give_reply(user_input))
        
print("Chatbot: Take care, bye ..")

Hello, I will answer your queries regarding R2-D2:
hey
Chatbot: hey
What do you understand by the term ‘Perennialism’, in the context of the given comprehension passage?
Chatbot: ['Philosophy of Education is a label applied to the study of the purpose process nature and ideals of education.', 'It can be considered a branch of both philosophy and education.', 'Education can be defined as the teaching and learning of specific skills and the imparting of knowledge judgment and wisdom and is something broader than the societal institution of education we often speak of.', 'Many educationalists consider it a weak and woolly field too far removed from the practical applications of the real world to be useful.', 'But philosophers dating back to Plato and the Ancient Greeks have given the area much thought and emphasis and there is little doubt that their work has helped shape the practice of education over the millennia.', 'Plato is the earliest important educational thinker and education is 

KeyboardInterrupt: 

In [33]:
import tensorflow.contrib as seq2seq
from tensorflow.contrib.seq2seq.models import SimpleSeq2Seq

model = SimpleSeq2Seq(input_dim=5, hidden_dim=10, output_length=8, output_dim=8)
model.compile(loss='mse', optimizer='rmsprop')

ModuleNotFoundError: No module named 'tensorflow.contrib'

In [None]:
print(model.summary())