In [None]:
# Mounting Google Drive so the dataset stored on it can be read from
# /content/drive. force_remount=True remounts even if it is already mounted.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Importing required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import nltk
import random
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Fetch the NLTK resources used by this notebook: the Punkt tokenizer models
# (for nltk.sent_tokenize / nltk.word_tokenize) and the WordNet corpus (for
# the WordNetLemmatizer). Newer NLTK releases load the tokenizer from
# 'punkt_tab' rather than 'punkt', so both are requested.
for nltk_resource in ('punkt', 'punkt_tab', 'wordnet'):
    nltk.download(nltk_resource)



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
# Opening the different sets of data.
# Each file is tab-separated and decoded as ISO-8859-1; the three frames are
# loaded in S08, S09, S10 order.
_QA_PAIR_FILES = (
    '/content/drive/MyDrive/Question_Answer_Dataset_v1.2/S08/question_answer_pairs.txt',
    '/content/drive/MyDrive/Question_Answer_Dataset_v1.2/S09/question_answer_pairs.txt',
    '/content/drive/MyDrive/Question_Answer_Dataset_v1.2/S10/question_answer_pairs.txt',
)
s08, s09, s10 = [
    pd.read_csv(qa_path, sep='\t', encoding = 'ISO-8859-1')
    for qa_path in _QA_PAIR_FILES
]

In [None]:
# Making working copies to manipulate while keeping the originals.
# .copy() is required here: plain assignment would only bind a second name to
# the same DataFrame, so any in-place change would also alter the original.
s08_new = s08.copy()
s09_new = s09.copy()
s10_new = s10.copy()

In [None]:
# Dropping columns which we will not use and removing any NAN values then resetting the index for each set of data
_UNUSED_COLUMNS = ['DifficultyFromQuestioner', 'DifficultyFromAnswerer', 'ArticleTitle']

def _drop_unused_and_clean(frame):
    """Return `frame` without the unused columns and without rows containing NaN, re-indexed from 0."""
    return (
        frame
        .drop(columns = _UNUSED_COLUMNS)
        .dropna()
        .reset_index(drop=True)
    )

s08_new = _drop_unused_and_clean(s08_new)
s09_new = _drop_unused_and_clean(s09_new)
s10_new = _drop_unused_and_clean(s10_new)

In [None]:
# Replacing each ArticleFile path with the text of the article it points to,
# then replacing every \n in the frame with a space.

def _attach_article_text(frame, dataset_dir):
    """Return a copy of `frame` whose 'ArticleFile' column holds article text.

    Parameters:
        frame: DataFrame with an 'ArticleFile' column of paths relative to the
            dataset directory, without the '.txt.clean' suffix.
        dataset_dir: dataset sub-directory name ('S08', 'S09' or 'S10').

    Returns:
        A new DataFrame; newlines in all string cells are replaced by a space.

    Raises:
        OSError: if one of the referenced article files cannot be opened.
    """
    # Many question rows reference the same article, so read each file once.
    article_text = {}
    for article_file in frame['ArticleFile'].unique():
        article_path = ('/content/drive/MyDrive/Question_Answer_Dataset_v1.2/'
                        + dataset_dir + '/' + str(article_file) + '.txt.clean')
        # `with` closes the handle even when reading fails.
        with open(article_path, encoding = 'ISO-8859-1') as article:
            article_text[article_file] = article.read()

    # Build the column in one assignment instead of the chained
    # frame['ArticleFile'].loc[i] = ... form, which may write to a temporary
    # object (SettingWithCopyWarning) and leave the frame unchanged.
    with_text = frame.assign(ArticleFile=frame['ArticleFile'].map(article_text))
    return with_text.replace('\n',' ', regex=True)

s08_new = _attach_article_text(s08_new, 'S08')
s09_new = _attach_article_text(s09_new, 'S09')
s10_new = _attach_article_text(s10_new, 'S10')

In [None]:
# Splitting the questions into train and validation sets

# Hold out 10% of the S08 rows for validation. The fixed random_state makes a
# rerun select the same rows; the rows that were not sampled form the train set.
s08_val = s08_new.sample(frac=0.1, replace=False, axis=0, ignore_index=False, random_state=42)
s08_train = s08_new.drop(index=s08_val.index)

In [None]:
# Combining all Articles into one variable

# (dataset directory, number of setN sub-directories); every set directory is
# read as articles a1 .. a10.
_ARTICLE_SETS = (('S08', 4), ('S09', 5), ('S10', 6))

article_texts = []
for dataset_dir, set_count in _ARTICLE_SETS:
  for set_number in range(1, set_count + 1):
    for article_number in range(1, 11):
      article_path = ('/content/drive/MyDrive/Question_Answer_Dataset_v1.2/' + dataset_dir
                      + '/data/set' + str(set_number) + '/a' + str(article_number) + '.txt.clean')
      # `with` closes each handle; errors='ignore' drops undecodable bytes.
      with open(article_path, 'r', errors = 'ignore') as open_data:
        article_texts.append(open_data.read().lower())

# Join once at the end instead of growing the string inside the loop.
Combined_data = "".join(article_texts)

In [None]:
# Tokenizing the Data into sentences and words

# sent_token: list of sentences; word_token: list of word tokens. Both are
# produced from the same combined text, sentences first.
sent_token, word_token = [
    tokenize(Combined_data)
    for tokenize in (nltk.sent_tokenize, nltk.word_tokenize)
]

In [None]:
# Pre-processing the data (lemmatizing)

# Shared WordNet lemmatizer instance used by LemTokens.
lemmer = nltk.stem.WordNetLemmatizer()

def LemTokens(tokens):
  """Return a list with the lemmatized form of each token, in input order."""
  lemmas = []
  for tok in tokens:
    lemmas.append(lemmer.lemmatize(tok))
  return lemmas
# Translation table for str.translate: maps every ASCII punctuation code point
# to None, which deletes the character.
remove_punct_dict = {ord(punct): None for punct in string.punctuation}

def LemNormalize(text):
  """Lower-case `text`, strip punctuation, word-tokenize it and lemmatize the tokens."""
  without_punct = text.lower().translate(remove_punct_dict)
  words = nltk.word_tokenize(without_punct)
  return LemTokens(words)

In [None]:
# Creating some greeting inputs

# Lower-case phrases that are recognised as a greeting from the user.
greet_input = (
    "hi",
    "hello",
    "hey",
    "greetings",
    "hi there",
    "whats up",
)

# Replies the bot picks from when it has been greeted.
greet_response = (
    "hi",
    "hi there",
    "hello there",
    "Nice to meet you",
    "hey",
    "Hey I'm 788 Chatbot",
)

In [None]:
# Check if user typed a greeting

def greeting(user_input):
  """Return a random greeting reply if `user_input` contains a greeting.

  The input is lower-cased and stripped of punctuation, then every entry of
  `greet_input` is looked for as a whole word or whole phrase. This lets
  multi-word entries ("whats up") and punctuated input ("hi!", "what's up?")
  match, which a word-by-word membership test cannot do.

  Parameters:
    user_input: the text typed by the user.

  Returns:
    A random element of `greet_response`, or None when no greeting is found.
  """
  cleaned = user_input.lower().translate(str.maketrans('', '', string.punctuation))
  # Collapse whitespace and pad with spaces so that a substring test only
  # succeeds on word boundaries ("hi" must not match inside "this").
  padded = ' ' + ' '.join(cleaned.split()) + ' '
  for phrase in greet_input:
    if ' ' + phrase + ' ' in padded:
      return random.choice(greet_response)
  return None

In [None]:
# Making the chatbot respond to questions

def response(user_response):
  """Return the corpus sentence most similar to `user_response`.

  The query and the sentences of the module-level `sent_token` list are
  vectorised together with TF-IDF (tokenised by `LemNormalize`, English stop
  words removed) and ranked by cosine similarity to the query.

  Parameters:
    user_response: the user's query text.

  Returns:
    The best-matching sentence, or "I don't understand" when no sentence has a
    positive similarity to the query.
  """
  # Some callers append the query to sent_token before calling; in that case
  # use the list as is. Otherwise build a corpus with the query added last,
  # without modifying sent_token.
  if sent_token and sent_token[-1] == user_response:
    corpus = sent_token
  else:
    corpus = sent_token + [user_response]

  # NOTE(review): the built-in 'english' stop list is not passed through
  # LemNormalize; this appears to be what triggers the stop-word inconsistency
  # warning visible in the saved cell output.
  TfidfVec = TfidfVectorizer(tokenizer=LemNormalize, stop_words='english')
  tfidf = TfidfVec.fit_transform(corpus)

  # Similarity of the query (last row) against every row, itself included.
  vals = cosine_similarity(tfidf[-1], tfidf).flatten()
  # Mask out the query itself so the maximum is the best *other* sentence.
  vals[-1] = -1.0
  idx = int(vals.argmax())

  if vals[idx] <= 0:
    return "I don't understand"
  return corpus[idx]

In [None]:
# Running the Chat bot and making it respond to questions

# Interactive loop: reads one line per turn until the user types "exit",
# "thanks" or "thank you". Greetings get a canned reply; anything else is
# answered with the most similar sentence from the articles.
flag = True
print("788 Chatbot: My name is 788 Chatbot. I will answer questions about different Wikipedia Articles. To exit, type exit")

while flag:
    # Normalise case and surrounding whitespace so "Exit " still exits.
    user_response = input().lower().strip()

    if user_response == 'exit':
        flag = False
        print("788 Chatbot: Bye")
    elif user_response in ('thank you', 'thanks'):
        flag = False
        print("788 Chatbot: You're welcome")
    else:
        # Call greeting() once and reuse the reply for both the check and the
        # printed text.
        greeting_reply = greeting(user_response)
        if greeting_reply is not None:
            print("788 Chatbot: " + greeting_reply)
        else:
            # The query is placed at the end of the sentence list while it is
            # being answered.
            sent_token.append(user_response)
            try:
                word_token = word_token + nltk.word_tokenize(user_response)
                print("788 Chatbot: " + response(user_response))
            finally:
                # pop() removes exactly the element appended above; remove()
                # would delete the first equal sentence, which could be a real
                # corpus sentence. `finally` restores the list on errors too.
                sent_token.pop()

Chatbot: My name is 788 Chatbot. I will answer questions about different Wikipedia Articles. To exit, type exit
abraham lincoln
Chatbot: 

  % sorted(inconsistent)


the assassination of abraham lincoln.
who is abraham lincoln
Chatbot: 

  % sorted(inconsistent)


the assassination of abraham lincoln.
elephant
Chatbot: 

  % sorted(inconsistent)


there are three living species: the african bush elephant, the african forest elephant (until recently known collectively as the african elephant), and the asian elephant (also known as the indian elephant).
when was abraham lincoln born
Chatbot: 

  % sorted(inconsistent)


the assassination of abraham lincoln.
abraham lincoln was born
Chatbot: 

  % sorted(inconsistent)


the assassination of abraham lincoln.
Thomas Lincoln and Nancy Hanks
Chatbot: 

  % sorted(inconsistent)


abraham lincoln was born on february 12, 1809, to thomas lincoln and nancy hanks, two uneducated farmers.
who was thomas lincoln
Chatbot: 

  % sorted(inconsistent)


abraham lincoln was born on february 12, 1809, to thomas lincoln and nancy hanks, two uneducated farmers.
what was lincoln's formal education
Chatbot: 

  % sorted(inconsistent)


2.

lincoln's formal education consisted of about 18 months of schooling.
when did lincoln begin his political career
Chatbot: 

  % sorted(inconsistent)


:    "he did it."
Ulysses S. Grant
Chatbot: 

  % sorted(inconsistent)


ulysses s. grant

  
ulysses s. grant, see military career for a discussion of grant's middle initial.
Hiram Ulysses Grant
Chatbot: 

  % sorted(inconsistent)


grant wrote his name in the entrance register as "ulysses hiram grant" (concerned that he would otherwise become known by his initials, h.u.g.
hi
Chatbot: hi
hi
Chatbot: hi there
hi
Chatbot: Hey I'm 788 Chatbot
hi
Chatbot: hey
hi
Chatbot: hello there
hi
Chatbot: hey
hi
Chatbot: Hey I'm 788 Chatbot
i
Chatbot: 

  % sorted(inconsistent)


I don't understand
exit
Chatbot: Bye
