# TRP FAQ chatbot using pretrained Word2Vec Model

Importing necessary libraries

In [18]:
import os 
import numpy as np
import pandas as pd
import re, unicodedata
import string
import pickle
from nltk import sent_tokenize, word_tokenize

In [19]:
os.chdir('C:/Users/akadali/Desktop/Deep_NLP/MLG_Capstone_ChatBot/ChatBot_GoogleW2V')
data = pd.read_csv('trp_faq_dataset.csv', encoding = 'latin1')
data = data[['question', 'answer']]

Importing stop words, WordNetLemmatizer and punctuations

In [20]:
import nltk
stops = nltk.corpus.stopwords.words('english')

from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

import string
puncs = string.punctuation
#Adding additional punctuations
puncs = puncs + "’'`" 
print(puncs)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~’'`


Data Cleaning
-----

* This dataset has a lot of contractions, hence all of them need to be removed/replaced within the dataset. The below function replaces the contractions with corresponding word pairs.

* Also, there are quite a few abbreviations such as U.S., US, USI to be substituted with their expanded forms

* removing additional spaces, if there are any

In [21]:
import re
def text_clean(text):
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub('[%s]'%re.escape(puncs), ' ', text)
    #convert u.s or us  to 'United States'
    text = re.sub(r" U.S. ", " united states ", text)
    text = re.sub(r" US ", " united states ", text)
    text = re.sub(r" USI ", " united states india ", text)
    text = re.sub(r" +"," ", text)
    return text

Data Preprocessing and Cleaning(2)
----------------------------
* Tokenizing the words
* Lemmatization
* removing numeric digits and punctuations

In [22]:
def text_preprocess(text):
    #tokenizing
    tokens = word_tokenize(text)
    #converting to lower case and lemmatization
    tokens = [lemmatizer.lemmatize(word.lower()) for word in tokens if word.lower() not in stops]
    filtered_tokens = []
    #Remove anything but alphabets - numbers, punctuations etc
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    #return list of lists for embedding vectors
    return filtered_tokens

Applying 'Cleaning' and 'Preprocessing' methods on the dataset

In [23]:
#Clean the questions dataset
data['question'] = data['question'].apply(lambda x:text_clean(x))

In [None]:
#Preprocess the questions
data['question'] = data['question'].apply(lambda x:text_preprocess(x))

Pre-trained word and phrase vectors from gensim models
-----------------------------------------
* Getting GoogleNews-vectors-negative - A Pre-trained word and phrase vectors. 'Questions'in our dataset are converted to vectors by averaging the vectors of individual words in each question.

* This can be loaded using 'gensim.models'

In [24]:
import gensim.models
#Changing the directory to get to the embeddings 
os.chdir('C:/Users/akadali/Desktop/Deep_NLP/MLG_Capstone_ChatBot/ChatBot_GoogleW2V/Word Embeddings')
model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin',binary=True)

Below function creates the 300-dimension vectors for each question (for all tokens that are present in the Google New pre-trained Word2Vec models)

In [25]:
# Creating a feature vector by averaging embeddings for all words in a sentence
def embedding_feats(list_of_tokens):
    DIMENSION = 300
    zero_vector = np.zeros(DIMENSION)
    #feats = []
    feat_for_this = zero_vector
    count_for_this = 0
    for token in list_of_tokens:
        if token in model:
            feat_for_this += model[token]
            count_for_this+= 1
    feats = feat_for_this/count_for_this
    return feats

Creating vectors for all questions in the faq dataset

In [26]:
#np.seterr(divide='ignore', invalid='ignore')
question_vectors = [embedding_feats(x) for x in data['question']]

* This block of code would convert the user's question to a 300 dimensional vector and calculates the cosine_similarity against all question vectors in the FAQ dataset. 
* Obtains the correspoding answer of the best matched question

In [27]:
from sklearn.metrics.pairwise import cosine_similarity

def get_response(query, t):
    query = text_clean(query)
    query = text_preprocess(query)
    #print(query)
    if len(query) == 0:
        response = "Can you please provide more information for me to undersatnd"
    else:
        q_vec = [embedding_feats(query)]
        sims = cosine_similarity(q_vec, question_vectors)
        max_s = sims.max()
        if max_s < t:
            response = "Hmm..sorry, I don't quite understand that, can you please rephrase your question"
        else:
            max_i = np.argmax(sims)
            response = data.answer[max_i]
    return response

In [73]:
def chat():
    print("Specify the confidence level")
    level = input("Confidence:")
    conf = int(level)/100
    print("-------------------------------------------------")
    print("Start chatting with the bot (type 'quit' to stop)")
    print("-------------------------------------------------")
    print("Hi There....I'm Talent Referral Bot and I'm here to help you with referral inquiries\n")
    print("**tip: please enter \n'status_' to check your referral status \n 'bonus_' to check your bonus payout status and \n 'prog' for program related inquiry")
    while True:
        user_input = input("You:")
        if user_input == 'status_':
            print("can you please provide your referral details in the below order to check in database\n1.Referral Name\n2.Referral Email address\n3.Your email address")
        elif user_input == 'bonus_':
            print("can you please provide your referral details in the below order to check in database\n1.Referral Name\n2.Referral Email address\n3.Your email address")
        #elif user_input == '':
        #    print("Hmm..sorry, I don't quite understand that, can you please rephrase your question")
        elif (user_input).lower() == 'quit':
            break            
        response = get_response(user_input, conf)
        print("Bot:",response)

In [74]:
chat()

Specify the confidence level
Confidence:80
-------------------------------------------------
Start chatting with the bot (type 'quit' to stop)
-------------------------------------------------
Hi There....I'm Talent Referral Bot and I'm here to help you with referral inquiries

**tip: please enter 'status_' to check your referral status 
 'bonus_' to check your bonus payout status and 
 'prog' for program related inquiry
You:bonus_
can you please provide your referral details in the below order to check in database
1.Referral Name
2.Referral Email address
3.Your email address
Bot: Hmm..sorry, I don't quite understand that, can you please rephrase your question
You:QUIT


# Creating Chatbot GUI using tkinter 

In [102]:
#Creating GUI with tkinter
import tkinter
from tkinter import *

In [128]:
def send():
    msg = EntryBox.get("1.0",'end-1c')
    EntryBox.delete("0.0",END)
    if msg != '':
        ChatLog.config(state=NORMAL)
        ChatLog.insert(END, "You: " + msg + '\n\n')
        ChatLog.config(foreground="#000000", font=("Calibri", 10 ))
        res = get_response(msg, 0.8)
        ChatLog.insert(END, "Bot: " + res + '\n\n')
        ChatLog.config(state=DISABLED)
        ChatLog.yview(END)

In [129]:
base = Tk()
base.title("Talent Referral BOT")
base.geometry("400x500")
base.resizable(width = TRUE, height = TRUE)

''

In [130]:
#Create Chat window
ChatLog = Text(base, bd=0, bg="white", height="8", width="50", font= ("Calibri", 10) ,fg='#000000')
ChatLog.config(state=DISABLED)

In [131]:
#Bind scrollbar to Chat window
scrollbar = Scrollbar(base, command=ChatLog.yview, cursor="heart")
ChatLog['yscrollcommand'] = scrollbar.set

In [132]:
#Create Button to send message
SendButton = Button(base, font=("Calibri",12,'bold'), text="Send", width="12", height=5,
                    bd=0, bg="#86bc25", activebackground="#3c9d9b",fg='#ffffff',
                    command = send)

In [133]:
#Create the box to enter message
EntryBox = Text(base, bd=0, bg="white",width="29", height="5", font="Calibri")
#EntryBox.bind("<Return>", send)

In [134]:
#Place all components on the screen
scrollbar.place(x=376,y=6, height=386)
ChatLog.place(x=6,y=6, height=386, width=370)
EntryBox.place(x=128, y=401, height=90, width=265)
SendButton.place(x=6, y=401, height=90)

In [135]:
base.mainloop()