In [1]:
#from absl import logging
import tensorflow as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import seaborn as sns
sns.set()

# Overview
The Universal Sentence Encoder encodes text into high-dimensional vectors that can be used for text classification, semantic similarity, clustering and other natural language tasks.

The model is trained and optimized for greater-than-word length text, such as sentences, phrases or short paragraphs. It is trained on a variety of data sources and a variety of tasks with the aim of dynamically accommodating a wide variety of natural language understanding tasks. The input is variable length English text and the output is a 512 dimensional vector. We apply this model to the STS benchmark for semantic similarity, and the results can be seen in the example notebook made available. The universal-sentence-encoder model is trained with a deep averaging network (DAN) encoder.

To learn more about text embeddings, refer to the TensorFlow Embeddings documentation. Our encoder differs from word level embedding models in that we train on a number of natural language prediction tasks that require modeling the meaning of word sequences rather than just individual words. Details are available in the paper "Universal Sentence Encoder" 

In [2]:
#Loading model from USE DAN (Deep Averaging Network)
module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/5" 
model = hub.load(module_url)
print ("module %s loaded" % module_url)

module https://tfhub.dev/google/universal-sentence-encoder-large/5 loaded


In [3]:
#Method to embed inputs
def embed(input):
    return model(input)

In [4]:
os.chdir('C:/Users/akadali/Desktop/Deep_NLP/MLG_Capstone_ChatBot/ChatBot_GoogleW2V')
data = pd.read_csv('chatbot_train.csv', encoding = 'latin1')
data = data[['question', 'answer', 'intent']]

In [5]:
import nltk
from nltk.tokenize import word_tokenize
stops = nltk.corpus.stopwords.words('english')

from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

import string
puncs = string.punctuation
#Adding additional punctuations
puncs = puncs + "’'`" 
print(puncs)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~’'`


In [6]:
print(stops)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

# Data Cleaning
-----

* This dataset has a lot of contractions, hence all of them need to be removed/replaced within the dataset. The below function replaces the contractions with corresponding word pairs.

* Also, there are quite a few abbreviations such as U.S., US, USI to be substituted with their expanded forms

* removing additional spaces, if there are any

In [7]:
import re
def text_clean(text):
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub('[%s]'%re.escape(puncs), ' ', text)
    #convert u.s or us  to 'United States'
    text = re.sub(r" U.S. ", " united states ", text)
    text = re.sub(r" US ", " united states ", text)
    text = re.sub(r" USI ", " united states india ", text)
    text = re.sub(r" +"," ", text)
    #Removing additional characters that captured during import
    text = re.sub("[\x97]","'", text)
    text = re.sub("[\x96]","'", text)
    text = re.sub("[\x95]","'", text)
    text = re.sub("[\x94]","'", text)
    text = re.sub("[\x93]","'", text)
    text = re.sub("[\x92]","'", text)
    text = re.sub("[\x91]","'", text)
    return text

# Data Preprocessing and Cleaning(2)
----------------------------
* Tokenizing the words
* Lemmatization
* removing numeric digits and punctuations

* import test data
* create test question embeddings
* create a new column for new answers
* get the answers in that column
* compare both answers
* get the results
* Get the cosine values as well

In [8]:
def text_preprocess(text):
    #tokenizing
    tokens = word_tokenize(text)
    #converting to lower case and lemmatization
    tokens = [word.lower() for word in tokens]
    filtered_tokens = []
    #Remove anything but alphabets - numbers, punctuations etc
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    #return list of lists for embedding vectors
    return " ".join(filtered_tokens)

In [9]:
#Clean the questions dataset
data['question'] = data['question'].apply(lambda x: text_clean(x))

In [10]:
#Preprocess the questions
data['question'] = data['question'].apply(lambda x:text_preprocess(x))

# Creating Embeddings using pretrained google universal-sentence-encoder

In [11]:
#Method to embed inputs
def embed(input):
    return model(input)

In [12]:
#np.seterr(divide='ignore', invalid='ignore')
question_vectors = [embed([x]) for x in data['question']]

In [13]:
data['question'][144]

'can you confirm when jennie payout will be'

In [14]:
question_vectors[144]

<tf.Tensor: shape=(1, 512), dtype=float32, numpy=
array([[-0.05937516, -0.07139486, -0.03580907, -0.02181975,  0.04792665,
        -0.0549308 , -0.00814513, -0.00288107,  0.01621236,  0.06205643,
         0.00832495,  0.01106013,  0.03006287, -0.03137842, -0.00210727,
         0.03081903, -0.0387282 ,  0.03473219,  0.01469928,  0.0431647 ,
        -0.00048844,  0.06611931, -0.0083472 ,  0.01772776, -0.08686357,
         0.00481038, -0.01594081,  0.03066936,  0.01401082,  0.00223883,
         0.04004965,  0.00769123, -0.03169547,  0.04066599, -0.01992467,
         0.04570906, -0.00786444,  0.08135858, -0.02986662, -0.00229437,
        -0.02533258,  0.00544696,  0.07811685,  0.01056373, -0.04172884,
        -0.00344714,  0.03654783,  0.04101761,  0.01500739, -0.03979726,
        -0.05531175, -0.04886104,  0.05610216, -0.00932195, -0.05707739,
        -0.05330759,  0.01215209, -0.06623979, -0.047648  , -0.04336387,
         0.10992612,  0.01377444, -0.01867487, -0.00571322,  0.05534788,
 

# Estalishing connection to the 'Talent referral payout' database

In [15]:
import mysql.connector
mydb = mysql.connector.connect(host="localhost", 
                               user="root", 
                               passwd="root",
                               auth_plugin = 'mysql_native_password',
                               database = "talent_referral_payout")

In [16]:
mycursor = mydb.cursor(buffered = True)

In [17]:
#Definition to obtain te referral bonus payout history of a referring professional

def search_pay_db(emp_id):
    emp_id = str(emp_id)
    #print(type(emp_id))
    query = "SELECT referral_name,amount,date_of_origin FROM trp_payment_report_sample WHERE emp_num = "+ emp_id
    mycursor.execute(query)
    return mycursor.fetchone()

In [18]:
#Definition to obtain the status of referrals submitted by the referring professional

def search_status_db(rms_id):
    rms_id = str(rms_id)
    #print(type(emp_id))
    query = "SELECT candidate_name,requisition_no,current_status,reference_date FROM trp_status WHERE candidate_id = " + rms_id
    mycursor.execute(query)
    return mycursor.fetchone()

In [19]:
from sklearn.metrics.pairwise import cosine_similarity

def get_response(t):
    #print(query)
    while True:
        user_input = input("You:")
        if len(user_input) == 0:
            print("Bot: Can you please provide more information")
        elif user_input.lower() == '_bonus_':
            print("Enter your personnel number to fetch the payment details")
            emp_no = input("Emp. ID:")
            emp_no = str(emp_no)
            ans = search_pay_db(emp_no)
            #print("Bot: Your bonus payout details are below (in the last 6 months)")
            if ans is not None:
                print("Bot: You were paid $"+ str(ans[1])+" for referring "+str(ans[0])+" along with "+str(ans[2])+" pay period")
                print("     Let me know if I can help you with anything else")
            else:
                print("Bot: Sorry...I do not have your referral info. in my database. Please contact TRP Team")
        elif user_input.lower() == '_status_':
            print("Bot: Enter your referral's RMS/Taleo ID to fetch the status info.")
            email = input("ID:")
            email = str(email)
            ans = search_status_db(email)
            #print("Bot: Below are the list of referrals you submitted in the last 6 months")
            if ans is not None:
                print("Bot: Your referral "+str(ans[0])+ " has been "+str(ans[2])+" for requisition with ID:"+str(ans[1]))
                print("     Let me know if I can help you with anything else")
            else:
                print("Bot: Sorry...I do not have your referral info. in my database. Please contact TRP Team")
        elif user_input.lower() == 'quit':
            break
        else:
            #q = text_clean(user_input)
            #q = text_preprocess(user_input)
            q = user_input
            q_vec = embed([q])
            size_ = len(question_vectors)
            sims = np.zeros(size_)
            for i in range(size_):
                sims[i] = cosine_similarity(q_vec, question_vectors[i])
            max_s = sims.max()
            if max_s < t:
                response = "Hmm..sorry, I don't quite understand that, can you please rephrase your question"
            else:
                max_i = np.argmax(sims)
                response = data.answer[max_i]
            print("Bot:", response)

In [20]:
def chat_test():
    print("Specify the confidence level")
    level = input("Confidence:")
    conf = int(level)/100
    print("Hi There....I'm Talent Referral Bot and I'm here to help you with referral inquiries\n[type 'quit' to stop]")
    print("***TIP: PLEASE ENTER \n 1.'_status_' -> to check your referral status \n 2.'_bonus_' -> to check your bonus payout status \n 3.Else, enter your question")
    ans = get_response(conf)
    #print("Bot:", ans)

In [21]:
q = "Can u please haaallppp"
q_vec = embed([q])
#print(q_vec.shape)
size = len(question_vectors)
sims = np.zeros(size)
for i in range(size):
    sims[i] = cosine_similarity(q_vec, question_vectors[i])
max_s = sims.max()
if max_s < 0.4:
    response = "Hmm..sorry, I don't quite understand that, can you please rephrase your question"
else:
    max_i = np.argmax(sims)
    response = data.answer[max_i]
print("Bot:", response)

Bot: Sure, I can help you. Please type your question here


In [22]:
sims.max()

0.5857939720153809

In [23]:
size = len(question_vectors)
sims = np.zeros(size)
for i in range(size):
    sims[i] = cosine_similarity(q_vec, question_vectors[i])
print(sims)

[-5.89681715e-02 -6.63996860e-02 -5.76971248e-02 -6.63996860e-02
 -3.44532263e-03  5.23576029e-02 -1.27700455e-02  3.99235338e-02
 -1.08642206e-02 -1.21218869e-02  6.56280667e-04  1.59944415e-01
  7.81605691e-02  3.05706859e-01  5.86312339e-02 -8.83430988e-02
  4.48929481e-02  2.72031873e-04  1.33828018e-02  3.01438086e-02
 -3.34475562e-03 -6.77500963e-02  5.30438796e-02 -5.16578928e-02
 -1.09052099e-03 -2.63986476e-02 -4.80867438e-02  2.78110951e-02
  8.31587315e-02  1.51433915e-01  1.23769633e-01 -8.20510015e-02
  2.60227352e-01 -4.68182042e-02 -1.27102301e-01  9.35881361e-02
  2.02220380e-01  8.49327296e-02  7.21833576e-03 -2.31734104e-02
  2.87696905e-02  3.06047257e-02  2.83290595e-02 -1.38194449e-02
  2.46626705e-01  1.20695613e-01  1.77700147e-01  1.28624022e-01
  1.42477751e-01  1.87669829e-01  2.27537870e-01  1.07021615e-01
  1.43959373e-01 -1.22436792e-01  1.01116829e-01 -1.12528041e-01
  2.35278443e-01  1.02109000e-01  1.01292141e-01  1.00796089e-01
  1.96581986e-02  2.03028

In [24]:
#Sample RMS IDs for chatbot testing of referral status
"""
24211419
18327009
24482992
21996054
22425736
24426987
17783534
24601097
23736632
24698456
24623156
21705392
"""
#Sample Employee IDs for chatbot testing bonus status
"""
213408
239660
249058
250194
260943
271637
274346
282822
284183
284315
284754
285695
290560
"""

'\n213408\n239660\n249058\n250194\n260943\n271637\n274346\n282822\n284183\n284315\n284754\n285695\n290560\n'

# Test Data

In [25]:
#Testing the Chabot with Test Dataset
os.chdir('C:\\Users\\akadali\\Desktop\\Deep_NLP\\MLG_Capstone_ChatBot\\ChatBot_GoogleW2V')

#importing test dataset
test_data = pd.read_csv("chatbot_test.csv", encoding = 'latin1')
#test_data.drop(['Unnamed: 3'], axis = 1, inplace = True)
size = len(question_vectors)
sims = np.zeros(size)
def test_response(q, vecs = question_vectors):
    q = text_clean(q)
    q = text_preprocess(q)
    q_vec = embed([q])
    for i in range(size):
        sims[i] = cosine_similarity(q_vec, question_vectors[i])
    max_s = sims.max()
    max_i = np.argmax(sims)
    return max_s, data.answer[max_i], data.question[max_i], data.intent[max_i]

test_data['bot_response'] = test_data['test_question'].apply(lambda x:test_response(x)[1])
test_data['train_question'] = test_data['test_question'].apply(lambda x:test_response(x)[2])
test_data['sim_score'] = test_data['test_question'].apply(lambda x:test_response(x)[0])
test_data['train_intent'] = test_data['test_question'].apply(lambda x:test_response(x)[3])

test_data['valid'] = test_data['bot_response'] == test_data['test_answer']

print("Accuracy of TF USE Transformer \n",(test_data.valid.sum()/len(test_data)).round(2))
print("Average Simailarity score",test_data.sim_score.mean())

##First Test result without stop words, with lemmatization
# Accuracy - 55%
# Mean Similarity score - 0.70

##Second Test Result - without stop words, with lemmatization and 
# Accuracy - 61%
# Mean Similarity score - 0.71

##Third Test Result - without stop words, without lemmatization
# Accuracy - 60
# Mean Similarity score - 0.71


Accuracy of TF USE Transformer 
 0.6
Average Simailarity score 0.7118765065649978


In [26]:
test_data.to_csv("C:\\Users\\akadali\\Desktop\\Deep_NLP\\MLG_Capstone_ChatBot\\ChatBot_GoogleW2V\\test_results_tran.csv",
                 index = False, header = True)