### Import Libraries

In [1]:
import pandas as pd 
import matplotlib.pyplot as plt
import json

In [15]:
import numpy as np
import pickle
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras import preprocessing, utils
from tensorflow.keras.preprocessing.text import Tokenizer
import os 

from tensorflow.keras.models import Model

from tensorflow.keras.layers import Concatenate, dot, Activation, Lambda, Input, Embedding, LSTM, Dense

from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import load_model
from tensorflow.keras.layers import Dot, Activation, Add

import re
import spacy
import nltk
#from nltk.corpus import stopwords
#from nltk.tokenize import word_tokenize
#from nltk.stem import WordNetLemmatizer
import string

#nltk.download('punkt')
#nltk.download('stopwords')
#nltk.download('wordnet')

### 1. Load Trained Model and Dependencies

In [3]:
#load model 
model_50b = load_model('model_50b.h5')

#load tokenizer.word_index
#with open('tokenize_word_index.pkl', 'rb') as file:
    #tokenize_word_index = pickle.load(file)

# Load the tokenizer from the file
with open('tokenizer_50b.pkl', 'rb') as file:
    tokenizer = pickle.load(file)

# This is for ease of assignment; previously loaded just the word_index and not the tokenizer itself
#tokenize_word_index = tokenizer.word_index

In [4]:
print(model_50b.summary()) #for reference in extracting correct layers for inference model

Model: "model_17"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_35 (InputLayer)          [(None, 77)]         0           []                               
                                                                                                  
 input_36 (InputLayer)          [(None, 77)]         0           []                               
                                                                                                  
 embedding_34 (Embedding)       (None, 77, 64)       1570432     ['input_35[0][0]']               
                                                                                                  
 embedding_35 (Embedding)       (None, 77, 64)       1570432     ['input_36[0][0]']               
                                                                                           

### 2. Build Chatbout Inference Model Architecture

In [5]:
hidden_dim = 128 #dimension for hidden layer, as set in train model
output_dim = 64 #dimension for output , as set in train model
maxlen = model_50b.input_shape[1][1]
VOCAB_SIZE = len(tokenizer.word_index)+1
#tokenizer = Tokenizer()

# Extract the Decoder embedding, LSTM, and dense layers and create an instance of each
decoder_embedding_layer = model_50b.get_layer('embedding_35')
decoder_lstm_layer = model_50b.get_layer('lstm_35')
decoder_dense_layer = model_50b.get_layer('dense_17')
#decoder_dense_layer = Dense(VOCAB_SIZE, activation='softmax', name='dense_9')




In [6]:
def attn_layer(enc_out, dec_out):
    #calculate the attention scores
    attn_score = dot([dec_out, enc_out], axes = [2,2])
    attention_weights = Activation('softmax')(attn_score)

    #calculate the context vector
    context_vector = dot([attention_weights, enc_out],axes=[2,1])
    return context_vector, attention_weights

In [7]:
#inference Encoder and Decoder Model


#Encoder
encoder_inputs = model_50b.input[0]
encoder_outputs, state_h, state_c = model_50b.layers[4].output
encoder_states = [state_h, state_c]
#encoder_model = Model(encoder_inputs, encoder_states)
encoder_model = Model(encoder_inputs,[encoder_outputs] + encoder_states)


#Decoder
decoder_inputs = tf.keras.Input(shape=(1,),dtype=tf.float32, name='decoder_inputs')

#tensors for decoder's LSTM state
decoder_state_input_h = tf.keras.Input(shape = (hidden_dim, ), dtype=tf.float32, name = 'decoder_state_input_h') #name included to help properly identify the state during processing
decoder_state_input_c = tf.keras.Input(shape = (hidden_dim, ), dtype=tf.float32, name = 'decoder_state_input_c') #name included to help properly identify the state during processing
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_embedding = decoder_embedding_layer(decoder_inputs) #using same decoder_embedding from training on decoder_inputs_inference

#LSTM decoder with initial state
decoder_outputs, state_h, state_c = decoder_lstm_layer(decoder_embedding, initial_state = decoder_states_inputs)
decoder_states = [state_h, state_c]

#encoder_outputs that will be fed during inference
encoder_outputs_input = Input(shape=(None, hidden_dim), name='encoder_outputs')

#Calculate the attention context vector values
context_vector, attention_weights = attn_layer(encoder_outputs_input, decoder_outputs)#(encoder_outputs_input, decoder_outputs)

# Combine the context vector with the decoder outputs
decoder_combined_context = Concatenate(axis = -1)([decoder_outputs, context_vector])

#Dense layer for final output
decoder_outputs = decoder_dense_layer(decoder_combined_context)

#Inference decoder model
decoder_model = Model([decoder_inputs] + decoder_states_inputs + [encoder_outputs_input],[decoder_outputs] + decoder_states, name = 'decoder_model')
#decoder_model = Model([decoder_inputs] + decoder_states_inputs,[decoder_outputs] + decoder_states, name = 'decoder_model')

In [8]:
print("Shape of decoder_outputs:", decoder_outputs.shape)
print("Shape of context_vector:", context_vector.shape)
print("Shape of decoder_combined_context:", decoder_combined_context.shape)

Shape of decoder_outputs: (None, 1, 24538)
Shape of context_vector: (None, 1, 128)
Shape of decoder_combined_context: (None, 1, 256)


In [9]:
decoder_model.summary()

Model: "decoder_model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 decoder_inputs (InputLayer)    [(None, 1)]          0           []                               
                                                                                                  
 embedding_35 (Embedding)       multiple             1570432     ['decoder_inputs[0][0]']         
                                                                                                  
 decoder_state_input_h (InputLa  [(None, 128)]       0           []                               
 yer)                                                                                             
                                                                                                  
 decoder_state_input_c (InputLa  [(None, 128)]       0           []                   

### 3. Chatbot Response Testing

### 3.a. Text preprocessing methods

In [17]:
# Define a dictionary of contractions and their expansions
contractions_dict = {"don't": "do not","can't": "cannot","won't": "will not","wouldn't": "would not",
                          "shouldn't": "should not","couldn't": "could not","it's": "it is","i'm": "i am",
                          "you're": "you are","there's" : "there is","he's": "he is","she's": "she is",
                          "we're": "we are","they're": "they are","wasn't": "was not","weren't": "were not",
                          "hasn't": "has not","haven't": "have not","hadn't": "had not","mightn't": "might not",
                          "shan't": "shall not","mustn't": "must not","isn't":"is not","didn't":"did not","aren't":"are not",
                          "oughtn't":"ought not","needn't":"need not", "could've":"could have", "should've":"should have",
                          "would've":"would have","might've":"might have","i've":"i have","you've":"you have","we've":"we have",
                          "they've":"they have","i'll":"i will","you'll":"you will","he’ll":"he will","she’ll":"she will",
                          "it’ll":"it will","we’ll":"we will","they’ll":"they will","i’d":"i had","you’d":"you had",
                          "she’d":"she had","he’d":"he had","it'd":"it had", "we'd":"we had","they'd":"they had","that's":"that is",
                          "that’ve":"that have","that’d":"that would","which’ve":"which have","who’s":"who is","who’re":"who are",
                          "who’ve":"who have","who’d":"who had","who'll":"who will","what’s":"what is","what’re":"what are",
                          "what’ll":"what will","where’s":"where is","where’d":"where did","when’s":"when is","why’s":"why is",
                          "why’d":"why did","how’s":"how is","here’s":"here is","there’s":"there is","there’ll":"there will",
                          "there’d":"there had","someone’s":"someone is","somebody’s":"somebody is","no one’s":"no one is","nobody’s":"nobody is",
                          "something’s":"something is","nothing’s":"nothing is","let’s":"let us","ma’am":"madam","o'clock":"of the clock", "let's":"let us",
                          
}

# Create a regular expression pattern from the contractions dictionary
contractions_pattern = re.compile(r'\b(' + '|'.join(contractions_dict.keys()) + r')\b')

# Define a function to replace contractions using the pattern
#def expand_contractions(text):
    #return contractions_pattern.sub(lambda x: contractions_dict[x.group(0)], text)

def replace_contractions(text):
    word_contraction = lambda x: dict_word_contractions[x.group(0)] #locates contraction in dictionary
    result = re.sub(contractions_pattern, word_contraction,text) #finds and replace contraction in text with equivalent word/phrase
    return result

In [18]:
#Instances needed for preprocessing function

patterns = re.compile(pattern = "["u"\U0001F600-\U0001F64F"  # emoticons 
                                       u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                                       u"\U0001F680-\U0001F6FF"  # transport & map symbols
                                       u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                                       "]+", flags = re.UNICODE)


nlp = spacy.load('en_core_web_sm')

In [19]:
def preprocessing(text):
    '''Preprocessing of text, steps covered:
    1. transform to lower case
    2. remove punctuations, numbers, emojis, unwanted characters
    3. tokenize text
    4. lemmatize text
    '''
        
    text = text.lower() #convert to lowercase

    text = replace_contractions(text) #replace word contractions
    #text= spell_check(text) #correct spelling errors
    text.translate(str.maketrans('', '', string.punctuation)) #remove punctuations
    text = re.sub(patterns,'', text)
    #text = re.sub('<!@.*?>', '', text) #remove unwanted characters 
    text = re.sub('[^A-Za-z0-9]+', ' ', text) #remove numbers
    text = re.sub(r"([.!?])", r" \1", text)
    text = re.sub(r"[^a-zA-Z.!?]+", r" ", text)
    text = re.sub(r"\s+", r" ", text).strip()
    
    text = nlp(text)

    # Perform lemmatization, removing stop words and non-alphabetic tokens
    cleaned = [token.lemma_ for token in text if token.is_alpha]
    
    #tokenized_words = word_tokenize(text) #tokenize text
    #cleaned = [lemmatizer.lemmatize(token) for token in tokenized_words] # #create list of cleaned lemmatized words
    
    
    return ' '.join(cleaned) # create list of strings

### 3.b Engage with Chatbot/ Model Prediction

In [22]:
def get_chatbot_response():
        
    user_input = input("Say something Emobot would like to chat with you: ")
    
    #preprocess user input 
    clean_input = preprocessing(user_input)
    clean_input_seq = tokenizer.texts_to_sequences([clean_input])
    clean_input_seq = pad_sequences(clean_input_seq, maxlen = maxlen, padding = "post")
    
    #chatbot response
    encoder_outputs, state_h, state_c = encoder_model.predict(clean_input_seq)
    target_seq = np.zeros((1,1))
    target_seq[0,0] = tokenizer.word_index.get('<start>',1)
    
    # placeholder for response sequence
    stop_condition = False
    response_seq = []
    
    while not stop_condition:
        #predict next word in sequence
        output_tokens, h, c = decoder_model.predict([target_seq] + [state_h, state_c] +[encoder_outputs])
        #output_tokens, h, c = decoder_model.predict([target_seq]+ states_value)
        #print(f"Output tokens: {output_tokens}")
        
        #get predicted word index
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        #print(f"Sampled token index: {sampled_token_index}")
        
        response_seq.append(sampled_token_index)
        #for idx in response_seq:
            #print(f"Token index: {idx}, Word: {tokenizer.index_word.get(idx, '<unk>')}")
        
        #exit condition by either hitting max length or found stop token
        if sampled_token_index == tokenizer.word_index.get('<end>',2) or len(response_seq) > maxlen:
            stop_condition = True
        
       
        #update the target sequence    
        target_seq = np.zeros((1, 1))
        target_seq[0,0] = sampled_token_index
        
        #update states
        states_value =[h, c] 
        #documents = SimpleDirectoryReader(directory_path).load_data
        
    #Transform response sequence to text
    #print(f"Raw response sequence: {response_seq}")
    responses = tokenizer.sequences_to_texts([response_seq])[0]
    #print(f"Final response text: {responses}")
    return responses

In [24]:
get_chatbot_response()

Say something Emobot would like to chat with you:  hi how are you




'you do not you do not you do not you do not you do not you do not you do not you do not you do not you do not you do not you do not you do not you do not you do not you do not you do not you do not you do not you do not you do not you do not you do not you do not you do not you do not'