## Building A ChatBot Using Deep NLP

In [0]:
!pip install tensorflow==1.0.0

Collecting tensorflow==1.0.0
[?25l  Downloading https://files.pythonhosted.org/packages/48/58/b71480f9ec9d08d581d672a81b15ab5fec36a5fcda2093558a23614d8468/tensorflow-1.0.0-cp36-cp36m-manylinux1_x86_64.whl (44.5MB)
[K     |████████████████████████████████| 44.5MB 92kB/s 
Installing collected packages: tensorflow
  Found existing installation: tensorflow 2.2.0
    Uninstalling tensorflow-2.2.0:
      Successfully uninstalled tensorflow-2.2.0
Successfully installed tensorflow-1.0.0


- We have downloaded the Cornell Movies Dataset.
- After downloading, we first distinguish between the metadata and the actual training data.
- Metadata is descriptive data about the movies, such as the ratings and the names of the movies from which the dialogues were taken. We do not need that data to train our chatbot.
- We only need the conversations between our characters. These are the only data we will be using.

- We will only be using 2 files from the cornell movie folder
1. movie_conversations.txt
2. movie_lines.txt

- The columns in movie_lines.txt are as follows:
1. L0123 - This column represents line numbers
2. U0, U1 - This column represents the characters in the movie as user0, user1
3. m0, m1 - This column represents movies from which the line has been taken. movie0, movie1
4. Bianca, Camroon - Names of the characters
5. Last column is the conversations between the characters

- The last column in movie_conversations.txt represents conversations in the form of line numbers given in movie_lines.txt.
- This file helps us to separate the conversations from one another, which is not done in movie_lines.txt

In [0]:
# Importing required Libraries

import numpy as np
import pandas as pd
import tensorflow as tf
import re  #used to clean the text and make it simple as possible for the chatbot to learn in the best conditions
import time #to measure training time of each epoch 

In [0]:
# Load the two raw corpus files and split them into one string per row.
# The context managers close the file handles as soon as the text has been
# read; the bare open(...).read() form left them open until garbage collection.
with open('/content/drive/My Drive/CSV files for Deep learning and ML/movie_lines.txt', encoding = "utf-8", errors = "ignore") as lines_file:
    lines = lines_file.read().split("\n")
with open('/content/drive/My Drive/CSV files for Deep learning and ML/movie_conversations.txt', encoding = "utf-8", errors = "ignore") as conversations_file:
    conversations = conversations_file.read().split("\n")

In [0]:
# Map every line id (first field) to its text (fifth field).
# Rows that do not split into exactly five fields are skipped.
id2line = {}
for raw_line in lines:
    fields = raw_line.split(' +++$+++ ')
    if len(fields) != 5:
        continue
    id2line[fields[0]] = fields[4]

# Build the list of conversations, each one a list of line ids.
# The last field of a row looks like "['L194', 'L195']": drop the brackets,
# the quotes and the spaces, then split on commas.
conversations_ids = []
for raw_conversation in conversations[:-1]:  # the final element is skipped (presumably the empty string after the last newline)
    id_list = raw_conversation.split(' +++$+++ ')[-1]
    id_list = id_list[1:-1].replace("'", "").replace(" ", "")
    conversations_ids.append(id_list.split(','))

# Every pair of consecutive lines in a conversation is one (question, answer) sample.
questions = []
answers = []
for line_ids in conversations_ids:
    for asked_id, replied_id in zip(line_ids, line_ids[1:]):
        questions.append(id2line[asked_id])
        answers.append(id2line[replied_id])

# Contraction -> expansion table used by clean_text().
#
# clean_text() applies these as plain substring replacements in insertion
# order, so ORDER MATTERS: a specific contraction ("that's") must come before
# any generic suffix that is contained in it ("'s"). The generic suffix rules
# are therefore kept together at the very end, and each of them starts with a
# space so the expansion does not get glued to the preceding word
# ("john'll" -> "john will", not "johnwill").
contractions = {"aight" : "alright",
 "ain't": "am not",
 "amn't" : "am not",
 "aren't": "are not",
 "can't": "cannot",
 "'cause" : "because",
 "could've": "could have",
 "couldn't" : "could not",
 "couldn't've" : "could not have",
 "daren't" : "dare not",
 "daresn't" : "dare not",
 "dasn't" : "dare not",
 "didn't" : "did not",
 "doesn't" : "does not",
 "don't" : "do not",
 "d'ye" : "do you",
 "e'er" : "ever",
 "everybody's" : "everybody is",
 "everyone's" : "everyone is",
 "finna":"fixing to",
 "g'day" : "good day",
 "gimme" : "give me",
 "giv'n": "given",
 "gonna":"going to",
 "gon't":"go not",
 "gotta":"got to",
 "hadn't":"had not",
 "had've":"had have",
 "hasn't":"has not",
 "haven't":"have not",
 "he'd":"he would",
 "he'dn't've'd":"he would not have had",
 "he'll":"he will",
 "he's":"he is",
 "he've":"he have",
 "how'd":"how did",
 "howdy":"how do you do",
 "how'll":"how will",
 "how're":"how are",
 "how's":"how has",
 "i'd": "I would",
 "i'd've":"I would have",
 "i'll": "I will",
 "i'm": "I am",
 "i'm'a": "I am about to",
 "i'm'o": "I am going to",
 "innit": "is it not",
 "i've": "I have",
 "isn't": "is not",
 "it'd": "it would",
 "it'll": "it will",
 "it's": "it is",
 "let's": "let us",
 "ma'am": "madam",
 "mayn't": "may not",
 "may've": "may have",
 "methinks" : "me thinks",
 "mightn't": "might not",
 "might've": "might have",
 "mustn't": "must not",
 "mustn't've": "must not have",
 "must've": "must have",
 "needn't": "need not",
 "ne'er":"never",
 "o'clock": "of the clock",
 "o'er": "over",
 "ol'": "old",
 "oughtn't":"ought not",
 "shalln't":"shall not",
 "shan't":"shall not",
 "she'd":"she would",
 "she'll":"she will",
 "she's":"she is",
 "should've":"should have",
 "shouldn't":"should not",
 "shouldn't've":"should not have",
 "somebody's":"somebody is",
 "someone's":"someone is",
 "something's":"something is",
 "so're":"so you are",
 "that'll":"that will",
 "that're":"that are",
 "that's":"that is",
 "that'd":"that had",
 "there'd":"there would",
 "there'll":"there will",  # was "here shall" (typo)
 "there're":"there are",
 "there's":"there has",
 "these're":"these are",
 "these've":"these have",
 "they'd":"they would",
 "they'll":"they will",
 "they're":"they are",
 "they've":"they have",
 "this's":"this is",
 "those're":"those are",
 "those've":"those have",
 "'tis":"it is",
 "to've":"to have",
 "'twas":"it was",
 "wanna":"want to",
 "wasn't":"was not",
 "we'd":"we would",
 "we'd've":"we would have",
 "we'll":"we will",
 "we're":"we are",
 "we've":"we have",
 "weren't":"were not",
 "what'd":"what did",
 "what'll":"what will",
 "what're":"what are",
 "what's":"what is",
 "what've":"what have",
 "when's":"when is",
 "where'd":"where did",
 "where'll":"where will",
 "where're":"where are",
 "where's":"where has",
 "where've":"where have",
 "which'd":"which had",
 "which'll":"which shall",
 "which're":"which are",
 "which's":"which has",
 "which've":"which have",
 "who'd":"who would",
 "who'd've":"who would have",
 "who'll":"who will",
 "who're":"who are",
 "who's":"who has",
 "who've":"who have",
 "why'd":"why did",
 "why're":"why are",
 "why's":"why is",
 "won't":"will not",
 "would've":"would have",
 "wouldn't":"would not",
 "wouldn't've":"would not have",
 "y'all":"you all",
 "y'all'd've":"you all would have",
 "y'all'dn't've'd":"you all would not have had",
 "y'all're":"you all are",
 "you'd":"you would",
 "you'll":"you will",
 "you're":"you are",
 "you've":"you have",
 # Chat shorthand: the key consumes both surrounding spaces, so the value
 # has to put both of them back or the next word gets glued on.
 " u " : " you ",
 " ur " : " your ",
 " n ": " and ",
 # Generic suffix fallbacks -- keep these last (see the note above the table).
 "'s": " is",
 "'ll": " will",
 "'ve": " have",
 "'re": " are",
 "'d": " would",}
 
# Doing a first cleaning of the texts
def clean_text(text):
    """Lower-case ``text`` and expand the contractions it contains.

    Every key of the module-level ``contractions`` table is replaced by its
    value using plain substring replacement, in the table's insertion order
    (so earlier entries are applied before later ones).

    Args:
        text: The sentence to clean. Anything that is not a ``str`` is
            returned unchanged.

    Returns:
        The cleaned string, or the original object when it is not a string.
    """
    # The type check must come first: calling .lower() on a non-string would
    # raise before the pass-through branch could ever be reached.
    if not isinstance(text, str):
        return text
    text = text.lower()
    for key, value in contractions.items():
        text = text.replace(key, value)
    return text
 
# Lower-case and expand contractions in every question and every answer.
# The two lists stay index-aligned with `questions` / `answers`.
clean_questions = [clean_text(raw_question) for raw_question in questions]
clean_answers = [clean_text(raw_answer) for raw_answer in answers]

# Removing special charecters
def remove_spl(text):
    """Return ``text`` with every run of characters other than ASCII letters,
    digits and spaces deleted (nothing is inserted in their place)."""
    unwanted_characters = re.compile('[^A-Z a-z 0-9]+')
    return unwanted_characters.sub("", text)
 
# Strip the special characters from the already lower-cased, expanded sentences.
clean_questions_1 = [remove_spl(cleaned_question) for cleaned_question in clean_questions]
clean_answers_1 = [remove_spl(cleaned_answer) for cleaned_answer in clean_answers]


# Count how often each whitespace-separated word occurs across all cleaned
# questions and answers. Questions are scanned first, then answers, so the
# dictionary's insertion order (which later determines the word ids) is
# "first appearance in the questions, then first appearance in the answers".
word2count = {}
for corpus in (clean_questions_1, clean_answers_1):
    for sentence in corpus:
        for word in sentence.split():
            word2count[word] = word2count.get(word, 0) + 1
 
# Give every sufficiently frequent word a consecutive integer id, starting at 0.
# Both vocabularies are filled from the same `word2count` table; with equal
# thresholds they end up with identical contents.
threshold_questions = 20
threshold_answers = 20

questionswords2int = {}
for word, count in word2count.items():
    if count < threshold_questions:
        continue
    questionswords2int[word] = len(questionswords2int)  # next free id

answerswords2int = {}
for word, count in word2count.items():
    if count < threshold_answers:
        continue
    answerswords2int[word] = len(answerswords2int)  # next free id

# Kept for parity with the counter-based version: the number of answer words.
word_number = len(answerswords2int)
 
# Append the four special tokens to both vocabularies.
# Ids are assigned as `len(vocabulary) + 1`, so the id equal to the number of
# regular words stays unused and the tokens occupy the four ids after it.
tokens = ['<PAD>', '<EOS>', '<OUT>', '<SOS>']
for vocabulary in (questionswords2int, answerswords2int):
    for token in tokens:
        vocabulary[token] = len(vocabulary) + 1

# Reverse lookup (id -> word) for the answers vocabulary.
answersints2word = {word_id: word for word, word_id in answerswords2int.items()}

# Terminate every answer with the End Of String token.
# NOTE: this edits the list in place, so running it twice appends the token twice.
clean_answers_1[:] = [answer + ' <EOS>' for answer in clean_answers_1]
 
# Encode every sentence as a list of word ids. Words that are missing from the
# vocabulary (they were below the frequency threshold) become the <OUT> id.
questions_into_int = []
for question in clean_questions_1:
    questions_into_int.append([
        questionswords2int[word] if word in questionswords2int else questionswords2int['<OUT>']
        for word in question.split()
    ])

answers_into_int = []
for answer in clean_answers_1:
    answers_into_int.append([
        answerswords2int[word] if word in answerswords2int else answerswords2int['<OUT>']
        for word in answer.split()
    ])
 
# Keep only the pairs whose QUESTION has 1..25 tokens and order them by
# question length (shortest first). The answer length is not checked.
# list.sort is stable, so pairs of equal question length keep their original
# relative order.
max_question_length = 25
kept_indices = [
    index
    for index, encoded_question in enumerate(questions_into_int)
    if 1 <= len(encoded_question) <= max_question_length
]
kept_indices.sort(key = lambda index: len(questions_into_int[index]))

sorted_clean_questions = [questions_into_int[index] for index in kept_indices]
sorted_clean_answers = [answers_into_int[index] for index in kept_indices]

### Reason for creating id2line dictionary

- We have to obtain dataset which contains 2 columns:
  1. Input: The input text that will be fed into the neural network
  2. Output: The output will be the target, i.e. the answer given by the characters during the conversation. We compare these target answers with the answers given by the chatbot to calculate the accuracy.


### The reason for creating list of conversations

- The movie_conversations.txt file has too much metadata. Hence we only need to extract the last part, i.e. the conversation given as a list of line ids.

- Explanation for _conversation = conversation.split(' +++$+++ ')[-1][1:-1].replace("'", "").replace(" ", "")

- Split each line at +++$+++, take the last part of each line (the list of line ids), remove the surrounding [ ] by selecting [1:-1] (this drops the characters at position 0 and position -1), and finally remove every ' and every space by replacing them with the empty string "".


### Removing the Non frequent Words

- Removing the non frequent words from the above lists
- we do that to optimize our training as we need only the essential words in the dialogue
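- We remove the words that occur fewer than 20 times in the corpus (the `threshold_questions` and `threshold_answers` values in the code); this is meant to drop roughly the 5% least frequent words.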


### Reason for using SOS, PAD & EOS

- SOS, PAD and EOS are added because, when processing the training data, the sequences in a batch must all have the same length. PAD is placed in the empty positions to bring every sequence to the same size, while SOS and EOS mark the start and the end of a sequence.
- The first token that starts in the decoding layer is the SOS.
- We will create a last token called OUT that stands for all the words that were filtered out as the 5% least frequent words


### Inverting answerswords2int dictionary
- The inverse of the answerswords2int dictionary is required for the inverse mapping from integers back to the answer words in our Seq2Seq model implementation


### Reason for adding EOS at the end of every answer
- Adding EOS at the end of every answer is necessary, as it teaches the bot when to stop while answering.


### Translating all the questions and answers to integers
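- The neural network works on integer ids rather than on raw words. Having each sentence as a list of integers also lets us sort all the questions and all the answers by their lengths to optimize training performance.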


### Sorting questions and answers by length of questions
- We sort the questions and answers by length of questions in order to speed up the training and help to reduce the loss.
- The reason for it is that it will reduce the amount of padding during the training
- We will train the chatbot on short sentences and we will avoid the long sentences. This is because, when we teach a child how to talk, we make use of short sentences and not the long ones.
- We use the enumerate function to get each question together with its index value.
- We need index values to be added in sorted_clean_question and sorted_clean_answers list and we also need questions len and answer len for sorting.


In [0]:
# Creating placeholders for the inputs and the targets
def model_inputs():
    """Create the four graph placeholders used by the model.

    Returns:
        A tuple ``(inputs, targets, lr, keep_prob)``:
        ``inputs`` and ``targets`` are 2-D int32 placeholders with both
        dimensions left unspecified; ``lr`` and ``keep_prob`` are float32
        placeholders created without an explicit shape.
    """
    unspecified_2d = [None, None]
    return (
        tf.placeholder(tf.int32, unspecified_2d, name = 'input'),
        tf.placeholder(tf.int32, unspecified_2d, name = 'target'),
        tf.placeholder(tf.float32, name = 'learning_rate'),
        tf.placeholder(tf.float32, name = 'keep_prob'),
    )
 
# Preprocessing the targets
def preprocess_targets(targets, word2int, batch_size):
    """Build the decoder input from the targets.

    The last column of ``targets`` is dropped and a column filled with the
    ``<SOS>`` id is put in front, so the width stays the same.

    Args:
        targets: 2-D tensor of target token ids.
        word2int: Mapping that contains the ``'<SOS>'`` key.
        batch_size: Number of rows to slice and to create for the SOS column.

    Returns:
        The concatenation ``[SOS column, targets without the last column]``
        along axis 1.
    """
    sos_column = tf.fill([batch_size, 1], word2int['<SOS>'])
    # Rows 0..batch_size, columns 0..(last - 1), step 1 in both directions.
    targets_without_last = tf.strided_slice(targets, [0, 0], [batch_size, -1], [1, 1])
    return tf.concat([sos_column, targets_without_last], 1)
 
# Creating the Encoder RNN
def encoder_rnn(rnn_inputs, rnn_size, num_layers, keep_prob, sequence_length):
    """Run the stacked, bidirectional LSTM encoder and return its state.

    Args:
        rnn_inputs: Embedded input sequence fed to the encoder.
        rnn_size: Size passed to ``BasicLSTMCell``.
        num_layers: How many times the dropout-wrapped cell is stacked.
        keep_prob: Keep probability applied to each cell's input.
        sequence_length: Forwarded to ``bidirectional_dynamic_rnn``.

    Returns:
        The second value returned by ``tf.nn.bidirectional_dynamic_rnn``
        (the encoder state); the first value (the outputs) is discarded.
    """
    base_cell = tf.contrib.rnn.BasicLSTMCell(rnn_size)
    cell_with_dropout = tf.contrib.rnn.DropoutWrapper(base_cell, input_keep_prob = keep_prob)
    # The same wrapped cell object is repeated for every layer, and the same
    # stacked cell is used for both directions.
    stacked_cell = tf.contrib.rnn.MultiRNNCell([cell_with_dropout] * num_layers)
    _, final_state = tf.nn.bidirectional_dynamic_rnn(
        cell_fw = stacked_cell,
        cell_bw = stacked_cell,
        inputs = rnn_inputs,
        sequence_length = sequence_length,
        dtype = tf.float32)
    return final_state
 
# Decoding the training set
def decode_training_set(encoder_state, decoder_cell, decoder_embedded_input, sequence_length, decoding_scope, output_function, keep_prob, batch_size):
    """Decode the embedded targets with attention (training path).

    Args:
        encoder_state: Encoder state; only element ``[0]`` is used.
        decoder_cell: RNN cell used by the decoder.
        decoder_embedded_input: Embedded decoder inputs.
        sequence_length: Forwarded to ``dynamic_rnn_decoder``.
        decoding_scope: Variable scope the decoder runs in.
        output_function: Callable applied to the (dropped-out) decoder output.
        keep_prob: Keep probability for the dropout on the decoder output.
        batch_size: First dimension of the zero-filled attention states.

    Returns:
        ``output_function`` applied to the decoder output after dropout.
    """
    cell_width = decoder_cell.output_size
    # Attention states are initialised as an all-zero [batch_size, 1, width] tensor.
    zero_attention = tf.zeros([batch_size, 1, cell_width])
    keys, values, score_fn, construct_fn = tf.contrib.seq2seq.prepare_attention(
        zero_attention,
        attention_option = "bahdanau",
        num_units = cell_width)
    train_decoder_fn = tf.contrib.seq2seq.attention_decoder_fn_train(
        encoder_state[0],
        keys,
        values,
        score_fn,
        construct_fn,
        name = "attn_dec_train")
    # Only the decoder output is needed; the two final states are ignored.
    rnn_output, _, _ = tf.contrib.seq2seq.dynamic_rnn_decoder(
        decoder_cell,
        train_decoder_fn,
        decoder_embedded_input,
        sequence_length,
        scope = decoding_scope)
    return output_function(tf.nn.dropout(rnn_output, keep_prob))
 
# Decoding the test/validation set
def decode_test_set(encoder_state, decoder_cell, decoder_embeddings_matrix, sos_id, eos_id, maximum_length, num_words, decoding_scope, output_function, keep_prob, batch_size):
    """Decode with attention in inference mode (test / validation path).

    Args:
        encoder_state: Encoder state; only element ``[0]`` is used.
        decoder_cell: RNN cell used by the decoder.
        decoder_embeddings_matrix: Embedding matrix handed to the inference
            decoder function.
        sos_id: Id of the start-of-sequence token.
        eos_id: Id of the end-of-sequence token.
        maximum_length: Maximum length handed to the inference decoder function.
        num_words: Vocabulary size handed to the inference decoder function.
        decoding_scope: Variable scope the decoder runs in.
        output_function: Output projection handed to the inference decoder function.
        keep_prob: Accepted for signature parity with the training decoder;
            not used in this function.
        batch_size: First dimension of the zero-filled attention states.

    Returns:
        The first value returned by ``dynamic_rnn_decoder`` (the predictions).
    """
    cell_width = decoder_cell.output_size
    # Attention states are initialised as an all-zero [batch_size, 1, width] tensor.
    zero_attention = tf.zeros([batch_size, 1, cell_width])
    keys, values, score_fn, construct_fn = tf.contrib.seq2seq.prepare_attention(
        zero_attention,
        attention_option = "bahdanau",
        num_units = cell_width)
    inference_decoder_fn = tf.contrib.seq2seq.attention_decoder_fn_inference(
        output_function,
        encoder_state[0],
        keys,
        values,
        score_fn,
        construct_fn,
        decoder_embeddings_matrix,
        sos_id,
        eos_id,
        maximum_length,
        num_words,
        name = "attn_dec_inf")
    # Only the predictions are needed; the two final states are ignored.
    predictions, _, _ = tf.contrib.seq2seq.dynamic_rnn_decoder(
        decoder_cell,
        inference_decoder_fn,
        scope = decoding_scope)
    return predictions
 
# Creating the Decoder RNN
def decoder_rnn(decoder_embedded_input, decoder_embeddings_matrix, encoder_state, num_words, sequence_length, rnn_size, num_layers, word2int, keep_prob, batch_size):
    """Build the decoder and return its training and inference outputs.

    Everything is created inside the ``"decoding"`` variable scope. The
    training decoder is built first; the scope is then switched to reuse so
    the inference decoder shares the same variables.

    Args:
        decoder_embedded_input: Embedded decoder inputs for training.
        decoder_embeddings_matrix: Embedding matrix used at inference time.
        encoder_state: State produced by the encoder.
        num_words: Number of output units of the final fully connected layer.
        sequence_length: Sequence length; ``sequence_length - 1`` is passed
            as the maximum length for inference decoding.
        rnn_size: Size passed to ``BasicLSTMCell``.
        num_layers: How many times the dropout-wrapped cell is stacked.
        word2int: Mapping that contains the ``'<SOS>'`` and ``'<EOS>'`` keys.
        keep_prob: Dropout keep probability.
        batch_size: Batch size forwarded to both decoders.

    Returns:
        A tuple ``(training_predictions, test_predictions)``.
    """
    with tf.variable_scope("decoding") as decoding_scope:
        base_cell = tf.contrib.rnn.BasicLSTMCell(rnn_size)
        cell_with_dropout = tf.contrib.rnn.DropoutWrapper(base_cell, input_keep_prob = keep_prob)
        decoder_cell = tf.contrib.rnn.MultiRNNCell([cell_with_dropout] * num_layers)
        weights = tf.truncated_normal_initializer(stddev = 0.1)
        biases = tf.zeros_initializer()

        def output_function(x):
            # Fully connected projection onto the vocabulary; the third
            # positional argument (None) is passed in the activation slot.
            return tf.contrib.layers.fully_connected(x,
                                                     num_words,
                                                     None,
                                                     scope = decoding_scope,
                                                     weights_initializer = weights,
                                                     biases_initializer = biases)

        training_predictions = decode_training_set(
            encoder_state,
            decoder_cell,
            decoder_embedded_input,
            sequence_length,
            decoding_scope,
            output_function,
            keep_prob,
            batch_size)
        # From here on, variables in this scope are reused instead of re-created.
        decoding_scope.reuse_variables()
        test_predictions = decode_test_set(
            encoder_state,
            decoder_cell,
            decoder_embeddings_matrix,
            word2int['<SOS>'],
            word2int['<EOS>'],
            sequence_length - 1,
            num_words,
            decoding_scope,
            output_function,
            keep_prob,
            batch_size)
    return training_predictions, test_predictions
 
# Building the seq2seq model
def seq2seq_model(inputs, targets, keep_prob, batch_size, sequence_length, answers_num_words, questions_num_words, encoder_embedding_size, decoder_embedding_size, rnn_size, num_layers, questionswords2int):
    """Assemble the encoder and the decoder into the full seq2seq graph.

    Args:
        inputs: Token ids fed to the encoder.
        targets: Token ids the decoder is trained to produce.
        keep_prob: Dropout keep probability.
        batch_size: Batch size used by target preprocessing and the decoders.
        sequence_length: Sequence length forwarded to encoder and decoder.
        answers_num_words: ``answers_num_words + 1`` is the vocabulary size
            of the ENCODER embedding.
        questions_num_words: ``questions_num_words + 1`` is the row count of
            the DECODER embedding matrix; the value itself is the decoder's
            output size.
        encoder_embedding_size: Embedding dimension on the encoder side.
        decoder_embedding_size: Embedding dimension on the decoder side.
        rnn_size: LSTM size for both encoder and decoder.
        num_layers: Number of stacked cells for both encoder and decoder.
        questionswords2int: Mapping used for the ``<SOS>`` / ``<EOS>`` ids.

    Returns:
        A tuple ``(training_predictions, test_predictions)``.

    NOTE(review): the encoder is sized with the *answers* count and the
    decoder with the *questions* count and dictionary. This only lines up
    when both vocabularies have the same size and token ids -- confirm that
    this holds for the dictionaries passed in.
    """
    embedded_questions = tf.contrib.layers.embed_sequence(
        inputs,
        answers_num_words + 1,
        encoder_embedding_size,
        initializer = tf.random_uniform_initializer(0, 1))
    encoder_state = encoder_rnn(embedded_questions, rnn_size, num_layers, keep_prob, sequence_length)

    decoder_input_ids = preprocess_targets(targets, questionswords2int, batch_size)
    decoder_embeddings_matrix = tf.Variable(
        tf.random_uniform([questions_num_words + 1, decoder_embedding_size], 0, 1))
    embedded_decoder_input = tf.nn.embedding_lookup(decoder_embeddings_matrix, decoder_input_ids)

    return decoder_rnn(embedded_decoder_input,
                       decoder_embeddings_matrix,
                       encoder_state,
                       questions_num_words,
                       sequence_length,
                       rnn_size,
                       num_layers,
                       questionswords2int,
                       keep_prob,
                       batch_size)

### Creating placeholders for the inputs and the targets

- All the inputs that are fed into the TensorFlow graph must be defined by a TF placeholder.
- Placeholders are more advanced data structures that stand in for tensors (and other features); their actual values are supplied when the graph is run.

### Preprocessing the targets

- preprocessing of targets is necessary because the decoder will accept only a certain format of targets
- The format of targets is 2 fold:
    1. The targets have to be in batches (eg: batches of 10 answers are fed in the neural network)
    2. Each answer in the batch must start with SOS token.
- If we add SOS at the beginning of each answer, we have to remove the last token of each answer so that the length stays the same. We do this by slicing off the last column and concatenating the SOS column in front.


### Creating the Encoder RNN

- rnn_size is the size (number of units) of each LSTM cell in the encoder layer
- encoder cell contains various LSTM layers
- The bidirectional RNN function returns two values, encoder_output and encoder_state. We only need the encoder state, so encoder_rnn returns encoder_state and leaves encoder_output unused.


- Embeddings: Embedding means converting the words into vector format
- variable_scope: variable scope is a advanced data-structure that wraps the tensor flow variable 

### Decoding the training set

- step 1: Initializing attention_states as a 3D matrix containing only zeros.
- attention keys = keys to be compared with the target state.
- attention values = Values that we will use to construct the context vectors (context is returned by encoder and used by decoder)
- attention score = It is used to compare the similarities between keys and the target states.
- attention construct = It is used to build the attention state.
- We get decoder_output from dynamic_rnn_decoder. The dynamic_rnn_decoder gives out 3 values: decoder_output, final_state and the final_context_state of the decoder. We only use decoder_output; the other two values are not needed.


### Decoding the test/validation set

- for validation and test set we are using attention_decoder_fn_inference function
- We are using this function because we want to deduce logically the answer to the question so that the chatbot can answer on his own based on the logic it has developed.
- We are taking 4 new arguments [sos_id, eos_id, maximum_length, num_words] for the attention_decoder_fn_inference function


### Creating the Decoder RNN

- We are making this decoding rnn layer in the decoding scope
- Then we create the LSTM layer
- for fully connected layer, x is the input, num_words ie the num of words in answer will be the output
- the fully connected layer will take the features from the stacked lstm layers and will return the final scores
- By using softmax, we will return the final answer
- In test_predictions, we pass sequence_length - 1 as the maximum length in order to exclude the last token


### Building the final Seq2Seq model

- In building the seq2seq layer, we will assemble the encoder rnn and decoder rnn together
- The function seq2seq model is the brain of our chatbot
- questionsword2int dictionary is used to preprocess the targets
- The function will return training predictions and test predictions
- the encoder will give the encoder state and the decoder will give the training and test predictions
- embed_sequence returns the embedded input of the encoder
- encoder state is the output of the encoder and input to the decoder



In [0]:
# Setting the hyperparameters

epochs = 100  # presumably the number of training epochs; the training loop is not part of this section
batch_size = 64  # sequences per batch; passed to seq2seq_model()
rnn_size = 512  # size passed to every BasicLSTMCell in encoder and decoder
num_layers = 3  # how many times the LSTM cell is stacked in MultiRNNCell
encoding_embedding_size = 512  # embedding dimension on the encoder side
decoding_embedding_size = 512  # number of columns of the decoder embedding matrix
learning_rate = 0.01  # base learning rate for the Adam optimizer
learning_rate_decay = 0.9  # decay factor for the learning rate; presumably applied in the training loop -- TODO confirm
minimum_learning_rate = 0.0001  # floor the decayed learning rate should not go below
keep_probability = 0.5  # dropout keep probability (1 - dropout rate); presumably fed to the keep_prob placeholder -- TODO confirm

### Setting the hyperparameters

- The dropout is used to avoid overfitting. Dropout is applied only while training the neural network. During test and validation, all the neurons are used.
- The keep_prob (p) value is used to control the dropout rate: p = 1 - dropout
- According to the paper by Geoffrey Hinton, a dropout of 20% in the input layer and 50% in the hidden layers is optimal.
- learning_rate_decay = the rate at which the learning rate will reduce from 0.01 so that the model does in-depth training
- minimum_learning_rate = because of the decay, the learning rate should not go below 0.0001
- encoding_embedding_size = number of columns in your embedding matrix
- decoding_embedding_size = number of columns in your decoding matrix.

In [0]:
# Defining a session

# Start from an empty default graph (so re-running the notebook does not add
# the ops a second time), then open the session the rest of the notebook uses.
tf.reset_default_graph()
session = tf.InteractiveSession()

### Defining a session

- Here we have defined a tf session on which all the tf training will be run
- For creating a session, we will first reset the tf graph.

In [0]:
# Loading the model inputs

# Placeholders, in the order model_inputs() returns them:
# input token ids, target token ids, learning rate, dropout keep probability.
inputs, targets, lr, keep_prob = model_inputs()

### Loading the model inputs

- inputs = Questions that we will feed in the model
- targets = Answers that we will receive for the questions
- lr = learning rate
- keep_prob = parameter to control the dropout

In [0]:
# Setting the sequence length
# Scalar placeholder that evaluates to 25 unless another value is fed.
sequence_length = tf.placeholder_with_default(25, None, name = 'sequence_length')
 
# Dynamic shape of the inputs tensor; its first entry is used later to size
# the weights tensor of the loss.
input_shape = tf.shape(inputs)

### Setting the Sequence length variable

- arguments of sequence length:
    1. input = 25 is the max length of the sequence
    2. shape = tf shape ie list of integer which is set to None as the sequence_length is just a value and not a tensor.
    3. name =  name of sequence length
    
- sequence_length = 25 means in training we will not use questions and answers with length more than 25

### Getting the shape of input tensor

- The shape of input tensor will be an argument for one specific function used for training
- The function is tf.ones, and the shape of the tensor it creates is taken from the input shape

In [0]:
# Build the seq2seq graph and keep its two outputs.
# The questions are reversed along their last axis before being encoded.
training_predictions, test_predictions = seq2seq_model(tf.reverse(inputs, [-1]),
                                                       targets,
                                                       keep_prob,
                                                       batch_size,
                                                       sequence_length,
                                                       len(answerswords2int),  # -> answers_num_words
                                                       len(questionswords2int),  # -> questions_num_words
                                                       encoding_embedding_size,
                                                       decoding_embedding_size,
                                                       rnn_size,
                                                       num_layers,
                                                       questionswords2int)

### Getting the training and Test predictions

- Here we are trying to get the training and test predictions when we are loading the model inputs value in the neural network (see above)
- The training predictions and test predictions are not the same as the local variables used in the seq2seq function.
- These are real variables used in the training later on.
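- The inputs are not in the order we want to feed to the encoder. Hence we use tf.reverse to reverse them.
- The tf.reverse function reverses a tensor along the given dimensions; here [-1] reverses the last dimension, i.e. the order of the words in each question.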

In [0]:
#Setting up the loss error, the optimizer and gradient clipping

with tf.name_scope("optimization"):
  # Sequence loss between the training predictions and the targets; the
  # weights tensor is all ones, so every position counts equally.
  loss_error = tf.contrib.seq2seq.sequence_loss(training_predictions, targets, tf.ones([input_shape[0], sequence_length]))
  # Use the `lr` placeholder rather than the Python constant `learning_rate`:
  # with the constant baked into the graph, the value fed for `lr` (and any
  # decay applied to it between steps) had no effect on the optimizer.
  # `lr` has no default, so it must be fed whenever the training op is run.
  optimizer = tf.train.AdamOptimizer(lr)
  gradients = optimizer.compute_gradients(loss_error)
  # Clip every gradient element to [-5, 5]; variables without a gradient are skipped.
  clipped_gradients = [(tf.clip_by_value(grad_tensor, -5, 5), grad_variable) for grad_tensor, grad_variable in gradients if grad_tensor is not None]
  optimizer_gradient_clipping = optimizer.apply_gradients(clipped_gradients)

### Setting up the loss error, the optimizer and gradient clipping

- Gradient clipping is a technique to keep the gradients in the graph between a minimum value and a maximum value (here -5 and 5) in order to avoid the exploding gradient issue
- The loss error chosen is the weighted cross_entropy loss error as it is the best loss error for sequences
- The loss error will be calculated between training_predictions and targets
- The gradient in the graph is attached to a variable
 

In [0]:
# Padding the sequence with <PAD> token

def apply_padding(batch_of_sequences, word2count):
  """Right-pad every sequence in a batch to the length of the longest one.

  Args:
    batch_of_sequences: List of lists of token ids.
    word2count: Mapping that contains the ``"<PAD>"`` key. Despite the
      parameter name, the value stored under that key is what gets appended,
      so callers pass a word -> id dictionary here.

  Returns:
    A new list of new lists, all of equal length. The input sequences are
    not modified. An empty batch yields an empty list.
  """
  # default=0 keeps an empty batch from raising ValueError in max().
  max_sequence_length = max((len(sequence) for sequence in batch_of_sequences), default = 0)
  pad_id = word2count["<PAD>"]
  return [sequence + [pad_id] * (max_sequence_length - len(sequence)) for sequence in batch_of_sequences]

### Padding the sequence with <PAD> token

- We apply padding to the sequences with pad token
- Padding is done because all the sentences in a batch, whether they are questions or answers, must have the same length
- First we find the max length of the sequences present in the batch
- Then we extend every shorter sequence up to that max length
- list of pad tokens is added to list of sequences

In [0]:
# Split the data into batches of questions and answers

def split_into_batches(questions, answers, batch_size):
  """Yield padded (questions, answers) batches as NumPy arrays.

  Only complete batches are produced: trailing items that do not fill a
  whole batch of `batch_size` are left out. Questions are padded with the
  "<PAD>" id from `questionswords2int`, answers with the one from
  `answerswords2int`.
  """
  number_of_batches = len(questions) // batch_size
  for first in range(0, number_of_batches * batch_size, batch_size):
    last = first + batch_size
    question_batch = questions[first:last]
    answer_batch = answers[first:last]
    padded_question_batch = np.array(apply_padding(question_batch, questionswords2int))
    padded_answer_batch = np.array(apply_padding(answer_batch, answerswords2int))
    yield padded_question_batch, padded_answer_batch

### Split the data into batches of questions and answers


- We have to create batches based on total number of questions
- Hence we have to figure out what will be the total number of batches
- Once the batch is formed, we will apply (apply_padding) so that all the questions and answers are of the same size.
- We have to keep a track of which batch we are on. Hence we make use of batch_index
- // (integer division) is used because the number of batches must be an integer
- start_index will be the index of the first question we are adding in the batch
- for eg: batch_size = 64, batch_index = 0
- Therefore, start_index = 0 * 64 hence first batch will have questions from 0-63
- Next batch: start_index = 1 * 64 = 64, so it will have questions from 64 to 127
- yield is similar to return, but it turns the function into a generator that hands back one batch at a time, so we never have to build all the batches in memory at once

In [0]:
# Splitting the questions and answers to training and validation set

# Hold out the first 15% of the sorted question/answer pairs for validation
# and train on everything after the split index.
training_validation_split = int(0.15 * len(sorted_clean_questions))
validation_questions, training_questions = (
    sorted_clean_questions[:training_validation_split],
    sorted_clean_questions[training_validation_split:],
)
validation_answers, training_answers = (
    sorted_clean_answers[:training_validation_split],
    sorted_clean_answers[training_validation_split:],
)

### Splitting the questions and answers to training and validation set

- The training set will be 85% of questions and answers and the validation set will be 15% of the questions and answers
- Training questions and answers will be all the questions and answers after training_validation_split
- Validation questions and answers will be all the 15% questions and answers before the split 

In [0]:
# Training

# Train the model with periodic training-loss reports, periodic validation,
# learning-rate decay and early stopping. The best weights are saved to `checkpoint`.
batch_index_check_training_loss = 100
# Validation interval in batches (about half an epoch). Clamped to at least 1 so the
# modulo below cannot divide by zero when there are only a few training batches.
batch_index_check_validation_loss = max(1, ((len(training_questions)) // batch_size // 2) - 1)
number_of_training_batches = len(training_questions) // batch_size
# split_into_batches only yields complete batches, so this is the real batch count.
number_of_validation_batches = len(validation_questions) // batch_size
total_training_loss_error = 0
training_batches_since_report = 0
list_validation_loss_error = []
early_stopping_check = 0
early_stopping_stop = 1000
checkpoint = "chatbot_weights.ckpt"
session.run(tf.global_variables_initializer())
# Build the saver once; creating it on every save would keep adding ops to the graph.
saver = tf.train.Saver()
for epoch in range(1, epochs + 1):
  for batch_index, (padded_questions_in_batch, padded_answers_in_batch) in enumerate(split_into_batches(training_questions, training_answers, batch_size)):
    starting_time = time.time()
    _, batch_training_loss_error = session.run([optimizer_gradient_clipping, loss_error],
                                               {inputs: padded_questions_in_batch,
                                                targets: padded_answers_in_batch,
                                                lr: learning_rate,
                                                sequence_length: padded_answers_in_batch.shape[1],
                                                keep_prob: keep_probability})
    total_training_loss_error += batch_training_loss_error
    training_batches_since_report += 1
    ending_time = time.time()
    batch_time = ending_time - starting_time
    # Report the training loss every `batch_index_check_training_loss` batches
    # (not on the validation interval), averaged over the batches actually accumulated.
    if batch_index % batch_index_check_training_loss == 0:
      print('Epoch: {:>3}/{}, Batch: {:>4}/{}, Training Loss Error: {:>6.3f}, Training Time on 100 Batches: {:d} seconds'.format(epoch,
                                                                                                                                 epochs,
                                                                                                                                 batch_index,
                                                                                                                                 number_of_training_batches,
                                                                                                                                 total_training_loss_error / training_batches_since_report,
                                                                                                                                 int(batch_time * batch_index_check_training_loss)))
      total_training_loss_error = 0
      training_batches_since_report = 0
    if batch_index % batch_index_check_validation_loss == 0 and batch_index > 0:
      total_validation_loss_error = 0
      starting_time = time.time()
      for batch_index_validation, (padded_validation_questions, padded_validation_answers) in enumerate(split_into_batches(validation_questions, validation_answers, batch_size)):
        # Only the loss is evaluated (no optimizer step) and dropout is disabled.
        batch_validation_loss_error = session.run(loss_error, {inputs: padded_validation_questions,
                                                               targets: padded_validation_answers,
                                                               lr: learning_rate,
                                                               sequence_length: padded_validation_answers.shape[1],
                                                               keep_prob: 1})
        # Accumulate into the validation total; adding to the training total left
        # the validation loss stuck at 0 and polluted the training average.
        total_validation_loss_error += batch_validation_loss_error
      ending_time = time.time()
      batch_time = ending_time - starting_time
      average_validation_loss_error = total_validation_loss_error / max(1, number_of_validation_batches)
      print("Validation Loss Error: {:>6.3f}, Batch Validation Time: {:d} seconds".format(average_validation_loss_error, int(batch_time)))
      # Decay is multiplicative (assumes learning_rate_decay is a factor below 1);
      # the rate is never allowed to fall below minimum_learning_rate.
      learning_rate *= learning_rate_decay
      if learning_rate < minimum_learning_rate:
        learning_rate = minimum_learning_rate
      list_validation_loss_error.append(average_validation_loss_error)
      if average_validation_loss_error <= min(list_validation_loss_error):
        # New best validation loss: reset the patience counter and save the weights.
        print("I speak better now !!")
        early_stopping_check = 0
        saver.save(session, checkpoint)
      else:
        print("I do not speak better, I need to practice more.")
        early_stopping_check += 1
        if early_stopping_check == early_stopping_stop:
          break
  # Checked at epoch level: the break above only leaves the batch loop, so this
  # is what actually stops the remaining epochs once patience is exhausted.
  if early_stopping_check == early_stopping_stop:
    print("My apologies, I cannot speak better anymore. This is the best I can do.")
    break
print("Game Over")




### Training the chatbot


- We will check the training loss per 100 batches.
- The list_validation_loss_errors is used coz we will be using the early stopping technique which consists of checking if we have reached the min of all the losses that we get.
- Whenever the validation loss does not improve, the (early_stopping_check) counter will increase by 1
- If early_stopping_check reaches early_stopping_stop value then the entire operation will be stopped.
- checkpoint = file containing the weights
- First we iterate through each epoch and then in each epoch we iterate through each batch to get padded questions and answers
- After starting_time, we will get the training loss error of the single batch
- This training_loss_error will be added to total training loss error.
- batch_time = training time of a single batch
- We will now calculate the average of the training_loss_errors on 100 batches and we will print that error to keep track of training loss error 
- Next we calculate the average loss error for the validation set
- We initialized total_validation_loss_error = 0
- Since we are not doing any training with the validation set, we do not run optimizer_gradient_clipping; we only evaluate loss_error
- Finally we will compute the average of validation losses
- {:>6.3f} = float value right-aligned in a field at least 6 characters wide, with 3 decimals
- Next we will apply decay to our learning rate
- Next, we have to take care of early stopping
- if average_validation_loss_error <= min(list_validation_loss_error): It means the error is reducing and there is an improvement, and hence the chatbot will print the first sentence ("I speak better now !!"). Then we reset early stopping check to 0
- Then we save the model

In [0]:
# Loading the Weights and Running the session

# Open a new interactive session and load the saved weights into it.
# The path points at a checkpoint file in the current working directory.
checkpoint = "./chatbot_weights.ckpt"
session = tf.InteractiveSession()
# Initialize every global variable first; restore() below then loads the saved values.
session.run(tf.global_variables_initializer())
saver = tf.train.Saver()
# Fails with NotFoundError when the checkpoint cannot be found
# (see the recorded output of this cell).
saver.restore(session, checkpoint)

NotFoundError: ignored

In [0]:
# Converting the questions from strings to list of Integers

def convert_word2int(question, word2int):
  """Translate a raw question string into a list of integer token ids.

  The question is first normalized with `clean_text` and split on whitespace.
  Each word is looked up in `word2int`; words missing from the dictionary
  are replaced by the id stored under "<OUT>".
  """
  token_ids = []
  for word in clean_text(question).split():
    token_ids.append(word2int.get(word, word2int["<OUT>"]))
  return token_ids

### Converting the questions from strings to list of Integers

- First we clean the questions
- Return a list of integers after splitting the questions into words. And get the int value for out if the word is not in the word2int list

In [0]:
# Setting up the chat

# Interactive chat: read a question, run it through the network, print the answer.
# Typing exactly "Goodbye" or "Bye" ends the conversation.
max_question_length = 20  # every question is padded (or cut) to this many tokens
while True:
  question = input("You: ")
  if question == "Goodbye" or question == "Bye":
    print("Chatbot: Bye. Have a wonderful day!")
    break
  # Convert the words to integers, then force the fixed length: longer questions
  # are truncated (they would not fit into the batch row), shorter ones are padded.
  question_ids = convert_word2int(question, questionswords2int)[:max_question_length]
  question_ids = question_ids + [questionswords2int["<PAD>"]] * (max_question_length - len(question_ids))
  # The network only accepts full batches, so the question goes into row 0 of an
  # otherwise all-zero batch.
  fake_batch = np.zeros((batch_size, max_question_length))
  fake_batch[0] = question_ids
  # keep_prob is 1 at inference time so dropout does not randomly alter the answer.
  predicted_answer = session.run(test_predictions, {inputs: fake_batch, keep_prob: 1})[0]
  # Build the answer token by token: "i" becomes "I", <EOS> becomes a full stop
  # (which ends the answer) and <OUT> becomes "out".
  answer = ""
  for i in np.argmax(predicted_answer, 1):
    if answersints2word[i] == "i":
      token = " I"
    elif answersints2word[i] == "<EOS>":
      token = "."
    elif answersints2word[i] == "<OUT>":
      token = " out"
    else:
      token = " " + answersints2word[i]
    answer += token
    if token == ".":
      break
  # Every word token starts with a space; drop the leading one before printing.
  print("Chatbot: " + answer.lstrip())
    
     





You: Bye
Chatbot: Bye. Have a wonderful day!


- Argmax returns, for each position in the predicted answer, the token id with the highest score