In [0]:
# Building a ChatBot with Deep NLP
 
# Importing the libraries
import numpy as np
import tensorflow as tf
import re
import time
from tensorflow.python.layers.core import Dense
from google.colab import files
 
 
########## PART 1 - DATA PREPROCESSING ##########
 
 
 
# Importing the dataset
lines = open('movie_lines.txt', encoding = 'utf-8', errors = 'ignore').read().split('\n')
conversations = open('movie_conversations.txt', encoding = 'utf-8', errors = 'ignore').read().split('\n')
 
# Creating a dictionary that maps each line and its id
id2line = {}
for line in lines:
    _line = line.split(' +++$+++ ')
    if len(_line) == 5:
        id2line[_line[0]] = _line[4]
 
# Creating a list of all of the conversations
conversations_ids = []
for conversation in conversations[:-1]:
    _conversation = conversation.split(' +++$+++ ')[-1][1:-1].replace("'", "").replace(" ", "")
    conversations_ids.append(_conversation.split(','))
 
# Getting separately the questions and the answers
questions = []
answers = []
for conversation in conversations_ids:
    for i in range(len(conversation) - 1):
        questions.append(id2line[conversation[i]])
        answers.append(id2line[conversation[i+1]])
 
# Doing a first cleaning of the texts
def clean_text(text):
    text = text.lower()
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"she's", "she is", text)
    text = re.sub(r"that's", "that is", text)
    text = re.sub(r"what's", "what is", text)
    text = re.sub(r"where's", "where is", text)
    text = re.sub(r"how's", "how is", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"n't", " not", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can't", "cannot", text)
    text = re.sub(r"[-()\"#/@;:<>{}`+=~|.!?,]", "", text)
    return text
 
# Cleaning the questions
clean_questions = []
for question in questions:
    clean_questions.append(clean_text(question))
 
# Cleaning the answers
clean_answers = []
for answer in answers:
    clean_answers.append(clean_text(answer))
 
# Filtering out the questions and answers that are too short or too long
short_questions = []
short_answers = []
i = 0
for question in clean_questions:
    if 2 <= len(question.split()) <= 25:
        short_questions.append(question)
        short_answers.append(clean_answers[i])
    i += 1
clean_questions = []
clean_answers = []
i = 0
for answer in short_answers:
    if 2 <= len(answer.split()) <= 25:
        clean_answers.append(answer)
        clean_questions.append(short_questions[i])
    i += 1
 
# Creating a dictionary that maps each word to its number of occurrences
word2count = {}
for question in clean_questions:
    for word in question.split():
        if word not in word2count:
            word2count[word] = 1
        else:
            word2count[word] += 1
for answer in clean_answers:
    for word in answer.split():
        if word not in word2count:
            word2count[word] = 1
        else:
            word2count[word] += 1
 
# Creating two dictionaries that map the questions words and the answers words to a unique integer
threshold_questions = 15
questionswords2int = {}
word_number = 0
for word, count in word2count.items():
    if count >= threshold_questions:
        questionswords2int[word] = word_number
        word_number += 1
threshold_answers = 15
answerswords2int = {}
word_number = 0
for word, count in word2count.items():
    if count >= threshold_answers:
        answerswords2int[word] = word_number
        word_number += 1
 
# Adding the last tokens to these two dictionaries
tokens = ['<PAD>', '<EOS>', '<OUT>', '<SOS>']
for token in tokens:
    questionswords2int[token] = len(questionswords2int) + 1
for token in tokens:
    answerswords2int[token] = len(answerswords2int) + 1
 
# Creating the inverse dictionary of the answerswords2int dictionary
answersints2word = {w_i: w for w, w_i in answerswords2int.items()}
 
# Adding the End Of String token to the end of every answer
for i in range(len(clean_answers)):
    clean_answers[i] += ' <EOS>'
 
# Translating all the questions and the answers into integers
# and Replacing all the words that were filtered out by <OUT> 
questions_into_int = []
for question in clean_questions:
    ints = []
    for word in question.split():
        if word not in questionswords2int:
            ints.append(questionswords2int['<OUT>'])
        else:
            ints.append(questionswords2int[word])
    questions_into_int.append(ints)
answers_into_int = []
for answer in clean_answers:
    ints = []
    for word in answer.split():
        if word not in answerswords2int:
            ints.append(answerswords2int['<OUT>'])
        else:
            ints.append(answerswords2int[word])
    answers_into_int.append(ints)
 
# Sorting questions and answers by the length of questions
sorted_clean_questions = []
sorted_clean_answers = []
for length in range(1, 25 + 1):
    for i in enumerate(questions_into_int):
        if len(i[1]) == length:
            sorted_clean_questions.append(questions_into_int[i[0]])
            sorted_clean_answers.append(answers_into_int[i[0]])

In [0]:
########## PART 2 - BUILDING THE SEQ2SEQ MODEL ##########
 
 
 
# Creating placeholders for the inputs and the targets
def model_inputs():
    inputs = tf.placeholder(tf.int32, [None, None], name = 'input')
    targets = tf.placeholder(tf.int32, [None, None], name = 'target')
    lr = tf.placeholder(tf.float32, name = 'learning_rate')
    keep_prob = tf.placeholder(tf.float32, name = 'keep_prob')
    target_seq_len=tf.placeholder(tf.int32,[None],name="target_sequence_length")
    max_target_seq_len=tf.reduce_max(target_seq_len)
    source_seq_len=tf.placeholder(tf.int32,[None],name="source_sequence_length")
    return inputs,targets,lr,keep_prob,target_seq_len,max_target_seq_len,source_seq_len
 
# Preprocessing the targets
def preprocess_targets(targets, word2int, batch_size):
    left_side = tf.fill([batch_size, 1], word2int['<SOS>'])
    right_side = tf.strided_slice(targets, [0,0], [batch_size, -1], [1,1])
    preprocessed_targets = tf.concat([left_side, right_side], 1)
    return preprocessed_targets
 
# Creating the Encoder RNN
def encoding_layer(rnn_inputs, rnn_size, num_layers, keep_prob, 
                   source_sequence_length, source_vocab_size, 
                   encoding_embedding_size):
    embed_input = tf.contrib.layers.embed_sequence(rnn_inputs,
                                                   vocab_size=source_vocab_size,
                                                   embed_dim=encoding_embedding_size)
    
    def lstm_cell():
        cell = tf.nn.rnn_cell.LSTMCell(num_units=rnn_size,name='basic_lstm_cell')
        lstm_cell=tf.contrib.rnn.DropoutWrapper(cell,output_keep_prob=keep_prob)
        return lstm_cell
    
    stacked_lstm = tf.contrib.rnn.MultiRNNCell(
                    [lstm_cell() for _ in range(num_layers)])
    enc_output,enc_FinalState = tf.nn.dynamic_rnn(stacked_lstm,
                                          embed_input,
                                          source_sequence_length,
                                          dtype=tf.float32)
    return enc_output,enc_FinalState


In [0]:
def decoding_layer_train(encoder_output,encoder_state, dec_cell, dec_embed_input, 
                         source_sequence_length,target_sequence_length, max_summary_length, 
                         output_layer, rnn_size,keep_prob,beam_width):
    
    with tf.variable_scope("myScope"):

        attention_mechanism = tf.contrib.seq2seq.LuongAttention(num_units = rnn_size,
                                                                memory = encoder_output,
                                                                memory_sequence_length = source_sequence_length)
        dec_cell = tf.contrib.seq2seq.AttentionWrapper(
        cell = dec_cell,
        attention_mechanism = attention_mechanism,
        attention_layer_size = rnn_size/2)

        attn_zero = dec_cell.zero_state(batch_size,tf.float32)
        attn_zero = attn_zero.clone(cell_state=encoder_state)

        # Helper for the training process. Used by BasicDecoder to read inputs.
        training_helper = tf.contrib.seq2seq.TrainingHelper(inputs=dec_embed_input,
                                                            sequence_length=target_sequence_length,
                                                            time_major=False)


        # Basic decoder
        training_decoder = tf.contrib.seq2seq.BasicDecoder(dec_cell,
                                                           training_helper,
                                                           attn_zero,
                                                           output_layer) 

        # Perform dynamic decoding using the decoder
        training_decoder_output= tf.contrib.seq2seq.dynamic_decode(training_decoder,
                                                                       impute_finished=True,
                                                                       output_time_major=False,
                                                                       maximum_iterations=max_summary_length)[0]

        return training_decoder_output

In [0]:
def decoding_layer_infer(encoder_output,encoder_state, dec_cell, dec_embeddings, start_of_sequence_id,
                         end_of_sequence_id, source_sequence_length,max_target_sequence_length,
                         vocab_size, output_layer, batch_size,rnn_size, keep_prob,beam_width):
    with tf.variable_scope("myScope",reuse=True):
        tiled_encoder_outputs = tf.contrib.seq2seq.tile_batch(
        encoder_output, multiplier=beam_width)
        tiled_encoder_final_state = tf.contrib.seq2seq.tile_batch(
            encoder_state, multiplier=beam_width)
        tiled_sequence_length = tf.contrib.seq2seq.tile_batch(
            source_sequence_length, multiplier=beam_width)
        
        attention_mechanism_beam = tf.contrib.seq2seq.LuongAttention(num_units = rnn_size,
                                                                     memory = tiled_encoder_outputs,
                                                                     memory_sequence_length = tiled_sequence_length)
        dec_cell_beam = tf.contrib.seq2seq.AttentionWrapper(
        cell = dec_cell,
        attention_mechanism = attention_mechanism_beam,
        attention_layer_size = rnn_size/2)

        attn_zero_beam = dec_cell_beam.zero_state(batch_size * beam_width,tf.float32)
        attn_zero_beam = attn_zero_beam.clone(cell_state=tiled_encoder_final_state)

        start_of_sequence_ids = tf.tile([start_of_sequence_id], [batch_size])

        # Basic decoder
        inference_decoder = tf.contrib.seq2seq.BeamSearchDecoder( dec_cell_beam,
                                                           embedding = dec_embeddings,
                                                           start_tokens = start_of_sequence_ids,
                                                           end_token = end_of_sequence_id,
                                                           initial_state = attn_zero_beam,
                                                           beam_width = beam_width
                                                           ,output_layer=output_layer)

        # Perform dynamic decoding using the decoder
        inference_decoder_output,t1,t2 = tf.contrib.seq2seq.dynamic_decode(inference_decoder,               
                                                            maximum_iterations=max_target_sequence_length)

        beam_logits = tf.no_op()
        return inference_decoder_output

In [0]:
def decoding_layer(dec_input,encoder_output, encoder_state,
                   source_sequence_length,target_sequence_length,
                   max_target_sequence_length,rnn_size,
                   num_layers, target_vocab_to_int, target_vocab_size,
                   batch_size, keep_prob, decoding_embedding_size,beam_width):
    dec_embeddings = tf.Variable(tf.random_uniform([target_vocab_size, decoding_embedding_size]))
    dec_embed_input = tf.nn.embedding_lookup(dec_embeddings, dec_input)
    
    def lstm_cell():
        cell = tf.nn.rnn_cell.LSTMCell(num_units=rnn_size,name='basic_lstm_cell')
        
        lstm_cell=tf.contrib.rnn.DropoutWrapper(cell,output_keep_prob=keep_prob)
        return lstm_cell
    
    dec_cell = tf.contrib.rnn.MultiRNNCell(
                    [lstm_cell() for _ in range(num_layers)])
    output_layer = Dense(target_vocab_size,
                         kernel_initializer = tf.truncated_normal_initializer(mean = 0.0, stddev=0.1))

    with tf.variable_scope("decode") as scope:
        training_decoder_output = decoding_layer_train(encoder_output,encoder_state, dec_cell, dec_embed_input,
                                                       source_sequence_length,target_sequence_length, max_target_sequence_length, 
                                                       output_layer, rnn_size,keep_prob,beam_width)
    
        inference_decoder_output = decoding_layer_infer(encoder_output,encoder_state, dec_cell, dec_embeddings, target_vocab_to_int['<SOS>'], 
                                                        target_vocab_to_int['<EOS>'],source_sequence_length, max_target_sequence_length,
                                                        target_vocab_size, output_layer, batch_size, rnn_size, keep_prob,beam_width)
    
    return training_decoder_output, inference_decoder_output



In [0]:
def seq2seq_model(input_data, target_data, keep_prob, batch_size,
                  source_sequence_length, target_sequence_length,
                  max_target_sentence_length,
                  source_vocab_size, target_vocab_size,
                  enc_embedding_size, dec_embedding_size,
                  rnn_size, num_layers, target_vocab_to_int,beam_width):
    encoder_output, enc_state = encoding_layer(input_data, 
                                  rnn_size, 
                                  num_layers, 
                                  keep_prob,
                                  source_sequence_length,
                                  source_vocab_size, 
                                  enc_embedding_size)
    
    # Prepare the target sequences we'll feed to the decoder in training mode
    dec_input = preprocess_targets(target_data, target_vocab_to_int, batch_size)
    
    # Pass encoder state and decoder inputs to the decoders
    training_decoder_output, inference_decoder_output = decoding_layer(dec_input,
                                                                       encoder_output,
                                                                       enc_state,
                                                                       source_sequence_length,
                                                                       target_sequence_length,
                                                                       max_target_sentence_length,
                                                                       rnn_size,
                                                                       num_layers, 
                                                                       target_vocab_to_int,
                                                                       target_vocab_size,
                                                                       batch_size,
                                                                       keep_prob,
                                                                       dec_embedding_size,
                                                                       beam_width) 
    
    return training_decoder_output, inference_decoder_output

In [0]:
# Setting the Hyperparameters
epochs = 20
batch_size = 128
rnn_size = 1024
num_layers = 3
beam_width=5
encoding_embedding_size = 1024
decoding_embedding_size = 1024
learning_rate = 0.001
learning_rate_decay = 0.9
min_learning_rate = 0.0001
keep_probability = 0.5
max_target_sentence_length = 25

In [0]:
# Defining a session
train_graph = tf.Graph()
with train_graph.as_default():
 
      # Loading the model inputs
      inputs,targets,lr,keep_prob,target_seq_len,max_target_seq_len,source_seq_len = model_inputs()

      # Getting the shape of the inputs tensor
      input_shape = tf.shape(inputs)

      # Getting the training and test predictions
      training_predictions, test_predictions = seq2seq_model(tf.reverse(inputs, [-1]),
                                                         targets,
                                                         keep_prob,
                                                         batch_size,
                                                         source_seq_len,
                                                         target_seq_len,
                                                         max_target_seq_len,
                                                         len(answerswords2int),
                                                         len(questionswords2int),
                                                         encoding_embedding_size,
                                                         decoding_embedding_size,
                                                         rnn_size,
                                                         num_layers,
                                                         questionswords2int,
                                                         beam_width)
      training_logits = tf.identity(training_predictions.rnn_output, name='logits')
      inference_logits = tf.identity(test_predictions.predicted_ids, name='predictions')

      masks = tf.sequence_mask(target_seq_len, max_target_seq_len, dtype=tf.float32, name='masks')

      
      with tf.name_scope("optimization"):
          # Loss function
          cost = tf.contrib.seq2seq.sequence_loss(
              training_logits,
              targets,
              masks)

          # Optimizer
          optimizer = tf.train.AdamOptimizer(lr)

          # Gradient Clipping
          gradients = optimizer.compute_gradients(cost)
          capped_gradients = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in gradients if grad is not None]
          train_op = optimizer.apply_gradients(capped_gradients)
      saver = tf.train.Saver()

In [0]:
def pad_sentence_batch(sentence_batch, pad_int):
    """Pad sentences with <PAD> so that each sentence of a batch has the same length"""
    max_sentence = max([len(sentence) for sentence in sentence_batch])
    return [sentence + [pad_int] * (max_sentence - len(sentence)) for sentence in sentence_batch]


def get_batches(sources, targets, batch_size, source_pad_int, target_pad_int):
    """Batch targets, sources, and the lengths of their sentences together"""
    for batch_i in range(0, len(sources)//batch_size):
        start_i = batch_i * batch_size

        # Slice the right amount for the batch
        sources_batch = sources[start_i:start_i + batch_size]
        targets_batch = targets[start_i:start_i + batch_size]

        # Pad
        pad_sources_batch = np.array(pad_sentence_batch(sources_batch, source_pad_int))
        pad_targets_batch = np.array(pad_sentence_batch(targets_batch, target_pad_int))

        # Need the lengths for the _lengths parameters
        pad_targets_lengths = []
        for target in pad_targets_batch:
            pad_targets_lengths.append(len(target))

        pad_source_lengths = []
        for source in pad_sources_batch:
            pad_source_lengths.append(len(source))

        yield pad_sources_batch, pad_targets_batch, pad_source_lengths, pad_targets_lengths


In [0]:
# Split data to training and validation sets
training_validation_split = int(len(sorted_clean_questions) * 0.15)
train_source = sorted_clean_questions[training_validation_split:]
train_target = sorted_clean_answers[training_validation_split:]
valid_source = sorted_clean_questions[:training_validation_split]
valid_target =sorted_clean_answers[:training_validation_split]
(valid_sources_batch, valid_targets_batch, valid_sources_lengths, valid_targets_lengths ) = next(get_batches(valid_source,
                                                                                                             valid_target,
                                                                                                             batch_size,
                                                                                                             questionswords2int['<PAD>'],
                                                                                                             answerswords2int['<PAD>']))                                                                                                  

In [0]:
# Training
batch_index_check_training_loss =500
batch_index_check_validation_loss = ((len(train_source)) // batch_size // 2) - 1
total_training_loss_error = 0
list_validation_loss_error = []
early_stopping_check = 0
early_stopping_stop = 100
checkpoint = "checkpoints/chatbot_weights.ckpt"

In [12]:
with tf.Session(graph=train_graph) as sess:
    sess.run(tf.global_variables_initializer())

    for epoch in range(epochs):
        for batch_i, (source_batch, target_batch, sources_lengths, targets_lengths) in enumerate(
                get_batches(train_source, train_target, batch_size,
                            questionswords2int['<PAD>'],
                            answerswords2int['<PAD>'])):
            starting_time = time.time()
            _, batch_training_loss = sess.run(
                [train_op, cost],
                {inputs: source_batch,
                 targets: target_batch,
                 lr: learning_rate,
                 target_seq_len: targets_lengths,
                 source_seq_len: sources_lengths,
                 keep_prob: keep_probability})
            total_training_loss_error += batch_training_loss
            ending_time = time.time()
            batch_time = ending_time - starting_time

            if batch_i % batch_index_check_training_loss == 0 and batch_i > 0:
                 print('Epoch: {:>3}/{}, Batch: {:>4}/{}, Training Loss Error: {:>6.3f}, Training Time on 100 Batches: {:d} seconds'.format(epoch,
                                                                                                                                       epochs,
                                                                                                                                       batch_i,
                                                                                                                                       len(train_source) // batch_size,
                                                                                                                                       total_training_loss_error / batch_index_check_training_loss,
                                                                                                                                       int(batch_time * batch_index_check_training_loss)))
                 total_training_loss_error = 0


            if batch_i % batch_index_check_validation_loss == 0 and batch_i > 0:
                  total_validation_loss_error = 0
                  for batch_v, (valid_batch, target_batch, valid_lengths, targets_lengths) in enumerate(get_batches(valid_source, valid_target, batch_size,questionswords2int['<PAD>'],answerswords2int['<PAD>'])):
                        _, batch_validation_loss = sess.run(
                            [train_op, cost],
                            {inputs: valid_batch,
                             targets: target_batch,
                             lr: learning_rate,
                             target_seq_len: targets_lengths,
                             source_seq_len: valid_lengths,
                             keep_prob: keep_probability})
                        total_validation_loss_error += batch_validation_loss
                  ending_time = time.time()
                  batch_time = ending_time - starting_time

                  average_validation_loss_error = total_validation_loss_error / (len(valid_source) / batch_size)
                  print('Validation Loss Error: {:>6.3f}, Batch Validation Time: {:d} seconds'.format(average_validation_loss_error, int(batch_time)))
                  learning_rate *= learning_rate_decay
                  if learning_rate < min_learning_rate:
                      learning_rate = min_learning_rate
                  list_validation_loss_error.append(average_validation_loss_error)
                  if average_validation_loss_error <= min(list_validation_loss_error):
                      print('I speak better now!!')
                      early_stopping_check = 0
                      saver.save(sess, checkpoint)
                      #files.download(checkpoint+'.meta')
                      print('Model Trained and Saved')
                  else:
                      print("Sorry I do not speak better, I need to practice more.")
                      early_stopping_check += 1
                      if early_stopping_check == early_stopping_stop:
                          break
        if early_stopping_check == early_stopping_stop:
            print("My apologies, I cannot speak better anymore. This is the best I can do.")
            break

Epoch:   0/20, Batch:  500/1030, Training Loss Error:  2.171, Training Time on 100 Batches: 399 seconds
Validation Loss Error:  1.871, Batch Validation Time: 125 seconds
I speak better now!!
Model Trained and Saved
Epoch:   0/20, Batch: 1000/1030, Training Loss Error:  1.880, Training Time on 100 Batches: 546 seconds
Validation Loss Error:  1.755, Batch Validation Time: 125 seconds
I speak better now!!
Model Trained and Saved
Epoch:   1/20, Batch:  500/1030, Training Loss Error:  1.859, Training Time on 100 Batches: 398 seconds
Validation Loss Error:  1.693, Batch Validation Time: 124 seconds
I speak better now!!
Model Trained and Saved
Epoch:   1/20, Batch: 1000/1030, Training Loss Error:  1.756, Training Time on 100 Batches: 542 seconds
Validation Loss Error:  1.651, Batch Validation Time: 125 seconds
I speak better now!!
Model Trained and Saved
Epoch:   2/20, Batch:  500/1030, Training Loss Error:  1.777, Training Time on 100 Batches: 399 seconds
Validation Loss Error:  1.612, Batch

In [14]:
########## PART 4 - TESTING THE SEQ2SEQ MODEL ##########
 
 
with tf.Session(graph=train_graph) as session:
    # Loading the weights and Running the session
    checkpoint = "checkpoints/chatbot_weights.ckpt"
    session.run(tf.global_variables_initializer())
    tf.train.Saver().restore(session, checkpoint)

    # Converting the questions from strings to lists of encoding integers
    def convert_string2int(question, word2int):
        question = clean_text(question)
        return [word2int.get(word, word2int['<OUT>']) for word in question.split()]

    # Setting up the chat
    while(True):
        question = input("You: ")
        if question == 'Goodbye':
            break
        input_data = train_graph.get_tensor_by_name('input:0')
        logits = train_graph.get_tensor_by_name('predictions:0')
        target_sequence_length = train_graph.get_tensor_by_name('target_sequence_length:0')
        source_sequence_length = train_graph.get_tensor_by_name('source_sequence_length:0')
        keep_prob = train_graph.get_tensor_by_name('keep_prob:0')
        question = convert_string2int(question, questionswords2int)
        question = question + [questionswords2int['<PAD>']] * (25 - len(question))
        fake_batch = np.zeros((batch_size, 25))
        fake_batch[0] = question
        predicted_answer = session.run(inference_logits, {input_data: fake_batch, 
                                                          target_sequence_length: [len(question)]*batch_size,
                                                          source_sequence_length: [len(question)]*batch_size,
                                                          keep_prob: 0.5})[0]
        answer = ''
        for i in predicted_answer[0]:
            if answersints2word[i] == 'i':
                token = ' I'
            elif answersints2word[i] == '<EOS>':
                token = '.'
            elif answersints2word[i] == '<OUT>':
                token = 'out'
            else:
                token = ' ' + answersints2word[i]
            answer += token
            if token == '.':
                break
        print('ChatBot: ' + answer)

INFO:tensorflow:Restoring parameters from checkpoints/chatbot_weights.ckpt
You: Hi
ChatBot: outout I what I
You: how are you
ChatBot:  they the they the the
You: good morning
ChatBot:  they I I I I
You: have a great day
ChatBot:  theoutout youout
You: let me know if you need anything
ChatBot:  what what that I I
You: its great to know that the model is working.
ChatBot:  the what what what what
You: hopefully we cam increase the accuracy and make you speak better
ChatBot:  what what what what what
You: Goodbye
