<a href="https://colab.research.google.com/github/abhishek203/conversational-chatbot/blob/master/Encoder_Decoder_model_for_ChatBot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import tensorflow as tf
import tensorflow.keras.layers.experimental.preprocessing as tokens
import re
import io
import os
from sklearn import datasets
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

In [0]:
# Location of the raw conversation corpus read by the next cell.
# NOTE(review): absolute "/content/..." path — presumably the Colab runtime
# working directory (see the Colab badge at the top); adjust when running elsewhere.
path_to_file = "/content/chatbot_data_small.txt"

In [0]:
# Read the whole corpus and split it into conversation chunks on the '- - '
# marker. errors='ignore' silently drops bytes that cannot be decoded.
# The context manager closes the file handle as soon as the text is read
# (the previous one-liner left it open until garbage collection).
with io.open(path_to_file, errors='ignore') as corpus_file:
    data = corpus_file.read().strip().split('- - ')

In [0]:
# Turn every conversation chunk into (question, answer) pairs: the first
# '  - '-separated piece is the question and each remaining piece is paired
# with it as a separate answer. Chunks without any answer contribute nothing.
raw_question = []
raw_ans = []
for conversation in data:
    opening, *replies = conversation.split('  - ')
    for reply in replies:
        raw_question.append(opening)
        raw_ans.append(reply)


In [0]:
# Ordered (pattern, replacement) pairs applied by clean_text. Order matters:
# the specific contractions ("won't", "can't") must be expanded before the
# generic "n't" rule, and "n't" before the dropped-g rule "n'".
_CONTRACTION_RULES = (
    (r"i'm", "i am"),
    (r"he's", "he is"),
    (r"she's", "she is"),
    (r"it's", "it is"),
    (r"that's", "that is"),
    (r"what's", "what is"),
    (r"where's", "where is"),
    (r"how's", "how is"),
    (r"\'ll", " will"),
    (r"\'ve", " have"),
    (r"\'re", " are"),
    (r"\'d", " would"),
    (r"won't", "will not"),
    (r"can't", "cannot"),
    (r"n't", " not"),
    (r"n'", "ng"),
    (r"'bout", "about"),
    (r"'til", "until"),
)

# Punctuation characters that are removed outright after contraction expansion.
_PUNCTUATION_PATTERN = r"[-()\"#/@;:<>{}`+=~|.!?,]"


def clean_text(text):
    '''Normalise a sentence for tokenisation.

    Lower-cases the text, expands common English contractions, strips
    punctuation and collapses runs of whitespace into single spaces.

    Args:
        text: the raw sentence (str).

    Returns:
        The cleaned sentence (str) with no leading/trailing whitespace.
    '''
    text = text.lower()

    for pattern, replacement in _CONTRACTION_RULES:
        text = re.sub(pattern, replacement, text)

    text = re.sub(_PUNCTUATION_PATTERN, "", text)
    # split()/join collapses any whitespace run and trims both ends.
    return " ".join(text.split())

In [0]:
# Normalise every raw question and answer with clean_text; the two lists stay
# aligned by index with raw_question / raw_ans.
clean_questions = [clean_text(raw_text) for raw_text in raw_question]
clean_answers = [clean_text(raw_text) for raw_text in raw_ans]

In [0]:
# Wrap each cleaned sentence in start/end-of-sequence markers and keep the
# aligned [question, answer] pairs alongside the two separate lists.
question = [f'SOS {cleaned} EOS' for cleaned in clean_questions]
ans = [f'SOS {cleaned} EOS' for cleaned in clean_answers]
pair = [[marked_question, marked_answer]
        for marked_question, marked_answer in zip(question, ans)]



In [0]:
# One shared vocabulary for questions and answers. filters='' keeps every
# character (the text is already cleaned); unknown words map to '<oov>'.
tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='',oov_token='<oov>')

# Fit on BOTH corpora before converting anything to ids. fit_on_texts rebuilds
# word_index from the cumulative word counts on every call, so converting the
# questions before fitting on the answers produced question ids that no longer
# matched the final vocabulary used for the answers and at inference time.
# Fitting on question + ans yields the same final word_index as the two
# separate fits did.
tokenizer.fit_on_texts(question + ans)

# Integer-encode and right-pad (padding id 0) each side to its own longest sequence.
question_seq = tokenizer.texts_to_sequences(question)
question_seq = tf.keras.preprocessing.sequence.pad_sequences(question_seq,padding='post')
ans_seq = tokenizer.texts_to_sequences(ans)
ans_seq = tf.keras.preprocessing.sequence.pad_sequences(ans_seq, padding = 'post')

In [111]:
# Length of the first padded question sequence; EncoderDecoder.predict
# hard-codes the same value (24) as max_length_inp.
len(question_seq[0])

24

In [112]:
# Length of the first padded answer sequence; EncoderDecoder.predict
# hard-codes the same value (44) as max_length_targ.
len(ans_seq[0])

44

In [113]:
# Number of entries in the tokenizer vocabulary. EncoderDecoder defaults its
# vocab sizes to 2218 — presumably this count plus one for padding id 0; confirm.
len(tokenizer.word_index)

2217

In [0]:
# Training pipeline: (question, answer) id sequences, shuffled with a
# 1000-element buffer and grouped into fixed-size batches. drop_remainder=True
# discards the final short batch so every batch has exactly BATCH_SIZE rows.
BATCH_SIZE = 32
dataset = (
    tf.data.Dataset.from_tensor_slices((question_seq, ans_seq))
    .shuffle(1000)
    .batch(BATCH_SIZE, drop_remainder=True)
)

In [0]:
class EncoderDecoder(tf.keras.Model):
  """LSTM encoder-decoder (seq2seq) chatbot model.

  The encoder embeds the question tokens (256-d) and runs them through a
  512-unit LSTM; its final hidden and cell states initialise the decoder,
  which has its own 256-d embedding, a 512-unit LSTM and a dense layer that
  projects each step onto the output vocabulary (logits).

  The class relies on notebook-level globals: `tokenizer` (word <-> id maps),
  `loss_fn` (masked loss used in train_step) and `clean_text` (used by
  predict to mirror the training preprocessing).

  Args:
    vocab_input: size of the encoder embedding table.
    vocab_output: size of the decoder embedding table and of the logits.
    BATCH_SIZE: unused; kept so existing callers keep working. train_step
      takes the batch size from the batch it is given.
  """

  def __init__(self,vocab_input=2218,vocab_output=2218,BATCH_SIZE=32):
    super(EncoderDecoder,self).__init__()
    #Encoder
    self.encoder_embedding = tf.keras.layers.Embedding(vocab_input,256)
    self.encoder_lstm = tf.keras.layers.LSTM(512,return_sequences=True,return_state=True)


    #Decoder
    self.decoder_embedding = tf.keras.layers.Embedding(vocab_output,256)
    self.fc = tf.keras.layers.Dense(vocab_output)
    self.decoder_lstm = tf.keras.layers.LSTM(512,return_sequences=True,return_state=True)


  @tf.function
  def train_step(self,data):
    """Run one teacher-forced optimisation step on a batch.

    Args:
      data: tuple `(questions, answers)` of padded integer id matrices, as
        yielded by the notebook's `dataset`.

    Returns:
      Dict of metric name -> current value, plus 'loss' holding this
      batch's loss averaged over the decoded time steps.
    """
    enc_tokens, targ = data
    sos_id = tokenizer.word_index['sos']
    num_steps = targ.shape[1] - 1
    loss_val = 0
    step_logits = []

    with tf.GradientTape() as tape:
      enc_embedded = self.encoder_embedding(enc_tokens)
      # Both final encoder states seed the decoder.
      _, dec_h, dec_c = self.encoder_lstm(enc_embedded)

      # One SOS id per row, sized from the batch itself rather than from the
      # global BATCH_SIZE.
      dec_input = tf.ones_like(targ[:, :1]) * sos_id

      for t in range(1, targ.shape[1]):
        dec_embedded = self.decoder_embedding(dec_input)
        # Carry BOTH LSTM states forward; previously the encoder cell state
        # was re-fed at every step and the decoder's own cell state was lost.
        dec_output, dec_h, dec_c = self.decoder_lstm(
            dec_embedded, initial_state=[dec_h, dec_c])
        dec_output = tf.reshape(dec_output, (-1, dec_output.shape[2]))
        logits = self.fc(dec_output)
        step_logits.append(logits)

        loss_val += loss_fn(targ[:, t], logits)

        # Teacher forcing: the ground-truth token is the next decoder input.
        dec_input = tf.expand_dims(targ[:, t], 1)

    # Differentiate and update outside the tape context so the optimiser's
    # own ops are not recorded.
    gradients = tape.gradient(loss_val, self.trainable_variables)
    self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))

    # Score every decoded step against the target shifted by one (the SOS
    # column is never predicted). Previously only the last step's logits were
    # compared with the whole target matrix.
    predictions = tf.stack(step_logits, axis=1)
    self.compiled_metrics.update_state(targ[:, 1:], predictions)

    results = {m.name: m.result() for m in self.metrics}
    results['loss'] = loss_val / max(num_steps, 1)
    return results

  def predict(self,sentence,max_length_inp=24,max_length_targ=44):
    """Greedy-decode a reply to `sentence`.

    Note: this intentionally replaces `tf.keras.Model.predict` with a
    text-in / text-out helper, as the original notebook did.

    Args:
      sentence: raw user text.
      max_length_inp: length the encoded question is padded/truncated to;
        defaults to the padded question length reported in the notebook (24).
      max_length_targ: maximum number of tokens to generate; defaults to the
        padded answer length reported in the notebook (44).

    Returns:
      The generated words joined by spaces with a trailing space; includes
      the final 'eos' token when the model emits it.
    """
    # Mirror the training preprocessing (clean_text + SOS/EOS markers) and let
    # the tokenizer map unknown words to its OOV id instead of raising
    # KeyError on raw word_index lookups.
    prepared = 'SOS ' + clean_text(sentence) + ' EOS'
    inputs = tokenizer.texts_to_sequences([prepared])
    inputs = tf.keras.preprocessing.sequence.pad_sequences(inputs,
                                                          maxlen=max_length_inp,
                                                          padding='post')
    inputs = tf.convert_to_tensor(inputs)

    enc_embedded = self.encoder_embedding(inputs)
    _, dec_h, dec_c = self.encoder_lstm(enc_embedded)

    dec_input = tf.expand_dims([tokenizer.word_index['sos']], 0)
    result = ''

    for _ in range(max_length_targ):
      dec_embedded = self.decoder_embedding(dec_input)
      # Same state handling as in train_step: carry hidden and cell state.
      dec_output, dec_h, dec_c = self.decoder_lstm(
          dec_embedded, initial_state=[dec_h, dec_c])
      dec_output = tf.reshape(dec_output, (-1, dec_output.shape[2]))
      predictions = self.fc(dec_output)

      predicted_id = int(tf.argmax(predictions[0]).numpy())
      word = tokenizer.index_word.get(predicted_id)
      if word is None:
        # Ids without a word (e.g. padding id 0) end the reply instead of
        # raising KeyError.
        break

      result += word + ' '
      if word == 'eos':
        return result
      dec_input = tf.expand_dims([predicted_id], 0)

    return result

  @tf.function
  def call(self,x):
    """Placeholder forward pass: ignores `x` and always returns 0.

    NOTE(review): anything that exports the model through `call` (such as
    the TFLite conversion later in the notebook) presumably captures only
    this constant rather than the encoder/decoder — confirm.
    """
    return 0

In [0]:
  # Per-token cross-entropy on raw logits. Reduction is disabled so padded
  # positions can be zeroed out before averaging.
  loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
      from_logits=True, reduction='none')

  def loss_fn(real, pred):
    """Masked loss for one decoder step.

    Positions whose target id is 0 (padding) contribute zero loss; the
    result is the mean over all positions, padded ones included.
    """
    token_loss = loss_object(real, pred)
    not_padding = tf.cast(tf.math.not_equal(real, 0), dtype=token_loss.dtype)
    return tf.reduce_mean(token_loss * not_padding)

In [0]:
# Build the seq2seq model with its default vocabulary sizes and attach the
# optimizer, loss and metric.
model = EncoderDecoder()
model.compile(
    optimizer='adam',
    loss=loss_fn,
    metrics=['accuracy'],
)

In [118]:
# Train for a single epoch over the shuffled, batched dataset; each batch is
# handled by EncoderDecoder.train_step.
model.fit(dataset,epochs=1)



<tensorflow.python.keras.callbacks.History at 0x7f8971ef9898>

In [119]:
# Smoke-test inference with a one-word prompt (uses the custom text-in/text-out
# EncoderDecoder.predict, not the stock Keras predict).
model.predict('hi')

'i eos '

In [0]:
# Declare the model's input signature ahead of the TFLite conversion below:
# shape [32, 24] matches BATCH_SIZE x padded question length.
# NOTE(review): `_set_inputs` is a private Keras method, and the spec is
# float32 although the question sequences are integer token ids — confirm.
model._set_inputs(inputs=tf.TensorSpec(
    shape=[32,24], dtype=tf.dtypes.float32
))

In [121]:
# Convert the Keras model to a TensorFlow Lite flatbuffer and save it to disk.
converter = tf.lite.TFLiteConverter.from_keras_model(model)

tflite_model = converter.convert()
# Write through a context manager so the file is flushed and closed right away
# (the previous open(...).write(...) left the handle open).
with open("chatbot_small.tflite", "wb") as tflite_file:
    bytes_written = tflite_file.write(tflite_model)
# Keep the byte count as the cell's displayed value, as before.
bytes_written

324