In [13]:
import tensorflow as tf
import numpy as np

# Sample conversation dataset

In [14]:
questions = ["hi", "how are you", "what's your name", "bye"]
answers = ["hello", "i'm fine", "i'm a chatbot", "goodbye"]
 
# Add <start> and <end> to target sentences
answers = [f"<start> {a} <end>" for a in answers]

# Tokenize input and output

In [15]:
q_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
a_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
 
q_tokenizer.fit_on_texts(questions)
a_tokenizer.fit_on_texts(answers)
 
q_sequences = q_tokenizer.texts_to_sequences(questions)
a_sequences = a_tokenizer.texts_to_sequences(answers)
 
max_q_len = max(len(q) for q in q_sequences)
max_a_len = max(len(a) for a in a_sequences)
 
q_padded = tf.keras.preprocessing.sequence.pad_sequences(q_sequences, maxlen=max_q_len, padding='post')
a_padded = tf.keras.preprocessing.sequence.pad_sequences(a_sequences, maxlen=max_a_len, padding='post')

# Split decoder input and output

In [16]:
decoder_input = a_padded[:, :-1]
decoder_target = tf.keras.utils.to_categorical(a_padded[:, 1:], num_classes=len(a_tokenizer.word_index) + 1)

# Define attention mechanism

In [17]:
class BahdanauAttention(tf.keras.layers.Layer):
    def __init__(self, units):
        super().__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)
 
    def call(self, enc_output, dec_hidden):
        dec_hidden = tf.expand_dims(dec_hidden, 1)
        score = self.V(tf.nn.tanh(self.W1(enc_output) + self.W2(dec_hidden)))
        attention_weights = tf.nn.softmax(score, axis=1)
        context = attention_weights * enc_output
        context_vector = tf.reduce_sum(context, axis=1)
        return context_vector, attention_weights

class RepeatContext(tf.keras.layers.Layer):
    def call(self, context, decoder_outputs):
        context = tf.expand_dims(context, 1)
        context = tf.repeat(context, tf.shape(decoder_outputs)[1], axis=1)
        return context

# Define layers for reuse

In [18]:
embedding_dim = 64
lstm_units = 64
vocab_inp_size = len(q_tokenizer.word_index) + 1
vocab_tar_size = len(a_tokenizer.word_index) + 1

# Define encoder model layers

In [19]:
encoder_inputs = tf.keras.Input(shape=(max_q_len,))
encoder_embedding_layer = tf.keras.layers.Embedding(vocab_inp_size, embedding_dim)
encoder_lstm_layer = tf.keras.layers.LSTM(lstm_units, return_sequences=True, return_state=True)

# Apply encoder layers

In [20]:
encoder_embedded = encoder_embedding_layer(encoder_inputs)
encoder_outputs, state_h, state_c = encoder_lstm_layer(encoder_embedded)

# Define decoder layers

In [21]:
decoder_inputs = tf.keras.Input(shape=(max_a_len - 1,))
decoder_embedding_layer = tf.keras.layers.Embedding(vocab_tar_size, embedding_dim)
decoder_lstm_layer = tf.keras.layers.LSTM(lstm_units, return_sequences=True, return_state=True)

decoder_embedded = decoder_embedding_layer(decoder_inputs)
decoder_outputs, _, _ = decoder_lstm_layer(decoder_embedded, initial_state=[state_h, state_c])

# Attention mechanism

In [22]:
attention = BahdanauAttention(lstm_units)
context_vector, _ = attention(encoder_outputs, state_h)
context_vector_repeated = RepeatContext()(context_vector, decoder_outputs)

concat = tf.keras.layers.Concatenate(axis=-1)([decoder_outputs, context_vector_repeated])
final_output = tf.keras.layers.Dense(vocab_tar_size, activation='softmax')(concat)

# Compile model

In [23]:
model = tf.keras.Model([encoder_inputs, decoder_inputs], final_output)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train

In [24]:
model.fit([q_padded, decoder_input], decoder_target, epochs=500, verbose=0)

<keras.src.callbacks.history.History at 0x2a17c726540>

# Inference function

In [48]:
def chat(input_text):
    seq = q_tokenizer.texts_to_sequences([input_text])
    seq = tf.keras.preprocessing.sequence.pad_sequences(seq, maxlen=max_q_len, padding='post')

    # Encoder
    enc_embed = encoder_embedding_layer(seq)
    enc_out, h, c = encoder_lstm_layer(enc_embed)

    # Decoder
    dec_input = tf.constant([[a_tokenizer.word_index['<start>']]])
    result = []

    for _ in range(max_a_len):
        dec_embed = decoder_embedding_layer(dec_input)
        dec_out, h, c = decoder_lstm_layer(dec_embed, initial_state=[h, c])

        context_vec, _ = attention(enc_out, h)
        context_vec = tf.expand_dims(context_vec, 1)
        context_vec = tf.repeat(context_vec, tf.shape(dec_out)[1], axis=1)

        concat = tf.concat([dec_out, context_vec], axis=-1)
        pred = tf.keras.layers.Dense(vocab_tar_size, activation='softmax')(concat)

        token = tf.argmax(pred[:, -1, :], axis=-1).numpy()[0]
        word = a_tokenizer.index_word.get(token, '')

        if word == '<end>':
            break
        if word != '<start>':  # ✅ Skip start token in output
            result.append(word)

        dec_input = tf.constant([[token]])

    return ' '.join(result)


# Try chatbot

In [50]:
print("User: hi")
print("Bot:", chat("hi"))
print("User: what's your name")
print("Bot:", chat("what's your name"))
print("User: bye")
print("Bot:", chat("bye"))

User: hi
Bot: goodbye  fine
User: what's your name
Bot: goodbye hello a goodbye goodbye
User: bye
Bot: a
