In [1]:
import re
import os
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.layers import Layer
from tensorflow.keras import backend as K
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Embedding, LSTM, Input, Bidirectional, Concatenate, Dropout, Attention

In [2]:
class AttentionLayer(Layer):
    """
    This class implements Bahdanau attention (https://arxiv.org/pdf/1409.0473.pdf).
    There are three sets of weights introduced W_a, U_a, and V_a
     """

    def __init__(self, **kwargs):
        super(AttentionLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        assert isinstance(input_shape, list)
        # Create a trainable weight variable for this layer.

        self.W_a = self.add_weight(name='W_a',
                                   shape=tf.TensorShape((input_shape[0][2], input_shape[0][2])),
                                   initializer='uniform',
                                   trainable=True)
        self.U_a = self.add_weight(name='U_a',
                                   shape=tf.TensorShape((input_shape[1][2], input_shape[0][2])),
                                   initializer='uniform',
                                   trainable=True)
        self.V_a = self.add_weight(name='V_a',
                                   shape=tf.TensorShape((input_shape[0][2], 1)),
                                   initializer='uniform',
                                   trainable=True)

        super(AttentionLayer, self).build(input_shape)  # Be sure to call this at the end

    def call(self, inputs, verbose=False):
        """
        inputs: [encoder_output_sequence, decoder_output_sequence]
        """
        assert type(inputs) == list
        encoder_out_seq, decoder_out_seq = inputs
        if verbose:
            print('encoder_out_seq>', encoder_out_seq.shape)
            print('decoder_out_seq>', decoder_out_seq.shape)

        def energy_step(inputs, states):
            """ Step function for computing energy for a single decoder state
            inputs: (batchsize * 1 * de_in_dim)
            states: (batchsize * 1 * de_latent_dim)
            """

            assert_msg = "States must be an iterable. Got {} of type {}".format(states, type(states))
            assert isinstance(states, list) or isinstance(states, tuple), assert_msg

            """ Some parameters required for shaping tensors"""
            en_seq_len, en_hidden = encoder_out_seq.shape[1], encoder_out_seq.shape[2]
            de_hidden = inputs.shape[-1]

            """ Computing S.Wa where S=[s0, s1, ..., si]"""
            # <= batch size * en_seq_len * latent_dim
            W_a_dot_s = K.dot(encoder_out_seq, self.W_a)

            """ Computing hj.Ua """
            U_a_dot_h = K.expand_dims(K.dot(inputs, self.U_a), 1)  # <= batch_size, 1, latent_dim
            if verbose:
                print('Ua.h>', U_a_dot_h.shape)

            """ tanh(S.Wa + hj.Ua) """
            # <= batch_size*en_seq_len, latent_dim
            Ws_plus_Uh = K.tanh(W_a_dot_s + U_a_dot_h)
            if verbose:
                print('Ws+Uh>', Ws_plus_Uh.shape)

            """ softmax(va.tanh(S.Wa + hj.Ua)) """
            # <= batch_size, en_seq_len
            e_i = K.squeeze(K.dot(Ws_plus_Uh, self.V_a), axis=-1)
            # <= batch_size, en_seq_len
            e_i = K.softmax(e_i)

            if verbose:
                print('ei>', e_i.shape)

            return e_i, [e_i]

        def context_step(inputs, states):
            """ Step function for computing ci using ei """

            assert_msg = "States must be an iterable. Got {} of type {}".format(states, type(states))
            assert isinstance(states, list) or isinstance(states, tuple), assert_msg

            # <= batch_size, hidden_size
            c_i = K.sum(encoder_out_seq * K.expand_dims(inputs, -1), axis=1)
            if verbose:
                print('ci>', c_i.shape)
            return c_i, [c_i]

        fake_state_c = K.sum(encoder_out_seq, axis=1)
        fake_state_e = K.sum(encoder_out_seq, axis=2)  # <= (batch_size, enc_seq_len, latent_dim

        """ Computing energy outputs """
        # e_outputs => (batch_size, de_seq_len, en_seq_len)
        last_out, e_outputs, _ = K.rnn(
            energy_step, decoder_out_seq, [fake_state_e],
        )

        """ Computing context vectors """
        last_out, c_outputs, _ = K.rnn(
            context_step, e_outputs, [fake_state_c],
        )

        return c_outputs, e_outputs

    def compute_output_shape(self, input_shape):
        """ Outputs produced by the layer """
        return [
            tf.TensorShape((input_shape[1][0], input_shape[1][1], input_shape[1][2])),
            tf.TensorShape((input_shape[1][0], input_shape[1][1], input_shape[0][1]))
        ]

In [3]:
train = pd.read_csv('../dataset/empatheticdialogues/train.csv', on_bad_lines='skip')
valid = pd.read_csv('../dataset/empatheticdialogues/valid.csv', on_bad_lines='skip')
train.head()

Unnamed: 0,conv_id,utterance_idx,context,prompt,speaker_idx,utterance,selfeval,tags
0,hit:0_conv:1,1,sentimental,I remember going to the fireworks with my best...,1,I remember going to see the fireworks with my ...,5|5|5_2|2|5,
1,hit:0_conv:1,2,sentimental,I remember going to the fireworks with my best...,0,Was this a friend you were in love with_comma_...,5|5|5_2|2|5,
2,hit:0_conv:1,3,sentimental,I remember going to the fireworks with my best...,1,This was a best friend. I miss her.,5|5|5_2|2|5,
3,hit:0_conv:1,4,sentimental,I remember going to the fireworks with my best...,0,Where has she gone?,5|5|5_2|2|5,
4,hit:0_conv:1,5,sentimental,I remember going to the fireworks with my best...,1,We no longer talk.,5|5|5_2|2|5,


In [4]:
prompt_train = train['prompt']
utterance_train = train['utterance']
context_train = train['context']

prompt_valid = valid['prompt']
utterance_valid = valid['utterance']
context_valid = valid['context']
del (train, valid)
len(prompt_train)

76668

In [5]:
context_train.unique()

array(['sentimental', 'afraid', 'proud', 'faithful', 'terrified',
       'joyful', 'angry', 'sad', 'jealous', 'grateful', 'prepared',
       'embarrassed', 'excited', 'annoyed', 'lonely', 'ashamed', 'guilty',
       'surprised', 'nostalgic', 'confident', 'furious', 'disappointed',
       'caring', 'trusting', 'disgusted', 'anticipating', 'anxious',
       'hopeful', 'content', 'impressed', 'apprehensive', 'devastated'],
      dtype=object)

In [6]:
#fix the max length <-- this is character length including spaces
MAX_LENGTH = 23
trimmed_prompt_train = []
trimmed_utterance_train = []
trimmed_context_train = []
for i in range(len(prompt_train)):
    #trim dataset to the MAX_LENGTH range [0, MAX_LENGTH]
    trimmed_prompt_train.append(prompt_train[i][:MAX_LENGTH].strip())
    trimmed_utterance_train.append(utterance_train[i].strip())
    trimmed_context_train.append(context_train[i].strip())

trimmed_prompt_valid = []
trimmed_utterance_valid = []
trimmed_context_valid = []
for i in range(len(prompt_valid)):
    trimmed_prompt_valid.append(prompt_valid[i][:MAX_LENGTH].strip())
    trimmed_utterance_valid.append(utterance_valid[i].strip())
    trimmed_context_valid.append(context_valid[i].strip())

del (prompt_train, prompt_valid, utterance_train, utterance_valid, context_train, context_valid, i)

In [7]:
len(trimmed_prompt_train)

76668

In [8]:
def expand_contractions_and_clean(text):
    # mapping of contractions to their expanded forms
    contractions = {
        r"i'm": "i am",
        r"he's": "he is",
        r"she's": "she is",
        r"that's": "that is",
        r"what's": "what is",
        r"where's": "where is",
        r"\'ll": " will",
        r"\'ve": " have",
        r"\'re": " are",
        r"\'d": " would",
        r"won't": "will not",
        r"can't": "can not",
    }
    
    text = text.lower()
    
    # expand contractions
    for contraction, expansion in contractions.items():
        text = re.sub(contraction, expansion, text)
    
    # remove punctuation
    text = re.sub(r"[^\w\s]", "", text)
    
    return text

In [9]:
cleaned_prompt_train = []
cleaned_utterance_train = []
for line in trimmed_prompt_train:
    cleaned_prompt_train.append(expand_contractions_and_clean(line))
        
for line in trimmed_utterance_train:
    cleaned_utterance_train.append(expand_contractions_and_clean(line))

cleaned_prompt_valid = []
cleaned_utterance_valid = []
for line in trimmed_prompt_valid:
    cleaned_prompt_valid.append(expand_contractions_and_clean(line))
        
for line in trimmed_utterance_valid:
    cleaned_utterance_valid.append(expand_contractions_and_clean(line))

del (trimmed_prompt_train, trimmed_utterance_train, trimmed_prompt_valid, trimmed_utterance_valid)

In [10]:
#counting words
word_count = {}
#for train
for line in cleaned_prompt_train:
    for word in line.split():
        if word not in word_count:
            word_count[word] = 1
        else:
            word_count[word] += 1

del (line, word)

for line in cleaned_utterance_train:
    for word in line.split():
        if word not in word_count:
            word_count[word] = 1
        else:
            word_count[word] += 1

del (line, word)

#for valid
for line in cleaned_prompt_valid:
    for word in line.split():
        if word not in word_count:
            word_count[word] = 1
        else:
            word_count[word] += 1

del (line, word)

for line in cleaned_utterance_valid:
    for word in line.split():
        if word not in word_count:
            word_count[word] = 1
        else:
            word_count[word] += 1

del (line, word)

In [11]:
#omit less frequent words
threshold = 5

vocab = {}
num = 0
for word, count in word_count.items():
    if count >= threshold:
        vocab[word] = num
        num += 1

del(word, count, num)

In [12]:
tokens = ['<PAD>', '<EOS>', '<OUT>', '<SOS>', '<SEP>']
for i in range(len(cleaned_utterance_train)):
    cleaned_utterance_train[i] = '<SOS> ' + cleaned_utterance_train[i] + ' <EOS>'
del(i)

for i in range(len(cleaned_utterance_valid)):
    cleaned_utterance_valid[i] = '<SOS> ' + cleaned_utterance_valid[i] + ' <EOS>'
del(i)

for i in range(len(cleaned_prompt_train)):
    cleaned_prompt_train[i] = cleaned_prompt_train[i] + ' <SEP> ' + trimmed_context_train[i]
del(i)

for i in range(len(cleaned_prompt_valid)):
    cleaned_prompt_valid[i] = cleaned_prompt_valid[i] + ' <SEP> ' + trimmed_context_valid[i]
del(i)

#for context words <-- here because don't want to lose context by threshold
for word in trimmed_context_train:
    if word not in vocab:
        vocab[word] = len(vocab)
del (word)

for word in trimmed_context_valid:
    if word not in vocab:
        vocab[word] = len(vocab)
del (word)

#for tokens
for token in tokens:
    if token not in vocab:
        vocab[token] = len(vocab)
del(token)

#bring <PAD> to front and shift 0th word to <PAD> value
first_word = ''
for k, v in vocab.items():
    if v == 0:
        first_word = k
        break

pad_value = vocab['<PAD>']
vocab['<PAD>'] = 0
vocab[first_word] = pad_value

#save vocab as json
with open('vocab.json', 'w', encoding='utf-8') as f:
    json.dump(vocab, f, ensure_ascii=False, indent=4)

In [13]:
inverse_vocab = {count:word for word, count in vocab.items()}

In [14]:
list(vocab.keys())[-20:] #just to make sure we have context and tokens in our vocab

['vr',
 'thatthat',
 'timethat',
 'itoh',
 'itthat',
 'loser_comma_',
 '15_comma_000',
 'outi',
 'youoh',
 'dooh',
 'nowthat',
 'thenthat',
 'herthat',
 'timeoh',
 'regiment',
 '<PAD>',
 '<EOS>',
 '<OUT>',
 '<SOS>',
 '<SEP>']

In [15]:
vocab['<PAD>']

0

In [16]:
encoder_input = []
for line in cleaned_prompt_train:
    word_list = []
    for word in line.split():
        if word not in vocab:
            word_list.append(vocab['<OUT>'])
        else:
            word_list.append(vocab[word])
        
    encoder_input.append(word_list)


encoder_input_valid = []
for line in cleaned_prompt_valid:
    word_list = []
    for word in line.split():
        if word not in vocab:
            word_list.append(vocab['<OUT>'])
        else:
            word_list.append(vocab[word])
        
    encoder_input_valid.append(word_list)

decoder_input = []
for line in cleaned_utterance_train:
    word_list = []
    for word in line.split():
        if word not in vocab:
            word_list.append(vocab['<OUT>'])
        else:
            word_list.append(vocab[word])        
    decoder_input.append(word_list)

decoder_input_valid = []
for line in cleaned_utterance_valid:
    word_list = []
    for word in line.split():
        if word not in vocab:
            word_list.append(vocab['<OUT>'])
        else:
            word_list.append(vocab[word])        
    decoder_input_valid.append(word_list)

del(cleaned_prompt_train, cleaned_utterance_train, cleaned_prompt_valid, cleaned_utterance_valid, line, word)


In [17]:
encoder_input = pad_sequences(encoder_input, MAX_LENGTH, padding='post', truncating='post')
decoder_input = pad_sequences(decoder_input, MAX_LENGTH, padding='post', truncating='post')

encoder_input_valid = pad_sequences(encoder_input_valid, MAX_LENGTH, padding='post', truncating='post')
decoder_input_valid = pad_sequences(decoder_input_valid, MAX_LENGTH, padding='post', truncating='post')

decoder_output = []
for i in decoder_input:
    decoder_output.append(i[1:]) 

decoder_output = pad_sequences(decoder_output, MAX_LENGTH, padding='post', truncating='post')

decoder_output_valid = []
for i in decoder_input_valid:
    decoder_output_valid.append(i[1:]) 

decoder_output_valid = pad_sequences(decoder_output_valid, MAX_LENGTH, padding='post', truncating='post')

del(i)

In [18]:
VOCAB_SIZE = len(vocab)
print(decoder_output.shape, decoder_input.shape, encoder_input.shape, len(vocab), len(inverse_vocab), inverse_vocab[0])

(76668, 23) (76668, 23) (76668, 23) 9657 9657 <PAD>


In [19]:
#GLOVE source <-- https://nlp.stanford.edu/projects/glove/
embedding_index = {}
with open('../dataset/glove/glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefs
    f.close()

print("GloVe Loded!")

GloVe Loded!


In [20]:
#model params
EMBEDDING_DIMENSION = 100
LSTM_UNITS = 256

In [21]:
def embedding_matrix_creater(dimension, word_index):
    embedding_matrix = np.zeros((len(word_index)+1, dimension))
    for word, i in word_index.items():
        embedding_vector = embedding_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix

embedding_matrix = embedding_matrix_creater(dimension=EMBEDDING_DIMENSION, word_index=vocab)

del(embedding_index)

In [22]:
embedding_matrix.shape

(9658, 100)

In [23]:
embedding_matrix[0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [24]:
embedding_layer = Embedding(VOCAB_SIZE+1, 
                  EMBEDDING_DIMENSION, 
                  input_length=MAX_LENGTH,
                  trainable=True)

embedding_layer.build((None,))
embedding_layer.set_weights([embedding_matrix])

Metal device set to: Apple M1


2024-06-17 14:52:03.542579: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-06-17 14:52:03.542833: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [25]:
#building model
encoder_input_layer = Input(shape=(MAX_LENGTH, ), name='encoder_input_layer')

encoder_embedding_layer = embedding_layer(encoder_input_layer)
encoder_bidirectional_lstm_layer = Bidirectional(
    LSTM(LSTM_UNITS, return_state=True, dropout=0.05, return_sequences = True, name='encoder_lstm_layer'),
    name='encoder_bidirectional_lstm'
)

encoder_outputs, forward_h, forward_c, backward_h, backward_c = encoder_bidirectional_lstm_layer(encoder_embedding_layer)

state_h = Concatenate(name='state_h_concat')([forward_h, backward_h])
state_c = Concatenate(name='state_c_concat')([forward_c, backward_c])

encoder_states = [state_h, state_c]

decoder_input_layer = Input(shape=(MAX_LENGTH, ), name='decoder_input_layer')
decoder_embedding_layer = embedding_layer(decoder_input_layer)
decoder_lstm_layer = LSTM(LSTM_UNITS * 2, return_state=True, return_sequences=True, dropout=0.05, name='decoder_lstm_layer')
decoder_outputs, _, _ = decoder_lstm_layer(decoder_embedding_layer, initial_state=encoder_states)

# attention
attention_layer = AttentionLayer(name='attention_layer')
attention_output, attention_state = attention_layer([encoder_outputs, decoder_outputs])
decoder_concat_input_layer = Concatenate(axis=-1, name='decoder_concat_input_layer')([decoder_outputs, attention_output])


decoder_dense_layer = Dense(VOCAB_SIZE, activation='softmax', name='decoder_dense_layer')
final_output_layer = decoder_dense_layer(decoder_concat_input_layer)

model = Model([encoder_input_layer, decoder_input_layer], final_output_layer)

model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 decoder_input_layer (InputLaye  [(None, 23)]        0           []                               
 r)                                                                                               
                                                                                                  
 encoder_input_layer (InputLaye  [(None, 23)]        0           []                               
 r)                                                                                               
                                                                                                  
 embedding (Embedding)          (None, 23, 100)      965800      ['encoder_input_layer[0][0]',    
                                                                  'decoder_input_layer[0][0]']

In [26]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['sparse_categorical_accuracy'])

In [None]:
EPOCHS = 100
BATCH_SIZE = 64

history = model.fit(
    [encoder_input, decoder_input], decoder_output,
    epochs=EPOCHS, batch_size=BATCH_SIZE,
    validation_data=([encoder_input_valid, decoder_input_valid], decoder_output_valid)
)

In [28]:
#save model and weights
model.save('chatbot.h5')
model.save_weights('chatbot_weights.h5')

#save history for later analysis
history_dict = history.history

# convert NumPy arrays to lists for JSON compatibility
for key in history_dict.keys():
    history_dict[key] = [float(i) for i in history_dict[key]]

# save the history dictionary to a JSON file
with open('chatbot_history.json', 'w') as f:
    json.dump(history_dict, f, indent=4)

In [None]:
plt.plot(history.history['sparse_categorical_accuracy'], label='accuracy')
#plt.plot(history.history['val_sparse_categorical_accuracy'], label='val_accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.ylim([0, 1])
plt.legend(loc='lower right')
plt.show()

In [30]:
#model inference for generating responses
inference_encoder_model = Model(encoder_input_layer, [encoder_outputs, encoder_states])

inference_decoder_state_input_h = Input(shape=( LSTM_UNITS * 2,))
inference_decoder_state_input_c = Input(shape=( LSTM_UNITS * 2,))

inference_decoder_states_inputs = [inference_decoder_state_input_h, inference_decoder_state_input_c]


inference_decoder_outputs, inference_state_h, inference_state_c = decoder_lstm_layer(decoder_embedding_layer , initial_state=inference_decoder_states_inputs)


inference_decoder_states = [inference_state_h, inference_state_c]

#decoder_output = dec_dense(decoder_outputs)

inference_decoder_model = Model(
    [decoder_input_layer, inference_decoder_states_inputs],
    [inference_decoder_outputs] + inference_decoder_states
)


In [None]:
separator = '-'
input_text = ""
while input_text != 'exit':
    input_text = input("YOU : ")
    if(input_text == 'exit'):
        print("Bye! Talk to you later.")
    else:
        try:
            if separator in input_text:
                prompt, context = input_text.split(separator, 1)  # split only on the first occurrence
            else:
                prompt = input_text
                context = "joyful"
                
            input_cleaned = expand_contractions_and_clean(prompt.strip()) + ' <SEP> ' + context.strip()
            input_cleaned = [input_cleaned]
            
            text = []
            for x in input_cleaned:
                lst = []
                for y in x.split():
                    try:
                        lst.append(vocab[y])
                    except:
                        lst.append(vocab['<OUT>'])
                text.append(lst)
            text = pad_sequences(text, MAX_LENGTH, padding='post')
    
            inference_encoder_output, inference_encoder_states = inference_encoder_model.predict( text )
    
            empty_target_seq = np.zeros( ( 1 , 1) )
            empty_target_seq[0, 0] = vocab['<SOS>']
            stop_condition = False
            decoded_translation = ''
    
            while not stop_condition :
                inference_decoder_outputs , h , c = inference_decoder_model.predict([ empty_target_seq ] + inference_encoder_states )
                
                ###
                ###########################
                inference_attention_output, inference_attention_states = attention_layer([inference_encoder_output, inference_decoder_outputs])
                inference_decoder_concat_input = Concatenate(axis=-1)([inference_decoder_outputs, inference_attention_output])
                inference_decoder_concat_input = decoder_dense_layer(inference_decoder_concat_input)
                ###########################
    
                sampled_word_index = np.argmax( inference_decoder_concat_input[0, -1, :] )
                sampled_word = inverse_vocab[sampled_word_index] + ' '
    
                if sampled_word != '<EOS> ':
                    decoded_translation += sampled_word           
    
                if sampled_word == '<EOS> ' or len(decoded_translation.split()) > MAX_LENGTH:
                    stop_condition = True
    
                empty_target_seq = np.zeros( ( 1 , 1 ) )  
                empty_target_seq[ 0 , 0 ] = sampled_word_index
                inference_encoder_states = [ h , c ] 
    
            print("CHATBOT : ", decoded_translation )
    
        except Exception as e:
            #print("I couldn't get you. Please try again.")
            print(e)
            