## Chatbot

In [24]:
import pickle
import numpy as np

In [25]:
with open('train_qa.txt','rb') as f:
    train_data = pickle.load(f)

with open('test_qa.txt','rb') as f:
    test_data = pickle.load(f)

print(type(train_data))
print(type(test_data))

<class 'list'>
<class 'list'>


In [26]:
all_data = test_data+train_data
len(all_data)

11000

In [27]:
for story, query, answer in all_data:
    print(' '.join(story), ' '.join(query), ''.join(answer))

Mary got the milk there . John moved to the bedroom . Is John in the kitchen ? no
Mary got the milk there . John moved to the bedroom . Mary discarded the milk . John went to the garden . Is John in the kitchen ? no
Mary got the milk there . John moved to the bedroom . Mary discarded the milk . John went to the garden . Daniel moved to the bedroom . Daniel went to the garden . Is John in the garden ? yes
Mary got the milk there . John moved to the bedroom . Mary discarded the milk . John went to the garden . Daniel moved to the bedroom . Daniel went to the garden . Daniel travelled to the bathroom . Sandra travelled to the bedroom . Is Daniel in the bathroom ? yes
Mary got the milk there . John moved to the bedroom . Mary discarded the milk . John went to the garden . Daniel moved to the bedroom . Daniel went to the garden . Daniel travelled to the bathroom . Sandra travelled to the bedroom . Mary took the football there . Sandra grabbed the milk there . Is Daniel in the bedroom ? no
D

In [28]:
vocab = set()
for story, query, answer in all_data:
    vocab = vocab.union(set(story))
    vocab = vocab.union(set(query))
vocab.add('yes')
vocab.add('no')
vocab_len = len(vocab)+1
print(vocab)
print(vocab_len)
    

{'discarded', 'back', '.', 'travelled', 'office', 'down', 'put', 'left', 'grabbed', '?', 'hallway', 'milk', 'went', 'picked', 'no', 'Sandra', 'kitchen', 'there', 'took', 'Mary', 'yes', 'John', 'bedroom', 'in', 'Is', 'bathroom', 'garden', 'journeyed', 'Daniel', 'up', 'football', 'apple', 'to', 'the', 'dropped', 'got', 'moved'}
38


In [29]:
all_story_len = [len(data[0]) for data in all_data]
max_story_len = max(all_story_len) # Required for padding sequences

all_query_len = [len(data[1]) for data in all_data]
max_query_len = max(all_query_len) # Required for padding sequences

### Part 2: Vectorizer

In [30]:
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

In [31]:
tokenizer = Tokenizer(filters=[])
tokenizer.fit_on_texts(vocab)

tokenizer.word_index

{'discarded': 1,
 'back': 2,
 '.': 3,
 'travelled': 4,
 'office': 5,
 'down': 6,
 'put': 7,
 'left': 8,
 'grabbed': 9,
 '?': 10,
 'hallway': 11,
 'milk': 12,
 'went': 13,
 'picked': 14,
 'no': 15,
 'sandra': 16,
 'kitchen': 17,
 'there': 18,
 'took': 19,
 'mary': 20,
 'yes': 21,
 'john': 22,
 'bedroom': 23,
 'in': 24,
 'is': 25,
 'bathroom': 26,
 'garden': 27,
 'journeyed': 28,
 'daniel': 29,
 'up': 30,
 'football': 31,
 'apple': 32,
 'to': 33,
 'the': 34,
 'dropped': 35,
 'got': 36,
 'moved': 37}

In [32]:
train_story_text = []
train_query_text = []
train_answer_text = []

for story,query,answer in train_data:
    train_story_text.append(story)
    train_query_text.append(query)
    train_answer_text.append(answer)

train_story_seq = tokenizer.texts_to_sequences(train_story_text)

In [33]:
#function to create vectorised stories
def vectorize_stories(data, word_index = tokenizer.word_index,max_story_len = max_story_len, max_query_len = max_query_len):
    #Stories is our X
    X = []
    # Questions is another feature
    Xq = []
    # Y is the output answer(Y/n)
    Y = []

    for s,q,a in data:
        # for each story
        x = [word_index[word.lower()] for word in s]
        xq = [word_index[word.lower()] for word in q]
        
        y = np.zeros(len(word_index)+1)
        y[word_index[a]] = 1

        X.append(x)
        Xq.append(xq)
        Y.append(y)

    return (pad_sequences(X,maxlen=max_story_len),pad_sequences(Xq,maxlen=max_query_len),np.array(Y))

inputs_train, queries_train, answers_train = vectorize_stories(train_data)
inputs_test, queries_test, answers_test = vectorize_stories(test_data)


### Building the Network

In [34]:
from tensorflow.keras.models import Sequential,Model
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import LSTM,Dense,Dropout,Input,Activation,Permute,Dropout,add,dot,concatenate

input_sequence = Input((max_story_len,)) 
#Placeholder for story with shape of (max_story_len,batch_size) since we do not have the batch_size

question = Input((max_query_len,))

vocab_size = len(vocab)+1

In [35]:
#Input Encoder M

input_encoder_m = Sequential()
input_encoder_m.add(Embedding(input_dim = vocab_size,output_dim=64))
input_encoder_m.add(Dropout(0.3)) #Dropout layer to prevent overfitting by turning off neurons randomly

#Output - (samples, story_maxlen, embedding_dim)

In [36]:
#Input Encoder C

input_encoder_c = Sequential()
input_encoder_c.add(Embedding(input_dim = vocab_size,output_dim=max_query_len))
input_encoder_c.add(Dropout(0.3)) #Dropout layer to prevent overfitting by turning off neurons randomly

#Output - (samples, story_maxlen, max_query_len)

In [37]:
#Question Encoder

question_encoder = Sequential()
question_encoder.add(Embedding(input_dim = vocab_size,output_dim=64,input_length=max_query_len))
question_encoder.add(Dropout(0.3)) #Dropout layer to prevent overfitting by turning off neurons randomly

#Output - (samples, query_maxlen, embedding_dim)

In [38]:
# Encoded = Encoder(Input)
input_encoded_m = input_encoder_m(input_sequence)
input_encoded_c = input_encoder_c(input_sequence)
question_encoded = question_encoder(question)

In [39]:
match = dot([input_encoded_m,question_encoded],axes=(2,2))
match = Activation('softmax')(match)

response = add([match,input_encoded_c])
response = Permute((2,1))(response)

answer = concatenate([response,question_encoded])
answer

<KerasTensor: shape=(None, 6, 220) dtype=float32 (created by layer 'concatenate_1')>

In [40]:
answer = LSTM(32)(answer)
answer = Dropout(0.5)(answer)
answer = Dense(vocab_size)(answer) # Output (samples, vocab_size)
answer = Activation('softmax')(answer) # To output yes or no

In [41]:
# Model
model_cb = Model([input_sequence,question],answer)
model_cb.compile(optimizer='rmsprop',loss='categorical_crossentropy',metrics=['accuracy'])
model_cb.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, 156)]        0           []                               
                                                                                                  
 input_4 (InputLayer)           [(None, 6)]          0           []                               
                                                                                                  
 sequential_3 (Sequential)      (None, None, 64)     2432        ['input_3[0][0]']                
                                                                                                  
 sequential_5 (Sequential)      (None, 6, 64)        2432        ['input_4[0][0]']                
                                                                                            

### Training the network and testing

In [42]:
history = model_cb.fit([inputs_train,queries_train],answers_train,epochs=100,batch_size=32,validation_data=([inputs_test,queries_test],answers_test))
#model_cb.save('cb_model.h5')

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
 21/313 [=>............................] - ETA: 3s - loss: 0.6966 - accuracy: 0.4658

In [None]:
# Using pre-trained model
model_cb.load_weights('chatbot_10.h5')
pred_results = model_cb.predict([inputs_test,queries_test])

#Using argmax to get the index of the highest probability
val_max = np.argmax(pred_results[2])
for key,val in tokenizer.word_index.items():
    if val == val_max:
        print(key)

took
