<img src="../Pics/MLSb-T.png" width="160">
<br><br>
<center><u><H1>Developing a Chatbot</H1></u></center>

In [1]:
import tensorflow as tf
from keras.backend.tensorflow_backend import set_session
# Session-wide TensorFlow settings, registered as the Keras backend session:
#  - allow_growth: grow GPU memory usage as needed instead of reserving it all
#  - log_device_placement: log the device each op is assigned to
config = tf.ConfigProto(
    gpu_options=tf.GPUOptions(allow_growth=True),
    log_device_placement=True,
)
sess = tf.Session(config=config)
set_session(sess)

Using TensorFlow backend.


### The bAbI project:
The dataset was open-sourced by Facebook AI Research.
https://research.fb.com/downloads/babi/

It is a set of 20 QA tasks, each consisting of several context-question-answer triplets.

https://github.com/facebook/bAbI-tasks

The original paper: Towards AI-Complete Question Answering: A Set of Prerequisite Toy Tasks.
https://arxiv.org/abs/1502.05698


In [2]:
from keras.models import Sequential, Model
from keras.layers.embeddings import Embedding
from keras.layers import Input, Activation, Dense, CuDNNLSTM, Permute, Dropout, BatchNormalization, add, dot, concatenate
from keras.utils.data_utils import get_file
from keras.preprocessing.sequence import pad_sequences
from keras.optimizers import Adam
from functools import reduce
import tarfile
import numpy as np
import re

## Helper functions for parsing and vectorizing the data:

In [3]:
def tokenize(sent):
    """Split a sentence into tokens, keeping punctuation as separate tokens.

    Args:
        sent: the sentence to split (``str``).

    Returns:
        A list of non-empty, whitespace-stripped tokens, e.g.
        ``'Where is Daniel?'`` -> ``['Where', 'is', 'Daniel', '?']``.
    """
    # The separator group must consume at least one character. An optional
    # group (`(\W+)?`) can match the empty string, which makes re.split()
    # split between every character (and yield None items) on Python >= 3.7.
    # The raw string also avoids the invalid-escape warning for `\W`.
    pieces = re.split(r'(\W+)', sent)
    return [piece.strip() for piece in pieces if piece.strip()]

In [4]:
def parse_stories(lines, only_supporting=False):
    """Parse stories provided in the bAbI tasks format.

    Every line has the form ``"<id> <text>"``. An id of 1 starts a new story.
    A text containing tabs is a question line,
    ``"<question>\\t<answer>\\t<supporting ids>"``; any other text is a story
    sentence.

    Args:
        lines: iterable of lines, either UTF-8 encoded ``bytes`` or ``str``.
            Blank lines are skipped.
        only_supporting: if true, only the sentences listed as supporting the
            answer are kept for each question; otherwise every sentence seen
            so far in the current story is kept.

    Returns:
        A list of ``(substory, question_tokens, answer)`` tuples, where
        ``substory`` is a list of tokenized sentences and ``answer`` is the
        raw answer string.
    """
    data = []
    story = []
    for raw_line in lines:
        # Accept both bytes and already-decoded text lines.
        line = raw_line.decode('utf-8') if isinstance(raw_line, bytes) else raw_line
        line = line.strip()
        if not line:
            # A blank line carries no id/text pair; skip it instead of crashing.
            continue
        line_id, text = line.split(' ', 1)
        if int(line_id) == 1:
            story = []
        if '\t' in text:
            q, a, supporting = text.split('\t')
            question_tokens = tokenize(q)
            if only_supporting:
                # Supporting ids are 1-based line numbers within the story.
                substory = [story[int(i) - 1] for i in supporting.split()]
            else:
                # Question lines are stored as '' placeholders; drop them.
                substory = [sentence for sentence in story if sentence]
            data.append((substory, question_tokens, a))
            # Placeholder keeps later line ids aligned with list positions.
            story.append('')
        else:
            story.append(tokenize(text))
    return data

In [5]:
def get_stories(file, only_supporting=False, max_length=None):
    """Read a bAbI task file and return its stories as flat token lists.

    Args:
        file: an open file-like object; its lines are passed to
            ``parse_stories``.
        only_supporting: forwarded to ``parse_stories``.
        max_length: if set (non-zero), only stories with strictly fewer than
            ``max_length`` tokens are kept.

    Returns:
        A list of ``(story_tokens, question_tokens, answer)`` tuples, where
        ``story_tokens`` is all sentences of the story concatenated into one
        list of tokens.
    """
    parsed = parse_stories(file.readlines(), only_supporting=only_supporting)
    data = []
    for story, question, answer in parsed:
        # Flatten once per story. Unlike reduce() without an initial value,
        # this also handles a question that has no preceding sentences.
        flat_story = [token for sentence in story for token in sentence]
        if not max_length or len(flat_story) < max_length:
            data.append((flat_story, question, answer))
    return data

In [6]:
def vectorize_stories(data, word_id, story_maxlen, question_maxlen):
    """Turn (story, question, answer) token triples into padded id arrays.

    Args:
        data: iterable of ``(story_tokens, question_tokens, answer)`` tuples.
        word_id: mapping from token to integer id.
        story_maxlen: ``maxlen`` passed to ``pad_sequences`` for stories.
        question_maxlen: ``maxlen`` passed to ``pad_sequences`` for questions.

    Returns:
        A tuple ``(stories, questions, answers)``: the two padded id
        sequences and an array of one-hot answer vectors of length
        ``len(word_id) + 1``.
    """
    # Index 0 is reserved, so the one-hot vectors need one extra slot
    n_classes = len(word_id) + 1
    story_ids, question_ids, answer_onehots = [], [], []
    for story, question, answer in data:
        story_ids.append([word_id[token] for token in story])
        question_ids.append([word_id[token] for token in question])
        onehot = np.zeros(n_classes)
        onehot[word_id[answer]] = 1
        answer_onehots.append(onehot)
    padded_stories = pad_sequences(story_ids, maxlen=story_maxlen)
    padded_questions = pad_sequences(question_ids, maxlen=question_maxlen)
    return (padded_stories, padded_questions, np.array(answer_onehots))

## Loading the data:

In [7]:
# Fetch the bAbI archive through Keras' get_file, then open it for reading.
babi_archive_path = get_file(
    'babi-tasks-v1-2.tar.gz',
    origin='https://s3.amazonaws.com/text-datasets/babi_tasks_1-20_v1-2.tar.gz',
)
tar_file = tarfile.open(babi_archive_path)

In [8]:
# Archive member path templates; '{}' is filled with 'train' or 'test'.
challenges = {
    'single_supporting_fact_10k': 'tasks_1-20_v1-2/en-10k/qa1_single-supporting-fact_{}.txt', # QA1 with 10,000 samples
    'two_supporting_facts_10k': 'tasks_1-20_v1-2/en-10k/qa2_two-supporting-facts_{}.txt', # QA2 with 10,000 samples
}
challenge_type = 'single_supporting_fact_10k'
challenge = challenges[challenge_type]

# extractfile() hands back an open file object for the archive member;
# close each one as soon as its stories have been parsed.
with tar_file.extractfile(challenge.format('train')) as train_file:
    train_stories = get_stories(train_file)
with tar_file.extractfile(challenge.format('test')) as test_file:
    test_stories = get_stories(test_file)

  return _compile(pattern, flags).split(string, maxsplit)


In [9]:
print('Train stories lenght:', len(train_stories))
print('Test stories lenght:', len(test_stories))

Train stories lenght: 10000
Test stories lenght: 1000


### Each story is a tuple (story, question, answer)

In [10]:
train_stories[1]

(['Mary',
  'moved',
  'to',
  'the',
  'bathroom',
  '.',
  'John',
  'went',
  'to',
  'the',
  'hallway',
  '.',
  'Daniel',
  'went',
  'back',
  'to',
  'the',
  'hallway',
  '.',
  'Sandra',
  'moved',
  'to',
  'the',
  'garden',
  '.'],
 ['Where', 'is', 'Daniel', '?'],
 'hallway')

In [11]:
test_stories[1]

(['John',
  'travelled',
  'to',
  'the',
  'hallway',
  '.',
  'Mary',
  'journeyed',
  'to',
  'the',
  'bathroom',
  '.',
  'Daniel',
  'went',
  'back',
  'to',
  'the',
  'bathroom',
  '.',
  'John',
  'moved',
  'to',
  'the',
  'bedroom',
  '.'],
 ['Where', 'is', 'Mary', '?'],
 'bathroom')

In [12]:
# Collect every distinct token used in any story, question or answer,
# across both splits, then fix a deterministic (sorted) order.
vocab = set()
for story, question, answer in train_stories + test_stories:
    vocab.update(story, question, [answer])
vocab = sorted(vocab)
print(vocab)

In [14]:
# Reserve 0 for masking via pad_sequences
vocab_size = len(vocab) + 1
vocab_size

22

In [15]:
# Longest story / question (in tokens) over both splits
all_stories = train_stories + test_stories
story_maxlen = max(len(story_tokens) for story_tokens, _, _ in all_stories)
question_maxlen = max(len(question_tokens) for _, question_tokens, _ in all_stories)

In [16]:
print('Story max length:', story_maxlen)
print('Question max length:', question_maxlen)

Story max length: 68
Question max length: 4


### Vectorizing stories:

In [17]:
# Ids start at 1, leaving id 0 unused by any vocabulary word
word2id = {word: idx for idx, word in enumerate(vocab, start=1)}
print(word2id)

{'.': 1, '?': 2, 'Daniel': 3, 'John': 4, 'Mary': 5, 'Sandra': 6, 'Where': 7, 'back': 8, 'bathroom': 9, 'bedroom': 10, 'garden': 11, 'hallway': 12, 'is': 13, 'journeyed': 14, 'kitchen': 15, 'moved': 16, 'office': 17, 'the': 18, 'to': 19, 'travelled': 20, 'went': 21}


In [18]:
inputs_train, questions_train, answers_train = vectorize_stories(train_stories, word2id, story_maxlen, question_maxlen)

In [19]:
inputs_test, questions_test, answers_test = vectorize_stories(test_stories, word2id, story_maxlen, question_maxlen)

In [20]:
print('inputs_train shape:', inputs_train.shape)
print('inputs_test shape:', inputs_test.shape)

inputs_train shape: (10000, 68)
inputs_test shape: (1000, 68)


In [21]:
print('queries_train shape:', questions_train.shape)
print('queries_test shape:', questions_test.shape)

queries_train shape: (10000, 4)
queries_test shape: (1000, 4)


In [22]:
print('answers_train shape:', answers_train.shape)
print('answers_test shape:', answers_test.shape)

answers_train shape: (10000, 22)
answers_test shape: (1000, 22)


## Creating the model: an End-to-End Memory Network

## Memory module:

In [23]:
# Symbolic inputs: one row of story_maxlen token ids per story and one row
# of question_maxlen token ids per question.
story_sequence = Input((story_maxlen,))
question = Input((question_maxlen,))

# First story encoder: embeds each story token id into a 64-dim vector,
# followed by dropout.
# output: (samples, story_maxlen, 64)
input_encoder_s = Sequential()
input_encoder_s.add(Embedding(input_dim=vocab_size, output_dim=64))
input_encoder_s.add(Dropout(0.3))

# Second story encoder: embeds each story token id into a vector of size
# question_maxlen, so its output can later be added element-wise to the
# story/question match matrix.
# output: (samples, story_maxlen, question_maxlen)
input_encoder_q = Sequential()
input_encoder_q.add(Embedding(input_dim=vocab_size, output_dim=question_maxlen))
input_encoder_q.add(Dropout(0.3))

# Question encoder: embeds each question token id into a 64-dim vector,
# followed by dropout.
# output: (samples, question_maxlen, 64)
question_encoder = Sequential()
question_encoder.add(Embedding(input_dim=vocab_size, output_dim=64, input_length=question_maxlen))
question_encoder.add(Dropout(0.3))

## Controller module:

In [24]:
# encode the story (with both story encoders) and the question
# into sequences of dense vectors
input_encoded_s = input_encoder_s(story_sequence)
input_encoded_q = input_encoder_q(story_sequence)
question_encoded = question_encoder(question)

# compute a 'match' between the first story encoding and the question
# encoding: a dot product over the shared 64-dim embedding axis
# shape: `(samples, story_maxlen, question_maxlen)`
match = dot([input_encoded_s, question_encoded], axes=(2, 2))
# NOTE(review): Activation('softmax') normalises along the last axis by
# default, i.e. across question positions — confirm this is intended.
match = Activation('softmax')(match)

# add the match matrix to the second story encoding (same shape)
response = add([match, input_encoded_q])  # (samples, story_maxlen, question_maxlen)
response = Permute((2, 1))(response)  # (samples, question_maxlen, story_maxlen)

# concatenate the response with the question vector sequence along the
# last axis (the default): (samples, question_maxlen, story_maxlen + 64)
answer = concatenate([response, question_encoded])

# reduce the sequence over question positions to a single vector
answer = CuDNNLSTM(32)(answer)  # (samples, 32)
answer = Dropout(0.3)(answer)
answer = BatchNormalization()(answer)

# one probability per vocabulary id (including the reserved id 0)
output = Dense(vocab_size)(answer)  # (samples, vocab_size)
output = Activation('softmax')(output)

In [25]:
model = Model([story_sequence, question], output)

In [26]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 68)           0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 4)            0                                            
__________________________________________________________________________________________________
sequential_1 (Sequential)       multiple             1408        input_1[0][0]                    
__________________________________________________________________________________________________
sequential_3 (Sequential)       (None, 4, 64)        1408        input_2[0][0]                    
__________________________________________________________________________________________________
dot_1 (Dot

In [27]:
model.compile(optimizer=Adam(0.005), loss='categorical_crossentropy', metrics=['accuracy'])

## Training the model:

In [28]:
%%time
# NOTE: the test split is passed as validation data, so the validation
# metrics reported during training are computed on the same samples that
# are used for prediction further down.
model.fit([inputs_train, questions_train], answers_train,
          batch_size=128,
          epochs=120,
          validation_data=([inputs_test, questions_test], answers_test))

Train on 10000 samples, validate on 1000 samples
Epoch 1/120
Epoch 2/120
Epoch 3/120
Epoch 4/120
Epoch 5/120
Epoch 6/120
Epoch 7/120
Epoch 8/120
Epoch 9/120
Epoch 10/120
Epoch 11/120
Epoch 12/120
Epoch 13/120
Epoch 14/120
Epoch 15/120
Epoch 16/120
Epoch 17/120
Epoch 18/120
Epoch 19/120
Epoch 20/120
Epoch 21/120
Epoch 22/120
Epoch 23/120
Epoch 24/120
Epoch 25/120
Epoch 26/120
Epoch 27/120
Epoch 28/120
Epoch 29/120
Epoch 30/120
Epoch 31/120
Epoch 32/120
Epoch 33/120
Epoch 34/120
Epoch 35/120
Epoch 36/120
Epoch 37/120
Epoch 38/120
Epoch 39/120
Epoch 40/120
Epoch 41/120
Epoch 42/120
Epoch 43/120
Epoch 44/120
Epoch 45/120
Epoch 46/120
Epoch 47/120
Epoch 48/120
Epoch 49/120
Epoch 50/120
Epoch 51/120
Epoch 52/120
Epoch 53/120
Epoch 54/120
Epoch 55/120
Epoch 56/120
Epoch 57/120
Epoch 58/120
Epoch 59/120
Epoch 60/120
Epoch 61/120
Epoch 62/120
Epoch 63/120
Epoch 64/120
Epoch 65/120
Epoch 66/120
Epoch 67/120
Epoch 68/120
Epoch 69/120
Epoch 70/120
Epoch 71/120
Epoch 72/120
Epoch 73/120
Epoch 74/12

Epoch 118/120
Epoch 119/120
Epoch 120/120
Wall time: 1min 32s


<keras.callbacks.History at 0x2863e8846d8>

In [29]:
model.save('../data/chatbot/chatbot_model.h5')

In [30]:
# Reload the weights saved above and score every test sample;
# pred holds one row of per-vocabulary-id probabilities per sample.
model.load_weights('../data/chatbot/chatbot_model.h5')
pred = model.predict([inputs_test, questions_test])

## Testing with random samples:

In [31]:
# Pick a random test sample. The upper bound (exclusive) follows the actual
# size of the test set instead of a hard-coded 1000, so the cell keeps
# working if a different challenge or split size is loaded.
n = np.random.randint(0, len(test_stories))

In [32]:
# Story tokens of the sampled test item, re-joined for display
story_list = test_stories[n][0]
story = ' '.join(story_list)
print("Story is:",story)

Story is: Daniel went to the hallway . Daniel went to the bedroom .


In [33]:
# Question tokens of the sampled test item, re-joined for display.
# NOTE: this rebinds `question`, which previously held the Keras Input tensor.
question_list = test_stories[n][1]
question = ' '.join(question_list)
print("Question is: ", question)

Question is:  Where is Daniel ?


In [34]:
answer = test_stories[n][2]
print("Actual answer is: ", answer)

Actual answer is:  bedroom


### Showing probabilities:

In [35]:
# Despite its name, max_value is the *index* (vocabulary id) of the highest
# predicted probability for sample n, not the probability itself.
max_value = np.argmax(pred[n])

Machine answer is:  bedroom
Machine says: I am  0.99981207 certain of it


In [None]:
# Invert the vocabulary once so the predicted id maps straight to its word.
id2word = {idx: word for word, idx in word2id.items()}

# Id 0 is reserved and has no vocabulary word. Fall back to a placeholder so
# `k` is always defined, rather than raising NameError or silently reusing a
# value left over from an earlier run of this cell.
k = id2word.get(max_value, '<pad>')

print("Machine answer is: ", k)
print("Machine says: I am ", pred[n][max_value], "certain of it")

## Reference:

https://research.fb.com/downloads/babi/