In [1]:
#import all the necessary libraries
import tensorflow as tf
import numpy as np
import ast
import re
from tensorflow import keras
import time
import os
import io

In [2]:
#contains code under the Apache License, Version 2.0 copyright 2019 the TensorFlow Authors
#sections of code have been modified for other usage

In [100]:
#syntax operations on a given sentence to format it
def preprocess(sentence):
    return re.sub(r"[^a-zA-Z?.!,\']+", " ", re.sub(r'[" "]+', " ", re.sub(r"([?.!,])", r" \1 ", sentence))).strip()

In [3]:
#uses movie conversations and movie lines to create a (prompts, responses) tuple
def create_data():
    dialouge = open('./movie_lines.txt').read()
    conversations = open('./movie_conversations.txt').read()
    dialogue_data = {} #key val map with move linenumas the key and dialouge as the val

    prompts, responses = [], []

    #process the dialouge first
    for l in dialouge.split('\n'):
        line = l.split(' +++$+++ ')
        if not len(line) == 5: #skip if too short
            continue
        #treate puncation as sepereate by spacing them out and then removing all but the recognized characters
        dialogue_data[(line[2] + line[0])] = preprocess(line[4])

    #use the conversations file to assemble conversations
    for l in conversations.split('\n'):
        line = l.split(' +++$+++ ')

        if not len(line) == 4: #skip if too short
            continue
        #parse the last element as a list
        lines = ast.literal_eval(line[3])
        for i in range(1, len(lines)):
            #assemble the lists
            prompts.append(dialogue_data[line[2] + lines[i-1]])
            responses.append(dialogue_data[line[2] + lines[i]])

    return prompts, responses


In [4]:
#call the data set creation function
original_prompts, original_responses = create_data()

#cut down the responses
MAX_PAIRS = 1000
original_prompts = original_prompts[:MAX_PAIRS]
original_responses = original_responses[:MAX_PAIRS]

In [5]:
#print an example prompt/response
print(original_prompts[0], original_responses[0])

Can we make this quick ? Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break up on the quad . Again . Well , I thought we'd start with pronunciation , if that's okay with you .


In [94]:
#create the tokenizer for the sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# handle everything including punctuation
c_tokenizer = Tokenizer(filters='', oov_token='<OOV>', num_words=2**13)
# fit on all data
c_tokenizer.fit_on_texts(original_prompts + original_responses)

START_TOKEN, END_TOKEN = [len(c_tokenizer.word_index)], [len(c_tokenizer.word_index) + 1]

VOCAB_SIZE = len(c_tokenizer.word_index) + 2


In [127]:
#create and pad the sequences
from tensorflow.keras.preprocessing.sequence import pad_sequences

MAX_LENGTH = 50

tokenized_x = c_tokenizer.texts_to_sequences(original_prompts)
tokenized_y = c_tokenizer.texts_to_sequences(original_responses)

data_x, data_y = [], []

#iterate through sequences making sure to only use sentences under max length
for (question, answer) in zip(tokenized_x, tokenized_y):
    if len(question) <= MAX_LENGTH and len(answer) <= MAX_LENGTH:
        data_x.append(START_TOKEN + question + END_TOKEN)
        data_y.append(START_TOKEN + answer + END_TOKEN)

data_x = pad_sequences(data_x, padding='post', maxlen=MAX_LENGTH)
data_y = pad_sequences(data_y, padding='post', maxlen=MAX_LENGTH)

st'> <class 'list'>
<class 'list'> <class 'list'>
<class 'list'> <class 'list'>
<class 'list'> <class 'list'>
<class 'list'> <class 'list'>
<class 'list'> <class 'list'>
<class 'list'> <class 'list'>
<class 'list'> <class 'list'>
<class 'list'> <class 'list'>
<class 'list'> <class 'list'>
<class 'list'> <class 'list'>
<class 'list'> <class 'list'>
<class 'list'> <class 'list'>
<class 'list'> <class 'list'>
<class 'list'> <class 'list'>
<class 'list'> <class 'list'>
<class 'list'> <class 'list'>
<class 'list'> <class 'list'>
<class 'list'> <class 'list'>
<class 'list'> <class 'list'>
<class 'list'> <class 'list'>
<class 'list'> <class 'list'>
<class 'list'> <class 'list'>
<class 'list'> <class 'list'>
<class 'list'> <class 'list'>
<class 'list'> <class 'list'>
<class 'list'> <class 'list'>
<class 'list'> <class 'list'>
<class 'list'> <class 'list'>
<class 'list'> <class 'list'>
<class 'list'> <class 'list'>
<class 'list'> <class 'list'>
<class 'list'> <class 'list'>
<class 'list'> <clas

In [8]:
print(data_x.shape, data_y.shape)

(983, 50) (983, 50)


In [87]:
#create a tensorflow dataset for training purposes
BATCH_SIZE = 64
BUFFER_SIZE = 20000

dataset = tf.data.Dataset.from_tensor_slices((
    {
        'inputs': data_x,
        'dec_inputs': data_y[:, :-1]
    },
    {
        'outputs': data_y[:, 1:]
    }
))

dataset = dataset.cache() #creates a dataset in memory
dataset = dataset.shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

In [10]:
# following layers and functions pulled from https://colab.research.google.com/github/tensorflow/examples/blob/master/community/en/transformer_chatbot.ipynb#scrollTo=L9eYssGIAG4h
def scaled_dot_product_attention(query, key, value, mask):
    """Calculate the attention weights. """
    matmul_qk = tf.matmul(query, key, transpose_b=True)

    # scale matmul_qk
    depth = tf.cast(tf.shape(key)[-1], tf.float32)
    logits = matmul_qk / tf.math.sqrt(depth)

    # add the mask to zero out padding tokens
    if mask is not None:
        logits += (mask * -1e9)

    # softmax is normalized on the last axis (seq_len_k)
    attention_weights = tf.nn.softmax(logits, axis=-1)

    output = tf.matmul(attention_weights, value)

    return output

In [11]:
class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, name="multi_head_attention"):
        super(MultiHeadAttention, self).__init__(name=name)
        self.num_heads = num_heads
        self.d_model = d_model

        assert d_model % self.num_heads == 0

        self.depth = d_model // self.num_heads

        self.query_dense = tf.keras.layers.Dense(units=d_model)
        self.key_dense = tf.keras.layers.Dense(units=d_model)
        self.value_dense = tf.keras.layers.Dense(units=d_model)

        self.dense = tf.keras.layers.Dense(units=d_model)

    def split_heads(self, inputs, batch_size):
        inputs = tf.reshape(
            inputs, shape=(batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(inputs, perm=[0, 2, 1, 3])

    def call(self, inputs):
        query, key, value, mask = inputs['query'], inputs['key'], inputs[
            'value'], inputs['mask']
        batch_size = tf.shape(query)[0]

        # linear layers
        query = self.query_dense(query)
        key = self.key_dense(key)
        value = self.value_dense(value)

        # split heads
        query = self.split_heads(query, batch_size)
        key = self.split_heads(key, batch_size)
        value = self.split_heads(value, batch_size)

        # scaled dot-product attention
        scaled_attention = scaled_dot_product_attention(query, key, value, mask)

        scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])

        # concatenation of heads
        concat_attention = tf.reshape(scaled_attention,
                                    (batch_size, -1, self.d_model))

        # final linear layer
        outputs = self.dense(concat_attention)

        return outputs

In [49]:
#gets rid of 0 tokens to not affect calculations
def create_padding_mask(x):
    mask = tf.cast(tf.math.equal(x, 0), tf.float32)
    # (batch_size, 1, 1, sequence length)
    return mask[:, tf.newaxis, tf.newaxis, :]

In [65]:
#creates a mask for predicting future words
def create_look_ahead_mask(x):
    seq_len = tf.shape(x)[1]
    look_ahead_mask = 1 - tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)
    padding_mask = create_padding_mask(x)
    return tf.maximum(look_ahead_mask, padding_mask)

In [66]:
#gives words meaning based on position
class PositionalEncoding(tf.keras.layers.Layer):

    def __init__(self, position, d_model):
        super(PositionalEncoding, self).__init__()
        self.pos_encoding = self.positional_encoding(position, d_model)

    def get_angles(self, position, i, d_model):
        angles = 1 / tf.pow(10000, (2 * (i // 2)) / tf.cast(d_model, tf.float32))
        return position * angles

    def positional_encoding(self, position, d_model):
        angle_rads = self.get_angles(
            position=tf.range(position, dtype=tf.float32)[:, tf.newaxis],
            i=tf.range(d_model, dtype=tf.float32)[tf.newaxis, :],
            d_model=d_model)
        # apply sin to even index in the array
        sines = tf.math.sin(angle_rads[:, 0::2])
        # apply cos to odd index in the array
        cosines = tf.math.cos(angle_rads[:, 1::2])

        pos_encoding = tf.concat([sines, cosines], axis=-1)
        pos_encoding = pos_encoding[tf.newaxis, ...]
        return tf.cast(pos_encoding, tf.float32)

    def call(self, inputs):
        return inputs + self.pos_encoding[:, :tf.shape(inputs)[1], :]

In [67]:
#gives the input meaning to be used in the decoder
def encoder_layer(units, d_model, num_heads, dropout, name="encoder_layer"):
    inputs = tf.keras.Input(shape=(None, d_model), name="inputs")
    padding_mask = tf.keras.Input(shape=(1, 1, None), name="padding_mask")

    attention = MultiHeadAttention(
        d_model, num_heads, name="attention")({
            'query': inputs,
            'key': inputs,
            'value': inputs,
            'mask': padding_mask
        })
    attention = tf.keras.layers.Dropout(rate=dropout)(attention)
    attention = tf.keras.layers.LayerNormalization(
        epsilon=1e-6)(inputs + attention)

    outputs = tf.keras.layers.Dense(units=units, activation='relu')(attention)
    outputs = tf.keras.layers.Dense(units=d_model)(outputs)
    outputs = tf.keras.layers.Dropout(rate=dropout)(outputs)
    outputs = tf.keras.layers.LayerNormalization(
        epsilon=1e-6)(attention + outputs)

    return tf.keras.Model(
        inputs=[inputs, padding_mask], outputs=outputs, name=name)

In [68]:
#combines previous functions to give words meaning
def encoder(vocab_size,
            num_layers,
            units,
            d_model,
            num_heads,
            dropout,
            name="encoder"):
    inputs = tf.keras.Input(shape=(None,), name="inputs")
    padding_mask = tf.keras.Input(shape=(1, 1, None), name="padding_mask")

    embeddings = tf.keras.layers.Embedding(vocab_size, d_model)(inputs)
    embeddings *= tf.math.sqrt(tf.cast(d_model, tf.float32))
    embeddings = PositionalEncoding(vocab_size, d_model)(embeddings)

    outputs = tf.keras.layers.Dropout(rate=dropout)(embeddings)

    for i in range(num_layers):
        outputs = encoder_layer(
            units=units,
            d_model=d_model,
            num_heads=num_heads,
            dropout=dropout,
            name="encoder_layer_{}".format(i),
        )([outputs, padding_mask])

    return tf.keras.Model(
        inputs=[inputs, padding_mask], outputs=outputs, name=name)

In [69]:
#decoding layer uses encoded inputs to create new words
def decoder_layer(units, d_model, num_heads, dropout, name="decoder_layer"):
    inputs = tf.keras.Input(shape=(None, d_model), name="inputs")
    enc_outputs = tf.keras.Input(shape=(None, d_model), name="encoder_outputs")
    look_ahead_mask = tf.keras.Input(
        shape=(1, None, None), name="look_ahead_mask")
    padding_mask = tf.keras.Input(shape=(1, 1, None), name='padding_mask')

    attention1 = MultiHeadAttention(
        d_model, num_heads, name="attention_1")(inputs={
            'query': inputs,
            'key': inputs,
            'value': inputs,
            'mask': look_ahead_mask
        })
    attention1 = tf.keras.layers.LayerNormalization(
        epsilon=1e-6)(attention1 + inputs)

    attention2 = MultiHeadAttention(
        d_model, num_heads, name="attention_2")(inputs={
            'query': attention1,
            'key': enc_outputs,
            'value': enc_outputs,
            'mask': padding_mask
        })
    attention2 = tf.keras.layers.Dropout(rate=dropout)(attention2)
    attention2 = tf.keras.layers.LayerNormalization(
        epsilon=1e-6)(attention2 + attention1)

    outputs = tf.keras.layers.Dense(units=units, activation='relu')(attention2)
    outputs = tf.keras.layers.Dense(units=d_model)(outputs)
    outputs = tf.keras.layers.Dropout(rate=dropout)(outputs)
    outputs = tf.keras.layers.LayerNormalization(
        epsilon=1e-6)(outputs + attention2)

    return tf.keras.Model(
        inputs=[inputs, enc_outputs, look_ahead_mask, padding_mask],
        outputs=outputs,
        name=name)

In [70]:
def decoder(vocab_size,
            num_layers,
            units,
            d_model,
            num_heads,
            dropout,
            name='decoder'):
    inputs = tf.keras.Input(shape=(None,), name='inputs')
    enc_outputs = tf.keras.Input(shape=(None, d_model), name='encoder_outputs')
    look_ahead_mask = tf.keras.Input(
        shape=(1, None, None), name='look_ahead_mask')
    padding_mask = tf.keras.Input(shape=(1, 1, None), name='padding_mask')
    
    embeddings = tf.keras.layers.Embedding(vocab_size, d_model)(inputs)
    embeddings *= tf.math.sqrt(tf.cast(d_model, tf.float32))
    embeddings = PositionalEncoding(vocab_size, d_model)(embeddings)

    outputs = tf.keras.layers.Dropout(rate=dropout)(embeddings)

    for i in range(num_layers):
        outputs = decoder_layer(
            units=units,
            d_model=d_model,
            num_heads=num_heads,
            dropout=dropout,
            name='decoder_layer_{}'.format(i),
        )(inputs=[outputs, enc_outputs, look_ahead_mask, padding_mask])

    return tf.keras.Model(
        inputs=[inputs, enc_outputs, look_ahead_mask, padding_mask],
        outputs=outputs,
        name=name)

In [86]:
def transformer(vocab_size,
                num_layers,
                units,
                d_model,
                num_heads,
                dropout,
                name="transformer"):
    inputs = tf.keras.Input(shape=(None,), name="inputs")
    dec_inputs = tf.keras.Input(shape=(None,), name="dec_inputs")

    enc_padding_mask = tf.keras.layers.Lambda(
        create_padding_mask, output_shape=(1, 1, None),
        name='enc_padding_mask')(inputs)
    # mask the future tokens for decoder inputs at the 1st attention block
    look_ahead_mask = tf.keras.layers.Lambda(
        create_look_ahead_mask,
        output_shape=(1, None, None),
        name='look_ahead_mask')(dec_inputs)
    # mask the encoder outputs for the 2nd attention block
    dec_padding_mask = tf.keras.layers.Lambda(
        create_padding_mask, output_shape=(1, 1, None),
        name='dec_padding_mask')(inputs)

    enc_outputs = encoder(
        vocab_size=vocab_size,
        num_layers=num_layers,
        units=units,
        d_model=d_model,
        num_heads=num_heads,
        dropout=dropout,
    )(inputs=[inputs, enc_padding_mask])

    dec_outputs = decoder(
        vocab_size=vocab_size,
        num_layers=num_layers,
        units=units,
        d_model=d_model,
        num_heads=num_heads,
        dropout=dropout,
    )(inputs=[dec_inputs, enc_outputs, look_ahead_mask, dec_padding_mask])

    outputs = tf.keras.layers.Dense(units=vocab_size, name="outputs")(dec_outputs)

    return tf.keras.Model(inputs=[inputs, dec_inputs], outputs=outputs, name=name)

In [95]:
#create a transformer model (change the values as needed)
NUM_LAYERS = 2
D_MODEL = 256
NUM_HEADS = 8
UNITS = 512
DROPOUT = 0.1

model = transformer(
    vocab_size=VOCAB_SIZE,
    num_layers=NUM_LAYERS,
    units=UNITS,
    d_model=D_MODEL,
    num_heads=NUM_HEADS,
    dropout=DROPOUT)

In [96]:
#determines innaccuracy
def loss_function(y_true, y_pred):
    y_true = tf.reshape(y_true, shape=(-1, MAX_LENGTH - 1))
    
    loss = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction='none')(y_true, y_pred)

    mask = tf.cast(tf.not_equal(y_true, 0), tf.float32)
    loss = tf.multiply(loss, mask)

    return tf.reduce_mean(loss)

In [97]:
#variable learning rate to optimize model
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):

    def __init__(self, d_model, warmup_steps=4000):
        super(CustomSchedule, self).__init__()

        self.d_model = d_model
        self.d_model = tf.cast(self.d_model, tf.float32)

        self.warmup_steps = warmup_steps

    def __call__(self, step):
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps**-1.5)

        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

In [98]:
#put everything together
learning_rate = CustomSchedule(D_MODEL)

optimizer = tf.keras.optimizers.Adam(
    learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9
)

def accuracy(y_true, y_pred):
    # ensure labels have shape (batch_size, MAX_LENGTH - 1)
    y_true = tf.reshape(y_true, shape=(-1, MAX_LENGTH - 1))
    return tf.keras.metrics.sparse_categorical_accuracy(y_true, y_pred)

model.compile(optimizer=optimizer, loss=loss_function, metrics=[accuracy])
model.summary()

Model: "transformer"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
inputs (InputLayer)             [(None, None)]       0                                            
__________________________________________________________________________________________________
dec_inputs (InputLayer)         [(None, None)]       0                                            
__________________________________________________________________________________________________
enc_padding_mask (Lambda)       (None, 1, 1, None)   0           inputs[0][0]                     
__________________________________________________________________________________________________
encoder (Functional)            (None, None, 256)    1645824     inputs[0][0]                     
                                                                 enc_padding_mask[0][0] 

In [99]:
#train the model
EPOCHS = 20

model.fit(dataset, epochs=EPOCHS)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x2488caab948>

In [234]:
#create a prediction
def predict(sentence):
    sentence = preprocess(sentence)

    #tokenize the sentence
    sentence = START_TOKEN + c_tokenizer.texts_to_sequences([sentence])[0] + END_TOKEN
    #expand dimension to comply with the model
    sentence = tf.expand_dims(sentence, axis=0)
    #create a placeholder output
    output = tf.expand_dims(START_TOKEN, axis=0)

    #predict word by word
    for i in range(MAX_LENGTH):
        predictions = model(inputs=[sentence, output], training=False)

        predictions = predictions[:, -1:, :]
        predicted_id = tf.cast(tf.argmax(predictions, axis=-1), tf.int32)

        #stop if the sentence has reached END_TOKEN
        if tf.equal(predicted_id, END_TOKEN[0]):
            break
        output = tf.concat([output, predicted_id], axis=-1)

    #flatten the output and remove the first number
    output = [tf.squeeze(output, axis=0).numpy().tolist()]
    
    #convert it to a string
    output = c_tokenizer.sequences_to_texts(output)
    return output

In [235]:
predict("What's good ma")

[[2309, 6, 2]]


['thrown i .']