In [154]:
import tensorflow as tf
import tensorflow_datasets as tfds

import os
import re
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from keras.layers import Input, LSTM, Dense, Embedding, TimeDistributed, Dropout, Flatten, Bidirectional, Concatenate, Reshape, Average, Add

from tensorflow.keras.models import Model
import tensorflow_hub as hub
import bert
FullTokenizer = bert.bert_tokenization.FullTokenizer
import math

Load the data

In [187]:
num_samples = 20000 # Number of samples to train on.

In [188]:
#read from dataset and split into input and targets
df = pd.read_csv("char_cleaned_data3.csv")
# df.fillna('', inplace=True)
df.dropna(subset = ["x1"], inplace=True)
df.dropna(subset = ["y1"], inplace=True)
df.head()

Unnamed: 0,x1,y1
0,what kind of phones do you guys have,i have a it is pretty great much better than w...
1,i have a it is pretty great much better than w...,does it really charge all the way in min
2,does it really charge all the way in min,pretty fast i have never it but it is under ha...
3,what kind of phones do you guys have,samsung galaxy j it is my first cell phone and...
4,samsung galaxy j it is my first cell phone and...,what do you think of it anything you do not like


In [189]:
print(len(df))
questions = df["x1"].tolist()[:num_samples]
answers = df["y1"].tolist()[:num_samples]

print(len(questions))
print(len(answers))

47365
20000
20000


In [190]:
max_seq_length = 128 
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
                            trainable=True)

In [191]:
def get_masks(tokens, max_seq_length):
    """Mask for padding"""
    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    return [1]*len(tokens) + [0] * (max_seq_length - len(tokens))


def get_segments(tokens, max_seq_length):
    """Segments: 0 for the first sequence, 1 for the second"""
    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    segments = []
    current_segment_id = 0
    for token in tokens:
        segments.append(current_segment_id)
        if token == "[SEP]":
            current_segment_id = 1
    return segments + [0] * (max_seq_length - len(tokens))


def get_ids(tokens, tokenizer, max_seq_length):
    """Token ids from Tokenizer vocab"""
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_ids = token_ids + [0] * (max_seq_length-len(token_ids))
    return input_ids

In [192]:
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = FullTokenizer(vocab_file, do_lower_case)

In [193]:
def tokenize (s):
    stokens = tokenizer.tokenize(s)
    stokens =  ["[CLS]"] + stokens + ["[SEP]"]
    
    return stokens

input_ids = []
input_masks = []
input_segments = []

for line in questions:
    stokens = tokenize(line)
#     print(stokens)
    input_ids.append(get_ids(stokens, tokenizer, max_seq_length))
    input_masks.append(get_masks(stokens, max_seq_length))
    input_segments.append(get_segments(stokens, max_seq_length))
    
outputs = []
for line in answers:
    stokens = tokenize(line)
    outputs.append(get_ids(stokens, tokenizer, max_seq_length))

In [194]:
outputs = np.array(outputs)

In [195]:
input_ids = np.array(input_ids)
input_masks = np.array(input_masks)
input_segments = np.array(input_segments)

In [196]:
outputs.shape

(20000, 128)

In [220]:
BATCH_SIZE = 64
BUFFER_SIZE = 20000

# decoder inputs use the previous target as input
# remove START_TOKEN from targets
dataset = tf.data.Dataset.from_tensor_slices((
    {
        'input_word_ids': input_ids,
        'input_mask': input_masks,
        'segment_ids': input_segments,
        'dec_inputs': outputs
    },
    {
        'outputs': outputs
    },
))

dataset = dataset.cache()
dataset = dataset.shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

In [198]:
 outputs[:, :-1]

array([[  101,  1045,  2031, ...,     0,     0,     0],
       [  101,  2515,  2009, ...,     0,     0,     0],
       [  101,  3492,  3435, ...,     0,     0,     0],
       ...,
       [  101, 12043,  2007, ...,     0,     0,     0],
       [  101,  7561,  2227, ...,     0,     0,     0],
       [  101, 26478,  8609, ...,     0,     0,     0]])

In [199]:
outputs[:, 1:]

array([[ 1045,  2031,  1037, ...,     0,     0,     0],
       [ 2515,  2009,  2428, ...,     0,     0,     0],
       [ 3492,  3435,  1045, ...,     0,     0,     0],
       ...,
       [12043,  2007,  2028, ...,     0,     0,     0],
       [ 7561,  2227,  2007, ...,     0,     0,     0],
       [26478,  8609,  2015, ...,     0,     0,     0]])

In [152]:
outputs

array([[  101,  1045,  2031, ...,     0,     0,     0],
       [  101,  2515,  2009, ...,     0,     0,     0],
       [  101,  3492,  3435, ...,     0,     0,     0],
       ...,
       [  101, 12043,  2007, ...,     0,     0,     0],
       [  101,  7561,  2227, ...,     0,     0,     0],
       [  101, 26478,  8609, ...,     0,     0,     0]])

In [201]:
print(dataset)

<PrefetchDataset shapes: ({input_word_ids: (None, 128), input_mask: (None, 128), segment_ids: (None, 128), dec_inputs: (None, 128)}, {outputs: (None, 127)}), types: ({input_word_ids: tf.int32, input_mask: tf.int32, segment_ids: tf.int32, dec_inputs: tf.int32}, {outputs: tf.int32})>


In [13]:
# tokens = set()
# for line in questions:
#     t = tokenizer.tokenize(line)
#     for i in t:
#         tokens.add(i)

# tokens = list(tokens)

In [202]:
type(tokenizer.vocab)
print(tokenizer.vocab['[CLS]'] )

101


In [203]:
START_TOKEN, END_TOKEN = [tokenizer.vocab['[CLS]']],  [tokenizer.vocab['[SEP]']]

In [108]:
START_TOKEN

[101]

In [169]:
len(tokenizer.vocab)

30522

In [204]:
def scaled_dot_product_attention(query, key, value, mask):
    """Calculate the attention weights. """
    matmul_qk = tf.matmul(query, key, transpose_b=True)

    # scale matmul_qk
    depth = tf.cast(tf.shape(key)[-1], tf.float32)
    logits = matmul_qk / tf.math.sqrt(depth)

    # add the mask to zero out padding tokens
    if mask is not None:
        logits += (mask * -1e9) #close to -infinity

    # softmax is normalized on the last axis (seq_len_k)
    attention_weights = tf.nn.softmax(logits, axis=-1)

    output = tf.matmul(attention_weights, value)

    return output

In [205]:
class MultiHeadAttention(tf.keras.layers.Layer):

    def __init__(self, d_model, num_heads, name="multi_head_attention"):
        super(MultiHeadAttention, self).__init__(name=name)
        self.num_heads = num_heads
        self.d_model = d_model

        assert d_model % self.num_heads == 0

        self.depth = d_model // self.num_heads

        self.query_dense = tf.keras.layers.Dense(units=d_model)
        self.key_dense = tf.keras.layers.Dense(units=d_model)
        self.value_dense = tf.keras.layers.Dense(units=d_model)

        self.dense = tf.keras.layers.Dense(units=d_model)

    def split_heads(self, inputs, batch_size):
        inputs = tf.reshape(
            inputs, shape=(batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(inputs, perm=[0, 2, 1, 3])

    def call(self, inputs):
        query, key, value, mask = inputs['query'], inputs['key'], inputs[
            'value'], inputs['mask']
        batch_size = tf.shape(query)[0]

        # linear layers
        query = self.query_dense(query)
        key = self.key_dense(key)
        value = self.value_dense(value)

        # split heads
        query = self.split_heads(query, batch_size)
        key = self.split_heads(key, batch_size)
        value = self.split_heads(value, batch_size)

        # scaled dot-product attention
        scaled_attention = scaled_dot_product_attention(query, key, value, mask)

        scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])

        # concatenation of heads(attention)
        concat_attention = tf.reshape(scaled_attention,
                                      (batch_size, -1, self.d_model))

        # final linear layer
        outputs = self.dense(concat_attention)

        return outputs

In [206]:
def create_padding_mask(x):
    mask = tf.cast(tf.math.equal(x, 0), tf.float32)
    # (batch_size, 1, 1, sequence length)
    return mask[:, tf.newaxis, tf.newaxis, :]

In [207]:
def create_look_ahead_mask(x):
    seq_len = tf.shape(x)[1]
    look_ahead_mask = 1 - tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)
    padding_mask = create_padding_mask(x)
    return tf.maximum(look_ahead_mask, padding_mask)

In [208]:
class PositionalEncoding(tf.keras.layers.Layer):
    def __init__(self, position, d_model):
        super(PositionalEncoding, self).__init__()
        self.pos_encoding = self.positional_encoding(position, d_model)

    def get_angles(self, position, i, d_model):
        angles = 1 / tf.pow(10000, (2 * (i // 2)) / tf.cast(d_model, tf.float32))
        return position * angles

    def positional_encoding(self, position, d_model):
        angle_rads = self.get_angles(
            position=tf.range(position, dtype=tf.float32)[:, tf.newaxis],
            i=tf.range(d_model, dtype=tf.float32)[tf.newaxis, :],
            d_model=d_model)
        # apply sin to even index in the array
        sines = tf.math.sin(angle_rads[:, 0::2])
        # apply cos to odd index in the array
        cosines = tf.math.cos(angle_rads[:, 1::2])

        pos_encoding = tf.concat([sines, cosines], axis=-1)
        pos_encoding = pos_encoding[tf.newaxis, ...]
        return tf.cast(pos_encoding, tf.float32)

    def call(self, inputs):
        return inputs + self.pos_encoding[:, :tf.shape(inputs)[1], :]

In [209]:
#each encoder layer consists of: multi-head attention(with padding mask), 2 dense layers with dropout
def encoder_layer(units, d_model, num_heads, dropout, name="encoder_layer"):
    inputs = tf.keras.Input(shape=(None, d_model), name="inputs")
    padding_mask = tf.keras.Input(shape=(1, 1, None), name="padding_mask")

    attention = MultiHeadAttention(
      d_model, num_heads, name="attention")({
          'query': inputs,
          'key': inputs,
          'value': inputs,
          'mask': padding_mask
      })
    attention = tf.keras.layers.Dropout(rate=dropout)(attention)
    attention = tf.keras.layers.LayerNormalization(
        epsilon=1e-6)(inputs + attention)

    outputs = tf.keras.layers.Dense(units=units, activation='relu')(attention)
    outputs = tf.keras.layers.Dense(units=d_model)(outputs)
    outputs = tf.keras.layers.Dropout(rate=dropout)(outputs)
    outputs = tf.keras.layers.LayerNormalization(
        epsilon=1e-6)(attention + outputs)

    return tf.keras.Model(
          inputs=[inputs, padding_mask], outputs=outputs, name=name)

In [210]:
#each encoder has an: input embedding, positional encoding, num_layers encoder layers
#embedding is summed with positional encoding, and then is passed to encoder layers 
#output of encoder goes to decoder

def encoder(vocab_size,
            num_layers,
            units,
            d_model,
            num_heads,
            dropout,
            name="encoder"):
    
#     inputs = tf.keras.Input(shape=(None,), name="inputs")
    input_word_ids = tf.keras.layers.Input(shape=(None,), dtype=tf.int32,
                                       name="input_word_ids")
    input_mask = tf.keras.layers.Input(shape=(None,), dtype=tf.int32,
                                   name="input_mask")
    segment_ids = tf.keras.layers.Input(shape=(None,), dtype=tf.int32,
                                    name="segment_ids")
    inputs = [input_word_ids, input_mask, segment_ids]
    _, embeddings = bert_layer(inputs)


    padding_mask = tf.keras.Input(shape=(1, 1, None), name="padding_mask")

#     embeddings = tf.keras.layers.Embedding(vocab_size, d_model)(inputs)
    embeddings *= tf.math.sqrt(tf.cast(d_model, tf.float32))
    embeddings = PositionalEncoding(vocab_size, d_model)(embeddings)

    outputs = tf.keras.layers.Dropout(rate=dropout)(embeddings)

    for i in range(num_layers):
        outputs = encoder_layer(
            units=units,
            d_model=d_model,
            num_heads=num_heads,
            dropout=dropout,
            name="encoder_layer_{}".format(i),
        )([outputs, padding_mask])

    return tf.keras.Model(
        inputs=[inputs, padding_mask], outputs=outputs, name=name)

In [211]:
def decoder_layer(units, d_model, num_heads, dropout, name="decoder_layer"):
    inputs = tf.keras.Input(shape=(None, d_model), name="inputs")
    
    enc_outputs = tf.keras.Input(shape=(None, d_model), name="encoder_outputs")
    look_ahead_mask = tf.keras.Input(
        shape=(1, None, None), name="look_ahead_mask")
    padding_mask = tf.keras.Input(shape=(1, 1, None), name='padding_mask')

    attention1 = MultiHeadAttention(
      d_model, num_heads, name="attention_1")(inputs={
          'query': inputs,
          'key': inputs,
          'value': inputs,
          'mask': look_ahead_mask
      })
    attention1 = tf.keras.layers.LayerNormalization(
          epsilon=1e-6)(attention1 + inputs)

    attention2 = MultiHeadAttention(
      d_model, num_heads, name="attention_2")(inputs={
          'query': attention1,
          'key': enc_outputs,
          'value': enc_outputs,
          'mask': padding_mask
      })
    attention2 = tf.keras.layers.Dropout(rate=dropout)(attention2)
    attention2 = tf.keras.layers.LayerNormalization(
        epsilon=1e-6)(attention2 + attention1)

    outputs = tf.keras.layers.Dense(units=units, activation='relu')(attention2)
    outputs = tf.keras.layers.Dense(units=d_model)(outputs)
    outputs = tf.keras.layers.Dropout(rate=dropout)(outputs)
    outputs = tf.keras.layers.LayerNormalization(
        epsilon=1e-6)(outputs + attention2)

    return tf.keras.Model(
      inputs=[inputs, enc_outputs, look_ahead_mask, padding_mask],
      outputs=outputs,
      name=name)

In [212]:
def decoder(vocab_size,
            num_layers,
            units,
            d_model,
            num_heads,
            dropout,
            name='decoder'):
    inputs = tf.keras.Input(shape=(None,), name='inputs')
    enc_outputs = tf.keras.Input(shape=(None, d_model), name='encoder_outputs')
    look_ahead_mask = tf.keras.Input(
        shape=(1, None, None), name='look_ahead_mask')
    padding_mask = tf.keras.Input(shape=(1, 1, None), name='padding_mask')
  
    embeddings = tf.keras.layers.Embedding(vocab_size, d_model)(inputs)
    embeddings *= tf.math.sqrt(tf.cast(d_model, tf.float32))
    embeddings = PositionalEncoding(vocab_size, d_model)(embeddings)

    outputs = tf.keras.layers.Dropout(rate=dropout)(embeddings)

    for i in range(num_layers):
        outputs = decoder_layer(
            units=units,
            d_model=d_model,
            num_heads=num_heads,
            dropout=dropout,
            name='decoder_layer_{}'.format(i),
        )(inputs=[outputs, enc_outputs, look_ahead_mask, padding_mask])

    return tf.keras.Model(
      inputs=[inputs, enc_outputs, look_ahead_mask, padding_mask],
      outputs=outputs,
      name=name)


In [213]:
#lambda layer is an arbitrary layer so tensorflow functions can be used to create models 
def transformer(vocab_size,
                num_layers,
                units,
                d_model,
                num_heads,
                dropout,
                name="transformer"):
    #encoder inputs 
#     inputs = tf.keras.Input(shape=(None,), name="inputs")

    input_word_ids = tf.keras.layers.Input(shape=(None,), dtype=tf.int32,
                                       name="input_word_ids")
    input_mask = tf.keras.layers.Input(shape=(None,), dtype=tf.int32,
                                   name="input_mask")
    segment_ids = tf.keras.layers.Input(shape=(None,), dtype=tf.int32,
                                    name="segment_ids")
    
    inputs = [input_word_ids, input_mask, segment_ids]
    
    
    
    #decoder inputs 
    dec_inputs = tf.keras.Input(shape=(None,), name="dec_inputs")
    
    #create padding mask for encoder inputs 
    # (batch_size, 1, 1, sequence length)
    enc_padding_mask = tf.keras.layers.Lambda(
      create_padding_mask, output_shape=(1, 1, None),
      name='enc_padding_mask')(input_word_ids)

    #decoder padding masks:
    # mask the future tokens for decoder inputs at the 1st attention block
    look_ahead_mask = tf.keras.layers.Lambda(
      create_look_ahead_mask,
      output_shape=(1, None, None),
      name='look_ahead_mask')(dec_inputs)
    # mask the encoder outputs for the 2nd attention block
    dec_padding_mask = tf.keras.layers.Lambda(
      create_padding_mask, output_shape=(1, 1, None),
      name='dec_padding_mask')(input_word_ids)

  #encoder 
    enc_outputs = encoder(
      vocab_size=vocab_size,
      num_layers=num_layers,
      units=units,
      d_model=d_model,
      num_heads=num_heads,
      dropout=dropout,
    )(inputs=[inputs, enc_padding_mask])

  #decoder 
    dec_outputs = decoder(
      vocab_size=vocab_size,
      num_layers=num_layers,
      units=units,
      d_model=d_model,
      num_heads=num_heads,
      dropout=dropout,
    )(inputs=[dec_inputs, enc_outputs, look_ahead_mask, dec_padding_mask])

    outputs = tf.keras.layers.Dense(units=vocab_size, name="outputs")(dec_outputs)

    return tf.keras.Model(inputs=[inputs, dec_inputs], outputs=outputs, name=name)

In [214]:
tf.keras.backend.clear_session()
# VOCAB_SIZE = len(tokens) + 2
VOCAB_SIZE = len(tokenizer.vocab)

# Hyper-parameters
NUM_LAYERS = 4
D_MODEL = 768
NUM_HEADS = 8
UNITS = 512
DROPOUT = 0.2 

model = transformer(
    vocab_size=VOCAB_SIZE,
    num_layers=NUM_LAYERS,
    units=UNITS,
    d_model=D_MODEL,
    num_heads=NUM_HEADS,
    dropout=DROPOUT)

In [221]:
#since target sequences are padded, we apply a padding mask when calculating loss
def loss_function(y_true, y_pred):
#     y_true = tf.reshape(y_true, shape=(-1, max_seq_length - 1))

    loss = tf.keras.losses.SparseCategoricalCrossentropy(
      from_logits=True, reduction='none')(y_true, y_pred)

    mask = tf.cast(tf.not_equal(y_true, 0), tf.float32)
    loss = tf.multiply(loss, mask)

    return tf.reduce_mean(loss)

In [216]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super(CustomSchedule, self).__init__()

        self.d_model = d_model
        self.d_model = tf.cast(self.d_model, tf.float32)

        self.warmup_steps = warmup_steps

    def __call__(self, step):
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps**-1.5)

        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

In [222]:
learning_rate = CustomSchedule(D_MODEL)

optimizer = tf.keras.optimizers.Adam(
    learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

def accuracy(y_true, y_pred):
  # ensure labels have shape (batch_size, MAX_LENGTH - 1)
#     y_true = tf.reshape(y_true, shape=(-1, max_seq_length - 1))
    return tf.keras.metrics.sparse_categorical_accuracy(y_true, y_pred)

model.compile(optimizer=optimizer, loss=loss_function, metrics=[accuracy])

In [224]:
model.summary()

Model: "transformer"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, None)]       0                                            
__________________________________________________________________________________________________
dec_inputs (InputLayer)         [(None, None)]       0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, None)]       0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, None)]       0                                            
________________________________________________________________________________________

In [225]:
EPOCHS = 5
model.fit(dataset, epochs=EPOCHS)

Epoch 1/5
  2/313 [..............................] - ETA: 6:05:38 - loss: 0.8860 - accuracy: 0.0000e+00

KeyboardInterrupt: 

In [226]:
def evaluate(sentence):
    output = tf.expand_dims(START_TOKEN, 0)


    stokens = tokenizer.tokenize(sentence)
    stokens =  ["[CLS]"] + stokens + ["[SEP]"]

    ids = get_ids(stokens, tokenizer, max_seq_length)
    masks = get_masks(stokens, max_seq_length)
    segments = get_segments(stokens, max_seq_length)
                            
    ids = np.array(ids)
    masks = np.array(masks)
    segments = np.array(segments) 
    
    ids = tf.expand_dims(ids, axis=0)
    masks= tf.expand_dims(masks, axis=0)
    segments = tf.expand_dims(segments, axis = 0)
    
    input_sentence = [ids, masks, segments]
                            
#     for i in range(max_seq_length):
    for i in range(10):
        predictions = model(inputs=[input_sentence, output], training=False)

        # select the last word from the seq_len dimension
        predictions = predictions[:, -1:, :]
        predicted_id = tf.cast(tf.argmax(predictions, axis=-1), tf.int32)

        # return the result if the predicted_id is equal to the end token
        if tf.equal(predicted_id, END_TOKEN[0]):
            break

    # concatenated the predicted_id to the output which is given to the decoder
    # as its input.
        output = tf.concat([output, predicted_id], axis=-1)

        
#     print( tf.squeeze(output, axis=0))
    return tf.squeeze(output, axis=0)


def predict(sentence):
        prediction = evaluate(sentence)
        
        predicted_ids = []
        for i in prediction:
            predicted_ids.append(int(i))

        predicted_sentence = tokenizer.convert_ids_to_tokens(predicted_ids[1:])

        print('Input: {}'.format(sentence))
        print('Output: {}'.format(predicted_sentence))

        return predicted_sentence

testing begins here

In [227]:
sentence = "what is good"

output = tf.expand_dims(START_TOKEN, 0)


stokens = tokenizer.tokenize(sentence)
stokens =  ["[CLS]"] + stokens + ["[SEP]"]

ids = get_ids(stokens, tokenizer, max_seq_length)
masks = get_masks(stokens, max_seq_length)
segments = get_segments(stokens, max_seq_length)

In [228]:
ids = np.array(ids)
masks = np.array(masks)
segments = np.array(segments)
print (ids.shape)
print (masks.shape)
print (segments.shape)

(128,)
(128,)
(128,)


In [229]:
tokenize(sentence)

['[CLS]', 'what', 'is', 'good', '[SEP]']

In [230]:
print (ids)
print (masks)
print (segments)

[ 101 2054 2003 2204  102    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0]
[1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 

In [231]:
ids = tf.expand_dims(ids, axis=0)
masks= tf.expand_dims(masks, axis=0)
segments = tf.expand_dims(segments, axis = 0)
print (ids.shape)
print (masks.shape)
print (segments.shape)

(1, 128)
(1, 128)
(1, 128)


In [232]:
ids

<tf.Tensor: shape=(1, 128), dtype=int32, numpy=
array([[ 101, 2054, 2003, 2204,  102,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0]])>

In [233]:
inp = [ids, masks, segments]

In [234]:
test = model(inputs=[inp, output]) 

In [235]:
# for i in range(max_seq_length):
for i in range(10):
    predictions = model(inputs=[inp, output], training=False)
    # select the last word from the seq_len dimension
    predictions = predictions[:, -1:, :]
    print(predictions)
    predicted_id = tf.cast(tf.argmax(predictions, axis=-1), tf.int32)

#     # return the result if the predicted_id is equal to the end token
    if tf.equal(predicted_id, END_TOKEN[0]):
        break

# concatenated the predicted_id to the output which is given to the decoder
# as its input.
    output = tf.concat([output, predicted_id], axis=-1)

tf.Tensor(
[[[ 0.06889357 -0.1428927   0.07445737 ... -0.02647485  0.12695286
   -0.41949028]]], shape=(1, 1, 30522), dtype=float32)
tf.Tensor(
[[[ 0.08211425 -0.15104069  0.08288462 ... -0.01188275  0.19640334
   -0.43308565]]], shape=(1, 1, 30522), dtype=float32)
tf.Tensor(
[[[ 0.06236197 -0.18824516  0.08535066 ... -0.02224628  0.18221529
   -0.43844217]]], shape=(1, 1, 30522), dtype=float32)
tf.Tensor(
[[[ 0.04670067 -0.16321854  0.03585767 ... -0.01863001  0.17366885
   -0.4274327 ]]], shape=(1, 1, 30522), dtype=float32)
tf.Tensor(
[[[ 0.05483151 -0.16425365  0.06003812 ...  0.003889    0.17027712
   -0.40324575]]], shape=(1, 1, 30522), dtype=float32)
tf.Tensor(
[[[ 0.04302364 -0.15839083  0.02453383 ... -0.01949962  0.1707272
   -0.41989267]]], shape=(1, 1, 30522), dtype=float32)
tf.Tensor(
[[[ 0.04925175 -0.16303502  0.01994527 ... -0.02417405  0.17770198
   -0.42641944]]], shape=(1, 1, 30522), dtype=float32)
tf.Tensor(
[[[ 0.05344554 -0.16723838  0.01447541 ... -0.02749291  0.1

In [236]:
output

<tf.Tensor: shape=(1, 11), dtype=int32, numpy=
array([[  101,  3628, 30154,  6208,  5804,  6208,  6208,  6208,  6208,
         6208,  6208]])>

In [237]:
test = tf.squeeze(output, axis=0)

In [238]:
new_test = []
for i in test:
    new_test.append(int(i))

print (new_test)
    
predicted_sentence = tokenizer.convert_ids_to_tokens(new_test[1:])

[101, 3628, 30154, 6208, 5804, 6208, 6208, 6208, 6208, 6208, 6208]


In [239]:
predicted_sentence

['trees',
 '##♯',
 'revolutionary',
 '##mann',
 'revolutionary',
 'revolutionary',
 'revolutionary',
 'revolutionary',
 'revolutionary',
 'revolutionary']

testing ends here

In [240]:
predict("this is a nice sentence")

Input: this is a nice sentence
Output: ['trees', 'trees', 'trees', 'trees', 'sends', 'sends', 'sends', 'sends', 'sends', 'sends']


['trees',
 'trees',
 'trees',
 'trees',
 'sends',
 'sends',
 'sends',
 'sends',
 'sends',
 'sends']

In [308]:
output = predict("what is good")

Input: what is good
Output: 


In [241]:
text=[]
with open('testing.csv', mode='r') as f: 
  for line in f:
    text.append(line)

for line in text:
    sentence = predict(line)
    print ('')

Input: ï»¿i am an amazon warrior it means i shop on amazon a lot

Output: ['equation', 'worm', 'junction', 'equation', 'worm', 'worm', 'worm', 'worm', 'worm', 'worm']

Input: how are you doing today

Output: ['trees', 'trees', 'trees', 'trees', 'trees', 'trees', 'trees', 'trees', 'trees', 'trees']

Input: i do not like you because of your face

Output: ['cambridge', 'cambridge', 'cambridge', 'cambridge', 'cambridge', 'cambridge', 'cambridge', 'cambridge', 'cambridge', 'cambridge']

Input: what do you think about trump

Output: ['trees', 'trees', 'trees', 'trees', 'trees', 'trees', 'trees', 'trees', 'trees', 'trees']

Input: i suppose you are smart can you make me a sandwich

Output: ['makeshift', '##sti', '##sti', '##sti', '##sti', '##sti', '##sti', '##sti', '##sti', '##sti']

Input: can you do math on your own

Output: ['seoul', 'seoul', 'seoul', 'seoul', 'seoul', 'seoul', 'seoul', 'seoul', 'seoul', 'seoul']

Input: so i got a girlfriend today

Output: ['sends', 'sends', 'sends', 'sen