This model is similar to the 3rd in that it both uses LSTM, this one, however, has one extra layer. It takes almost double the time to train

In [16]:
%cd ..
%cd root
import numpy as np
import pandas as pd
df = pd.read_csv("AL_clean_2.csv")
df.head()
df.fillna('', inplace=True)
print(len(df))
x = df["x1"].tolist()
y = df["y1"].tolist()

/
/root
56297


In [17]:
from keras.models import Model
from keras.layers import Input, LSTM, Dense
import numpy as np

batch_size = 64  # Batch size for training.
epochs = 100  # Number of epochs to train for.
latent_dim = 256  # Latent dimensionality of the encoding space.
num_samples = 10000  # Number of samples to train on.

# Vectorize the data.
input_texts = []
target_texts = []
input_words = set()
target_words = set()

for line in x:
    input_texts.append(['\t']+ str(line).split(" ")+ ['\n'])
    for word in str(line).split(" "):
        if word not in input_words:
            input_words.add(word)

for line in y:
  target_texts.append(['\t'] + str(line).split(" ") + ['\n'])
  for word in str(line).split(" "):
    if word not in target_words:
        target_words.add(word)
target_words.add("\t")
target_words.add("\n")
input_words.add("\t")
input_words.add("\n")

input_words = sorted(list(input_words))
target_words = sorted(list(target_words))
num_encoder_tokens = len(input_words)
num_decoder_tokens = len(target_words)
max_encoder_seq_length = max([len(txt) for txt in input_texts])
max_decoder_seq_length = max([len(txt) for txt in target_texts])

print('Number of samples:', len(input_texts))
print('Number of unique input tokens:', num_encoder_tokens)
print('Number of unique output tokens:', num_decoder_tokens)
print('Max sequence length for inputs:', max_encoder_seq_length)
print('Max sequence length for outputs:', max_decoder_seq_length)

Number of samples: 56297
Number of unique input tokens: 7625
Number of unique output tokens: 7292
Max sequence length for inputs: 27
Max sequence length for outputs: 27


In [0]:
input_token_index = dict(
    [(word, i) for i, word in enumerate(input_words)])
target_token_index = dict(
    [(word, i) for i, word in enumerate(target_words)])

In [19]:
import tensorflow as tf
input_data = np.zeros(
    (len(input_texts), max_encoder_seq_length),
    dtype='float32')
output_data = np.zeros(
    (len(input_texts), max_decoder_seq_length),
    dtype='float32')

for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
    for t, word in enumerate(input_text):
        input_data[i, t] = input_token_index[word]
    for t, word in enumerate(target_text):
        output_data[i, t] = target_token_index[word]

# maybe expand dims?
# import numpy as np
# input_data = np.expand_dims(input_data, axis=-1)
# output_data = np.expand_dims(output_data, axis=-1)
dataset = tf.data.Dataset.from_tensor_slices((input_data, output_data)).shuffle(len(input_data)).batch(64, drop_remainder=True)
print(input_data.shape)

(56297, 27)


In [20]:
example_input_batch, example_target_batch = next(iter(dataset))
example_input_batch.shape, example_target_batch.shape

(TensorShape([64, 27]), TensorShape([64, 27]))

In [0]:

from keras import backend as K
class Encoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
    super(Encoder, self).__init__()
    self.batch_sz = batch_sz
    self.enc_units = enc_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.lstm1 = tf.keras.layers.LSTM(self.enc_units,
                                   return_sequences=True)
    self.lstm2 = tf.keras.layers.LSTM(self.enc_units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')

  def call(self, x, hidden):
    x = self.embedding(x)
    x = self.lstm1(x)
    output, stateh, statec = self.lstm2(x, initial_state = hidden)
    state = [stateh, statec]
    return output, state

  def initialize_hidden_state(self):
    return [tf.zeros((self.batch_sz, self.enc_units)),tf.zeros((self.batch_sz, self.enc_units))]


class BahdanauAttention(tf.keras.layers.Layer):
  def __init__(self, units):
    super(BahdanauAttention, self).__init__()
    self.W1 = tf.keras.layers.Dense(units)
    self.W2 = tf.keras.layers.Dense(units)
    self.V = tf.keras.layers.Dense(1)
  def call(self, query, values):
    query = K.sum(query, axis=0)  
    query_with_time_axis = tf.expand_dims(query, 1)
    score = self.V(tf.nn.tanh(
        self.W1(query_with_time_axis) + self.W2(values)))
    attention_weights = tf.nn.softmax(score, axis=1)
    context_vector = attention_weights * values
    context_vector = tf.reduce_sum(context_vector, axis=1)

    return context_vector, attention_weights

class Decoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
    super(Decoder, self).__init__()
    self.batch_sz = batch_sz
    self.dec_units = dec_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.lstm1 = tf.keras.layers.LSTM(self.dec_units,
                                   return_sequences=True)
    self.lstm2 = tf.keras.layers.LSTM(self.dec_units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')
    self.fc = tf.keras.layers.Dense(vocab_size)

    # used for attention
    self.attention = BahdanauAttention(self.dec_units)

  def call(self, x, hidden, enc_output):
    # enc_output shape == (batch_size, max_length, hidden_size)
    context_vector, attention_weights = self.attention(hidden, enc_output)
    # x shape after passing through embedding == (batch_size, 1, embedding_dim)
    x = self.embedding(x)
    # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
    x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
    # passing the concatenated vector to the GRU
    x = self.lstm1(x)
    output, stateh, statec = self.lstm2(x)
    state = [stateh, statec]
    # output shape == (batch_size * 1, hidden_size)
    output = tf.reshape(output, (-1, output.shape[2]))
    # output shape == (batch_size, vocab)
    x = self.fc(output)

    return x, state, attention_weights

In [0]:
encoder = Encoder(num_encoder_tokens, 100, 1024, 64)
decoder = Decoder(num_decoder_tokens, 100, 1024, 64)
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
  mask = tf.math.logical_not(tf.math.equal(real, 0))
  loss_ = loss_object(real, pred)

  mask = tf.cast(mask, dtype=loss_.dtype)
  loss_ *= mask

  return tf.reduce_mean(loss_)

In [0]:
def train_step(inp, targ, enc_hidden):
  loss = 0
  with tf.GradientTape() as tape:
    enc_output, enc_hidden = encoder(inp, enc_hidden)
    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([target_token_index['\t']] * BATCH_SIZE, 1)
    # Teacher forcing - feeding the target as the next input
    for t in range(1, targ.shape[1]):
      # passing enc_output to the decoder
      predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
      loss += loss_function(targ[:, t], predictions)
      # using teacher forcing
      dec_input = tf.expand_dims(targ[:, t], 1)
  batch_loss = (loss / int(targ.shape[1]))
  variables = encoder.trainable_variables + decoder.trainable_variables
  gradients = tape.gradient(loss, variables)
  optimizer.apply_gradients(zip(gradients, variables))
  return batch_loss

In [0]:
EPOCHS = 10
BATCH_SIZE = 64
embedding_dim = 256
units = 1024


import time
steps_per_epoch = len(input_data)//64
for epoch in range(EPOCHS):
  start = time.time()
  enc_hidden = encoder.initialize_hidden_state()
  total_loss = 0
  for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
    batch_loss = train_step(inp, targ, enc_hidden)
    total_loss += batch_loss
    if batch % 100 == 0:
      print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                   batch,
                                                   batch_loss.numpy()))
  print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                      total_loss / steps_per_epoch))
  print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

Epoch 1 Batch 0 Loss 2.7126
Epoch 1 Batch 100 Loss 1.9755
Epoch 1 Batch 200 Loss 2.1146
Epoch 1 Batch 300 Loss 1.8773
Epoch 1 Batch 400 Loss 1.7733
Epoch 1 Batch 500 Loss 1.9891
Epoch 1 Batch 600 Loss 2.0662
Epoch 1 Batch 700 Loss 1.6666
Epoch 1 Batch 800 Loss 1.6719
Epoch 1 Loss 1.8849
Time taken for 1 epoch 436.0906000137329 sec

Epoch 2 Batch 0 Loss 1.5848
Epoch 2 Batch 100 Loss 1.8971
Epoch 2 Batch 200 Loss 1.7636
Epoch 2 Batch 300 Loss 1.6071
Epoch 2 Batch 400 Loss 1.5203
Epoch 2 Batch 500 Loss 1.6501
Epoch 2 Batch 600 Loss 1.7125
Epoch 2 Batch 700 Loss 1.8210
Epoch 2 Batch 800 Loss 1.8154
Epoch 2 Loss 1.6971
Time taken for 1 epoch 431.2004041671753 sec

Epoch 3 Batch 0 Loss 1.7852
Epoch 3 Batch 100 Loss 1.6061
Epoch 3 Batch 200 Loss 1.6281
Epoch 3 Batch 300 Loss 1.7245
Epoch 3 Batch 400 Loss 1.6364
Epoch 3 Batch 500 Loss 1.6458
Epoch 3 Batch 600 Loss 1.6286
Epoch 3 Batch 700 Loss 1.5751
Epoch 3 Batch 800 Loss 1.6481
Epoch 3 Loss 1.6223
Time taken for 1 epoch 427.86418175697327 se

In [0]:
reverse_input_char_index = dict(
    (i, char) for char, i in input_token_index.items())
reverse_target_char_index = dict(
    (i, char) for char, i in target_token_index.items())

In [0]:
def evaluate(sentence):
  attention_plot = np.zeros((max_decoder_seq_length, max_encoder_seq_length))

  inputs = [input_token_index[i] for i in sentence.split(' ')]
  inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],
                                                         maxlen=max_encoder_seq_length,
                                                         padding='post')
  inputs = tf.convert_to_tensor(inputs)
  result = ''
  hidden = [tf.zeros((1, units))]
  enc_out, enc_hidden = encoder(inputs, hidden)
  dec_hidden = enc_hidden
  dec_input = tf.expand_dims([target_token_index['\t']], 0)
  for t in range(max_decoder_seq_length):
    predictions, dec_hidden, attention_weights = decoder(dec_input,
                                                         dec_hidden,
                                                         enc_out)

    # storing the attention weights to plot later on
    attention_weights = tf.reshape(attention_weights, (-1, ))
    attention_plot[t] = attention_weights.numpy()

    predicted_id = tf.argmax(predictions[0]).numpy()

    result += reverse_target_char_index[predicted_id] + ' '

    if reverse_target_char_index[predicted_id] == '\n':
      return result, sentence, attention_plot

    # the predicted ID is fed back into the model
    dec_input = tf.expand_dims([predicted_id], 0)

  return result, sentence, attention_plot

In [0]:
def translate(sentence):
  result, sentence, attention_plot = evaluate(sentence)

  print('Input: %s' % (sentence))
  print('Predicted translation: {}'.format(result))


In [0]:
translate("i want to die")
translate("can you kill me")
translate("i am sad")
translate("hello what is up")
translate("i made something")
translate("this is why you should not do this")
translate("can you stop")
translate("why do dogs have ears")
translate("why is trump nice")