In [1]:
import tensorflow as tf
import time
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split
import unicodedata
import re
import os
import io
import time
from bpemb import BPEmb

In [2]:
bpemb_de = BPEmb(lang='de', vs=10000, dim=100)
bpemb_en = BPEmb(lang='en', vs=10000, dim=100)

In [3]:
path_to_file = "./datasets/deu.txt"

lines = io.open(path_to_file, encoding='UTF-8').read().strip().split('\n')

temp_list = []
corpus = []

for i in range(len(lines)):
    temp_list =  lines[i].split('\t')[:-1]
    corpus.append(temp_list)
    

In [4]:
en, de = np.array(corpus).T

en_encoded = []
de_encoded = []

cnt_en = 0
cnt_de = 0

for i in range(len(en)):
    en_encoded_temp = bpemb_en.encode_ids(en[i])
    de_encoded_temp = bpemb_de.encode_ids(de[i])

    if (len(en_encoded_temp)<=40) and (len(de_encoded_temp)<=40):
        en_encoded.append([10000] + en_encoded_temp + [10001])
        de_encoded.append([10000] + de_encoded_temp + [10001])
    

In [5]:
en_padded = tf.keras.preprocessing.sequence.pad_sequences(en_encoded, padding='post')
de_padded = tf.keras.preprocessing.sequence.pad_sequences(de_encoded, padding='post')

In [6]:
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(en_padded, de_padded, test_size=0.2)


In [7]:
BUFFER_SIZE = 20000
BATCH_SIZE = 64

In [1]:
dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)

dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

NameError: name 'tf' is not defined

In [8]:
def get_angles(pos, i, d_model):
  angle_rates = 1 / np.power(10000, (2 * (i//2)) / np.float32(d_model))
  return pos * angle_rates

def positional_encoding(position, d_model):
  angle_rads = get_angles(np.arange(position)[:, np.newaxis],
                          np.arange(d_model)[np.newaxis, :],
                          d_model)
  
  # apply sin to even indices in the array; 2i
  angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
  
  # apply cos to odd indices in the array; 2i+1
  angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])
    
  pos_encoding = angle_rads[np.newaxis, ...]
    
  return tf.cast(pos_encoding, dtype=tf.float32)

In [9]:
def create_padding_mask(seq):
  seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
  
  # add extra dimensions to add the padding
  # to the attention logits.
  return seq[:, tf.newaxis, tf.newaxis, :]  # (batch_size, 1, 1, seq_len)

def create_look_ahead_mask(size):
  mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
  return mask  # (seq_len, seq_len)

In [10]:
def scaled_dot_product_attention(q, k, v, mask):
  """Calculate the attention weights.
  q, k, v must have matching leading dimensions.
  k, v must have matching penultimate dimension, i.e.: seq_len_k = seq_len_v.
  The mask has different shapes depending on its type(padding or look ahead) 
  but it must be broadcastable for addition.
  
  Args:
    q: query shape == (..., seq_len_q, depth)
    k: key shape == (..., seq_len_k, depth)
    v: value shape == (..., seq_len_v, depth_v)
    mask: Float tensor with shape broadcastable 
          to (..., seq_len_q, seq_len_k). Defaults to None.
    
  Returns:
    output, attention_weights
  """

  matmul_qk = tf.matmul(q, k, transpose_b=True)  # (..., seq_len_q, seq_len_k)
  
  # scale matmul_qk
  dk = tf.cast(tf.shape(k)[-1], tf.float32)
  scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)

  # add the mask to the scaled tensor.
  if mask is not None:
    scaled_attention_logits += (mask * -1e9)  

  # softmax is normalized on the last axis (seq_len_k) so that the scores
  # add up to 1.
  attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)  # (..., seq_len_q, seq_len_k)

  output = tf.matmul(attention_weights, v)  # (..., seq_len_q, depth_v)

  return output, attention_weights

class MultiHeadAttention(tf.keras.layers.Layer):
  def __init__(self, d_model, num_heads):
    super(MultiHeadAttention, self).__init__()
    self.num_heads = num_heads
    self.d_model = d_model
    
    assert d_model % self.num_heads == 0
    
    self.depth = d_model // self.num_heads
    
    self.wq = tf.keras.layers.Dense(d_model)
    self.wk = tf.keras.layers.Dense(d_model)
    self.wv = tf.keras.layers.Dense(d_model)
    
    self.dense = tf.keras.layers.Dense(d_model)
        
  def split_heads(self, x, batch_size):
    """Split the last dimension into (num_heads, depth).
    Transpose the result such that the shape is (batch_size, num_heads, seq_len, depth)
    """
    x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
    return tf.transpose(x, perm=[0, 2, 1, 3])
    
  def call(self, v, k, q, mask):
    batch_size = tf.shape(q)[0]
    
    q = self.wq(q)  # (batch_size, seq_len, d_model)
    k = self.wk(k)  # (batch_size, seq_len, d_model)
    v = self.wv(v)  # (batch_size, seq_len, d_model)
    
    q = self.split_heads(q, batch_size)  # (batch_size, num_heads, seq_len_q, depth)
    k = self.split_heads(k, batch_size)  # (batch_size, num_heads, seq_len_k, depth)
    v = self.split_heads(v, batch_size)  # (batch_size, num_heads, seq_len_v, depth)
    
    # scaled_attention.shape == (batch_size, num_heads, seq_len_q, depth)
    # attention_weights.shape == (batch_size, num_heads, seq_len_q, seq_len_k)
    scaled_attention, attention_weights = scaled_dot_product_attention(
        q, k, v, mask)
    
    scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])  # (batch_size, seq_len_q, num_heads, depth)

    concat_attention = tf.reshape(scaled_attention, 
                                  (batch_size, -1, self.d_model))  # (batch_size, seq_len_q, d_model)

    output = self.dense(concat_attention)  # (batch_size, seq_len_q, d_model)
        
    return output, attention_weights

In [11]:
def point_wise_feed_forward_network(d_model, dff):
  return tf.keras.Sequential([
      tf.keras.layers.Dense(dff, activation='relu'),  # (batch_size, seq_len, dff)
      tf.keras.layers.Dense(d_model)  # (batch_size, seq_len, d_model)
  ])

In [12]:
class EncoderLayer(tf.keras.layers.Layer):
  def __init__(self, d_model, num_heads, dff, rate=0.1):
    super(EncoderLayer, self).__init__()

    self.mha = MultiHeadAttention(d_model, num_heads)
    self.ffn = point_wise_feed_forward_network(d_model, dff)

    self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    
    self.dropout1 = tf.keras.layers.Dropout(rate)
    self.dropout2 = tf.keras.layers.Dropout(rate)
    
  def call(self, x, training, mask):

    attn_output, _ = self.mha(x, x, x, mask)  # (batch_size, input_seq_len, d_model)
    attn_output = self.dropout1(attn_output, training=training)
    out1 = self.layernorm1(x + attn_output)  # (batch_size, input_seq_len, d_model)
    
    ffn_output = self.ffn(out1)  # (batch_size, input_seq_len, d_model)
    ffn_output = self.dropout2(ffn_output, training=training)
    out2 = self.layernorm2(out1 + ffn_output)  # (batch_size, input_seq_len, d_model)
    
    return out2

In [13]:
class DecoderLayer(tf.keras.layers.Layer):
  def __init__(self, d_model, num_heads, dff, rate=0.1):
    super(DecoderLayer, self).__init__()

    self.mha1 = MultiHeadAttention(d_model, num_heads)
    self.mha2 = MultiHeadAttention(d_model, num_heads)

    self.ffn = point_wise_feed_forward_network(d_model, dff)
 
    self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    
    self.dropout1 = tf.keras.layers.Dropout(rate)
    self.dropout2 = tf.keras.layers.Dropout(rate)
    self.dropout3 = tf.keras.layers.Dropout(rate)
    
    
  def call(self, x, enc_output, training, 
           look_ahead_mask, padding_mask):
    # enc_output.shape == (batch_size, input_seq_len, d_model)

    attn1, attn_weights_block1 = self.mha1(x, x, x, look_ahead_mask)  # (batch_size, target_seq_len, d_model)
    attn1 = self.dropout1(attn1, training=training)
    out1 = self.layernorm1(attn1 + x)
    
    attn2, attn_weights_block2 = self.mha2(
        enc_output, enc_output, out1, padding_mask)  # (batch_size, target_seq_len, d_model)
    attn2 = self.dropout2(attn2, training=training)
    out2 = self.layernorm2(attn2 + out1)  # (batch_size, target_seq_len, d_model)
    
    ffn_output = self.ffn(out2)  # (batch_size, target_seq_len, d_model)
    ffn_output = self.dropout3(ffn_output, training=training)
    out3 = self.layernorm3(ffn_output + out2)  # (batch_size, target_seq_len, d_model)
    
    return out3, attn_weights_block1, attn_weights_block2

In [14]:
class Encoder(tf.keras.layers.Layer):
  def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size,
               maximum_position_encoding, rate=0.1):
    super(Encoder, self).__init__()

    self.d_model = d_model
    self.num_layers = num_layers
    
    #self.pre_embedding = tf.keras.layers.Dense(input_vocab_size, 100)
    
    self.embedding = tf.keras.layers.Embedding(input_vocab_size, d_model)
    self.pos_encoding = positional_encoding(maximum_position_encoding, 
                                            self.d_model)
    
    
    self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate) 
                       for _ in range(num_layers)]
  
    self.dropout = tf.keras.layers.Dropout(rate)
        
  def call(self, x, training, mask):

    seq_len = tf.shape(x)[1]
    
    #print("The shape of 'x' is " + str(tf.shape(x)))
    #x = self.pre_embedding(x)
    #print("After self embedding.....")
    #print("The shape of 'x' is " + str(tf.shape(x)))

    # adding embedding and position encoding.
    x = self.embedding(x)  # (batch_size, input_seq_len, d_model)
    x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    x += self.pos_encoding[:, :seq_len, :]

    x = self.dropout(x, training=training)
    
    for i in range(self.num_layers):
      x = self.enc_layers[i](x, training, mask)
    
    return x  # (batch_size, input_seq_len, d_model)


In [15]:
class Decoder(tf.keras.layers.Layer):
  def __init__(self, num_layers, d_model, num_heads, dff, target_vocab_size,
               maximum_position_encoding, rate=0.1):
    super(Decoder, self).__init__()

    self.d_model = d_model
    self.num_layers = num_layers
    
    #self.pre_embedding = tf.keras.layers.Dense(target_vocab_size, 100)
    
    self.embedding = tf.keras.layers.Embedding(target_vocab_size, d_model)
    self.pos_encoding = positional_encoding(maximum_position_encoding, d_model)
    
    self.dec_layers = [DecoderLayer(d_model, num_heads, dff, rate) 
                       for _ in range(num_layers)]
    self.dropout = tf.keras.layers.Dropout(rate)
    
  def call(self, x, enc_output, training, 
           look_ahead_mask, padding_mask):
    #print("The shape of 'x' is " + str(tf.shape(x)))

    seq_len = tf.shape(x)[1]
    #print("'seq_len' is " + str(seq_len))
    
    attention_weights = {}
    
    #x = self.pre_embedding(x)

    x = self.embedding(x)  # (batch_size, target_seq_len, d_model)
    x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    x += self.pos_encoding[:, :seq_len, :]
    
    x = self.dropout(x, training=training)

    for i in range(self.num_layers):
      x, block1, block2 = self.dec_layers[i](x, enc_output, training,
                                             look_ahead_mask, padding_mask)
      
      attention_weights['decoder_layer{}_block1'.format(i+1)] = block1
      attention_weights['decoder_layer{}_block2'.format(i+1)] = block2
    
    # x.shape == (batch_size, target_seq_len, d_model)
    #print("The shape of 'x' is " + str(tf.shape(x)))
    
    return x, attention_weights

In [16]:
class Transformer(tf.keras.Model):
  def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, 
               target_vocab_size, pe_input, pe_target, rate=0.1):
    super(Transformer, self).__init__()

    self.encoder = Encoder(num_layers, d_model, num_heads, dff, 
                           input_vocab_size, pe_input, rate)

    self.decoder = Decoder(num_layers, d_model, num_heads, dff, 
                           target_vocab_size, pe_target, rate)

    self.final_layer = tf.keras.layers.Dense(target_vocab_size)
    
  def call(self, inp, tar, training, enc_padding_mask, 
           look_ahead_mask, dec_padding_mask):

    enc_output = self.encoder(inp, training, enc_padding_mask)  # (batch_size, inp_seq_len, d_model)
    
    '''
    The output of the last layer of the encoder is passed to all the layers of the decoder. 
    '''
    #dec_output.shape == (batch_size, tar_seq_len, d_model)
    dec_output, attention_weights = self.decoder(
        tar, enc_output, training, look_ahead_mask, dec_padding_mask)
    
    '''
    Tee final part of Transformer model. In case of machine translation, you predict a 
    'target_vocab_size' dimensional vector at every potition of the target sentence. 
    '''
    final_output = self.final_layer(dec_output)  # (batch_size, tar_seq_len, target_vocab_size)
    
    return final_output, attention_weights

In [37]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
  def __init__(self, d_model, warmup_steps=4000):
    super(CustomSchedule, self).__init__()
    
    self.d_model = d_model
    self.d_model = tf.cast(self.d_model, tf.float32)

    self.warmup_steps = warmup_steps
    
  def __call__(self, step):
    arg1 = tf.math.rsqrt(step)
    arg2 = step * (self.warmup_steps ** -1.5)
    
    return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

In [20]:
num_layers = 4
d_model = 128
dff = 512
num_heads = 8

input_vocab_size = 10000 + 2
target_vocab_size = 10000 + 2
dropout_rate = 0.1

In [39]:
learning_rate = CustomSchedule(d_model)

optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, 
                                     epsilon=1e-9)

In [40]:
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

In [41]:
def loss_function(real, pred):
  mask = tf.math.logical_not(tf.math.equal(real, 0))
  loss_ = loss_object(real, pred)

  mask = tf.cast(mask, dtype=loss_.dtype)
  loss_ *= mask
  
  return tf.reduce_sum(loss_)/tf.reduce_sum(mask)

In [42]:
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
    name='train_accuracy')

In [43]:
transformer = Transformer(num_layers, d_model, num_heads, dff,
                          input_vocab_size, target_vocab_size, 
                          pe_input=input_vocab_size, 
                          pe_target=target_vocab_size,
                          rate=dropout_rate)

In [44]:
def create_masks(inp, tar):
  # Encoder padding mask
  enc_padding_mask = create_padding_mask(inp)
  
  # Used in the 2nd attention block in the decoder.
  # This padding mask is used to mask the encoder outputs.
  dec_padding_mask = create_padding_mask(inp)
  
  # Used in the 1st attention block in the decoder.
  # It is used to pad and mask future tokens in the input received by 
  # the decoder.
  look_ahead_mask = create_look_ahead_mask(tf.shape(tar)[1])
  dec_target_padding_mask = create_padding_mask(tar)
  combined_mask = tf.maximum(dec_target_padding_mask, look_ahead_mask)
  
  return enc_padding_mask, combined_mask, dec_padding_mask

In [45]:
checkpoint_path = "./checkpoints_deu/train"

ckpt = tf.train.Checkpoint(transformer=transformer,
                           optimizer=optimizer)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

# if a checkpoint exists, restore the latest checkpoint.
if ckpt_manager.latest_checkpoint:
  ckpt.restore(ckpt_manager.latest_checkpoint)
  print ('Latest checkpoint restored!!')

In [46]:
EPOCHS = 30

In [47]:
# The @tf.function trace-compiles train_step into a TF graph for faster
# execution. The function specializes to the precise shape of the argument
# tensors. To avoid re-tracing due to the variable sequence lengths or variable
# batch sizes (the last batch is smaller), use input_signature to specify
# more generic shapes.

train_step_signature = [
    tf.TensorSpec(shape=(None, None), dtype=tf.int32),
    tf.TensorSpec(shape=(None, None), dtype=tf.int32),
]

@tf.function(input_signature=train_step_signature)
def train_step(inp, tar):
  tar_inp = tar[:, :-1]
  tar_real = tar[:, 1:]
  
  enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp, tar_inp)
  
  with tf.GradientTape() as tape:
    predictions, _ = transformer(inp, tar_inp, 
                                 True, 
                                 enc_padding_mask, 
                                 combined_mask, 
                                 dec_padding_mask)
    loss = loss_function(tar_real, predictions)

  gradients = tape.gradient(loss, transformer.trainable_variables)    
  optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))
  
  train_loss(loss)
  train_accuracy(tar_real, predictions)

In [49]:
tf.debugging.set_log_device_placement(True)
with tf.device('/GPU:0'):

    for epoch in range(EPOCHS):
      start = time.time()
  
      train_loss.reset_states()
      train_accuracy.reset_states()
  
      # inp -> portuguese, tar -> english
      for (batch, (inp, tar)) in enumerate(dataset.take(steps_per_epoch)):
        train_step(inp, tar)
    
        if batch % 50 == 0:
          print ('Epoch {} Batch {} Loss {:.4f} Accuracy {:.4f}'.format(
              epoch + 1, batch, train_loss.result(), train_accuracy.result()))
      
      if (epoch + 1) % 5 == 0:
        ckpt_save_path = ckpt_manager.save()
        print ('Saving checkpoint for epoch {} at {}'.format(epoch+1,
                                                         ckpt_save_path))
    
      print ('Epoch {} Loss {:.4f} Accuracy {:.4f}'.format(epoch + 1, 
                                                train_loss.result(), 
                                                train_accuracy.result()))

      print ('Time taken for 1 epoch: {} secs\n'.format(time.time() - start))

Epoch 1 Batch 0 Loss 0.7406 Accuracy 0.2195
Epoch 1 Batch 50 Loss 0.7292 Accuracy 0.2199
Epoch 1 Batch 100 Loss 0.7118 Accuracy 0.2190
Epoch 1 Batch 150 Loss 0.7136 Accuracy 0.2198
Epoch 1 Batch 200 Loss 0.7044 Accuracy 0.2200
Epoch 1 Batch 250 Loss 0.7053 Accuracy 0.2200
Epoch 1 Batch 300 Loss 0.7083 Accuracy 0.2197
Epoch 1 Batch 350 Loss 0.7109 Accuracy 0.2195
Epoch 1 Batch 400 Loss 0.7130 Accuracy 0.2195
Epoch 1 Batch 450 Loss 0.7103 Accuracy 0.2197
Epoch 1 Batch 500 Loss 0.7128 Accuracy 0.2196
Epoch 1 Batch 550 Loss 0.7142 Accuracy 0.2198
Epoch 1 Batch 600 Loss 0.7158 Accuracy 0.2199
Epoch 1 Batch 650 Loss 0.7182 Accuracy 0.2199
Epoch 1 Batch 700 Loss 0.7188 Accuracy 0.2199
Epoch 1 Batch 750 Loss 0.7191 Accuracy 0.2199
Epoch 1 Batch 800 Loss 0.7208 Accuracy 0.2199
Epoch 1 Batch 850 Loss 0.7210 Accuracy 0.2199
Epoch 1 Batch 900 Loss 0.7232 Accuracy 0.2198
Epoch 1 Batch 950 Loss 0.7233 Accuracy 0.2196
Epoch 1 Batch 1000 Loss 0.7253 Accuracy 0.2196
Epoch 1 Batch 1050 Loss 0.7267 Accur

Epoch 4 Batch 0 Loss 0.5321 Accuracy 0.2306
Epoch 4 Batch 50 Loss 0.6658 Accuracy 0.2194
Epoch 4 Batch 100 Loss 0.6744 Accuracy 0.2193
Epoch 4 Batch 150 Loss 0.6730 Accuracy 0.2197
Epoch 4 Batch 200 Loss 0.6725 Accuracy 0.2203
Epoch 4 Batch 250 Loss 0.6716 Accuracy 0.2206
Epoch 4 Batch 300 Loss 0.6690 Accuracy 0.2206
Epoch 4 Batch 350 Loss 0.6690 Accuracy 0.2209
Epoch 4 Batch 400 Loss 0.6712 Accuracy 0.2211
Epoch 4 Batch 450 Loss 0.6713 Accuracy 0.2209
Epoch 4 Batch 500 Loss 0.6745 Accuracy 0.2213
Epoch 4 Batch 550 Loss 0.6747 Accuracy 0.2213
Epoch 4 Batch 600 Loss 0.6764 Accuracy 0.2213
Epoch 4 Batch 650 Loss 0.6796 Accuracy 0.2214
Epoch 4 Batch 700 Loss 0.6783 Accuracy 0.2214
Epoch 4 Batch 750 Loss 0.6792 Accuracy 0.2214
Epoch 4 Batch 800 Loss 0.6799 Accuracy 0.2213
Epoch 4 Batch 850 Loss 0.6803 Accuracy 0.2213
Epoch 4 Batch 900 Loss 0.6805 Accuracy 0.2213
Epoch 4 Batch 950 Loss 0.6812 Accuracy 0.2213
Epoch 4 Batch 1000 Loss 0.6832 Accuracy 0.2212
Epoch 4 Batch 1050 Loss 0.6846 Accur

Epoch 6 Loss 0.6946 Accuracy 0.2207
Time taken for 1 epoch: 1033.7624320983887 secs

Epoch 7 Batch 0 Loss 0.7654 Accuracy 0.2184
Epoch 7 Batch 50 Loss 0.6507 Accuracy 0.2216
Epoch 7 Batch 100 Loss 0.6614 Accuracy 0.2219
Epoch 7 Batch 150 Loss 0.6496 Accuracy 0.2216
Epoch 7 Batch 200 Loss 0.6513 Accuracy 0.2214
Epoch 7 Batch 250 Loss 0.6530 Accuracy 0.2217
Epoch 7 Batch 300 Loss 0.6522 Accuracy 0.2216
Epoch 7 Batch 350 Loss 0.6517 Accuracy 0.2217
Epoch 7 Batch 400 Loss 0.6538 Accuracy 0.2217
Epoch 7 Batch 450 Loss 0.6548 Accuracy 0.2219
Epoch 7 Batch 500 Loss 0.6540 Accuracy 0.2220
Epoch 7 Batch 550 Loss 0.6552 Accuracy 0.2219
Epoch 7 Batch 600 Loss 0.6564 Accuracy 0.2217
Epoch 7 Batch 650 Loss 0.6586 Accuracy 0.2216
Epoch 7 Batch 700 Loss 0.6598 Accuracy 0.2216
Epoch 7 Batch 750 Loss 0.6608 Accuracy 0.2217
Epoch 7 Batch 800 Loss 0.6617 Accuracy 0.2217
Epoch 7 Batch 850 Loss 0.6639 Accuracy 0.2217
Epoch 7 Batch 900 Loss 0.6638 Accuracy 0.2216
Epoch 7 Batch 950 Loss 0.6651 Accuracy 0.221

Epoch 9 Loss 0.6829 Accuracy 0.2211
Time taken for 1 epoch: 1028.964898109436 secs

Epoch 10 Batch 0 Loss 0.6308 Accuracy 0.2252
Epoch 10 Batch 50 Loss 0.6351 Accuracy 0.2254
Epoch 10 Batch 100 Loss 0.6361 Accuracy 0.2244
Epoch 10 Batch 150 Loss 0.6279 Accuracy 0.2243
Epoch 10 Batch 200 Loss 0.6345 Accuracy 0.2237
Epoch 10 Batch 250 Loss 0.6365 Accuracy 0.2235
Epoch 10 Batch 300 Loss 0.6422 Accuracy 0.2232
Epoch 10 Batch 350 Loss 0.6408 Accuracy 0.2231
Epoch 10 Batch 400 Loss 0.6421 Accuracy 0.2231
Epoch 10 Batch 450 Loss 0.6414 Accuracy 0.2227
Epoch 10 Batch 500 Loss 0.6432 Accuracy 0.2227
Epoch 10 Batch 550 Loss 0.6429 Accuracy 0.2226
Epoch 10 Batch 600 Loss 0.6455 Accuracy 0.2225
Epoch 10 Batch 650 Loss 0.6449 Accuracy 0.2224
Epoch 10 Batch 700 Loss 0.6457 Accuracy 0.2224
Epoch 10 Batch 750 Loss 0.6474 Accuracy 0.2224
Epoch 10 Batch 800 Loss 0.6484 Accuracy 0.2225
Epoch 10 Batch 850 Loss 0.6480 Accuracy 0.2224
Epoch 10 Batch 900 Loss 0.6486 Accuracy 0.2223
Epoch 10 Batch 950 Loss 0.

Epoch 12 Batch 2600 Loss 0.6703 Accuracy 0.2215
Epoch 12 Batch 2650 Loss 0.6707 Accuracy 0.2216
Epoch 12 Batch 2700 Loss 0.6708 Accuracy 0.2215
Epoch 12 Batch 2750 Loss 0.6716 Accuracy 0.2215
Epoch 12 Batch 2800 Loss 0.6724 Accuracy 0.2215
Epoch 12 Loss 0.6725 Accuracy 0.2215
Time taken for 1 epoch: 1030.0560247898102 secs

Epoch 13 Batch 0 Loss 0.5989 Accuracy 0.2168
Epoch 13 Batch 50 Loss 0.5912 Accuracy 0.2250
Epoch 13 Batch 100 Loss 0.6142 Accuracy 0.2247
Epoch 13 Batch 150 Loss 0.6194 Accuracy 0.2244
Epoch 13 Batch 200 Loss 0.6235 Accuracy 0.2244
Epoch 13 Batch 250 Loss 0.6243 Accuracy 0.2241
Epoch 13 Batch 300 Loss 0.6251 Accuracy 0.2241
Epoch 13 Batch 350 Loss 0.6263 Accuracy 0.2241
Epoch 13 Batch 400 Loss 0.6273 Accuracy 0.2240
Epoch 13 Batch 450 Loss 0.6271 Accuracy 0.2241
Epoch 13 Batch 500 Loss 0.6273 Accuracy 0.2240
Epoch 13 Batch 550 Loss 0.6304 Accuracy 0.2239
Epoch 13 Batch 600 Loss 0.6328 Accuracy 0.2236
Epoch 13 Batch 650 Loss 0.6348 Accuracy 0.2235
Epoch 13 Batch 700 

Epoch 15 Batch 2400 Loss 0.6581 Accuracy 0.2220
Epoch 15 Batch 2450 Loss 0.6584 Accuracy 0.2220
Epoch 15 Batch 2500 Loss 0.6591 Accuracy 0.2220
Epoch 15 Batch 2550 Loss 0.6599 Accuracy 0.2220
Epoch 15 Batch 2600 Loss 0.6606 Accuracy 0.2220
Epoch 15 Batch 2650 Loss 0.6609 Accuracy 0.2220
Epoch 15 Batch 2700 Loss 0.6613 Accuracy 0.2219
Epoch 15 Batch 2750 Loss 0.6619 Accuracy 0.2219
Epoch 15 Batch 2800 Loss 0.6623 Accuracy 0.2219
Saving checkpoint for epoch 15 at ./checkpoints_deu/train_test/ckpt-3
Epoch 15 Loss 0.6623 Accuracy 0.2219
Time taken for 1 epoch: 1107.8165571689606 secs

Epoch 16 Batch 0 Loss 0.4358 Accuracy 0.2149
Epoch 16 Batch 50 Loss 0.6044 Accuracy 0.2250
Epoch 16 Batch 100 Loss 0.6122 Accuracy 0.2244
Epoch 16 Batch 150 Loss 0.6114 Accuracy 0.2242
Epoch 16 Batch 200 Loss 0.6154 Accuracy 0.2253
Epoch 16 Batch 250 Loss 0.6106 Accuracy 0.2252
Epoch 16 Batch 300 Loss 0.6152 Accuracy 0.2247
Epoch 16 Batch 350 Loss 0.6152 Accuracy 0.2244
Epoch 16 Batch 400 Loss 0.6153 Accuracy

Epoch 18 Batch 2150 Loss 0.6486 Accuracy 0.2226
Epoch 18 Batch 2200 Loss 0.6490 Accuracy 0.2226
Epoch 18 Batch 2250 Loss 0.6497 Accuracy 0.2226
Epoch 18 Batch 2300 Loss 0.6511 Accuracy 0.2226
Epoch 18 Batch 2350 Loss 0.6513 Accuracy 0.2225
Epoch 18 Batch 2400 Loss 0.6524 Accuracy 0.2225
Epoch 18 Batch 2450 Loss 0.6525 Accuracy 0.2225
Epoch 18 Batch 2500 Loss 0.6527 Accuracy 0.2224
Epoch 18 Batch 2550 Loss 0.6532 Accuracy 0.2223
Epoch 18 Batch 2600 Loss 0.6536 Accuracy 0.2223
Epoch 18 Batch 2650 Loss 0.6541 Accuracy 0.2223
Epoch 18 Batch 2700 Loss 0.6542 Accuracy 0.2223
Epoch 18 Batch 2750 Loss 0.6546 Accuracy 0.2223
Epoch 18 Batch 2800 Loss 0.6550 Accuracy 0.2223
Epoch 18 Loss 0.6551 Accuracy 0.2223
Time taken for 1 epoch: 1114.3307218551636 secs

Epoch 19 Batch 0 Loss 0.7097 Accuracy 0.2172
Epoch 19 Batch 50 Loss 0.6255 Accuracy 0.2237
Epoch 19 Batch 100 Loss 0.6233 Accuracy 0.2234
Epoch 19 Batch 150 Loss 0.6187 Accuracy 0.2231
Epoch 19 Batch 200 Loss 0.6173 Accuracy 0.2232
Epoch 19 B

Epoch 21 Batch 1900 Loss 0.6394 Accuracy 0.2231
Epoch 21 Batch 1950 Loss 0.6407 Accuracy 0.2231
Epoch 21 Batch 2000 Loss 0.6413 Accuracy 0.2230
Epoch 21 Batch 2050 Loss 0.6415 Accuracy 0.2229
Epoch 21 Batch 2100 Loss 0.6421 Accuracy 0.2229
Epoch 21 Batch 2150 Loss 0.6423 Accuracy 0.2229
Epoch 21 Batch 2200 Loss 0.6425 Accuracy 0.2228
Epoch 21 Batch 2250 Loss 0.6432 Accuracy 0.2228
Epoch 21 Batch 2300 Loss 0.6441 Accuracy 0.2227
Epoch 21 Batch 2350 Loss 0.6445 Accuracy 0.2228
Epoch 21 Batch 2400 Loss 0.6454 Accuracy 0.2227
Epoch 21 Batch 2450 Loss 0.6456 Accuracy 0.2227
Epoch 21 Batch 2500 Loss 0.6461 Accuracy 0.2226
Epoch 21 Batch 2550 Loss 0.6464 Accuracy 0.2226
Epoch 21 Batch 2600 Loss 0.6467 Accuracy 0.2226
Epoch 21 Batch 2650 Loss 0.6470 Accuracy 0.2226
Epoch 21 Batch 2700 Loss 0.6475 Accuracy 0.2226
Epoch 21 Batch 2750 Loss 0.6475 Accuracy 0.2226
Epoch 21 Batch 2800 Loss 0.6477 Accuracy 0.2226
Epoch 21 Loss 0.6477 Accuracy 0.2226
Time taken for 1 epoch: 1044.5922532081604 secs

Ep

Epoch 24 Batch 1700 Loss 0.6312 Accuracy 0.2231
Epoch 24 Batch 1750 Loss 0.6311 Accuracy 0.2231
Epoch 24 Batch 1800 Loss 0.6315 Accuracy 0.2231
Epoch 24 Batch 1850 Loss 0.6318 Accuracy 0.2231
Epoch 24 Batch 1900 Loss 0.6323 Accuracy 0.2230
Epoch 24 Batch 1950 Loss 0.6330 Accuracy 0.2230
Epoch 24 Batch 2000 Loss 0.6333 Accuracy 0.2230
Epoch 24 Batch 2050 Loss 0.6336 Accuracy 0.2230
Epoch 24 Batch 2100 Loss 0.6341 Accuracy 0.2230
Epoch 24 Batch 2150 Loss 0.6343 Accuracy 0.2230
Epoch 24 Batch 2200 Loss 0.6351 Accuracy 0.2230
Epoch 24 Batch 2250 Loss 0.6359 Accuracy 0.2230
Epoch 24 Batch 2300 Loss 0.6361 Accuracy 0.2229
Epoch 24 Batch 2350 Loss 0.6368 Accuracy 0.2229
Epoch 24 Batch 2400 Loss 0.6377 Accuracy 0.2229
Epoch 24 Batch 2450 Loss 0.6378 Accuracy 0.2228
Epoch 24 Batch 2500 Loss 0.6379 Accuracy 0.2228
Epoch 24 Batch 2550 Loss 0.6386 Accuracy 0.2229
Epoch 24 Batch 2600 Loss 0.6392 Accuracy 0.2229
Epoch 24 Batch 2650 Loss 0.6395 Accuracy 0.2229
Epoch 24 Batch 2700 Loss 0.6399 Accuracy

Epoch 27 Batch 1450 Loss 0.6210 Accuracy 0.2235
Epoch 27 Batch 1500 Loss 0.6216 Accuracy 0.2235
Epoch 27 Batch 1550 Loss 0.6223 Accuracy 0.2234
Epoch 27 Batch 1600 Loss 0.6230 Accuracy 0.2234
Epoch 27 Batch 1650 Loss 0.6241 Accuracy 0.2233
Epoch 27 Batch 1700 Loss 0.6247 Accuracy 0.2233
Epoch 27 Batch 1750 Loss 0.6263 Accuracy 0.2233
Epoch 27 Batch 1800 Loss 0.6263 Accuracy 0.2232
Epoch 27 Batch 1850 Loss 0.6268 Accuracy 0.2232
Epoch 27 Batch 1900 Loss 0.6277 Accuracy 0.2233
Epoch 27 Batch 1950 Loss 0.6278 Accuracy 0.2232
Epoch 27 Batch 2000 Loss 0.6283 Accuracy 0.2232
Epoch 27 Batch 2050 Loss 0.6281 Accuracy 0.2232
Epoch 27 Batch 2100 Loss 0.6282 Accuracy 0.2232
Epoch 27 Batch 2150 Loss 0.6284 Accuracy 0.2232
Epoch 27 Batch 2200 Loss 0.6287 Accuracy 0.2232
Epoch 27 Batch 2250 Loss 0.6296 Accuracy 0.2232
Epoch 27 Batch 2300 Loss 0.6304 Accuracy 0.2231
Epoch 27 Batch 2350 Loss 0.6312 Accuracy 0.2231
Epoch 27 Batch 2400 Loss 0.6317 Accuracy 0.2231
Epoch 27 Batch 2450 Loss 0.6322 Accuracy

Epoch 30 Batch 1250 Loss 0.6133 Accuracy 0.2241
Epoch 30 Batch 1300 Loss 0.6141 Accuracy 0.2240
Epoch 30 Batch 1350 Loss 0.6145 Accuracy 0.2240
Epoch 30 Batch 1400 Loss 0.6152 Accuracy 0.2240
Epoch 30 Batch 1450 Loss 0.6159 Accuracy 0.2239
Epoch 30 Batch 1500 Loss 0.6166 Accuracy 0.2238
Epoch 30 Batch 1550 Loss 0.6170 Accuracy 0.2237
Epoch 30 Batch 1600 Loss 0.6178 Accuracy 0.2237
Epoch 30 Batch 1650 Loss 0.6176 Accuracy 0.2237
Epoch 30 Batch 1700 Loss 0.6175 Accuracy 0.2237
Epoch 30 Batch 1750 Loss 0.6185 Accuracy 0.2237
Epoch 30 Batch 1800 Loss 0.6188 Accuracy 0.2237
Epoch 30 Batch 1850 Loss 0.6189 Accuracy 0.2237
Epoch 30 Batch 1900 Loss 0.6197 Accuracy 0.2237
Epoch 30 Batch 1950 Loss 0.6207 Accuracy 0.2236
Epoch 30 Batch 2000 Loss 0.6213 Accuracy 0.2235
Epoch 30 Batch 2050 Loss 0.6219 Accuracy 0.2235
Epoch 30 Batch 2100 Loss 0.6225 Accuracy 0.2235
Epoch 30 Batch 2150 Loss 0.6237 Accuracy 0.2234
Epoch 30 Batch 2200 Loss 0.6242 Accuracy 0.2234
Epoch 30 Batch 2250 Loss 0.6249 Accuracy