In [None]:
from matplotlib import pyplot as plt
import numpy as np
import tensorflow as tf
from Transformers_Google import *
import dataloaders_git as d

In [2]:
en, fr = d.create_dataset("./pairs_en_fr.txt")
input_tensor, inp_lang_tokenizer = d.tokenize(list(en))
target_tensor, targ_lang_tokenizer = d.tokenize(list(fr))

In [3]:
from sklearn.model_selection import train_test_split
input_tensor_train, \
input_tensor_val, \
target_tensor_train, \
target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.2, random_state=1234)

## Keeping gold for use later.. 
en_train, \
en_val, \
fr_train, \
fr_val = train_test_split(en, fr, test_size=0.2, random_state=1234)

In [4]:
BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 64
steps_per_epoch = len(input_tensor_train)//BATCH_SIZE

train_dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
train_dataset = train_dataset.batch(BATCH_SIZE, drop_remainder=True)

val_dataset = tf.data.Dataset.from_tensor_slices((input_tensor_val, target_tensor_val)).shuffle(BUFFER_SIZE)
val_dataset = val_dataset.batch(BATCH_SIZE*2, drop_remainder=True)

In [5]:
num_layers = 4 #6
d_model = 256  #512
dff = 1024      #2048
num_heads = 8  

input_vocab_size = len(inp_lang_tokenizer.word_index)+1
target_vocab_size = len(targ_lang_tokenizer.word_index)+1
dropout_rate = 0.1

learning_rate = CustomSchedule(d_model)
optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, 
                                     epsilon=1e-9)
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
    name='train_accuracy')
val_loss = tf.keras.metrics.Mean(name='val_loss')
val_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
    name='val_accuracy')

transformer = Transformer(num_layers, d_model, num_heads, dff,
                          input_vocab_size, target_vocab_size, 
                          pe_input=input_vocab_size, 
                          pe_target=target_vocab_size,
                          rate=dropout_rate)

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_sum(loss_)/tf.reduce_sum(mask)


In [6]:
checkpoint_path = "./checkpoints/train/"

ckpt = tf.train.Checkpoint(transformer=transformer,
                           optimizer=optimizer)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

# if a checkpoint exists, restore the latest checkpoint.
# if ckpt_manager.latest_checkpoint:
#   ckpt.restore(ckpt_manager.latest_checkpoint)
#   print ('Latest checkpoint restored!!')

In [7]:
EPOCHS = 60

In [8]:
# The @tf.function trace-compiles train_step into a TF graph for faster
# execution. The function specializes to the precise shape of the argument
# tensors. To avoid re-tracing due to the variable sequence lengths or variable
# batch sizes (the last batch is smaller), use input_signature to specify
# more generic shapes.

# train_step_signature = [
#     tf.TensorSpec(shape=(None, None), dtype=tf.int64),
#     tf.TensorSpec(shape=(None, None), dtype=tf.int64),
# ]

# @tf.function(input_signature=train_step_signature)
def train_step(inp, tar):
    tar_inp = tar[:, :-1]
    tar_real = tar[:, 1:]

    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp, tar_inp)

    with tf.GradientTape() as tape:
        predictions, _ = transformer(inp, tar_inp, 
                                     True, 
                                     enc_padding_mask, 
                                     combined_mask, 
                                     dec_padding_mask)
        loss = loss_function(tar_real, predictions)

    gradients = tape.gradient(loss, transformer.trainable_variables)    
    optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))

    train_loss(loss)
    train_accuracy(tar_real, predictions)
    
    
def val_step(inp, tar):
    tar_inp = tar[:, :-1]
    tar_real = tar[:, 1:]

    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp, tar_inp)

    predictions, _ = transformer(inp, tar_inp, 
                                 False, 
                                 enc_padding_mask, 
                                 combined_mask, 
                                 dec_padding_mask)
    loss = loss_function(tar_real, predictions)
        
    val_loss(loss)
    val_accuracy(tar_real, predictions)

In [9]:
def max_length(tensor):
    return max(len(t) for t in tensor)

max_length_targ, max_length_inp = max_length(target_tensor), max_length(input_tensor)

def evaluate_batch(inp_tensor):
  # Expecting input from the val_dataset which is already tokenised.
  
    encoder_input = tf.convert_to_tensor(inp_tensor)
    decoder_input = tf.expand_dims([targ_lang_tokenizer.word_index['<start>']] * BATCH_SIZE*2, axis=1)
    output = decoder_input
    
    for i in range(max_length_targ):

        enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
            encoder_input, output)

        # predictions.shape == (batch_size, seq_len, vocab_size)
        predictions, attention_weights = transformer(encoder_input, 
                                                     output,
                                                     False,
                                                     enc_padding_mask,
                                                     combined_mask,
                                                     dec_padding_mask)

        # select the last word from the seq_len dimension
        predictions = predictions[: ,-1:, :]  # (batch_size, 1, vocab_size)

        predicted_id = tf.cast(tf.argmax(predictions, axis=-1), tf.int32)

        # return the result if the predicted_id is equal to the end token for all the batches
        if (predicted_id == targ_lang_tokenizer.word_index['<end>']).numpy().all():
            return output, attention_weights

        # concatentate the predicted_id to the output which is given to the decoder
        # as its input.
        output = tf.concat([output, predicted_id], axis=-1)

    return output, attention_weights


def translate_batch(inp, tar):
    output,_ = evaluate_batch(inp)
    pred_sentences = targ_lang_tokenizer.sequences_to_texts(output.numpy())
    pred_sentences = [x.split("<end>")[0].replace("<start>","").strip() for x in pred_sentences]
    gold_sentences = targ_lang_tokenizer.sequences_to_texts(tar.numpy())
    gold_sentences = [x.replace('<start> ', "").replace(' <end>', "").replace('<OOV>', "").strip() for x in gold_sentences]
    return gold_sentences, pred_sentences


In [10]:
import time
print("--Training--")
for epoch in range(EPOCHS):
    start = time.time()

    train_loss.reset_states()
    train_accuracy.reset_states()
    val_loss.reset_states()
    val_accuracy.reset_states()
  
    for (batch, (inp, tar)) in enumerate(train_dataset):
        train_step(inp, tar)
        
        if batch % 50 == 0:
            print ('Epoch {} Batch {} Train Loss {:.4f} Train Accuracy {:.4f}'.format(
              epoch + 1, batch, train_loss.result(), train_accuracy.result()))
    
    if epoch==0 or (epoch+1)%5==0:
        for (batch, (inp, tar)) in enumerate(val_dataset):
            val_step(inp, tar)
        
        print ('Epoch {} Train Loss {:.4f} Train Accuracy {:.4f} Val Loss {:.4f} Val Accuracy {:.4f}'.format(epoch + 1, 
                                                train_loss.result(), 
                                                train_accuracy.result(),
                                                val_loss.result(),                                                    
                                                val_accuracy.result()))
        
        
    else:
        print ('Epoch {} Train Loss {:.4f} Train Accuracy {:.4f}'.format(epoch + 1, 
                                                train_loss.result(), 
                                                train_accuracy.result()))    
    
    if (epoch+1)%5==0:
        ckpt_save_path = ckpt_manager.save()
        print ('Saving checkpoint for epoch {} at {}'.format(epoch+1,
                                                             ckpt_save_path))

  

    print('Time taken for 1 epoch: {} secs\n'.format(time.time() - start))
    
    if (epoch+1)%10==0:
        new_start = time.time()
        print("--Saving files to get Bleu Scores--")
        print("Batch Size for Evaluation", BATCH_SIZE*2)
        gold_file_path = "get_bleu_gold_epoch_"+str(epoch+1)+".txt"
        pred_file_path = "get_bleu_preds_epoch_"+str(epoch+1)+".txt"
        with open(gold_file_path, 'w', encoding='utf-8', buffering=1) as gold_file, open(pred_file_path, 'w', encoding='utf-8', buffering=1) as pred_file:
            for (batch, (inp, tar)) in enumerate(val_dataset):
                if batch%5==0:
                    print("Evaluating for batch", batch)
                gold_fr,pred_fr = translate_batch(inp, tar)
                for g_fr,p_fr in zip(gold_fr, pred_fr):
                    gold_file.write(g_fr.strip() + '\n')
                    pred_file.write(p_fr.strip() + '\n')
        
        
        print('Time taken for Evaluation: {} secs\n'.format(time.time() - new_start))
        print("Files saved:", gold_file_path)
        print("-------------")
        

--Training--
Epoch 1 Batch 0 Train Loss 9.8063 Train Accuracy 0.0000
Epoch 1 Batch 50 Train Loss 9.7243 Train Accuracy 0.0031
Epoch 1 Batch 100 Train Loss 9.5925 Train Accuracy 0.0057
Epoch 1 Train Loss 9.4850 Train Accuracy 0.0064 Val Loss 9.0125 Val Accuracy 0.0086
Time taken for 1 epoch: 133.52070951461792 secs

Epoch 2 Batch 0 Train Loss 9.0149 Train Accuracy 0.0088
Epoch 2 Batch 50 Train Loss 8.7858 Train Accuracy 0.0101
Epoch 2 Batch 100 Train Loss 8.5141 Train Accuracy 0.0119
Epoch 2 Train Loss 8.3083 Train Accuracy 0.0130
Time taken for 1 epoch: 119.84666609764099 secs

Epoch 3 Batch 0 Train Loss 7.4687 Train Accuracy 0.0171
Epoch 3 Batch 50 Train Loss 7.2187 Train Accuracy 0.0177
Epoch 3 Batch 100 Train Loss 7.0253 Train Accuracy 0.0194
Epoch 3 Train Loss 6.9297 Train Accuracy 0.0204
Time taken for 1 epoch: 119.87369441986084 secs

Epoch 4 Batch 0 Train Loss 6.5702 Train Accuracy 0.0221
Epoch 4 Batch 50 Train Loss 6.5141 Train Accuracy 0.0232
Epoch 4 Batch 100 Train Loss 6.461

## translate one sentence

In [18]:
def evaluate(inp_sentence):
  start_token = 3 #targ_lang_tokenizer.index_word[4]
  end_token = 4 #[tokenizer_pt.vocab_size + 1]
  
  sentence = preprocess_sentence(inp_sentence)

  inputs = [inp_lang_tokenizer.word_index[i] for i in sentence.split(' ')]
  inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],
                                                         maxlen=max_length_inp,
                                                         padding='post')

#   inp_sentence = tf.convert_to_tensor(inputs)
  encoder_input = tf.convert_to_tensor(inputs)#tf.expand_dims(inputs, 0)
  
  # as the target is english, the first word to the transformer should be the
  # english start token.
  decoder_input = [targ_lang_tokenizer.word_index['<start>']]
  output = tf.expand_dims(decoder_input, 0)
    
    

  result = ''
  for i in range(max_length_targ):
#     print(encoder_input.shape)
#     print(encoder_input)
#     print(output.shape)
#     print(output)
    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
        encoder_input, output)
    


    # predictions.shape == (batch_size, seq_len, vocab_size)
    predictions, attention_weights = transformer(encoder_input, 
                                                 output,
                                                 False,
                                                 enc_padding_mask,
                                                 combined_mask,
                                                 dec_padding_mask)
    
    # select the last word from the seq_len dimension
    predictions = predictions[: ,-1:, :]  # (batch_size, 1, vocab_size)

    predicted_id = tf.cast(tf.argmax(predictions, axis=-1), tf.int32)
    
    
    # return the result if the predicted_id is equal to the end token
    if predicted_id == targ_lang_tokenizer.word_index['<end>']:
      return result, tf.squeeze(output, axis=0), attention_weights
    
    # concatentate the predicted_id to the output which is given to the decoder
    # as its input.
    output = tf.concat([output, predicted_id], axis=-1)
 
    result += targ_lang_tokenizer.index_word[predicted_id[0].numpy()[0]] + ' '

  return result, output, attention_weights


def translate(sentence):
  result, output, attention_plot = evaluate(sentence)
  
#   print('Input: %s' % (sentence))
#   print('Predicted translation: {}'.format(result))
  return result

In [40]:
translate("I live in my house.")

'je veux que mon maison a mon maison . '

In [None]:
sentences = inp_lang_tokenizer.sequences_to_texts(input_tensor_val)
count = 0
with open('final_results_input_25_2k.txt', 'w', encoding='utf-8') as input_file, open('final_results_prediction_25_2k.txt', 'w', encoding='utf-8') as pred_file:
    for sent in sentences:
      count+=1
      if count%50==0:
        print(count)
      if count>1000:  
        sent = sent.replace('<start> ', "")
        sent = sent.replace(' <end>', "")
        sent = sent.replace('<OOV>', "")
        input_file.write(sent.strip() + '\n')
        res = translate(sent)
        pred_file.write(res.strip() + '\n')
      if count>1500:
        break
      
        
        

50
100
150
200
250
300
350
400
450
500
550
600
650
700
750
800
850
900
950
1000
