In [1]:
from pathlib import Path
import os
from collections import Counter
import numpy as np
import time
from tqdm.notebook import tqdm

np.random.seed(8080)

data_path = Path("/content/drive/My Drive/Adv Projects in ML/data")
print(data_path)
print(os.listdir(data_path))

os.chdir("/content/drive/My Drive/Adv Projects in ML/")

!nvidia-smi

/content/drive/My Drive/Adv Projects in ML/data
['train.lang2', 'unaligned.en', 'unaligned.fr', 'train.lang1', 'split_train.lang1', 'split_val.lang2', 'split_train.lang2', 'split_val.lang1', 'unalignedtry.en', 'split_train_unaligned_tokenized_rempunc.en', 'split_val_unaligned_tokenized_rempunc.en', 'split_train_unaligned_tokenized.en', 'split_val_unaligned_tokenized.en', 'split_train_unaligned_tokenized_rempunc.fr', 'split_val_unaligned_tokenized_rempunc.fr', 'split_train_unaligned_tokenized.fr', 'split_val_unaligned_tokenized.fr', 'bpe', 'unaligned_tokenized_rempunc.en']
Mon Apr 13 09:38:04 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.64.00    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Co

In [2]:
%tensorflow_version 2.x
import tensorflow as tf
from transformer import Transformer, CustomSchedule, create_masks

print("Tensorflow version " + tf.__version__)

tf.random.set_seed(8080)
# make sure numpy seeded

Tensorflow version 2.2.0-rc2


In [3]:
# read data
with open(data_path/"split_train.lang1","r") as f:
    english = f.read()
print(len(english.split("\n")), english[:200])
    
with open(data_path/"split_train.lang2","r") as f:
    french = f.read()
print(len(french.split("\n")), french[:200])

with open(data_path/"split_val.lang1","r") as f:
    english_val = f.read()
print(len(english_val.split("\n")), english_val[:200])

with open(data_path/"split_val.lang2","r") as f:
    french_val = f.read()
print(len(french_val.split("\n")), french_val[:200])

# create vocab
english_vocab = list(set(english.replace("\n", " <eos> ").split()))
french_vocab = list(set(french.replace("\n", " <eos> ").split()))
len(english_vocab), len(french_vocab)

english_counter = Counter(english.replace("\n", " <eos> ").split())
french_counter = Counter(french.replace("\n", " <eos> ").split())
len(english_counter), len(french_counter)

english_counter.update({"<unk>":0})
french_counter.update({"<unk>":0})
english_counter.update({"<start>":0})
french_counter.update({"<start>":0})
len(english_counter), len(french_counter)

english_vocab = list(english_counter.keys())
french_vocab = list(french_counter.keys())

# # trim vocab to 10k+2, 12k+2
# english_vocab = ["<start>","<unk>"]
# for i in english_counter.most_common(10000):
#   english_vocab.append(i[0])
# french_vocab = ["<start>","<unk>"]
# for i in french_counter.most_common(12000):
#   french_vocab.append(i[0])

english_word2id = {}
english_id2word = {}
french_word2id = {}
french_id2word = {}

# start enumerate from 1 so that 0 is reserved for padding seqs 
for i, w in enumerate(english_vocab, start=1):
  english_word2id[w] = i
  english_id2word[i] = w

for i, w in enumerate(french_vocab, start=1):
  french_word2id[w] = i
  french_id2word[i] = w

len(english_word2id), len(english_id2word), len(french_word2id), len(french_id2word)

def transform_data(english_lang1, french_lang2):
  english_lines = english_lang1.split("\n")
  french_lines = french_lang2.split("\n")

  data_english = []
  data_french = []

  for line in english_lines:
    line2id = [english_word2id["<start>"]]
    for word in line.split():
      try:
        line2id.append(english_word2id[word])
      except:
        line2id.append(english_word2id["<unk>"])
    line2id.append(english_word2id["<eos>"])
    data_english.append(line2id)

  for line in french_lines:
    line2id = [french_word2id["<start>"]]
    for word in line.split():
      try:
        line2id.append(french_word2id[word])
      except:
        line2id.append(french_word2id["<unk>"])
    line2id.append(french_word2id["<eos>"])
    data_french.append(line2id)

  print(len(data_english), len(data_french))
  return data_english, data_french

data_english, data_french = transform_data(english, french)
data_english_val, data_french_val = transform_data(english_val, french_val)

english_id2word[54], len(data_english), len(data_french), len(data_english_val), len(data_french_val)

8800 as mr de castro is not present mr le foll who is replacing mr de castro has the floor
on the other hand if you 're visiting an underdeveloped country and 25 dollars buys you a gourmet meal it 's exorb
8800 Comme M. De Castro est absent , M. Le Foll , qui le remplace , a la parole .
D' un autre côté , si vous êtes dans un pays en voie de développement , où 25 dollars peuvent vous obtenir un repas de luxe
2200 what action does the council intend to take in the face of this seriously discriminatory attitude which runs contrary to the principles of the eu
where would you like to go next
if that were not enoug
2200 Quelles mesures le Conseil compte-t-il adopter face à cette attitude qui constitue une grave discrimination et est contraire aux principes sur lesquels l' Union européenne est fondée ?
Où souhaiteriez
8800 8800
2200 2200


('gas', 8800, 8800, 2200, 2200)

In [0]:
np.savez("data_and_vocab_backtranslation.npz", data_english=data_english, data_french=data_french, data_english_val=data_english_val, data_french_val=data_french_val, 
         english_word2id=english_word2id, english_id2word=english_id2word, french_word2id=french_word2id,french_id2word=french_id2word)

In [5]:
BUFFER_SIZE = len(data_french)
BATCH_SIZE = 64
EPOCHS = 50
print("No. of batches: ", np.ceil(len(data_french)/BATCH_SIZE))

# transformer hyperparams
num_layers = 1
d_model = 1024
dff = 1024
num_heads = 8
input_vocab_size = len(french_vocab) + 1
target_vocab_size = len(english_vocab) + 1
dropout_rate = 0.4
pe_input = max(max([len(i) for i in data_french]),max([len(i) for i in data_french_val]))
pe_target = max(max([len(i) for i in data_english]),max([len(i) for i in data_english_val]))
print(input_vocab_size,target_vocab_size)

# pe_input = 230
# pe_target = 200

No. of batches:  138.0
16315 12363


In [0]:
tensor_train = tf.data.Dataset.from_tensor_slices((
    tf.keras.preprocessing.sequence.pad_sequences(data_french, padding='post'),
    tf.keras.preprocessing.sequence.pad_sequences(data_english, padding='post')
)).shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=False)
tensor_val = tf.data.Dataset.from_tensor_slices((
    tf.keras.preprocessing.sequence.pad_sequences(data_french_val, padding='post'),
    tf.keras.preprocessing.sequence.pad_sequences(data_english_val, padding='post')
)).batch(BATCH_SIZE, drop_remainder=False)

In [7]:
transformer = Transformer(
    num_layers=num_layers, d_model=d_model, num_heads=num_heads, dff=dff, 
    input_vocab_size=input_vocab_size, target_vocab_size=target_vocab_size, 
    pe_input=pe_input, pe_target=pe_target, rate=dropout_rate)

temp_input = tf.random.uniform((BATCH_SIZE, pe_input), dtype=tf.int64, minval=0, maxval=200)
temp_target = tf.random.uniform((BATCH_SIZE, pe_target), dtype=tf.int64, minval=0, maxval=200)

fn_out, _ = transformer(temp_input, temp_target, training=False, 
                               enc_padding_mask=None, 
                               look_ahead_mask=None,
                               dec_padding_mask=None)

fn_out.shape  # (batch_size, tar_seq_len, target_vocab_size)

TensorShape([64, 97, 12363])

In [8]:
transformer.summary()

Model: "transformer"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
encoder (Encoder)            multiple                  23008256  
_________________________________________________________________
decoder (Decoder)            multiple                  23161856  
_________________________________________________________________
dense_16 (Dense)             multiple                  12672075  
Total params: 58,842,187
Trainable params: 58,842,187
Non-trainable params: 0
_________________________________________________________________


In [0]:
learning_rate = CustomSchedule(d_model)

optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, 
                                     epsilon=1e-9)

loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

train_loss = tf.keras.metrics.Mean(name='loss')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
    name='train_accuracy')

val_loss = tf.keras.metrics.Mean(name='loss')
val_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
    name='val_accuracy')

def loss_function(real, pred):
  mask = tf.math.logical_not(tf.math.equal(real, 0))
  loss_ = loss_object(real, pred)

  mask = tf.cast(mask, dtype=loss_.dtype)
  loss_ *= mask
  
  return tf.reduce_sum(loss_)/tf.reduce_sum(mask)

In [0]:
experiment_number = "backtranslation_7_"

checkpoint_path = "./checkpoints/train"+experiment_number

ckpt = tf.train.Checkpoint(transformer=transformer,
                           optimizer=optimizer)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=1)

writer_train = tf.summary.create_file_writer("log_dir/"+experiment_number+"_train")
writer_val = tf.summary.create_file_writer("log_dir/"+experiment_number+"_val")

# # if a checkpoint exists, restore the latest checkpoint.
# if ckpt_manager.latest_checkpoint:
#   ckpt.restore(ckpt_manager.latest_checkpoint)
#   print ('Latest checkpoint restored!!')

In [0]:
# The @tf.function trace-compiles train_step into a TF graph for faster
# execution. The function specializes to the precise shape of the argument
# tensors. To avoid re-tracing due to the variable sequence lengths or variable
# batch sizes (the last batch is smaller), use input_signature to specify
# more generic shapes.

train_step_signature = [
    tf.TensorSpec(shape=(None, None), dtype=tf.int32),
    tf.TensorSpec(shape=(None, None), dtype=tf.int32),
]

@tf.function(input_signature=train_step_signature)
def train_step(inp, tar):
  
  tar_inp = tar[:, :-1]
  tar_real = tar[:, 1:]
  
  enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp, tar_inp)

  with tf.GradientTape() as tape:
    predictions, _ = transformer(inp, tar_inp, 
                                 True, 
                                 enc_padding_mask, 
                                 combined_mask, 
                                 dec_padding_mask)
    loss = loss_function(tar_real, predictions)

  gradients = tape.gradient(loss, transformer.trainable_variables)
  optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))
  
  train_loss(loss)
  train_accuracy(tar_real, predictions)

@tf.function(input_signature=train_step_signature)
def val_step(inp, tar):
  
  tar_inp = tar[:, :-1]
  tar_real = tar[:, 1:]
  
  enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp, tar_inp)

  predictions, _ = transformer(inp, tar_inp, 
                                False, 
                                enc_padding_mask, 
                                combined_mask, 
                                dec_padding_mask)
  loss = loss_function(tar_real, predictions)
  
  val_loss(loss)
  val_accuracy(tar_real, predictions)


In [15]:
best_val_loss = np.inf

for epoch in range(EPOCHS):
  start = time.time()
  
  train_loss.reset_states()
  train_accuracy.reset_states()
  
  val_loss.reset_states()
  val_accuracy.reset_states()
  
  for (batch, (inp, tar)) in tqdm(enumerate(tensor_train)):
    train_step(inp, tar)
    if batch % 50 == 0:
      print ('Epoch {} Batch {} Training Loss {:.4f} Accuracy {:.4f}'.format(
          epoch + 1, batch, train_loss.result(), train_accuracy.result()))
    
  print ('Epoch {} Training Loss {:.4f} Accuracy {:.4f}'.format(epoch + 1, 
                                                train_loss.result(), 
                                                train_accuracy.result()))
  print ('Time taken for 1 epoch: {} secs\n'.format(time.time() - start))

  with writer_train.as_default():
    tf.summary.scalar('train_loss', train_loss.result(), step=epoch)

  print("validating")
  for (batch, (inp, tar)) in enumerate(tensor_val):
    val_step(inp, tar)
  
  print ('Epoch {} Validation Loss {:.4f} Accuracy {:.4f}'.format(epoch + 1, 
                                            val_loss.result(), 
                                            val_accuracy.result()))
  if best_val_loss > val_loss.result():
    best_val_loss = val_loss.result()
    ckpt_save_path = ckpt_manager.save()
    print ('Saving checkpoint for epoch {} at {}'.format(epoch+1,
                                                         ckpt_save_path))
  with writer_val.as_default():
    tf.summary.scalar('val_loss', val_loss.result(), step=epoch)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch 1 Batch 0 Training Loss 9.4703 Accuracy 0.0000
Epoch 1 Batch 50 Training Loss 9.3808 Accuracy 0.0012
Epoch 1 Batch 100 Training Loss 9.0990 Accuracy 0.0071

Epoch 1 Training Loss 8.8447 Accuracy 0.0086
Time taken for 1 epoch: 77.7696521282196 secs

validating
Epoch 1 Validation Loss 7.5805 Accuracy 0.0139
Saving checkpoint for epoch 1 at ./checkpoints/trainbacktranslation_7_/ckpt-1


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch 2 Batch 0 Training Loss 7.8456 Accuracy 0.0158
Epoch 2 Batch 50 Training Loss 7.5205 Accuracy 0.0138
Epoch 2 Batch 100 Training Loss 7.2518 Accuracy 0.0143

Epoch 2 Training Loss 7.1055 Accuracy 0.0148
Time taken for 1 epoch: 72.24212980270386 secs

validating
Epoch 2 Validation Loss 6.6650 Accuracy 0.0194
Saving checkpoint for epoch 2 at ./checkpoints/trainbacktranslation_7_/ckpt-2


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch 3 Batch 0 Training Loss 6.6215 Accuracy 0.0200
Epoch 3 Batch 50 Training Loss 6.5780 Accuracy 0.0187
Epoch 3 Batch 100 Training Loss 6.5402 Accuracy 0.0196

Epoch 3 Training Loss 6.5090 Accuracy 0.0207
Time taken for 1 epoch: 72.14341235160828 secs

validating
Epoch 3 Validation Loss 6.5218 Accuracy 0.0276
Saving checkpoint for epoch 3 at ./checkpoints/trainbacktranslation_7_/ckpt-3


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch 4 Batch 0 Training Loss 6.4169 Accuracy 0.0293
Epoch 4 Batch 50 Training Loss 6.3022 Accuracy 0.0254
Epoch 4 Batch 100 Training Loss 6.2494 Accuracy 0.0268

Epoch 4 Training Loss 6.2225 Accuracy 0.0275
Time taken for 1 epoch: 72.12729740142822 secs

validating
Epoch 4 Validation Loss 6.3407 Accuracy 0.0340
Saving checkpoint for epoch 4 at ./checkpoints/trainbacktranslation_7_/ckpt-4


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch 5 Batch 0 Training Loss 6.0755 Accuracy 0.0304
Epoch 5 Batch 50 Training Loss 5.9843 Accuracy 0.0325
Epoch 5 Batch 100 Training Loss 5.9543 Accuracy 0.0332

Epoch 5 Training Loss 5.9252 Accuracy 0.0336
Time taken for 1 epoch: 71.36136603355408 secs

validating
Epoch 5 Validation Loss 6.1586 Accuracy 0.0391
Saving checkpoint for epoch 5 at ./checkpoints/trainbacktranslation_7_/ckpt-5


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch 6 Batch 0 Training Loss 5.8306 Accuracy 0.0350
Epoch 6 Batch 50 Training Loss 5.7053 Accuracy 0.0369
Epoch 6 Batch 100 Training Loss 5.6788 Accuracy 0.0377

Epoch 6 Training Loss 5.6566 Accuracy 0.0381
Time taken for 1 epoch: 72.21765494346619 secs

validating
Epoch 6 Validation Loss 5.9243 Accuracy 0.0441
Saving checkpoint for epoch 6 at ./checkpoints/trainbacktranslation_7_/ckpt-6


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch 7 Batch 0 Training Loss 5.2976 Accuracy 0.0467
Epoch 7 Batch 50 Training Loss 5.4500 Accuracy 0.0417
Epoch 7 Batch 100 Training Loss 5.4283 Accuracy 0.0420

Epoch 7 Training Loss 5.4087 Accuracy 0.0423
Time taken for 1 epoch: 72.19678902626038 secs

validating
Epoch 7 Validation Loss 5.8080 Accuracy 0.0482
Saving checkpoint for epoch 7 at ./checkpoints/trainbacktranslation_7_/ckpt-7


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch 8 Batch 0 Training Loss 5.3441 Accuracy 0.0413
Epoch 8 Batch 50 Training Loss 5.2277 Accuracy 0.0451
Epoch 8 Batch 100 Training Loss 5.1999 Accuracy 0.0462

Epoch 8 Training Loss 5.1813 Accuracy 0.0464
Time taken for 1 epoch: 71.3016893863678 secs

validating
Epoch 8 Validation Loss 5.6016 Accuracy 0.0525
Saving checkpoint for epoch 8 at ./checkpoints/trainbacktranslation_7_/ckpt-8


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch 9 Batch 0 Training Loss 4.9112 Accuracy 0.0505
Epoch 9 Batch 50 Training Loss 4.9688 Accuracy 0.0494
Epoch 9 Batch 100 Training Loss 4.9718 Accuracy 0.0500

Epoch 9 Training Loss 4.9643 Accuracy 0.0504
Time taken for 1 epoch: 71.43145513534546 secs

validating
Epoch 9 Validation Loss 5.5412 Accuracy 0.0555
Saving checkpoint for epoch 9 at ./checkpoints/trainbacktranslation_7_/ckpt-9


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch 10 Batch 0 Training Loss 4.7886 Accuracy 0.0573
Epoch 10 Batch 50 Training Loss 4.7408 Accuracy 0.0542
Epoch 10 Batch 100 Training Loss 4.7522 Accuracy 0.0540

Epoch 10 Training Loss 4.7545 Accuracy 0.0543
Time taken for 1 epoch: 71.1930582523346 secs

validating
Epoch 10 Validation Loss 5.4329 Accuracy 0.0583
Saving checkpoint for epoch 10 at ./checkpoints/trainbacktranslation_7_/ckpt-10


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch 11 Batch 0 Training Loss 4.4333 Accuracy 0.0628
Epoch 11 Batch 50 Training Loss 4.5672 Accuracy 0.0580
Epoch 11 Batch 100 Training Loss 4.5503 Accuracy 0.0582

Epoch 11 Training Loss 4.5440 Accuracy 0.0581
Time taken for 1 epoch: 71.52901887893677 secs

validating
Epoch 11 Validation Loss 5.3699 Accuracy 0.0606
Saving checkpoint for epoch 11 at ./checkpoints/trainbacktranslation_7_/ckpt-11


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch 12 Batch 0 Training Loss 4.4643 Accuracy 0.0555
Epoch 12 Batch 50 Training Loss 4.3485 Accuracy 0.0620
Epoch 12 Batch 100 Training Loss 4.3397 Accuracy 0.0622

Epoch 12 Training Loss 4.3302 Accuracy 0.0622
Time taken for 1 epoch: 71.09674954414368 secs

validating
Epoch 12 Validation Loss 5.3518 Accuracy 0.0628
Saving checkpoint for epoch 12 at ./checkpoints/trainbacktranslation_7_/ckpt-12


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch 13 Batch 0 Training Loss 4.1635 Accuracy 0.0674
Epoch 13 Batch 50 Training Loss 4.1115 Accuracy 0.0655
Epoch 13 Batch 100 Training Loss 4.1183 Accuracy 0.0661

Epoch 13 Training Loss 4.1206 Accuracy 0.0659
Time taken for 1 epoch: 71.07215571403503 secs

validating
Epoch 13 Validation Loss 5.2638 Accuracy 0.0648
Saving checkpoint for epoch 13 at ./checkpoints/trainbacktranslation_7_/ckpt-13


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch 14 Batch 0 Training Loss 3.9964 Accuracy 0.0632
Epoch 14 Batch 50 Training Loss 3.8936 Accuracy 0.0692
Epoch 14 Batch 100 Training Loss 3.8967 Accuracy 0.0693

Epoch 14 Training Loss 3.9046 Accuracy 0.0695
Time taken for 1 epoch: 71.2663164138794 secs

validating
Epoch 14 Validation Loss 5.2384 Accuracy 0.0665
Saving checkpoint for epoch 14 at ./checkpoints/trainbacktranslation_7_/ckpt-14


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch 15 Batch 0 Training Loss 3.7747 Accuracy 0.0692
Epoch 15 Batch 50 Training Loss 3.6475 Accuracy 0.0736
Epoch 15 Batch 100 Training Loss 3.6722 Accuracy 0.0735

Epoch 15 Training Loss 3.6806 Accuracy 0.0738
Time taken for 1 epoch: 71.19672870635986 secs

validating
Epoch 15 Validation Loss 5.2358 Accuracy 0.0684
Saving checkpoint for epoch 15 at ./checkpoints/trainbacktranslation_7_/ckpt-15


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch 16 Batch 0 Training Loss 3.3784 Accuracy 0.0760
Epoch 16 Batch 50 Training Loss 3.4261 Accuracy 0.0777
Epoch 16 Batch 100 Training Loss 3.4456 Accuracy 0.0778

Epoch 16 Training Loss 3.4598 Accuracy 0.0781
Time taken for 1 epoch: 71.25226998329163 secs

validating
Epoch 16 Validation Loss 5.2080 Accuracy 0.0703
Saving checkpoint for epoch 16 at ./checkpoints/trainbacktranslation_7_/ckpt-16


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch 17 Batch 0 Training Loss 3.1272 Accuracy 0.0856
Epoch 17 Batch 50 Training Loss 3.2017 Accuracy 0.0830
Epoch 17 Batch 100 Training Loss 3.2219 Accuracy 0.0830

Epoch 17 Training Loss 3.2368 Accuracy 0.0824
Time taken for 1 epoch: 71.49773907661438 secs

validating
Epoch 17 Validation Loss 5.1757 Accuracy 0.0700
Saving checkpoint for epoch 17 at ./checkpoints/trainbacktranslation_7_/ckpt-17


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch 18 Batch 0 Training Loss 2.9163 Accuracy 0.0854
Epoch 18 Batch 50 Training Loss 2.9669 Accuracy 0.0869
Epoch 18 Batch 100 Training Loss 2.9972 Accuracy 0.0875

Epoch 18 Training Loss 3.0125 Accuracy 0.0874
Time taken for 1 epoch: 71.27120208740234 secs

validating
Epoch 18 Validation Loss 5.1471 Accuracy 0.0714
Saving checkpoint for epoch 18 at ./checkpoints/trainbacktranslation_7_/ckpt-18


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch 19 Batch 0 Training Loss 2.5318 Accuracy 0.0869
Epoch 19 Batch 50 Training Loss 2.7213 Accuracy 0.0942
Epoch 19 Batch 100 Training Loss 2.7632 Accuracy 0.0936

Epoch 19 Training Loss 2.7899 Accuracy 0.0930
Time taken for 1 epoch: 70.94375562667847 secs

validating
Epoch 19 Validation Loss 5.2097 Accuracy 0.0728


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch 20 Batch 0 Training Loss 2.5151 Accuracy 0.0946
Epoch 20 Batch 50 Training Loss 2.5004 Accuracy 0.1003
Epoch 20 Batch 100 Training Loss 2.5390 Accuracy 0.0995

Epoch 20 Training Loss 2.5678 Accuracy 0.0987
Time taken for 1 epoch: 70.01166892051697 secs

validating
Epoch 20 Validation Loss 5.2510 Accuracy 0.0719


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch 21 Batch 0 Training Loss 2.1517 Accuracy 0.1071
Epoch 21 Batch 50 Training Loss 2.2865 Accuracy 0.1066
Epoch 21 Batch 100 Training Loss 2.3284 Accuracy 0.1050

Epoch 21 Training Loss 2.3563 Accuracy 0.1043
Time taken for 1 epoch: 70.05898976325989 secs

validating
Epoch 21 Validation Loss 5.2846 Accuracy 0.0721


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch 22 Batch 0 Training Loss 1.8651 Accuracy 0.1213
Epoch 22 Batch 50 Training Loss 2.0522 Accuracy 0.1139
Epoch 22 Batch 100 Training Loss 2.1097 Accuracy 0.1117

Epoch 22 Training Loss 2.1485 Accuracy 0.1106
Time taken for 1 epoch: 70.0838029384613 secs

validating
Epoch 22 Validation Loss 5.3428 Accuracy 0.0722


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch 23 Batch 0 Training Loss 1.7901 Accuracy 0.1227
Epoch 23 Batch 50 Training Loss 1.8717 Accuracy 0.1196
Epoch 23 Batch 100 Training Loss 1.9258 Accuracy 0.1180

Epoch 23 Training Loss 1.9576 Accuracy 0.1166
Time taken for 1 epoch: 69.97788000106812 secs

validating
Epoch 23 Validation Loss 5.4500 Accuracy 0.0722


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch 24 Batch 0 Training Loss 1.5981 Accuracy 0.1305
Epoch 24 Batch 50 Training Loss 1.6785 Accuracy 0.1278
Epoch 24 Batch 100 Training Loss 1.7303 Accuracy 0.1247

Epoch 24 Training Loss 1.7729 Accuracy 0.1229
Time taken for 1 epoch: 70.33239889144897 secs

validating
Epoch 24 Validation Loss 5.5422 Accuracy 0.0724


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch 25 Batch 0 Training Loss 1.6733 Accuracy 0.1134
Epoch 25 Batch 50 Training Loss 1.5301 Accuracy 0.1338
Epoch 25 Batch 100 Training Loss 1.5705 Accuracy 0.1311

Epoch 25 Training Loss 1.6091 Accuracy 0.1287
Time taken for 1 epoch: 70.3958101272583 secs

validating
Epoch 25 Validation Loss 5.6604 Accuracy 0.0717


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch 26 Batch 0 Training Loss 1.3636 Accuracy 0.1398
Epoch 26 Batch 50 Training Loss 1.3602 Accuracy 0.1406
Epoch 26 Batch 100 Training Loss 1.4260 Accuracy 0.1365

Epoch 26 Training Loss 1.4626 Accuracy 0.1341
Time taken for 1 epoch: 69.97698497772217 secs

validating
Epoch 26 Validation Loss 5.7734 Accuracy 0.0709


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch 27 Batch 0 Training Loss 1.2430 Accuracy 0.1740
Epoch 27 Batch 50 Training Loss 1.2485 Accuracy 0.1435
Epoch 27 Batch 100 Training Loss 1.2915 Accuracy 0.1410


KeyboardInterrupt: ignored

Evaluate best model

In [42]:
# load model

# if a checkpoint exists, restore the latest checkpoint.
if ckpt_manager.latest_checkpoint:
  ckpt.restore(ckpt_manager.latest_checkpoint)
  print ('Latest checkpoint restored!!')

Latest checkpoint restored!!


In [43]:
transformer

<transformer.Transformer at 0x7f6720e2d668>

In [44]:
val_loss.reset_states()
val_accuracy.reset_states()
  
for (batch, (inp, tar)) in tqdm(enumerate(tensor_val)):
  val_step(inp, tar)
  
print ('Validation Loss {:.4f} Accuracy {:.4f}'.format(
                                          val_loss.result(), 
                                          val_accuracy.result()))


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Validation Loss 5.1471 Accuracy 0.0714


In [0]:
def generate_predictions(inp_sentences):

  if len(inp_sentences.get_shape())==1:
    encoder_input = tf.expand_dims(inp_sentences, 0)
    decoder_input = [english_word2id["<start>"]]
    output = tf.expand_dims(decoder_input, 0)

  else:
    encoder_input = inp_sentences
    decoder_input = [english_word2id["<start>"]]*inp_sentences.get_shape()[0]
    output = tf.expand_dims(decoder_input, -1)


  # encoder_input = tf.expand_dims(inp_sentence, 0)
  
  # decoder_input = [english_word2id["<start>"]]
  # output = tf.expand_dims(decoder_input, 0)
  
  for i in range(pe_target):
    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
        encoder_input, output)
  
    # predictions.shape == (batch_size, seq_len, vocab_size)
    predictions, attention_weights = transformer(encoder_input, 
                                                 output,
                                                 False,
                                                 enc_padding_mask,
                                                 combined_mask,
                                                 dec_padding_mask)
    
    # select the last word from the seq_len dimension
    predictions = predictions[: ,-1:, :]  # (batch_size, 1, vocab_size)

    predicted_id = tf.cast(tf.argmax(predictions, axis=-1), tf.int32)
    
    # # return the result if the predicted_id is equal to the end token
    # if predicted_id == english_word2id["<eos>"]:
    if tf.reduce_sum(tf.cast((tf.reduce_sum(tf.cast(output == english_word2id["<eos>"], tf.float32),axis=1)>0), tf.float32)) == inp.get_shape()[0]:
      return output, attention_weights
    #   return tf.squeeze(output, axis=0), attention_weights
    
    # concatentate the predicted_id to the output which is given to the decoder
    # as its input.
    output = tf.concat([output, predicted_id], axis=-1)

  # return tf.squeeze(output, axis=0), attention_weights
  return output, attention_weights

In [46]:
all_preds = []
for (batch_i, (inp, tar)) in tqdm(enumerate(tensor_val.unbatch().batch(128))):

  preds, attention = generate_predictions(inp)
  all_preds.append(preds)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [47]:
translated_sentences = []

for k in tqdm(all_preds):
  for i in k:
    sentence_english = []
    for j in i.numpy()[1:]:
      if j==0 or j==english_word2id["<eos>"]:
        break
      sentence_english.append(english_id2word[j])

    sentence_english = " ".join(sentence_english)

    translated_sentences.append(sentence_english)

translated_sentences = "\n".join(translated_sentences)

with open("predictions.txt","w") as f:
  f.write(translated_sentences)

HBox(children=(IntProgress(value=0, max=18), HTML(value='')))




In [48]:
!pip install sacrebleu
!python evaluator.py --input-file-path ./predictions.txt --target-file-path ./data/split_val.lang1 --do-not-run-model 

final avg bleu score: 10.67


Self-Training Monolingual Data Generation

In [0]:
amount_data_start = 0
amount_data_end = 100000

In [24]:
with open(data_path/"unaligned.fr","r") as f:
    french_monolingual = f.read().strip()
print(len(french_monolingual.split("\n")), french_monolingual[:200])

474000 Nous n’aurions pas pu dégager d’accord sur un calendrier de conclusion de la CIG sans l’engagement politique de mes collègues du Conseil européen.
(DE) Madame la Présidente, Monsieur le Commissaire, M


In [25]:
def transform_test_data(lang1, dict_word2id, amount_data_start=None,amount_data_end=None):
  lines = lang1.split("\n")
  if amount_data_start or amount_data_end:
    lines = lines[amount_data_start:amount_data_end]
  data = []

  for line in lines:
    line2id = [dict_word2id["<start>"]]
    for word in line.split():
      try:
        line2id.append(dict_word2id[word])
      except:
        line2id.append(dict_word2id["<unk>"])
    line2id.append(dict_word2id["<eos>"])
    data.append(line2id)

  return data

french_monolingual_data = transform_test_data(french_monolingual, french_word2id, amount_data_start, amount_data_end)
len(french_monolingual_data)

100000

In [26]:
max([len(i) for i in french_monolingual_data])

160

In [27]:
with open(data_path/"unaligned_tokenized_rempunc.en","r") as f:
    english_monolingual = f.read().strip().lower()
print(len(english_monolingual.split("\n")), english_monolingual[:200])
english_monolingual_data = transform_test_data(english_monolingual, english_word2id)
print(len(english_monolingual_data))
max([len(i) for i in english_monolingual_data])

474000 for the second phase of the trials we just had different sizes small medium large and extra - large it 's true
geng had been my host the previous january when i was the first us defense secretary to v
474000


182

In [28]:
pe_input = max([len(i) for i in french_monolingual_data])
pe_target = max([len(i) for i in english_monolingual_data])
pe_input, pe_target

(160, 182)

In [0]:
tensor_test = tf.data.Dataset.from_tensor_slices((
    tf.keras.preprocessing.sequence.pad_sequences(french_monolingual_data, padding='post')
)).batch(BATCH_SIZE, drop_remainder=False)

In [0]:
transformer = Transformer(
    num_layers=num_layers, d_model=d_model, num_heads=num_heads, dff=dff, 
    input_vocab_size=input_vocab_size, target_vocab_size=target_vocab_size, 
    pe_input=pe_input, pe_target=pe_target, rate=dropout_rate)

In [31]:
experiment_number = "backtranslation_7_"

checkpoint_path = "./checkpoints/train"+experiment_number

ckpt = tf.train.Checkpoint(transformer=transformer,
                           optimizer=optimizer)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=1)

if ckpt_manager.latest_checkpoint:
  ckpt.restore(ckpt_manager.latest_checkpoint)
  print ('Latest checkpoint restored!!')

Latest checkpoint restored!!


In [0]:
all_preds = []
for batch_i, inp in tqdm(enumerate(tensor_test.unbatch().batch(128)),total=len(french_monolingual_data) // 128 + 1):
  preds, attention = generate_predictions(inp)
  all_preds.append(preds)

HBox(children=(IntProgress(value=0, max=782), HTML(value='')))

In [0]:
translated_sentences = []

for k in tqdm(all_preds):
  for i in k:
    sentence_english = []
    for j in i.numpy()[1:]:
      if j==0 or j==english_word2id["<eos>"]:
        break
      sentence_english.append(english_id2word[j])

    sentence_english = " ".join(sentence_english)

    translated_sentences.append(sentence_english)

translated_sentences = "\n".join(translated_sentences)

with open("predictions_french_monolingual_"+str(amount_data_start)+"_"+str(amount_data_end)+".txt","w") as f:
  f.write(translated_sentences)