In [28]:
!nvidia-smi

Fri Apr 17 18:08:05 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.64.00    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   77C    P0    33W /  70W |   8483MiB / 15079MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
+-------

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [29]:
from pathlib import Path
import os
from collections import Counter
import numpy as np
import time
from tqdm.notebook import tqdm

np.random.seed(8080)

data_path = Path("/content/drive/My Drive/Adv Projects in ML/data")
print(data_path)
print(os.listdir(data_path))

os.chdir("/content/drive/My Drive/Adv Projects in ML/")

!nvidia-smi

/content/drive/My Drive/Adv Projects in ML/data
['train.lang2', 'unaligned.en', 'unaligned.fr', 'train.lang1', 'split_train.lang1', 'split_val.lang2', 'split_train.lang2', 'split_val.lang1', 'unalignedtry.en', 'split_train_unaligned_tokenized_rempunc.en', 'split_val_unaligned_tokenized_rempunc.en', 'split_train_unaligned_tokenized.en', 'split_val_unaligned_tokenized.en', 'split_train_unaligned_tokenized_rempunc.fr', 'split_val_unaligned_tokenized_rempunc.fr', 'split_train_unaligned_tokenized.fr', 'split_val_unaligned_tokenized.fr', 'bpe', 'unaligned_tokenized_rempunc.en']
Fri Apr 17 18:08:13 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.64.00    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Co

In [30]:
%tensorflow_version 2.x
import tensorflow as tf
from transformer import Transformer, CustomSchedule, create_masks

print("Tensorflow version " + tf.__version__)

tf.random.set_seed(8080)
# make sure numpy seeded

Tensorflow version 2.2.0-rc3


In [31]:
# read data
with open(data_path/"split_train.lang1","r") as f:
    english = f.read()
print(len(english.split("\n")), english[:200])
    
with open(data_path/"split_train.lang2","r") as f:
    french = f.read()
print(len(french.split("\n")), french[:200])

with open(data_path/"split_val.lang1","r") as f:
    english_val = f.read()
print(len(english_val.split("\n")), english_val[:200])

with open(data_path/"split_val.lang2","r") as f:
    french_val = f.read()
print(len(french_val.split("\n")), french_val[:200])

with open("predictions_english_st.txt","r") as f:
    french_st = f.read().strip()
print(len(french_st.split("\n")), french_st[:200])

with open(data_path/"unaligned.en","r") as f:
    english_monolingual = f.read().strip()
# english_monolingual = english_monolingual.split("\n")[:len(french_st.split("\n"))]
print(len(english_monolingual.split("\n")), english_monolingual[:200])

with open("predictions_french_bt.txt","r") as f:
    english_bt = f.read().strip()
print(len(english_bt.split("\n")), english_bt[:200])

with open(data_path/"unaligned.fr","r") as f:
    french_monolingual = f.read().strip()
print(len(french_monolingual.split("\n")), french_monolingual[:200])

# create vocab
english_vocab = list(set(english.replace("\n", " <eos> ").split()))
french_vocab = list(set(french.replace("\n", " <eos> ").split()))
len(english_vocab), len(french_vocab)

english_counter = Counter(english.replace("\n", " <eos> ").split())
french_counter = Counter(french.replace("\n", " <eos> ").split())
len(english_counter), len(french_counter)

english_counter.update({"<unk>":0})
french_counter.update({"<unk>":0})
english_counter.update({"<start>":0})
french_counter.update({"<start>":0})
len(english_counter), len(french_counter)

english_vocab = list(english_counter.keys())
french_vocab = list(french_counter.keys())

# # trim vocab to 10k+2, 12k+2
# english_vocab = ["<start>","<unk>"]
# for i in english_counter.most_common(10000):
#   english_vocab.append(i[0])
# french_vocab = ["<start>","<unk>"]
# for i in french_counter.most_common(12000):
#   french_vocab.append(i[0])

english_word2id = {}
english_id2word = {}
french_word2id = {}
french_id2word = {}

# start enumerate from 1 so that 0 is reserved for padding seqs 
for i, w in enumerate(english_vocab, start=1):
  english_word2id[w] = i
  english_id2word[i] = w

for i, w in enumerate(french_vocab, start=1):
  french_word2id[w] = i
  french_id2word[i] = w

len(english_word2id), len(english_id2word), len(french_word2id), len(french_id2word)

def transform_data(english_lang1, french_lang2):
  english_lines = english_lang1.split("\n")
  french_lines = french_lang2.split("\n")

  data_english = []
  data_french = []

  for line in english_lines:
    line2id = [english_word2id["<start>"]]
    for word in line.split():
      try:
        line2id.append(english_word2id[word])
      except:
        line2id.append(english_word2id["<unk>"])
    line2id.append(english_word2id["<eos>"])
    data_english.append(line2id)

  for line in french_lines:
    line2id = [french_word2id["<start>"]]
    for word in line.split():
      try:
        line2id.append(french_word2id[word])
      except:
        line2id.append(french_word2id["<unk>"])
    line2id.append(french_word2id["<eos>"])
    data_french.append(line2id)

  print(len(data_english), len(data_french))
  return data_english, data_french

data_english, data_french = transform_data(english, french)
data_english_val, data_french_val = transform_data(english_val, french_val)

data_english_monolingual, data_french_st = transform_data(english_monolingual, french_st)
data_english_monolingual = data_english_monolingual[:len(data_french_st)]

data_english_bt, data_french_monolingual = transform_data(english_bt, french_monolingual)
data_french_monolingual = data_french_monolingual[:len(data_english_bt)]

english_id2word[54], len(data_english), len(data_french), len(data_english_val), len(data_french_val), len(data_english_monolingual), len(data_french_st), len(data_english_bt), len(data_french_monolingual)

8800 as mr de castro is not present mr le foll who is replacing mr de castro has the floor
on the other hand if you 're visiting an underdeveloped country and 25 dollars buys you a gourmet meal it 's exorb
8800 Comme M. De Castro est absent , M. Le Foll , qui le remplace , a la parole .
D' un autre côté , si vous êtes dans un pays en voie de développement , où 25 dollars peuvent vous obtenir un repas de luxe
2200 what action does the council intend to take in the face of this seriously discriminatory attitude which runs contrary to the principles of the eu
where would you like to go next
if that were not enoug
2200 Quelles mesures le Conseil compte-t-il adopter face à cette attitude qui constitue une grave discrimination et est contraire aux principes sur lesquels l' Union européenne est fondée ?
Où souhaiteriez
200000 Pour la deuxième fois , nous avons des frais d’ être prises par rapport à des petites entreprises et que c' est vrai .
La dernière séance , j' ai été dit que j' étais le

('gas', 8800, 8800, 2200, 2200, 200000, 200000, 200000, 200000)

In [0]:
np.savez("data_and_vocab_bt_st_upsample_.npz", data_english=data_english, data_french=data_french, data_english_val=data_english_val, data_french_val=data_french_val, 
         data_english_monolingual=data_english_monolingual, data_french_st=data_french_st,
         data_english_bt=data_english_bt, data_french_monolingual=data_french_monolingual,
         english_word2id=english_word2id, english_id2word=english_id2word, french_word2id=french_word2id,french_id2word=french_id2word)

In [32]:
BUFFER_SIZE = len(data_english)
BATCH_SIZE = 64
EPOCHS = 2
print("No. of batches: ", np.ceil(len(data_english_monolingual)/BATCH_SIZE))
print("No. of batches: ", np.ceil(len(data_english)/BATCH_SIZE))
repeat_factor = len(data_english_monolingual) // len(data_english) + 1
print(repeat_factor)

# transformer hyperparams
num_layers = 2
d_model = 1024
dff = 1024
num_heads = 8
input_vocab_size = len(english_vocab) + 1
target_vocab_size = len(french_vocab) + 1
dropout_rate = 0.4
p_wd_st = 0.3
p_wd_bt = 0.1
pe_input = max(max([len(i) for i in data_english]),max([len(i) for i in data_english_val]),max([len(i) for i in data_english_monolingual]),max([len(i) for i in data_english_bt]))
pe_target = max(max([len(i) for i in data_french]),max([len(i) for i in data_french_val]),max([len(i) for i in data_french_st]),max([len(i) for i in data_french_monolingual]))

# pe_input = 200
# pe_target = 230
pe_input, pe_target

No. of batches:  3125.0
No. of batches:  138.0
23


(184, 222)

In [0]:
tensor_train = tf.data.Dataset.from_tensor_slices((
    tf.keras.preprocessing.sequence.pad_sequences(data_english, padding='post'),
    tf.keras.preprocessing.sequence.pad_sequences(data_french, padding='post')
)).shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=False)
tensor_val = tf.data.Dataset.from_tensor_slices((
    tf.keras.preprocessing.sequence.pad_sequences(data_english_val, padding='post'),
    tf.keras.preprocessing.sequence.pad_sequences(data_french_val, padding='post')
)).batch(BATCH_SIZE, drop_remainder=False)

In [0]:
class DatasetGenerator_ST(tf.data.Dataset):
  def _generator():
    inp_pad = tf.keras.preprocessing.sequence.pad_sequences(data_english_monolingual, padding='post').shape[1]
    tar_pad = tf.keras.preprocessing.sequence.pad_sequences(data_french_st, padding='post').shape[1]
    indexes = np.arange(len(data_french_st))
    np.random.shuffle(indexes)
    data1 = np.array(data_english_monolingual)[indexes]
    data2 = np.array(data_french_st)[indexes]
    for i in range(len(data2)):
      tar = data2[i]
      tar = np.pad(tar, (0,tar_pad-len(tar)))
      aug = data1[i]
      if np.random.choice(['drop','swap']) == 'drop':
        drop_idxs = np.random.binomial(1,p_wd_st,len(aug))
        drop_idxs = np.where(drop_idxs==1)
        aug = np.delete(aug,drop_idxs)
      else:
        swap_idx = np.random.choice(np.arange(1,len(aug)))
        tmp = aug[swap_idx]
        aug[swap_idx] = aug[swap_idx-1]
        aug[swap_idx-1] = tmp
      aug = np.pad(aug, (0,inp_pad-len(aug)))
      yield aug, tar

  def __new__(cls):
      return tf.data.Dataset.from_generator(
          cls._generator,
          output_types=(tf.dtypes.int32,tf.dtypes.int32),
          output_shapes=(None,None)
      )

class DatasetGenerator_BT(tf.data.Dataset):
  def _generator():
    inp_pad = tf.keras.preprocessing.sequence.pad_sequences(data_english_bt, padding='post').shape[1]
    tar_pad = tf.keras.preprocessing.sequence.pad_sequences(data_french_monolingual, padding='post').shape[1]
    indexes = np.arange(len(data_french_monolingual))
    np.random.shuffle(indexes)
    data1 = np.array(data_english_bt)[indexes]
    data2 = np.array(data_french_monolingual)[indexes]
    for i in range(len(data2)):
      tar = data2[i]
      tar = np.pad(tar, (0,tar_pad-len(tar)))
      aug = data1[i]
      if np.random.choice(['drop','swap']) == 'drop':
        drop_idxs = np.random.binomial(1,p_wd_bt,len(aug))
        drop_idxs = np.where(drop_idxs==1)
        aug = np.delete(aug,drop_idxs)
      else:
        swap_idx = np.random.choice(np.arange(1,len(aug)))
        tmp = aug[swap_idx]
        aug[swap_idx] = aug[swap_idx-1]
        aug[swap_idx-1] = tmp
      aug = np.pad(aug, (0,inp_pad-len(aug)))
      yield aug, tar

  def __new__(cls):
      return tf.data.Dataset.from_generator(
          cls._generator,
          output_types=(tf.dtypes.int32,tf.dtypes.int32),
          output_shapes=(None,None)
      )

tensor_st = DatasetGenerator_ST().batch(BATCH_SIZE, drop_remainder=False).prefetch(tf.data.experimental.AUTOTUNE)
tensor_bt = DatasetGenerator_BT().batch(BATCH_SIZE, drop_remainder=False).prefetch(tf.data.experimental.AUTOTUNE)

In [35]:
transformer = Transformer(
    num_layers=num_layers, d_model=d_model, num_heads=num_heads, dff=dff, 
    input_vocab_size=input_vocab_size, target_vocab_size=target_vocab_size, 
    pe_input=pe_input, pe_target=pe_target, rate=dropout_rate)

temp_input = tf.random.uniform((BATCH_SIZE, pe_input), dtype=tf.int64, minval=0, maxval=200)
temp_target = tf.random.uniform((BATCH_SIZE, pe_target), dtype=tf.int64, minval=0, maxval=200)

fn_out, _ = transformer(temp_input, temp_target, training=False, 
                               enc_padding_mask=None, 
                               look_ahead_mask=None,
                               dec_padding_mask=None)

fn_out.shape  # (batch_size, tar_seq_len, target_vocab_size)

TensorShape([64, 222, 16315])

In [36]:
transformer.summary()

Model: "transformer_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
encoder_1 (Encoder)          multiple                  25263104  
_________________________________________________________________
decoder_1 (Decoder)          multiple                  37710848  
_________________________________________________________________
dense_65 (Dense)             multiple                  16722875  
Total params: 79,696,827
Trainable params: 79,696,827
Non-trainable params: 0
_________________________________________________________________


In [0]:
learning_rate = CustomSchedule(d_model)

optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, 
                                     epsilon=1e-9)

loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

train_loss = tf.keras.metrics.Mean(name='loss')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
    name='train_accuracy')

val_loss = tf.keras.metrics.Mean(name='loss')
val_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
    name='val_accuracy')

def loss_function(real, pred):
  mask = tf.math.logical_not(tf.math.equal(real, 0))
  loss_ = loss_object(real, pred)

  mask = tf.cast(mask, dtype=loss_.dtype)
  loss_ *= mask
  
  return tf.reduce_sum(loss_)/tf.reduce_sum(mask)

In [0]:
experiment_number = "_bt_st_3_upsample_"

checkpoint_path = "./checkpoints/train"+experiment_number
checkpoint_path_acc = "./checkpoints/train"+experiment_number+"_acc_"

ckpt = tf.train.Checkpoint(transformer=transformer,
                           optimizer=optimizer)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=3)
ckpt_manager_acc = tf.train.CheckpointManager(ckpt, checkpoint_path_acc, max_to_keep=3)

writer_train = tf.summary.create_file_writer("log_dir/"+experiment_number+"_train")
writer_val = tf.summary.create_file_writer("log_dir/"+experiment_number+"_val")

# # if a checkpoint exists, restore the latest checkpoint.
# if ckpt_manager.latest_checkpoint:
#   ckpt.restore(ckpt_manager.latest_checkpoint)
#   print ('Latest checkpoint restored!!')

In [0]:
# The @tf.function trace-compiles train_step into a TF graph for faster
# execution. The function specializes to the precise shape of the argument
# tensors. To avoid re-tracing due to the variable sequence lengths or variable
# batch sizes (the last batch is smaller), use input_signature to specify
# more generic shapes.

train_step_signature = [
    tf.TensorSpec(shape=(None, None), dtype=tf.int32),
    tf.TensorSpec(shape=(None, None), dtype=tf.int32),
]

@tf.function(input_signature=train_step_signature)
def train_step(inp, tar):
  
  tar_inp = tar[:, :-1]
  tar_real = tar[:, 1:]
  
  enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp, tar_inp)

  with tf.GradientTape() as tape:
    predictions, _ = transformer(inp, tar_inp, 
                                 True, 
                                 enc_padding_mask, 
                                 combined_mask, 
                                 dec_padding_mask)
    loss = loss_function(tar_real, predictions)

  gradients = tape.gradient(loss, transformer.trainable_variables)
  optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))
  
  train_loss(loss)
  train_accuracy(tar_real, predictions)

@tf.function(input_signature=train_step_signature)
def val_step(inp, tar):
  
  tar_inp = tar[:, :-1]
  tar_real = tar[:, 1:]
  
  enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp, tar_inp)

  predictions, _ = transformer(inp, tar_inp, 
                                False, 
                                enc_padding_mask, 
                                combined_mask, 
                                dec_padding_mask)
  loss = loss_function(tar_real, predictions)
  
  val_loss(loss)
  val_accuracy(tar_real, predictions)


In [27]:
best_val_loss = np.inf
best_val_acc = 0

for epoch in range(EPOCHS):
  start = time.time()

  train_loss.reset_states()
  train_accuracy.reset_states()

  print("training ST data")
  tensor_st = DatasetGenerator_ST().batch(BATCH_SIZE, drop_remainder=False).prefetch(tf.data.experimental.AUTOTUNE)

  for (batch, (inp, tar)) in tqdm(enumerate(tensor_st), total=len(data_english_monolingual)//BATCH_SIZE+1):
    train_step(inp, tar)
    if batch % 100 == 0:
      print ('Epoch {} Batch {} Training Loss {:.4f} Accuracy {:.4f}'.format(
          epoch + 1, batch, train_loss.result(), train_accuracy.result()))

  print("training BT data")
  tensor_bt = DatasetGenerator_BT().batch(BATCH_SIZE, drop_remainder=False).prefetch(tf.data.experimental.AUTOTUNE)

  for (batch, (inp, tar)) in tqdm(enumerate(tensor_bt), total=len(data_english_bt)//BATCH_SIZE+1):
    train_step(inp, tar)
    if batch % 100 == 0:
      print ('Epoch {} Batch {} Training Loss {:.4f} Accuracy {:.4f}'.format(
          epoch + 1, batch, train_loss.result(), train_accuracy.result()))

  for iteration_i in range(repeat_factor):

    train_loss.reset_states()
    train_accuracy.reset_states()
    
    val_loss.reset_states()
    val_accuracy.reset_states()

    print("training Parallel data")

    for (batch, (inp, tar)) in tqdm(enumerate(tensor_train)):
      train_step(inp, tar)
      if batch % 50 == 0:
        print ('Epoch {} iteration_i {} Batch {} Training Loss {:.4f} Accuracy {:.4f}'.format(
            epoch + 1, iteration_i, batch, train_loss.result(), train_accuracy.result()))
      
    print ('Epoch {} iteration_i {} Training Loss {:.4f} Accuracy {:.4f}'.format(epoch + 1, 
                                                  iteration_i,
                                                  train_loss.result(), 
                                                  train_accuracy.result()))
    print ('Time taken for 1 epoch: {} secs\n'.format(time.time() - start))

    with writer_train.as_default():
      tf.summary.scalar('train_loss', train_loss.result(), step=epoch)

    print("validating")
    for (batch, (inp, tar)) in enumerate(tensor_val):
      val_step(inp, tar)
    
    print ('Epoch {} iteration_i {} Validation Loss {:.4f} Accuracy {:.4f}'.format(epoch + 1,
                                              iteration_i,
                                              val_loss.result(), 
                                              val_accuracy.result()))
    if best_val_loss > val_loss.result():
      best_val_loss = val_loss.result()
      ckpt_save_path = ckpt_manager.save()
      print ('Saving checkpoint for epoch {} iteration_i {} at {}'.format(epoch+1,
                                                          iteration_i,
                                                          ckpt_save_path))
    if best_val_acc < val_accuracy.result():
      best_val_acc = val_accuracy.result()
      ckpt_save_path = ckpt_manager_acc.save()
      print ('Saving checkpoint for epoch {} iteration_i {} at {}'.format(epoch+1,
                                                          iteration_i,
                                                          ckpt_save_path))
    
    with writer_val.as_default():
      tf.summary.scalar('val_loss', val_loss.result(), step=epoch)

training ST data


HBox(children=(IntProgress(value=0, max=3126), HTML(value='')))

Epoch 1 Batch 0 Training Loss 3.4775 Accuracy 0.0340
Epoch 1 Batch 100 Training Loss 2.5986 Accuracy 0.0446
Epoch 1 Batch 200 Training Loss 2.4912 Accuracy 0.0462
Epoch 1 Batch 300 Training Loss 2.4245 Accuracy 0.0475
Epoch 1 Batch 400 Training Loss 2.3782 Accuracy 0.0482
Epoch 1 Batch 500 Training Loss 2.3437 Accuracy 0.0488
Epoch 1 Batch 600 Training Loss 2.3146 Accuracy 0.0495
Epoch 1 Batch 700 Training Loss 2.2925 Accuracy 0.0500
Epoch 1 Batch 800 Training Loss 2.2730 Accuracy 0.0503
Epoch 1 Batch 900 Training Loss 2.2561 Accuracy 0.0506
Epoch 1 Batch 1000 Training Loss 2.2402 Accuracy 0.0509
Epoch 1 Batch 1100 Training Loss 2.2267 Accuracy 0.0511
Epoch 1 Batch 1200 Training Loss 2.2140 Accuracy 0.0514
Epoch 1 Batch 1300 Training Loss 2.2023 Accuracy 0.0516
Epoch 1 Batch 1400 Training Loss 2.1919 Accuracy 0.0517
Epoch 1 Batch 1500 Training Loss 2.1812 Accuracy 0.0519
Epoch 1 Batch 1600 Training Loss 2.1719 Accuracy 0.0521
Epoch 1 Batch 1700 Training Loss 2.1633 Accuracy 0.0523
Epoc

HBox(children=(IntProgress(value=0, max=3126), HTML(value='')))

Epoch 1 Batch 0 Training Loss 2.0689 Accuracy 0.0540
Epoch 1 Batch 100 Training Loss 2.1272 Accuracy 0.0537
Epoch 1 Batch 200 Training Loss 2.1689 Accuracy 0.0534
Epoch 1 Batch 300 Training Loss 2.2053 Accuracy 0.0532
Epoch 1 Batch 400 Training Loss 2.2375 Accuracy 0.0530
Epoch 1 Batch 500 Training Loss 2.2668 Accuracy 0.0528
Epoch 1 Batch 600 Training Loss 2.2938 Accuracy 0.0527
Epoch 1 Batch 700 Training Loss 2.3187 Accuracy 0.0526
Epoch 1 Batch 800 Training Loss 2.3415 Accuracy 0.0524
Epoch 1 Batch 900 Training Loss 2.3627 Accuracy 0.0523
Epoch 1 Batch 1000 Training Loss 2.3824 Accuracy 0.0522
Epoch 1 Batch 1100 Training Loss 2.4010 Accuracy 0.0521
Epoch 1 Batch 1200 Training Loss 2.4185 Accuracy 0.0520
Epoch 1 Batch 1300 Training Loss 2.4350 Accuracy 0.0519
Epoch 1 Batch 1400 Training Loss 2.4507 Accuracy 0.0519
Epoch 1 Batch 1500 Training Loss 2.4649 Accuracy 0.0518
Epoch 1 Batch 1600 Training Loss 2.4789 Accuracy 0.0517
Epoch 1 Batch 1700 Training Loss 2.4920 Accuracy 0.0517
Epoc

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch 1 iteration_i 0 Batch 0 Training Loss 5.8078 Accuracy 0.0534
Epoch 1 iteration_i 0 Batch 50 Training Loss 4.0071 Accuracy 0.0779
Epoch 1 iteration_i 0 Batch 100 Training Loss 3.8251 Accuracy 0.0812

Epoch 1 iteration_i 0 Training Loss 3.7498 Accuracy 0.0820
Time taken for 1 epoch: 8982.202881336212 secs

validating
Epoch 1 iteration_i 0 Validation Loss 3.8141 Accuracy 0.0824
Saving checkpoint for epoch 1 iteration_i 0 at ./checkpoints/train_bt_st_3_upsample_/ckpt-3
Saving checkpoint for epoch 1 iteration_i 0 at ./checkpoints/train_bt_st_3_upsample__acc_/ckpt-4
training Parallel data


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch 1 iteration_i 1 Batch 0 Training Loss 3.4139 Accuracy 0.0954
Epoch 1 iteration_i 1 Batch 50 Training Loss 3.4190 Accuracy 0.0852
Epoch 1 iteration_i 1 Batch 100 Training Loss 3.4137 Accuracy 0.0849

Epoch 1 iteration_i 1 Training Loss 3.4165 Accuracy 0.0854
Time taken for 1 epoch: 9133.768951416016 secs

validating
Epoch 1 iteration_i 1 Validation Loss 3.7663 Accuracy 0.0842
Saving checkpoint for epoch 1 iteration_i 1 at ./checkpoints/train_bt_st_3_upsample_/ckpt-5
Saving checkpoint for epoch 1 iteration_i 1 at ./checkpoints/train_bt_st_3_upsample__acc_/ckpt-6
training Parallel data


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch 1 iteration_i 2 Batch 0 Training Loss 2.8851 Accuracy 0.0843
Epoch 1 iteration_i 2 Batch 50 Training Loss 3.1572 Accuracy 0.0892
Epoch 1 iteration_i 2 Batch 100 Training Loss 3.1614 Accuracy 0.0893

Epoch 1 iteration_i 2 Training Loss 3.1788 Accuracy 0.0894
Time taken for 1 epoch: 9287.532847642899 secs

validating
Epoch 1 iteration_i 2 Validation Loss 3.7840 Accuracy 0.0853
Saving checkpoint for epoch 1 iteration_i 2 at ./checkpoints/train_bt_st_3_upsample__acc_/ckpt-7
training Parallel data


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch 1 iteration_i 3 Batch 0 Training Loss 2.8493 Accuracy 0.0984
Epoch 1 iteration_i 3 Batch 50 Training Loss 2.9747 Accuracy 0.0936
Epoch 1 iteration_i 3 Batch 100 Training Loss 2.9885 Accuracy 0.0932

Epoch 1 iteration_i 3 Training Loss 2.9866 Accuracy 0.0931
Time taken for 1 epoch: 9424.619845628738 secs

validating
Epoch 1 iteration_i 3 Validation Loss 3.8070 Accuracy 0.0857
Saving checkpoint for epoch 1 iteration_i 3 at ./checkpoints/train_bt_st_3_upsample__acc_/ckpt-8
training Parallel data


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch 1 iteration_i 4 Batch 0 Training Loss 2.7338 Accuracy 0.0964
Epoch 1 iteration_i 4 Batch 50 Training Loss 2.7845 Accuracy 0.0961
Epoch 1 iteration_i 4 Batch 100 Training Loss 2.8104 Accuracy 0.0955

Epoch 1 iteration_i 4 Training Loss 2.8209 Accuracy 0.0960
Time taken for 1 epoch: 9562.178090810776 secs

validating
Epoch 1 iteration_i 4 Validation Loss 3.8537 Accuracy 0.0865
Saving checkpoint for epoch 1 iteration_i 4 at ./checkpoints/train_bt_st_3_upsample__acc_/ckpt-9
training Parallel data


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch 1 iteration_i 5 Batch 0 Training Loss 2.5575 Accuracy 0.0910
Epoch 1 iteration_i 5 Batch 50 Training Loss 2.6090 Accuracy 0.0998
Epoch 1 iteration_i 5 Batch 100 Training Loss 2.6385 Accuracy 0.1000

Epoch 1 iteration_i 5 Training Loss 2.6685 Accuracy 0.0992
Time taken for 1 epoch: 9699.79431772232 secs

validating
Epoch 1 iteration_i 5 Validation Loss 3.8964 Accuracy 0.0863
training Parallel data


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch 1 iteration_i 6 Batch 0 Training Loss 2.4234 Accuracy 0.1007
Epoch 1 iteration_i 6 Batch 50 Training Loss 2.4792 Accuracy 0.1034
Epoch 1 iteration_i 6 Batch 100 Training Loss 2.5056 Accuracy 0.1024

Epoch 1 iteration_i 6 Training Loss 2.5250 Accuracy 0.1022
Time taken for 1 epoch: 9832.814569234848 secs

validating
Epoch 1 iteration_i 6 Validation Loss 3.9623 Accuracy 0.0873
Saving checkpoint for epoch 1 iteration_i 6 at ./checkpoints/train_bt_st_3_upsample__acc_/ckpt-10
training Parallel data


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch 1 iteration_i 7 Batch 0 Training Loss 2.2655 Accuracy 0.1073
Epoch 1 iteration_i 7 Batch 50 Training Loss 2.3301 Accuracy 0.1074
Epoch 1 iteration_i 7 Batch 100 Training Loss 2.3681 Accuracy 0.1059

Epoch 1 iteration_i 7 Training Loss 2.3928 Accuracy 0.1054
Time taken for 1 epoch: 9970.07161808014 secs

validating
Epoch 1 iteration_i 7 Validation Loss 4.0139 Accuracy 0.0868
training Parallel data


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch 1 iteration_i 8 Batch 0 Training Loss 2.1522 Accuracy 0.1164
Epoch 1 iteration_i 8 Batch 50 Training Loss 2.2168 Accuracy 0.1108
Epoch 1 iteration_i 8 Batch 100 Training Loss 2.2443 Accuracy 0.1098

Epoch 1 iteration_i 8 Training Loss 2.2700 Accuracy 0.1087
Time taken for 1 epoch: 10103.098822593689 secs

validating
Epoch 1 iteration_i 8 Validation Loss 4.1087 Accuracy 0.0870
training Parallel data


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch 1 iteration_i 9 Batch 0 Training Loss 2.1305 Accuracy 0.1115
Epoch 1 iteration_i 9 Batch 50 Training Loss 2.1100 Accuracy 0.1135
Epoch 1 iteration_i 9 Batch 100 Training Loss 2.1300 Accuracy 0.1128

Epoch 1 iteration_i 9 Training Loss 2.1538 Accuracy 0.1116
Time taken for 1 epoch: 10236.328634262085 secs

validating
Epoch 1 iteration_i 9 Validation Loss 4.1733 Accuracy 0.0876
Saving checkpoint for epoch 1 iteration_i 9 at ./checkpoints/train_bt_st_3_upsample__acc_/ckpt-11
training Parallel data


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch 1 iteration_i 10 Batch 0 Training Loss 1.9030 Accuracy 0.1204
Epoch 1 iteration_i 10 Batch 50 Training Loss 1.9971 Accuracy 0.1168
Epoch 1 iteration_i 10 Batch 100 Training Loss 2.0289 Accuracy 0.1163

Epoch 1 iteration_i 10 Training Loss 2.0423 Accuracy 0.1151
Time taken for 1 epoch: 10374.131878376007 secs

validating
Epoch 1 iteration_i 10 Validation Loss 4.2731 Accuracy 0.0870
training Parallel data


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch 1 iteration_i 11 Batch 0 Training Loss 1.9368 Accuracy 0.1311
Epoch 1 iteration_i 11 Batch 50 Training Loss 1.8974 Accuracy 0.1203
Epoch 1 iteration_i 11 Batch 100 Training Loss 1.9254 Accuracy 0.1190

Epoch 1 iteration_i 11 Training Loss 1.9441 Accuracy 0.1184
Time taken for 1 epoch: 10507.133090019226 secs

validating
Epoch 1 iteration_i 11 Validation Loss 4.3246 Accuracy 0.0875
training Parallel data


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch 1 iteration_i 12 Batch 0 Training Loss 1.8869 Accuracy 0.1293
Epoch 1 iteration_i 12 Batch 50 Training Loss 1.8069 Accuracy 0.1243
Epoch 1 iteration_i 12 Batch 100 Training Loss 1.8276 Accuracy 0.1224

Epoch 1 iteration_i 12 Training Loss 1.8410 Accuracy 0.1221
Time taken for 1 epoch: 10640.267137289047 secs

validating
Epoch 1 iteration_i 12 Validation Loss 4.3726 Accuracy 0.0871
training Parallel data


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch 1 iteration_i 13 Batch 0 Training Loss 1.7179 Accuracy 0.1412
Epoch 1 iteration_i 13 Batch 50 Training Loss 1.6916 Accuracy 0.1277
Epoch 1 iteration_i 13 Batch 100 Training Loss 1.7312 Accuracy 0.1263

Epoch 1 iteration_i 13 Training Loss 1.7500 Accuracy 0.1254
Time taken for 1 epoch: 10773.560406446457 secs

validating
Epoch 1 iteration_i 13 Validation Loss 4.4521 Accuracy 0.0871
training Parallel data


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch 1 iteration_i 14 Batch 0 Training Loss 1.6345 Accuracy 0.1519
Epoch 1 iteration_i 14 Batch 50 Training Loss 1.6045 Accuracy 0.1330
Epoch 1 iteration_i 14 Batch 100 Training Loss 1.6415 Accuracy 0.1301

Epoch 1 iteration_i 14 Training Loss 1.6639 Accuracy 0.1289
Time taken for 1 epoch: 10906.53395485878 secs

validating
Epoch 1 iteration_i 14 Validation Loss 4.5033 Accuracy 0.0869
training Parallel data


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch 1 iteration_i 15 Batch 0 Training Loss 1.5183 Accuracy 0.1381
Epoch 1 iteration_i 15 Batch 50 Training Loss 1.5433 Accuracy 0.1349
Epoch 1 iteration_i 15 Batch 100 Training Loss 1.5592 Accuracy 0.1327

Epoch 1 iteration_i 15 Training Loss 1.5809 Accuracy 0.1319
Time taken for 1 epoch: 11039.49742603302 secs

validating
Epoch 1 iteration_i 15 Validation Loss 4.5752 Accuracy 0.0875
training Parallel data


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch 1 iteration_i 16 Batch 0 Training Loss 1.3836 Accuracy 0.1385
Epoch 1 iteration_i 16 Batch 50 Training Loss 1.4641 Accuracy 0.1367
Epoch 1 iteration_i 16 Batch 100 Training Loss 1.4870 Accuracy 0.1353

Epoch 1 iteration_i 16 Training Loss 1.5082 Accuracy 0.1348
Time taken for 1 epoch: 11172.166684150696 secs

validating
Epoch 1 iteration_i 16 Validation Loss 4.6289 Accuracy 0.0876
training Parallel data


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch 1 iteration_i 17 Batch 0 Training Loss 1.2641 Accuracy 0.1549
Epoch 1 iteration_i 17 Batch 50 Training Loss 1.4017 Accuracy 0.1412
Epoch 1 iteration_i 17 Batch 100 Training Loss 1.4207 Accuracy 0.1383

Epoch 1 iteration_i 17 Training Loss 1.4372 Accuracy 0.1377
Time taken for 1 epoch: 11305.367782354355 secs

validating
Epoch 1 iteration_i 17 Validation Loss 4.6608 Accuracy 0.0869
training Parallel data


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch 1 iteration_i 18 Batch 0 Training Loss 1.2903 Accuracy 0.1569
Epoch 1 iteration_i 18 Batch 50 Training Loss 1.3306 Accuracy 0.1434
Epoch 1 iteration_i 18 Batch 100 Training Loss 1.3502 Accuracy 0.1412

Epoch 1 iteration_i 18 Training Loss 1.3705 Accuracy 0.1406
Time taken for 1 epoch: 11438.528971672058 secs

validating
Epoch 1 iteration_i 18 Validation Loss 4.7234 Accuracy 0.0868
training Parallel data


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch 1 iteration_i 19 Batch 0 Training Loss 1.1893 Accuracy 0.1371
Epoch 1 iteration_i 19 Batch 50 Training Loss 1.2692 Accuracy 0.1439
Epoch 1 iteration_i 19 Batch 100 Training Loss 1.2928 Accuracy 0.1439

Epoch 1 iteration_i 19 Training Loss 1.3096 Accuracy 0.1428
Time taken for 1 epoch: 11571.960437297821 secs

validating
Epoch 1 iteration_i 19 Validation Loss 4.7920 Accuracy 0.0867
training Parallel data


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch 1 iteration_i 20 Batch 0 Training Loss 1.1198 Accuracy 0.1390
Epoch 1 iteration_i 20 Batch 50 Training Loss 1.1967 Accuracy 0.1473
Epoch 1 iteration_i 20 Batch 100 Training Loss 1.2354 Accuracy 0.1469

Epoch 1 iteration_i 20 Training Loss 1.2511 Accuracy 0.1454
Time taken for 1 epoch: 11705.122066497803 secs

validating
Epoch 1 iteration_i 20 Validation Loss 4.8427 Accuracy 0.0868
training Parallel data


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch 1 iteration_i 21 Batch 0 Training Loss 1.0667 Accuracy 0.1562
Epoch 1 iteration_i 21 Batch 50 Training Loss 1.1551 Accuracy 0.1501
Epoch 1 iteration_i 21 Batch 100 Training Loss 1.1799 Accuracy 0.1485

Epoch 1 iteration_i 21 Training Loss 1.2024 Accuracy 0.1476
Time taken for 1 epoch: 11838.843664646149 secs

validating
Epoch 1 iteration_i 21 Validation Loss 4.9036 Accuracy 0.0865
training Parallel data


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch 1 iteration_i 22 Batch 0 Training Loss 1.0859 Accuracy 0.1403
Epoch 1 iteration_i 22 Batch 50 Training Loss 1.1140 Accuracy 0.1515
Epoch 1 iteration_i 22 Batch 100 Training Loss 1.1361 Accuracy 0.1500

Epoch 1 iteration_i 22 Training Loss 1.1555 Accuracy 0.1495
Time taken for 1 epoch: 11972.686581611633 secs

validating
Epoch 1 iteration_i 22 Validation Loss 4.9355 Accuracy 0.0863
training ST data


HBox(children=(IntProgress(value=0, max=3126), HTML(value='')))

Epoch 2 Batch 0 Training Loss 4.3233 Accuracy 0.0304
Epoch 2 Batch 100 Training Loss 2.6758 Accuracy 0.0461
Epoch 2 Batch 200 Training Loss 2.4856 Accuracy 0.0481
Epoch 2 Batch 300 Training Loss 2.3835 Accuracy 0.0493
Epoch 2 Batch 400 Training Loss 2.3190 Accuracy 0.0502
Epoch 2 Batch 500 Training Loss 2.2699 Accuracy 0.0509
Epoch 2 Batch 600 Training Loss 2.2341 Accuracy 0.0516
Epoch 2 Batch 700 Training Loss 2.2056 Accuracy 0.0520
Epoch 2 Batch 800 Training Loss 2.1814 Accuracy 0.0524
Epoch 2 Batch 900 Training Loss 2.1612 Accuracy 0.0527
Epoch 2 Batch 1000 Training Loss 2.1439 Accuracy 0.0530
Epoch 2 Batch 1100 Training Loss 2.1297 Accuracy 0.0532
Epoch 2 Batch 1200 Training Loss 2.1157 Accuracy 0.0534
Epoch 2 Batch 1300 Training Loss 2.1035 Accuracy 0.0536
Epoch 2 Batch 1400 Training Loss 2.0919 Accuracy 0.0538
Epoch 2 Batch 1500 Training Loss 2.0820 Accuracy 0.0540
Epoch 2 Batch 1600 Training Loss 2.0727 Accuracy 0.0542
Epoch 2 Batch 1700 Training Loss 2.0635 Accuracy 0.0543
Epoc

HBox(children=(IntProgress(value=0, max=3126), HTML(value='')))

Epoch 2 Batch 0 Training Loss 1.9866 Accuracy 0.0557
Epoch 2 Batch 100 Training Loss 2.0490 Accuracy 0.0553
Epoch 2 Batch 200 Training Loss 2.0933 Accuracy 0.0550
Epoch 2 Batch 300 Training Loss 2.1322 Accuracy 0.0548
Epoch 2 Batch 400 Training Loss 2.1673 Accuracy 0.0546
Epoch 2 Batch 500 Training Loss 2.1989 Accuracy 0.0544
Epoch 2 Batch 600 Training Loss 2.2273 Accuracy 0.0542
Epoch 2 Batch 700 Training Loss 2.2539 Accuracy 0.0540
Epoch 2 Batch 800 Training Loss 2.2789 Accuracy 0.0539
Epoch 2 Batch 900 Training Loss 2.3025 Accuracy 0.0537
Epoch 2 Batch 1000 Training Loss 2.3244 Accuracy 0.0536
Epoch 2 Batch 1100 Training Loss 2.3449 Accuracy 0.0535
Epoch 2 Batch 1200 Training Loss 2.3642 Accuracy 0.0534
Epoch 2 Batch 1300 Training Loss 2.3827 Accuracy 0.0532
Epoch 2 Batch 1400 Training Loss 2.3999 Accuracy 0.0531
Epoch 2 Batch 1500 Training Loss 2.4162 Accuracy 0.0530
Epoch 2 Batch 1600 Training Loss 2.4317 Accuracy 0.0529
Epoch 2 Batch 1700 Training Loss 2.4465 Accuracy 0.0529
Epoc

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch 2 iteration_i 0 Batch 0 Training Loss 5.3744 Accuracy 0.0769
Epoch 2 iteration_i 0 Batch 50 Training Loss 3.5408 Accuracy 0.0898
Epoch 2 iteration_i 0 Batch 100 Training Loss 3.2905 Accuracy 0.0939

Epoch 2 iteration_i 0 Training Loss 3.1997 Accuracy 0.0959
Time taken for 1 epoch: 8911.723845243454 secs

validating
Epoch 2 iteration_i 0 Validation Loss 3.8477 Accuracy 0.0871
training Parallel data


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch 2 iteration_i 1 Batch 0 Training Loss 2.6882 Accuracy 0.1069
Epoch 2 iteration_i 1 Batch 50 Training Loss 2.6207 Accuracy 0.1078
Epoch 2 iteration_i 1 Batch 100 Training Loss 2.6056 Accuracy 0.1084

Epoch 2 iteration_i 1 Training Loss 2.6001 Accuracy 0.1084
Time taken for 1 epoch: 9044.672929525375 secs

validating
Epoch 2 iteration_i 1 Validation Loss 3.8755 Accuracy 0.0885
Saving checkpoint for epoch 2 iteration_i 1 at ./checkpoints/train_bt_st_3_upsample__acc_/ckpt-12
training Parallel data


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch 2 iteration_i 2 Batch 0 Training Loss 2.2154 Accuracy 0.1102
Epoch 2 iteration_i 2 Batch 50 Training Loss 2.3293 Accuracy 0.1150
Epoch 2 iteration_i 2 Batch 100 Training Loss 2.3400 Accuracy 0.1146

Epoch 2 iteration_i 2 Training Loss 2.3315 Accuracy 0.1148
Time taken for 1 epoch: 9181.640061616898 secs

validating
Epoch 2 iteration_i 2 Validation Loss 3.9307 Accuracy 0.0895
Saving checkpoint for epoch 2 iteration_i 2 at ./checkpoints/train_bt_st_3_upsample__acc_/ckpt-13
training Parallel data


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch 2 iteration_i 3 Batch 0 Training Loss 2.1007 Accuracy 0.1272
Epoch 2 iteration_i 3 Batch 50 Training Loss 2.1106 Accuracy 0.1201
Epoch 2 iteration_i 3 Batch 100 Training Loss 2.1262 Accuracy 0.1197

Epoch 2 iteration_i 3 Training Loss 2.1278 Accuracy 0.1199
Time taken for 1 epoch: 9319.26719045639 secs

validating
Epoch 2 iteration_i 3 Validation Loss 3.9699 Accuracy 0.0897
Saving checkpoint for epoch 2 iteration_i 3 at ./checkpoints/train_bt_st_3_upsample__acc_/ckpt-14
training Parallel data


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch 2 iteration_i 4 Batch 0 Training Loss 2.0028 Accuracy 0.1367
Epoch 2 iteration_i 4 Batch 50 Training Loss 1.9478 Accuracy 0.1266
Epoch 2 iteration_i 4 Batch 100 Training Loss 1.9570 Accuracy 0.1247

Epoch 2 iteration_i 4 Training Loss 1.9579 Accuracy 0.1244
Time taken for 1 epoch: 9456.779994010925 secs

validating
Epoch 2 iteration_i 4 Validation Loss 4.0419 Accuracy 0.0896
training Parallel data


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch 2 iteration_i 5 Batch 0 Training Loss 1.6914 Accuracy 0.1314
Epoch 2 iteration_i 5 Batch 50 Training Loss 1.8051 Accuracy 0.1285
Epoch 2 iteration_i 5 Batch 100 Training Loss 1.8027 Accuracy 0.1283

Epoch 2 iteration_i 5 Training Loss 1.8083 Accuracy 0.1286
Time taken for 1 epoch: 9590.442853450775 secs

validating
Epoch 2 iteration_i 5 Validation Loss 4.1176 Accuracy 0.0889
training Parallel data


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch 2 iteration_i 6 Batch 0 Training Loss 1.6628 Accuracy 0.1272
Epoch 2 iteration_i 6 Batch 50 Training Loss 1.6565 Accuracy 0.1349
Epoch 2 iteration_i 6 Batch 100 Training Loss 1.6623 Accuracy 0.1336

Epoch 2 iteration_i 6 Training Loss 1.6726 Accuracy 0.1329
Time taken for 1 epoch: 9723.975379705429 secs

validating
Epoch 2 iteration_i 6 Validation Loss 4.1733 Accuracy 0.0891
training Parallel data


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch 2 iteration_i 7 Batch 0 Training Loss 1.5068 Accuracy 0.1528
Epoch 2 iteration_i 7 Batch 50 Training Loss 1.5328 Accuracy 0.1379
Epoch 2 iteration_i 7 Batch 100 Training Loss 1.5487 Accuracy 0.1375

Epoch 2 iteration_i 7 Training Loss 1.5613 Accuracy 0.1366
Time taken for 1 epoch: 9857.479020118713 secs

validating
Epoch 2 iteration_i 7 Validation Loss 4.2271 Accuracy 0.0893
training Parallel data


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch 2 iteration_i 8 Batch 0 Training Loss 1.4362 Accuracy 0.1553
Epoch 2 iteration_i 8 Batch 50 Training Loss 1.4243 Accuracy 0.1435
Epoch 2 iteration_i 8 Batch 100 Training Loss 1.4457 Accuracy 0.1415

Epoch 2 iteration_i 8 Training Loss 1.4548 Accuracy 0.1404
Time taken for 1 epoch: 9990.872699737549 secs

validating
Epoch 2 iteration_i 8 Validation Loss 4.3393 Accuracy 0.0888
training Parallel data


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch 2 iteration_i 9 Batch 0 Training Loss 1.3618 Accuracy 0.1355
Epoch 2 iteration_i 9 Batch 50 Training Loss 1.3344 Accuracy 0.1447
Epoch 2 iteration_i 9 Batch 100 Training Loss 1.3496 Accuracy 0.1439

Epoch 2 iteration_i 9 Training Loss 1.3675 Accuracy 0.1435
Time taken for 1 epoch: 10123.97950053215 secs

validating
Epoch 2 iteration_i 9 Validation Loss 4.3845 Accuracy 0.0887
training Parallel data


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch 2 iteration_i 10 Batch 0 Training Loss 1.1783 Accuracy 0.1641
Epoch 2 iteration_i 10 Batch 50 Training Loss 1.2440 Accuracy 0.1501
Epoch 2 iteration_i 10 Batch 100 Training Loss 1.2661 Accuracy 0.1483

Epoch 2 iteration_i 10 Training Loss 1.2748 Accuracy 0.1472
Time taken for 1 epoch: 10257.208960533142 secs

validating
Epoch 2 iteration_i 10 Validation Loss 4.4778 Accuracy 0.0888
training Parallel data


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch 2 iteration_i 11 Batch 0 Training Loss 1.1035 Accuracy 0.1533
Epoch 2 iteration_i 11 Batch 50 Training Loss 1.1514 Accuracy 0.1521
Epoch 2 iteration_i 11 Batch 100 Training Loss 1.1801 Accuracy 0.1511

Epoch 2 iteration_i 11 Training Loss 1.1997 Accuracy 0.1504
Time taken for 1 epoch: 10390.363196372986 secs

validating
Epoch 2 iteration_i 11 Validation Loss 4.5454 Accuracy 0.0886
training Parallel data


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch 2 iteration_i 12 Batch 0 Training Loss 1.1261 Accuracy 0.1643
Epoch 2 iteration_i 12 Batch 50 Training Loss 1.0927 Accuracy 0.1563
Epoch 2 iteration_i 12 Batch 100 Training Loss 1.1126 Accuracy 0.1546

Epoch 2 iteration_i 12 Training Loss 1.1267 Accuracy 0.1535
Time taken for 1 epoch: 10523.260709762573 secs

validating
Epoch 2 iteration_i 12 Validation Loss 4.5940 Accuracy 0.0882
training Parallel data


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch 2 iteration_i 13 Batch 0 Training Loss 0.9262 Accuracy 0.1508
Epoch 2 iteration_i 13 Batch 50 Training Loss 1.0291 Accuracy 0.1582
Epoch 2 iteration_i 13 Batch 100 Training Loss 1.0551 Accuracy 0.1566

Epoch 2 iteration_i 13 Training Loss 1.0678 Accuracy 0.1558
Time taken for 1 epoch: 10656.062583684921 secs

validating
Epoch 2 iteration_i 13 Validation Loss 4.6552 Accuracy 0.0884
training Parallel data


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch 2 iteration_i 14 Batch 0 Training Loss 1.0169 Accuracy 0.1618
Epoch 2 iteration_i 14 Batch 50 Training Loss 0.9738 Accuracy 0.1610
Epoch 2 iteration_i 14 Batch 100 Training Loss 0.9978 Accuracy 0.1595

Epoch 2 iteration_i 14 Training Loss 1.0099 Accuracy 0.1583
Time taken for 1 epoch: 10788.783400297165 secs

validating
Epoch 2 iteration_i 14 Validation Loss 4.7429 Accuracy 0.0880
training Parallel data


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch 2 iteration_i 15 Batch 0 Training Loss 0.8991 Accuracy 0.1586
Epoch 2 iteration_i 15 Batch 50 Training Loss 0.9273 Accuracy 0.1610
Epoch 2 iteration_i 15 Batch 100 Training Loss 0.9447 Accuracy 0.1610

Epoch 2 iteration_i 15 Training Loss 0.9569 Accuracy 0.1605
Time taken for 1 epoch: 10921.049254894257 secs

validating
Epoch 2 iteration_i 15 Validation Loss 4.8098 Accuracy 0.0886
training Parallel data


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch 2 iteration_i 16 Batch 0 Training Loss 1.0028 Accuracy 0.1529
Epoch 2 iteration_i 16 Batch 50 Training Loss 0.8817 Accuracy 0.1626
Epoch 2 iteration_i 16 Batch 100 Training Loss 0.8992 Accuracy 0.1622

Epoch 2 iteration_i 16 Training Loss 0.9123 Accuracy 0.1623
Time taken for 1 epoch: 11053.403935670853 secs

validating
Epoch 2 iteration_i 16 Validation Loss 4.8599 Accuracy 0.0880
training Parallel data


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch 2 iteration_i 17 Batch 0 Training Loss 0.8144 Accuracy 0.1547
Epoch 2 iteration_i 17 Batch 50 Training Loss 0.8508 Accuracy 0.1661
Epoch 2 iteration_i 17 Batch 100 Training Loss 0.8610 Accuracy 0.1641

Epoch 2 iteration_i 17 Training Loss 0.8701 Accuracy 0.1643
Time taken for 1 epoch: 11186.076206445694 secs

validating
Epoch 2 iteration_i 17 Validation Loss 4.9035 Accuracy 0.0875
training Parallel data


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch 2 iteration_i 18 Batch 0 Training Loss 0.7200 Accuracy 0.1625
Epoch 2 iteration_i 18 Batch 50 Training Loss 0.7979 Accuracy 0.1697
Epoch 2 iteration_i 18 Batch 100 Training Loss 0.8158 Accuracy 0.1675

Epoch 2 iteration_i 18 Training Loss 0.8296 Accuracy 0.1659
Time taken for 1 epoch: 11318.54722571373 secs

validating
Epoch 2 iteration_i 18 Validation Loss 5.0025 Accuracy 0.0875
training Parallel data


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch 2 iteration_i 19 Batch 0 Training Loss 0.7657 Accuracy 0.1698
Epoch 2 iteration_i 19 Batch 50 Training Loss 0.7632 Accuracy 0.1698
Epoch 2 iteration_i 19 Batch 100 Training Loss 0.7865 Accuracy 0.1686

Epoch 2 iteration_i 19 Training Loss 0.8002 Accuracy 0.1674
Time taken for 1 epoch: 11450.960218667984 secs

validating
Epoch 2 iteration_i 19 Validation Loss 5.0390 Accuracy 0.0871
training Parallel data


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch 2 iteration_i 20 Batch 0 Training Loss 0.7775 Accuracy 0.1850
Epoch 2 iteration_i 20 Batch 50 Training Loss 0.7433 Accuracy 0.1711
Epoch 2 iteration_i 20 Batch 100 Training Loss 0.7536 Accuracy 0.1707

Epoch 2 iteration_i 20 Training Loss 0.7649 Accuracy 0.1690
Time taken for 1 epoch: 11583.248711824417 secs

validating
Epoch 2 iteration_i 20 Validation Loss 5.1071 Accuracy 0.0873
training Parallel data


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch 2 iteration_i 21 Batch 0 Training Loss 0.7035 Accuracy 0.1822
Epoch 2 iteration_i 21 Batch 50 Training Loss 0.7042 Accuracy 0.1733
Epoch 2 iteration_i 21 Batch 100 Training Loss 0.7223 Accuracy 0.1713

Epoch 2 iteration_i 21 Training Loss 0.7339 Accuracy 0.1704
Time taken for 1 epoch: 11715.378323793411 secs

validating
Epoch 2 iteration_i 21 Validation Loss 5.1724 Accuracy 0.0867
training Parallel data


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch 2 iteration_i 22 Batch 0 Training Loss 0.7873 Accuracy 0.1712
Epoch 2 iteration_i 22 Batch 50 Training Loss 0.6827 Accuracy 0.1725
Epoch 2 iteration_i 22 Batch 100 Training Loss 0.7004 Accuracy 0.1729

Epoch 2 iteration_i 22 Training Loss 0.7093 Accuracy 0.1715
Time taken for 1 epoch: 11847.466495990753 secs

validating
Epoch 2 iteration_i 22 Validation Loss 5.1919 Accuracy 0.0864


Evaluate best model

In [43]:
# load model
print(ckpt_manager_acc.checkpoints)
# if a checkpoint exists, restore the latest checkpoint.
if ckpt_manager_acc.latest_checkpoint:
  ckpt.restore("./checkpoints/train_bt_st_3_upsample__acc_/ckpt-4")
  print ('Latest checkpoint restored!!')

['./checkpoints/train_bt_st_3_upsample__acc_/ckpt-12', './checkpoints/train_bt_st_3_upsample__acc_/ckpt-13', './checkpoints/train_bt_st_3_upsample__acc_/ckpt-14']


NotFoundError: ignored

In [15]:
transformer

<transformer.Transformer at 0x7f6b344af048>

In [23]:
val_loss.reset_states()
val_accuracy.reset_states()
  
for (batch, (inp, tar)) in tqdm(enumerate(tensor_val)):
  val_step(inp, tar)
  
print ('Validation Loss {:.4f} Accuracy {:.4f}'.format(
                                          val_loss.result(), 
                                          val_accuracy.result()))
  

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Validation Loss 3.9917 Accuracy 0.0770


In [0]:
def generate_predictions(inp_sentences):

  if len(inp_sentences.get_shape())==1:
    encoder_input = tf.expand_dims(inp_sentences, 0)
    decoder_input = [french_word2id["<start>"]]
    output = tf.expand_dims(decoder_input, 0)

  else:
    encoder_input = inp_sentences
    decoder_input = [french_word2id["<start>"]]*inp_sentences.get_shape()[0]
    output = tf.expand_dims(decoder_input, -1)


  # encoder_input = tf.expand_dims(inp_sentence, 0)
  
  # decoder_input = [french_word2id["<start>"]]
  # output = tf.expand_dims(decoder_input, 0)
  
  for i in range(pe_target):
    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
        encoder_input, output)
  
    # predictions.shape == (batch_size, seq_len, vocab_size)
    predictions, attention_weights = transformer(encoder_input, 
                                                 output,
                                                 False,
                                                 enc_padding_mask,  
                                                 combined_mask,
                                                 dec_padding_mask)
    
    # select the last word from the seq_len dimension
    predictions = predictions[: ,-1:, :]  # (batch_size, 1, vocab_size)

    predicted_id = tf.cast(tf.argmax(predictions, axis=-1), tf.int32)
    
    # # return the result if all the seqs has the end token
    if tf.reduce_sum(tf.cast((tf.reduce_sum(tf.cast(output == french_word2id["<eos>"], tf.float32),axis=1)>0), tf.float32)) == inp.get_shape()[0]:
      return output, attention_weights
    
    # concatentate the predicted_id to the output which is given to the decoder
    # as its input.
    output = tf.concat([output, predicted_id], axis=-1)

  # return tf.squeeze(output, axis=0), attention_weights
  return output, attention_weights

In [0]:
all_preds = []
for (batch_i, (inp, tar)) in tqdm(enumerate(tensor_val)):
  preds, attention = generate_predictions(inp)
  all_preds.append(preds)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [0]:
translated_sentences = []

for k in tqdm(all_preds):
  for i in k:
    sentence_french = []
    for j in i.numpy()[1:]:
      if j==0 or j==french_word2id["<eos>"]:
        break
      sentence_french.append(french_id2word[j])

    sentence_french = " ".join(sentence_french)

    translated_sentences.append(sentence_french)

translated_sentences = "\n".join(translated_sentences)

with open("predictions.txt","w") as f:
  f.write(translated_sentences)

HBox(children=(IntProgress(value=0, max=35), HTML(value='')))




In [0]:
!pip install sacrebleu
!python evaluator.py --input-file-path ./predictions.txt --target-file-path ./data/split_val.lang2 --do-not-run-model 

final avg bleu score: 11.13


Self-Training Monolingual Data Generation

In [0]:
amount_data_start = 100000
amount_data_end = 200000

In [0]:
with open(data_path/"unaligned_tokenized_rempunc.en","r") as f:
    english_monolingual = f.read().strip().lower()
print(len(english_monolingual.split("\n")), english_monolingual[:200])

474000 for the second phase of the trials we just had different sizes small medium large and extra - large it 's true
geng had been my host the previous january when i was the first us defense secretary to v


In [0]:
def transform_test_data(lang1, dict_word2id, amount_data_start=None,amount_data_end=None):
  lines = lang1.split("\n")
  if amount_data_start or amount_data_end:
    lines = lines[amount_data_start:amount_data_end]
  data = []

  for line in lines:
    line2id = [dict_word2id["<start>"]]
    for word in line.split():
      try:
        line2id.append(dict_word2id[word])
      except:
        line2id.append(dict_word2id["<unk>"])
    line2id.append(dict_word2id["<eos>"])
    data.append(line2id)

  return data

english_monolingual_data = transform_test_data(english_monolingual, english_word2id, amount_data_start, amount_data_end)
len(english_monolingual_data)

100000

In [0]:
max([len(i) for i in english_monolingual_data])

112

In [0]:
with open(data_path/"unaligned.fr","r") as f:
    french_monolingual = f.read().strip().lower()
print(len(french_monolingual.split("\n")), french_monolingual[:200])
french_monolingual_data = transform_test_data(french_monolingual, french_word2id)
print(len(french_monolingual_data))
max([len(i) for i in french_monolingual_data])

474000 nous n’aurions pas pu dégager d’accord sur un calendrier de conclusion de la cig sans l’engagement politique de mes collègues du conseil européen.
(de) madame la présidente, monsieur le commissaire, m
474000


220

In [0]:
pe_input = max([len(i) for i in english_monolingual_data])
pe_target = max([len(i) for i in french_monolingual_data])

In [0]:
tensor_test = tf.data.Dataset.from_tensor_slices((
    tf.keras.preprocessing.sequence.pad_sequences(english_monolingual_data, padding='post')
)).batch(BATCH_SIZE, drop_remainder=False)

In [0]:
transformer = Transformer(
    num_layers=num_layers, d_model=d_model, num_heads=num_heads, dff=dff, 
    input_vocab_size=input_vocab_size, target_vocab_size=target_vocab_size, 
    pe_input=pe_input, pe_target=pe_target, rate=dropout_rate)

In [0]:
experiment_number = "7_smaller_1_1024_"

checkpoint_path = "./checkpoints/train"+experiment_number

ckpt = tf.train.Checkpoint(transformer=transformer,
                           optimizer=optimizer)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=1)

if ckpt_manager.latest_checkpoint:
  ckpt.restore(ckpt_manager.latest_checkpoint)
  print ('Latest checkpoint restored!!')

Latest checkpoint restored!!


In [0]:
all_preds = []
for batch_i, inp in tqdm(enumerate(tensor_test.unbatch().batch(128)),total=len(english_monolingual_data) // 128 + 1):
  preds, attention = generate_predictions(inp)
  all_preds.append(preds)

HBox(children=(IntProgress(value=0, max=782), HTML(value='')))




In [0]:
translated_sentences = []

for k in tqdm(all_preds):
  for i in k:
    sentence_french = []
    for j in i.numpy()[1:]:
      if j==0 or j==french_word2id["<eos>"]:
        break
      sentence_french.append(french_id2word[j])

    sentence_french = " ".join(sentence_french)

    translated_sentences.append(sentence_french)

translated_sentences = "\n".join(translated_sentences)

with open("predictions_english_monolingual_"+str(amount_data_start)+"_"+str(amount_data_end)+".txt","w") as f:
  f.write(translated_sentences)

HBox(children=(IntProgress(value=0, max=782), HTML(value='')))


