In [0]:
from pathlib import Path
import os
from collections import Counter
import numpy as np
import time
from tqdm.notebook import tqdm

np.random.seed(5050)

In [0]:
data_path = Path("/content/drive/My Drive/Adv Projects in ML/data")
data_path
os.listdir(data_path)

['train.lang2',
 'unaligned.en',
 'unaligned.fr',
 'train.lang1',
 'split_train.lang1',
 'split_val.lang2',
 'split_train.lang2',
 'split_val.lang1']

In [0]:
!nvidia-smi

Sat Apr  4 05:20:09 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.64.00    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   44C    P0    28W / 250W |      0MiB / 16280MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|  No ru

# Loading Data

In [0]:
with open(data_path/"train.lang1","r") as f:
    english = f.read()
len(english.split("\n"))

11001

In [0]:
with open(data_path/"train.lang2","r") as f:
    french = f.read()

len(french.split("\n"))

11001

In [0]:
if english.split("\n")[-1] == "":
  english = english[:-1]

if french.split("\n")[-1] == "":
  french = french[:-1]

len(english.split("\n")), len(french.split("\n"))

(11000, 11000)

In [0]:
with open(data_path/"unaligned.en","r") as f:
    english_unaligned = f.read()
len(english_unaligned.split("\n"))

474001

In [0]:
if english_unaligned.split("\n")[-1] == "":
  english_unaligned = english_unaligned[:-1]
len(english_unaligned.split("\n"))

474000

In [0]:
with open(data_path/"unaligned.fr","r") as f:
    french_unaligned = f.read()
len(french_unaligned.split("\n"))

474001

In [0]:
if french_unaligned.split("\n")[-1] == "":
  french_unaligned = french_unaligned[:-1]
len(french_unaligned.split("\n"))

474000

In [0]:
english[:200], french[:200]

('so too does the idea that accommodating religious differences is dangerous\nmr president ladies and gentlemen the financial perspective outlines the scope of the eu ’s activities over coming years as w',
 'L’ idée de concilier les différences religieuses semble donc dangereuse .\nMonsieur le Président , Mesdames et Messieurs , les perspectives financières esquissent la portée des activités de l’ UE pour ')

In [0]:
english_unaligned[:200], french_unaligned[:200]

("For the second phase of the trials we just had different sizes, small, medium, large and extra-large. It's true.\nGeng had been my host the previous January, when I was the first US defense secretary t",
 'Nous n’aurions pas pu dégager d’accord sur un calendrier de conclusion de la CIG sans l’engagement politique de mes collègues du Conseil européen.\n(DE) Madame la Présidente, Monsieur le Commissaire, M')

# Splitting Data

In [0]:
def split_dataset(text_data_1, text_data_2, split):
  text_data_1 = np.array(text_data_1.split("\n"))
  text_data_2 = np.array(text_data_2.split("\n"))

  idxs = list(range(len(text_data_1)))
  np.random.shuffle(idxs)
  text_data_1 = text_data_1[idxs]
  text_data_2 = text_data_2[idxs]

  train_split = int(len(text_data_1) * split)
  print(train_split, len(text_data_1)-train_split)

  train_text1 = text_data_1[:train_split]
  val_text1 = text_data_1[train_split:]
  train_text2 = text_data_2[:train_split]
  val_text2 = text_data_2[train_split:]

  with open(data_path/'split_train.lang1','w') as f:
    f.write('\n'.join(train_text1))

  with open(data_path/'split_train.lang2','w') as f:
    f.write('\n'.join(train_text2))

  with open(data_path/'split_val.lang1','w') as f:
    f.write('\n'.join(val_text1))

  with open(data_path/'split_val.lang2','w') as f:
    f.write('\n'.join(val_text2))

  # return train_text1, val_text1, train_text2, val_text1

split_dataset(english, french, 0.8)

8800 2201


In [0]:
with open(data_path/"split_train.lang1","r") as f:
    english_train = f.read()
print(len(english_train.split("\n")), english_train[:200])

with open(data_path/"split_train.lang2","r") as f:
    french_train = f.read()
print(len(french_train.split("\n")), french_train[:200])

with open(data_path/"split_val.lang1","r") as f:
    english_val = f.read()
print(len(english_val.split("\n")), english_val[:200])

with open(data_path/"split_val.lang2","r") as f:
    french_val = f.read()
print(len(french_val.split("\n")), french_val[:200])



8800 i want to know the people behind my dinner choices
privatization has stalled
bernanke also noted the possibility of temporarily raising the fed ’s medium - term inflation target a policy that i sugges
8800 Je veux savoir qui est derrière mes choix en cuisine .
La privatisation est au point mort .
Bernanke a aussi évoqué la possibilité d’ un relèvement temporaire des objectifs d’ inflation à moyen terme 
2201 at that point with us demand shrinking the rest of the world would indeed feel the economic effects of a romney presidency quite directly
i 've never eaten anything like this before
if the waiting lis
2201 Dans ce cas , suite à la contraction de la demande américaine , le reste du monde ressentirait en fait assez directement les effets économiques d' une présidence Romney .
Je n' ai jamais mangé quelque


In [0]:
from pathlib import Path
import os
from collections import Counter
import numpy as np

np.random.seed(8080)

def read_data(data_path):
    data_path
    os.listdir(data_path)
    
    with open(data_path/"train.lang1","r") as f:
        english = f.read()
    len(english.split("\n"))
    
    with open(data_path/"train.lang2","r") as f:
        french = f.read()
    len(french.split("\n"))

    return english, french

def split_dataset(data_path, text_data_1, text_data_2, split):
  text_data_1 = np.array(text_data_1.split("\n"))
  text_data_2 = np.array(text_data_2.split("\n"))

  if text_data_1[-1] == "":
    text_data_1 = text_data_1[:-1]

  if text_data_2[-1] == "":
    text_data_2 = text_data_2[:-1]

  idxs = list(range(len(text_data_1)))
  np.random.shuffle(idxs)
  text_data_1 = text_data_1[idxs]
  text_data_2 = text_data_2[idxs]

  train_split = int(len(text_data_1) * split)
  print(train_split, len(text_data_1)-train_split)

  train_text1 = text_data_1[:train_split]
  val_text1 = text_data_1[train_split:]
  train_text2 = text_data_2[:train_split]
  val_text2 = text_data_2[train_split:]

  with open(data_path/'split_train.lang1','w') as f:
    f.write('\n'.join(train_text1))

  with open(data_path/'split_train.lang2','w') as f:
    f.write('\n'.join(train_text2))

  with open(data_path/'split_val.lang1','w') as f:
    f.write('\n'.join(val_text1))

  with open(data_path/'split_val.lang2','w') as f:
    f.write('\n'.join(val_text2))

def main():
    data_path = Path("/content/drive/My Drive/Adv Projects in ML/data")
    english, french = read_data(data_path)
    split_dataset(data_path, english, french, 0.8)

    with open(data_path/"split_train.lang1","r") as f:
        english_train = f.read()
    print(len(english_train.split("\n")), english_train[:200])
    
    with open(data_path/"split_train.lang2","r") as f:
        french_train = f.read()
    print(len(french_train.split("\n")), french_train[:200])
    
    with open(data_path/"split_val.lang1","r") as f:
        english_val = f.read()
    print(len(english_val.split("\n")), english_val[:200])
    
    with open(data_path/"split_val.lang2","r") as f:
        french_val = f.read()
    print(len(french_val.split("\n")), french_val[:200])

main()

8800 2200
8800 as mr de castro is not present mr le foll who is replacing mr de castro has the floor
on the other hand if you 're visiting an underdeveloped country and 25 dollars buys you a gourmet meal it 's exorb
8800 Comme M. De Castro est absent , M. Le Foll , qui le remplace , a la parole .
D' un autre côté , si vous êtes dans un pays en voie de développement , où 25 dollars peuvent vous obtenir un repas de luxe
2200 what action does the council intend to take in the face of this seriously discriminatory attitude which runs contrary to the principles of the eu
where would you like to go next
if that were not enoug
2200 Quelles mesures le Conseil compte-t-il adopter face à cette attitude qui constitue une grave discrimination et est contraire aux principes sur lesquels l' Union européenne est fondée ?
Où souhaiteriez


In [0]:
!python "/content/drive/My Drive/Adv Projects in ML/data_split.py" 

8800 2200
8800 as mr de castro is not present mr le foll who is replacing mr de castro has the floor
on the other hand if you 're visiting an underdeveloped country and 25 dollars buys you a gourmet meal it 's exorb
8800 Comme M. De Castro est absent , M. Le Foll , qui le remplace , a la parole .
D' un autre côté , si vous êtes dans un pays en voie de développement , où 25 dollars peuvent vous obtenir un repas de luxe
2200 what action does the council intend to take in the face of this seriously discriminatory attitude which runs contrary to the principles of the eu
where would you like to go next
if that were not enoug
2200 Quelles mesures le Conseil compte-t-il adopter face à cette attitude qui constitue une grave discrimination et est contraire aux principes sur lesquels l' Union européenne est fondée ?
Où souhaiteriez


In [0]:
with open(data_path/"split_train.lang1","r") as f:
    english = f.read()
print(len(english.split("\n")), english[:200])
    
with open(data_path/"split_train.lang2","r") as f:
    french = f.read()
print(len(french.split("\n")), french[:200])

with open(data_path/"split_val.lang1","r") as f:
    english_val = f.read()
print(len(english_val.split("\n")), english_val[:200])

with open(data_path/"split_val.lang2","r") as f:
    french_val = f.read()
print(len(french_val.split("\n")), french_val[:200])


8800 as mr de castro is not present mr le foll who is replacing mr de castro has the floor
on the other hand if you 're visiting an underdeveloped country and 25 dollars buys you a gourmet meal it 's exorb
8800 Comme M. De Castro est absent , M. Le Foll , qui le remplace , a la parole .
D' un autre côté , si vous êtes dans un pays en voie de développement , où 25 dollars peuvent vous obtenir un repas de luxe
2200 what action does the council intend to take in the face of this seriously discriminatory attitude which runs contrary to the principles of the eu
where would you like to go next
if that were not enoug
2200 Quelles mesures le Conseil compte-t-il adopter face à cette attitude qui constitue une grave discrimination et est contraire aux principes sur lesquels l' Union européenne est fondée ?
Où souhaiteriez


# Creating Vocab

In [0]:
with open(data_path/"split_train.lang1","r") as f:
    english = f.read()
print(len(english.split("\n")), english[:200])
    
with open(data_path/"split_train.lang2","r") as f:
    french = f.read()
print(len(french.split("\n")), french[:200])

with open(data_path/"split_val.lang1","r") as f:
    english_val = f.read()
print(len(english_val.split("\n")), english_val[:200])

with open(data_path/"split_val.lang2","r") as f:
    french_val = f.read()
print(len(french_val.split("\n")), french_val[:200])

8800 as mr de castro is not present mr le foll who is replacing mr de castro has the floor
on the other hand if you 're visiting an underdeveloped country and 25 dollars buys you a gourmet meal it 's exorb
8800 Comme M. De Castro est absent , M. Le Foll , qui le remplace , a la parole .
D' un autre côté , si vous êtes dans un pays en voie de développement , où 25 dollars peuvent vous obtenir un repas de luxe
2200 what action does the council intend to take in the face of this seriously discriminatory attitude which runs contrary to the principles of the eu
where would you like to go next
if that were not enoug
2200 Quelles mesures le Conseil compte-t-il adopter face à cette attitude qui constitue une grave discrimination et est contraire aux principes sur lesquels l' Union européenne est fondée ?
Où souhaiteriez


In [0]:
english_vocab = list(set(english.replace("\n", " <eos> ").split()))
french_vocab = list(set(french.replace("\n", " <eos> ").split()))

len(english_vocab), len(french_vocab)

(12360, 16312)

In [0]:
english_counter = Counter(english.replace("\n", " <eos> ").split())
french_counter = Counter(french.replace("\n", " <eos> ").split())
len(english_counter), len(french_counter)

(12360, 16312)

In [0]:
english_counter.update({"<unk>":0})
french_counter.update({"<unk>":0})
english_counter.update({"<start>":0})
french_counter.update({"<start>":0})
len(english_counter), len(french_counter)

(12362, 16314)

In [0]:
english_counter.most_common(10), french_counter.most_common(10)

([('the', 10975),
  ('<eos>', 8799),
  ('to', 5136),
  ('of', 5132),
  ('and', 4155),
  ('in', 3486),
  ('a', 3039),
  ('is', 2894),
  ('that', 2842),
  ('i', 2140)],
 [('<eos>', 8799),
  ('.', 8509),
  (',', 8353),
  ('de', 8307),
  ('la', 5073),
  ('et', 3630),
  ('le', 3617),
  ('à', 3442),
  ('les', 3136),
  ('des', 2943)])

In [0]:
# english_vocab = []
# for i in english_counter:
#   if english_counter[i] > 1:
#     english_vocab.append(i)
#   else:
#     print(i)
# len(english_vocab)

In [0]:
english_vocab = list(english_counter.keys())
french_vocab = list(french_counter.keys())

In [0]:
english_word2id = {}
english_id2word = {}
french_word2id = {}
french_id2word = {}

# start enumerate from 1 so that 0 is reserved for padding seqs 
for i, w in enumerate(english_vocab, start=1):
  english_word2id[w] = i
  english_id2word[i] = w

for i, w in enumerate(french_vocab, start=1):
  french_word2id[w] = i
  french_id2word[i] = w

len(english_word2id), len(english_id2word), len(french_word2id), len(french_id2word)

(12362, 12362, 16314, 16314)

# Convert to dataset

In [0]:
# english_lines = english.split("\n")
# french_lines = french.split("\n")

# data_english = []
# data_french = []

# for line in english_lines:
#   line2id = []
#   for word in line.split():
#     try:
#       line2id.append(english_word2id[word])
#     except:
#       line2id.append(english_word2id["<unk>"])
#   data_english.append(line2id)


# for line in french_lines:
#   line2id = []
#   for word in line.split():
#     try:
#       line2id.append(french_word2id[word])
#     except:
#       line2id.append(french_word2id["<unk>"])
#   data_french.append(line2id)

# len(data_english), len(data_french)

In [0]:
def transform_data(english_lang1, french_lang2):
  english_lines = english_lang1.split("\n")
  french_lines = french_lang2.split("\n")

  data_english = []
  data_french = []

  for line in english_lines:
    line2id = [english_word2id["<start>"]]
    for word in line.split():
      try:
        line2id.append(english_word2id[word])
      except:
        line2id.append(english_word2id["<unk>"])
    line2id.append(english_word2id["<eos>"])
    data_english.append(line2id)


  for line in french_lines:
    line2id = [french_word2id["<start>"]]
    for word in line.split():
      try:
        line2id.append(french_word2id[word])
      except:
        line2id.append(french_word2id["<unk>"])
    line2id.append(french_word2id["<eos>"])
    data_french.append(line2id)

  print(len(data_english), len(data_french))
  return data_english, data_french

In [0]:
data_english, data_french = transform_data(english, french)

8800 8800


In [0]:
data_english_val, data_french_val = transform_data(english_val, french_val)

2200 2200


In [0]:
english_word2id["<start>"], english_word2id["<eos>"], len(english.split("\n")[0].split()), len(data_english[0])

(12362, 15, 19, 21)

# Create Dataset

In [0]:
# %tensorflow_version 2.x
# import tensorflow as tf
# print("Tensorflow version " + tf.__version__)

# tf.random.set_seed(8080)
# # make sure numpy seeded

Tensorflow version 2.2.0-rc2


In [0]:
# class DataLoader(tf.data.Dataset):
#   def _generator(data_english, data_french):
#     data_english = np.array(data_english)
#     data_french = np.array(data_french)

#     epochs = 2   
#     batch_size = 3
#     num_batches = len(data_english)//batch_size

#     for epoch_i in range(epochs):
#       idxs = list(range(len(data_english)))
#       np.random.shuffle(idxs)
#       data_english = data_english[idxs]
#       data_french = data_french[idxs]
#       for i in range(num_batches):
#         batch_english = data_english[i*batch_size : (i+1)*batch_size]
#         batch_french = data_french[i*batch_size : (i+1)*batch_size]
#         batch_english = tf.keras.preprocessing.sequence.pad_sequences(batch_english, padding='post')
#         batch_french = tf.keras.preprocessing.sequence.pad_sequences(batch_french, padding='post')
#         yield batch_english,batch_french[:,0]
    
#   def __new__(cls):
#     return tf.data.Dataset.from_generator(
#         lambda: cls._generator(data_english, data_french),
#         output_types=(tf.dtypes.int32,tf.dtypes.int32),
#         output_shapes=((None),(None)),
#         args=()
#     )

In [0]:
# dl = DataLoader()
# for i,j in dl:
#   print(i.shape,j.shape)
#   break

In [0]:
# tensor_train = tf.data.Dataset.from_tensor_slices((
#     tf.keras.preprocessing.sequence.pad_sequences(data_english, padding='post'),
#     tf.keras.preprocessing.sequence.pad_sequences(data_french, padding='post')
# ))
# tensor_val = tf.data.Dataset.from_tensor_slices((
#     tf.keras.preprocessing.sequence.pad_sequences(data_english_val, padding='post'),
#     tf.keras.preprocessing.sequence.pad_sequences(data_french_val, padding='post')

# ))

# Train

In [1]:
%tensorflow_version 2.x
import tensorflow as tf
print("Tensorflow version " + tf.__version__)

tf.random.set_seed(8080)
# make sure numpy seeded

Tensorflow version 2.2.0-rc2


In [2]:

try:
  tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
  print('\n\n Running on TPU ', tpu.cluster_spec().as_dict()['worker'])
except ValueError as e:
  print(e)
  print("\n\n Not using TPU")
  tpu = None
  # raise BaseException('ERROR: Not connected to a TPU runtime; please see the previous cell in this notebook for instructions!')

if tpu:
  tf.config.experimental_connect_to_cluster(tpu)
  tf.tpu.experimental.initialize_tpu_system(tpu)
  tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)

Please provide a TPU Name to connect to.


 Not using TPU


In [3]:
from pathlib import Path
import os
from collections import Counter
import numpy as np
import time
from tqdm.notebook import tqdm

np.random.seed(5050)

data_path = Path("/content/drive/My Drive/Adv Projects in ML/data")
print(data_path)
print(os.listdir(data_path))

!nvidia-smi

/content/drive/My Drive/Adv Projects in ML/data
['train.lang2', 'unaligned.en', 'unaligned.fr', 'train.lang1', 'split_train.lang1', 'split_val.lang2', 'split_train.lang2', 'split_val.lang1', 'unalignedtry.en']
Sat Apr  4 21:28:26 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.64.00    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P0    25W / 250W |      0MiB / 16280MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-------------------------------------

In [4]:
# read data
with open(data_path/"split_train.lang1","r") as f:
    english = f.read()
print(len(english.split("\n")), english[:200])
    
with open(data_path/"split_train.lang2","r") as f:
    french = f.read()
print(len(french.split("\n")), french[:200])

with open(data_path/"split_val.lang1","r") as f:
    english_val = f.read()
print(len(english_val.split("\n")), english_val[:200])

with open(data_path/"split_val.lang2","r") as f:
    french_val = f.read()
print(len(french_val.split("\n")), french_val[:200])

# create vocab
english_vocab = list(set(english.replace("\n", " <eos> ").split()))
french_vocab = list(set(french.replace("\n", " <eos> ").split()))
len(english_vocab), len(french_vocab)

english_counter = Counter(english.replace("\n", " <eos> ").split())
french_counter = Counter(french.replace("\n", " <eos> ").split())
len(english_counter), len(french_counter)

english_counter.update({"<unk>":0})
french_counter.update({"<unk>":0})
english_counter.update({"<start>":0})
french_counter.update({"<start>":0})
len(english_counter), len(french_counter)

english_vocab = list(english_counter.keys())
french_vocab = list(french_counter.keys())

english_word2id = {}
english_id2word = {}
french_word2id = {}
french_id2word = {}

# start enumerate from 1 so that 0 is reserved for padding seqs 
for i, w in enumerate(english_vocab, start=1):
  english_word2id[w] = i
  english_id2word[i] = w

for i, w in enumerate(french_vocab, start=1):
  french_word2id[w] = i
  french_id2word[i] = w

len(english_word2id), len(english_id2word), len(french_word2id), len(french_id2word)

def transform_data(english_lang1, french_lang2):
  english_lines = english_lang1.split("\n")
  french_lines = french_lang2.split("\n")

  data_english = []
  data_french = []

  for line in english_lines:
    line2id = [english_word2id["<start>"]]
    for word in line.split():
      try:
        line2id.append(english_word2id[word])
      except:
        line2id.append(english_word2id["<unk>"])
    line2id.append(english_word2id["<eos>"])
    data_english.append(line2id)


  for line in french_lines:
    line2id = [french_word2id["<start>"]]
    for word in line.split():
      try:
        line2id.append(french_word2id[word])
      except:
        line2id.append(french_word2id["<unk>"])
    line2id.append(french_word2id["<eos>"])
    data_french.append(line2id)

  print(len(data_english), len(data_french))
  return data_english, data_french

data_english, data_french = transform_data(english, french)
data_english_val, data_french_val = transform_data(english_val, french_val)

len(data_english), len(data_french), len(data_english_val), len(data_french) 

8800 as mr de castro is not present mr le foll who is replacing mr de castro has the floor
on the other hand if you 're visiting an underdeveloped country and 25 dollars buys you a gourmet meal it 's exorb
8800 Comme M. De Castro est absent , M. Le Foll , qui le remplace , a la parole .
D' un autre côté , si vous êtes dans un pays en voie de développement , où 25 dollars peuvent vous obtenir un repas de luxe
2200 what action does the council intend to take in the face of this seriously discriminatory attitude which runs contrary to the principles of the eu
where would you like to go next
if that were not enoug
2200 Quelles mesures le Conseil compte-t-il adopter face à cette attitude qui constitue une grave discrimination et est contraire aux principes sur lesquels l' Union européenne est fondée ?
Où souhaiteriez
8800 8800
2200 2200


(8800, 8800, 2200, 8800)

In [0]:
class Encoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
    super(Encoder, self).__init__()
    self.batch_sz = batch_sz
    self.enc_units = enc_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(self.enc_units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')

  def call(self, x, hidden):
    x = self.embedding(x)
    output, state = self.gru(x, initial_state = hidden)
    return output, state

  def initialize_hidden_state(self):
    return tf.zeros((self.batch_sz, self.enc_units))

In [0]:
class BahdanauAttention(tf.keras.layers.Layer):
  def __init__(self, units):
    super(BahdanauAttention, self).__init__()
    self.W1 = tf.keras.layers.Dense(units)
    self.W2 = tf.keras.layers.Dense(units)
    self.V = tf.keras.layers.Dense(1)

  def call(self, query, values):
    # query hidden state shape == (batch_size, hidden size)
    # query_with_time_axis shape == (batch_size, 1, hidden size)
    # values shape == (batch_size, max_len, hidden size)
    # we are doing this to broadcast addition along the time axis to calculate the score
    query_with_time_axis = tf.expand_dims(query, 1)

    # score shape == (batch_size, max_length, 1)
    # we get 1 at the last axis because we are applying score to self.V
    # the shape of the tensor before applying self.V is (batch_size, max_length, units)
    score = self.V(tf.nn.tanh(
        self.W1(query_with_time_axis) + self.W2(values)))

    # attention_weights shape == (batch_size, max_length, 1)
    attention_weights = tf.nn.softmax(score, axis=1)

    # context_vector shape after sum == (batch_size, hidden_size)
    context_vector = attention_weights * values
    context_vector = tf.reduce_sum(context_vector, axis=1)

    return context_vector, attention_weights

In [0]:
class Decoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
    super(Decoder, self).__init__()
    self.batch_sz = batch_sz
    self.dec_units = dec_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(self.dec_units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')
    self.fc = tf.keras.layers.Dense(vocab_size)

    # used for attention
    self.attention = BahdanauAttention(self.dec_units)

  def call(self, x, hidden, enc_output):
    # enc_output shape == (batch_size, max_length, hidden_size)
    context_vector, attention_weights = self.attention(hidden, enc_output)

    # x shape after passing through embedding == (batch_size, 1, embedding_dim)
    x = self.embedding(x)

    # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
    x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

    # passing the concatenated vector to the GRU
    output, state = self.gru(x)

    # output shape == (batch_size * 1, hidden_size)
    output = tf.reshape(output, (-1, output.shape[2]))

    # output shape == (batch_size, vocab)
    x = self.fc(output)

    return x, state, attention_weights

In [8]:
vocab_inp_size = len(english_vocab)+1
vocab_tar_size = len(french_vocab)+1
embedding_dim = 256
units = 256
if tpu:
  BATCH_SIZE = 128 * tpu_strategy.num_replicas_in_sync
else:
  BATCH_SIZE = 64

BUFFER_SIZE = len(data_english)
steps_per_epoch = len(data_english)//BATCH_SIZE
print("Number of batches = ", steps_per_epoch)

Number of batches =  137


In [9]:
# with tpu_strategy.scope():
#   encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)
#   attention_layer = BahdanauAttention(10)
#   decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)
#   optimizer = tf.keras.optimizers.Adam()
#   loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
#       from_logits=True, reduction='none')

#   def loss_function(real, pred):
#     mask = tf.math.logical_not(tf.math.equal(real, 0))
#     loss_ = loss_object(real, pred)

#     mask = tf.cast(mask, dtype=loss_.dtype)
#     loss_ *= mask

#     return tf.reduce_mean(loss_)

#   @tf.function
#   def train_step(inp, targ, enc_hidden):
#     loss = 0

#     with tf.GradientTape() as tape:
#       enc_output, enc_hidden = encoder(inp, enc_hidden)

#       dec_hidden = enc_hidden

#       dec_input = tf.expand_dims([french_word2id['<start>']] * BATCH_SIZE, 1)

#       # Teacher forcing - feeding the target as the next input
#       for t in range(1, targ.shape[1]):
#         # passing enc_output to the decoder
#         predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)

#         loss += loss_function(targ[:, t], predictions)

#         # using teacher forcing
#         dec_input = tf.expand_dims(targ[:, t], 1)

#     batch_loss = (loss / int(targ.shape[1]))

#     variables = encoder.trainable_variables + decoder.trainable_variables

#     gradients = tape.gradient(loss, variables)

#     optimizer.apply_gradients(zip(gradients, variables))

#     return batch_loss
  
#   tensor_train = tf.data.Dataset.from_tensor_slices((
#     tf.keras.preprocessing.sequence.pad_sequences(data_english, padding='post'),
#     tf.keras.preprocessing.sequence.pad_sequences(data_french, padding='post')
#   )).shuffle(BUFFER_SIZE).batch(BATCH_SIZE,drop_remainder=True)
#   tensor_val = tf.data.Dataset.from_tensor_slices((
#       tf.keras.preprocessing.sequence.pad_sequences(data_english_val, padding='post'),
#       tf.keras.preprocessing.sequence.pad_sequences(data_french_val, padding='post')
#   ))

# # with tpu_strategy.scope():
# EPOCHS = 5
# ini = time.time()
# for epoch in range(EPOCHS):
#   start1 = time.time()
  
#   enc_hidden = encoder.initialize_hidden_state()
  
#   total_loss = 0

#   for (batch, (inp, targ)) in tqdm(enumerate(tensor_train.take(steps_per_epoch))):
#     # start = time.time()
    
#     batch_loss = train_step(inp, targ, enc_hidden)
#     total_loss += batch_loss

#     print("Epoch: ", epoch+1, " Batch: ", batch+1)

#     if batch % 100 == 0:
#       print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
#                                                   batch,
#                                                   batch_loss.numpy()))
#       # print('Time taken for 100 batches {} sec\n'.format(time.time() - start))
      
#   # # saving (checkpoint) the model every 2 epochs
#   # if (epoch + 1) % 2 == 0:
#   #   checkpoint.save(file_prefix = checkpoint_prefix)

#   print('Epoch {} Loss {:.4f}'.format(epoch + 1,
#                                       total_loss / steps_per_epoch))
#   print('Time taken for 1 epoch {} sec\n'.format(time.time() - start1))
# print("total time: ", time.time()-ini)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch:  1  Batch:  1
Epoch 1 Batch 0 Loss 2.0131
Epoch:  1  Batch:  2
Epoch:  1  Batch:  3
Epoch:  1  Batch:  4
Epoch:  1  Batch:  5
Epoch:  1  Batch:  6
Epoch:  1  Batch:  7
Epoch:  1  Batch:  8

Epoch 1 Loss 2.0280
Time taken for 1 epoch 198.30731749534607 sec



HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch:  2  Batch:  1
Epoch 2 Batch 0 Loss 1.9172
Epoch:  2  Batch:  2
Epoch:  2  Batch:  3
Epoch:  2  Batch:  4
Epoch:  2  Batch:  5
Epoch:  2  Batch:  6
Epoch:  2  Batch:  7
Epoch:  2  Batch:  8

Epoch 2 Loss 1.6586
Time taken for 1 epoch 95.1074047088623 sec



HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch:  3  Batch:  1
Epoch 3 Batch 0 Loss 1.4447
Epoch:  3  Batch:  2
Epoch:  3  Batch:  3
Epoch:  3  Batch:  4
Epoch:  3  Batch:  5
Epoch:  3  Batch:  6
Epoch:  3  Batch:  7
Epoch:  3  Batch:  8

Epoch 3 Loss 1.4279
Time taken for 1 epoch 94.59843873977661 sec



HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch:  4  Batch:  1
Epoch 4 Batch 0 Loss 1.4466
Epoch:  4  Batch:  2
Epoch:  4  Batch:  3
Epoch:  4  Batch:  4
Epoch:  4  Batch:  5
Epoch:  4  Batch:  6
Epoch:  4  Batch:  7
Epoch:  4  Batch:  8

Epoch 4 Loss 1.4260
Time taken for 1 epoch 94.49924516677856 sec



HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch:  5  Batch:  1
Epoch 5 Batch 0 Loss 1.3925
Epoch:  5  Batch:  2
Epoch:  5  Batch:  3
Epoch:  5  Batch:  4
Epoch:  5  Batch:  5
Epoch:  5  Batch:  6
Epoch:  5  Batch:  7
Epoch:  5  Batch:  8

Epoch 5 Loss 1.4076
Time taken for 1 epoch 94.85381293296814 sec

total time:  577.3671452999115


In [0]:
if tpu:
  with tpu_strategy.scope():
    encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)
    attention_layer = BahdanauAttention(10)
    decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)
    optimizer = tf.keras.optimizers.Adam()
    loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction='none')

    def loss_function(real, pred):
      mask = tf.math.logical_not(tf.math.equal(real, 0))
      loss_ = loss_object(real, pred)

      mask = tf.cast(mask, dtype=loss_.dtype)
      loss_ *= mask

      return tf.reduce_mean(loss_)

    # training_model.compile(
    #     optimizer=tf.keras.optimizers.Adam(),
    #     loss='sparse_categorical_crossentropy',
    #     metrics=['sparse_categorical_accuracy'])

else:
    encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)
    attention_layer = BahdanauAttention(10)
    decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)
    optimizer = tf.keras.optimizers.Adam()
    loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction='none')

    def loss_function(real, pred):
      mask = tf.math.logical_not(tf.math.equal(real, 0))
      loss_ = loss_object(real, pred)

      mask = tf.cast(mask, dtype=loss_.dtype)
      loss_ *= mask

      return tf.reduce_mean(loss_)

In [0]:
@tf.function
def train_step(inp, targ, enc_hidden):
  loss = 0

  with tf.GradientTape() as tape:
    enc_output, enc_hidden = encoder(inp, enc_hidden)

    dec_hidden = enc_hidden

    dec_input = tf.expand_dims([french_word2id['<start>']] * targ.get_shape()[0], 1)

    # Teacher forcing - feeding the target as the next input
    for t in range(1, targ.shape[1]):
      # passing enc_output to the decoder
      predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)

      loss += loss_function(targ[:, t], predictions)

      # using teacher forcing
      dec_input = tf.expand_dims(targ[:, t], 1)

  batch_loss = (loss / int(targ.shape[1]))

  variables = encoder.trainable_variables + decoder.trainable_variables

  gradients = tape.gradient(loss, variables)

  optimizer.apply_gradients(zip(gradients, variables))

  return batch_loss

@tf.function
def val_step(inp, targ, enc_hidden):
  loss = 0

  enc_output, enc_hidden = encoder(inp, enc_hidden)

  dec_hidden = enc_hidden

  dec_input = tf.expand_dims([french_word2id['<start>']] * targ.get_shape()[0], 1)

  # Teacher forcing - feeding the target as the next input
  for t in range(1, targ.shape[1]):
    # passing enc_output to the decoder
    predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)

    loss += loss_function(targ[:, t], predictions)

    # using teacher forcing
    dec_input = tf.expand_dims(targ[:, t], 1)

  batch_loss = (loss / int(targ.shape[1]))

  return batch_loss

In [0]:
tensor_train = tf.data.Dataset.from_tensor_slices((
    tf.keras.preprocessing.sequence.pad_sequences(data_english, padding='post'),
    tf.keras.preprocessing.sequence.pad_sequences(data_french, padding='post')
)).shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
tensor_val = tf.data.Dataset.from_tensor_slices((
    tf.keras.preprocessing.sequence.pad_sequences(data_english_val, padding='post'),
    tf.keras.preprocessing.sequence.pad_sequences(data_french_val, padding='post')
)).batch(BATCH_SIZE, drop_remainder=True)

In [13]:
EPOCHS = 10

ini = time.time()
for epoch in range(EPOCHS):
  start = time.time()
  
  enc_hidden = encoder.initialize_hidden_state()
  
  total_loss = 0

  for (batch, (inp, targ)) in tqdm(enumerate(tensor_train.take(steps_per_epoch))):
    
    # print("Epoch: ", epoch+1, " Batch: ", batch+1)

    batch_loss = train_step(inp, targ, enc_hidden)
    total_loss += batch_loss

    # if batch % 100 == 0:
    #   print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
    #                                               batch,
    #                                               batch_loss.numpy()))
      
  # # saving (checkpoint) the model every 2 epochs
  # if (epoch + 1) % 2 == 0:
  #   checkpoint.save(file_prefix = checkpoint_prefix)

  print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                      total_loss / steps_per_epoch))
  print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

  

print('total time = ', time.time()-ini)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch 1 Batch 0 Loss 0.9640
Epoch 1 Batch 100 Loss 1.2387

Epoch 1 Loss 1.1010
Time taken for 1 epoch 49.00045585632324 sec



HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch 2 Batch 0 Loss 1.0092
Epoch 2 Batch 100 Loss 1.1590

Epoch 2 Loss 1.0638
Time taken for 1 epoch 49.04193377494812 sec



HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch 3 Batch 0 Loss 1.0047
Epoch 3 Batch 100 Loss 0.9370

Epoch 3 Loss 1.0335
Time taken for 1 epoch 49.01347017288208 sec



HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch 4 Batch 0 Loss 1.0092
Epoch 4 Batch 100 Loss 0.9541

Epoch 4 Loss 1.0074
Time taken for 1 epoch 48.92790937423706 sec



HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch 5 Batch 0 Loss 0.9882
Epoch 5 Batch 100 Loss 1.0469

Epoch 5 Loss 0.9825
Time taken for 1 epoch 48.88078165054321 sec



HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch 6 Batch 0 Loss 0.9463
Epoch 6 Batch 100 Loss 0.9738

Epoch 6 Loss 0.9594
Time taken for 1 epoch 48.86037349700928 sec



HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch 7 Batch 0 Loss 1.0684
Epoch 7 Batch 100 Loss 0.9558

Epoch 7 Loss 0.9386
Time taken for 1 epoch 49.02456188201904 sec



HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch 8 Batch 0 Loss 1.0188
Epoch 8 Batch 100 Loss 0.8730

Epoch 8 Loss 0.9188
Time taken for 1 epoch 48.88335585594177 sec



HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch 9 Batch 0 Loss 0.9889
Epoch 9 Batch 100 Loss 0.8640

Epoch 9 Loss 0.9003
Time taken for 1 epoch 48.81165909767151 sec



HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Epoch 10 Batch 0 Loss 1.0143
Epoch 10 Batch 100 Loss 0.7840

Epoch 10 Loss 0.8818
Time taken for 1 epoch 48.86402630805969 sec

total time =  489.3096933364868


In [28]:
enc_hidden = encoder.initialize_hidden_state()  
total_loss = 0

for (batch, (inp, targ)) in tqdm(enumerate(tensor_val.take(steps_per_epoch))):
  batch_loss = val_step(inp, targ, enc_hidden)
  total_loss += batch_loss

total_loss = total_loss / steps_per_epoch
print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                    total_loss))

# if (epoch + 1) % 2 == 0:
#   checkpoint.save(file_prefix = checkpoint_prefix)

  # if batch % 100 == 0:
  #   print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
  #                                               batch,
  #                                               batch_loss.numpy()))


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Epoch 10 Loss 0.2768
