In [1]:
import tensorflow as tf
import numpy as np
import re
import io
import os
import unicodedata
import time
from sklearn.model_selection import train_test_split
from google.colab import drive
drive.mount('/gdrive')
os.chdir('/gdrive/My Drive/nmt')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /gdrive


In [2]:
!pip install import_ipynb
import import_ipynb

Collecting import_ipynb
  Downloading https://files.pythonhosted.org/packages/63/35/495e0021bfdcc924c7cdec4e9fbb87c88dd03b9b9b22419444dc370c8a45/import-ipynb-0.1.3.tar.gz
Building wheels for collected packages: import-ipynb
  Building wheel for import-ipynb (setup.py) ... [?25l[?25hdone
  Created wheel for import-ipynb: filename=import_ipynb-0.1.3-cp36-none-any.whl size=2976 sha256=a893ec0b747fe688d68e6c689ad143f88bba98d43a3b62ebc3cb7363873d4e3e
  Stored in directory: /root/.cache/pip/wheels/b4/7b/e9/a3a6e496115dffdb4e3085d0ae39ffe8a814eacc44bbf494b5
Successfully built import-ipynb
Installing collected packages: import-ipynb
Successfully installed import-ipynb-0.1.3


In [3]:
from enc_dec import Encoder, BahdanauAttention , Decoder

In [4]:
path = "spa-eng/spa-eng/spa.txt"

In [5]:
def preprocess_sentence(w):
  w = re.sub(r"([?.!,¿])", r" \1 ", w)# creating a space between a word and the punctuation following it
  w = re.sub(r'[" "]+', " ", w)
  w = re.sub(r"[^a-zA-Z?.!,¿]+", " ", w)# replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")

  w = w.strip()
  w = '<start> ' + w + ' <end>'
  return w

In [6]:
en_sentence = u"What would you like to drink?"
sp_sentence = "¿Puedo tomar prestado este libro?"
print(preprocess_sentence(en_sentence))
print(preprocess_sentence(sp_sentence).encode('utf-8'))

<start> What would you like to drink ? <end>
b'<start> \xc2\xbf Puedo tomar prestado este libro ? <end>'


In [7]:
def create_dataset(path, num_examples):
  lines = io.open(path, encoding = 'UTF-8').read().strip().split('\n')
  word_pairs = [[preprocess_sentence(x) for x in l.split('\t')] for l in lines[:num_examples]]
  return zip(*word_pairs)

In [8]:
def tokenize(lang):
  lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
  lang_tokenizer.fit_on_texts(lang)

  tensor = lang_tokenizer.texts_to_sequences(lang)
  tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding = 'post')

  return tensor, lang_tokenizer

In [9]:
def load_dataset(path, num_examples = None):
  inp_lang, targ_lang = create_dataset(path, num_examples)

  input_tensor, inp_lang_tokenizer = tokenize(inp_lang)
  target_tensor, targ_lang_tokenizer = tokenize(targ_lang)
  return input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer 

In [10]:
num_examples = 30000
input_tensor, target_tensor, inp_lang, targ_lang = load_dataset(path, num_examples)
max_length_targ, max_length_inp = target_tensor.shape[1], input_tensor.shape[1]

In [11]:
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.2)
print(len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val))

24000 24000 6000 6000


In [12]:
def convert(lang, tensor):
  for t in tensor:
    if t!=0:
      print ("%d ----> %s" % (t, lang.index_word[t]))

In [13]:
print ("Input Language; index to word mapping")
convert(inp_lang, input_tensor_train[0])
print ()
print ("Target Language; index to word mapping")
convert(targ_lang, target_tensor_train[0])

Input Language; index to word mapping
1 ----> <start>
56 ----> please
94 ----> take
9 ----> a
698 ----> bath
3 ----> .
2 ----> <end>

Target Language; index to word mapping
1 ----> <start>
25 ----> por
55 ----> favor
426 ----> ba
4314 ----> ate
3 ----> .
2 ----> <end>


In [14]:
BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 64
steps_per_epoch = BUFFER_SIZE // BATCH_SIZE
embedding_dim = 256
units = 1024
vocab_input_size = len(inp_lang.index_word) + 1
vocab_target_size = len(targ_lang.index_word) + 1

dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder =True)

In [15]:
example_input_batch, example_target_batch = next(iter(dataset))
example_input_batch.shape, example_target_batch.shape

(TensorShape([64, 11]), TensorShape([64, 18]))

In [16]:
encoder = Encoder(vocab_input_size, embedding_dim, units, BATCH_SIZE)
sample_hidden = tf.zeros((BATCH_SIZE, units))
sample_output, sample_hidden = encoder(example_input_batch, sample_hidden)

print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print ('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden.shape))

Encoder output shape: (batch size, sequence length, units) (64, 11, 1024)
Encoder Hidden state shape: (batch size, units) (64, 1024)


In [17]:
attention_layer = BahdanauAttention(10)
attention_result, attention_weights = attention_layer(sample_hidden, sample_output)

print("Attention result shape: (batch size, units) {}".format(attention_result.shape))
print("Attention weights shape: (batch_size, sequence_length, 1) {}".format(attention_weights.shape))

Attention result shape: (batch size, units) (64, 1024)
Attention weights shape: (batch_size, sequence_length, 1) (64, 11, 1)


In [18]:
decoder = Decoder(vocab_target_size, embedding_dim, units, BATCH_SIZE)
sample_decoder_output, _ , _ = decoder(tf.random.uniform((BATCH_SIZE, 1)), sample_hidden, sample_output)
print ('Decoder output shape: (batch_size, vocab size) {}'.format(sample_decoder_output.shape))

Decoder output shape: (batch_size, vocab size) (64, 8895)


In [19]:
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits = True, reduction = 'none')

def loss_function(real, pred):
  mask = tf.math.logical_not(tf.math.equal(real, 0))
  loss = loss_object(real, pred)

  mask = tf.cast(mask, dtype = loss.dtype)
  loss*=mask

  return tf.reduce_mean(loss)

In [20]:
@tf.function
def train_step(inp, targ, enc_hidden):
  loss = 0
  with tf.GradientTape() as tape:
    enc_output, enc_hidden = encoder(inp, enc_hidden)

    dec_hidden = enc_hidden

    dec_input = tf.expand_dims([targ_lang.word_index['<start>']] * BATCH_SIZE, 1)

    for t in range(1, targ.shape[1]):
      predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)

      loss += loss_function(targ[:,t], predictions)

      dec_input = tf.expand_dims(targ[:, t], 1)

  batch_loss = (loss / int(targ.shape[1]))
  variables = encoder.trainable_variables + decoder.trainable_variables
  gradients = tape.gradient(loss, variables)
  optimizer.apply_gradients(zip(gradients, variables))

  return batch_loss

In [21]:
EPOCHS = 10

for epoch in range(EPOCHS):
  start = time.time()

  enc_hidden = tf.zeros((BATCH_SIZE, units))
  total_loss = 0

  for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
    batch_loss = train_step(inp, targ, enc_hidden)
    total_loss += batch_loss

    if ((batch%100) == 0):
      print("Epoch {}  Batch {}  Loss{:.4f}".format(epoch + 1, batch, total_loss.numpy()))


  print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                      total_loss / steps_per_epoch))
  print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

Epoch 1  Batch 0  Loss3.0154
Epoch 1  Batch 100  Loss179.2935
Epoch 1  Batch 200  Loss330.5957
Epoch 1  Batch 300  Loss470.1703
Epoch 1 Loss 1.5074
Time taken for 1 epoch 63.38821816444397 sec

Epoch 2  Batch 0  Loss1.1754
Epoch 2  Batch 100  Loss116.4235
Epoch 2  Batch 200  Loss225.9061
Epoch 2  Batch 300  Loss327.1769
Epoch 2 Loss 1.0651
Time taken for 1 epoch 46.74494242668152 sec

Epoch 3  Batch 0  Loss0.8463
Epoch 3  Batch 100  Loss85.1226
Epoch 3  Batch 200  Loss166.9136
Epoch 3  Batch 300  Loss245.8074
Epoch 3 Loss 0.8031
Time taken for 1 epoch 46.56194472312927 sec

Epoch 4  Batch 0  Loss0.7033
Epoch 4  Batch 100  Loss61.9215
Epoch 4  Batch 200  Loss122.6227
Epoch 4  Batch 300  Loss181.1514
Epoch 4 Loss 0.5992
Time taken for 1 epoch 46.607561349868774 sec

Epoch 5  Batch 0  Loss0.4731
Epoch 5  Batch 100  Loss44.0127
Epoch 5  Batch 200  Loss88.3957
Epoch 5  Batch 300  Loss132.4809
Epoch 5 Loss 0.4397
Time taken for 1 epoch 46.67987251281738 sec

Epoch 6  Batch 0  Loss0.2967
Epoc

In [23]:
encoder.save_weights('encoder.h5')
decoder.save_weights('decoder.h5')

In [26]:
inplang_json = inp_lang.to_json()
with open("inp_lang.json", "w") as json_file:
    json_file.write(inplang_json)

targlang_json = targ_lang.to_json()
with open("targ_lang.json", "w") as json_file:
    json_file.write(targlang_json)

In [None]:
def evaluate(sentence):
  sentence = [preprocess_sentence(sentence)]
  inputs = inp_lang.texts_to_sequences(sentence)
  inputs = tf.keras.preprocessing.sequence.pad_sequences(inputs, maxlen = max_length_inp, padding='post')

  hidden = tf.zeros((1, units))
  enc_out, enc_hidden = encoder(inputs, hidden)

  dec_hidden = enc_hidden
  dec_input = tf.expand_dims([targ_lang.word_index['<start>']], 0)
  
  result = ''

  for t in range(max_length_targ):
    predictions, dec_hidden, attention_weights = decoder(dec_input, dec_hidden, enc_out)

    predicted_id = tf.argmax(predictions[0]).numpy()

    if targ_lang.index_word[predicted_id] == '<end>':
      return result, sentence

    result += targ_lang.index_word[predicted_id] + ' '


    dec_input = tf.expand_dims([predicted_id], 0)

  return result, sentence

In [None]:
def translate(sentence):
  result, sentences = evaluate(sentence)

  print("Input : %s" % (sentence))
  print("Predicted Translate : {}".format(result))

In [None]:
translate('Do not Run!')

Input : Do not Run!
Predicted Translate : no corras ! 
