In [1]:
import tensorflow as tf
from tensorflow import keras
import tensorflow_datasets as tfds
import numpy as np
from keras import backend as K
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import re
import unicodedata
import os
import time
import io
from sklearn.model_selection import train_test_split

In [2]:
path_to_zip = tf.keras.utils.get_file(
    'spa-eng.zip', origin='http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip',
    extract=True)

path_to_file = os.path.dirname(path_to_zip)+"/spa-eng/spa.txt"


In [3]:
ex_sentence=u'hello guys this is good'

In [4]:
sent=ex_sentence.lower().strip()

In [5]:
dd=unicodedata.normalize('NFD',sent)

In [6]:
unicodedata.category(dd[0])

'Ll'

In [7]:
# converts unicode file to ascii
def unicode_to_ascii(s):
    return ''.join(c for c in unicodedata.normalize('NFD',s) if unicodedata.category(c)!='Mn')
def preprocess_sentence(w):
    w=unicode_to_ascii(w.lower().strip())
      # creating a space between a word and the punctuation following it
      # eg: "he is a boy." => "he is a boy ."
    w = re.sub(r"([?.!,¿])", r" \1 ", w)
    w = re.sub(r'[" "]+', " ", w)
     # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
    w = re.sub(r"[^a-zA-Z?.!,¿]+", " ", w)
    w=w.strip()
    # adding a start and an end token to the sentence
    # so that the model know when to start and stop predicting.
    w = '<start> ' + w + ' <end>'
    
    return w


In [8]:
en_sentence = u"May I borrow this book?"
sp_sentence = u"¿Puedo tomar prestado este libro?"
print(preprocess_sentence(en_sentence))
print(preprocess_sentence(sp_sentence).encode('utf-8'))

<start> may i borrow this book ? <end>
b'<start> \xc2\xbf puedo tomar prestado este libro ? <end>'


In [9]:
# remove the accents
# clean the sentences
# crete a dataset in format[english,spanish]

def create_pairs(path,number_of_examples):
    lines=io.open(path,encoding='UTF-8').read().strip().split('\n')
    word_pairs=[[preprocess_sentence(w) for w in l.split('\t')] for l in lines[:number_of_examples]]
    return zip(*word_pairs)

In [10]:
lines=io.open(path_to_file,encoding='UTF-8').read().strip().split('\n')

In [11]:
pair=create_pairs(path_to_file,None)

In [12]:
en,sp=pair

In [13]:
def tokenize(lang):
    lang_tokenizer=keras.preprocessing.text.Tokenizer(filters='',char_level=False)
    lang_tokenizer.fit_on_texts(lang)
    tensor=lang_tokenizer.texts_to_sequences(lang)
    tensor=tf.keras.preprocessing.sequence.pad_sequences(tensor,padding='post')
    return tensor,lang_tokenizer

In [14]:
def load_dataset(path,examples=None):
    targ_lang,inp_lang=create_pairs(path,examples)
    
    input_tensor,input_lang_tokenizer=tokenize(inp_lang)
    output_tensor,output_lang_tokenizer=tokenize(targ_lang)
    
    return input_tensor,input_lang_tokenizer,output_tensor,output_lang_tokenizer

    

In [15]:
input_tensor,inp_lang,target_tensor,out_lang=load_dataset(path_to_file)

In [16]:
word=en[-100]
print(word)

<start> i came here to see if there was something i could do to help , but there doesn t seem to be anything for me to do . <end>


In [17]:
max_length_input=input_tensor.shape[1]
max_length_output=target_tensor.shape[1]

In [18]:
max_length_input

53

In [19]:
max_length_output

51

In [20]:
x_train,x_test,y_train,y_test=train_test_split(input_tensor,target_tensor,test_size=0.2)

In [21]:
buffer_size=len(x_train)
batch_size=64
steps_per_epochs=int(buffer_size/batch_size)
embedding_dim=256
units=1024
vocab_size_input=len(inp_lang.word_index)+1
vocab_size_output=len(out_lang.word_index)+1


In [22]:
print(steps_per_epochs)
print(vocab_size_input)
print(vocab_size_output)

1487
24794
12934


In [23]:
dataset=tf.data.Dataset.from_tensor_slices((x_train,y_train)).shuffle(buffer_size)
dataset=dataset.batch(batch_size,drop_remainder=True)

In [24]:
for i in dataset.take(1):
    print(i)

(<tf.Tensor: shape=(64, 53), dtype=int32, numpy=
array([[  1,  16,   4, ...,   0,   0,   0],
       [  1,   6, 488, ...,   0,   0,   0],
       [  1,  12,  78, ...,   0,   0,   0],
       ...,
       [  1, 101, 383, ...,   0,   0,   0],
       [  1,  42,   8, ...,   0,   0,   0],
       [  1,  13,  39, ...,   0,   0,   0]])>, <tf.Tensor: shape=(64, 51), dtype=int32, numpy=
array([[  1,   4,  45, ...,   0,   0,   0],
       [  1,   5, 315, ...,   0,   0,   0],
       [  1,  79,   7, ...,   0,   0,   0],
       ...,
       [  1,  14,  15, ...,   0,   0,   0],
       [  1,  17,  11, ...,   0,   0,   0],
       [  1,   4,  33, ...,   0,   0,   0]])>)


#### trying book's version of writing a basic encoder decoder network

In [25]:
import tensorflow_addons as tfa

In [26]:
max_output=y_train.shape[1]

In [53]:
encoder_input=keras.layers.Input(shape=[None],dtype=np.int32)
decoder_input=keras.layers.Input(shape=[None],dtype=np.int32)
sequence_lengths=keras.layers.Input(shape=[],dtype=np.int32)

encoder_embedding=keras.layers.Embedding(vocab_size_input,embedding_dim)(encoder_input)
decoder_embedding_layer=keras.layers.Embedding(vocab_size_output+1,embedding_dim)
decoder_embedding=decoder_embedding_layer(decoder_input)

encoder=keras.layers.LSTM(512,return_state=True)
encoder_output,state_a,state_c=encoder(encoder_embedding)
encoder_state=[state_a,state_c]

sampler=tfa.seq2seq.sampler.TrainingSampler()

decoder_cell=keras.layers.LSTMCell(512)
output_layer=keras.layers.Dense(vocab_size_output)


decoder=tfa.seq2seq.basic_decoder.BasicDecoder(decoder_cell,sampler,output_layer=output_layer)
final_output,final_state,final_sequence_length=decoder(decoder_embedding,initial_state=encoder_state,sequence_length=sequence_lengths,training=True)


y_proba=keras.layers.Activation('softmax')(final_output.rnn_output)

model=keras.Model(inputs=[encoder_input,decoder_input,sequence_lengths],outputs=[y_proba])


In [54]:
print(vocab_size_output)
print(vocab_size_input)

12934
24794


In [55]:
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_7 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
input_8 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding_4 (Embedding)         (None, None, 256)    6347264     input_7[0][0]                    
__________________________________________________________________________________________________
embedding_5 (Embedding)         (None, None, 256)    3311360     input_8[0][0]                    
____________________________________________________________________________________________

In [56]:
model.compile(loss='sparse_categorical_crossentropy',optimizer=keras.optimizers.Adam(),metrics=['accuracy'])

In [57]:
print(x_train.shape)
print(y_train.shape)

(95171, 53)
(95171, 51)


In [64]:
sos_vector=tf.fill(dims=(len(y_train),1),value=1)
x_decoder=tf.concat([sos_vector,y_train[:,:-1]],axis=1)

In [70]:
seq_lengths = np.full([95171],50 )
print(seq_lengths)

[50 50 50 ... 50 50 50]


In [71]:
history=model.fit([x_train[:,1:],y_train,seq_lengths],y_train[:,1:],epochs=2,validation_split=0.1)

Epoch 1/2
  17/2677 [..............................] - ETA: 3:33:12 - loss: 2.9464 - accuracy: 0.8269

KeyboardInterrupt: 

#### The following is the preferred way in tensorflow as shown in the website

In [30]:
class Encoder(keras.Model):
    def __init__(self,vocab_size,embedding_dim,enc_units,batch_size,**kwargs):
        super().__init__(**kwargs)
        self.batch_size=batch_size
        self.enc_units=enc_units
        self.embedding=keras.layers.Embedding(vocab_size,embedding_dim)
        self.gru=tf.keras.layers.GRU(self.enc_units,return_sequences=True,
                                    return_state=True,recurrent_initializer='glorot_uniform')
    
    def call(self,x,hidden_state):
        x=self.embedding(x)
        output,state=self.gru(x,initial_state=hidden_state)
        return output,state
    
    def initialize_hidden_state(self):
        return tf.zeros((self.batch_size,self.enc_units))
    
        

In [31]:
input_layer=keras.layers.Input(shape=[None],dtype=tf.int32)
layer=keras.layers.Embedding(vocab_size_input,embedding_dim)(input_layer)
gru=keras.layers.GRU(units,return_sequences=True,return_state=True)(layer)

In [32]:
print(layer)
print(gru[0])
print(gru[1])

Tensor("embedding/Identity:0", shape=(None, None, 256), dtype=float32)
Tensor("gru/Identity:0", shape=(None, None, 1024), dtype=float32)
Tensor("gru/Identity_1:0", shape=(None, 1024), dtype=float32)


In [33]:
query_with_time_axis=tf.expand_dims(gru[1],1)
print(query_with_time_axis)

Tensor("ExpandDims:0", shape=(None, 1, 1024), dtype=float32)


In [34]:
val1=keras.layers.Dense(units)(query_with_time_axis)
val2=keras.layers.Dense(units)(gru[1])
val=val1+val2
val3=tf.nn.tanh(val)
score=keras.layers.Dense(1)(val3)
attention=tf.nn.softmax(score,axis=1)
context=attention*val2
context_vector=tf.reduce_sum(context,axis=1)

In [35]:
print(val1)
print(val2)
print(val)
print(val3)
print(score)
print(attention)
print(context)
print(context_vector)

Tensor("dense/Identity:0", shape=(None, 1, 1024), dtype=float32)
Tensor("dense_1/Identity:0", shape=(None, 1024), dtype=float32)
Tensor("AddV2:0", shape=(None, None, 1024), dtype=float32)
Tensor("Tanh:0", shape=(None, None, 1024), dtype=float32)
Tensor("dense_2/Identity:0", shape=(None, None, 1), dtype=float32)
Tensor("Transpose_1:0", shape=(None, None, 1), dtype=float32)
Tensor("Mul:0", shape=(None, None, 1024), dtype=float32)
Tensor("Sum:0", shape=(None, 1024), dtype=float32)


now write a attention script which is bahndanau's attention algorithm 


In [36]:
class bahandanau_attention(tf.keras.layers.Layer):
    def __init__(self,units,**kwargs):
        super().__init__(**kwargs)
        self.w1=tf.keras.layers.Dense(units)
        self.w2=keras.layers.Dense(units)
        self.v=keras.layers.Dense(1)
        
    def call(self,query,values):
        query_with_time_axis=tf.expand_dims(query,1)
        score=self.v(tf.nn.tanh(self.w1(query_with_time_axis)+self.w2(values)))
        attention_weights=tf.nn.softmax(score,axis=1)
        context_vector=attention_weights*values
        context_vector=tf.reduce_sum(context_vector,axis=1)
        return context_vector,attention_weights
    

In [37]:
class Decoder(keras.Model):
    def __init__(self,vocab_size,embedding_dim,dec_units,batch_size,**kwargs):
        super().__init__(**kwargs)
        self.batch_size=batch_size
        self.dec_units=dec_units
        self.embedding=keras.layers.Embedding(vocab_size,embedding_dim)
        self.gru=keras.layers.GRU(self.dec_units,
                                 return_sequences=True,
                                 return_state=True,
                                 recurrent_initializer='glorot_uniform')
        self.fc=keras.layers.Dense(vocab_size)
        self.attention=bahandanau_attention(self.dec_units)
        
    def call(self,x,hidden,enc_output):
        context_vector,attention_weights=self.attention(hidden,enc_output)
        x=self.embedding(x)
        x=tf.concat([tf.expand_dims(context_vector,1),x],axis=-1)
        output,state=self.gru(x)
        output=tf.reshape(output,(-1,output.shape[2]))
        x=self.fc(output)
            
        return x,state,attention_weights

In [38]:
decoder_embedding=keras.layers.Embedding(vocab_size_output,embedding_dim)(input_layer)

In [39]:
print(tf.expand_dims(context_vector,1))
print(decoder_embedding)

Tensor("ExpandDims_1:0", shape=(None, 1, 1024), dtype=float32)
Tensor("embedding_1/Identity:0", shape=(None, None, 256), dtype=float32)


In [40]:
after_concat=tf.concat([tf.expand_dims(context_vector,1),decoder_embedding],axis=-1)
print(tf.concat([tf.expand_dims(context_vector,1),decoder_embedding],axis=-1))

Tensor("concat_1:0", shape=(None, 1, 1280), dtype=float32)


In [41]:
gru_output,gru_state=keras.layers.GRU(units,return_sequences=True,return_state=True,
                                      recurrent_initializer='glorot_uniform')(after_concat)
print(gru_output)
print(gru_state)

Tensor("gru_1/Identity:0", shape=(None, 1, 1024), dtype=float32)
Tensor("gru_1/Identity_1:0", shape=(None, 1024), dtype=float32)


In [42]:
output=tf.reshape(gru_output,(-1,gru_output.shape[2]))
print(output)
x=keras.layers.Dense(vocab_size_output)(output)
print(x)

Tensor("Reshape:0", shape=(None, 1024), dtype=float32)
Tensor("dense_3/Identity:0", shape=(None, 12934), dtype=float32)


In [43]:
decoder=Decoder(vocab_size_output,embedding_dim,units,batch_size)
encoder=Encoder(vocab_size_input,embedding_dim,units,batch_size)
encoder_initialized_state=encoder.initialize_hidden_state()

In [44]:
encoder_input_layer=keras.layers.Input(shape=[None],dtype=tf.int32)
decoder_input_layer=keras.layers.Input(shape=[None],dtype=tf.int32)
encoder_output,encoder_hidden_state=encoder(encoder_input_layer,encoder_initialized_state)
predictions,decoder_hidden_state,_=decoder(decoder_input_layer,encoder_hidden_state,encoder_output)
y_proba=tf.nn.softmax(predictions)


model1=keras.models.Model(inputs=[encoder_input_layer,decoder_input_layer],outputs=[y_proba])


In [45]:
model1.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
encoder (Encoder)               ((64, None, 1024), ( 10285568    input_2[0][0]                    
__________________________________________________________________________________________________
decoder (Decoder)               ((64, 12934), (64, 1 25752711    input_3[0][0]                    
______________________________________________________________________________________________

In [46]:
model.summary()

NameError: name 'model' is not defined

In [None]:
encoder_hidden_state

In [None]:
model.output

In [None]:
decoder_input=tf.expand_dims([out_lang.word_index['<start>']]*batch_size,1)

In [103]:
model1.compile(loss='sparse_categorical_crossentropy',optimizer='adam')
model1.fit([x_train[:,1:],x_train],y_train)

ValueError: in user code:

    C:\Users\user\AppData\Roaming\Python\Python37\site-packages\tensorflow\python\keras\engine\training.py:571 train_function  *
        outputs = self.distribute_strategy.run(
    <ipython-input-52-4c5bbcdfca0b>:17 call  *
        x=tf.concat([tf.expand_dims(context_vector,1),x],axis=-1)
    C:\Users\user\AppData\Roaming\Python\Python37\site-packages\tensorflow\python\util\dispatch.py:180 wrapper  **
        return target(*args, **kwargs)
    C:\Users\user\AppData\Roaming\Python\Python37\site-packages\tensorflow\python\ops\array_ops.py:1606 concat
        return gen_array_ops.concat_v2(values=values, axis=axis, name=name)
    C:\Users\user\AppData\Roaming\Python\Python37\site-packages\tensorflow\python\ops\gen_array_ops.py:1189 concat_v2
        "ConcatV2", values=values, axis=axis, name=name)
    C:\Users\user\AppData\Roaming\Python\Python37\site-packages\tensorflow\python\framework\op_def_library.py:744 _apply_op_helper
        attrs=attr_protos, op_def=op_def)
    C:\Users\user\AppData\Roaming\Python\Python37\site-packages\tensorflow\python\framework\func_graph.py:595 _create_op_internal
        compute_device)
    C:\Users\user\AppData\Roaming\Python\Python37\site-packages\tensorflow\python\framework\ops.py:3327 _create_op_internal
        op_def=op_def)
    C:\Users\user\AppData\Roaming\Python\Python37\site-packages\tensorflow\python\framework\ops.py:1817 __init__
        control_input_ops, op_def)
    C:\Users\user\AppData\Roaming\Python\Python37\site-packages\tensorflow\python\framework\ops.py:1657 _create_c_op
        raise ValueError(str(e))

    ValueError: Dimension 1 in both shapes must be equal, but are 1 and 53. Shapes are [64,1] and [?,53]. for '{{node model_6/decoder_1/concat}} = ConcatV2[N=2, T=DT_FLOAT, Tidx=DT_INT32](model_6/decoder_1/ExpandDims, model_6/decoder_1/embedding_13/embedding_lookup/Identity_1, model_6/decoder_1/concat/axis)' with input shapes: [64,1,1024], [?,53,256], [] and with computed input tensors: input[2] = <-1>.


In [None]:

encoder_input=keras.layers.Input(shape=[None],dtype=tf.int32)
decoder_input=keras.layers.Input(shape=[None],dtype=tf.int32)
layer1=encoder(encoder_input,encoder_initialized_state)
layer2=decoder(decoder_input,layer1[1],layer1[0])
model=keras.models.Model(inputs=[encoder_input,decoder_input],outputs=[layer2[0]])

In [None]:
model.summary()

In [None]:
model.compile(loss='sparse_categorical_crossentropy',optimizer=keras.optimizers.Adam(),metrics='accuracy')

In [112]:
model.fit([x_train[:,1:],x_train],y_train,epochs=3)

Epoch 1/3


ValueError: in user code:

    C:\Users\user\AppData\Roaming\Python\Python37\site-packages\tensorflow\python\keras\engine\training.py:571 train_function  *
        outputs = self.distribute_strategy.run(
    <ipython-input-30-4c5bbcdfca0b>:17 call  *
        x=tf.concat([tf.expand_dims(context_vector,1),x],axis=-1)
    C:\Users\user\AppData\Roaming\Python\Python37\site-packages\tensorflow\python\util\dispatch.py:180 wrapper  **
        return target(*args, **kwargs)
    C:\Users\user\AppData\Roaming\Python\Python37\site-packages\tensorflow\python\ops\array_ops.py:1606 concat
        return gen_array_ops.concat_v2(values=values, axis=axis, name=name)
    C:\Users\user\AppData\Roaming\Python\Python37\site-packages\tensorflow\python\ops\gen_array_ops.py:1189 concat_v2
        "ConcatV2", values=values, axis=axis, name=name)
    C:\Users\user\AppData\Roaming\Python\Python37\site-packages\tensorflow\python\framework\op_def_library.py:744 _apply_op_helper
        attrs=attr_protos, op_def=op_def)
    C:\Users\user\AppData\Roaming\Python\Python37\site-packages\tensorflow\python\framework\func_graph.py:595 _create_op_internal
        compute_device)
    C:\Users\user\AppData\Roaming\Python\Python37\site-packages\tensorflow\python\framework\ops.py:3327 _create_op_internal
        op_def=op_def)
    C:\Users\user\AppData\Roaming\Python\Python37\site-packages\tensorflow\python\framework\ops.py:1817 __init__
        control_input_ops, op_def)
    C:\Users\user\AppData\Roaming\Python\Python37\site-packages\tensorflow\python\framework\ops.py:1657 _create_c_op
        raise ValueError(str(e))

    ValueError: Dimension 1 in both shapes must be equal, but are 1 and 53. Shapes are [64,1] and [?,53]. for '{{node model_10/decoder_2/concat}} = ConcatV2[N=2, T=DT_FLOAT, Tidx=DT_INT32](model_10/decoder_2/ExpandDims, model_10/decoder_2/embedding_28/embedding_lookup/Identity_1, model_10/decoder_2/concat/axis)' with input shapes: [64,1,1024], [?,53,256], [] and with computed input tensors: input[2] = <-1>.


Trials

In [47]:
example_input_batch, example_target_batch = next(iter(dataset))
example_input_batch.shape, example_target_batch.shape

(TensorShape([64, 53]), TensorShape([64, 51]))

In [48]:
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden = encoder(example_input_batch, sample_hidden)

In [49]:
attention_layer=bahandanau_attention(10)
attention_result, attention_weights = attention_layer(sample_hidden, sample_output)

In [50]:
sample_decoder_output, _, _ = decoder(tf.random.uniform((batch_size, 1)),
                                      sample_hidden, sample_output)


In [51]:
mask=tf.math.logical_not(tf.math.equal(x_train[0],0))
print(mask)
mask2=tf.cast(mask,dtype=np.float32)
print(mask2)
out=x_train[0]*mask2
print(out)
returns=tf.reduce_mean(out)
print(returns)

tf.Tensor(
[ True  True  True  True  True  True  True False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False], shape=(53,), dtype=bool)
tf.Tensor(
[1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0.], shape=(53,), dtype=float32)
tf.Tensor(
[1.000e+00 1.887e+03 3.700e+01 1.600e+01 5.770e+02 3.000e+00 2.000e+00
 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00

#### Tried website subclassing and tried to train like a normal model. could'nt solve it (crying emoji)
hence using the @tf function for training . Apparently this is profesional method of training adding more flexibility

this is also the preferred method presented in tensorflow tutorials

defining optimizers and loss function

In [52]:
optimizer=keras.optimizers.Adam()
loss_object=keras.losses.SparseCategoricalCrossentropy(from_logits=True,reduction='none')
def loss_function(real,pred):
    mask=tf.math.logical_not(tf.math.equal(real,0))
    loss=loss_object(real,pred)
    
    mask=tf.cast(mask,loss.dtype)
    loss*=mask
    return tf.reduce_mean(loss)

In [53]:
@tf.function

def train_step(inp,target,enc_hidden):
    loss=0
    with tf.GradientTape() as tape:
        enc_output,enc_hidden=encoder(inp,enc_hidden)
        dec_hidden=enc_hidden
        
        dec_input=tf.expand_dims([out_lang.word_index['<start>']]*batch_size,1)
        # teacher forcing
        for i in range(1,target.shape[1]):
            predictions,dec_hidden,_=decoder(dec_input,dec_hidden,enc_output)
            loss+= loss_function(target[:,i],predictions)
            dec_input=tf.expand_dims(target[:,i],1)
        batch_loss=(loss/target.shape[1])
        variables=encoder.trainable_variables+decoder.trainable_variables
        gradients=tape.gradient(loss,variables)
        optimizer.apply_gradients(zip(gradients,variables))
        return batch_loss
            

object based saving

In [54]:
checkpoint_dir='./training_saved'
checkpoint_prefix=os.path.join(checkpoint_dir,'cptk')
checkpoint=tf.train.Checkpoint(optimizer=optimizer,ecoder=encoder,decoder=decoder)

In [55]:
for i in dataset.take(1):
    print(i)

(<tf.Tensor: shape=(64, 53), dtype=int32, numpy=
array([[    1,    44,    38, ...,     0,     0,     0],
       [    1,    12,     4, ...,     0,     0,     0],
       [    1,  1188, 14870, ...,     0,     0,     0],
       ...,
       [    1,    39,    14, ...,     0,     0,     0],
       [    1,    46,  1750, ...,     0,     0,     0],
       [    1,    89,    14, ...,     0,     0,     0]])>, <tf.Tensor: shape=(64, 51), dtype=int32, numpy=
array([[  1,  72,   4, ...,   0,   0,   0],
       [  1,  29,  57, ...,   0,   0,   0],
       [  1,   7, 185, ...,   0,   0,   0],
       ...,
       [  1,  23,  11, ...,   0,   0,   0],
       [  1,  13,  19, ...,   0,   0,   0],
       [  1,  97,  14, ...,   0,   0,   0]])>)


In [56]:
import time
epochs=10
for epoch in range(epochs):
    start=time.time()
    
    enc_hidden=encoder.initialize_hidden_state()
    total_loss=0
    
    for (batch,(inp,target)) in enumerate(dataset.take(steps_per_epochs)):
        batch_loss=train_step(inp,target,enc_hidden)
        total_loss+=batch_loss
        
        if batch %100 ==0:
            print('Epochs {} Batch {} Loss {:.4f}'.format(epoch+1,
                                                        batch,
                                                        batch_loss.numpy()))
        # save model every 2 epochs
        if (epochs+1) %2 ==0:
            checkpoint.save(file_prefix=checkpoint_prefix)
        
    

KeyboardInterrupt: 

#### now for evaluation a seperate function is created

In [67]:
def evaluate(sentence):
    attention_plot=np.zeros((max_length_output,max_length_input))
    
    sentence=preprocess(sentence)
    inputs=[inp_lang.word_index[i] for i in sentence.split(' ')]
    inputs=tf.keras.preprocessing.sequence.pad_sequences([inputs],
                                                        maxlen=max_length_input,
                                                        padding='post')
    inputs=tf.convert_to_tensor(inputs)
    
    result=' '
    
    hidden=[tf.zeros((1,units))]
    encoder_output,encoder_state=encoder(inputs,hidden)
    decoder_hidden=encoder_state
    decoder_input=tf.expand_dims([out_lang.word_index['<start>']],0)
    
    for i in range(max_length_output):
        predictions,decoder_hidden,attention_weights=decoder(decoder_input,decoder_hidden,encoder_output)
        
        attention_weights=tf.reshape(attention_weights,(-1,))
        attention_plot[i]=attention_weights.numpy()
        
        predicted_id=tf.argmax(predictions[0]).numpy()
        result+= out_lang.index_word[predicted_id]+' '
        
        if out_lang.index_word[predicted_id]=='<end>':
            return result,sentence,attention_plot
        
        decoder_input=tf.expand_dims([predicted_id],0)
        
        return result,sentence,attention_plot
    
    
    

function for plotting the attention weights

In [68]:
def plot_attention_weights(attention_weights,predicted_sentence,sentence):
    fig=plt.figure(figsize=(10,10))
    ax=fig.add_subplot(1,1,1)
    ax.matshow(attention_weights,cmap='viridis')
    plt.show()
    

In [69]:
def translate(sentence):
    result,sentence,prediction_plot=evaluate(sentence)
    attention_plot=prediction_plot[:len(result.split(' ')),:len(sentence.split(' '))]
    plot_attention_weights(attention_plot,results.split(' '),sentence.split(' '))
    
    

#### the following approach is the book's way of subclassing and its training can be done like a normal model (model.fit...)

In [179]:
class language_translation(keras.models.Model):
    def __init__(self,units=128,encoder_embedding_size=32,decoder_embedding_size=32,**kwargs):
        super().__init__(**kwargs)
        self.encoder_embedding=keras.layers.Embedding(input_dim=vocab_size_input,output_dim=encoder_embedding_size)
        self.encoder=keras.layers.LSTM(units,return_sequences=True,return_state=True)
        self.decoder_embedding=keras.layers.Embedding(input_dim=vocab_size_output,output_dim=decoder_embedding_size)
        self.attention=tfa.seq2seq.LuongAttention(units)
        decoder_inner_cell=keras.layers.LSTMCell(units)
        self.decoder_cell=tfa.seq2seq.AttentionWrapper(cell=decoder_inner_cell,attention_mechanism=self.attention)
        output_layer=keras.layers.Dense(vocab_size_output)
        self.decoder=tfa.seq2seq.basic_decoder.BasicDecoder(cell=self.decoder_cell,
                                                           sampler=tfa.seq2seq.sampler.TrainingSampler(),
                                                           output_layer=output_layer)
        self.inference_decoder=tfa.seq2seq.BasicDecoder(cell=self.decoder_cell,
                                                       sampler=tfa.seq2seq.sampler.GreedyEmbeddingSampler(
                                                       embedding_fn=self.decoder_embedding),
                                                       output_layer=output_layer,
                                                       maximum_iterations=max_length_output)
        self.sequence_length=keras.layers.Input(shape=[],dtype=np.int32)
    def call(self,inputs,training=None):
        encoder_input,decoder_input=inputs
        #sequence_lengths=self.sequence_length(seq_length)
        encoder_embeddings=self.encoder_embedding(encoder_input)
        encoder_output,encoder_state_h,encoder_state_c=self.encoder(encoder_embeddings,training=training)
        encoder_state=[encoder_state_h,encoder_state_c]
        self.attention(encoder_output,setup_memory=True)
        decoder_embeddings=self.decoder_embedding(decoder_input)
        decoder_initial_state=self.decoder_cell.get_initial_state(decoder_embeddings)
        decoder_initial_state=decoder_initial_state.clone(cell_state=encoder_state)
        
        if training:
            decoder_outputs,_,_=self.decoder(decoder_embeddings,
                                           initial_state=decoder_initial_state,
                                           training=training)
        else:
            start_tokens=tf.zeros_like(encoder_input[:,0])+out_lang.word_index['<start>']
            decoder_outputs,_,_=self.inference_decoder(decoder_embeddings,
                                                      initial_state=decoder_initial_state,
                                                      start_tokens=start_tokens,
                                                      end_token=0)
        return tf.nn.softmax(decoder_outputs.rnn_output)
            
 
        

In [180]:
model2=language_translation()

In [181]:
model2.compile(loss='sparse_categorical_crossentropy',optimizer='adam',metrics=['accuracy'])

In [182]:
seq_length=np.full([x_train.shape[0]],50)
print(seq_length)
print(seq_length.shape)

[50 50 50 ... 50 50 50]
(95171,)


In [183]:
sos_id=vocab_size_output+1
def shift_sequences(Y):
    sos_vector=tf.fill(dims=(len(Y),1),value=sos_id)
    return tf.concat([sos_vector,Y[:,:-1]],axis=1)

In [184]:
x_decoder=shift_sequences(y_train)

In [186]:
history=model2.fit([x_train[:,1:],y_train[:,:-1]],y_train[:,1:],epochs=2,validation_split=0.1)

Epoch 1/2
  12/2677 [..............................] - ETA: 44:05 - loss: 9.3009 - accuracy: 0.7586

KeyboardInterrupt: 

In [160]:
print(x_decoder.shape)
print(y_train.shape)

(95171, 51)
(95171, 51)


In [130]:
x_train[:,1:]

array([[  86,    4,    8, ...,    0,    0,    0],
       [  42,   14,    6, ...,    0,    0,    0],
       [  88,   14,   15, ...,    0,    0,    0],
       ...,
       [   6, 2920,   16, ...,    0,    0,    0],
       [   6,   16, 4418, ...,    0,    0,    0],
       [  17, 5369,  123, ...,    0,    0,    0]])

In [131]:
y_train

array([[   1,    4,   66, ...,    0,    0,    0],
       [   1,   17,   15, ...,    0,    0,    0],
       [   1,   14,   11, ...,    0,    0,    0],
       ...,
       [   1,    5, 2269, ...,    0,    0,    0],
       [   1,   13,   19, ...,    0,    0,    0],
       [   1,   14,   79, ...,    0,    0,    0]])

In [132]:
y_train[:,1:]

array([[   4,   66,   13, ...,    0,    0,    0],
       [  17,   15,    5, ...,    0,    0,    0],
       [  14,   11,  236, ...,    0,    0,    0],
       ...,
       [   5, 2269,   11, ...,    0,    0,    0],
       [  13,   19,   59, ...,    0,    0,    0],
       [  14,   79,  108, ...,    0,    0,    0]])

In [138]:
x_train

array([[   1,   86,    4, ...,    0,    0,    0],
       [   1,   42,   14, ...,    0,    0,    0],
       [   1,   88,   14, ...,    0,    0,    0],
       ...,
       [   1,    6, 2920, ...,    0,    0,    0],
       [   1,    6,   16, ...,    0,    0,    0],
       [   1,   17, 5369, ...,    0,    0,    0]])