In [1]:
import tensorflow as tf
tf.enable_eager_execution()
from tensorflow.keras.layers import LSTM

In [2]:
sentence = "My name is John."

In [3]:
class BahdanauAttention(tf.keras.layers.Layer):
    
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.va = tf.keras.layers.Dense(1)
    
    def call(self, query, values):
        """
        query tensor of shape: [batch_size, hidden_size],
        value tensor of shape: [batch_size, inp_seq_len, hidden_size]
        """
        query = tf.expand_dims(query, 1)        
        scores = self.va(tf.nn.tanh(self.W1(query) + self.W2(values)))
        attention_weights = tf.nn.softmax(scores, axis=1)
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)
        return attention_weights, context_vector

In [4]:
batch_size = 4
latentSpaceDimension = 16
n_features = 12
n_timesteps = 10

encoder_inputs = tf.random.uniform((batch_size,
                                    n_timesteps,
                                    n_features))
encoder_lstm = LSTM(latentSpaceDimension,
                    return_sequences=True,
                    return_state=True,
                    name='encoder_lstm')
decoder_lstm = LSTM(latentSpaceDimension, 
                    return_state=True,
                    name='decoder_lstm')

attention = BahdanauAttention(latentSpaceDimension)
encoder_outputs, *encoder_states = encoder_lstm(encoder_inputs)
#encoder_state_h, encoder_state_c = encoder_states
#initial decoder's state
#encoder's last hidden state + last cell state
#decoder_outputs = encoder_state_h
#states = encoder_states

## French To English With Encoder Decoder Architecture

In [5]:
import numpy as np
import io
import unicodedata
import re
from tqdm import tqdm

file_path = ".\\fra-eng\\fra.txt"
#Opening and reading the .txt file that contains english and its corresponding italian translation
lines = io.open(file_path, encoding = 'UTF-8').read().split('\n')

In [6]:
lines[0:5]

['Go.\tVa !\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #1158250 (Wittydev)',
 'Go.\tMarche.\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #8090732 (Micsmithel)',
 'Go.\tBouge !\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #9022935 (Micsmithel)',
 'Hi.\tSalut !\tCC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #509819 (Aiji)',
 'Hi.\tSalut.\tCC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #4320462 (gillux)']

In [7]:
def unicode_to_ascii(s) :
  """
  Unicode to ascii conversion
  """
  return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')

def cleanhtml(raw_html) :
    """
      Function to clean html tags and numbers
      """
    cleanr = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
    cleantext = re.sub(cleanr, '', raw_html)
    return cleantext

def cleanString(incomingString):
    """
      Function to clean unwanted symbol from text
    """
    newstring = incomingString
    newstring = newstring.replace("!","")
    newstring = newstring.replace("@","")
    newstring = newstring.replace("#","")
    newstring = newstring.replace("$","")
    newstring = newstring.replace("%","")
    newstring = newstring.replace("^","")
    newstring = newstring.replace("&","and")
    newstring = newstring.replace("*","")
    newstring = newstring.replace("(","")
    newstring = newstring.replace(")","")
    newstring = newstring.replace("+","")
    newstring = newstring.replace("=","")
    newstring = newstring.replace("?","")
    newstring = newstring.replace("\'","")
    newstring = newstring.replace("\"","")
    newstring = newstring.replace("{","")
    newstring = newstring.replace("}","")
    newstring = newstring.replace("[","")
    newstring = newstring.replace("]","")
    newstring = newstring.replace("<","")
    newstring = newstring.replace(">","")
    newstring = newstring.replace("~","")
    newstring = newstring.replace("`","")
    newstring = newstring.replace(":","")
    newstring = newstring.replace(";","")
    newstring = newstring.replace("|","")
    newstring = newstring.replace("\\","")
    newstring = newstring.replace("/","")     
    return ' '.join(newstring.split())

def preprocess_string(data) :
    """
      This function calls other
      preprocessing function for
      cleaning data
    """
    data = unicode_to_ascii(data)
    #Remove html
    data = cleanhtml(data)
    #Remove unwanted symbols
    data = cleanString(data)
    return data

def get_data(raw_lines):
    french = []
    english = []
    for itr in tqdm(range(len(raw_lines))):
        if len(raw_lines[itr].split()) > 2:
            eng, fre, _ = raw_lines[itr].split('\t')
            english.append('<start> ' + preprocess_string(eng) + ' <end>')
            french.append('<start> ' + preprocess_string(fre) + ' <end>')
    return french, english

#Get data
french, english = get_data(lines)

#Train-Test Split
from sklearn.model_selection import train_test_split
fre_tr, fre_te, eng_tr, eng_te = train_test_split(french, english, test_size = 0.2, random_state = 43)

100%|███████████████████████████████████████████████████████████████████████| 192342/192342 [00:06<00:00, 31044.21it/s]


In [8]:
eng_te[1:5], fre_te[1:5]

(['<start> These pills come in a blister pack. <end>',
  '<start> Wheres your daughter <end>',
  '<start> Ive upset you. <end>',
  '<start> Do you mind if I turn off the light <end>'],
 ['<start> Ces comprimes se presentent sous forme de plaquettes. <end>',
  '<start> Ou se trouve ta fille <end>',
  '<start> Je tai contrariee. <end>',
  '<start> Voyez-vous un inconvenient a ce que jeteigne la lumiere <end>'])

In [15]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow
tensorflow.enable_eager_execution()

#Tokenizing and padding input language
fre_token = Tokenizer(filters='', lower = False)
fre_token.fit_on_texts(fre_tr)
fre_tokenized = fre_token.texts_to_sequences(fre_tr)
fre_padded = pad_sequences(fre_tokenized, padding='post')

#Tokenizing and padding target language
eng_token = Tokenizer(filters='', lower = False)
eng_token.fit_on_texts(eng_tr)
eng_tokenized = eng_token.texts_to_sequences(eng_tr)
eng_padded = pad_sequences(eng_tokenized, padding='post')

#Number of unique tokens in input and output languages
num_ip_tokens = len(fre_token.word_index)   #French
num_op_tokens = len(eng_token.word_index)   #English

#Maximum length of a sentence in both the languages
max_len_ip = fre_padded.shape[1]   #French
max_len_op = eng_padded.shape[1]   #English

In [16]:
print(max_len_ip, max_len_op, '#French Tokens=',num_ip_tokens)

57 49 #French Tokens= 41813


In [17]:
fre_padded[0]

array([    1,    11,    39,   805,    16,     7,   164,  8623,    55,
         250,   610,    49,   136,   188,     9,   406,  2604,     5,
         399,    38, 13463,     2,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0])

In [18]:
# Create a tf.data dataset
import tensorflow as tf

BUFFER_SIZE = len(fre_padded)
BATCH_SIZE = 64
N_BATCH = BUFFER_SIZE//BATCH_SIZE
embedding_dim = 256
units = 512

dataset = tf.data.Dataset.from_tensor_slices((fre_padded, eng_padded)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder = True)

In [19]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size,
                 embedding_dim,
                 enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size,
                                                   embedding_dim)
        self.gru = tf.keras.layers.GRU(self.enc_units,
                                  return_sequences=True,
                                  return_state=True,
                                  recurrent_initializer='glorot_uniform',
                                  recurrent_activation='sigmoid')
    def call(self, x, hidden):
        x = self.embedding(x)
        output, state = self.gru(x, initial_state = hidden)
        return output, state
    def initial_hidden_state(self):
        #Generating encoder initial states as all zeros
        return tf.zeros((self.batch_sz, self.enc_units))

In [20]:
#Decoder with attention
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size,
                 embedding_dim,
                 dec_units, batch_sz):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.dec_units,
                                  return_sequences=True,
                                  return_state=True,
                                  recurrent_initializer='glorot_uniform',
                                  recurrent_activation='sigmoid')
        self.fc = tf.keras.layers.Dense(vocab_size)
        self.attention = BahdanauAttention(dec_units)
        
        # used for attention
        #self.W1 = tf.keras.layers.Dense(self.dec_units)
        #self.W2 = tf.keras.layers.Dense(self.dec_units)
        #self.V = tf.keras.layers.Dense(1)
    
    def call(self, x, hidden, enc_output):
        # enc_output shape == (batch_size, max_length, hidden_size)
        # hidden shape == (batch_size, hidden size)
        # hidden_with_time_axis shape == (batch_size, 1, hidden size)
        # we are doing this to perform addition to calculate the score
        ##hidden_with_time_axis = tf.expand_dims(hidden, 1)
        # score shape == (batch_size, max_length, 1)
        # we get 1 at the last axis because we are applying tanh(FC(EO) + FC(H)) to self.V
        #Getting scores
        ##score = self.V(tf.nn.tanh(self.W1(enc_output)+self.W2(hidden_with_time_axis)))
        # attention_weights shape == (batch_size, max_length, 1)
        #Attention weights from softmax
        ##attention_weights = tf.nn.softmax(score, axis=1)
        # context_vector shape after sum == (batch_size, hidden_size)
        #getting context vector
        ##context_vector = attention_weights*enc_output
        ##context_vector = tf.reduce_sum(context_vector, axis=1)
        attention_weights, context_vector = self.attention(hidden, enc_output)
        # x shape after passing through embedding == (batch_size, 1, embedding_dim)
        x = self.embedding(x)
        # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
        #Concatenating previous op with cv
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
        # passing the concatenated vector to the GRU
        output, state = self.gru(x)
        # output shape == (batch_size * 1, hidden_size)
        output = tf.reshape(output, (-1, output.shape[2]))
        # output shape == (batch_size * 1, vocab)
        x = self.fc(output)
        return x, state, attention_weights
    
    def initialize_hidden_state(self):
        return tf.zeros(self.batch_sz, self.dec_units)

In [21]:
encoder = Encoder(num_ip_tokens + 1, embedding_dim, units, BATCH_SIZE)
decoder = Decoder(num_op_tokens + 1, embedding_dim, units, BATCH_SIZE)

In [25]:
optimizer = tf.keras.optimizers.Adam()

def loss_function(real, pred):
    mask = tf.equal(real, 0) # as in the OP
    weights = 1. - tf.cast(mask, tf.float32)  # convert to (0, 1) weights
    loss_ = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=real, logits=pred)*weights
    return tf.reduce_mean(loss_)
 
import os
from tensorflow.train import Checkpoint
checkpoint_dir = './FrenchToEnglish/CheckPoint'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = Checkpoint(optimizer=optimizer, encoder=encoder, decoder=decoder)

In [26]:
@tf.function
def train_step(inp_seq, targ_seq):    
    loss = 0
    #Getting initial encoder states (all zeros)
    hidden = encoder.initial_hidden_state()
    with tf.GradientTape() as tape:
        enc_output , enc_hidden = encoder(inp, hidden)
        #Setting final encoder states as initial decoder states
        dec_hidden = enc_hidden
        # Teacher forcing - feeding the target as the next input
        #Passing '<start>' token as initial token
        dec_input = tf.expand_dims([eng_token.word_index['<start>']]*BATCH_SIZE, 1)
        for t in range(1, targ.shape[1]) :
            # passing enc_output to the decoder
            predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
            loss += loss_function(targ[:,t], predictions)
            # using teacher forcing
            dec_input = tf.expand_dims(targ[:,t], 1)
    batch_loss = (loss / int(targ.shape[1]))
    variables = encoder.variables + decoder.variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))
    return batch_loss

In [None]:
EPOCH = 20

for epoch in range(EPOCH):
    total_loss = 0
    for batch, (inp, targ) in enumerate(dataset):
        loss = train_step(inp, targ)
        total_loss+= loss
        if batch % 1 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1, batch, loss.numpy()))
    if (epoch + 1)%2 == 0 :
        checkpoint.save(file_prefix=checkpoint_prefix)
    print('Epoch {} Loss {:.4f}'.format(epoch + 1, total_loss / N_BATCH))  

In [38]:
dataset.range(10)

<DatasetV1Adapter shapes: (), types: tf.int64>

In [27]:
tf.equal([0, 1, 3], 0)

<tf.Tensor: id=1383, shape=(3,), dtype=bool, numpy=array([ True, False, False])>

In [1]:
import numpy as np

In [3]:
np.zeros(25,60)

TypeError: data type not understood