In [1]:
import tensorflow as tf
import pathlib
import re
import random

In [2]:
# Download and extract the dataset
path_to_zip = tf.keras.utils.get_file(
    'spa-eng.zip', origin='http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip',
    extract=True)
path_to_file = pathlib.Path(path_to_zip).parent / 'spa-eng/spa.txt'

In [3]:
# Preprocess sentences
def preprocess_sentence(sentence):
    sentence = sentence.lower().strip()
    sentence = re.sub(r"([?.!,¿])", r" \1 ", sentence)
    sentence = re.sub(r'[" "]+', " ", sentence)
    sentence = re.sub(r"[^a-zA-Z?.!,¿]+", " ", sentence)
    sentence = sentence.strip()
    sentence = "<start> " + sentence + " <end>"
    return sentence

In [4]:
# Create dataset
def create_dataset(path, num_examples=None):
    with open(path, encoding='utf-8') as f:
        lines = f.read().strip().split('\n')
    sentence_pairs = [[preprocess_sentence(sentence) for sentence in line.split('\t')[:2]] for line in lines]
    return zip(*sentence_pairs[:num_examples])


In [5]:
# Tokenize the data
def tokenize(lang):
    tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
    tokenizer.fit_on_texts(lang)
    tensor = tokenizer.texts_to_sequences(lang)
    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post')
    return tensor, tokenizer

In [6]:
# Load and preprocess the dataset
num_examples = 50000  # Set the number of examples to load
source_lang, target_lang = create_dataset(path_to_file, num_examples)

# Tokenize the source and target languages
input_tensor, input_tokenizer = tokenize(source_lang)
target_tensor, target_tokenizer = tokenize(target_lang)

# Vocabulary sizes
input_vocab_size = len(input_tokenizer.word_index) + 1
target_vocab_size = len(target_tokenizer.word_index) + 1

In [14]:
# Encoder
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(enc_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')

    def call(self, x, hidden):
        x = self.embedding(x)
        output, state = self.gru(x, initial_state=hidden)
        return output, state

    def initialize_hidden_state(self):
        return tf.zeros((self.batch_sz, self.enc_units))

In [7]:
# Attention Mechanism
class BahdanauAttention(tf.keras.layers.Layer):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        query_with_time_axis = tf.expand_dims(query, 1)
        score = self.V(tf.nn.tanh(self.W1(query_with_time_axis) + self.W2(values)))
        attention_weights = tf.nn.softmax(score, axis=1)
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)
        return context_vector, attention_weights


In [8]:
# Decoder with Teacher Forcing
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(dec_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')
        self.fc = tf.keras.layers.Dense(vocab_size)
        self.attention = BahdanauAttention(dec_units)

    def call(self, x, hidden, enc_output):
        context_vector, attention_weights = self.attention(hidden, enc_output)
        x = self.embedding(x)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
        output, state = self.gru(x)
        output = tf.reshape(output, (-1, output.shape[2]))
        x = self.fc(output)
        return x, state, attention_weights

In [9]:
# Training configuration
BUFFER_SIZE = len(input_tensor)
BATCH_SIZE = 64
steps_per_epoch = len(input_tensor) // BATCH_SIZE
embedding_dim = 256
units = 1024

In [10]:
# Dataset
dataset = tf.data.Dataset.from_tensor_slices((input_tensor, target_tensor))
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

In [15]:
# Initialize encoder and decoder
encoder = Encoder(input_vocab_size, embedding_dim, units, BATCH_SIZE)
decoder = Decoder(target_vocab_size, embedding_dim, units, BATCH_SIZE)


In [12]:
# Optimizer and loss function
optimizer = tf.keras.optimizers.legacy.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')


In [16]:
def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    return tf.reduce_mean(loss_)

In [17]:

# Training step with teacher forcing
@tf.function
def train_step(inp, targ, enc_hidden):
    loss = 0
    with tf.GradientTape() as tape:
        enc_output, enc_hidden = encoder(inp, enc_hidden)
        dec_hidden = enc_hidden
        dec_input = tf.expand_dims([target_tokenizer.word_index['<start>']] * BATCH_SIZE, 1)

        for t in range(1, targ.shape[1]):
            predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
            loss += loss_function(targ[:, t], predictions)
            dec_input = tf.expand_dims(targ[:, t], 1)  # Teacher forcing

    batch_loss = loss / int(targ.shape[1])
    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))
    return batch_loss


In [None]:
# Training Loop with multiple epochs
EPOCHS = 10  # Increase epochs for better performance

for epoch in range(EPOCHS):
    enc_hidden = encoder.initialize_hidden_state()
    total_loss = 0

    for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
        batch_loss = train_step(inp, targ, enc_hidden)
        total_loss += batch_loss

    print(f"Epoch {epoch+1} Loss {total_loss / steps_per_epoch:.4f}")


In [18]:
def evaluate(sentence):
    # Preprocess the input sentence
    sentence = preprocess_sentence(sentence)
    inputs = [input_tokenizer.word_index.get(word, 0) for word in sentence.split()]
    inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs], maxlen=input_tensor.shape[1], padding='post')
    inputs = tf.convert_to_tensor(inputs)

    # Encode the input sentence
    hidden = [tf.zeros((1, units))]
    enc_out, enc_hidden = encoder(inputs, hidden)

    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([target_tokenizer.word_index['<start>']], 0)

    result = ''

    # Decode the output sequence
    for t in range(target_tensor.shape[1]):
        predictions, dec_hidden, attention_weights = decoder(dec_input, dec_hidden, enc_out)
        predicted_id = tf.argmax(predictions[0]).numpy()

        if target_tokenizer.index_word[predicted_id] == '<end>':
            break

        result += target_tokenizer.index_word[predicted_id] + ' '

        # Use the predicted word as the next input
        dec_input = tf.expand_dims([predicted_id], 0)

    return result.strip()

'''def translate(sentence):
    result = evaluate(sentence)
    print(f"Input: {sentence}")
    print(f"Predicted translation: {result}") '''

'def translate(sentence):\n    result = evaluate(sentence)\n    print(f"Input: {sentence}")\n    print(f"Predicted translation: {result}") '

In [1]:
# To test on 20 random sentences from the dataset
def translate_random_sentences():
    random_indices = random.sample(range(len(source_lang)), 20)

    for i in random_indices:
        source_sentence = source_lang[i]
        target_sentence = target_lang[i]  # Actual translation (target language sentence)
        
        print(f"Translating sentence {i+1}: {source_sentence}")
        print(f"Actual translation: {target_sentence}")
        
        predicted_translation = evaluate(source_sentence)
        print(f"Predicted translation: {predicted_translation}")
        
        print("\n" + "="*50 + "\n")

# Call the function to translate random sentences
translate_random_sentences()




Epoch 1 Loss 1.5707
Epoch 2 Loss 1.0996
Epoch 3 Loss 0.8080
Epoch 4 Loss 0.5793
Epoch 5 Loss 0.4115
Epoch 6 Loss 0.2982
Epoch 7 Loss 0.2251
Epoch 8 Loss 0.1770
Epoch 9 Loss 0.1458
Epoch 10 Loss 0.1237
Translating sentence 40272: <start> i hope she will get well . <end>
Actual translation: <start> espero que ella se mejore . <end>
Predicted translation: empieza de que lo pudiera .


Translating sentence 30230: <start> who s tom looking for ? <end>
Actual translation: <start> ¿ a qui n est buscando tom ? <end>
Predicted translation: eran qui n era de qui n .


Translating sentence 36588: <start> i m always very nervous . <end>
Actual translation: <start> siempre estoy muy nervioso . <end>
Predicted translation: empieza siempre estoy muy nervioso .


Translating sentence 41284: <start> my uncle died a year ago . <end>
Actual translation: <start> mi t o muri hace un a o . <end>
Predicted translation: comienza el t o a o .


Translating sentence 30297: <start> will you come with us ? <end>
