In [1]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense
from tensorflow.keras.models import Model
import tensorflow as tf
import numpy as np

In [2]:
import tensorflow as tf
import zipfile
from pathlib import Path

# Fetch the English-Spanish sentence-pair archive; the zip itself is cached
# directly under /content and is not auto-extracted by get_file.
url = "https://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip"
zip_path = tf.keras.utils.get_file(
    fname="spa-eng.zip", origin=url, extract=False, cache_dir="/content", cache_subdir=""
)

# Unpack by hand so the location of the extracted files is known.
extract_path = Path("/content/spa-eng-data")
with zipfile.ZipFile(zip_path, mode='r') as zip_ref:
    zip_ref.extractall(path=extract_path)

# Read the whole corpus into memory as one UTF-8 string.
text_path = extract_path.joinpath("spa-eng", "spa.txt")
text = text_path.read_text(encoding='utf-8')

# Preview the first 500 characters of the corpus.
print(text[:500])


Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip
[1m2638744/2638744[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Go.	Ve.
Go.	Vete.
Go.	Vaya.
Go.	Váyase.
Hi.	Hola.
Run!	¡Corre!
Run.	Corred.
Who?	¿Quién?
Fire!	¡Fuego!
Fire!	¡Incendio!
Fire!	¡Disparad!
Help!	¡Ayuda!
Help!	¡Socorro! ¡Auxilio!
Help!	¡Auxilio!
Jump!	¡Salta!
Jump.	Salte.
Stop!	¡Parad!
Stop!	¡Para!
Stop!	¡Pare!
Wait!	¡Espera!
Wait.	Esperen.
Go on.	Continúa.
Go on.	Continúe.
Hello!	Hola.
I ran.	Corrí.
I ran.	Corría.
I try.	Lo intento.
I won!	¡He ganado!
Oh no!	¡Oh, no!
Relax.	Tomátelo con soda.
Smile.	Sonríe.
Attack!	¡Al ataque!
Attack!	¡Atacad!
Ge


In [3]:
import numpy as np
# Drop the inverted Spanish punctuation marks from the corpus.
text = text.replace("¡", "").replace("¿", "")

# Each line is "english<TAB>spanish". Keep the first two fields and skip
# blank/malformed lines so the unzip below always receives exact pairs.
pairs = []
for line in text.splitlines():
    fields = line.split("\t")
    if len(fields) >= 2:
        pairs.append(fields[:2])

# Seeded shuffle: the row order decides which rows land in the validation
# split later, so it must be identical on every Restart & Run All.
np.random.default_rng(42).shuffle(pairs)
sentences_en, sentences_es = zip(*pairs) # separates the pairs into 2 tuples

In [4]:
# English side of the corpus as a list (the unzip step yields a tuple).
english = [*sentences_en]

In [5]:
# Wrap every Spanish sentence in start/end markers for the decoder.
spanish = ['<bos> ' + es_sentence + ' <eos>' for es_sentence in sentences_es]

In [6]:
def tokenizer(sentences):
  """Fit a new Keras ``Tokenizer`` on ``sentences`` and encode those same sentences.

  Args:
    sentences: texts used both to build the vocabulary and as the input to
      ``texts_to_sequences``.

  Returns:
    A ``(fitted_tokenizer, sequences)`` tuple, where ``sequences`` is the
    result of ``fitted_tokenizer.texts_to_sequences(sentences)``.
  """
  fitted = Tokenizer()
  fitted.fit_on_texts(sentences)
  return fitted, fitted.texts_to_sequences(sentences)

In [7]:
# Build the English vocabulary and encode every English sentence as a list of word ids.
english_tokenizer, english_sequence = tokenizer(english)

In [8]:
# Build the Spanish vocabulary (start/end markers included) and encode every Spanish sentence.
spanish_tokenizer, spanish_sequence = tokenizer(spanish)

In [9]:
# Length (in tokens) of the longest encoded English sentence.
english_maxlen = max(map(len, english_sequence))

In [10]:
# Length (in tokens) of the longest encoded Spanish sentence, markers included.
spanish_maxlen = max(map(len, spanish_sequence))

In [11]:
# One slot more than the number of known words: id 0 is the padding value.
english_vocab_size = 1 + len(english_tokenizer.word_index)

In [12]:
# One slot more than the number of known words: id 0 is the padding value.
spanish_vocab_size = 1 + len(spanish_tokenizer.word_index)

In [13]:
# Right-pad every English id sequence to the length of the longest one.
english_padding = pad_sequences(
    english_sequence,
    maxlen=english_maxlen,
    padding='post',
)

In [14]:
# Right-pad every Spanish id sequence to the length of the longest one.
spanish_padding = pad_sequences(
    spanish_sequence,
    maxlen=spanish_maxlen,
    padding='post',
)

In [15]:
# Inspect the Spanish vocabulary size (also the width of the output softmax).
spanish_vocab_size

26040

In [16]:
# Inspect the English vocabulary size (the encoder embedding's input_dim).
english_vocab_size

13525

In [17]:
# Inspect the padded Spanish id matrix (one row per sentence).
spanish_padding

array([[   1,   75,   67, ...,    0,    0,    0],
       [   1,   13, 1019, ...,    0,    0,    0],
       [   1,  164,    4, ...,    0,    0,    0],
       ...,
       [   1, 4896,  133, ...,    0,    0,    0],
       [   1,    7,    6, ...,    0,    0,    0],
       [   1,   52,  331, ...,    0,    0,    0]], dtype=int32)

In [18]:
# Training targets: the decoder input shifted one step to the left, so position t
# holds the token that follows input position t. The freed last column is zero.
decoder_target = np.pad(
    spanish_padding[:, 1:],
    ((0, 0), (0, 1)),
    mode='constant',
    constant_values=0,
)

In [19]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Dropout, TimeDistributed
from tensorflow.keras import regularizers

# Layer sizes used by both halves of the network.
EMBEDDING_DIM = 128
LSTM_UNITS = 512
DROPOUT_RATE = 0.3

# --- Encoder ---
# English token ids -> embeddings (id 0 is masked) -> LSTM.
encoder_inputs = Input(shape=(english_maxlen,))
encoder_embedding_layer = Embedding(
    input_dim=english_vocab_size, output_dim=EMBEDDING_DIM, mask_zero=True
)
encoder_embedding = encoder_embedding_layer(encoder_inputs)

# Plain LSTM (no dropout arguments are set); its final hidden and cell states
# are what get handed to the decoder.
encoder_lstm_layer = LSTM(LSTM_UNITS, return_state=True)
encoder_output, hidden_state, cell_state = encoder_lstm_layer(encoder_embedding)
encoder_states = [hidden_state, cell_state]

# --- Decoder ---
# Spanish token ids -> embeddings (id 0 is masked) -> LSTM started from the
# encoder's final states.
decoder_inputs = Input(shape=(spanish_maxlen,))
decoder_embedding_layer = Embedding(
    input_dim=spanish_vocab_size, output_dim=EMBEDDING_DIM, mask_zero=True
)
decoder_embedding = decoder_embedding_layer(decoder_inputs)

decoder_lstm = LSTM(LSTM_UNITS, return_sequences=True, return_state=True)
decoder_output, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)

# Dropout on the per-step LSTM outputs before the vocabulary projection.
decoder_output = Dropout(DROPOUT_RATE)(decoder_output)

# Per-time-step softmax over the Spanish vocabulary.
decoder_dense = TimeDistributed(Dense(spanish_vocab_size, activation='softmax'))
output = decoder_dense(decoder_output)

In [20]:
# Training model: (English ids, Spanish decoder-input ids) -> per-step word probabilities.
model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=output)

In [21]:
# Sparse cross-entropy: targets are integer word ids, not one-hot vectors.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [22]:
# Sanity check: (number of sentence pairs, padded Spanish length).
decoder_target.shape

(118964, 51)

In [23]:
# The expand_dims below was left disabled, so the targets stay 2-D; the shape
# is displayed again to confirm nothing changed.
#decoder_target = np.expand_dims(decoder_target, axis =-1)
decoder_target.shape

(118964, 51)

In [24]:
from keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Stop once val_loss has not improved for 5 epochs and roll back to the best weights.
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)

# Multiply the learning rate by 0.2 after 3 epochs without val_loss improvement.
# Both objects only take effect when passed to model.fit via `callbacks=`.
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.2,
    patience=3,
)

In [25]:
# Train with teacher forcing: the decoder is fed spanish_padding and has to
# predict decoder_target (the same sequences shifted one step to the left).
# 20% of the rows are held out for validation.
# early_stop / reduce_lr were constructed earlier but never handed to fit(),
# so they had no effect; they are passed explicitly here.
# Note: fit() returns a training History object, which is what `fitted_model` holds.
fitted_model = model.fit(
    [english_padding, spanish_padding],
    decoder_target,
    validation_split=0.2,
    epochs=10,
    callbacks=[early_stop, reduce_lr],
)

Epoch 1/10
[1m2975/2975[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m613s[0m 203ms/step - accuracy: 0.1112 - loss: 4.9606 - val_accuracy: 0.0736 - val_loss: 3.3123
Epoch 2/10
[1m2975/2975[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m602s[0m 202ms/step - accuracy: 0.0767 - loss: 3.0679 - val_accuracy: 0.0872 - val_loss: 2.6426
Epoch 3/10
[1m2975/2975[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m603s[0m 203ms/step - accuracy: 0.0910 - loss: 2.3331 - val_accuracy: 0.0951 - val_loss: 2.3059
Epoch 4/10
[1m2975/2975[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m602s[0m 202ms/step - accuracy: 0.1006 - loss: 1.8543 - val_accuracy: 0.1006 - val_loss: 2.1068
Epoch 5/10
[1m2975/2975[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m603s[0m 203ms/step - accuracy: 0.1082 - loss: 1.5128 - val_accuracy: 0.1033 - val_loss: 1.9968
Epoch 6/10
[1m2975/2975[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m603s[0m 203ms/step - accuracy: 0.1142 - loss: 1.2559 - val_accuracy: 0.1050 - val_loss:

In [26]:
def translate(sentence):
  """Greedily decode a Spanish translation of an English ``sentence``.

  The sentence is run through the encoder layers, then the decoder is stepped
  one token at a time, feeding back its own arg-max prediction, until the end
  marker is predicted or ``spanish_maxlen`` steps have been taken.

  Args:
    sentence: English text; it is tokenized with ``english_tokenizer``.

  Returns:
    List of decoded Spanish words, end marker excluded.
  """
  # Encode: word ids -> post-padded tensor -> final encoder LSTM states.
  source_ids = pad_sequences(
      english_tokenizer.texts_to_sequences([sentence]),
      maxlen=english_maxlen,
      padding='post',
  )
  source_embedded = encoder_embedding_layer(tf.convert_to_tensor(source_ids))
  _, hidden, cell = encoder_lstm_layer(source_embedded)

  # The markers are looked up as 'bos' / 'eos' (no angle brackets).
  vocab = spanish_tokenizer.word_index
  start_id, stop_id = vocab.get('bos'), vocab.get('eos')

  words = []
  next_id = start_id
  for _step in range(spanish_maxlen):
    # Feed only the latest token; the LSTM states carry the history.
    step_input = tf.convert_to_tensor([[next_id]], dtype=tf.int32)
    step_output, hidden, cell = decoder_lstm(
        decoder_embedding_layer(step_input), initial_state=[hidden, cell]
    )
    probs = decoder_dense(step_output)
    next_id = np.argmax(probs[0, -1, :].numpy())

    if next_id == stop_id:
      break

    # Ids with no vocabulary entry are not emitted but are still fed back.
    word = spanish_tokenizer.index_word.get(next_id)
    if word:
      words.append(word)
  return words

In [46]:
# Example input for a quick translation check.
sentence = 'I am happy'

In [47]:
# Greedy-decode the current `sentence`; the result is a list of Spanish words.
spanish_list = translate(sentence)

In [48]:
# Join the decoded words into one space-separated string.
spanish_sentences = ' '.join(spanish_list)

In [49]:
# Show the translated sentence.
print(spanish_sentences)

estoy feliz


In [51]:
# Example input: a question ending in punctuation.
sentence = 'What is your name?'

In [52]:
# Greedy-decode the current `sentence`; the result is a list of Spanish words.
spanish_list = translate(sentence)

In [53]:
# Join the decoded words into one space-separated string.
spanish_sentences = ' '.join(spanish_list)

In [54]:
# Show the translated sentence.
print(spanish_sentences)

cuál es tu nombre


In [59]:
# Example input: a longer sentence.
sentence = 'This model translates english to spanish fluently'

In [60]:
# Greedy-decode the current `sentence`; the result is a list of Spanish words.
spanish_list = translate(sentence)

In [61]:
# Join the decoded words into one space-separated string.
spanish_sentences = ' '.join(spanish_list)

In [62]:
# Show the translated sentence.
print(spanish_sentences)

este va a estudiar inglés con fluidez


In [63]:
# Example input: a two-word phrase with mixed capitalization.
sentence = 'Thank You'

In [64]:
# Greedy-decode the current `sentence`; the result is a list of Spanish words.
spanish_list = translate(sentence)

In [65]:
# Join the decoded words into one space-separated string.
spanish_sentences = ' '.join(spanish_list)

In [66]:
# Show the translated sentence.
print(spanish_sentences)

gracias
