## **Libraries**

In [1]:
from tensorflow.keras.layers import Masking, Input, Dense, LSTM, Embedding
from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.models import Model, Sequential
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import matplotlib.pyplot as plt
import tensorflow as tf
import pandas as pd
import numpy as np
import re

In [2]:
data = pd.read_table('spa.txt')
data.head()

Unnamed: 0,Go.,Ve.,CC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #4986655 (cueyayotl)
0,Go.,Vete.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
1,Go.,Vaya.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
2,Go.,Váyase.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
3,Hi.,Hola.,CC-BY 2.0 (France) Attribution: tatoeba.org #5...
4,Run!,¡Corre!,CC-BY 2.0 (France) Attribution: tatoeba.org #9...


In [3]:
data = data.rename(columns={'Go.':'eng', 'Ve.':'spa'})

In [4]:
X = data['eng'].apply(lambda x:x.lower())
y = data['spa'].apply(lambda x:x.lower())
X = X.apply(lambda x:re.sub("[^a-zA-Z]"," ",x))
y = y.apply(lambda x:re.sub("[^a-zA-Z]"," ",x))
y = y.apply(lambda x:'START_ '+x+' _END')

In [5]:
eng_vocab, spa_vocab = set(), set()
for sent in X:
  for word in sent.split():
    if word not in eng_vocab:
      eng_vocab.add(word)
for sent in y:
 for word in sent.split():
  if word not in spa_vocab:
    spa_vocab.add(word)
engVocab = sorted(list(eng_vocab))
spaVocab = sorted(list(spa_vocab))

In [None]:
source_length_list=[]
for l in X:
    source_length_list.append(len(l.split(' ')))
max_eng_sent_length= max(source_length_list)

In [7]:
target_length_list=[]
for l in y:
    target_length_list.append(len(l.split(' ')))
max_spa_sent_length= max(target_length_list)

In [8]:
eng_word2idx = dict([(word, i+1) for i, word in enumerate(engVocab)])
spa_word2idx = dict([(word, i+1) for i, word in enumerate(spaVocab)])

In [9]:
eng_idx2word= dict([(i, word) for word, i in  eng_word2idx.items()])
spa_idx2word =dict([(i, word) for word, i in spa_word2idx.items()])

In [10]:
X, y = shuffle(X, y, random_state=2)
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.15)

In [11]:
num_encoder_tokens = len(engVocab)
num_decoder_tokens = len(spaVocab) + 1

In [12]:
def generate_batch(X=x_train, y=y_train, batch_size=128):
  while True:
    for i in range(0, len(X), batch_size):
      encoder_input_data = np.zeros(shape=(batch_size, max_eng_sent_length), dtype="float32")
      decoder_input_data = np.zeros(shape=(batch_size, max_spa_sent_length), dtype="float32")
      decoder_output_data = np.zeros(shape=(batch_size, max_spa_sent_length, num_decoder_tokens), dtype="float32")
      for j, (input_text, target_text) in enumerate(zip(X[i:i+batch_size], y[i:i+batch_size])):
        for k, word in enumerate(input_text.split()):
          encoder_input_data[j, k] = eng_word2idx[word]
        for k, word in enumerate(target_text.split()):
          if k < len(target_text.split())-1:
            decoder_input_data[j, k] = spa_word2idx[word]
          if k > 0:
            decoder_output_data[j, k-1, spa_word2idx[word]] = 1
      yield([encoder_input_data, decoder_input_data], decoder_output_data)

In [13]:
train_samples = len(x_train)
val_samples = len(x_test)
batch_size = 128
epochs = 50
latent_dim=256

In [14]:
encoder_inputs = Input(shape=(None,))
enc_emb_layer = Embedding(num_encoder_tokens, latent_dim, mask_zero=True)(encoder_inputs)
enc_lstm_layer = LSTM(units=latent_dim, return_state=True)
encoder_outputs, h_state, c_state = enc_lstm_layer(enc_emb_layer)
encoder_states = [h_state, c_state]

2025-07-25 18:54:08.829958: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


In [15]:
decoder_inputs = Input(shape=(None, ))
dec_emb_layer = Embedding(num_decoder_tokens, latent_dim, mask_zero=True)
dec_emb = dec_emb_layer(decoder_inputs)
dec_lstm_layer = LSTM(units=latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = dec_lstm_layer(dec_emb, initial_state=encoder_states)
dec_dense = Dense(units=num_decoder_tokens, activation="softmax")
decoder_outputs = dec_dense(decoder_outputs)

In [16]:
model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_outputs)
model.compile(optimizer="rmsprop", loss="categorical_crossentropy", metrics=["accuracy"])

In [17]:
checkpoint = ModelCheckpoint('my_models/nmt_eng2spa.h5', monitor='val_loss', mode='min', save_best_only=True, verbose=1)

In [18]:
reduceLR = ReduceLROnPlateau(monitor='val_loss',factor=0.2,patience=3,verbose=1,min_delta=0.0001)

In [19]:
callbacks = [checkpoint, reduceLR]

In [None]:
hist = model.fit(generate_batch(X=x_train, y=y_train), steps_per_epoch=train_samples//batch_size, epochs=epochs, callbacks=callbacks, verbose=1, validation_data=generate_batch(X=x_test, y=y_test), validation_steps=val_samples//batch_size)


