In [1]:
import pandas as pd
import numpy as np
from time import time

import tensorflow.keras as keras
from keras import Sequential
from keras.optimizers import Adam
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.losses import sparse_categorical_crossentropy
from keras.layers import *

# Dataset

In [2]:
df = pd.read_csv("https://raw.githubusercontent.com/eduardofc/data/main/es_en.csv")
df.head()

Unnamed: 0,en,es
0,I hope you're not alone.,Espero que no estés solo.
1,"When I was taking a bath, the telephone rang.","Mientras me bañaba, sonó el teléfono."
2,I just need you to come with me.,Solo necesito que vengas conmigo.
3,Tom wondered how soon Mary would have dinner r...,Tom se preguntaba cuán pronto María tendría li...
4,Tom is waiting for an answer.,Tom está esperando una respuesta.


In [7]:
# spanish
es_sentences = df.es.values
es_tokenizer = Tokenizer()
es_tokenizer.fit_on_texts(es_sentences)
es_sequences = es_tokenizer.texts_to_sequences(es_sentences)

In [10]:
# english
en_sentences = df.en.values
en_tokenizer = Tokenizer()
en_tokenizer.fit_on_texts(en_sentences)
en_sequences = en_tokenizer.texts_to_sequences(en_sentences)

In [19]:
es_length = max([len(ss) for ss in es_sequences])
en_length = max([len(ss) for ss in en_sequences])

print(f"{es_length} | {en_length}")

31 | 25


In [24]:
es_vocab = len(es_tokenizer.word_index) +1
en_vocab = len(en_tokenizer.word_index) +1

print(f"{es_vocab} | {en_vocab}")

7893 | 5053


In [25]:
es_padded = pad_sequences(es_sequences, maxlen=es_length, truncating='post')
en_padded = pad_sequences(en_sequences, maxlen=en_length, truncating='post')

# Model

In [26]:
keras.utils.set_random_seed(812)

embed_dim = 128

model = Sequential([
    Embedding(
        input_dim=es_vocab,
        input_length=es_length,
        output_dim=embed_dim
    ),
    LSTM(64, return_sequences=False),
    RepeatVector(en_length),
    LSTM(64, return_sequences=True, dropout=.2),
    TimeDistributed(Dense(en_vocab, activation='softmax'))
])




In [27]:
model.compile(
    loss=sparse_categorical_crossentropy,
    optimizer=Adam(1e-3), 
    metrics=['accuracy']
)

In [28]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 31, 128)           1010304   
                                                                 
 lstm (LSTM)                 (None, 64)                49408     
                                                                 
 repeat_vector (RepeatVecto  (None, 25, 64)            0         
 r)                                                              
                                                                 
 lstm_1 (LSTM)               (None, 25, 64)            33024     
                                                                 
 time_distributed (TimeDist  (None, 25, 5053)          328445    
 ributed)                                                        
                                                                 
Total params: 1421181 (5.42 MB)
Trainable params: 142118

In [29]:
start = time()

model.fit(
    es_padded,
    en_padded,
    epochs=35
)

end = time()
print(f">>>> Elapsed time: {(end-start)/60}m")

Epoch 1/35


Epoch 2/35
Epoch 3/35
Epoch 4/35
Epoch 5/35
Epoch 6/35
Epoch 7/35
Epoch 8/35
Epoch 9/35
Epoch 10/35
Epoch 11/35
Epoch 12/35
Epoch 13/35
Epoch 14/35
Epoch 15/35
Epoch 16/35
Epoch 17/35
Epoch 18/35
Epoch 19/35
Epoch 20/35
Epoch 21/35
Epoch 22/35
Epoch 23/35
Epoch 24/35
Epoch 25/35
Epoch 26/35
Epoch 27/35
Epoch 28/35
Epoch 29/35
Epoch 30/35
Epoch 31/35
Epoch 32/35
Epoch 33/35
Epoch 34/35
Epoch 35/35
>>>> Elapsed time: 12.595557185014089m


In [30]:
model.save("model_seq2seq35.h5")

  saving_api.save_model(


In [34]:
# from keras.models import load_model

# model2 = load_model("model_seq2seq35.h5")

In [57]:
model = load_model("model_seqseq2_500.h5")

# Predicciones

In [59]:
ii = 457
# ii = 841

print(f"{es_sentences[ii]}")
print(f"{en_sentences[ii]}")

preds = model.predict(es_padded[ii:ii+1])[0]
' '.join([en_tokenizer.index_word[ww] for ww in np.argmax(preds, 1) if ww!=0])

Debes dar lo mejor.
You must do your best.


'you must do your best'