In [107]:
!python -m pip install keras



In [109]:
import pandas as pd
import tensorflow as tf
from unidecode import unidecode
import string
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.sequence import pad_sequences
# from keras.utils.vis_utils import plot_model

In [32]:
file = open("../../dados/sentence-pairs-english-portuguese-2023-11.tsv", "r")
row = 0
with file:
  try:
    while line := file.readline():
      row += 1
      # print(row, line.rstrip())
  except Exception as e:
    print("Last Row read: ", row)
    print(e)


Last Row read:  2694
'charmap' codec can't decode byte 0x81 in position 5159: character maps to <undefined>


In [33]:
texts = pd.read_csv("../../dados/sentence-pairs-english-portuguese-2023-11.tsv", encoding="utf-8", sep="\t",
                   names=['english_id', 'english_text', 'portuguese_id', 'portuguese_text'])

In [35]:
texts["sentence"] = texts["english_text"].astype(str) + "<sep>" + texts["portuguese_text"].astype(str)

In [51]:
### Remover os acentos e caracteres especiais do ingles e do portugues

In [50]:
trans = str.maketrans('\n\r\t', '   ', string.punctuation)

def preprocess_sentence( sentence ):
    new_sentence = unidecode( sentence )
    return new_sentence.translate(trans)

In [58]:
texts["english_text_clean"] = texts["english_text"].apply(preprocess_sentence)

In [59]:
texts["portuguese_text_clean"] = texts["portuguese_text"].apply(preprocess_sentence)

In [60]:
### Adicionar tags nas sentenças

In [61]:
def tag_sentence( sentence ):
    return "<sos> " + sentence + " <eos>"

In [62]:
texts["english_text_tagged"] = texts["english_text_clean"].apply(tag_sentence)

In [63]:
texts["portuguese_text_tagged"] = texts["portuguese_text_clean"].apply(tag_sentence)

In [83]:
english_train = texts["english_text_tagged"]
portuguese_train = texts["portuguese_text_tagged"]

In [84]:
### Criando vocabulario ingles
english_tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token="<unk>")
english_tokenizer.fit_on_texts(english_train)
english_vocab_size = len(english_tokenizer.word_index) + 1
english_tokenizer.get_config()

{'num_words': None,
 'filters': '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
 'lower': True,
 'split': ' ',
 'char_level': False,
 'oov_token': '<unk>',
 'document_count': 284341,
 'index_docs': '{"2": 284341, "213": 1253, "249": 1047, "131": 2198, "3": 284341, "52": 6187, "5": 60573, "6": 62347, "340": 711, "19": 15524, "1721": 88, "18": 15614, "140": 2043, "15": 16127, "583": 369, "4920": 18, "7881": 8, "10": 37947, "5954": 13, "95": 3040, "1256": 135, "1708": 86, "4": 59828, "126": 2267, "50": 6145, "287": 850, "28": 11094, "457": 484, "25": 12983, "30": 10974, "40": 7800, "9": 41251, "1983": 72, "20": 15304, "93": 3108, "61": 5155, "481": 456, "97": 2943, "130": 2171, "32": 10014, "24": 13486, "27": 11603, "17": 16985, "11": 26739, "62": 4966, "1541": 98, "11496": 4, "1785": 83, "13": 24386, "535": 411, "1895": 76, "68": 4494, "63": 4829, "199": 1294, "1505": 105, "16337": 2, "21": 14473, "81": 3582, "100": 2735, "160": 1732, "70": 4420, "1738": 85, "8": 50078, "23": 13630, "22": 14196,

In [85]:
portuguese_tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token="<unk>")
portuguese_tokenizer.fit_on_texts(portuguese_train)
portuguese_vocab_size = len(portuguese_tokenizer.word_index) + 1
portuguese_tokenizer.get_config()

{'num_words': None,
 'filters': '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
 'lower': True,
 'split': ' ',
 'char_level': False,
 'oov_token': '<unk>',
 'document_count': 284341,
 'word_counts': '{"sos": 284380, "vamos": 2020, "tentar": 520, "alguma": 1380, "coisa": 2087, "eos": 284341, "algo": 1668, "preciso": 1811, "ir": 3439, "dormir": 600, "tenho": 4867, "que": 67826, "de": 60597, "hoje": 2626, "e": 57534, "dia": 2334, "18": 53, "junho": 87, "aniversario": 399, "do": 14356, "muiriel": 18, "o": 66964, "muriel": 6, "completou": 25, "20": 263, "anos": 2552, "tem": 9032, "agora": 2884, "a": 60608, "esta": 20891, "com": 16741, "fez": 2173, "senha": 87, "voltarei": 102, "em": 17504, "breve": 252, "volto": 54, "logo": 471, "ja": 4050, "estarei": 224, "volta": 627, "nao": 54122, "palavras": 487, "isso": 15233, "nunca": 3066, "acaba": 64, "vai": 4991, "ter": 3295, "fim": 502, "acabar": 101, "acabara": 23, "eu": 46449, "simplesmente": 241, "sei": 3772, "dizer": 1946, "era": 3820, "um": 25330, "c

In [86]:
english_vocab_size, portuguese_vocab_size

(41448, 54674)

In [90]:
def generate_decoder_inputs_targets(sentences, tokenizer):
  seqs = tokenizer.texts_to_sequences(sentences)
  decoder_inputs = [s[:-1] for s in seqs] # Drop the last token in the sentence.
  decoder_targets = [s[1:] for s in seqs] # Drop the first token in the sentence.

  return decoder_inputs, decoder_targets

In [91]:
### Criando Sequences das sentenças
train_encoder_inputs = english_tokenizer.texts_to_sequences(english_train)

In [92]:
train_decoder_inputs, train_decoder_targets = generate_decoder_inputs_targets(portuguese_train, 
                                                                              portuguese_tokenizer)

In [93]:
max_encoding_len = len(max(train_encoder_inputs, key=len))
max_encoding_len

198

In [94]:
max_decoding_len = len(max(train_decoder_inputs, key=len))
max_decoding_len

201

In [97]:
padded_train_encoder_inputs = pad_sequences(train_encoder_inputs, max_encoding_len, padding='post', truncating='post')
padded_train_decoder_inputs = pad_sequences(train_decoder_inputs, max_decoding_len, padding='post', truncating='post')
padded_train_decoder_targets = pad_sequences(train_decoder_targets, max_decoding_len, padding='post', truncating='post')

In [98]:
embedding_dim = 128
hidden_dim = 256
default_dropout=0.2
batch_size = 32
epochs = 30

In [100]:
# The initial encoder input layer which will take in padded sequences. We're specifying
# a shape of None here but you can specify it upfront if you want since we
# know what the max encoding length is.
encoder_inputs = layers.Input(shape=[None], name='encoder_inputs')

# The embedding layer. Similar to what we did in the RNN demo.
encoder_embeddings = layers.Embedding(english_vocab_size, 
                                      embedding_dim,
                                      mask_zero=True,
                                      name='encoder_embeddings')

# Passing the input layer output to the embedding layer creates a link between the
# two. Input sequences will now flow into the embedding layer which will output
# a sequence of embeddings.
encoder_embedding_output = encoder_embeddings(encoder_inputs)


# We're not using any kind of attention mechanism in this model, so setting only
# return_state to True is enough. return_sequences remains False.
encoder_lstm = layers.LSTM(hidden_dim, 
                           return_state=True, 
                           dropout=default_dropout, 
                           name='encoder_lstm')

# Passing the embedding layer output to the LSTM layer creates another link.
# IMPORTANT: The LSTM always returns three values. When return_sequences is
# False, encoder_outputs and state_h are the SAME. When return_sequences is
# True, encoder_outputs contains the encoder hidden states from each time step.
#
# Side note: we won't be using encoder_outputs here so that variable can be 
# replaced with a _ if preferred.
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding_output)

# The final hidden and cell/context states from the encoder will be the the
# initial states for the decoder.
encoder_states = (state_h, state_c)

In [102]:
decoder_inputs = layers.Input(shape=[None], name='decoder_inputs')


decoder_embeddings = layers.Embedding(portuguese_vocab_size, 
                                      embedding_dim, 
                                      mask_zero=True,
                                      name='decoder_embeddings')


decoder_embedding_output = decoder_embeddings(decoder_inputs)

# Return sequences set to True.
decoder_lstm = layers.LSTM(hidden_dim,
                           return_sequences=True,
                           return_state=True,
                           dropout=default_dropout,
                           name='decoder_lstm')


# Set the decoder's initial state to the encoder's final output states. Since
# return_sequences is set to True, decoder_outputs is going to be a collection of
# the decoder's hidden state at each timestep. Also note that since we don't need
# the decoder's final hidden output and cell states, those are just set to _.
decoder_outputs, _, _ = decoder_lstm(decoder_embedding_output, initial_state=encoder_states)

# Have a softmax layer in the end to create a probability distribution for the output word.
decoder_dense = layers.Dense(portuguese_vocab_size, activation='softmax', name='decoder_dense')

# The probability distribution for the output word.
y_proba = decoder_dense(decoder_outputs)

In [103]:
# Note how the model is taking two inputs in an array.
model = tf.keras.Model([encoder_inputs, decoder_inputs], y_proba, name='hun_eng_seq2seq_nmt_no_attention')

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy',  metrics='sparse_categorical_accuracy')
model.summary()

Model: "hun_eng_seq2seq_nmt_no_attention"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 encoder_inputs (InputLayer  [(None, None)]               0         []                            
 )                                                                                                
                                                                                                  
 decoder_inputs (InputLayer  [(None, None)]               0         []                            
 )                                                                                                
                                                                                                  
 encoder_embeddings (Embedd  (None, None, 128)            5305344   ['encoder_inputs[0][0]']      
 ing)                                                              

In [110]:
es_callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)

In [112]:
history = model.fit([padded_train_encoder_inputs, padded_train_decoder_inputs], padded_train_decoder_targets,
                      batch_size=batch_size,
                      epochs=epochs,
                      callbacks=[cp_callback, es_callback])

NameError: name 'cp_callback' is not defined

In [104]:
# from keras.utils.vis_utils import plot_model
# plot_model(model, to_file='hun_eng_seq2seq_nmt_no_attention.png', show_shapes=True, show_layer_names=True)

ModuleNotFoundError: No module named 'keras.utils.vis_utils'

In [80]:
### Separar em Train e Test
english_train, english_test, portuguese_train, portuguese_test = \
train_test_split(texts["english_text_tagged"], texts["portuguese_text_tagged"], random_state=10)