<a href="https://colab.research.google.com/github/angeruPpb/IA_Transformer/blob/main/Translator_e5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Ejecutar para importar keras-transformer
# pip install keras-transformer

In [None]:
import numpy as np
from keras_transformer import get_model, decode
from pickle import load
from google.colab import drive

np.random.seed(0)

In [None]:
drive.mount('/content/drive')
filename = '/content/drive/My Drive/Personal/Datasets/english-spanish.pkl'
dataset = load(open(filename, 'rb'))
print(dataset[120000,0])
print(dataset[120000,1])

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
tom is a new yorker but he doesnt have a new york accent
tom es neoyorquino pero no tiene acento de nueva york


In [None]:
# Crear tokens
source_tokens = []
for sentence in dataset[:,0]:
  source_tokens.append(sentence.split(' '))
print(source_tokens[120000])

target_tokens = []
for sentence in dataset[:,1]:
  target_tokens.append(sentence.split(' '))
print(target_tokens[120000])

['tom', 'is', 'a', 'new', 'yorker', 'but', 'he', 'doesnt', 'have', 'a', 'new', 'york', 'accent']
['tom', 'es', 'neoyorquino', 'pero', 'no', 'tiene', 'acento', 'de', 'nueva', 'york']


In [None]:
def build_token_dictionary(token_list):
  token_dict = {
      '<PAD>': 0,
      '<START>': 1,
      '<END>':2
  }
  for tokens in token_list:
    for token in tokens:
      if token not in token_dict:
        token_dict[token] = len(token_dict)
  return token_dict

In [None]:
# Asignamos un valor numerico a cada token/palabra en el dataset
source_token_dict = build_token_dictionary(source_tokens)
target_token_dict = build_token_dictionary(target_tokens)
target_token_dict_inv = {v:k for k,v in target_token_dict.items()}

# print(source_token_dict)
# print(target_token_dict)
# print(target_token_dict_inv)

In [None]:
# Agregar Start,End y Pad a cada frase del set de entrenamiento
encoder_tokens = [['<START>'] + tokens + ['<END>'] for tokens in source_tokens]
decoder_tokens = [['<START>'] + tokens + ['<END>'] for tokens in target_tokens]
output_tokens = [tokens + ['<END>'] for tokens in target_tokens]

source_max_len = max(map(len, encoder_tokens))
target_max_len = max(map(len, decoder_tokens))

encoder_tokens = [tokens + ['<PAD>']*(source_max_len-len(tokens)) for tokens in encoder_tokens]
decoder_tokens = [tokens + ['<PAD>']*(target_max_len-len(tokens)) for tokens in decoder_tokens]
output_tokens = [tokens + ['<PAD>']*(target_max_len-len(tokens)) for tokens in output_tokens]

In [None]:
# Asi quedan los nuevos tokens
print(encoder_tokens[120000])
print(decoder_tokens[120000])
print(output_tokens[120000])

['<START>', 'go', '<END>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
['<START>', 've', '<END>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
['ve', '<END>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<P

In [53]:
encoder_input = [list(map(lambda x: source_token_dict[x], tokens)) for tokens in encoder_tokens]
decoder_input = [list(map(lambda x: target_token_dict[x], tokens)) for tokens in decoder_tokens]
output_decoded = [list(map(lambda x: target_token_dict[x], tokens)) for tokens in output_tokens]

print(encoder_input[120000])
print(decoder_input[120000])
print(output_decoded[120000])

[1, 56, 258, 120, 197, 12666, 2914, 32, 1577, 140, 120, 197, 5385, 4287, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 123, 387, 23929, 1210, 45, 1006, 8223, 98, 1233, 10238, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[123, 387, 23929, 1210, 45, 1006, 8223, 98, 1233, 10238, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [None]:
# Crear Transformer
model = get_model(
    token_num= max(len(source_token_dict), len(target_token_dict)),
    embed_dim = 32,         # Entrada del vector Embedding
    encoder_num = 2,        # Cant. de bloques de codificacion, son 6
    decoder_num = 2,        # Cant. de bloques de decodificacion, son 6
    head_num = 4,           # Cant. de bloques atencionales, son 8
    hidden_dim = 128,       # Cant. de neuronas en la red Neuronal de cada bloque, son 2048
    dropout_rate = 0.05,    # Porcentaje de neuropnas que se desactivan para evitar Overffiting
    use_same_embed = False, # La representacion de las frases en ingles y español son diferentes
)

model.compile('adam', 'sparse_categorical_crossentropy')
model.summary()



Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 Encoder-Input (InputLayer)  [(None, None)]               0         []                            
                                                                                                  
 Encoder-Token-Embedding (E  [(None, None, 32),           808608    ['Encoder-Input[0][0]']       
 mbeddingRet)                 (25269, 32)]                                                        
                                                                                                  
 Encoder-Embedding (TrigPos  (None, None, 32)             0         ['Encoder-Token-Embedding[0][0
 Embedding)                                                         ]']                           
                                                                                            

In [None]:
# Entrenamiento
x = [np.array(encoder_input), np.array(decoder_input)]
y = np.array(output_decoded)

model.fit(x,y, epochs=5, batch_size=32)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7d8f58618e50>

In [None]:
# Guardar archivo con pesos en el drive
# model.save('/content/drive/My Drive/Personal/Transformers/translator_e5.h5')

  saving_api.save_model(


In [None]:
# newfile = '/content/drive/My Drive/Personal/Transformers/translator_e5.h5'
# La funcion load_weights es propia de keras-transformer
# model.load_weights(newfile)

In [None]:
def translate(sentence):
  sentence_tokens = [tokens + ['<END>', '<PAD>'] for tokens in [sentence.split(' ')]]
  transformer_input = [list(map(lambda x: source_token_dict[x], tokens)) for tokens in sentence_tokens][0]
  print('tr_input: ', transformer_input)
  decoded = decode(
      model,
      transformer_input,
      start_token = target_token_dict['<START>'],
      end_token = target_token_dict['<END>'],
      pad_token = target_token_dict['<PAD>']
  )

  print('Frase original: {}'.format(sentence))
  print(decoded[1:-1])
  print('Traduccion: {}'.format(' '.join(map(lambda x: target_token_dict_inv[x], decoded[1:-1]))))


In [None]:
translate('the red car')

tr_input:  [750, 199, 748, 2, 0]
Frase original: the red car
[67, 1568, 5027]
Traduccion: el coche roja
