In [187]:
import numpy as np
from keras.layers import LSTM,Input,Dense,Embedding,Dot,Concatenate,Softmax,TimeDistributed
from keras.models import Model
import random
from keras.models import load_model
import string
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import EarlyStopping

In [174]:
# Vectorize the data.
input_texts = []
target_texts = []
input_characters = set()
target_characters = set()
with open(r"D:\Datasets\spa.txt", 'r', encoding='utf-8') as f:
    lines = f.read().split('\n')
for line in lines[: min(10000, len(lines) - 1)]:
    input_text, target_text = line.split('\t')
    # We use "tab" as the "start sequence" character
    # for the targets, and "\n" as "end sequence" character.
    # target_text = '\t' + target_text + '\n'
    input_texts.append(input_text)
    target_texts.append(target_text)
    for char in input_text:
        if char not in input_characters:
            input_characters.add(char)
    for char in target_text:
        if char not in target_characters:
            target_characters.add(char)

In [175]:
def clean_sentence(sentence):
    # Add a space ' ' befor the ? word
    sentence = sentence.replace('?', ' ?')
    # Lower case the sentence
    lower_case_sent = sentence.lower()
    # Strip punctuation
    string_punctuation = string.punctuation
    string_punctuation = string_punctuation.replace('?','')
    # Clean the sentence
    clean_sentence = lower_case_sent.translate(str.maketrans('', '', string_punctuation))
    return clean_sentence

In [176]:
cleaned_input_sentence = []
cleaned_output_sentence = []

for sentence in input_texts:
    sentence = clean_sentence(sentence)
    cleaned_input_sentence.append(sentence)

for sentence in target_texts:
    sentence = clean_sentence(sentence)
    sentence = '_start_ ' + sentence + ' _end_'
    cleaned_output_sentence.append(sentence)

In [177]:
cleaned_output_sentence

['_start_ ve _end_',
 '_start_ vete _end_',
 '_start_ vaya _end_',
 '_start_ váyase _end_',
 '_start_ hola _end_',
 '_start_ ¡corre _end_',
 '_start_ corred _end_',
 '_start_ ¿quién ? _end_',
 '_start_ ¡fuego _end_',
 '_start_ ¡incendio _end_',
 '_start_ ¡disparad _end_',
 '_start_ ¡ayuda _end_',
 '_start_ ¡socorro ¡auxilio _end_',
 '_start_ ¡auxilio _end_',
 '_start_ ¡salta _end_',
 '_start_ salte _end_',
 '_start_ ¡parad _end_',
 '_start_ ¡para _end_',
 '_start_ ¡pare _end_',
 '_start_ ¡espera _end_',
 '_start_ esperen _end_',
 '_start_ continúa _end_',
 '_start_ continúe _end_',
 '_start_ hola _end_',
 '_start_ corrí _end_',
 '_start_ corría _end_',
 '_start_ lo intento _end_',
 '_start_ ¡he ganado _end_',
 '_start_ ¡oh no _end_',
 '_start_ tomátelo con soda _end_',
 '_start_ sonríe _end_',
 '_start_ ¡al ataque _end_',
 '_start_ ¡atacad _end_',
 '_start_ levanta _end_',
 '_start_ ve ahora mismo _end_',
 '_start_ ¡lo tengo _end_',
 '_start_ ¿lo pillas ? _end_',
 '_start_ ¿entendiste 

In [178]:
max_input_sentence_length = max([len(txt.split()) for txt in cleaned_input_sentence])

max_output_sentence_length = max([len(txt.split()) for txt in cleaned_output_sentence])

In [179]:
max_input_sentence_length

6

In [180]:
max_output_sentence_length

11

Tokenization

In [181]:
tokenizer_x = Tokenizer()
tokenizer_y = Tokenizer()

tokenizer_x.fit_on_texts(cleaned_input_sentence)
tokenizer_y.fit_on_texts(cleaned_output_sentence)

vocab_size_x = len(tokenizer_x.word_index) + 1
vocab_size_y = len(tokenizer_y.word_index) + 1

encoder_data = tokenizer_x.texts_to_sequences(cleaned_input_sentence)
decoder_data = tokenizer_y.texts_to_sequences(cleaned_output_sentence)


padded_encoded_data = pad_sequences(encoder_data,maxlen=max_input_sentence_length,padding="post")
padded_decoded_data = pad_sequences(decoder_data,maxlen=max_output_sentence_length,padding="post")

Developing endcoder decoder model with attention mechanism

In [182]:
# encoder

encoder_input = Input(shape=(max_input_sentence_length,),name="encoder_input_layer")
embedding_layer_encoder = Embedding(input_dim=vocab_size_x,output_dim=100,trainable=True,name="encoder_embedding_layer")(encoder_input)
lstm1,state_h1,state_c1 = LSTM(256,return_state=True,return_sequences=True,name="encoder_lstm1")(embedding_layer_encoder)
lstm2,state_h2,state_c2 = LSTM(256,return_state=True,return_sequences=True,name="encoder_lstm2")(lstm1)
encoder_output,h,c = LSTM(256,return_state=True,return_sequences=True,name="encoder_output_layer")(lstm2)

encoder_states = [h,c]

# decoder

decoder_input = Input(shape=(None,),name="decoder_input_layer")
embedding_layer_decoder = Embedding(input_dim=vocab_size_y,output_dim=100,trainable=True,name="decoder_embedding_layer")(decoder_input)
decoder,state_h,state_c = LSTM(256,return_sequences=True,return_state=True,name="decoder_lstm")(embedding_layer_decoder,initial_state=encoder_states)
decoder_states = [state_h,state_c]

# Implementing Attention Mechanism
dot_layer_1 = Dot(axes=(2,2),name="dot_layer_1")([decoder,encoder_output])
activation_layer = Softmax(name="softmax_layer")(dot_layer_1)
dot_layer_2 = Dot(axes=(2,1),name="dot_layer2")([activation_layer,encoder_output])
concatenation_Layer = Concatenate(axis=-1,name="concatenation_layer")([dot_layer_2,decoder])

# output

dense_layer = TimeDistributed(Dense(vocab_size_y,activation='softmax',name="dense_layer"))(concatenation_Layer)
# dense_layer = Dense(units=vocab_size_y,activation='softmax',name="dense_layer")(concatenation_Layer)

model = Model([encoder_input,decoder_input],[dense_layer])

In [183]:
model.compile(loss="sparse_categorical_crossentropy", optimizer="nadam", metrics=["accuracy"])

In [184]:
model.summary()

Model: "model_12"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 encoder_input_layer (Input  [(None, 6)]                  0         []                            
 Layer)                                                                                           
                                                                                                  
 encoder_embedding_layer (E  (None, 6, 100)               236300    ['encoder_input_layer[0][0]'] 
 mbedding)                                                                                        
                                                                                                  
 encoder_lstm1 (LSTM)        [(None, 6, 256),             365568    ['encoder_embedding_layer[0][0
                              (None, 256),                          ]']                    

In [185]:
padded_encoded_input_data = padded_encoded_data
padded_decoded_input_data = padded_decoded_data[:,:-1]
padded_decoded_output_data = padded_decoded_data.reshape(padded_decoded_data.shape[0],padded_decoded_data.shape[1],1)[:,1:]

print(padded_encoded_input_data.shape)
print(padded_decoded_input_data.shape)
print(padded_decoded_output_data.shape)

(10000, 6)
(10000, 10)
(10000, 10, 1)


In [188]:
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1)

In [189]:
model.fit([padded_encoded_input_data,padded_decoded_input_data],padded_decoded_data[:,1:],batch_size=32,epochs=100,verbose=1,validation_split=0.2,callbacks=[es])

Epoch 1/100
Epoch 2/100
Epoch 2: early stopping


<keras.src.callbacks.History at 0x15fec1df610>