# Seq2Seq encoder decoder

In [1]:
import gensim
import numpy as np
import pandas as pd
from tensorflow import keras
from sklearn.model_selection import train_test_split
from nltk.translate.meteor_score import single_meteor_score
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding

2024-04-23 05:12:22.770053: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-23 05:12:22.819946: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-23 05:12:22.819986: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-23 05:12:22.822142: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-04-23 05:12:22.829420: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-23 05:12:22.830226: I tensorflow/core/platform/cpu_feature_guard.cc:1

In [2]:
def one_hot_encode_sequences(sequences, vocabulary_size):
    """
    One-hot encode a list of sequences.

    Args:
    sequences (list of lists): List of sequences, where each sequence is a list of integers representing tokens.
    vocabulary_size (int): Size of the vocabulary.

    Returns:
    list of numpy arrays: List of one-hot encoded matrices.
    """
    one_hot_matrices = []
    for sequence in sequences:
        one_hot_matrix = np.zeros((len(sequence), vocabulary_size), dtype=np.int32)
        for i, token_id in enumerate(sequence):
            one_hot_matrix[i, token_id] = 1
        one_hot_matrices.append(one_hot_matrix)
    return np.array(one_hot_matrices)

In [14]:
# Load the dataset
df = pd.read_csv('translation_train.csv')
arabic_texts = df['Arabic'].values
english_texts = df['English'].values
english_texts = ['<BOS> '+s+' <EOS>' for s in english_texts]
decoder_output_size = 1000

# Tokenize the text
ar_tokenizer = Tokenizer()
ar_tokenizer.fit_on_texts(arabic_texts)
en_tokenizer = Tokenizer(num_words=decoder_output_size)
en_tokenizer.fit_on_texts(english_texts)
arabic_sequences = ar_tokenizer.texts_to_sequences(arabic_texts)
english_sequences = en_tokenizer.texts_to_sequences(english_texts)
indicies = [i for i in range(len(arabic_sequences))  if len(arabic_sequences[i])<40 & len(english_sequences[i])<38]
# Pad the sequences
max_sequence_length = 40#max(max(len(seq) for seq in arabic_sequences), max(len(seq) for seq in english_sequences))
arabic_sequences = pad_sequences([arabic_sequences[i] for i in indicies], maxlen=max_sequence_length, padding='post')
english_sequences = pad_sequences([english_sequences[i] for i in indicies], maxlen=max_sequence_length, padding='post')

decoder_input_data = []
decoder_target_data = []
for i in range(english_sequences.shape[0]):
    decoder_input_data.append(english_sequences[i][:-1].tolist())
    decoder_target_data.append(english_sequences[i][1:].tolist())


encoder_input_data = arabic_sequences
decoder_input_data = np.array(decoder_input_data)
decoder_target_data = one_hot_encode_sequences(decoder_target_data,decoder_output_size)
# Split the data
arabic_train, arabic_val, english_train, english_val, decoder_input_train, decoder_input_val, decoder_target_train, decoder_target_val = train_test_split(arabic_sequences, english_sequences, decoder_input_data, decoder_target_data, test_size=0.2)

In [15]:
# Load the ArbEngVec model
arbengvec_model_path = 'randshuffle_5window_skipgram_300size.model'
arbengvec_model = gensim.models.KeyedVectors.load(arbengvec_model_path).wv

# Define the vocabulary size and embedding dimension
vocab_size = len(arbengvec_model.key_to_index) + 1  # Plus 1 for the padding token
embedding_dim = arbengvec_model.vector_size

# Create the embedding matrix
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in arbengvec_model.key_to_index.items():
    embedding_vector = arbengvec_model[word]
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Create the embedding layer
embedding_layer = Embedding(input_dim=vocab_size,
                            output_dim=embedding_dim,
                            weights=[embedding_matrix],
                            trainable=False)  # Set trainable to False to keep the embeddings fixed

In [16]:
# Define an input sequence and process it.
encoder_inputs = Input(shape=(max_sequence_length,))
x = embedding_layer(encoder_inputs)
x, state_h, state_c = LSTM(embedding_dim,
                           return_state=True)(x)
encoder_states = [state_h, state_c]

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(max_sequence_length-1,))
x = embedding_layer(decoder_inputs)
decoder_lstm = LSTM(300, return_sequences=True, return_state=True)
x,_,_ = decoder_lstm(x, initial_state=encoder_states)

# Add a Dense layer with 224 units to further reduce the shape to (None, 224)
decoder_outputs = Dense(decoder_output_size, activation='softmax')
output = decoder_outputs(x)
# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = keras.Model([encoder_inputs, decoder_inputs], output)

# Compile & run training
model.compile(optimizer='adam', loss='categorical_crossentropy')
# Note that `decoder_target_data` needs to be one-hot encoded,
# rather than sequences of integers like `decoder_input_data`!
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_4 (InputLayer)        [(None, 39)]                 0         []                            
                                                                                                  
 input_3 (InputLayer)        [(None, 40)]                 0         []                            
                                                                                                  
 embedding_1 (Embedding)     multiple                     2317617   ['input_3[0][0]',             
                                                          00         'input_4[0][0]']             
                                                                                                  
 lstm_2 (LSTM)               [(None, 300),                721200    ['embedding_1[0][0]']   

In [65]:
model.fit([arabic_train, decoder_input_train], decoder_target_train,
          batch_size=512,
          epochs=50,
          validation_split=0.2)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x7f8dbf0d1030>

In [72]:
encoder_model = keras.Model(encoder_inputs, encoder_states)

decoder_state_input_h = Input(batch_shape=(300,None))
decoder_state_input_c = Input(batch_shape=(300,None))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoded, state_h, state_c = decoder_lstm(
    embedding_layer(decoder_inputs), initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
outputs = decoder_outputs(decoded)
decoder_model = keras.Model(
    [decoder_inputs] + decoder_states_inputs,
    [outputs] + decoder_states)


TypeError: 'KerasTensor' object is not callable

In [66]:
def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(np.array([input_seq]))

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    sequence = [1]
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict(
            [pad_sequences([sequence],maxlen=max_sequence_length-1,padding='post')] + states_value)
        # Sample a token
        token = np.argmax(output_tokens[0][len(sequence)-1])
        sequence.append(token)
        
        if (token == 2 or
           len(sequence) > max_sequence_length-1):
            stop_condition = True

        # Update states
        states_value = [h, c]

    return sequence

In [67]:
predicted_tokens = list(map(decode_sequence,arabic_val[:5]))
predicted_text = en_tokenizer.sequences_to_texts(predicted_tokens)



In [68]:
ref = [x.split() for x in en_tokenizer.sequences_to_texts(english_val[:5])]
pred = [x.split() for x in predicted_text]

In [69]:
np.array(list(map(single_meteor_score,ref,pred))).mean()

0.11824324324324323

In [70]:
predicted_text

['bos tom', 'bos the', 'bos what', 'bos the', 'bos i']

In [71]:
en_tokenizer.sequences_to_texts(english_val[:5])

['bos tom the in front of house eos',
 'bos is this a picture that you eos',
 'bos she put down her on paper eos',
 'bos is the most beautiful city in eos',
 'bos she took a to the hospital eos']

# Need more training and powerful system to utilize the complete vocab size and sequence length