# Sequence To Sequence Learning

In [1]:


# Define file paths
text_path = '/content/ara_eng.txt'

# Read English text file
with open(text_path, 'r', encoding='utf-8') as file:
    text = file.readlines()




In [2]:

pairs = []
for line in text:
    # Split English and Arabic sentences by tab character
    eng, arb = line.strip().split('\t')
    # Clean and tokenize (if necessary)
    eng = eng.strip()
    arb = arb.strip()
    # Add the pair to the list of pairs
    pairs.append((eng, arb))

english_data = [pair[0] for pair in pairs]
urdu_data = [pair[1] for pair in pairs]

In [3]:
import nltk
from nltk.tokenize import word_tokenize

# Download NLTK resources (if not already downloaded)
nltk.download('punkt')

# Tokenize English text
english_tokenized = [word_tokenize(sentence.strip()) for sentence in english_data]

# Tokenize Urdu text (assuming you have Urdu text stored in the variable urdu_data)
urdu_tokenized = [word_tokenize(sentence.strip()) for sentence in urdu_data]

# Print a few tokenized sentences to verify
print("English Tokenized:")
for sentence in english_tokenized[:5]:
    print(sentence)

print("\nUrdu Tokenized:")
for sentence in urdu_tokenized[:5]:
    print(sentence)



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


English Tokenized:
['Hi', '.']
['Run', '!']
['Help', '!']
['Jump', '!']
['Stop', '!']

Urdu Tokenized:
['مرحبًا', '.']
['اركض', '!']
['النجدة', '!']
['اقفز', '!']
['قف', '!']


In [4]:
# Flatten the list of tokenized sentences into a single list of tokens
english_tokens_flat = [token for sentence in english_tokenized for token in sentence]

# Create a set of unique tokens
english_unique_tokens = set(english_tokens_flat)

# Assign a unique index to each token in the vocabulary
english_vocab = {token: idx for idx, token in enumerate(english_unique_tokens)}

# Print the vocabulary
print("English Vocabulary:")
print(english_vocab)
# Flatten the list of tokenized sentences into a single list of tokens
urdu_tokens_flat = [token for sentence in urdu_tokenized for token in sentence]
# Create a set of unique tokens
urdu_unique_tokens = set(urdu_tokens_flat)

# Assign a unique index to each token in the vocabulary
urdu_vocab = {token: idx for idx, token in enumerate(urdu_unique_tokens)}

# Print the vocabulary
print("Urdu Vocabulary:")
print(urdu_vocab)
en_size=len(english_vocab)
ur_size=len(urdu_vocab)


English Vocabulary:
Urdu Vocabulary:
{'استاذا': 0, 'للمقدمين': 1, 'حقوق': 2, 'ليبقى': 3, 'وتابعيه': 4, 'فيرجح': 5, 'بمتناول': 6, 'لقرايه': 7, 'أبلى': 8, 'مولودون': 9, 'تعوق': 10, 'نطرنا': 11, 'البوذيين': 12, 'يعاني': 13, 'اللازمة': 14, 'المتصفحات': 15, 'البراميل': 16, 'الباكستان': 17, 'تثار': 18, 'سأذهب': 19, 'النواحي': 20, 'زميل': 21, 'عراة': 22, 'يذع': 23, 'الفتاةَ': 24, 'تتكلم': 25, 'مراهقا': 26, 'افتراضي': 27, 'بالرهان': 28, 'الكاجونيدا': 29, 'اجزاء': 30, 'والدولية': 31, 'بيروفية': 32, 'وتساعد': 33, 'بشجاعه': 34, 'ونيجيريا': 35, 'استدار': 36, 'متحرر': 37, 'تتعلم': 38, 'يلمح': 39, 'يعبروا': 40, 'اعتماده': 41, 'ارتباك': 42, 'باطلاعنا': 43, 'فهنا': 44, 'أصبحنا': 45, 'امامنا': 46, 'بخصوص': 47, 'لغتي': 48, 'غاي': 49, 'اضطهادات': 50, 'يكتب': 51, 'التاليتين': 52, 'للكتب': 53, 'متقبلا': 54, 'يحدق': 55, 'المضايقات': 56, 'الواضحِ': 57, 'أزعجني': 58, 'شارفول': 59, 'بعدد': 60, 'المشعة': 61, 'وسيسمح': 62, 'بحرفي': 63, 'ادراجه': 64, 'المسؤول': 65, 'اللغويين': 66, 'التدنيس': 67, 'تذييل': 68, 'ميل

In [5]:
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split

# Assume english_tokenized and urdu_tokenized are already defined
# Assume english_vocab and urdu_vocab are dictionaries mapping tokens to numerical IDs

# Numerical encoding for English tokenized sequences
english_encoded = [[english_vocab[token] for token in sentence] for sentence in english_tokenized]

# Numerical encoding for Urdu tokenized sequences
urdu_encoded = [[urdu_vocab[token] for token in sentence] for sentence in urdu_tokenized]

# Print a few encoded sequences to verify
print("Encoded English Sequences:")
for sequence in english_encoded[:5]:
    print(sequence)

print("\nEncoded Urdu Sequences:")
for sequence in urdu_encoded[:5]:
    print(sequence)

# Split the encoded sequences into training, validation, and test sets
# First split into training+validation and test sets
english_train_val, english_test, urdu_train_val, urdu_test = train_test_split(
    english_encoded, urdu_encoded, test_size=0.2, random_state=42)

# Then split the training+validation set into actual training and validation sets
english_train, english_val, urdu_train, urdu_val = train_test_split(
    english_train_val, urdu_train_val, test_size=0.25, random_state=42)  # 0.25 * 0.8 = 0.2 of the original data

# Print the sizes of the training, validation, and test sets
print("Number of training examples:", len(english_train))
print("Number of validation examples:", len(english_val))
print("Number of test examples:", len(english_test))

# Find the maximum sequence lengths for both English and Urdu sequences
max_english_length = max(len(seq) for seq in english_encoded)
max_urdu_length = max(len(seq) for seq in urdu_encoded)

# Pad the English sequences
english_train_padded = tf.keras.preprocessing.sequence.pad_sequences(english_train, maxlen=max_english_length, padding='post')
english_val_padded = tf.keras.preprocessing.sequence.pad_sequences(english_val, maxlen=max_english_length, padding='post')
english_test_padded = tf.keras.preprocessing.sequence.pad_sequences(english_test, maxlen=max_english_length, padding='post')


# Pad the Urdu sequences
urdu_train_padded = tf.keras.preprocessing.sequence.pad_sequences(urdu_train, maxlen=max_urdu_length, padding='post')
urdu_val_padded = tf.keras.preprocessing.sequence.pad_sequences(urdu_val, maxlen=max_urdu_length, padding='post')
urdu_test_padded = tf.keras.preprocessing.sequence.pad_sequences(urdu_test, maxlen=max_urdu_length, padding='post')


# Output the shapes of the padded datasets to verify
print("English Train Padded Shape:", english_train_padded.shape)
print("English Val Padded Shape:", english_val_padded.shape)
print("English Test Padded Shape:", english_test_padded.shape)
print("Urdu Train Padded Shape:", urdu_train_padded.shape)
print("Urdu Val Padded Shape:", urdu_val_padded.shape)
print("Urdu Test Padded Shape:", urdu_test_padded.shape)



Encoded English Sequences:
[17815, 16529]
[919, 17677]
[25719, 17677]
[25909, 17677]
[19090, 17677]

Encoded Urdu Sequences:
[26485, 35605]
[23895, 38222]
[39378, 38222]
[57401, 38222]
[22696, 38222]
Number of training examples: 14782
Number of validation examples: 4928
Number of test examples: 4928
English Train Padded Shape: (14782, 226)
English Val Padded Shape: (4928, 226)
English Test Padded Shape: (4928, 226)
Urdu Train Padded Shape: (14782, 225)
Urdu Val Padded Shape: (4928, 225)
Urdu Test Padded Shape: (4928, 225)


In [6]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, TimeDistributed


In [7]:

# Define the encoder
encoder_inputs = tf.keras.layers.Input(shape=(max_english_length,))
encoder_embedding = tf.keras.layers.Embedding(input_dim=len(english_vocab), output_dim=128)(encoder_inputs)
encoder_lstm, state_h, state_c = tf.keras.layers.LSTM(128, return_state=True)(encoder_embedding)
encoder_states = [state_h, state_c]

# Define the decoder
decoder_inputs = tf.keras.layers.Input(shape=(max_urdu_length,))
decoder_embedding = tf.keras.layers.Embedding(input_dim=len(urdu_vocab), output_dim=128)(decoder_inputs)
decoder_lstm = tf.keras.layers.LSTM(128, return_sequences=True)(decoder_embedding, initial_state=encoder_states)
decoder_outputs = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(len(urdu_vocab), activation='softmax'))(decoder_lstm)

# Define the model
model = tf.keras.models.Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Print the model summary
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 226)]                0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, 225)]                0         []                            
                                                                                                  
 embedding (Embedding)       (None, 226, 128)             3412096   ['input_1[0][0]']             
                                                                                                  
 embedding_1 (Embedding)     (None, 225, 128)             7407488   ['input_2[0][0]']             
                                                                                              

In [None]:

# Train the model
history = model.fit(
    [english_train_padded, urdu_train_padded],
    urdu_train_padded,
    validation_data=([english_val_padded, urdu_val_padded], urdu_val_padded),
    epochs=5,
    batch_size=64
)

Epoch 1/5


In [None]:
# Evaluate the model on test data
test_loss, test_accuracy = model.evaluate([english_test_padded, urdu_test_padded], urdu_test_padded)

# Print test loss and accuracy
print(f'Test Loss: {test_loss}, Test Accuracy: {test_accuracy}')


In [None]:
predictions = model.predict([english_test_padded, urdu_test_padded])

# Convert one-hot encoded predictions to class labels
predicted_labels = np.argmax(predictions, axis=-1)

# Print actual and predicted values for a few samples
for i in range(5):  # Adjust the range as needed
    print("English Input:", english_test_padded[i])
    print("Actual Urdu Output:", urdu_test_padded[i])
    print("Predicted Urdu Output:", predicted_labels[i])
    print()

In [None]:
# Create reverse lookup dictionaries for English and Urdu vocabularies
reverse_english_vocab = {index: word for word, index in english_vocab.items()}
reverse_urdu_vocab = {index: word for word, index in urdu_vocab.items()}

# Print actual and predicted values for a few samples
for i in range(5):  # Adjust the range as needed
    print("English Input:", [reverse_english_vocab[index] for index in english_test_padded[i] if index != 0])
    print("Actual Urdu Output:", [reverse_urdu_vocab[index] for index in urdu_test_padded[i] if index != 0])
    print("Predicted Urdu Output:", [reverse_urdu_vocab[index] for index in predicted_labels[i] if index != 0])
    print()
