In [2]:
import pandas as pd

file_path = 'parallel-corpus.xlsx'
df = pd.read_excel(file_path)
df = df[['SENTENCES ', 'MEANING']]

print(df.head())


                                          SENTENCES   \
0             How can I communicate with my parents?   
1                           How can I make friends?’   
2                              Why do I get so sad?’   
3  If you’ve asked yourself such questions, you’r...   
4  Depending on where you’ve turned for guidance,...   

                                             MEANING  
0                 میں اپنے والدین سے کیسے بات کروں ؟  
1                             میں دوست کیسے بنائوں ؟  
2                           میں اتنا اداس کیوں ہوں؟.  
3  اگر آپ نے اپنے آپ سے ایسے سوالات کیے ہیں، تو آ...  
4   اس بات پر منحصر ہے کہ آپ رہنمائی کے لیے کہاں ...  


In [72]:
from sklearn.model_selection import train_test_split
X = df['SENTENCES '].values  # English sentences
y = df['MEANING'].values  # Urdu translations

# Split the data into training, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

print(f'Training samples: {len(X_train)}, Validation samples: {len(X_val)}, Test samples: {len(X_test)}')

Training samples: 24131, Validation samples: 3016, Test samples: 3017


In [77]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

#pre processing

#Converting enteries into string beacuse some sentence have float or int values
X_train = X_train.astype(str)
y_train = y_train.astype(str)
X_val = X_val.astype(str)
y_val = y_val.astype(str)
X_test = X_test.astype(str)
y_test = y_test.astype(str)

# Tokenizer
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")

# Fit the tokenizer on the English and Urdu tokenized data
tokenizer.fit_on_texts(X_train)  # English sentences
tokenizer.fit_on_texts(y_train)  # Urdu translations


# Convert the tokenized text into sequences of integers
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq = tokenizer.texts_to_sequences(X_val)
X_test_seq = tokenizer.texts_to_sequences(X_test)

y_train_seq = tokenizer.texts_to_sequences(y_train)
y_val_seq = tokenizer.texts_to_sequences(y_val)
y_test_seq = tokenizer.texts_to_sequences(y_test)

# Pad the sequences
max_seq_len = 50 

X_train_padded = pad_sequences(X_train_seq, maxlen=max_seq_len, padding='post')
X_val_padded = pad_sequences(X_val_seq, maxlen=max_seq_len, padding='post')
X_test_padded = pad_sequences(X_test_seq, maxlen=max_seq_len, padding='post')

y_train_padded = pad_sequences(y_train_seq, maxlen=max_seq_len, padding='post')
y_val_padded = pad_sequences(y_val_seq, maxlen=max_seq_len, padding='post')
y_test_padded = pad_sequences(y_test_seq, maxlen=max_seq_len, padding='post')



In [78]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, TimeDistributed, Dropout

# model hyperparameters
vocab_size = 10000 
embedding_dim = 128 
max_seq_len = 50     

model = Sequential()

# Embedding layer
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_seq_len))

# SimpleRNN layer
model.add(SimpleRNN(128, return_sequences=True))

# Dense layer for output with TimeDistributed wrapper
model.add(TimeDistributed(Dense(vocab_size, activation='softmax')))

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Print the model summary
model.summary()


In [79]:
history = model.fit(X_train_padded, y_train_padded, 
                    validation_data=(X_val_padded, y_val_padded), 
                    epochs=20, 
                    batch_size=64)

Epoch 1/20
[1m378/378[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m262s[0m 685ms/step - accuracy: 0.7035 - loss: 3.4728 - val_accuracy: 0.7359 - val_loss: 1.7984
Epoch 2/20
[1m378/378[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m249s[0m 658ms/step - accuracy: 0.7322 - loss: 1.8127 - val_accuracy: 0.7448 - val_loss: 1.6931
Epoch 3/20
[1m378/378[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m243s[0m 644ms/step - accuracy: 0.7400 - loss: 1.7265 - val_accuracy: 0.7516 - val_loss: 1.6250
Epoch 4/20
[1m378/378[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m256s[0m 676ms/step - accuracy: 0.7491 - loss: 1.6486 - val_accuracy: 0.7552 - val_loss: 1.5857
Epoch 5/20
[1m378/378[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m272s[0m 719ms/step - accuracy: 0.7521 - loss: 1.6029 - val_accuracy: 0.7549 - val_loss: 1.8523
Epoch 6/20
[1m378/378[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m247s[0m 654ms/step - accuracy: 0.7523 - loss: 1.6147 - val_accuracy: 0.7570 - val_loss: 1.5390
Epoc

In [82]:
import nltk
from nltk.translate.bleu_score import sentence_bleu

# Function to decode the tokenized predictions back to text using the tokenizer
def decode_sequence(sequence, tokenizer):
    reverse_word_map = {index: word for word, index in tokenizer.word_index.items()}
    decoded_sentence = ' '.join([reverse_word_map.get(i, '') for i in sequence if i != 0])
    return decoded_sentence

# Calculate BLEU score for the model on test data
bleu_scores = []

for i in range(len(X_test_padded)):
    # Get model predictions
    prediction = model.predict(X_test_padded[i].reshape(1, max_seq_len))
    
    # Decode the predicted sequence and true sequence
    predicted_sentence = decode_sequence(prediction[0].argmax(axis=-1), tokenizer)
    true_sentence = decode_sequence(y_test_padded[i], tokenizer)

    # Calculate BLEU score
    reference = [true_sentence.split()]  # List of references for BLEU
    candidate = predicted_sentence.split()
    bleu_score = sentence_bleu(reference, candidate, weights=(0.25, 0.25, 0.25, 0.25))  # BLEU-4 score
    bleu_scores.append(bleu_score)

# Print average BLEU score
average_bleu_score = sum(bleu_scores) / len(bleu_scores)
print(f'Average BLEU score: {average_bleu_score:.4f}')


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step


The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19

In [80]:
# Evaluate the model
test_loss, test_acc = model.evaluate(X_test_padded, y_test_padded)
print(f'Test Accuracy: {test_acc * 100:.2f}%')


[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 123ms/step - accuracy: 0.7646 - loss: 1.4381
Test Accuracy: 76.46%


In [83]:
# Function to print example translations
def print_examples(num_examples=5):
    for i in range(num_examples):
        input_seq = X_test_padded[i].reshape(1, max_seq_len)
        
        # Get the predicted translation
        prediction = model.predict(input_seq)
        predicted_sentence = decode_sequence(prediction[0].argmax(axis=-1), tokenizer)
        
        # Get the true translation
        true_sentence = decode_sequence(y_test_padded[i], tokenizer)
        
        print(f"Input Sentence: {decode_sequence(X_test_padded[i], tokenizer)}")
        print(f"True Translation: {true_sentence}")
        print(f"Predicted Translation: {predicted_sentence}")
        print("-" * 50)

# Print some example translations
print_examples(num_examples=5)



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
Input Sentence: ambiance was amazing and so the staff quality of food was also good but the rates are quit high their beef <OOV> burger was not up to the mark overall a nice place to visit
True Translation: ماحول حیرت انگیز تھا اور عملہ بھی۔ کھانے کا معیار بھی اچھا تھا لیکن قیمتیں زیادہ ہیں۔ ان کا بیف کیما برگر نشان تک نہیں تھا۔ مجموعی طور پر دیکھنے کے لیے ایک اچھی جگہ۔
Predicted Translation: ماحول اچھا انگیز اور اور اور عملہ معیار معیار معیار معیار بھی اچھا لیکن لیکن لیکن قیمتیں بہت زیادہ کے کی کی کا اور نہیں نہیں آرڈر ذائقہ نہیں مجموعی طور اچھا کے کے کے لیے کے لیے
--------------------------------------------------
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
Input Sentence: you have to promise never to tell anyone what i'm about to tell you
True Translation: تم نے وعدہ کرنا ہے کہ جو میں تمہیں بتانے لگا ہوں، وہ کسی اور کو کبھی نہیں <OOV> ہے۔
Predicted Translation: آپ نے کے نہیں نہیں نہیں

In [84]:
from tensorflow.keras.layers import LSTM
# Define model hyperparameters
vocab_size = 10000  # Adjust based on your tokenizer
embedding_dim = 128  # Dimension for embedding layer

model = Sequential()

# Embedding layer
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_seq_len))

# LSTM layer
model.add(LSTM(128, return_sequences=True))

# Dense layer for output with TimeDistributed wrapper
model.add(TimeDistributed(Dense(vocab_size, activation='softmax')))

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Print the model summary
model.summary()




In [85]:
history = model.fit(X_train_padded, y_train_padded, 
                    validation_data=(X_val_padded, y_val_padded), 
                    epochs=20, 
                    batch_size=64)

Epoch 1/20
[1m378/378[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m266s[0m 695ms/step - accuracy: 0.7121 - loss: 3.5237 - val_accuracy: 0.7331 - val_loss: 1.8119
Epoch 2/20
[1m378/378[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m271s[0m 717ms/step - accuracy: 0.7277 - loss: 1.8436 - val_accuracy: 0.7376 - val_loss: 1.7467
Epoch 3/20
[1m378/378[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m254s[0m 673ms/step - accuracy: 0.7366 - loss: 1.7536 - val_accuracy: 0.7395 - val_loss: 1.6806
Epoch 4/20
[1m378/378[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m996s[0m 3s/step - accuracy: 0.7364 - loss: 1.7176 - val_accuracy: 0.7449 - val_loss: 1.6354
Epoch 5/20
[1m378/378[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m325s[0m 857ms/step - accuracy: 0.7445 - loss: 1.6452 - val_accuracy: 0.7498 - val_loss: 1.5979
Epoch 6/20
[1m378/378[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m260s[0m 687ms/step - accuracy: 0.7462 - loss: 1.6155 - val_accuracy: 0.7544 - val_loss: 1.5629
Epoch 7

In [86]:
# Function to print example translations
def print_examples(num_examples=5):
    for i in range(num_examples):
        input_seq = X_test_padded[i].reshape(1, max_seq_len)
        
        # Get the predicted translation
        prediction = model.predict(input_seq)
        predicted_sentence = decode_sequence(prediction[0].argmax(axis=-1), tokenizer)
        
        # Get the true translation
        true_sentence = decode_sequence(y_test_padded[i], tokenizer)
        
        print(f"Input Sentence: {decode_sequence(X_test_padded[i], tokenizer)}")
        print(f"True Translation: {true_sentence}")
        print(f"Predicted Translation: {predicted_sentence}")
        print("-" * 50)

# Print some example translations
print_examples(num_examples=5)



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 602ms/step
Input Sentence: ambiance was amazing and so the staff quality of food was also good but the rates are quit high their beef <OOV> burger was not up to the mark overall a nice place to visit
True Translation: ماحول حیرت انگیز تھا اور عملہ بھی۔ کھانے کا معیار بھی اچھا تھا لیکن قیمتیں زیادہ ہیں۔ ان کا بیف کیما برگر نشان تک نہیں تھا۔ مجموعی طور پر دیکھنے کے لیے ایک اچھی جگہ۔
Predicted Translation: ماحول اچھا انگیز اور اور اور عملہ معیار معیار معیار معیار بھی تھا۔ لیکن لیکن قیمتیں قیمتیں ہیں۔ ہیں۔ <OOV> <OOV> <OOV> لاجواب نہیں نہیں نہیں نہیں تھی۔ مجموعی طور طور کے کے جگہ جگہ
--------------------------------------------------
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
Input Sentence: you have to promise never to tell anyone what i'm about to tell you
True Translation: تم نے وعدہ کرنا ہے کہ جو میں تمہیں بتانے لگا ہوں، وہ کسی اور کو کبھی نہیں <OOV> ہے۔
Predicted Translation: آپ نے کے کے نہیں نہ