English Data Cleaning

In [None]:
import pandas as pd
import re
import string
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import contractions
import emoji


#functions

def clean_text(text):
    if isinstance(text, str):  # Check if the input is a string
        text = text.lower()  # Lowercase
        text = re.sub(r'\d+', '', text)  # Remove numbers
        text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
        text = re.sub(r'\W', ' ', text)  # Remove special characters
        text = BeautifulSoup(text, "html.parser").get_text()  # Remove HTML tags
        return text
    return ''  # Return an empty string for non-string inputs

def tokenize_english(text):
    return word_tokenize(text)

def remove_stopwords(tokenized_list):
    return [word for word in tokenized_list if word not in stop_words]

def lemmatize_words(words):
    return [lemmatizer.lemmatize(word) for word in words]

def stem_words(words):
    return [stemmer.stem(word) for word in words]

#Reading File
file = pd.read_excel('parallel-corpus.xlsx', header = None,usecols=[0, 1], skiprows=1)

#Text Cleaning Punctuation etc

file[0] = file[0].apply(clean_text)  # Clean English text 

#contractions

file[0] = [''.join(doc) for doc in file[0]]
file[0] = file[0].apply(contractions.fix)

# handling emojis
file[0] = file[0].apply(emoji.demojize)


#Saving File
file.iloc[:, 0].to_frame(name='English').to_excel('English_Sentences.xlsx', index=False, header=True)


Urdu Data Cleaning

In [None]:
import pandas as pd
from LughaatNLP import NER_Urdu
from LughaatNLP import POS_urdu
from LughaatNLP import LughaatNLP


urdu_text_processing = LughaatNLP()


file = pd.read_excel('parallel-corpus.xlsx', header = None,usecols=[0, 1], skiprows=1)
duplicates = file[file.duplicated(keep=False)]

file[1] = file[1].apply(lambda x: urdu_text_processing.normalize(str(x)))
file[1] = file[1].apply(lambda x: urdu_text_processing.remove_english(str(x)))
file[1] = file[1].apply(lambda x: urdu_text_processing.remove_urls(str(x)))
file[1] = file[1].apply(lambda x: urdu_text_processing.remove_special_characters(x))




df = pd.read_excel('English_Sentences.xlsx')

df['Urdu'] = file.iloc[:, 1]

# Save the updated DataFrame back to the same file with both columns
df.to_excel('English_Urdu.xlsx', index=False, header=True)

RNN Model Training

In [None]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load the dataset
df = pd.read_excel('English_Urdu.xlsx')
df = df.astype(str)
# Split the data into input (English) and output (Urdu)
english_sentences = df['English'].tolist()
urdu_sentences = df['Urdu'].tolist()

# Create tokenizers for English and Urdu
english_tokenizer = Tokenizer()
urdu_tokenizer = Tokenizer()

# Fit the tokenizers to the data
english_tokenizer.fit_on_texts(english_sentences)
urdu_tokenizer.fit_on_texts(urdu_sentences)

# Convert text to sequences
english_sequences = english_tokenizer.texts_to_sequences(english_sentences)
urdu_sequences = urdu_tokenizer.texts_to_sequences(urdu_sentences)

# Pad the sequences
max_english_length = 40
max_urdu_length = 40
padded_english = pad_sequences(english_sequences, maxlen=max_english_length, padding='post')
padded_urdu = pad_sequences(urdu_sequences, maxlen=max_urdu_length, padding='post')

# Split the data into training, validation, and test sets
train_size = int(0.8 * len(padded_english))
val_size = int(0.1 * len(padded_english))

train_english = padded_english[:train_size]
train_urdu = padded_urdu[:train_size]
val_english = padded_english[train_size:train_size+val_size]
val_urdu = padded_urdu[train_size:train_size+val_size]
test_english = padded_english[train_size+val_size:]
test_urdu = padded_urdu[train_size+val_size:]


import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, TimeDistributed, Bidirectional

# Define the RNN-based model

rnn_model = Sequential([
    Embedding(input_dim=len(english_tokenizer.word_index)+1, output_dim=128),
    Bidirectional(SimpleRNN(128, return_sequences=True, activation='tanh')),
    TimeDistributed(Dense(len(urdu_tokenizer.word_index)+1, activation='softmax'))
])
# Compile the model
rnn_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])


# Train the model
from tensorflow.keras.callbacks import ModelCheckpoint

checkpoint = ModelCheckpoint('english_to_urdu_rnn_model.keras', 
                             monitor='val_accuracy', 
                             mode='max', 
                             save_best_only=True, 
                             verbose=1)

# Train the model

rnn_model.fit(train_english, train_urdu, epochs=25, batch_size=64, 
              validation_data=(val_english, val_urdu), 
              callbacks=[checkpoint], 
              verbose=1)

from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.tokenize import word_tokenize


from nltk.tokenize import word_tokenize
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction


def calculate_bleu(y_true, y_pred):
    """
    Calculate BLEU score for machine translation model.

    Args:
    y_true (numpy array): True sentences.
    y_pred (numpy array): Predicted sentences.

    Returns:
    float: Mean BLEU score.
    """
    smoothing_func = SmoothingFunction()
    bleu_scores = []

    for i in range(len(y_true)):
        # Ignore padding tokens and handle unknown words
        true_sentence = []
        for word in y_true[i]:
            if word != 0:  # ignore padding tokens
                word = urdu_tokenizer.index_word.get(word, '')
                if word:  # ignore empty strings (unknown words)
                    true_sentence.append(word)

        # Get indices of top words in y_pred
        pred_indices = np.argmax(y_pred[i], axis=1)
        
        # Get words from indices
        pred_sentence = [urdu_tokenizer.index_word.get(word, '') for word in pred_indices]
        pred_sentence = [word for word in pred_sentence if word]  # ignore unknown words

        # Tokenize sentences
        true_sentence = word_tokenize(' '.join(true_sentence))
        pred_sentence = word_tokenize(' '.join(pred_sentence))

        # Calculate BLEU score with smoothing
        score = sentence_bleu([true_sentence], pred_sentence, smoothing_function=smoothing_func.method4)
        bleu_scores.append(score)

    return np.mean(bleu_scores)# Evaluate the model
test_pred = rnn_model.predict(test_english)
test_bleu = calculate_bleu(test_urdu, test_pred)
print(f'Test BLEU score: {test_bleu:.4f}')





def translate(sentence):
    sequence = english_tokenizer.texts_to_sequences([sentence])
    padded_sequence = pad_sequences(sequence, maxlen=max_english_length, padding='post')
    prediction = rnn_model.predict(padded_sequence)
    translation = []
    for word_prob in prediction[0]:
        predicted_index = np.argmax(word_prob)
        word = urdu_tokenizer.index_word.get(predicted_index, 'UNK')
        translation.append(word)
    return ' '.join(translation)


test_sentences = [
    "Hello, how are you?",
    "What is your name?",
    "I am from Pakistan.",
    "How old are you?",
    "What do you do?",
    "I love reading books.",
    "Where do you live?",
    "What is your favorite food?",
    "I am learning Urdu.",
    "Goodbye, take care."
]


for sentence in test_sentences:
    predicted_translation = translate(sentence)
    print(f"Input: {sentence}")
    print(f"Predicted Translation: {predicted_translation}")
    print()

RNN Model Testing

In [2]:
import tensorflow as tf
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import pandas as pd
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.tokenize import word_tokenize

from tensorflow.keras.layers import Embedding


model = load_model('english_to_urdu_rnn_model.keras')
df = pd.read_excel('English_Urdu.xlsx')

# Load data
df = df.astype(str)

# Create tokenizers
english_tokenizer = Tokenizer()
urdu_tokenizer = Tokenizer()

# Fit tokenizers to data
english_tokenizer.fit_on_texts(df['English'])
urdu_tokenizer.fit_on_texts(df['Urdu'])

# Get vocabulary
english_word_index = english_tokenizer.word_index
urdu_word_index = urdu_tokenizer.word_index
urdu_index_word = {v: k for k, v in urdu_word_index.items()}

# Define max sequence length
max_english_length = 40
max_urdu_length = 40

def translate(sentence):
    sequence = english_tokenizer.texts_to_sequences([sentence])
    padded_sequence = pad_sequences(sequence, maxlen=max_english_length, padding='post')
    prediction = model.predict(padded_sequence)
    translation = []
    for word_prob in prediction[0]:
        predicted_index = np.argmax(word_prob)
        word = urdu_index_word.get(predicted_index, 'UNK')
        translation.append(word)
    return ' '.join(translation)

# Calculate BLEU score
smooth = SmoothingFunction()

# Test sentences with corresponding reference translations
test_data = df[['English', 'Urdu']].values

# Define the number of sentences to process
num_sentences = 10

# Calculate BLEU score for each sentence
bleu_scores = []
for i, (english_sentence, urdu_reference) in enumerate(test_data):
    if i >= num_sentences:
        break

    predicted_translation = translate(english_sentence)

    predicted_translation_tokens = word_tokenize(predicted_translation)
    reference_translation_tokens = word_tokenize(urdu_reference)

    bleu_score = sentence_bleu([reference_translation_tokens], predicted_translation_tokens, smoothing_function=smooth.method4)

    bleu_scores.append(bleu_score)

    print(f"Input: {english_sentence}")
    print(f"Predicted Translation: {predicted_translation}")
    print(f"Reference Translation: {urdu_reference}")
    print(f"BLEU Score: {bleu_score:.4f}")
    print()

# Calculate overall BLEU score
overall_bleu_score = sum(bleu_scores) / len(bleu_scores)
print(f"Overall BLEU Score: {overall_bleu_score:.4f}")

# Print BLEU scores for each sentence
print("BLEU Scores for each sentence:")
for i, bleu_score in enumerate(bleu_scores):
    print(f"Sentence {i+1}: {bleu_score:.4f}")

# Additional test sentences
additional_test_sentences = [
    "How are you today?",
    "What is your favorite food?",
    "I love reading books.",
    "Where do you live?",
    "What do you do?",
    "I am learning Urdu.",
    "Goodbye, take care.",
    "Hello, how can I help you?",
    "I am from Pakistan.",
    "What is your name?"
]

print("\nAdditional Test Sentences:")
for sentence in additional_test_sentences:
    predicted_translation = translate(sentence)
    print(f"Input: {sentence}")
    print(f"Predicted Translation: {predicted_translation}")
    print()

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step
Input: how can i communicate with my parents
Predicted Translation: میں اپنے والدین سے کیسے بات کہا UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK
Reference Translation: میں اپنے والدین سے کیسے بات کروں
BLEU Score: 0.1132

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
Input: how can i make friends 
Predicted Translation: میں اس کیسے بنائوں سکتا UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK
Reference Translation: میں دوست کیسے بنائوں
BLEU Score: 0.0175

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step
Input: why do i get so sad 
Predicted Translation: میں اتنا اداس کیوں UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UN

LSTM Model Training

In [None]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import numpy as np
import nltk
nltk.download('punkt')
# Download and load the parallel-corpus.xlsx dataset
df = pd.read_excel('English_Urdu.xlsx')
df = df.astype(str)

# Preprocess data
english_texts = df['English'].tolist()
urdu_texts = df['Urdu'].tolist()

english_tokenizer = Tokenizer()
urdu_tokenizer = Tokenizer()

english_tokenizer.fit_on_texts(english_texts)
urdu_tokenizer.fit_on_texts(urdu_texts)

english_vocab_size = len(english_tokenizer.word_index) + 1
urdu_vocab_size = len(urdu_tokenizer.word_index) + 1

max_length = 40

english_sequences = english_tokenizer.texts_to_sequences(english_texts)
urdu_sequences = urdu_tokenizer.texts_to_sequences(urdu_texts)

english_padded = pad_sequences(english_sequences, maxlen=max_length, padding='post')
urdu_padded = pad_sequences(urdu_sequences, maxlen=max_length, padding='post')

# Split data into training, validation, and testing sets
train_size = int(0.8 * len(english_padded))
val_size = int(0.1 * len(english_padded))
test_size = len(english_padded) - train_size - val_size

train_english, val_english, test_english = english_padded[:train_size], english_padded[train_size:train_size+val_size], english_padded[train_size+val_size:]
train_urdu, val_urdu, test_urdu = urdu_padded[:train_size], urdu_padded[train_size:train_size+val_size], urdu_padded[train_size+val_size:]

# Create dataset and data loader
train_dataset = tf.data.Dataset.from_tensor_slices((train_english, train_urdu))
val_dataset = tf.data.Dataset.from_tensor_slices((val_english, val_urdu))
test_dataset = tf.data.Dataset.from_tensor_slices((test_english, test_urdu))

batch_size = 64
train_dataset = train_dataset.shuffle(100).batch(batch_size)
val_dataset = val_dataset.batch(batch_size)
test_dataset = test_dataset.batch(batch_size)


from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Masking, TimeDistributed, Bidirectional

# Define LSTM model
model_lstm = Sequential([
    Embedding(input_dim=english_vocab_size, output_dim=256, input_length=max_length),
    Masking(mask_value=0),
    Bidirectional(LSTM(units=512, return_sequences=True)),
    TimeDistributed(Dense(urdu_vocab_size, activation='softmax'))
])

model_lstm.compile(optimizer='adam', 
                    loss='sparse_categorical_crossentropy', 
                    metrics=['accuracy'])

# Train LSTM model
history = model_lstm.fit(train_dataset, epochs=50, validation_data=val_dataset)
# Save LSTM model
model_lstm.save('lstm_translator.h5')





from nltk.translate.bleu_score import sentence_bleu
from nltk.tokenize import word_tokenize
import numpy as np

from nltk.translate.bleu_score import SmoothingFunction

smoothing_func = SmoothingFunction()

def calculate_bleu(reference, prediction):
    reference = word_tokenize(reference)
    prediction = word_tokenize(prediction)
    return sentence_bleu([reference], prediction, smoothing_function=smoothing_func.method4)

# Evaluate LSTM model
max_test_size = 20 # Limit test size
bleu_scores_lstm = []
for i in range(min(max_test_size, len(test_english))):
    reference = urdu_tokenizer.sequences_to_texts([test_urdu[i]])[0]
    prediction = model_lstm.predict(test_english[i:i+1])
    predicted_sequence = np.argmax(prediction, axis=2)[0]
    prediction = urdu_tokenizer.sequences_to_texts([predicted_sequence])[0]
    bleu_scores_lstm.append(calculate_bleu(reference, prediction))

print(f'Test Average BLEU Score (LSTM): {sum(bleu_scores_lstm) / len(bleu_scores_lstm):.4f}')


# Evaluate LSTM model on original corpus (10 sentences)
print("Evaluation on Original Corpus")
evaluation_size = 10
corpus_english = english_texts[:evaluation_size]
corpus_urdu = urdu_texts[:evaluation_size]

corpus_english_sequences = english_tokenizer.texts_to_sequences(corpus_english)
corpus_urdu_sequences = urdu_tokenizer.texts_to_sequences(corpus_urdu)

corpus_english_padded = pad_sequences(corpus_english_sequences, maxlen=max_length, padding='post')
corpus_urdu_padded = pad_sequences(corpus_urdu_sequences, maxlen=max_length, padding='post')

predictions_lstm = model_lstm.predict(corpus_english_padded)

for i in range(min(evaluation_size, len(corpus_english))):
    reference = corpus_urdu[i]
    predicted_sequence = np.argmax(predictions_lstm[i], axis=1)
    prediction = urdu_tokenizer.sequences_to_texts([predicted_sequence])[0]
    print(f'English: {corpus_english[i]}')
    print(f'Reference Urdu: {reference}')
    print(f'Predicted Urdu (LSTM): {prediction}')
    print(f'BLEU Score: {calculate_bleu(reference, prediction):.4f}')
    print('---')

# Add a termination point
print("Evaluation completed.")

LSTM Model Testing

In [1]:
import tensorflow as tf
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import pandas as pd
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.tokenize import word_tokenize


# Load data
df = pd.read_excel('English_Urdu.xlsx')
df = df.astype(str)


# Create tokenizers
english_tokenizer = Tokenizer()
urdu_tokenizer = Tokenizer()


# Fit tokenizers to data
english_tokenizer.fit_on_texts(df['English'])
urdu_tokenizer.fit_on_texts(df['Urdu'])


# Get vocabulary
english_word_index = english_tokenizer.word_index
urdu_word_index = urdu_tokenizer.word_index
urdu_index_word = {v: k for k, v in urdu_word_index.items()}


# Define max sequence length
max_english_length = 40
max_urdu_length = 40


# Load pre-trained LSTM model
model = load_model('lstm_original.h5')


# Define translation function
def translate(sentence):
    sequence = english_tokenizer.texts_to_sequences([sentence])
    padded_sequence = pad_sequences(sequence, maxlen=max_english_length, padding='post')
    prediction = model.predict(padded_sequence)
    translation = []
    for word_prob in prediction[0]:
        predicted_index = np.argmax(word_prob)
        word = urdu_index_word.get(predicted_index, 'UNK')
        translation.append(word)
    return ' '.join(translation)


# Calculate BLEU score
smooth = SmoothingFunction()


# Test sentences with corresponding reference translations
test_data = df[['English', 'Urdu']].values


# Define the number of sentences to process
num_sentences = 10


# Calculate BLEU score for each sentence
bleu_scores = []
for i, (english_sentence, urdu_reference) in enumerate(test_data):
    if i >= num_sentences:
        break

    predicted_translation = translate(english_sentence)

    predicted_translation_tokens = word_tokenize(predicted_translation)
    reference_translation_tokens = word_tokenize(urdu_reference)

    bleu_score = sentence_bleu([reference_translation_tokens], predicted_translation_tokens, smoothing_function=smooth.method4)

    bleu_scores.append(bleu_score)

    print(f"Input: {english_sentence}")
    print(f"Predicted Translation: {predicted_translation}")
    print(f"Reference Translation: {urdu_reference}")
    print(f"BLEU Score: {bleu_score:.4f}")
    print()


# Calculate overall BLEU score
overall_bleu_score = sum(bleu_scores) / len(bleu_scores)
print(f"Overall BLEU Score: {overall_bleu_score:.4f}")


# Print BLEU scores for each sentence
print("BLEU Scores for each sentence:")
for i, bleu_score in enumerate(bleu_scores):
    print(f"Sentence {i+1}: {bleu_score:.4f}")


# Additional test sentences
additional_test_sentences = [
    "How are you today?",
    "What is your favorite food?",
    "I love reading books.",
    "Where do you live?",
    "What do you do?",
    "I am learning Urdu.",
    "Goodbye, take care.",
    "Hello, how can I help you?",
    "I am from Pakistan.",
    "What is your name?"
]

print("\nAdditional Test Sentences:")
for sentence in additional_test_sentences:
    predicted_translation = translate(sentence)
    print(f"Input: {sentence}")
    print(f"Predicted Translation: {predicted_translation}")
    print()



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 987ms/step
Input: how can i communicate with my parents
Predicted Translation: میں اپنے والدین سے کیسے بات کروں UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK
Reference Translation: میں اپنے والدین سے کیسے بات کروں
BLEU Score: 0.1399

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step
Input: how can i make friends 
Predicted Translation: میں میں کیسے بنائوں UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK
Reference Translation: میں دوست کیسے بنائوں
BLEU Score: 0.0175

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step
Input: why do i get so sad 
Predicted Translation: کیوں اتنا اداس کیوں ہے UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UN