In [1]:
# English-Urdu Machine Translation 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
import string
from string import digits
import matplotlib.pyplot as plt
%matplotlib inline
import re

import seaborn as sns
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from keras.layers import Input, LSTM, Embedding, Dense
from keras.models import Model

print(os.listdir("../input"))

# pd.set_option('display.max_rows', 500)
# pd.set_option('display.max_columns', 500)
# pd.set_option('display.width', 1000)




Using TensorFlow backend.


['new-dataset-tanzil-with-748k', 'eng-urdu-model', 'finaldataset', 'merged', 'general-eng-ur']


In [2]:
lines=pd.read_csv("/kaggle/input/new-dataset-tanzil-with-748k/english_urdu_748k.csv")

In [3]:
lines.head(20)

Unnamed: 0,eng,urdu
0,"In the name of Allah, most benevolent, ever-me...",شروع الله کا نام لے کر جو بڑا مہربان نہایت رحم...
1,"ALL PRAISE BE to Allah, Lord of all the worlds,\n",سب تعریفیں الله کے لیے ہیں جو سب جہانوں کا پال...
2,"Most beneficent, ever-merciful,\n",بڑا مہربان نہایت رحم والا\n
3,King of the Day of Judgement.\n,جزا کے دن کا مالک\n
4,"You alone we worship, and to You alone turn fo...",ہم تیری ہی عبادت کرتے ہیں اور تجھ ہی سے مدد ما...
5,"Guide us (O Lord) to the path that is straight,\n",ہمیں سیدھا راستہ دکھا\n
6,"The path of those You have blessed, Not of tho...",ان لوگوں کا راستہ جن پر تو نے انعام کیا نہ جن ...
7,ALIF LAM MIM.\n,المۤ\n
8,"This is The Book free of doubt and involution,...",یہ وہ کتاب ہے جس میں کوئی بھی شک نہیں پرہیز گا...
9,Who believe in the Unknown and fulfil their de...,جو بن دیکھے ایمان لاتے ہیں اور نماز قائم کرتے ...


In [4]:
lines.rename(columns={'eng': 'english_sentence', 'urdu': 'urdu_sentence'}, inplace=True)

In [5]:
pd.isnull(lines).sum()

english_sentence    0
urdu_sentence       0
dtype: int64

In [6]:
lines=lines[~pd.isnull(lines['english_sentence'])]

In [7]:
#lines.drop_duplicates(inplace=True)
lines.dropna(inplace=True)

* ### Let us pick any 25000 rows from the dataset.

In [8]:
lines=lines.sample(n=250000,random_state=42)
lines.shape

(250000, 2)

In [9]:
lines.head()

Unnamed: 0,english_sentence,urdu_sentence
12245,No. In fact you are not generous to the orphan...,نہیں بلکہ تم لوگ یتیم کی خاطر نہیں کرتے\n
18958,"Fight in the way of God, and remember that God...",اللہ تعالیٰ کی راه میں جہاد کرو اور جان لو کہ ...
163751,And they sold him for a mean price: a few dirh...,اور ان لوگوں نے یوسف کو معمولی قیمت پر بیچ ڈال...
516292,So which of the favors of your Lord would you ...,تو پھر تم اپنے رب کی کس کس نعمت کو جھٹلاؤ گے\n
442620,and by this secure town:\n,اور پُرامن شہر (مکہ) کی۔\n


In [10]:
# Lowercase all characters
lines['english_sentence']=lines['english_sentence'].apply(lambda x: x.lower())
lines['urdu_sentence']=lines['urdu_sentence'].apply(lambda x: x.lower())

In [11]:
# Remove quotes
lines['english_sentence']=lines['english_sentence'].apply(lambda x: re.sub("'", '', x))
lines['urdu_sentence']=lines['urdu_sentence'].apply(lambda x: re.sub("'", '', x))

In [12]:
exclude = set(string.punctuation) # Set of all special characters
# Remove all the special characters
lines['english_sentence']=lines['english_sentence'].apply(lambda x: ''.join(ch for ch in x if ch not in exclude))
lines['urdu_sentence']=lines['urdu_sentence'].apply(lambda x: ''.join(ch for ch in x if ch not in exclude))

In [13]:
# Remove all numbers from text
remove_digits = str.maketrans('', '', digits)
lines['english_sentence']=lines['english_sentence'].apply(lambda x: x.translate(remove_digits))
lines['urdu_sentence']=lines['urdu_sentence'].apply(lambda x: x.translate(remove_digits))

lines['urdu_sentence'] = lines['urdu_sentence'].apply(lambda x: re.sub("[२३०८१५७९४६]", "", x))

# Remove extra spaces
lines['english_sentence']=lines['english_sentence'].apply(lambda x: x.strip())
lines['urdu_sentence']=lines['urdu_sentence'].apply(lambda x: x.strip())
lines['english_sentence']=lines['english_sentence'].apply(lambda x: re.sub(" +", " ", x))
lines['urdu_sentence']=lines['urdu_sentence'].apply(lambda x: re.sub(" +", " ", x))


In [14]:
# Add start and end tokens to target sequences
lines['urdu_sentence'] = lines['urdu_sentence'].apply(lambda x : 'START_ '+ x + ' _END')

In [15]:
lines.head()

Unnamed: 0,english_sentence,urdu_sentence
12245,no in fact you are not generous to the orphans,START_ نہیں بلکہ تم لوگ یتیم کی خاطر نہیں کرتے...
18958,fight in the way of god and remember that god ...,START_ اللہ تعالیٰ کی راه میں جہاد کرو اور جان...
163751,and they sold him for a mean price a few dirha...,START_ اور ان لوگوں نے یوسف کو معمولی قیمت پر ...
516292,so which of the favors of your lord would you ...,START_ تو پھر تم اپنے رب کی کس کس نعمت کو جھٹل...
442620,and by this secure town,START_ اور پُرامن شہر مکہ کی۔ _END


In [16]:
### Get English and Urdu Vocabulary
all_eng_words=set()
for eng in lines['english_sentence']:
    for word in eng.split():
        if word not in all_eng_words:
            all_eng_words.add(word)

all_urdu_words=set()
for hin in lines['urdu_sentence']:
    for word in hin.split():
        if word not in all_urdu_words:
            all_urdu_words.add(word)

In [17]:
len(all_eng_words)

21943

In [18]:
len(all_urdu_words)

22981

In [19]:
lines['length_eng_sentence']=lines['english_sentence'].apply(lambda x:len(x.split(" ")))
lines['length_urdu_sentence']=lines['urdu_sentence'].apply(lambda x:len(x.split(" ")))

In [20]:
lines.head()

Unnamed: 0,english_sentence,urdu_sentence,length_eng_sentence,length_urdu_sentence
12245,no in fact you are not generous to the orphans,START_ نہیں بلکہ تم لوگ یتیم کی خاطر نہیں کرتے...,10,11
18958,fight in the way of god and remember that god ...,START_ اللہ تعالیٰ کی راه میں جہاد کرو اور جان...,15,18
163751,and they sold him for a mean price a few dirha...,START_ اور ان لوگوں نے یوسف کو معمولی قیمت پر ...,22,26
516292,so which of the favors of your lord would you ...,START_ تو پھر تم اپنے رب کی کس کس نعمت کو جھٹل...,11,14
442620,and by this secure town,START_ اور پُرامن شہر مکہ کی۔ _END,5,7


In [21]:
lines[lines['length_eng_sentence']>30].shape

(72985, 4)

In [22]:
lines=lines[lines['length_eng_sentence']<=20]
lines=lines[lines['length_urdu_sentence']<=20]

In [23]:
lines.shape

(87341, 4)

In [24]:
print("maximum length of Urdu Sentence ",max(lines['length_urdu_sentence']))
print("maximum length of English Sentence ",max(lines['length_eng_sentence']))

maximum length of Urdu Sentence  20
maximum length of English Sentence  20


In [25]:
max_length_src=max(lines['length_urdu_sentence'])
max_length_tar=max(lines['length_eng_sentence'])

In [26]:
input_words = sorted(list(all_eng_words))
target_words = sorted(list(all_urdu_words))
num_encoder_tokens = len(all_eng_words)
num_decoder_tokens = len(all_urdu_words)
num_encoder_tokens, num_decoder_tokens

(21943, 22981)

In [27]:
num_decoder_tokens += 1 #for zero padding


In [28]:
input_token_index = dict([(word, i+1) for i, word in enumerate(input_words)])
target_token_index = dict([(word, i+1) for i, word in enumerate(target_words)])

In [29]:
reverse_input_char_index = dict((i, word) for word, i in input_token_index.items())
reverse_target_char_index = dict((i, word) for word, i in target_token_index.items())

In [30]:
lines = shuffle(lines)
lines.head(10)

Unnamed: 0,english_sentence,urdu_sentence,length_eng_sentence,length_urdu_sentence
747130,see ye the water which ye drink,START_ بھلا یہ بتاؤ جو پانی تم پیتے ہو، _END,7,10
404094,who will reside in the gardens of bliss,START_ نعمت کے باغات ہوں گے _END,8,7
42259,and keep laughing and do not weep,START_ اور ہنستے ہو اور روتے نہیں۔ _END,7,8
347661,by the heaven with its numerous forms,START_ اور ستاروں اور سیّاروں کی کہکشاؤں اور گ...,7,14
679377,o man having striven hard towards your lord yo...,START_ اے آدمی بیشک تجھے اپنے رب کی طرف ضرور د...,12,17
203342,by those angels who bring the book and the qur...,START_ پھر ذکر الہیٰ کے تلاوت کرنے والوں کی _END,17,10
109245,and not a thing is there hidden in heaven and ...,START_ اور آسمانوں اور زمین میں کوئی پوشیدہ چی...,18,20
734307,as also in your own selves will ye not then see,START_ اور خود تمہارے اپنے وجود میں ہیں کیا تم...,11,14
253119,nor are the darkness and the light,START_ اور نہ اندھیرے اور نہ روشنی _END,7,8
690360,we have not wronged them it was they who were ...,START_ اور ہم نے ان پر ظلم نہیں کیا لیکن وہ خو...,12,19


### Split the data into train and test

In [31]:
X, y = lines['english_sentence'], lines['urdu_sentence']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2,random_state=42)
X_train.shape, X_test.shape

((69872,), (17469,))

### Let us save this data

In [32]:
X_train.to_pickle('X_train.pkl')
X_test.to_pickle('X_test.pkl')


In [33]:
def generate_batch(X = X_train, y = y_train, batch_size = 128):
    ''' Generate a batch of data '''
    while True:
        for j in range(0, len(X), batch_size):
            encoder_input_data = np.zeros((batch_size, max_length_src),dtype='float32')
            decoder_input_data = np.zeros((batch_size, max_length_tar),dtype='float32')
            decoder_target_data = np.zeros((batch_size, max_length_tar, num_decoder_tokens),dtype='float32')
            for i, (input_text, target_text) in enumerate(zip(X[j:j+batch_size], y[j:j+batch_size])):
                for t, word in enumerate(input_text.split()):
                    encoder_input_data[i, t] = input_token_index[word] # encoder input seq
                for t, word in enumerate(target_text.split()):
                    if t<len(target_text.split())-1:
                        decoder_input_data[i, t] = target_token_index[word] # decoder input seq
                    if t>0:
                        # decoder target sequence (one hot encoded)
                        # does not include the START_ token
                        # Offset by one timestep
                        decoder_target_data[i, t - 1, target_token_index[word]] = 1.
            yield([encoder_input_data, decoder_input_data], decoder_target_data)

### Encoder-Decoder Architecture

In [34]:
latent_dim=300

In [35]:
# Encoder
encoder_inputs = Input(shape=(None,))
enc_emb =  Embedding(num_encoder_tokens, latent_dim, mask_zero = True)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

Instructions for updating:
Colocations handled automatically by placer.


In [36]:
# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None,))
dec_emb_layer = Embedding(num_decoder_tokens, latent_dim, mask_zero = True)
dec_emb = dec_emb_layer(decoder_inputs)
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb,
                                     initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [37]:
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

In [38]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 300)    6582900     input_1[0][0]                    
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, None, 300)    6894600     input_2[0][0]                    
__________________________________________________________________________________________________
lstm_1 (LS

In [39]:
train_samples = len(X_train)
val_samples = len(X_test)
batch_size = 128
epochs = 10

In [40]:
model.fit_generator(generator = generate_batch(X_train, y_train, batch_size = batch_size),
                    steps_per_epoch = train_samples//batch_size,
                    epochs=epochs,
                    validation_data = generate_batch(X_test, y_test, batch_size = batch_size),
                    validation_steps = val_samples//batch_size)



Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

In [41]:
model.save_weights('nmt_weights.h5')

In [42]:
model.save("./model.h5")

  '. They will not be included '


In [43]:
# Encode the input sequence to get the "thought vectors"
encoder_model = Model(encoder_inputs, encoder_states)

# Decoder setup
# Below tensors will hold the states of the previous time step
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

dec_emb2= dec_emb_layer(decoder_inputs) # Get the embeddings of the decoder sequence

# To predict the next word in the sequence, set the initial states to the states from the previous time step
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=decoder_states_inputs)
decoder_states2 = [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_outputs2) # A dense softmax layer to generate prob dist. over the target vocabulary

# Final decoder model
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs2] + decoder_states2)


In [44]:
def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)
    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1,1))
    # Populate the first character of target sequence with the start character.
    target_seq[0, 0] = target_token_index['START_']

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += ' '+sampled_char

        # Exit condition: either hit max length
        # or find stop character.
        if (sampled_char == '_END' or
           len(decoded_sentence) > 50):
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1,1))
        target_seq[0, 0] = sampled_token_index

        # Update states
        states_value = [h, c]

    return decoded_sentence

In [45]:
train_gen = generate_batch(X_train, y_train, batch_size = 1)
k=-1


In [46]:
train_gen

<generator object generate_batch at 0x7f0c72dbed00>

In [47]:
k+=1
(input_seq, actual_output), _ = next(train_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input English sentence:', X_train[k:k+1].values[0])
print('Actual Urdu Translation:', y_train[k:k+1].values[0][6:-4])
print('Predicted URdu Translation:', decoded_sentence[:-4])

Input English sentence: then we destroyed the others
Actual Urdu Translation:  پھر ہم نے اوروں کو ہلاک کردیا 
Predicted URdu Translation:  پھر ہم نے دوسروں کو ہلاک کر دیا 


In [48]:
k+=1
(input_seq, actual_output), _ = next(train_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input English sentence:', X_train[k:k+1].values[0])
print('Actual Urdu Translation:', y_train[k:k+1].values[0][6:-4])
print('Predicted Urdu Translation:', decoded_sentence[:-4])

Input English sentence: you are not insane by the munificence of your lord
Actual Urdu Translation:  آپ الله کے فضل سے دیوانہ نہیں ہیں 
Predicted Urdu Translation:  آپ کے رب کی عبادت کرتے ہیں اس پر کہ تمہیں اس کا


In [49]:
k+=1
(input_seq, actual_output), _ = next(train_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input English sentence:', X_train[k:k+1].values[0])
print('Actual Urdu Translation:', y_train[k:k+1].values[0][6:-4])
print('Predicted Urdu Translation:', decoded_sentence[:-4])

Input English sentence: the lord of moses and aaron”
Actual Urdu Translation:  یعنی موسیٰ علیہ السلام اور ہارون کے رب پر 
Predicted Urdu Translation:  یعنی موسیٰ اور ہارون کے پروردگار پر اس کے رب پ


In [50]:
k+=1
(input_seq, actual_output), _ = next(train_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input English sentence:', X_train[k:k+1].values[0])
print('Actual Urdu Translation:', y_train[k:k+1].values[0][6:-4])
print('Predicted Urdu Translation:', decoded_sentence[:-4])

Input English sentence: it may be thou frettest thy soul with grief that they do not become believers
Actual Urdu Translation:  اے محمدؐ، شاید تم اس غم میں اپنی جان کھو دو گے کہ یہ لوگ ایمان نہیں لاتے 
Predicted Urdu Translation:  اے ایمان والو تم کیا بات کیوں نہیں حاصل کرتے اگ


In [51]:
k+=1
(input_seq, actual_output), _ = next(train_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input English sentence:', X_train[k:k+1].values[0])
print('Actual Urdu Translation:', y_train[k:k+1].values[0][6:-4])
print('Predicted Urdu Translation:', decoded_sentence[:-4])

Input English sentence: so we perverted you for we were perverse ourselves’
Actual Urdu Translation:  سو ہم نے تمہیں گمراہ کر دیا بے شک ہم خود گمراہ تھے، 
Predicted Urdu Translation:  تو ہم نے تمہیں گمراہ کیا کہ ہم خود گمراہ تھے، 
