In [1]:
import pandas as pd
import numpy as np
import string
from string import digits
import re
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from keras.layers import Input, LSTM, Embedding, Dense
from keras.models import Model
import warnings
warnings.filterwarnings('ignore')

In [2]:
lines= pd.read_table('deu-eng/deu.txt', names=['eng', 'de'], index_col=False)

In [3]:
lines

Unnamed: 0,eng,de
0,Go.,Geh.
1,Hi.,Hallo!
2,Hi.,Grüß Gott!
3,Run!,Lauf!
4,Run.,Lauf!
...,...,...
224415,Even if some sentences by non-native speakers ...,Auch wenn Sätze von Nichtmuttersprachlern mitu...
224416,If someone who doesn't know your background sa...,"Wenn jemand, der deine Herkunft nicht kennt, s..."
224417,If someone who doesn't know your background sa...,"Wenn jemand Fremdes dir sagt, dass du dich wie..."
224418,If someone who doesn't know your background sa...,"Wenn jemand, der nicht weiß, woher man kommt, ..."


In [4]:
lines = lines[:10000]

In [5]:
# Lowering all the charecters
lines.eng = lines.eng.apply(lambda x:x.lower())
lines.deu = lines.de.apply(lambda x:x.lower())

In [6]:
lines

Unnamed: 0,eng,de
0,go.,Geh.
1,hi.,Hallo!
2,hi.,Grüß Gott!
3,run!,Lauf!
4,run.,Lauf!
...,...,...
9995,tom smelled it.,Tom hat das gerochen.
9996,tom smells bad.,Tom riecht schlecht.
9997,tom sounds mad.,Tom klingt verrückt.
9998,tom sounds sad.,Tom klingt traurig.


In [7]:
#Remove Quotes
lines.eng = lines.eng.apply(lambda x: re.sub("'",'',x))
lines.de = lines.de.apply(lambda x: re.sub("'",'',x))

In [8]:
exclude = set(string.punctuation) #Set of all the special charecters
# Removing all the string
lines.eng = lines.eng.apply(lambda x:''.join(ch for ch in x if ch not in exclude))
lines.de = lines.de.apply(lambda x:''.join(ch for ch in x if ch not in exclude))

In [9]:
lines

Unnamed: 0,eng,de
0,go,Geh
1,hi,Hallo
2,hi,Grüß Gott
3,run,Lauf
4,run,Lauf
...,...,...
9995,tom smelled it,Tom hat das gerochen
9996,tom smells bad,Tom riecht schlecht
9997,tom sounds mad,Tom klingt verrückt
9998,tom sounds sad,Tom klingt traurig


In [10]:
# Removing all the digits from the text
remove_degits = str.maketrans('','',digits)
lines.eng = lines.eng.apply(lambda x: x.translate(remove_degits))
lines.de = lines.de.apply(lambda x: x.translate(remove_degits))

In [11]:
lines

Unnamed: 0,eng,de
0,go,Geh
1,hi,Hallo
2,hi,Grüß Gott
3,run,Lauf
4,run,Lauf
...,...,...
9995,tom smelled it,Tom hat das gerochen
9996,tom smells bad,Tom riecht schlecht
9997,tom sounds mad,Tom klingt verrückt
9998,tom sounds sad,Tom klingt traurig


In [12]:
# Removing the extra spaces
lines.eng = lines.eng.apply(lambda x:x.strip())
lines.de = lines.de.apply(lambda x:x.strip())
lines.eng = lines.eng.apply(lambda x: re.sub(' +',' ',x))
lines.de = lines.de.apply(lambda x: re.sub(' +',' ',x))

In [13]:
lines

Unnamed: 0,eng,de
0,go,Geh
1,hi,Hallo
2,hi,Grüß Gott
3,run,Lauf
4,run,Lauf
...,...,...
9995,tom smelled it,Tom hat das gerochen
9996,tom smells bad,Tom riecht schlecht
9997,tom sounds mad,Tom klingt verrückt
9998,tom sounds sad,Tom klingt traurig


In [14]:
# Add start and end tokens to target sequences
lines.de = lines.de.apply(lambda x : 'START_ '+ x + ' _END')

In [15]:
lines

Unnamed: 0,eng,de
0,go,START_ Geh _END
1,hi,START_ Hallo _END
2,hi,START_ Grüß Gott _END
3,run,START_ Lauf _END
4,run,START_ Lauf _END
...,...,...
9995,tom smelled it,START_ Tom hat das gerochen _END
9996,tom smells bad,START_ Tom riecht schlecht _END
9997,tom sounds mad,START_ Tom klingt verrückt _END
9998,tom sounds sad,START_ Tom klingt traurig _END


In [16]:
# Vocabulary of English
all_eng_words = set()
for sentence in lines.eng:
    for word in sentence.split():
        if word not in all_eng_words:
            all_eng_words.add(word)
# Vocabulary of Deutsch
all_de_words = set()
for sentence in lines.de:
    for word in sentence.split():
        if word not in all_de_words:
            all_de_words.add(word)

In [17]:
# Maximum length of source Sentence
length_list = []
for sentence in lines.eng:
    length_list.append(len(sentence.split(' ')))
max_length_src = np.max(length_list)
max_length_src

5

In [18]:
# Maximum length of target Sentence
length_list = []
for sentence in lines.de:
    length_list.append(len(sentence.split(' ')))
max_length_target = np.max(length_list)
max_length_target

11

In [19]:
input_words = sorted(list(all_eng_words))
target_words = sorted(list(all_de_words))
num_encoder_tokens = len(all_eng_words)
num_decoder_tokens = len(all_de_words)
num_encoder_tokens, num_decoder_tokens

(2248, 4072)

In [20]:
num_decoder_tokens += 1 # For zero padding
num_decoder_tokens

4073

In [21]:
input_token_index = dict([(word, i+1) for i, word in enumerate(input_words)])
target_token_index = dict([(word, i+1) for i, word in enumerate(target_words)])

In [22]:
reverse_input_char_index = dict((i, word) for word, i in input_token_index.items())
reverse_target_char_index = dict((i, word) for word, i in target_token_index.items())

In [23]:
lines = shuffle(lines)
lines.head(10)

Unnamed: 0,eng,de
3533,give it back,START_ Geben Sie es zurück _END
4192,let tom stay,START_ Lassen Sie Tom bleiben _END
7588,do it yourself,START_ Mach es selber _END
9581,tom got dumped,START_ Tom wurde fallen gelassen _END
2062,can we sing,START_ Können wir singen _END
1742,taste this,START_ Schmeck das _END
261,open it,START_ Öffne es _END
6466,start the car,START_ Lasse den Wagen an _END
7163,were humming,START_ Wir summen gerade _END
8156,i have sisters,START_ Ich habe Schwestern _END


In [24]:
# Train - Test Split in the ratio of 9:1
X, y = lines.eng, lines.de
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1)
X_train.shape, X_test.shape

((9000,), (1000,))

Save the train and test dataframes for reproducing the results later, as they are shuffled.

In [25]:
X_train.to_pickle('deu-eng/X_train.pkl')
X_test.to_pickle('deu-eng/X_test.pkl')

In [26]:
#Generating the batch of data
def generate_batch(X = X_train, y = y_train, batch_size = 128):
    while True:
        for j in range(0 , len(X), batch_size):
            encoder_input_data = np.zeros((batch_size, max_length_src), dtype = 'float32')
            decoder_input_data = np.zeros((batch_size, max_length_target), dtype = 'float32')
            decoder_target_data = np.zeros((batch_size, max_length_target, num_decoder_tokens), dtype = 'float32')
            for i, (input_text, target_text) in enumerate (zip(X[j:j+batch_size], y[j:j+batch_size])):
                for t,word in enumerate (input_text.split()):
                    encoder_input_data[i,t] = input_token_index[word] # encoder input seq
                for t, word in enumerate (target_text.split()):
                    if t<len(target_text.split())-1:
                        decoder_input_data[i,t] = target_token_index[word] # decoder input seq
                    if t>0:
                        # decoder target sequence (one hot encoded)
                        # does not include the START_ token
                        # Offset by one timestep
                        decoder_target_data[i, t - 1, target_token_index[word]] = 1.
            yield([encoder_input_data, decoder_input_data], decoder_target_data)

# Encoder - Decoder Model Architecture

In [27]:
latent_dim = 256

In [28]:
#Encoder
encoder_inputs = Input(shape = (None,))
encoder_embedded = Embedding(num_encoder_tokens+1, latent_dim, mask_zero=True)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state = True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedded)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

In [29]:
#Decoder
decoder_inputs = Input(shape=(None,))
decoder_embedded_layer = Embedding(num_decoder_tokens+1, latent_dim, mask_zero = True)
decoder_embedded = decoder_embedded_layer(decoder_inputs)
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = LSTM(latent_dim, return_sequences = True, return_state = True)
decoder_outputs, _,_ = decoder_lstm(decoder_embedded, initial_state = encoder_states)
decoder_dense = Dense(num_decoder_tokens,activation = 'softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs,decoder_inputs], decoder_outputs)

In [30]:
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])

In [31]:
train_samples = len(X_train)
val_samples = len(X_test)
batch_size = 128
epochs = 50

In [32]:
import time

In [None]:
%%time
model.fit_generator(generator = generate_batch(X_train, y_train, batch_size = batch_size),
                    steps_per_epoch = train_samples//batch_size,
                    epochs=epochs,
                    validation_data = generate_batch(X_test, y_test, batch_size = batch_size),
                    validation_steps = val_samples//batch_size)

# Saving the weights

In [None]:
model.save_weights('deu-eng/nmt_weights.h5')

# Load the weights

In [33]:
model.load_weights('deu-eng/nmt_weights.h5')

# Inference Setup

In [34]:
# Encode the input sequence to get the "thought vectors"
encoder_model = Model(encoder_inputs, encoder_states)

# Decoder setup
# Below tensors will hold the states of the previous time step
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

dec_emb2= decoder_embedded_layer(decoder_inputs) # Get the embeddings of the decoder sequence

# To predict the next word in the sequence, set the initial states to the states from the previous time step
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=decoder_states_inputs)
decoder_states2 = [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_outputs2) # A dense softmax layer to generate prob dist. over the target vocabulary

# Final decoder model
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs2] + decoder_states2)

# Decode sample sequences

In [35]:

def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)
    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1,1))
    # Populate the first character of target sequence with the start character.
    target_seq[0, 0] = target_token_index['START_']

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += ' '+sampled_char

        # Exit condition: either hit max length
        # or find stop character.
        if (sampled_char == '_END' or
           len(decoded_sentence) > 50):
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1,1))
        target_seq[0, 0] = sampled_token_index

        # Update states
        states_value = [h, c]

    return decoded_sentence

# Evaluation on Train Dataset

In [36]:
train_gen = generate_batch(X_train, y_train, batch_size = 1)
k=-1

In [37]:

k+=1
(input_seq, actual_output), _ = next(train_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input English sentence:', X_train[k:k+1].values[0])
print('Actual Marathi Translation:', y_train[k:k+1].values[0][6:-4])
print('Predicted Marathi Translation:', decoded_sentence[:-4])

Input English sentence: were you bit
Actual Marathi Translation:  Wurdet ihr gebissen 
Predicted Marathi Translation:  Wurdest du gebissen worden 


In [38]:
k+=1
(input_seq, actual_output), _ = next(train_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input English sentence:', X_train[k:k+1].values[0])
print('Actual Marathi Translation:', y_train[k:k+1].values[0][6:-4])
print('Predicted Marathi Translation:', decoded_sentence[:-4])

Input English sentence: keep climbing
Actual Marathi Translation:  Klettert weiter 
Predicted Marathi Translation:  Klettern Sie weiter 


In [39]:
k+=1
(input_seq, actual_output), _ = next(train_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input English sentence:', X_train[k:k+1].values[0])
print('Actual Marathi Translation:', y_train[k:k+1].values[0][6:-4])
print('Predicted Marathi Translation:', decoded_sentence[:-4])

Input English sentence: tom likes wine
Actual Marathi Translation:  Tom mag Wein 
Predicted Marathi Translation:  Tom mag Wein 


In [40]:
k+=1
(input_seq, actual_output), _ = next(train_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input English sentence:', X_train[k:k+1].values[0])
print('Actual Marathi Translation:', y_train[k:k+1].values[0][6:-4])
print('Predicted Marathi Translation:', decoded_sentence[:-4])

Input English sentence: he bit his lip
Actual Marathi Translation:  Er biss sich auf die Lippe 
Predicted Marathi Translation:  Er biss sich auf die Lippe 


In [41]:
k+=1
(input_seq, actual_output), _ = next(train_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input English sentence:', X_train[k:k+1].values[0])
print('Actual Marathi Translation:', y_train[k:k+1].values[0][6:-4])
print('Predicted Marathi Translation:', decoded_sentence[:-4])

Input English sentence: i see the boy
Actual Marathi Translation:  Ich sehe den Jungen 
Predicted Marathi Translation:  Ich sehe den Jungen 


# Evaluation on Validation Dataset

In [42]:
val_gen = generate_batch(X_test, y_test, batch_size = 1)
k=-1

In [43]:
k+=1
(input_seq, actual_output), _ = next(val_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input English sentence:', X_test[k:k+1].values[0])
print('Actual Marathi Translation:', y_test[k:k+1].values[0][6:-4])
print('Predicted Marathi Translation:', decoded_sentence[:-4])

Input English sentence: ill take tom
Actual Marathi Translation:  Ich kümmere mich um Tom 
Predicted Marathi Translation:  Ich nehme Tom 


In [44]:
k+=1
(input_seq, actual_output), _ = next(val_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input English sentence:', X_test[k:k+1].values[0])
print('Actual Marathi Translation:', y_test[k:k+1].values[0][6:-4])
print('Predicted Marathi Translation:', decoded_sentence[:-4])

Input English sentence: i let you down
Actual Marathi Translation:  Ich habe euch enttäuscht 
Predicted Marathi Translation:  Ich habe Sie enttäuscht 


In [45]:

k+=1
(input_seq, actual_output), _ = next(val_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input English sentence:', X_test[k:k+1].values[0])
print('Actual Marathi Translation:', y_test[k:k+1].values[0][6:-4])
print('Predicted Marathi Translation:', decoded_sentence[:-4])

Input English sentence: have courage
Actual Marathi Translation:  Nur Mut 
Predicted Marathi Translation:  Nur Mut 


In [46]:
k+=1
(input_seq, actual_output), _ = next(val_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input English sentence:', X_test[k:k+1].values[0])
print('Actual Marathi Translation:', y_test[k:k+1].values[0][6:-4])
print('Predicted Marathi Translation:', decoded_sentence[:-4])

Input English sentence: is that my mug
Actual Marathi Translation:  Ist das mein Becher 
Predicted Marathi Translation:  Ist das mein Becher 


### Giving the manual Input and translating the sentence

In [47]:
input_sentence = input('Enter the English sentence you want to convert into the German \n')

Enter the English sentence you want to convert into the German 
i love it


In [48]:
encoder_input_data = np.zeros((batch_size, max_length_src), dtype = 'float32')
#One hot representation
for t, word in enumerate (input_sentence.split()):
    encoder_input_data[0,t] = input_token_index[word] # encoder input seq

In [49]:
input_seq = encoder_input_data[0].reshape(1,encoder_input_data[0].shape[0])

In [50]:
decoded_sentence = decode_sequence(input_seq)
print("Input sentence:", input_sentence)
print("Decoded sentence:", decoded_sentence[:-4])

Input sentence: i love it
Decoded sentence:  Ich liebe es 
