In [34]:
import pandas as pd
import numpy as np
import string
from string import digits
import re
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from keras.layers import Input, LSTM, Embedding, Dense
from keras.models import Model

In [35]:
lines= pd.read_table('deu-eng/deu.txt', names=['eng', 'de'], index_col=False)

In [36]:
lines

Unnamed: 0,eng,de
0,Go.,Geh.
1,Hi.,Hallo!
2,Hi.,Grüß Gott!
3,Run!,Lauf!
4,Run.,Lauf!
...,...,...
224415,Even if some sentences by non-native speakers ...,Auch wenn Sätze von Nichtmuttersprachlern mitu...
224416,If someone who doesn't know your background sa...,"Wenn jemand, der deine Herkunft nicht kennt, s..."
224417,If someone who doesn't know your background sa...,"Wenn jemand Fremdes dir sagt, dass du dich wie..."
224418,If someone who doesn't know your background sa...,"Wenn jemand, der nicht weiß, woher man kommt, ..."


In [37]:
lines = lines[:10000]

In [38]:
# Lowering all the charecters
lines.eng = lines.eng.apply(lambda x:x.lower())
lines.deu = lines.de.apply(lambda x:x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
  lines.deu = lines.de.apply(lambda x:x.lower())


In [39]:
lines

Unnamed: 0,eng,de
0,go.,Geh.
1,hi.,Hallo!
2,hi.,Grüß Gott!
3,run!,Lauf!
4,run.,Lauf!
...,...,...
9995,tom smelled it.,Tom hat das gerochen.
9996,tom smells bad.,Tom riecht schlecht.
9997,tom sounds mad.,Tom klingt verrückt.
9998,tom sounds sad.,Tom klingt traurig.


In [40]:
#Remove Quotes
lines.eng = lines.eng.apply(lambda x: re.sub("'",'',x))
lines.de = lines.de.apply(lambda x: re.sub("'",'',x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [41]:
exclude = set(string.punctuation) #Set of all the special charecters
# Removing all the string
lines.eng = lines.eng.apply(lambda x:''.join(ch for ch in x if ch not in exclude))
lines.de = lines.de.apply(lambda x:''.join(ch for ch in x if ch not in exclude))

In [42]:
lines

Unnamed: 0,eng,de
0,go,Geh
1,hi,Hallo
2,hi,Grüß Gott
3,run,Lauf
4,run,Lauf
...,...,...
9995,tom smelled it,Tom hat das gerochen
9996,tom smells bad,Tom riecht schlecht
9997,tom sounds mad,Tom klingt verrückt
9998,tom sounds sad,Tom klingt traurig


In [43]:
# Removing all the digits from the text
remove_degits = str.maketrans('','',digits)
lines.eng = lines.eng.apply(lambda x: x.translate(remove_degits))
lines.de = lines.de.apply(lambda x: x.translate(remove_degits))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [44]:
lines

Unnamed: 0,eng,de
0,go,Geh
1,hi,Hallo
2,hi,Grüß Gott
3,run,Lauf
4,run,Lauf
...,...,...
9995,tom smelled it,Tom hat das gerochen
9996,tom smells bad,Tom riecht schlecht
9997,tom sounds mad,Tom klingt verrückt
9998,tom sounds sad,Tom klingt traurig


In [45]:
# Removing the extra spaces
lines.eng = lines.eng.apply(lambda x:x.strip())
lines.de = lines.de.apply(lambda x:x.strip())
lines.eng = lines.eng.apply(lambda x: re.sub(' +',' ',x))
lines.de = lines.de.apply(lambda x: re.sub(' +',' ',x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [46]:
lines

Unnamed: 0,eng,de
0,go,Geh
1,hi,Hallo
2,hi,Grüß Gott
3,run,Lauf
4,run,Lauf
...,...,...
9995,tom smelled it,Tom hat das gerochen
9996,tom smells bad,Tom riecht schlecht
9997,tom sounds mad,Tom klingt verrückt
9998,tom sounds sad,Tom klingt traurig


In [47]:
# Add start and end tokens to target sequences
lines.de = lines.de.apply(lambda x : 'START_ '+ x + ' _END')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [48]:
lines

Unnamed: 0,eng,de
0,go,START_ Geh _END
1,hi,START_ Hallo _END
2,hi,START_ Grüß Gott _END
3,run,START_ Lauf _END
4,run,START_ Lauf _END
...,...,...
9995,tom smelled it,START_ Tom hat das gerochen _END
9996,tom smells bad,START_ Tom riecht schlecht _END
9997,tom sounds mad,START_ Tom klingt verrückt _END
9998,tom sounds sad,START_ Tom klingt traurig _END


In [49]:
# Vocabulary of English
all_eng_words = set()
for sentence in lines.eng:
    for word in sentence.split():
        if word not in all_eng_words:
            all_eng_words.add(word)
# Vocabulary of Deutsch
all_de_words = set()
for sentence in lines.de:
    for word in sentence.split():
        if word not in all_de_words:
            all_de_words.add(word)

In [50]:
# Maximum length of source Sentence
length_list = []
for sentence in lines.eng:
    length_list.append(len(sentence.split(' ')))
max_length_src = np.max(length_list)
max_length_src

5

In [51]:
# Maximum length of target Sentence
length_list = []
for sentence in lines.de:
    length_list.append(len(sentence.split(' ')))
max_length_target = np.max(length_list)
max_length_target

11

In [52]:
input_words = sorted(list(all_eng_words))
target_words = sorted(list(all_de_words))
num_encoder_tokens = len(all_eng_words)
num_decoder_tokens = len(all_de_words)
num_encoder_tokens, num_decoder_tokens

(2248, 4072)

In [53]:
num_decoder_tokens += 1 # For zero padding
num_decoder_tokens

4073

In [54]:
input_token_index = dict([(word, i+1) for i, word in enumerate(input_words)])
target_token_index = dict([(word, i+1) for i, word in enumerate(target_words)])

In [55]:
reverse_input_char_index = dict((i, word) for word, i in input_token_index.items())
reverse_target_char_index = dict((i, word) for word, i in target_token_index.items())

In [56]:
lines = shuffle(lines)
lines.head(10)

Unnamed: 0,eng,de
9280,that was quick,START_ Das ging fix _END
6482,stop dreaming,START_ Hör auf zu träumen _END
2524,im selfish,START_ Ich bin selbstsüchtig _END
385,have fun,START_ Viel Vergnügen _END
1340,how absurd,START_ Wie absurd _END
3362,ask mary out,START_ Führe Maria aus _END
1309,he is busy,START_ Er ist beschäftigt _END
4152,its working,START_ Es funktioniert _END
1058,thats ok,START_ Das ist ok _END
8336,i promised tom,START_ Ich habe es Tom versprochen _END


In [57]:
# Train - Test Split in the ratio of 9:1
X, y = lines.eng, lines.de
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1)
X_train.shape, X_test.shape

((9000,), (1000,))

Save the train and test dataframes for reproducing the results later, as they are shuffled.

In [58]:
X_train.to_pickle('deu-eng/X_train.pkl')
X_test.to_pickle('deu-eng/X_test.pkl')

In [59]:
#Generating the batch of data
def generate_batch(X = X_train, y = y_train, batch_size = 128):
    while True:
        for j in range(0 , len(X), batch_size):
            encoder_input_data = np.zeros((batch_size, max_length_src), dtype = 'float32')
            decoder_input_data = np.zeros((batch_size, max_length_target), dtype = 'float32')
            decoder_target_data = np.zeros((batch_size, max_length_target, num_decoder_tokens), dtype = 'float32')
            for i, (input_text, target_text) in enumerate (zip(X[j:j+batch_size], y[j:j+batch_size])):
                for t,word in enumerate (input_text.split()):
                    encoder_input_data[i,t] = input_token_index[word] # encoder input seq
                for t, word in enumerate (target_text.split()):
                    if t<len(target_text.split())-1:
                        decoder_input_data[i,t] = target_token_index[word] # decoder input seq
                    if t>0:
                        # decoder target sequence (one hot encoded)
                        # does not include the START_ token
                        # Offset by one timestep
                        decoder_target_data[i, t - 1, target_token_index[word]] = 1.
            yield([encoder_input_data, decoder_input_data], decoder_target_data)

# Encoder - Decoder Model Architecture

In [60]:
latent_dim = 256

In [67]:
#Encoder
encoder_inputs = Input(shape = (None,))
encoder_embedded = Embedding(num_encoder_tokens+1, latent_dim, mask_zero=True)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state = True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedded)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

In [68]:
#Decoder
decoder_inputs = Input(shape=(None,))
decoder_embedded_layer = Embedding(num_decoder_tokens+1, latent_dim, mask_zero = True)
decoder_embedded = decoder_embedded_layer(decoder_inputs)
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = LSTM(latent_dim, return_sequences = True, return_state = True)
decoder_outputs, _,_ = decoder_lstm(decoder_embedded, initial_state = encoder_states)
decoder_dense = Dense(num_decoder_tokens,activation = 'softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs,decoder_inputs], decoder_outputs)

In [69]:
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])

In [70]:
train_samples = len(X_train)
val_samples = len(X_test)
batch_size = 128
epochs = 50

In [71]:
import time

In [72]:
%%time
model.fit_generator(generator = generate_batch(X_train, y_train, batch_size = batch_size),
                    steps_per_epoch = train_samples//batch_size,
                    epochs=epochs,
                    validation_data = generate_batch(X_test, y_test, batch_size = batch_size),
                    validation_steps = val_samples//batch_size)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Wall time: 16min 19s


<tensorflow.python.keras.callbacks.History at 0x18dcb07ea90>

# Saving the weights

In [74]:
model.save_weights('deu-eng/nmt_weights.h5')

# Load the weights

In [75]:
model.load_weights('deu-eng/nmt_weights.h5')

# Inference Setup

In [76]:
# Encode the input sequence to get the "thought vectors"
encoder_model = Model(encoder_inputs, encoder_states)

# Decoder setup
# Below tensors will hold the states of the previous time step
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

dec_emb2= decoder_embedded_layer(decoder_inputs) # Get the embeddings of the decoder sequence

# To predict the next word in the sequence, set the initial states to the states from the previous time step
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=decoder_states_inputs)
decoder_states2 = [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_outputs2) # A dense softmax layer to generate prob dist. over the target vocabulary

# Final decoder model
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs2] + decoder_states2)

# Decode sample sequeces

In [77]:

def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)
    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1,1))
    # Populate the first character of target sequence with the start character.
    target_seq[0, 0] = target_token_index['START_']

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += ' '+sampled_char

        # Exit condition: either hit max length
        # or find stop character.
        if (sampled_char == '_END' or
           len(decoded_sentence) > 50):
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1,1))
        target_seq[0, 0] = sampled_token_index

        # Update states
        states_value = [h, c]

    return decoded_sentence

# Evaluation on Train Dataset

In [78]:
train_gen = generate_batch(X_train, y_train, batch_size = 1)
k=-1

In [79]:

k+=1
(input_seq, actual_output), _ = next(train_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input English sentence:', X_train[k:k+1].values[0])
print('Actual Marathi Translation:', y_train[k:k+1].values[0][6:-4])
print('Predicted Marathi Translation:', decoded_sentence[:-4])

Input English sentence: ill tell tom
Actual Marathi Translation:  Ich werde es Tom sagen 
Predicted Marathi Translation:  Ich sags Tom 


In [80]:
k+=1
(input_seq, actual_output), _ = next(train_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input English sentence:', X_train[k:k+1].values[0])
print('Actual Marathi Translation:', y_train[k:k+1].values[0][6:-4])
print('Predicted Marathi Translation:', decoded_sentence[:-4])

Input English sentence: im sick
Actual Marathi Translation:  Ich bin krank 
Predicted Marathi Translation:  Ich bin krank 


In [81]:
k+=1
(input_seq, actual_output), _ = next(train_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input English sentence:', X_train[k:k+1].values[0])
print('Actual Marathi Translation:', y_train[k:k+1].values[0][6:-4])
print('Predicted Marathi Translation:', decoded_sentence[:-4])

Input English sentence: im through
Actual Marathi Translation:  Ich bin damit durch 
Predicted Marathi Translation:  Ich bin damit 


In [82]:
k+=1
(input_seq, actual_output), _ = next(train_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input English sentence:', X_train[k:k+1].values[0])
print('Actual Marathi Translation:', y_train[k:k+1].values[0][6:-4])
print('Predicted Marathi Translation:', decoded_sentence[:-4])

Input English sentence: tom was late
Actual Marathi Translation:  Tom kam zu spät 
Predicted Marathi Translation:  Tom war früh 


In [83]:
k+=1
(input_seq, actual_output), _ = next(train_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input English sentence:', X_train[k:k+1].values[0])
print('Actual Marathi Translation:', y_train[k:k+1].values[0][6:-4])
print('Predicted Marathi Translation:', decoded_sentence[:-4])

Input English sentence: use caution
Actual Marathi Translation:  Lassen Sie Vorsicht walten 
Predicted Marathi Translation:  Lasst Vorsicht walten 


# Evaluation on Validation Dataset

In [84]:
val_gen = generate_batch(X_test, y_test, batch_size = 1)
k=-1

In [85]:
k+=1
(input_seq, actual_output), _ = next(val_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input English sentence:', X_test[k:k+1].values[0])
print('Actual Marathi Translation:', y_test[k:k+1].values[0][6:-4])
print('Predicted Marathi Translation:', decoded_sentence[:-4])

Input English sentence: have fun
Actual Marathi Translation:  Viel Spaß 
Predicted Marathi Translation:  Viel Vergnügen 


In [86]:
k+=1
(input_seq, actual_output), _ = next(val_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input English sentence:', X_test[k:k+1].values[0])
print('Actual Marathi Translation:', y_test[k:k+1].values[0][6:-4])
print('Predicted Marathi Translation:', decoded_sentence[:-4])

Input English sentence: may i begin
Actual Marathi Translation:  Darf ich anfangen 
Predicted Marathi Translation:  Darf ich zuerst 


In [87]:

k+=1
(input_seq, actual_output), _ = next(val_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input English sentence:', X_test[k:k+1].values[0])
print('Actual Marathi Translation:', y_test[k:k+1].values[0][6:-4])
print('Predicted Marathi Translation:', decoded_sentence[:-4])

Input English sentence: its too easy
Actual Marathi Translation:  Es ist zu leicht 
Predicted Marathi Translation:  Das ist zu leicht 


In [88]:
k+=1
(input_seq, actual_output), _ = next(val_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input English sentence:', X_test[k:k+1].values[0])
print('Actual Marathi Translation:', y_test[k:k+1].values[0][6:-4])
print('Predicted Marathi Translation:', decoded_sentence[:-4])

Input English sentence: are you new
Actual Marathi Translation:  Bist du neu 
Predicted Marathi Translation:  Bist du neu 
