In [1]:
import numpy as np
import pandas as pd
from pickle import load

In [2]:
en_tokens = load(open('./en_tokens.pkl', 'rb'))
np_tokens = load(open('./np_tokens.pkl', 'rb'))

# sorted() can only be used with iterable data structure
# So, here we are converting set() type to list() type
encoder_tokens = sorted(list(en_tokens))
decoder_tokens = sorted(list(np_tokens))

In [3]:
# Here we are translating from English to Nepali language
# So, en_tokens act as encoder tokens and np_tokens act as decoder tokens
num_decoder_tokens = len(decoder_tokens)
num_encoder_tokens = len(encoder_tokens)

print("The length of Nepali/Decoder tokens is:", num_decoder_tokens)
print("The length of English/Encoder tokens is:", num_encoder_tokens)

The length of Nepali/Decoder tokens is: 186399
The length of English/Encoder tokens is: 64316


In [4]:
# For zero padding
num_decoder_tokens += 1

print("The length of Nepali/Decoder tokens is:", num_decoder_tokens)
print("The length of English/Encoder tokens is:", num_encoder_tokens)

The length of Nepali/Decoder tokens is: 186400
The length of English/Encoder tokens is: 64316


In [5]:
# Creating Vocabulary of the words
encoder_token_dict = dict([(word, i+1) for i, word in enumerate(encoder_tokens)])
decoder_token_dict = dict([(word, i+1) for i, word in enumerate(decoder_tokens)])

# Random Splitting of Data

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
data = pd.read_csv("./cleaned_data.csv")
data.head()

Unnamed: 0,Nepali,English
0,START_TOKEN “मानौ एउटी स्त्रीसँग दशवटा चाँदीका...,or what woman if she had ten drachma coins if ...
1,START_TOKEN ती दुष्ट मानिसहरू हिंस्रक सिंहहरू ...,he is like a lion that is greedy of his prey a...
2,START_TOKEN प्रक्रिया दृश्य क्रम स्तम्भ END_TOKEN,process view sort column
3,START_TOKEN जहा ट्याबहरु देखाइन सकिन्थ्यो वा स...,whether tooltips should be shown on widgets
4,START_TOKEN अनुष्ठान अनुसार जहां केटि र महिलाह...,ritual servitude where girls and women are ple...


In [8]:
x = data["English"]
y = data["Nepali"]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.1, random_state=42)

print("The shape of the training data is:", x_train.shape, y_train.shape)
print("The shape of the trest data is:", x_test.shape, y_test.shape)

The shape of the training data is: (144216,) (144216,)
The shape of the trest data is: (16025,) (16025,)


# Encoding/Mapping Text to Numerical Values

In [9]:
np_words_count = data["Nepali"].apply(lambda x:len(str(x).split()))
en_words_count = data["English"].apply(lambda x:len(str(x).split()))

np_sentence_max_length = max(np_words_count)
en_sentence_max_length = max(en_words_count)

print("The maximum number of words in one single Nepali sentence throughout the corpus is:", np_sentence_max_length)
print("The maximum number of words in one single English sentence throughout the corpus is:", en_sentence_max_length)

The maximum number of words in one single Nepali sentence throughout the corpus is: 388
The maximum number of words in one single English sentence throughout the corpus is: 498


In [10]:
for i, j in enumerate(np_words_count):
    if j == np_sentence_max_length:
        print(i)

128341


In [11]:
for i, j in enumerate(en_words_count):
    if j == en_sentence_max_length:
        print(i)

128341


In [12]:
print(data["English"][128341])

fall on the enemy dig below the gap on the right then quickly go right over his head pick up the gold fall through a false brick run up the ladder to the right and go to the right along the lower pole dig to the right which traps the second enemy get the gold fall dig left fall collect two gold pieces and go up two ladders move to the left via a false brick the second enemy should still be trapped below go left to the ladder down then left and down through a false brick collect one gold dig on the right get three gold dig and go up all the ladders on the right go left on the pole again but this time climb upwards get one gold and dig right to trap another enemy move up to the top go right fall and continue to the concrete area at top center outwit the enemy in the concrete area then exit by falling on him and moving to the gap on the left and down through two false bricks hold onto the pole at center screen go right and back on up to the top move left on the pole and wait for the other

In [13]:
print(data["Nepali"][128341])

START_TOKEN शत्रु माथि झर्नुहोस् दायाँको खाली स्थान तल खन्नुहोस् त्यसपछि उसको टाउको माथिदायाँ तिर छिटो दायाँ जानुहोस् सुन टिप्नुहोस् झुक्याउने इट्टाबाट झर्नुहोस् दायाँको भर्याङको माथि दगुर्नुहोस् र तल्लो पोलको दायाँ जानुहोस् दायाँ तिर खन्नुहोस् जसले दोस्रो शत्रुलाई जालमा पर्दछ सुन प्राप्त गर्नुहोस् झर्नुहोस् बायाँ खन्नुहोस् झर्नुहोस् दुइ सुनका टुक्रा सङ्कलन गर्नुहोस् र दुइ भर्याङ माथि जानुहोस् झुक्याउने इट्टा हुँदै बायाँ जानुहोस् दोस्रो शत्रु तल जालमा रहिरहेको हुनुपर्दछ भर्याङको बायाँ तल जानुहोस् त्यसपछि झुक्याउने इट्टा हुँदै बायाँ र तल जानुहोस् एउटा सुन सङ्कलन गर्नुहोस् दायाँ खन्नुहोस् तीनवटा सुन लिनुहोस् खन्नुहोस् र दायाँका सबै भर्याङ माथि जानुहोस् पोलको बायाँ फेरि जानुहोस् तर यस पाली माथ तिर उक्लनुहोस् एउटा सुन लिनुहोस् र अर्को शत्रुलाई जालमा पार्न दायाँ खन्नुहोस् माथि तिर जानुहोस् दायाँ जानुहोस् झर्नुहोस् र माथि बीचको ढलान क्षेत्रमा गइराख्नुहोस् ढलान क्षेत्रमा शत्रुलाई झुक्याउनुहोस् त्यसपछि उसमाथि झर्नुहोस् र दुइ झुक्याउने इट्टाबाट बायाँ तलको खाली स्थानमा गएर बाहिरिनुहोस् पर्दा बीच

In [14]:
def generate_batch(x, y, batch_size):
    while True:
        for i in range(0, len(x), batch_size):
            encoder_input_vec = np.zeros((batch_size, en_sentence_max_length), dtype='float32')
            decoder_input_vec = np.zeros((batch_size, np_sentence_max_length), dtype='float32')
            decoder_target_vec = np.zeros((batch_size, np_sentence_max_length, num_decoder_tokens), dtype='float32')
            
            for j, (input_text, target_text) in enumerate(zip(x[i:i+batch_size], y[i:i+batch_size])):
                for t, word in enumerate(input_text.split()):
                    # Encoder input sequence
                    encoder_input_vec[j, t] = encoder_token_dict[word]
                    for t, word in enumerate(target_text.split()):
                        if t < len(target_text.split())-1:
                            # Decoder input sequence
                            decoder_input_vec[j, t] = decoder_token_dict[word]
                        elif t > 0:
                            # decoder target sequence (one hot encoded)
                            # does not include the START_ token
                            # Offset by one timestep
                            decoder_target_vec[j, t-1, decoder_token_dict[word]] = 1.
            yield([encoder_input_vec, decoder_input_vec], decoder_target_vec)            

# Designing Encoder-Decoder Architecture

In [15]:
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense
from tensorflow.keras.models import Model

In [16]:
latent_dim=256

In [17]:
encoder_inputs = Input(shape=(None,))
encoder_embedded = Embedding(num_encoder_tokens, latent_dim, mask_zero=True)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedded)
# We discard `encoder_outputs` and only keep the states
encoder_states = [state_h, state_c]

In [18]:
# Setting up the decoder, using encoder_states as initial state.
decoder_inputs = Input(shape=(None,))
decoder_embedding_layer = Embedding(num_decoder_tokens, latent_dim, mask_zero=True)
decoder_embedded = decoder_embedding_layer(decoder_inputs)
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, state_dh, state_dc = decoder_lstm(decoder_embedded, initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

In [19]:
# Define the model that will turn encoder_input data & decoder_input_vec into decoder_target_vec
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

In [20]:
model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, None, 256)    16464896    input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 256)    47718400    input_2[0][0]                    
_______________________________________________________________________________________

In [21]:
train_samples = len(x_train)
val_samples = len(x_test)
batch_size = 10
epochs = 20

In [22]:
for i in range(epochs):
    generator = generate_batch(x_train, y_train, batch_size=batch_size)
    model.fit_generator(generator=generator,
                    steps_per_epoch=train_samples//batch_size,
                    epochs=1,
                    validation_data=generate_batch(x_test, y_test, batch_size=batch_size),
                    validation_steps=val_samples//batch_size,
                    verbose=1)
    model.save(f"./Models/model_{i}.h5" )

Instructions for updating:
Please use Model.fit, which supports generators.
  106/14421 [..............................] - ETA: 37:52:35 - loss: 0.0041

KeyboardInterrupt: 