In [1]:
import numpy as np
import pandas as pd
from pickle import load

In [2]:
# Load the pickled English and Nepali token collections.
# Context managers guarantee the file handles are closed even if unpickling fails.
# NOTE(review): pickle can execute arbitrary code while loading; only use
# token files produced by this project.
with open('./en_tokens.pkl', 'rb') as en_file:
    en_tokens = load(en_file)
with open('./np_tokens.pkl', 'rb') as np_file:
    np_tokens = load(np_file)

# sorted() accepts any iterable and always returns a new list, so no explicit
# list() conversion is needed. Sorting gives the tokens a deterministic order.
# (The tokens are presumably stored as sets, per the original author's note.)
encoder_tokens = sorted(en_tokens)
decoder_tokens = sorted(np_tokens)

In [3]:
# Translation direction is English -> Nepali:
# the English vocabulary feeds the encoder, the Nepali vocabulary feeds the decoder.
num_encoder_tokens, num_decoder_tokens = len(encoder_tokens), len(decoder_tokens)

# Report both vocabulary sizes (decoder first, then encoder).
print("The length of Nepali/Decoder tokens is:", num_decoder_tokens)
print("The length of English/Encoder tokens is:", num_encoder_tokens)

The length of Nepali/Decoder tokens is: 186399
The length of English/Encoder tokens is: 64316


In [4]:
# Add one extra slot to the decoder vocabulary size for zero padding.
# The size is recomputed from len(decoder_tokens) instead of incremented in
# place, so re-running this cell does not keep inflating the count.
# NOTE(review): the encoder vocabulary indices also start at 1, yet
# num_encoder_tokens is not increased here - confirm that downstream code
# sizes the encoder side correctly.
num_decoder_tokens = len(decoder_tokens) + 1

print("The length of Nepali/Decoder tokens is:", num_decoder_tokens)
print("The length of English/Encoder tokens is:", num_encoder_tokens)

The length of Nepali/Decoder tokens is: 186400
The length of English/Encoder tokens is: 64316


In [5]:
# Build the word -> integer index lookup for each vocabulary.
# Indices start at 1, following the sorted token order; 0 is never assigned to a word.
encoder_token_dict = {
    token: index for index, token in enumerate(encoder_tokens, start=1)
}
decoder_token_dict = {
    token: index for index, token in enumerate(decoder_tokens, start=1)
}

# Random Splitting of Data

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
# Read the cleaned parallel corpus and preview its first five rows.
# NOTE(review): the recorded preview shows an "Unnamed: 0" column next to
# "Nepali" and "English"; it looks like a saved index column - confirm
# whether it is needed.
data = pd.read_csv(filepath_or_buffer="./cleaned_data.csv")
data.head(n=5)

Unnamed: 0,Nepali,English
0,START_TOKEN “मानौ एउटी स्त्रीसँग दशवटा चाँदीका...,or what woman if she had ten drachma coins if ...
1,START_TOKEN ती दुष्ट मानिसहरू हिंस्रक सिंहहरू ...,he is like a lion that is greedy of his prey a...
2,START_TOKEN प्रक्रिया दृश्य क्रम स्तम्भ END_TOKEN,process view sort column
3,START_TOKEN जहा ट्याबहरु देखाइन सकिन्थ्यो वा स...,whether tooltips should be shown on widgets
4,START_TOKEN अनुष्ठान अनुसार जहां केटि र महिलाह...,ritual servitude where girls and women are ple...


In [8]:
# English sentences are the inputs and Nepali sentences are the targets.
x = data["English"]
y = data["Nepali"]

# Hold out 10% of the sentence pairs for testing; the fixed random_state
# keeps the split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.1, random_state=42
)

# Sanity check: the two partitions together must account for every row.
assert len(x_train) + len(x_test) == len(x)

print("The shape of the training data is:", x_train.shape, y_train.shape)
print("The shape of the test data is:", x_test.shape, y_test.shape)

The shape of the training data is: (144216,) (144216,)
The shape of the trest data is: (16025,) (16025,)
