In [1]:
import os
import re
import nltk
import string
import torch
import numpy as np
import pandas as pd
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

In [2]:
working_dir = os.getcwd()
os.listdir(working_dir)

['.ipynb_checkpoints', 'data_preprocessing.ipynb']

In [3]:
data_dir = os.path.join(working_dir, '..')
os.listdir(data_dir)

['encoder_decoder', 'eng2ben - collateral.csv']

In [4]:
main_df = pd.read_csv(os.path.join(data_dir, 'eng2ben - collateral.csv'), names=['English', 'Bengali'], encoding='utf-8')
main_df.head()

Unnamed: 0,English,Bengali
0,"- You okay? - Yeah, I'm fine, mate.","- আপানি ঠিক আছেন? - হ্যাঁ, আমি ভালো আছি, ভায়া।"
1,Don't worry about it.,এটা নিয়ে একদম চিন্তা করবেন না।
2,You all right?,আপনি ঠিক আছেন তো?
3,Enjoy L.A.,লস এঞ্জেলেসে আসাটা উপভোগ্য হোক।
4,- He did it. - It's all right.,- সে সেটা করেছিলো। - হুম ঠিক আছে।


In [5]:
len(main_df)

1157

In [6]:
main_df.isnull()

Unnamed: 0,English,Bengali
0,False,False
1,False,False
2,False,False
3,False,False
4,False,False
...,...,...
1152,False,False
1153,False,False
1154,False,False
1155,False,False


In [7]:
df = main_df.copy()

In [8]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [9]:
def remove_punctuation(main_text):
    processed_text = "".join([c for c in main_text if c not in string.punctuation and ord(c) != 2404])
    return processed_text

In [10]:
df['Processed_English'] = df['English'].apply(lambda x: remove_punctuation(x))

In [11]:
df['Processed_English']

0                     You okay  Yeah Im fine mate
1                             Dont worry about it
2                                   You all right
3                                        Enjoy LA
4                        He did it  Its all right
                          ...                    
1152                       I do this for a living
1153                 Were almost at the next stop
1154                                      Hey Max
1155    A guy gets on the MTA here in LA and dies
1156                    Think anybody will notice
Name: Processed_English, Length: 1157, dtype: object

In [12]:
df['Processed_Bengali'] = df['Bengali'].apply(lambda x: remove_punctuation(x))

In [13]:
df['Processed_Bengali']

0                 আপানি ঠিক আছেন  হ্যাঁ আমি ভালো আছি ভায়া
1                           এটা নিয়ে একদম চিন্তা করবেন না
2                                        আপনি ঠিক আছেন তো
3                          লস এঞ্জেলেসে আসাটা উপভোগ্য হোক
4                            সে সেটা করেছিলো  হুম ঠিক আছে
                              ...                        
1152                   আমি এটা বেঁচে থাকার কাজ হিসাবে করি
1153                          আমরা প্রায় শেষ স্টপেজে আছি
1154                                          হেই ম্যাক্স
1155    লস এঞ্জেলেসে একটা মানুষ ট্রেনে উঠবে  আর সেখানে...
1156                   কেউ কি সেটা খেয়াল করবে বলে মনে করো
Name: Processed_Bengali, Length: 1157, dtype: object

In [14]:
df['Processed_English'] = df['Processed_English'].apply(lambda x: x.lower())

In [15]:
df['Processed_English']

0                     you okay  yeah im fine mate
1                             dont worry about it
2                                   you all right
3                                        enjoy la
4                        he did it  its all right
                          ...                    
1152                       i do this for a living
1153                 were almost at the next stop
1154                                      hey max
1155    a guy gets on the mta here in la and dies
1156                    think anybody will notice
Name: Processed_English, Length: 1157, dtype: object

### Tokenization part

In [16]:
def tokenize_english_data(text):
    token = re.split('\W+', text.lstrip().rstrip())
    return token

In [17]:
df['Processed_English'] = df['Processed_English'].apply(lambda x: tokenize_english_data(x))
df['Processed_English'][:20]

0                     [you, okay, yeah, im, fine, mate]
1                              [dont, worry, about, it]
2                                     [you, all, right]
3                                           [enjoy, la]
4                        [he, did, it, its, all, right]
5     [can, you, tell, me, why, everything, is, alwa...
6              [everything, is, not, always, about, me]
7     [that, gearhead, with, his, pocket, protector,...
8     [and, you, damn, well, know, it, sarcastic, yo...
9     [im, sorry, i, just, didnt, see, it, that, way...
10    [what, about, the, dig, about, the, makeover, ...
11    [what, do, you, want, me, to, do, i, work, wit...
12    [and, you, know, what, youre, perfectly, capab...
13    [you, know, something, the, last, time, i, che...
14    [so, unless, you, want, to, start, fucking, hi...
15    [pal, where, can, i, catch, a, shuttle, to, th...
16                                [back, there, thanks]
17    [it, was, him, in, that, gold, lexus, on, 

In [18]:
def tokenize_bengali_data(text):
    token = [word for word in text.split()]
    return token

In [19]:
df['Processed_Bengali'] = df['Processed_Bengali'].apply(lambda x: tokenize_bengali_data(x))

In [20]:
df['Processed_Bengali'][:20]

0       [আপানি, ঠিক, আছেন, হ্যাঁ, আমি, ভালো, আছি, ভায়া]
1                  [এটা, নিয়ে, একদম, চিন্তা, করবেন, না]
2                                 [আপনি, ঠিক, আছেন, তো]
3                  [লস, এঞ্জেলেসে, আসাটা, উপভোগ্য, হোক]
4                    [সে, সেটা, করেছিলো, হুম, ঠিক, আছে]
5     [তুমি, কি, আমাকে, বলতে, পারবে, সবকিছু, কেনো, স...
6           [সবকিছু, সবসময়, শুধু, আমাকে, ঘিরে, ঘটে, না]
7     [সে, সব, সময়, আমার, সাথে, অন্য, রকম, কিছু, করত...
8     [আর, তুমিও, সেটা, ভালো, করেই, জানতে, অন্য, রকম...
9     [আমি, দুঃখিত, আমি, আসলে, এটা, আদৌ, এভাবে, দেখত...
10    [তুমি, কি, আসলে, দেখেও, না, দেখার, ভান, করতে, ...
11    [তুমি, আসলে, আমাকে, দিয়ে, কি, করাতে, চাচ্ছো, আ...
12    [আর, তুমি, জানো, কি, তুমি, নিজেই, তোমার, সমস্য...
13    [তুমি, কি, একটা, জিনিস, জানো, শেষবার, আমি, চেক...
14    [আর, তুমি, এখন, আবার, তাকে, খুশী, করার, চেষ্টা...
15    [ভাই, আমি, এয়ারপোর্টে, যাবার, শাটল, কোথা, থেক...
16                           [পিছন, দিক, থেকে, ধন্যবাদ]
17     [সেটা, ছিলো, সেই, সোনালী, রঙের, লেক্সাস, 

In [21]:
english_token_set = set(word for l in df['Processed_English'].values for word in l)
len(english_token_set)

1576

In [22]:
bengali_token_set = set(word for l in df['Processed_Bengali'].values for word in l)
len(bengali_token_set)

2060

In [23]:
num_encoder_tokens = len(english_token_set)
num_decoder_tokens = len(bengali_token_set) + 1

print("Number of encoder tokens: ", num_encoder_tokens)
print("Number of decoder tokens: ", num_decoder_tokens)

Number of encoder tokens:  1576
Number of decoder tokens:  2061


In [24]:
english_word_to_token = dict([(word, i+1) for i, word in enumerate(list(english_token_set))])

In [25]:
bengali_word_to_token = dict([(word, i+1) for i, word in enumerate(list(bengali_token_set))])

In [26]:
#token to word
english_token_to_word = dict([i, word] for word, i in english_word_to_token.items())
bengali_token_to_word = dict([i, word] for word, i in bengali_word_to_token.items())

### Build Encoder

In [27]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        
        #embedding taking input dim and then arranging output dim for hidden layers
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size)
    
    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.lstm(output, hidden)
        return output, hidden
    
    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device = device)