In [5]:
import torch
from torch import nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader

import re
import string

In [6]:
# Read data
# A forward slash keeps the path portable (and avoids the invalid "\m" escape);
# the explicit encoding is needed because the file contains Malayalam text.
with open('mal-eng/mal.txt', 'r', encoding='utf-8') as file:
    data = file.read()

In [7]:
# Preview the first 500 characters of the raw file
print(data[0:500])

Hello.	നമസ്കാരം.	CC-BY 2.0 (France) Attribution: tatoeba.org #1858850 (LanguageExpert) & #651913 (jjrodz)
Really?	ശരിക്കും?	CC-BY 2.0 (France) Attribution: tatoeba.org #373216 (kotobaboke) & #7896041 (lonewolfie)
Help me.	എന്നെ സഹായിക്കൂ.	CC-BY 2.0 (France) Attribution: tatoeba.org #266065 (Zifre) & #780454 (jjrodz)
Welcome.	സ്വാഗതം.	CC-BY 2.0 (France) Attribution: tatoeba.org #138919 (CM) & #7896035 (lonewolfie)
I forgot.	ഞാന്‍ മറന്നു.	CC-BY 2.0 (France) Attribution: tatoeba.org #436603 (lukasz


In [8]:
# Keep only the first two tab-separated columns of every line
eng_ml_data = list(map(lambda line: line.split('\t')[:2], data.split('\n')))

In [9]:
# Show the first five sentence pairs
print(eng_ml_data[0:5])

[['Hello.', 'നമസ്കാരം.'], ['Really?', 'ശരിക്കും?'], ['Help me.', 'എന്നെ സഹായിക്കൂ.'], ['Welcome.', 'സ്വാഗതം.'], ['I forgot.', 'ഞാന്\u200d മറന്നു.']]


In [10]:
# Split the pairs into two parallel lists, dropping rows that lack a second column
complete_pairs = [pair for pair in eng_ml_data if not len(pair) < 2]
eng_data = [pair[0] for pair in complete_pairs]
mal_data = [pair[1] for pair in complete_pairs]

In [11]:
# Report how many sentences were kept on each side
for label, sentences in (('number of data in eng: ', eng_data),
                         ('number of data in mal: ', mal_data)):
    print(label, len(sentences))

number of data in eng:  614
number of data in mal:  614


In [12]:
# Show the first four aligned (English, Malayalam) pairs
print(list(zip(eng_data[:4], mal_data[:4])))

[('Hello.', 'നമസ്കാരം.'), ('Really?', 'ശരിക്കും?'), ('Help me.', 'എന്നെ സഹായിക്കൂ.'), ('Welcome.', 'സ്വാഗതം.')]


In [13]:
# Remove punctuations
# Translation table that deletes every character in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_puctuations(word):
    """Strip ASCII punctuation from ``word`` and lowercase the result.

    A translation table is used instead of a regex character class built from
    ``string.punctuation``: inside ``[...]`` the backslash escapes the
    following ``]``, so the backslash itself was never removed.

    Args:
        word: The text to clean (a single word or a whole sentence).

    Returns:
        The lowercased text with all ``string.punctuation`` characters removed.
        Non-ASCII characters (e.g. Malayalam script) are left untouched.
    """
    return word.translate(_PUNCTUATION_TABLE).lower()

In [14]:
# Normalise both sides: strip punctuation and lowercase every sentence
eng_data = [remove_puctuations(sentence) for sentence in eng_data]
mal_data = [remove_puctuations(sentence) for sentence in mal_data]

In [15]:
# Check the cleaned Malayalam sentences
print(mal_data[0:4])

['നമസ്കാരം', 'ശരിക്കും', 'എന്നെ സഹായിക്കൂ', 'സ്വാഗതം']


In [44]:
class LangPorcess:
    """Word-level vocabulary builder and integer encoder for a list of sentences.

    Index 0 is reserved for the ``'SOS'`` token; every new word seen by
    ``fit`` receives the next free index. Padded encodings also use 0 as the
    padding value, so padding and ``'SOS'`` share the same index.
    """

    def __init__(self):
        self.__word2int = {'SOS': 0}
        self.__int2word = {0: 'SOS'}
        self.__num_unique_tokens = 1  # next free vocabulary index
        self.__word_freq = {}
        self.__integer_encoded = []

    def fit(self, X):
        """Build the vocabulary from ``X`` and store its integer encoding.

        Args:
            X: Iterable of sentences (strings). Each sentence is split on a
                single space.

        Each call appends the new encodings to the ones already stored.
        """
        X_split_words = [text.split(' ') for text in X]
        self.read_sentences(X_split_words)
        self.__integer_encoding(X_split_words)

    def read_sentences(self, X):
        """Register every word of every tokenised sentence and count it.

        Args:
            X: Iterable of sentences, each a list of word strings.
        """
        for sentence in X:
            for word in sentence:
                if word not in self.__word2int:
                    self.read_words(word)
                # .get() keeps this safe for words that are in the vocabulary
                # but were never counted (e.g. a literal 'SOS' in the corpus).
                self.__word_freq[word] = self.__word_freq.get(word, 0) + 1

    def read_words(self, word):
        """Assign the next free index to ``word``; known words are left as is."""
        if word in self.__word2int:
            return
        self.__word2int[word] = self.__num_unique_tokens
        self.__int2word[self.__num_unique_tokens] = word
        self.__num_unique_tokens += 1

    def get_word_frequency(self):
        """Return a ``{word: count}`` dict ordered from most to least frequent."""
        return dict(sorted(self.__word_freq.items(),
                           key=lambda x: x[1],
                           reverse=True))

    def get_word2index(self):
        """Return the ``{word: index}`` mapping."""
        return self.__word2int

    def get_index2word(self):
        """Return the ``{index: word}`` mapping."""
        return self.__int2word

    def __integer_encoding(self, X):
        """Convert each tokenised sentence into a 1-D ``torch.long`` tensor."""
        for sentence in X:
            indices = [self.__word2int[word] for word in sentence]
            # Explicit dtype so that an empty sentence does not become float.
            self.__integer_encoded.append(torch.tensor(indices, dtype=torch.long))

    def get_integer_encoding(self, padding=True, max_len=None):
        """Return the stored encodings.

        Args:
            padding: If true, return one ``(num_sentences, length)`` tensor
                padded with 0; otherwise return the list of 1-D tensors.
            max_len: Optional cap on the padded length. Longer sequences are
                truncated; sequences are never padded beyond the longest
                stored sentence.
        """
        if padding:
            return self.__pad_sequence(self.__integer_encoded, max_len)
        else:
            return self.__integer_encoded

    def __pad_sequence(self, X, max_len):
        """Pad the tensors in ``X`` with 0 and optionally truncate to ``max_len``."""
        maxlen_padded = pad_sequence(X, padding_value=0, batch_first=True)
        if max_len:
            return maxlen_padded[:, :max_len]
        else:
            return maxlen_padded
       
        

In [45]:
# One vocabulary / encoder per language
eng_lang, mal_lang = LangPorcess(), LangPorcess()
eng_lang.fit(eng_data)
mal_lang.fit(mal_data)

In [47]:
# Padded index tensors, capped at 11 (English) and 7 (Malayalam) tokens
eng_integers = eng_lang.get_integer_encoding(padding=True, max_len=11)
mal_integers = mal_lang.get_integer_encoding(padding=True, max_len=7)

In [48]:
class TorchDataset(Dataset):
    """Pairs encoder-side and decoder-side sequences by position."""

    def __init__(self, enc_sentance, dec_sentance):
        super().__init__()
        self.enc_sentance, self.dec_sentance = enc_sentance, dec_sentance

    def __len__(self):
        # The length is taken from the encoder side only.
        n_items = len(self.enc_sentance)
        return n_items

    def __getitem__(self, index):
        # Return the (encoder, decoder) pair stored at the same position.
        source = self.enc_sentance[index]
        target = self.dec_sentance[index]
        return source, target



In [49]:
# Wrap the padded index tensors as (English, Malayalam) pairs
data = TorchDataset(
    enc_sentance=eng_integers,
    dec_sentance=mal_integers,
)

In [50]:
dataloader = DataLoader(data,
                        batch_size=32,
                        shuffle=True,
                        # Pinned memory only speeds up host-to-GPU copies;
                        # enabling it without CUDA has no benefit and warns.
                        pin_memory=torch.cuda.is_available())

In [53]:
# Print the first batch only, then stop iterating
for i, batch in enumerate(dataloader):
    inp, ter = batch
    if i >= 1:
        break
    print(inp, ter)

tensor([[ 28,  51,  78, 251, 252,   0,   0,   0,   0,   0,   0],
        [  6, 108, 109,  82,   0,   0,   0,   0,   0,   0,   0],
        [  6,  33,  34,   0,   0,   0,   0,   0,   0,   0,   0],
        [  6, 189,  43,  18,  50, 366,   0,   0,   0,   0,   0],
        [ 43,  76,  80,   4, 160,   0,   0,   0,   0,   0,   0],
        [  6,  61,  18, 293,  16,   0,   0,   0,   0,   0,   0],
        [494, 274, 495, 496,   0,   0,   0,   0,   0,   0,   0],
        [477, 186, 478, 237, 479,   0,   0,   0,   0,   0,   0],
        [  6,   8,   9,   0,   0,   0,   0,   0,   0,   0,   0],
        [326, 399,   4, 400,   0,   0,   0,   0,   0,   0,   0],
        [683, 167,  80, 274, 181, 505, 405,   0,   0,   0,   0],
        [ 98, 205, 456, 929, 930, 148, 153, 931,   0,   0,   0],
        [ 16, 375, 150, 183, 398, 173,  80, 957,  80,  69, 792],
        [  6, 185, 220,  87,   0,   0,   0,   0,   0,   0,   0],
        [  6, 407, 569, 529, 536,   0,   0,   0,   0,   0,   0],
        [ 28, 139, 474, 5

In [113]:
class Encoder(nn.Module):
  """Embedding followed by a single-layer, batch-first LSTM.

  Args:
    embedding_size: Dimension of the token embeddings.
    hidden_size: Dimension of the LSTM hidden and cell states.
    vocab_size: Number of distinct token indices in the source vocabulary.
  """

  def __init__(self,
               embedding_size,
               hidden_size,
               vocab_size):
    # nn.Module.__init__ takes no positional arguments; passing ``self``
    # explicitly raises a TypeError.
    super().__init__()

    # encoder initialization
    self.embedding = nn.Embedding(num_embeddings=vocab_size,
                                  embedding_dim=embedding_size)
    self.encoder_lstm = nn.LSTM(input_size=embedding_size,
                                hidden_size=hidden_size,
                                batch_first=True)

  def forward(self, input):
    """Encode a batch of token indices.

    Args:
      input: Integer tensor of token indices, shape (batch, timesteps).

    Returns:
      Tuple ``(encoder_outputs, final_hidden_state, final_cell_state)`` with
      shapes (batch, timesteps, hidden_size), (1, batch, hidden_size) and
      (1, batch, hidden_size).
    """
    embedding_input = self.embedding(input)
    encoder_outputs, (final_hidden_state, final_cell_state) = self.encoder_lstm(embedding_input)

    return encoder_outputs, final_hidden_state, final_cell_state




In [None]:
class Attention(nn.Module):
  """Scores each encoder timestep against a hidden state with a small MLP.

  The scores are softmax-normalised over the timestep axis and used to build
  a weighted sum of the encoder outputs (the context vector).

  Args:
    hidden_size: Feature dimension shared by encoder outputs and hidden state.
  """

  def __init__(self, hidden_size):
    # nn.Module.__init__ takes no positional arguments.
    super().__init__()

    self.network = nn.Sequential(
          nn.Linear(2*hidden_size, hidden_size),
          nn.SELU(),
          nn.Linear(hidden_size, 1),
          nn.Softmax(dim=1)  # normalise over the timestep axis
        )

  def forward(self, encoder_outputs, hidden_state):
    """Compute the context vector for one decoder step.

    Args:
      encoder_outputs: Tensor of shape (batch, timesteps, hidden_size).
      hidden_state: Tensor of shape (1, batch, hidden_size), i.e. the layout
        of a single-layer LSTM hidden state.

    Returns:
      Context tensor of shape (1, batch, hidden_size), the same layout as the
      incoming hidden state.
    """
    encoder_timestep_len = encoder_outputs.size(1)
    hidden_state = hidden_state.permute(1, 0, 2)  # (1, batch, hidden) -> (batch, 1, hidden)
    # Repeat the single hidden vector along the timestep axis so that it can be
    # concatenated with every encoder output.
    hidden_repeated = hidden_state.repeat(1, encoder_timestep_len, 1)  # (batch, timesteps, hidden)

    # concat encoder outputs with the repeated hidden state
    encoder_hidden_concat = torch.cat((encoder_outputs, hidden_repeated), dim=-1)  # (batch, timesteps, 2*hidden)
    weights = self.network(encoder_hidden_concat)  # (batch, timesteps, 1)
    weights = weights.permute(0, 2, 1)  # (batch, 1, timesteps), ready for bmm
    context_vectores = weights.bmm(encoder_outputs)  # (batch, 1, hidden)
    context_vectores = context_vectores.permute(1, 0, 2)  # back to (1, batch, hidden)

    return context_vectores

In [None]:
class Decoder(nn.Module):
  """LSTM decoder whose hidden state is replaced by an attention context each step.

  Args:
    vocab_size_tr: Size of the target vocabulary (also the logit dimension).
    embedding_dim: Dimension of the target token embeddings.
    hidden_size: Dimension of the LSTM hidden and cell states.
    max_len: Maximum number of decoding steps.
    sos_token: Index fed as the first decoder input.
      NOTE(review): LangPorcess in this notebook assigns index 0 to 'SOS',
      while the default here is 1 -- confirm which one is intended.
  """

  def __init__(self,
               vocab_size_tr,
               embedding_dim,
               hidden_size,
               max_len=20,
               sos_token=1,
               ):
    super().__init__()

    self.MAX_LEN = max_len
    self.SOS_TOKEN = sos_token

    # Layers Initialization
    self.embedding_layer = nn.Embedding(vocab_size_tr, embedding_dim)
    self.lstm_layer = nn.LSTM(embedding_dim, hidden_size, batch_first=True)
    self.fnn = nn.Linear(hidden_size, vocab_size_tr)
    self.attention_vector = Attention(hidden_size)

  def forward(self,
              encoder_outputs,
              hidden_state,
              cell_state,
              target_output=None):
    """Decode step by step, optionally with teacher forcing.

    Args:
      encoder_outputs: Tensor of shape (batch, timesteps, hidden_size).
      hidden_state: Initial hidden state, shape (1, batch, hidden_size).
      cell_state: Initial cell state, shape (1, batch, hidden_size).
      target_output: Optional integer tensor (batch, target_len). When given,
        its tokens are fed as the next inputs (teacher forcing) and decoding
        runs for ``min(max_len, target_len)`` steps.

    Returns:
      Logits of shape (batch, steps, vocab_size_tr).
    """
    batch_size = encoder_outputs.shape[0]
    # Start every sequence with the SOS token, on the same device as the encoder outputs.
    decoder_input = torch.full((batch_size, 1),
                               self.SOS_TOKEN,
                               dtype=torch.long,
                               device=encoder_outputs.device)
    teacher_forcing = target_output is not None  # a tensor has no unambiguous truth value
    num_steps = self.MAX_LEN
    if teacher_forcing:
      # Never index past the end of the target sequence.
      num_steps = min(self.MAX_LEN, target_output.size(1))

    decoder_outputs = []
    for i in range(num_steps):
      output_logits, hidden_state, cell_state = self.forward_step(encoder_outputs,
                                                                  decoder_input,
                                                                  hidden_state,
                                                                  cell_state)
      decoder_outputs.append(output_logits.unsqueeze(1))  # (batch, 1, vocab_size)

      if teacher_forcing:
        decoder_input = target_output[:, i].unsqueeze(1)
      else:
        # Greedy choice: feed the most likely token back in.
        _, decoder_input = output_logits.topk(1, dim=-1)
        decoder_input = decoder_input.detach()

    decoder_final_output = torch.cat(decoder_outputs, dim=1)  # (batch, steps, vocab_size)
    return decoder_final_output

  def forward_step(self, encoder_outputs, decoder_input, hidden_state, cell_state):
    """Run one decoding step.

    Args:
      encoder_outputs: Tensor of shape (batch, timesteps, hidden_size).
      decoder_input: Integer tensor of shape (batch, 1) with the current tokens.
      hidden_state: Hidden state fed to the LSTM, shape (1, batch, hidden_size).
      cell_state: Cell state fed to the LSTM, shape (1, batch, hidden_size).

    Returns:
      Tuple ``(output_logit, hidden_state, cell_state)``: logits of shape
      (batch, vocab_size_tr), the attention context used as the next hidden
      state, and the updated LSTM cell state.
    """
    embedded_decoder_input = self.embedding_layer(decoder_input)  # (batch, 1, embedding_dim)
    lstm_output, (decoder_hidden, decoder_cell) = self.lstm_layer(embedded_decoder_input,
                                                                  (hidden_state, cell_state))  # lstm_output: (batch, 1, hidden_size)
    output_logit = self.fnn(lstm_output.squeeze(1))  # (batch, vocab_size_tr)
    # The attention context over the encoder outputs becomes the next hidden state.
    hidden_state = self.attention_vector(encoder_outputs, decoder_hidden)

    # Return the updated cell state so the LSTM memory is carried across steps.
    return output_logit, hidden_state, decoder_cell

In [None]:
class Seq2SeqAttentionModel(nn.Module):
  """Encoder-decoder model wiring ``Encoder`` and ``Decoder`` together.

  Args:
    embedding_size: Embedding dimension used by both encoder and decoder.
    hidden_size: LSTM hidden dimension used by both encoder and decoder.
    vocab_size_en: Size of the source (encoder) vocabulary.
    vocab_size_tr: Size of the target (decoder) vocabulary.
    max_len: Maximum number of decoding steps passed to the decoder.
    sos_token: Start-of-sequence index passed to the decoder.
  """

  def __init__(self,
               embedding_size,
               hidden_size,
               vocab_size_en,
               vocab_size_tr,
               max_len=20,
               sos_token=1):
    # nn.Module.__init__ takes no positional arguments.
    super().__init__()

    self.encoder = Encoder(embedding_size,
                           hidden_size,
                           vocab_size_en)

    # Keyword arguments: Decoder's positional order is
    # (vocab_size_tr, embedding_dim, hidden_size), not the encoder's order.
    self.decoder = Decoder(vocab_size_tr=vocab_size_tr,
                           embedding_dim=embedding_size,
                           hidden_size=hidden_size,
                           max_len=max_len,
                           sos_token=sos_token)

  def forward(self, input, target_output=None):
    """Encode ``input`` and decode it into target-vocabulary logits.

    Args:
      input: Integer tensor of source token indices, shape (batch, timesteps).
      target_output: Optional integer tensor of target token indices, handed
        to the decoder for teacher forcing.

    Returns:
      Whatever the decoder returns for the given inputs.
    """
    encoder_outputs, encoder_hidden_state, encoder_cell_state = self.encoder(input)
    decoder_output = self.decoder(encoder_outputs, encoder_hidden_state, encoder_cell_state, target_output)

    return decoder_output
