In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense, Embedding, LSTMCell, LSTM, Input, RNN, Activation, add, concatenate
from tensorflow.keras.optimizers import Adam
from tqdm import tqdm
import os
from datetime import datetime
import warnings
import re
warnings.filterwarnings("ignore")
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
%reload_ext tensorboard

In [1]:
data = "ita.txt" ## contains the italian to english
# Visualizing a sample of ita.txt: print the first two raw lines.
# The corpus holds accented Italian letters, so decode it explicitly as UTF-8
# instead of relying on the platform's default encoding (which differs
# between operating systems and produces mojibake when it is not UTF-8).
with open(data, "r", encoding="utf-8") as f:
    print(f.readline())
    print(f.readline())

Hi.	Ciao!	CC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #607364 (Cero)

Hi.	Ciao.	CC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #4522287 (Guybrush88)



<pre><b>Observation:</b>

Each line contains an English sentence, its Italian translation, and a copyright/attribution notice,
separated by tabs.</pre>


In [5]:
class process_data:
    """Load a tab-separated English/Italian sentence file and normalise its text.

    Each line of the file is expected to look like
    ``english<TAB>italian<TAB>attribution``; only the first two fields are kept.

    Parameters
    ----------
    data : str
        Path of the text file to read.
    """

    def __init__(self,data):
        self.data = data
        self.english = []
        self.italian = []

    def readFile(self):
        """Read the file and return a DataFrame with ``english`` and ``italian`` columns.

        Lines that do not contain at least two tab-separated fields (for
        example blank lines) are skipped instead of raising ``ValueError``.
        """
        # Start from empty lists so that calling readFile() twice does not
        # duplicate every row.
        self.english = []
        self.italian = []

        # Decode explicitly as UTF-8 so the result does not depend on the
        # platform's default encoding; undecodable bytes are still ignored.
        with open(self.data, encoding="utf-8", errors="ignore") as f:
            for line in f:
                fields = line.rstrip("\r\n").split("\t")
                if len(fields) < 2:
                    continue
                self.english.append(fields[0])
                self.italian.append(fields[1])

        return pd.DataFrame({
            "english": self.english,
            "italian": self.italian
        })

    def cleanText(self,data):
        """Normalise one sentence.

        Steps:
        1. Lower-case the text.
        2. Replace every run of characters other than ASCII letters/digits
           with a single space.
        3. Replace every digit with a space.
        4. Collapse repeated whitespace and strip leading/trailing spaces.

        NOTE: step 2 keeps ASCII letters only, so accented letters are
        removed as well.
        """
        data = data.lower()
        data = re.sub("[^A-Za-z0-9]+"," ",data)
        data = re.sub("[0-9]"," ",data)
        # split() with no argument splits on any whitespace run and drops
        # empty pieces; split(" ") would keep them and make the join a no-op.
        data = " ".join(data.split())
        return data

    def main(self):
        """Read the file, clean both languages and return the processed frame.

        Returns a DataFrame with the columns ``processed_english`` and
        ``processed_italian``.
        """
        data = self.readFile()
        data["processed_english"] = data["english"].apply(self.cleanText)
        data["processed_italian"] = data["italian"].apply(self.cleanText)

        return data.drop(columns=["english","italian"])
    

In [6]:
# Time the full read-and-clean pass over the corpus, then preview a few rows.
start_time = datetime.now()
data = process_data("ita.txt").main()
elapsed = datetime.now() - start_time
print("Total time taken to process: ", elapsed)
data.sample(5)

Total time taken to process:  0:00:10.271851


Unnamed: 0,processed_english,processed_italian
268954,i want you to stay here with her,voglio che tu stia qui con lei
94691,there s little to do,c poco da fare
348748,she came to my defence when i was accused of p...,lei venuta in mia difesa quando sono stata acc...
213269,he can play tennis very well,sa giocare a tennis molto bene
4988,i got angry,mi sono arrabbiato


In [7]:
# Wrap the Italian (encoder) text in <start>/<end> markers and derive the two
# decoder columns from the English text: the decoder input carries <start>,
# the decoder output carries <end>.
italian_marked = "<start> " + data["processed_italian"] + " <end>"
english_text = data["processed_english"]

data["decoder_output"] = english_text + " <end>"
data["decoder_input"] = "<start> " + english_text
data.loc[:, "processed_italian"] = italian_marked

In [None]:
# Drop processed_english: its text is already carried by the
# decoder_input and decoder_output columns.
data.drop("processed_english", axis="columns", inplace=True)

In [None]:
## add one <end> to the first decoder input so the token is seen during tokenization
# Write through a single positional indexer. The chained form
# data.iloc[0]["decoder_input"] = ... assigns into the temporary Series that
# data.iloc[0] returns, which is not guaranteed to update `data` itself.
decoder_input_col = data.columns.get_loc("decoder_input")
data.iloc[0, decoder_input_col] = data.iloc[0, decoder_input_col] + " <end>"

## Train Test Split

In [None]:
# Hold out 20% of the rows for test, then 20% of the remainder for
# cross-validation. A fixed random_state keeps the shuffled splits (and the
# vocabularies later fitted on df_train) identical across kernel restarts.
df_train,df_test = train_test_split(data,shuffle=True,test_size=0.2,random_state=42)
df_train,df_cv = train_test_split(df_train,shuffle=True,test_size=0.2,random_state=42)

In [None]:
# Report the number of rows in each split.
print("Shape of Train : ", len(df_train))
print(f"Shape of CV : {len(df_cv)}")
print(f"Shape of Test : {len(df_test)}")

## Tokenization

##### Encoder inputs

In [60]:
# The Italian (encoder-side) vocabulary is fitted on the training split only;
# filters="" keeps the <start>/<end> markers intact.
tokenizer_encoder = Tokenizer(oov_token='UNK', filters="")
tokenizer_encoder.fit_on_texts(df_train["processed_italian"])

# Convert the train and cross-validation sentences to integer sequences.
encoder_input_train, encoder_input_cv = (
    tokenizer_encoder.texts_to_sequences(split["processed_italian"])
    for split in (df_train, df_cv)
)

encoder_word_to_int = tokenizer_encoder.word_index
encoder_vocab_size = 1 + len(encoder_word_to_int) ## used in embedding

In [None]:
#### Decoder input and output

In [61]:
tokenizer_decoder = Tokenizer(filters="",oov_token="UNK")
# Fit on the decoder inputs AND the decoder outputs of the training split.
# "<start>" occurs in decoder_input and "<end>" in decoder_output; fitting on
# decoder_input alone leaves "<end>" out of the vocabulary (so it would be
# encoded as UNK) unless a row containing it happens to be in df_train.
tokenizer_decoder.fit_on_texts(
    list(df_train["decoder_input"]) + list(df_train["decoder_output"])
)
## for decoder input
decoder_input_train = tokenizer_decoder.texts_to_sequences(df_train["decoder_input"])
decoder_input_cv = tokenizer_decoder.texts_to_sequences(df_cv["decoder_input"])
## for decoder output
decoder_output_train = tokenizer_decoder.texts_to_sequences(df_train["decoder_output"])
decoder_output_cv = tokenizer_decoder.texts_to_sequences(df_cv["decoder_output"])
decoder_word_to_int = tokenizer_decoder.word_index
decoder_vocab_size = len(decoder_word_to_int)+1 ## +1 because word indices start at 1

In [62]:
## Padding

In [None]:
## find the optimal length for padding
# Number of space-separated tokens per Italian sentence.
count = data["processed_italian"].apply(lambda x:len(x.split(" ")))
plt.figure(figsize=(16,6))
# Pass the Series as `x` explicitly: the first positional parameter of
# countplot is `data` in recent seaborn releases, not `x`.
c = sns.countplot(x=count)
c.set(xlabel="Tokens per Italian sentence",
      ylabel="Number of sentences",
      title="Distribution of Italian sentence lengths")
plt.show()

#### Observation
<pre>
20 seems favourable for padding
</pre>

In [63]:
def padding(data, max_len=20):
    """Pad or truncate every sequence in ``data`` to exactly ``max_len`` items.

    Both padding and truncation are applied at the end of each sequence
    ('post').
    """
    pad_options = dict(maxlen=max_len, padding='post', truncating='post')
    return pad_sequences(data, **pad_options)

In [None]:
## encoder: bring the train and cross-validation sequences to a fixed length
encoder_input_train, encoder_input_cv = (
    padding(sequences) for sequences in (encoder_input_train, encoder_input_cv)
)



In [None]:
## decoder inputs: fixed-length train / cross-validation sequences
decoder_input_train, decoder_input_cv = (
    padding(sequences) for sequences in (decoder_input_train, decoder_input_cv)
)
## decoder outputs: same treatment
decoder_output_train, decoder_output_cv = (
    padding(sequences) for sequences in (decoder_output_train, decoder_output_cv)
)

In [None]:
# Show the shape of each padded training array.
for caption, array in (("Shape of encoder input: ", encoder_input_train),
                       ("Shape of decoder input: ", decoder_input_train),
                       ("Shape of decoder output: ", decoder_output_train)):
    print(caption, array.shape)

In [None]:
## Dataloader

In [65]:
class dataLoader(tf.keras.utils.Sequence):
    """Serve (encoder input, decoder input) -> decoder output batches to Keras.

    Parameters
    ----------
    encoder_input, decoder_input, decoder_output :
        Index-aligned collections of sequences; row ``k`` of each belongs to
        the same sentence pair.
    batch : int
        Number of rows per batch.
    """

    def __init__(self,encoder_input,decoder_input,decoder_output,batch):
        # Let the Keras base class set up its own state.
        super().__init__()
        self.encoder_input = encoder_input
        self.decoder_input = decoder_input
        self.decoder_output = decoder_output
        self.batch_size = batch

    def __getitem__(self,i):
        """Return batch ``i`` as ``([encoder, decoder_input], decoder_output)`` arrays.

        Raises ``IndexError`` when ``i`` is outside ``range(len(self))``.
        """
        if not 0 <= i < len(self):
            raise IndexError(f"batch index {i} out of range for {len(self)} batches")

        # One slice per array replaces the row-by-row Python loop.
        rows = slice(i * self.batch_size, (i + 1) * self.batch_size)

        return ([np.array(self.encoder_input[rows]),
                 np.array(self.decoder_input[rows])],
                np.array(self.decoder_output[rows]))

    def __len__(self):
        """Number of full batches; a trailing partial batch is not served."""
        return len(self.encoder_input)//self.batch_size

In [66]:
class Encoder(tf.keras.Model):
    """Embedding layer followed by a single LSTM layer.

    The LSTM returns both its per-timestep outputs and its final hidden and
    cell states.
    """

    def __init__(self, inp_vocab_size, embedding_size, lstm_size, input_length):
        super().__init__()
        # Token ids -> dense vectors.
        self.embedding = Embedding(inp_vocab_size,
                                   embedding_size,
                                   input_length=input_length)
        # Kept so initialize_states() can build matching zero states.
        self.lstm_size = lstm_size
        # LSTM layer built from an explicit cell wrapped in RNN.
        self.lstm = RNN(LSTMCell(lstm_size),
                        return_sequences=True,
                        return_state=True)

    def call(self, input_sequence, states):
        """Embed ``input_sequence`` and run the LSTM starting from ``states``.

        Returns ``(outputs, hidden_state, cell_state)``.
        """
        embedded = self.embedding(input_sequence)
        outputs, hidden_state, cell_state = self.lstm(embedded, initial_state=states)
        return outputs, hidden_state, cell_state

    def initialize_states(self, batch_size):
        """Return zero-filled hidden and cell states of shape ``(batch_size, lstm_size)``."""
        state_shape = (batch_size, self.lstm_size)
        return tf.zeros(state_shape), tf.zeros(state_shape)

In [67]:
class Decoder(tf.keras.Model):
    """Embedding layer followed by a single LSTM layer for the target side.

    The LSTM is started from states supplied by the caller and returns its
    per-timestep outputs together with its final hidden and cell states.
    """

    def __init__(self, out_vocab_size, embedding_size, lstm_size, input_length):
        super().__init__()
        # Token ids -> dense vectors.
        self.embedding = Embedding(out_vocab_size,
                                   embedding_size,
                                   input_length=input_length)
        # LSTM layer built from an explicit cell wrapped in RNN.
        self.lstm = RNN(LSTMCell(lstm_size),
                        return_sequences=True,
                        return_state=True)

    def call(self, input_sequence, initial_states):
        """Embed ``input_sequence`` and run the LSTM from ``initial_states``.

        Returns ``(outputs, hidden_state, cell_state)``.
        """
        embedded = self.embedding(input_sequence)
        outputs, hidden_state, cell_state = self.lstm(embedded,
                                                      initial_state=initial_states)
        return outputs, hidden_state, cell_state

In [68]:
class encoderDecoder(tf.keras.Model):
    """Sequence-to-sequence model: Encoder -> Decoder -> Dense over the output vocabulary.

    The decoder is initialised with the encoder's final hidden and cell
    states. The final Dense layer has no activation, so the model returns
    unnormalised scores, one vector of size ``output_vocab_size`` per
    decoder timestep.
    """

    def __init__(self,
                 input_vocab_size,
                 output_vocab_size,
                 lstm_size,
                 embedding_size,
                 encoder_input_length,
                 decoder_input_length,
                 batch_size):
        super().__init__()
        # Stored only; call() derives the batch size from its input instead.
        self.batch_size = batch_size
        self.encoder = Encoder(input_vocab_size, embedding_size, lstm_size, encoder_input_length)
        self.decoder = Decoder(output_vocab_size, embedding_size, lstm_size, decoder_input_length)
        self.dense = Dense(output_vocab_size)

    def call(self, inputs):
        """Run one teacher-forced pass.

        ``inputs[0]`` holds the encoder token ids and ``inputs[1]`` the
        decoder token ids.
        """
        source_ids, target_ids = inputs[0], inputs[1]

        # Zero states sized to the actual batch that was passed in.
        zero_states = self.encoder.initialize_states(tf.shape(source_ids)[0])
        _, final_h, final_c = self.encoder(source_ids, zero_states)

        decoder_outputs, _, _ = self.decoder(target_ids, [final_h, final_c])
        return self.dense(decoder_outputs)
    

In [69]:
!rm -rf ./logs/
logdir = os.path.join('logs', datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
%tensorboard --logdir $logdir

In [70]:
tf.keras.backend.clear_session()
#defining the model
vanilla_encoder_decoder = encoderDecoder(input_vocab_size = encoder_vocab_size,
                                         output_vocab_size = decoder_vocab_size,
                                         lstm_size = 350,
                                         embedding_size = 350,
                                         encoder_input_length = encoder_input_train.shape[-1],
                                         decoder_input_length = decoder_input_train.shape[-1],
                                         batch_size = 64
                                        )
#defining callbacks
# Stop as soon as val_loss fails to improve by at least 0.01 for one epoch.
callbacks = [

             tf.keras.callbacks.EarlyStopping(monitor = 'val_loss', min_delta = 0.01, patience = 1)]

#              tf.keras.callbacks.ModelCheckpoint('vanilla_enc_dec_{epoch}', monitor = 'val_loss', save_best_only  = True,save_format="tf") ]
#compiling the model
# The model's last layer is a Dense layer without activation, i.e. it outputs
# raw logits. The string loss 'sparse_categorical_crossentropy' assumes
# probabilities, so the loss object is built explicitly with from_logits=True.
vanilla_encoder_decoder.compile(optimizer = tf.keras.optimizers.Adam(learning_rate = 0.002),
                                loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits = True),
                                metrics = ["acc"])



In [74]:
batch_size = 256

# Batch generators over the padded training and cross-validation arrays.
train_data_generator = dataLoader(encoder_input = encoder_input_train,
                                  decoder_input = decoder_input_train,
                                  decoder_output = decoder_output_train,
                                  batch = batch_size)
val_data_generator = dataLoader(encoder_input = encoder_input_cv,
                                decoder_input = decoder_input_cv,
                                decoder_output = decoder_output_cv,
                                batch = batch_size)

# Train for at most 5 epochs; the callbacks list may stop training earlier.
vanilla_encoder_decoder.fit(train_data_generator,
                            epochs = 5,
                            validation_data = val_data_generator,
                            callbacks = callbacks)



<keras.callbacks.History at 0x17e75fd50d0>

In [None]:
# Reverse lookup for the decoder vocabulary: integer id -> word.
decoder_int_to_word = {index: word for word, index in decoder_word_to_int.items()}

In [75]:
def prediction(text, max_len=20):
    """Greedily decode a translation for one source sentence.

    The sentence is tokenized and padded like the training data, encoded
    once, and then decoded one token at a time: each step feeds the
    previously predicted token and the previous LSTM states back into the
    decoder.

    Parameters
    ----------
    text : str
        Source sentence, formatted like the rows the encoder tokenizer was
        fitted on.
    max_len : int, default 20
        Maximum number of words to generate.

    Returns
    -------
    str
        The predicted words joined by spaces. Decoding stops after "<end>"
        is produced, after ``max_len`` words, or when the predicted index
        has no entry in ``decoder_int_to_word`` (e.g. the padding index 0).
    """
    # Use the named sub-layers rather than positions in `.layers`.
    model = vanilla_encoder_decoder

    # Tokenize and pad the source sentence, then encode it once.
    encoder_inputs = padding(tokenizer_encoder.texts_to_sequences([text]))
    initial_state = model.encoder.initialize_states(1)
    enc_out, enc_h_state, enc_c_state = model.encoder(encoder_inputs, initial_state)
    decoder_states = [enc_h_state, enc_c_state]

    # Decoding starts from the <start> token.
    decoder_step_input = np.array([[tokenizer_decoder.word_index['<start>']]])
    predicted_words = []

    while len(predicted_words) < max_len:
        dec_out, dec_h_state, dec_c_state = model.decoder(decoder_step_input,
                                                          initial_states=decoder_states)
        pred_vector = model.dense(dec_out)
        # The step output covers a single batch row and timestep, so the flat
        # argmax is the vocabulary index.
        pred_index = int(np.argmax(pred_vector))

        pre_word = decoder_int_to_word.get(pred_index)
        if pre_word is None:
            # No word for this index (such as padding id 0): stop instead of
            # raising KeyError.
            break

        predicted_words.append(pre_word)
        if pre_word == "<end>":
            break

        # Feed the prediction and the updated states into the next step.
        decoder_states = [dec_h_state, dec_c_state]
        decoder_step_input = np.array([[pred_index]])

    return " ".join(predicted_words)
        

In [76]:
# Collector for per-sentence BLEU scores, followed by a separator line.
bleu_score = list()
print(50 * "=")
#prediction




Italian sentence: <start> non ã¨ il nostro problema ora <end>
Actual Translation: that is not our problem now <end>
Predicted Translation: that is not our problem now <end>


Italian sentence: <start> perchã© state cercando di perdere peso <end>
Actual Translation: why are you trying to lose weight <end>
Predicted Translation: why are you trying to lose weight <end>


Italian sentence: <start> lasciatemi dire perchã© a me non piace tom <end>
Actual Translation: let me tell you why i do not like tom <end>
Predicted Translation: let me tell me how to do that and do that <end>


Italian sentence: <start> pensavo che tom non l avrebbe mai piã¹ rivista <end>
Actual Translation: i thought tom would never see you again <end>
Predicted Translation: i thought tom would never take the gun <end>


Italian sentence: <start> tom andã² via <end>
Actual Translation: tom went away <end>
Predicted Translation: tom went away <end>


Italian sentence: <start> ãˆ rischioso per te andare in quella zona da