### Downloading the dataset

In [1]:
!wget http://www.manythings.org/anki/fra-eng.zip

--2021-01-21 13:33:24--  http://www.manythings.org/anki/fra-eng.zip
Resolving www.manythings.org (www.manythings.org)... 104.21.55.222, 172.67.173.198, 2606:4700:3036::ac43:adc6, ...
Connecting to www.manythings.org (www.manythings.org)|104.21.55.222|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6129887 (5.8M) [application/zip]
Saving to: ‘fra-eng.zip’


2021-01-21 13:33:26 (5.13 MB/s) - ‘fra-eng.zip’ saved [6129887/6129887]



### Opening the dataset

In [2]:
!unzip ./fra-eng.zip

Archive:  ./fra-eng.zip
  inflating: _about.txt              
  inflating: fra.txt                 


### Importing required libraries

In [3]:
import numpy as np
import re
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping,ModelCheckpoint
from tensorflow.keras.models import Model,Sequential
from tensorflow.keras.layers import Dense,LSTM,Input,Embedding,TimeDistributed,RepeatVector
from nltk.translate.bleu_score import SmoothingFunction,corpus_bleu
smoothie = SmoothingFunction().method4

### Data cleaning and train_test_split

In [4]:
data_path = './fra.txt' # path of the file
num_sentences = 20000 # no of sentences from the dataset that we are going to use

# opening the text file and getting the data 
with open(data_path,'r') as f:
    lines = f.read().split('\n')

    
c=0 # to count the number of sentences

# data cleaning
source_texts,target_texts = [],[]
for line in lines: # going through each lines
    if c == num_sentences: # if we have 20000 sentences than we will get out of this loop
        break 
    elif '\t' in line:
        op_data,ip_data,_ = line.lower().rstrip().split('\t') # lowering the data and then spliting the data
        
        # to remove the punctuation we did not include last character
        source_text = ip_data[:-1].strip()
        target_text = op_data[:-1].strip()
        # removing the unprintable character
        # for english and french we will take anly alphabets of brespective languages and numbers
        target_text = re.sub("[^a-z 1-9\'-]","",target_text) 
        source_text = re.sub("[^a-zàâãçéèêëîïôœùûüÿ 1-9\'-]","",source_text) 
        
        source_texts.append(source_text)
        target_texts.append(target_text)
        c+=1

# train_test_split of the source and target data
source_train,source_test,target_train,target_test = train_test_split(source_texts,target_texts,test_size = 0.2, random_state= 0)

### Making the required functions for the data preprocessing

In [5]:
# tokenizer for data
def create_tokenizer(texts):
    tokenizer = Tokenizer(oov_token='<UNK>')
    tokenizer.fit_on_texts(texts)
    return tokenizer

# one_hot encoding of the target data
def one_hot(pad_seq,max_sent_length,num_vocab):
    target_data_one_hot = np.zeros((len(pad_seq),max_sent_length,num_vocab))
    for i,w in enumerate(pad_seq):
        for j,d in enumerate(w):
            target_data_one_hot[i,j,d] = 1
    return target_data_one_hot

# for padding the data
def encoding_text(tokenizer,text,max_length):
    text_seq = tokenizer.texts_to_sequences(text)
    pad_seq = pad_sequences(text_seq,maxlen= max_length)
    return pad_seq

# to find the maximum length of the sentence from data
def max_length(text):
    return max(len(l.split()) for l in text)
    

### Preparing training and testing data

In [6]:
# preparing source tokenizer and getting relevant information
source_tokenizer = create_tokenizer(source_train)
source_vocab = source_tokenizer.word_index
num_source_vocab = len(source_vocab)+1
max_source_length = max_length(source_train)

# preparing target tokenizer and getting relevant information
target_tokenizer = create_tokenizer(target_train)
target_vocab = target_tokenizer.word_index
num_target_vocab = len(target_vocab)+1
max_target_length = max_length(target_train)

# preparing the training data
source_train_seq_pad = encoding_text(source_tokenizer,source_train,max_source_length) # padding of the source sentences
target_train_seq_pad = encoding_text(target_tokenizer,target_train,max_target_length) # padding of the target sentences
target_train_seq_pad = one_hot(target_train_seq_pad,max_target_length,num_target_vocab) # one hot encoding of the padded target senteces

# preparing the test data
source_test_seq_pad = encoding_text(source_tokenizer,source_test,max_source_length) # padding of the source sentences
target_test_seq_pad = encoding_text(target_tokenizer,target_test,max_target_length) # padding of the target sentences
target_test_seq_pad = one_hot(target_test_seq_pad,max_target_length,num_target_vocab) # one hot encoding of the padded target senteces
 
print(num_source_vocab,num_target_vocab,max_source_length,max_target_length)

5989 3227 11 5


### Preparing and running the Autoencoder model 

In [7]:
model = Sequential()
model.add(Input(shape=(max_source_length,)))
model.add(Embedding(num_source_vocab,512,mask_zero=True))
model.add(LSTM(512,return_sequences = False))
model.add(RepeatVector(max_target_length))
model.add(LSTM(512,return_sequences = True))
model.add(TimeDistributed(Dense(num_target_vocab,activation = 'softmax')))

model.compile(optimizer='rmsprop', loss='categorical_crossentropy',metrics=['acc'])

model.summary()

es = EarlyStopping(monitor='val_acc',patience= 5,min_delta=0.01) # EarlyStoping callback to stop the fitting before all epochs
filepath = './fre2eng.h5' # filepath required for checkpoint
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max') # ModelCheckPoint to save the best model

history = model.fit(source_train_seq_pad, target_train_seq_pad, 
                    epochs= 50,
                    batch_size=64, 
                    validation_data = (source_test_seq_pad,target_test_seq_pad), 
                    verbose=1,
                    callbacks=[checkpoint,es])

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 11, 512)           3066368   
_________________________________________________________________
lstm (LSTM)                  (None, 512)               2099200   
_________________________________________________________________
repeat_vector (RepeatVector) (None, 5, 512)            0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 5, 512)            2099200   
_________________________________________________________________
time_distributed (TimeDistri (None, 5, 3227)           1655451   
Total params: 8,920,219
Trainable params: 8,920,219
Non-trainable params: 0
_________________________________________________________________
Epoch 1/50
Epoch 00001: val_acc improved from -inf to 0.41795, saving model to ./fre2eng.h5
Epoch 2/50
Epoch 000

In [8]:
# loading the weights from the best saved model
model.load_weights(filepath)

### Making the functions to predict the sequence and BLEU_sccore

In [9]:
# a dictionary having key is a token number for a particular word and value is a word
# this will required to decode the predicted sequence
target_vocab_idx = {v:k for k,v in target_tokenizer.word_index.items()}

# function to predict the decoded sequence
def predict_sequence(model,sent,vocab_idx):
    prediction = model.predict(sent.reshape(1,max_source_length))[0]
    integers = [np.argmax(vector) for vector in prediction]
    target = []
    for i in integers:
        if i != 0:
            word = vocab_idx[i]
            if word is None:
                break
            target.append(word)
            
    return ' '.join(target)

# for evaluation of the model through BLEU_score
def bleu_score(model,ip,ip_raw,op_raw,vocab_idx):
    
    prediction,actual = [],[]
    for i,sent in enumerate(ip):
        
        if i%10 == 0: # to print the progress
            print('\rprogress ',(i+1)*100//len(ip),'%',sep='',end='',flush = True)
        
        translation = predict_sequence(model,sent,vocab_idx)
        
        prediction.append(translation)
        actual.append(op_raw[i])
    
    print()
    # printing the first ten sentences
    for i in range(10):
        print('French_sentence -',ip_raw[i],' | ',
            'English_actual_sentence -',op_raw[i],' | ',
            'English_predicted_sentence -',prediction[i])
    
    print()
    # printing the BLEU_score
    print('BLEU_SCORE')
    print('BLEU score-1: %f' % corpus_bleu(actual, prediction, weights=(1.0, 0, 0, 0),smoothing_function=smoothie,auto_reweigh=False))
    print('BLEU score-2: %f' % corpus_bleu(actual, prediction, weights=(0.5, 0.5, 0, 0),smoothing_function=smoothie,auto_reweigh=False))
    print('BLEU score-3: %f' % corpus_bleu(actual, prediction, weights=(0.3, 0.3, 0.3, 0),smoothing_function=smoothie,auto_reweigh=False))
    print('BLEU score-4: %f' % corpus_bleu(actual, prediction, weights=(0.25, 0.25, 0.25, 0.25),smoothing_function=smoothie,auto_reweigh=False))

### Evaluating the model on training dataset

In [10]:
bleu_score(model,source_train_seq_pad,source_train,target_train,target_vocab_idx)

progress 99%
French_sentence - nous savons tout cela  |  English_actual_sentence - we know all this  |  English_predicted_sentence - we know all this
French_sentence - tu n'as pas de vie  |  English_actual_sentence - you have no life  |  English_predicted_sentence - you have no life
French_sentence - je me suis senti triste  |  English_actual_sentence - i felt sad  |  English_predicted_sentence - i felt sad
French_sentence - donnez-moi le vin  |  English_actual_sentence - give me the wine  |  English_predicted_sentence - give me the wine
French_sentence - nous sommes en train d'attendre  |  English_actual_sentence - we're waiting  |  English_predicted_sentence - we're talking
French_sentence - ce n'est pas équitable  |  English_actual_sentence - it's not fair  |  English_predicted_sentence - it's not fair
French_sentence - c'est un vieux de la vieille  |  English_actual_sentence - he's an old timer  |  English_predicted_sentence - he's an old injured
French_sentence - je l'apprécie  | 

### Evaluating the model on testing dataset

In [11]:
bleu_score(model,source_test_seq_pad,source_test,target_test,target_vocab_idx)

progress 99%
French_sentence - ne virez personne  |  English_actual_sentence - don't fire anyone  |  English_predicted_sentence - no anyone
French_sentence - qui l'a bâti  |  English_actual_sentence - who built it  |  English_predicted_sentence - who built it
French_sentence - puis-je revenir  |  English_actual_sentence - may i come again  |  English_predicted_sentence - can i i out
French_sentence - ne répondez pas à cela  |  English_actual_sentence - don't answer that  |  English_predicted_sentence - don't answer that
French_sentence - ça a marché  |  English_actual_sentence - did that work  |  English_predicted_sentence - it worked
French_sentence - ferme-la  |  English_actual_sentence - shut up  |  English_predicted_sentence - shut shut up
French_sentence - nous y sommes  |  English_actual_sentence - we are here  |  English_predicted_sentence - we everyone
French_sentence - je dispose de l'immunité  |  English_actual_sentence - i have immunity  |  English_predicted_sentence - i hav