In [2]:
# Fetching the Dataset - Run only once
!wget -P /content "http://statmt.org/europarl/v7/fr-en.tgz"
!tar -zxvf "/content/fr-en.tgz" -C "/content" 

--2021-12-01 04:10:56--  http://statmt.org/europarl/v7/fr-en.tgz
Resolving statmt.org (statmt.org)... 129.215.197.184
Connecting to statmt.org (statmt.org)|129.215.197.184|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 202718517 (193M) [application/x-gzip]
Saving to: ‘/content/fr-en.tgz.1’

fr-en.tgz.1           0%[                    ] 488.00K   141KB/s    eta 23m 20s^C
europarl-v7.fr-en.en
^C


In [2]:
# Imports

# Data preprocessing
import numpy as np
import pandas as pd
  
# NLP and RNN
import tensorflow as tf
import random
from math import ceil
from typing import Union
from os import mkdir
from tensorflow.keras.preprocessing.text import Tokenizer

In [12]:
# Load Data
french = pd.read_csv("europarl-v7.fr-en.fr", delimiter = "\n", encoding = "utf-8",
                     header = None, error_bad_lines = False)

english = pd.read_csv("europarl-v7.fr-en.en", delimiter = "\n", encoding = "utf-8",
                      header = None, error_bad_lines = False)

b'Skipping line 20094: expected 1 fields, saw 2\nSkipping line 27726: expected 1 fields, saw 2\nSkipping line 28814: expected 1 fields, saw 2\nSkipping line 30220: expected 1 fields, saw 2\nSkipping line 30221: expected 1 fields, saw 2\nSkipping line 32604: expected 1 fields, saw 2\nSkipping line 41903: expected 1 fields, saw 2\nSkipping line 53850: expected 1 fields, saw 2\nSkipping line 54949: expected 1 fields, saw 2\nSkipping line 59494: expected 1 fields, saw 4\nSkipping line 59496: expected 1 fields, saw 3\nSkipping line 79555: expected 1 fields, saw 2\nSkipping line 95031: expected 1 fields, saw 2\nSkipping line 95064: expected 1 fields, saw 2\nSkipping line 98382: expected 1 fields, saw 2\nSkipping line 101181: expected 1 fields, saw 2\nSkipping line 114498: expected 1 fields, saw 2\nSkipping line 115143: expected 1 fields, saw 2\nSkipping line 125396: expected 1 fields, saw 8\nSkipping line 128023: expected 1 fields, saw 2\nSkipping line 128626: expected 1 fields, saw 2\nSkipp

In [None]:
""" Preprocessing """

' Preprocessing '

In [13]:
# Step 1: Data subset selection

# we select the first 7465 rows because there are no lines removed in that 
# section of the dataset
french = french[0:500]
english = english[0:500]

In [14]:
# Step 2: Dataframe concatenation
data = pd.concat([english, french], axis = 1)
data.columns = ['English', 'French']

data.head()

Unnamed: 0,English,French
0,Resumption of the session,Reprise de la session
1,I declare resumed the session of the European ...,Je déclare reprise la session du Parlement eur...
2,"Although, as you will have seen, the dreaded '...","Comme vous avez pu le constater, le grand ""bog..."
3,You have requested a debate on this subject in...,Vous avez souhaité un débat à ce sujet dans le...
4,"In the meantime, I should like to observe a mi...","En attendant, je souhaiterais, comme un certai..."


In [15]:
# Step 3: Convert data to lowercase
data['English'] = data['English'].apply(lambda x: x.lower()) # convert to lowercase
data['French'] = data['French'].apply(lambda x: x.lower())

data.head()

Unnamed: 0,English,French
0,resumption of the session,reprise de la session
1,i declare resumed the session of the european ...,je déclare reprise la session du parlement eur...
2,"although, as you will have seen, the dreaded '...","comme vous avez pu le constater, le grand ""bog..."
3,you have requested a debate on this subject in...,vous avez souhaité un débat à ce sujet dans le...
4,"in the meantime, i should like to observe a mi...","en attendant, je souhaiterais, comme un certai..."


In [16]:
# Step 4: Word Indexing

en_tokenizer = Tokenizer() # initiailize
fr_tokenizer = Tokenizer()
print(data['English'][0])
en_tokenizer.fit_on_texts(data['English']) # fit each tokenizer on respective
fr_tokenizer.fit_on_texts(data['French']) # language sentences

en_word_index = en_tokenizer.word_index
fr_word_index = fr_tokenizer.word_index

for key in en_word_index:
  en_word_index[key] = en_word_index[key] -1
for key in fr_word_index:
  fr_word_index[key] = fr_word_index[key] -1

fr_length = len(fr_word_index)
fr_word_index['<SOS>'] = fr_length
fr_word_index['<EOS>'] = fr_length+1

en_count = len(en_word_index)
fr_count = len(fr_word_index)

print(fr_word_index)
print(fr_word_index['le'])
print(en_count)
print(fr_count)

resumption of the session
{'de': 0, 'la': 1, 'et': 2, 'le': 3, 'des': 4, 'que': 5, 'à': 6, 'les': 7, 'en': 8, 'nous': 9, 'du': 10, 'je': 11, 'pour': 12, 'dans': 13, 'qui': 14, 'ce': 15, 'est': 16, 'une': 17, 'au': 18, 'pas': 19, 'un': 20, 'commission': 21, 'il': 22, 'a': 23, 'par': 24, 'sur': 25, 'parlement': 26, 'ne': 27, 'cette': 28, 'plus': 29, 'aux': 30, 'été': 31, "l'": 32, 'ces': 33, 'fonds': 34, 'vous': 35, 'monsieur': 36, 'fait': 37, 'mais': 38, 'sécurité': 39, 'voudrais': 40, 'si': 41, "qu'il": 42, 'se': 43, 'rapport': 44, 'président': 45, 'présidente': 46, 'madame': 47, 'tout': 48, 'ou': 49, 'm': 50, 'également': 51, 'ont': 52, "c'est": 53, 'états': 54, 'sont': 55, 'pays': 56, 'faire': 57, 'son': 58, 'groupe': 59, 'transport': 60, 'politique': 61, 'bien': 62, 'avec': 63, 'y': 64, 'proposition': 65, "d'une": 66, 'marchandises': 67, 'transports': 68, 'régions': 69, 'directive': 70, 'nos': 71, 'donc': 72, 'membres': 73, 'dangereuses': 74, 'question': 75, 'leur': 76, 'être': 77, 

In [17]:
# Step 5: Sentence sequencing
en_sequences = en_tokenizer.texts_to_sequences(data['English'])
fr_sequences = fr_tokenizer.texts_to_sequences(data['French'])

for l in fr_sequences:
  l.insert(0, fr_length)
  l.append(fr_length+1)

print(en_sequences)
print(fr_sequences)

[[1158, 1, 0, 205], [9, 1159, 1160, 0, 205, 1, 0, 32, 29, 1161, 12, 1162, 730, 1163, 141, 3, 9, 20, 26, 230, 292, 2, 414, 42, 7, 731, 95, 185, 4, 0, 206, 5, 42, 1164, 7, 1165, 1166, 125], [415, 17, 42, 31, 16, 538, 0, 1167, 1168, 1169, 1170, 2, 1171, 170, 0, 111, 4, 7, 231, 1, 112, 1172, 7, 1173, 1, 416, 349, 5, 1174, 113, 732], [42, 16, 417, 7, 85, 12, 8, 418, 4, 0, 207, 1, 0, 126, 350, 1175, 253, 8, 96, 205], [4, 0, 1176, 9, 30, 26, 2, 1177, 7, 539, 46, 540, 17, 7, 231, 1, 293, 16, 417, 12, 232, 1, 37, 0, 1178, 171, 127, 114, 1, 0, 541, 1179, 4, 0, 351, 112, 1, 0, 32, 97], [733, 734, 172, 11, 8, 539, 46, 540], [0, 186, 1180, 3, 735, 7, 539, 46, 540], [58, 24, 12, 7, 99, 1, 254], [42, 31, 15, 542, 53, 0, 543, 3, 736, 5, 39, 16, 27, 7, 231, 1, 1181, 1182, 3, 1183, 4, 419, 544], [73, 1, 0, 111, 1184, 43, 545, 4, 419, 544, 64, 25, 1185, 1186, 100, 91, 1187, 0, 32, 29, 128, 7, 350, 255, 737], [20, 13, 15, 420, 11, 42, 58, 24, 2, 1188, 7, 421, 2, 0, 419, 1189, 24, 738, 352, 422, 44, 115, 3

In [18]:
# Step 6: Sequence padding
'''
We pad the sentence sequences so that each tensor introduced to the model is of
equal length. If this is not done, Tensorflow cannot work with the data.
'''
from tensorflow.keras.preprocessing.sequence import pad_sequences
# print(len(en_sequences[66]))
pad_fr_seq = pad_sequences(fr_sequences, padding = 'post')
pad_en_seq = pad_sequences(en_sequences, maxlen = pad_fr_seq.shape[1],
                           padding = 'post')


# print(pad_en_seq[1][2])
print(pad_fr_seq)

print(pad_en_seq.shape)
print(pad_fr_seq.shape)

[[2921  772    0 ...    0    0    0]
 [2921   11 1242 ...    0    0    0]
 [2921   81   35 ...    0    0    0]
 ...
 [2921   12 2907 ...    0    0    0]
 [2921  741    2 ...    0    0    0]
 [2921    7 2918 ...    0    0    0]]
(500, 135)
(500, 135)


In [None]:
""" Pre-training steps """

' Pre-training steps '

In [19]:
from sklearn.model_selection import train_test_split

en_train, en_test, fr_train, fr_test = train_test_split(pad_en_seq, pad_fr_seq,
                                                        test_size = 0.1,
                                                        random_state = 1)
print(en_train)
print(en_train.shape)
print(en_test.shape)
print(fr_train.shape)
print(fr_test.shape)

[[  0 148  21 ...   0   0   0]
 [  4   8 169 ...   0   0   0]
 [733 734 172 ...   0   0   0]
 ...
 [ 10  19 677 ...   0   0   0]
 [  0 120   2 ...   0   0   0]
 [  9  20  75 ...   0   0   0]]
(450, 135)
(50, 135)
(450, 135)
(50, 135)


In [20]:
def getOneHotWords(vocabulary, words):
    if(words[0]==0):
      return np.zeros((len(words), len(vocabulary)))
    n_words = len(words)
    n_voc = len(vocabulary)
    
    indices = np.array([word for word in words])
    a = np.zeros((n_words, n_voc))
    
    a[np.arange(n_words), indices] = 1
    # print(np.arange(n_words),indices, n_voc, n_words 
    
    return a

In [None]:
""" Model Construction """

' Model Construction '

In [21]:
class RNN_Model:
  def __init__(self,en_vocab,fr_vocab,h_size):
    self.encoder_vocab = en_vocab
    self.decoder_vocab = fr_vocab
    self.h_size = h_size
    self.encoder_vocab_size = len(self.encoder_vocab)
    self.decoder_vocab_size = len(self.decoder_vocab)
    self.total_encoder_size = self.h_size + self.encoder_vocab_size
    self.total_decoder_size = self.h_size + self.decoder_vocab_size

    print(self.encoder_vocab_size,self.decoder_vocab_size,self.total_encoder_size,self.total_decoder_size)
    #weights to calulate the hidden matrix of the encoder
    self.Wh_encoder = tf.Variable(tf.random.normal(
        stddev = 1.0/(self.total_encoder_size +self.h_size),
        shape = (self.total_encoder_size, self.h_size),
        dtype = tf.double
    ))
    self.bh_encoder = tf.Variable(tf.random.normal(
        stddev = 1.0/(1 + self.h_size),
        shape = (1, self.h_size),
        dtype = tf.double
    ))
    #weights to calulate the hidden matrix of the decoder
    self.Wh_decoder = tf.Variable(tf.random.normal(
        stddev = 1.0/(self.total_decoder_size +self.h_size),
        shape = (self.total_decoder_size, self.h_size),
        dtype = tf.double
    ))
    self.bh_decoder = tf.Variable(tf.random.normal(
        stddev = 1.0/(1 + self.h_size),
        shape = (1, self.h_size),
        dtype = tf.double
    ))
    #weights to calulate the output matrix of the decoder
    self.Wy_decoder = tf.Variable(tf.random.normal(
        stddev = 1.0/(self.decoder_vocab_size +self.h_size),
        shape = (self.h_size, self.decoder_vocab_size),
        dtype = tf.double
      ))
    self.by_decoder = tf.Variable(tf.random.normal(
        stddev = 1.0/(self.decoder_vocab_size),
        shape = (1, self.decoder_vocab_size),
        dtype = tf.double
      ))


    self.optimizer = tf.keras.optimizers.Adam(learning_rate = 0.001)
    self.weights = [self.Wh_encoder, self.bh_encoder, self.Wh_decoder, self.bh_decoder, self.Wy_decoder, self.by_decoder]
    
      
  def get_encoder_hidden_matrix(self, h , x):
    h_next = tf.math.tanh(tf.linalg.matmul(tf.concat([h,x], axis =1 ),self.Wh_encoder)+self.bh_encoder)
    # print(h_next)
    return h_next
  
  def get_decoder_stats(self, h , x, y):
    h_next = tf.math.tanh(tf.linalg.matmul(tf.concat([h,x], axis =1 ),self.Wh_decoder)+self.bh_decoder)
    # y_pred = tf.nn.softmax(tf.linalg.matmul(h_next, self.Wy_decoder) + self.by_decoder)
    
    y_logits = tf.linalg.matmul(h_next, self.Wy_decoder) + self.by_decoder
    y_pred = np.zeros(y_logits.shape)
    for i in range(len(y_logits)):
      y_pred[i] = tf.nn.softmax(y_logits[i])
    # print(y_logits)
    y_ret = np.zeros(y_pred.shape)
    
    for i in range(len(y_pred)):
      y_ret[i] = getOneHotWords(fr_word_index,[tf.math.argmax(y_pred[i])])

    return (h_next, y_ret, tf.nn.softmax_cross_entropy_with_logits(y,y_logits))


  def fit_model(self, encoder_sequences, decoder_sequences, batch_size : int = 128, epochs : int = 10 ):

    total_length = len(encoder_sequences)
    batches = ceil(total_length/batch_size)
    
    for epoch in range(epochs):
      batch_index =0
      for batch in range(batches):
        # print(batch_index)
        start = batch_index * batch_size
        end  = min(start + batch_size -1 , total_length-1)
       
        # print(start,end)
        number_of_sequences = len(encoder_sequences[start:end+1])

        h = np.zeros((number_of_sequences, self.h_size))
        h_decoder = np.zeros((number_of_sequences, self.h_size))
        x = np.zeros((number_of_sequences, self.encoder_vocab_size))
        x_prev = np.zeros((number_of_sequences, self.encoder_vocab_size))
        timesteps = len(encoder_sequences[0])

        #Encoder logic to encode the input

        for i in range(timesteps):
          x_curr = np.zeros((number_of_sequences, self.encoder_vocab_size))
          # print("timstep: ",i+1)
          for j in range(start,end+1):
            # print(j - batch_index * batch_size)
            x_curr[j - batch_index * batch_size] = getOneHotWords(self.encoder_vocab,[encoder_sequences[j][i]])
            if(i == len(en_sequences[j])):
              # print("j: ",j, "len: ",i ,"len: ",len(en_sequences[j]))
              h_decoder[j - (batch_index) * batch_size] = h[j - (batch_index) * batch_size]
          h = self.get_encoder_hidden_matrix(h, x_curr)
          
        # print(h_decoder)
        #Decoder starts here
        h = h_decoder
        
        # h_prev =h
        x = np.zeros((number_of_sequences, self.decoder_vocab_size))
        x_prev = np.zeros((number_of_sequences, self.decoder_vocab_size))
        for i in range(len(x)):
          x[i] = getOneHotWords(fr_word_index,[fr_word_index['<SOS>']])
        # getOneHotWords(fr_word_index,[-1]
        # print(x)
        x_prev =x
        with tf.GradientTape() as tape:
          y_prev = np.zeros((number_of_sequences, self.decoder_vocab_size))
          training_losses = []
          for i in range(timesteps):
            loss_multiplier = []
            y_curr = np.zeros((number_of_sequences, self.decoder_vocab_size))
            for j in range(start,end+1):
              y_curr[j - batch_index * batch_size] = getOneHotWords(self.decoder_vocab,[decoder_sequences[j][i]])
              if(i >= len(fr_sequences[j])):
                loss_multiplier.append(0)
              else:
                loss_multiplier.append(1)

            h ,x, loss = self.get_decoder_stats(h , x, y_curr)

            x_prev =x
            loss_tensor = tf.convert_to_tensor(loss_multiplier,dtype=tf.float64)
            
            training_losses.append(tf.math.reduce_mean(loss*loss_tensor))
          loss_value = tf.math.reduce_mean(training_losses)
          
          grads = tape.gradient(loss_value, self.weights)
          self.optimizer.apply_gradients(zip(grads, self.weights))

        batch_index+=1
        # epoch_loss = loss_value
        print("Training Epoch : " ,epoch+1 ,"batch: ",batch+1,"Loss: ", loss_value.numpy())
      # print("Epoch Completed!!!")
  def test_model(self, encoder_sequences, decoder_sequences, batch_size : int = 128, epochs : int = 10 ):

    total_length = len(encoder_sequences)
    batches = ceil(total_length/batch_size)
        
    for epoch in range(epochs):
      batch_index =0
      for batch in range(batches):
        # print(batch_index)
        start = batch_index * batch_size
        end  = min(start + batch_size -1 , total_length-1)
          
        # print(start,end)
        number_of_sequences = len(encoder_sequences[start:end+1])

        h = np.zeros((number_of_sequences, self.h_size))
        h_decoder = np.zeros((number_of_sequences, self.h_size))
        x = np.zeros((number_of_sequences, self.encoder_vocab_size))

        timesteps = len(encoder_sequences[0])

        #Encoder logic to encode the input

        for i in range(timesteps):
          x_curr = np.zeros((number_of_sequences, self.encoder_vocab_size))
          
          for j in range(start,end+1):
            
            x_curr[j - batch_index * batch_size] = getOneHotWords(self.encoder_vocab,[encoder_sequences[j][i]])
            if(i == len(en_sequences[j])):
              h_decoder[j - (batch_index) * batch_size] = h[j - (batch_index) * batch_size]
              
          h = self.get_encoder_hidden_matrix(h, x_curr)
              
            
        #Decoder starts here
        h = h_decoder
            
        # h_prev =h
        x = np.zeros((number_of_sequences, self.decoder_vocab_size))
        x_prev = np.zeros((number_of_sequences, self.decoder_vocab_size))
        for i in range(len(x)):
          x[i] = getOneHotWords(fr_word_index,[fr_word_index['<SOS>']])
          # getOneHotWords(fr_word_index,[-1]
          # print(x)
        
          y_prev = np.zeros((number_of_sequences, self.decoder_vocab_size))
          training_losses = []
          for i in range(timesteps):
            loss_multiplier = []
            y_curr = np.zeros((number_of_sequences, self.decoder_vocab_size))
            for j in range(start,end+1):
              y_curr[j - batch_index * batch_size] = getOneHotWords(self.decoder_vocab,[decoder_sequences[j][i]])
              if(i >= len(fr_sequences[j])):
                loss_multiplier.append(0)
              else:
                loss_multiplier.append(1)


                # print(y_curr)
            h ,x, loss = self.get_decoder_stats(h , x, y_curr)
            loss_tensor = tf.convert_to_tensor(loss_multiplier,dtype=tf.float64)
              
            training_losses.append(tf.math.reduce_mean(loss*loss_tensor))
          loss_value = tf.math.reduce_mean(training_losses)
            
        batch_index+=1
            # epoch_loss = loss_value
        print("Testing Epoch : " ,epoch+1 ,"batch: ",batch+1,"Loss: ", loss_value.numpy())

  def predict(self, sequence):
    # print(sequence)
    seq_timesteps = len(sequence)

    h = np.zeros((1, self.h_size))
    h_decoder = np.zeros((1, self.h_size))
    x = np.zeros((1, self.encoder_vocab_size))

    for i in range(seq_timesteps):
      x_curr = np.zeros((1, self.encoder_vocab_size))

      x_curr = getOneHotWords(self.encoder_vocab,[sequence[i]])
      h = self.get_encoder_hidden_matrix(h,x)

    h_decoder = h
    x = np.zeros((1, self.decoder_vocab_size))

    x = getOneHotWords(fr_word_index,[fr_word_index['<SOS>']])
    result =[]
    result.append(x)
    output_seq =[]
    french =[]
    for i in range(seq_timesteps):
      y_curr = np.zeros((1, self.decoder_vocab_size))

      h ,x, loss = self.get_decoder_stats(h , x, y_curr)
      output_seq.append(np.where(x == 1))
      result.append(x)

      for key,value in fr_word_index.items():
        if(value == output_seq[0][1][0]):
          french.append(key)

    print(french)


In [None]:
x = tf.constant(([1, 2, 3, 4]))
tf.math.multiply(x, 2*x)

<tf.Tensor: shape=(4,), dtype=int32, numpy=array([ 2,  8, 18, 32], dtype=int32)>

In [14]:
for i in range(len(pad_fr_seq)):
  for j in range(len(pad_fr_seq[0])):
    if(pad_fr_seq[i][j]==-1):
      print(i,j)

In [22]:
model = RNN_Model(en_word_index,fr_word_index,1024)

2352 2923 3376 3947


In [None]:
model.fit_model(en_train,fr_train,50,5)

Training Epoch :  1 batch:  1 Loss:  1.0475036442723005
Training Epoch :  1 batch:  2 Loss:  1.2104940805235815
Training Epoch :  1 batch:  3 Loss:  0.9194994363080655
Training Epoch :  1 batch:  4 Loss:  1.2028288456314642
Training Epoch :  1 batch:  5 Loss:  1.2849504279764312
Training Epoch :  1 batch:  6 Loss:  0.9912781192174865
Training Epoch :  1 batch:  7 Loss:  1.486684568449157
Training Epoch :  1 batch:  8 Loss:  1.268296259555826
Training Epoch :  1 batch:  9 Loss:  1.0059947270033094
Training Epoch :  2 batch:  1 Loss:  0.9201410874045686
Training Epoch :  2 batch:  2 Loss:  1.0542001928661708
Training Epoch :  2 batch:  3 Loss:  0.8001361193070183
Training Epoch :  2 batch:  4 Loss:  1.0603187087508887
Training Epoch :  2 batch:  5 Loss:  1.1223560420282197
Training Epoch :  2 batch:  6 Loss:  0.8853730746775518
Training Epoch :  2 batch:  7 Loss:  1.3328782497178038


In [None]:
model.test_model(en_test,fr_test,20,1)

Testing Epoch :  1 batch:  1 Loss:  0.9351207848502289
