In [None]:
import pandas as pd 
import numpy as np 
import os 
from tqdm import tqdm 
import re
from d2l import tensorflow as d2l
import tensorflow as tf

In [None]:
# reading the data
# The corpus location is machine-specific, so it can be overridden through the
# ENG_FRENCH_CSV environment variable; the original path remains the default.
DATA_PATH = os.environ.get("ENG_FRENCH_CSV", r"D:\en-french\archive0\eng-french.csv")
file = pd.read_csv(DATA_PATH, encoding="utf8")

In [None]:
# Preview the first rows of the loaded frame (rendered as the cell output).
file.head()

In [None]:
# Pull the two language columns out of the frame, English first, then French.
english, french = (
    file[column]
    for column in ('English words/sentences', 'French words/sentences')
)

In [None]:
# len() of a column counts its rows (one entry per row), not individual words,
# so the labels say "entries" rather than "words".
print('total english entries:', len(english))
print('total french entries:', len(french))

In [None]:
#processing the data
# Characters that are split off (or blanked out) when they are not the first
# character of a sentence.
# NOTE(review): the English set additionally contains the backtick; the French
# set never did. The difference is preserved here -- confirm whether it is intended.
_FRE_PUNCT = frozenset('~:+[\\@^{%(-"*|,&<}._=]!>;?#$)/')
_ENG_PUNCT = _FRE_PUNCT | {'`'}

# Raw-string patterns: "\(" and "\s" in a normal string literal are invalid
# escape sequences and trigger warnings on recent Python versions.
_OPEN_PAREN_RE = re.compile(r"\(")
_MULTI_SPACE_RE = re.compile(r"\s\s+")


def _normalise(text, marks, punc):
    """Lower-case `text` and separate or blank out the characters in `marks`."""
    pieces = []
    for i, char in enumerate(text.lower()):
        # The very first character is always kept untouched, even if it is a mark.
        if i > 0 and char in marks:
            pieces.append(' ' + char + ' ' if punc else ' ')
        else:
            pieces.append(char)
    # Narrow / non-breaking spaces become ordinary spaces.
    sent = ''.join(pieces).replace('\u202f', ' ').replace('\xa0', ' ')
    # Catches a "(" that was skipped above because it was the first character.
    sent = _OPEN_PAREN_RE.sub(" ( ", sent)
    # Collapse every run of two or more whitespace characters into one space.
    return _MULTI_SPACE_RE.sub(" ", sent)


def edit(text_eng, text_fre, punc):
    """Normalise one English/French sentence pair for whitespace tokenisation.

    Both sentences are lower-cased. Every punctuation mark that is not the
    first character is surrounded by spaces when `punc` is true, or replaced by
    a single space when `punc` is false. Non-breaking spaces are converted to
    plain spaces, "(" is always space-separated, and runs of whitespace are
    collapsed. Leading/trailing spaces are NOT stripped.

    Args:
        text_eng: English sentence (str).
        text_fre: French sentence (str).
        punc: keep punctuation as separate tokens (True) or drop it (False).

    Returns:
        Tuple ``(eng_sent, fre_sent)`` of normalised strings.
    """
    eng_sent = _normalise(text_eng, _ENG_PUNCT, punc)
    fre_sent = _normalise(text_fre, _FRE_PUNCT, punc)
    return eng_sent, fre_sent

In [None]:
# Keep punctuation marks as their own tokens.
punc = True
E = []
F = []
for eng_text, fre_text in tqdm(zip(english, french), total=len(english)):
    english_data, french_data = edit(eng_text, fre_text, punc)
    # edit() leaves leading/trailing spaces, which show up as '' after split.
    # Filter every empty token. The previous `list.remove('')` inside a bare
    # `except: continue` removed only the first one and silently discarded any
    # pair in which either side had no empty token at all.
    english_data = [tok for tok in english_data.split(' ') if tok]
    french_data = [tok for tok in french_data.split(' ') if tok]
    # Skip only pairs that are genuinely empty on one side.
    if not english_data or not french_data:
        continue
    E.append(english_data)
    F.append(french_data)
# Object arrays of token lists (sentences have differing lengths).
E = np.array(E, dtype=object)
F = np.array(F, dtype=object)


In [None]:
# finding the largest sentence length for both languages
def sent_len(lang):
    """Return the length of the longest sentence in `lang`.

    The lengths are computed locally on every call. The earlier version
    appended to a module-level list, so a second call also saw the lengths
    from the first one and could report the wrong corpus' maximum.

    Args:
        lang: iterable of sentences, each a sized sequence of tokens.

    Returns:
        The largest ``len(sentence)`` as an int, or 0 when `lang` is empty.
    """
    return max((len(sentence) for sentence in lang), default=0)
    

In [None]:
# Run sent_len over each corpus in turn: English first, then French.
max_e, max_f = (sent_len(corpus) for corpus in (E, F))

In [None]:
# Report the value computed for each corpus.
for label, longest in (
    ('longest sentence for english corpus:', max_e),
    ('longest sentence for french corpus:', max_f),
):
    print(label, longest)

In [None]:
# Adding the <bos> and <eos> to all sentences
def add_pad(lang):
    """Wrap every sentence in `lang` with '<bos>' ... '<eos>' markers.

    Despite the name, no '<pad>' tokens are added here. A new list is built
    for each sentence, so the caller's sentences are left unmodified (the
    earlier version inserted into them in place).

    Args:
        lang: iterable of sentences, each an iterable of tokens.

    Returns:
        ``np.ndarray`` with ``dtype=object`` holding the wrapped sentences.
    """
    wrapped = [['<bos>', *sentence, '<eos>'] for sentence in lang]
    return np.array(wrapped, dtype=object)


In [None]:
E = add_pad(E)
F = add_pad(F)

In [None]:
# One vocabulary per language, built with identical settings; a fresh
# reserved-token list is created for each call.
eng_vocab, fre_vocab = (
    d2l.Vocab(corpus, min_freq=1, reserved_tokens=['<bos>', '<eos>','<pad>'])
    for corpus in (E, F)
)

In [None]:
# Tokenization of the dataset
def token(text, vocab):
    """Convert sentences of string tokens into sentences of integer ids.

    Args:
        text: iterable of sentences, each an iterable of tokens.
        vocab: mapping from token to integer id; it must contain '<unk>' if
            any token of `text` is missing from it.

    Returns:
        List of lists of ids, in the same order as `text`. Tokens missing
        from `vocab` are mapped to ``vocab['<unk>']``.
    """
    encoded = []
    for sentence in tqdm(text):
        ids = []
        for word in sentence:
            try:
                ids.append(vocab[word])
            # Only a missing key means "unknown word"; a bare except would
            # also hide KeyboardInterrupt and genuine programming errors.
            except KeyError:
                ids.append(vocab['<unk>'])
        encoded.append(ids)
    return encoded


In [None]:
# Encode each corpus with its own token -> id mapping (English, then French).
E_token, F_token = (
    token(corpus, vocab.token_to_idx)
    for corpus, vocab in ((E, eng_vocab), (F, fre_vocab))
)

In [None]:
# padding the tokenized text
# Fixed sequence lengths. The French side is one position longer than the
# English side -- presumably to allow a one-step shift between decoder input
# and target; TODO confirm against the training code.
MAX_ENG_LEN = 50
MAX_FRE_LEN = MAX_ENG_LEN + 1

# pad_sequences fills with 0 unless `value` is given, which is only right if
# '<pad>' happens to have id 0. Look the id up in each vocabulary instead.
# NOTE(review): downstream code that masks on 0 must switch to these ids.
# Sequences longer than maxlen are still truncated from the front, which is
# the pad_sequences default (truncating='pre').
padded_english = tf.keras.preprocessing.sequence.pad_sequences(
    E_token,
    maxlen=MAX_ENG_LEN,
    padding="post",
    value=eng_vocab.token_to_idx['<pad>'],
)
print(len(padded_english[2]))
padded_french = tf.keras.preprocessing.sequence.pad_sequences(
    F_token,
    maxlen=MAX_FRE_LEN,
    padding="post",
    value=fre_vocab.token_to_idx['<pad>'],
)
print(len(padded_french[2]))

In [None]:
# saving the tokenized padded text and vocabularies
# np.save does not create missing directories, so make sure the output folder
# exists before writing into it.
os.makedirs('imp', exist_ok=True)
# saving english vocabulary
# Objects that numpy cannot store natively are pickled (allow_pickle=True), so
# they have to be loaded back with np.load(..., allow_pickle=True).
np.save('imp/e_vocab.npy',eng_vocab.token_to_idx,allow_pickle=True)
np.save('imp/e_vocab_rev.npy',eng_vocab.idx_to_token,allow_pickle=True)
# saving the french vocabulary
np.save('imp/f_vocab.npy',fre_vocab.token_to_idx,allow_pickle=True)
np.save('imp/f_vocab_rev.npy',fre_vocab.idx_to_token,allow_pickle=True)
# saving the tokenized padded dataset
np.save('imp/etp.npy',padded_english,allow_pickle=True)
np.save('imp/ftp.npy',padded_french,allow_pickle=True)
