# Data Preprocessing

In [3]:
# reading the data
# Each file holds one sentence per line; line i of the English file is paired
# with line i of the French file, so both lists must keep the same order.
# The encoding is passed explicitly: without it open() falls back to the
# platform's locale encoding, and the accented French text would decode
# differently from one machine to the next.
with open("../data/raw/europarl-v7.fr-en.en", "r", encoding="utf-8") as file:
    en = [line.strip() for line in file]

with open("../data/raw/europarl-v7.fr-en.fr", "r", encoding="utf-8") as file:
    fr = [line.strip() for line in file]

In [4]:
# line counts of the English and French files, one per output line
for corpus in (en, fr):
    print(len(corpus))

2007723
2007723


In [5]:
# peek at the first ten raw sentences on each side
for corpus in (en, fr):
    print(corpus[:10])

['Resumption of the session', 'I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999, and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period.', "Although, as you will have seen, the dreaded 'millennium bug' failed to materialise, still the people in a number of countries suffered a series of natural disasters that truly were dreadful.", 'You have requested a debate on this subject in the course of the next few days, during this part-session.', "In the meantime, I should like to observe a minute' s silence, as a number of Members have requested, on behalf of all the victims concerned, particularly those of the terrible storms, in the various countries of the European Union.", "Please rise, then, for this minute' s silence.", "(The House rose and observed a minute' s silence)", 'Madam President, on a point of order.', 'You will be aware from the press and television that there have been a num

In [6]:
# cleaning the data
import re

# Normalisation applied to both sides of every sentence pair:
#   1. trim; drop the whole pair if either side ends up empty
#   2. collapse each run of whitespace into a single space
#   3. lowercase
whitespace_run = re.compile(r"\s+")

clean_en = []
clean_fr = []

for raw_en, raw_fr in zip(en, fr):
    pair = (raw_en.strip(), raw_fr.strip())

    # an empty string is falsy, so this skips pairs with a blank side;
    # dropping both sides together keeps the two lists aligned
    if not all(pair):
        continue

    norm_en, norm_fr = (whitespace_run.sub(" ", side).lower() for side in pair)

    clean_en.append(norm_en)
    clean_fr.append(norm_fr)

In [5]:
# number of sentence pairs left after cleaning, one count per side
for corpus in (clean_en, clean_fr):
    print(len(corpus))

2002756
2002756


In [6]:
# first five cleaned sentences on each side
for corpus in (clean_en, clean_fr):
    print(corpus[:5])

['resumption of the session', 'i declare resumed the session of the european parliament adjourned on friday 17 december 1999, and i would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period.', "although, as you will have seen, the dreaded 'millennium bug' failed to materialise, still the people in a number of countries suffered a series of natural disasters that truly were dreadful.", 'you have requested a debate on this subject in the course of the next few days, during this part-session.', "in the meantime, i should like to observe a minute' s silence, as a number of members have requested, on behalf of all the victims concerned, particularly those of the terrible storms, in the various countries of the european union."]
['reprise de la session', 'je déclare reprise la session du parlement européen qui avait été interrompue le vendredi 17 décembre dernier et je vous renouvelle tous mes vux en espérant que vous avez passé de bonnes vacan

In [7]:
# length filtering to prevent vanishing/exploding gradients
# a pair is kept only if neither side has more than 100 whitespace-separated
# tokens and the longer side is at most 3x the length of the shorter one

filtered_en = []
filtered_fr = []

for src, tgt in zip(clean_en, clean_fr):
    n_src, n_tgt = len(src.split()), len(tgt.split())
    longer, shorter = max(n_src, n_tgt), min(n_src, n_tgt)

    # the cleaning step removed empty sentences, so `shorter` is at least 1 here
    ratio = longer / shorter

    if longer <= 100 and ratio <= 3:
        filtered_en.append(src)
        filtered_fr.append(tgt)

In [8]:
# pair counts after filtering (same line), then the first five sentences per side
print(*map(len, (filtered_en, filtered_fr)))
for corpus in (filtered_en, filtered_fr):
    print(corpus[:5])

1996288 1996288
['resumption of the session', 'i declare resumed the session of the european parliament adjourned on friday 17 december 1999, and i would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period.', "although, as you will have seen, the dreaded 'millennium bug' failed to materialise, still the people in a number of countries suffered a series of natural disasters that truly were dreadful.", 'you have requested a debate on this subject in the course of the next few days, during this part-session.', "in the meantime, i should like to observe a minute' s silence, as a number of members have requested, on behalf of all the victims concerned, particularly those of the terrible storms, in the various countries of the european union."]
['reprise de la session', 'je déclare reprise la session du parlement européen qui avait été interrompue le vendredi 17 décembre dernier et je vous renouvelle tous mes vux en espérant que vous avez passé

In [9]:
# writing a combined corpus for the sentence piece model
# English and French sentences are interleaved, one sentence per line.
#
# The loop variables are deliberately NOT named `en` / `fr`: those names hold
# the raw corpora loaded at the top of the notebook, and rebinding them to a
# single sentence here would silently break any re-run of the cleaning cell.
# The file is written as UTF-8 explicitly instead of relying on the platform's
# default encoding, since it is the training input for the tokenizer.

with open("../data/spm/corpus.txt", "w", encoding="utf-8") as file:
    for en_line, fr_line in zip(filtered_en, filtered_fr):
        file.write(en_line + "\n")
        file.write(fr_line + "\n")

In [10]:
# Read the combined corpus back as a quick check of what was written.
# Lines keep their trailing "\n". The encoding is explicit so the accented
# text does not depend on the platform's default encoding.
with open("../data/spm/corpus.txt", "r", encoding="utf-8") as file:
    spm_corpus = file.readlines()

print(len(spm_corpus))
print(spm_corpus[:5])

3992576
['resumption of the session\n', 'reprise de la session\n', 'i declare resumed the session of the european parliament adjourned on friday 17 december 1999, and i would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period.\n', 'je déclare reprise la session du parlement européen qui avait été interrompue le vendredi 17 décembre dernier et je vous renouvelle tous mes vux en espérant que vous avez passé de bonnes vacances.\n', "although, as you will have seen, the dreaded 'millennium bug' failed to materialise, still the people in a number of countries suffered a series of natural disasters that truly were dreadful.\n"]


In [15]:
# training the spm tokenizer

import sentencepiece as spm

# A single BPE model is trained on the interleaved English/French corpus.
# Ids 0-3 are pinned to <pad>, <unk>, <s>, </s> so that later cells can rely
# on them. Output files are written under the `model_prefix` path.
trainer_options = dict(
    input='../data/spm/corpus.txt',
    model_prefix='../data/spm/sentencepiece',
    vocab_size=16000,
    model_type='bpe',
    character_coverage=1.0,
    # train on a shuffled subsample of the corpus rather than every line
    input_sentence_size=250000,
    shuffle_input_sentence=True,
    pad_id=0,
    unk_id=1,
    bos_id=2,
    eos_id=3,
    user_defined_symbols=[],
)

spm.SentencePieceTrainer.Train(**trainer_options)

In [11]:
import sentencepiece as spm

# Load the trained tokenizer, report its vocabulary size and list the first
# 30 pieces (the special tokens come first, then the learned merges).
sp = spm.SentencePieceProcessor()
# sp.load('../data/spm/spm.model')
sp.load('../data/spm/sentencepiece.model')
print("Vocab size: ", sp.get_piece_size())

for piece_id in range(30):
    print(piece_id, sp.id_to_piece(piece_id))

Vocab size:  16000
0 <pad>
1 <unk>
2 <s>
3 </s>
4 ▁t
5 on
6 ▁d
7 es
8 en
9 ▁a
10 ▁p
11 ▁l
12 ▁c
13 in
14 ▁th
15 re
16 ti
17 ▁s
18 er
19 is
20 ou
21 an
22 ▁the
23 ent
24 ▁m
25 ▁de
26 it
27 ▁e
28 or
29 qu


In [8]:
# Tokenise the filtered sentences into piece ids.
# An earlier version encoded the sentences pair by pair in a Python loop;
# passing the whole list to sp.encode in one call per language is more optimized.
import sentencepiece as spm

sp = spm.SentencePieceProcessor()
sp.load("../data/spm/sentencepiece.model")

# strip again so no stray surrounding whitespace reaches the tokenizer
filtered_en = list(map(str.strip, filtered_en))
filtered_fr = list(map(str.strip, filtered_fr))

# out_type=int yields one list of integer ids per input sentence
tokenized_en_ids, tokenized_fr_ids = (
    sp.encode(sentences, out_type=int)
    for sentences in (filtered_en, filtered_fr)
)


In [None]:
# print(tokenized_en[:5])
# print(tokenized_fr[:5])

[[219, 5309, 49, 22, 3801], [75, 11220, 11163, 22, 3801, 49, 22, 230, 448, 14673, 9781, 95, 12120, 3784, 4534, 8028, 63, 75, 361, 537, 2275, 723, 51, 1697, 450, 9, 5754, 652, 700, 40, 22, 1396, 93, 450, 14986, 9, 9399, 92, 33, 7, 214, 2626, 15875], [2491, 15873, 138, 450, 255, 200, 3228, 15873, 22, 6, 1155, 83, 1179, 15866, 186, 3685, 2367, 39, 1069, 15879, 6509, 51, 4954, 314, 15873, 1318, 22, 674, 40, 9, 1535, 49, 619, 9082, 9, 6674, 49, 4462, 7724, 93, 6393, 961, 6, 1155, 1083, 15875], [450, 200, 7920, 9, 1079, 95, 141, 1834, 40, 22, 1380, 49, 22, 1797, 1912, 4428, 15873, 2106, 141, 461, 15884, 9403, 15875], [40, 22, 12142, 15873, 75, 378, 537, 51, 8005, 9, 6395, 15879, 17, 8006, 15873, 138, 9, 1535, 49, 1215, 200, 7920, 15873, 95, 1843, 49, 284, 22, 4047, 1990, 15873, 1627, 825, 49, 22, 7077, 11484, 578, 15873, 40, 22, 2447, 619, 49, 22, 230, 411, 15875]]
[[6423, 25, 45, 3801], [171, 8398, 6423, 45, 3801, 132, 412, 218, 160, 2575, 476, 15232, 56, 10349, 3784, 4220, 2485, 66, 171, 4

In [None]:
# tokenized_id_en = [sp.piece_to_id(piece) for piece in tokenized_en]
# tokenized_id_fr = [sp.piece_to_id(piece) for piece in tokenized_fr]

In [10]:
# id sequences of the first five sentences, English then French
for id_lists in (tokenized_en_ids, tokenized_fr_ids):
    print(id_lists[:5])

[[219, 5309, 49, 22, 3801], [75, 11220, 11163, 22, 3801, 49, 22, 230, 448, 14673, 9781, 95, 12120, 3784, 4534, 8028, 63, 75, 361, 537, 2275, 723, 51, 1697, 450, 9, 5754, 652, 700, 40, 22, 1396, 93, 450, 14986, 9, 9399, 92, 33, 7, 214, 2626, 15875], [2491, 15873, 138, 450, 255, 200, 3228, 15873, 22, 6, 1155, 83, 1179, 15866, 186, 3685, 2367, 39, 1069, 15879, 6509, 51, 4954, 314, 15873, 1318, 22, 674, 40, 9, 1535, 49, 619, 9082, 9, 6674, 49, 4462, 7724, 93, 6393, 961, 6, 1155, 1083, 15875], [450, 200, 7920, 9, 1079, 95, 141, 1834, 40, 22, 1380, 49, 22, 1797, 1912, 4428, 15873, 2106, 141, 461, 15884, 9403, 15875], [40, 22, 12142, 15873, 75, 378, 537, 51, 8005, 9, 6395, 15879, 17, 8006, 15873, 138, 9, 1535, 49, 1215, 200, 7920, 15873, 95, 1843, 49, 284, 22, 4047, 1990, 15873, 1627, 825, 49, 22, 7077, 11484, 578, 15873, 40, 22, 2447, 619, 49, 22, 230, 411, 15875]]
[[6423, 25, 45, 3801], [171, 8398, 6423, 45, 3801, 132, 412, 218, 160, 2575, 476, 15232, 56, 10349, 3784, 4220, 2485, 66, 171, 4

In [11]:
# Build the three id sequences used for training:
#   enc_in  : English ids, unchanged
#   dec_in  : <s> followed by the French ids
#   dec_tgt : the French ids followed by </s> (dec_in shifted left by one step)
#
# The <s> / </s> ids are read from the loaded tokenizer instead of being
# hard-coded. An earlier version of this cell used the literals 1 and 2, which
# are <unk> and <s> in this vocabulary, so literals are easy to get wrong and
# go stale if the tokenizer is retrained with different special-token ids.
bos_id = sp.bos_id()
eos_id = sp.eos_id()

# a negative id means the model was trained with that special token disabled
assert bos_id >= 0 and eos_id >= 0, "tokenizer has no <s>/</s> token"

enc_in = tokenized_en_ids
dec_in = [[bos_id] + ids for ids in tokenized_fr_ids]
dec_tgt = [ids + [eos_id] for ids in tokenized_fr_ids]

In [12]:
# first training example: encoder input, decoder input, decoder target
for sequences in (enc_in, dec_in, dec_tgt):
    print(sequences[0])

[219, 5309, 49, 22, 3801]
[2, 6423, 25, 45, 3801]
[6423, 25, 45, 3801, 3]


In [None]:
# sanity check
# ids 0-3 should map to the special tokens; id 4 is the first learned piece
import sentencepiece as spm

sp = spm.SentencePieceProcessor()
sp.load("../data/spm/sentencepiece.model")

for piece_id in range(5):
    print(sp.id_to_piece(piece_id))
print(sp.id_to_piece(4))

<pad>
<unk>
<s>
</s>
▁t


In [14]:
# storing the preprocessed data for future use
import torch

# each list of id sequences is saved to its own file, in the same order as before
for sequences, out_path in (
    (enc_in, "../data/processed/enc_in.pt"),
    (dec_in, "../data/processed/dec_in.pt"),
    (dec_tgt, "../data/processed/dec_tgt.pt"),
):
    torch.save(sequences, out_path)
