**Preprocessing raw data**

In [None]:
import pickle
from pickle import dump

In [None]:
def load_doc(filename):
  """Read an entire UTF-8 text file and return its contents as one string.

  Args:
    filename: Path of the text file to read.

  Returns:
    The complete file contents as a single ``str``.

  Raises:
    OSError: If the file cannot be opened or read.
    UnicodeDecodeError: If the contents are not valid UTF-8.
  """
  # The context manager closes the handle even when read() raises (for
  # example on a decoding error); a manual open()/close() pair would leak it.
  with open(filename, mode='rt', encoding="utf-8") as file:
    return file.read()

In [None]:
def to_sentences(doc):
  """Split a document into a list of lines.

  Leading and trailing whitespace is removed from the document as a whole,
  then the remainder is split on newline characters. Interior blank lines are
  kept as empty strings.
  """
  trimmed = doc.strip()
  return trimmed.split('\n')

In [None]:
def sentence_lengths(sentences):
  """Return ``(shortest, longest)`` sentence length in whitespace-separated tokens.

  Raises ``ValueError`` (from ``min``) when ``sentences`` is empty.
  """
  word_counts = []
  for sentence in sentences:
    word_counts.append(len(sentence.split()))
  shortest = min(word_counts)
  longest = max(word_counts)
  return shortest, longest

In [None]:
# clean lines
import re
import string
import unicodedata
def clean_lines(lines):
  """Normalise raw sentences into lower-case, letters-only token strings.

  Each line has its diacritics and other non-ASCII characters removed, is
  split on whitespace, lower-cased, and stripped of punctuation and
  non-printable characters. Tokens that are not purely alphabetic afterwards
  (empty tokens, or tokens containing digits) are discarded.

  Returns a new list with one space-joined string per input line; a line
  with no surviving tokens becomes an empty string.
  """
  # matches every character that is not in string.printable
  non_printable = re.compile('[^%s]' % re.escape(string.printable))
  # translation table that deletes all ASCII punctuation
  strip_punct = str.maketrans('', '', string.punctuation)

  def _clean(raw):
    # NFD splits accented letters into base letter + combining mark, so the
    # ASCII encode with 'ignore' drops the mark (and any other non-ASCII char)
    ascii_text = unicodedata.normalize('NFD', raw).encode('ascii', 'ignore').decode('UTF-8')
    kept = []
    for token in ascii_text.split():
      token = non_printable.sub('', token.lower().translate(strip_punct))
      # drop empty tokens and anything containing a non-letter (e.g. digits)
      if token.isalpha():
        kept.append(token)
    return ' '.join(kept)

  return [_clean(raw) for raw in lines]


In [None]:
# English side of the Europarl fr-en corpus. The path is absolute (presumably a
# Colab Google Drive mount - confirm), so this cell only runs where it exists.
filename = '/content/drive/MyDrive/Transformers/Data/europarl-v7.fr-en.en'
doc = load_doc(filename)
sentences = to_sentences(doc)

# report corpus size and the shortest/longest sentence in whitespace-separated tokens
min_len, max_len = sentence_lengths(sentences)
print("English data: sentences=%d, min=%d, max=%d" % (len(sentences), min_len, max_len))
# NOTE: despite its name, cleanf holds the cleaned English sentences here;
# the following cell pickles it to English.pkl.
cleanf = clean_lines(sentences)

English data: sentences=2007723, min=0, max=668


In [None]:
# Persist the cleaned English sentences so a later cell can reload them from disk.
filename = 'English.pkl'
# the with-block closes (and flushes) the file even if pickling fails part-way
with open(filename, 'wb') as outfile:
  pickle.dump(cleanf, outfile)
print(filename, " saved")

English.pkl  saved


In [None]:
# French side of the Europarl fr-en corpus: load it and split it into sentences.
filename = '/content/drive/MyDrive/Transformers/Data/europarl-v7.fr-en.fr'
doc = load_doc(filename)
sentences = to_sentences(doc)

# report corpus size and the shortest/longest sentence in whitespace-separated tokens
min_len, max_len = sentence_lengths(sentences)
print("French data: sentences=%d, min=%d, max=%d" % (len(sentences), min_len, max_len))
cleanf = clean_lines(sentences)

# Persist the cleaned French sentences so a later cell can reload them from disk.
filename = 'French.pkl'
# the with-block closes (and flushes) the file even if pickling fails part-way
with open(filename, 'wb') as outfile:
  pickle.dump(cleanf, outfile)
print(filename, " saved")

French data: sentences=2007723, min=0, max=693
French.pkl  saved


**Finalizing the preprocessing of the datasets**

In [None]:
from pickle import load
from pickle import dump
from collections import Counter

#Load a clean dataset
def load_clean_sentences(filename):
  """Load and return the object pickled in ``filename``.

  Args:
    filename: Path of a pickle file, e.g. one written by
      ``save_clean_sentences``.

  Returns:
    The unpickled object (in this notebook, a list of sentence strings).

  Raises:
    OSError: If the file cannot be opened.
  """
  # NOTE(security): unpickling can execute arbitrary code; only load pickle
  # files from a trusted source.
  # The with-block guarantees the handle is closed; load(open(...)) left it
  # open until garbage collection.
  with open(filename, 'rb') as handle:
    return load(handle)

#Save a list of clean sentences to file
def save_clean_sentences(sentences, filename):
  """Pickle ``sentences`` to ``filename`` and print a confirmation line.

  Args:
    sentences: The object to serialise (in this notebook, a list of strings).
    filename: Destination path; an existing file is overwritten.

  Raises:
    OSError: If the file cannot be opened for writing.
  """
  # Close the handle explicitly via the with-block so the data is flushed to
  # disk before "Saved" is reported; dump(..., open(...)) relied on garbage
  # collection to do that.
  with open(filename, 'wb') as handle:
    dump(sentences, handle)
  print("Saved: %s" % filename)

In [None]:
def to_vocab(lines):
  """Count how often each whitespace-separated token occurs across ``lines``.

  Returns a ``Counter`` mapping token -> number of occurrences.
  """
  return Counter(token for line in lines for token in line.split())

In [None]:
def trim_vocab(vocab, min_occurrence):
  """Return the set of tokens whose count in ``vocab`` is at least ``min_occurrence``."""
  frequent = set()
  for token, count in vocab.items():
    if count >= min_occurrence:
      frequent.add(token)
  return frequent

In [None]:
# mark oov tokens into unknown - (Out-Of-Vocabulary)
def update_dataset(lines, vocab):
  """Replace every token that is not in ``vocab`` with the placeholder 'unk'.

  Each line is split on whitespace and re-joined with single spaces, so runs
  of whitespace collapse. Returns a new list with one string per input line.
  """
  return [
    ' '.join(token if token in vocab else 'unk' for token in line.split())
    for line in lines
  ]

In [None]:
# load the cleaned English sentences pickled by the preprocessing section
filename = "English.pkl"
lines = load_clean_sentences(filename)

# calculate vocabulary
vocab = to_vocab(lines)
print("English Vocabulary: %d" % len(vocab))

# reduce vocabulary: keep only tokens that occur at least 5 times
vocab = trim_vocab(vocab, 5)
print("New English Vocabulary: %d" % len(vocab))

# mark out-of-vocabulary words, then save the updated dataset
lines = update_dataset(lines, vocab)
filename = 'english_vocab.pkl'

save_clean_sentences(lines, filename)

# spot check the first 20 lines; the bound avoids an IndexError when the
# dataset holds fewer than 20 lines
for i in range(min(20, len(lines))):
  print("line", i, ":", lines[i])

English Vocabulary: 105357
New English Vocabulary: 41746
Saved: english_vocab.pkl
line 0 : resumption of the session
line 1 : i declare resumed the session of the european parliament adjourned on friday december and i would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period
line 2 : although as you will have seen the dreaded millennium bug failed to materialise still the people in a number of countries suffered a series of natural disasters that truly were dreadful
line 3 : you have requested a debate on this subject in the course of the next few days during this partsession
line 4 : in the meantime i should like to observe a minute s silence as a number of members have requested on behalf of all the victims concerned particularly those of the terrible storms in the various countries of the european union
line 5 : please rise then for this minute s silence
line 6 : the house rose and observed a minute s silence
line 7 : madam president o

In [None]:
# load the cleaned French sentences pickled by the preprocessing section
filename = 'French.pkl'
lines = load_clean_sentences(filename)
# calculate vocabulary
vocab = to_vocab(lines)
print('French Vocabulary: %d' % len(vocab))
# reduce vocabulary: keep only tokens that occur at least 5 times
vocab = trim_vocab(vocab, 5)
print('New French Vocabulary: %d' % len(vocab))
# mark out of vocabulary words
lines = update_dataset(lines, vocab)
# save updated dataset
filename = 'french_vocab.pkl'
save_clean_sentences(lines, filename)
# spot check the first 20 lines; the bound avoids an IndexError when the
# dataset holds fewer than 20 lines
for i in range(min(20, len(lines))):
	print("line",i,":",lines[i])

French Vocabulary: 141642
New French Vocabulary: 58800
Saved: french_vocab.pkl
line 0 : reprise de la session
line 1 : je declare reprise la session du parlement europeen qui avait ete interrompue le vendredi decembre dernier et je vous renouvelle tous mes vux en esperant que vous avez passe de bonnes vacances
line 2 : comme vous avez pu le constater le grand bogue de lan ne sest pas produit en revanche les citoyens dun certain nombre de nos pays ont ete victimes de catastrophes naturelles qui ont vraiment ete terribles
line 3 : vous avez souhaite un debat a ce sujet dans les prochains jours au cours de cette periode de session
line 4 : en attendant je souhaiterais comme un certain nombre de collegues me lont demande que nous observions une minute de silence pour toutes les victimes des tempetes notamment dans les differents pays de lunion europeenne qui ont ete touches
line 5 : je vous invite a vous lever pour cette minute de silence
line 6 : le parlement debout observe une minute de 