In [None]:
import pickle
import random
import re
from collections import Counter

import numpy as np
import pandas as pd

# EuroParl English-Spanish Dataset

In this notebook, I explore the EuroParl dataset for English/Spanish translation. My goal is to gain a reasonable understanding of the data.

## Loading the Data
First, we load the datasets. Downloading the Europarl dataset provides two files: `Europarl.<lan_1>-<lan_2>.<lan_1>` and `Europarl.<lan_1>-<lan_2>.<lan_2>`. It is up to the user to decide which one holds the data and which one holds the labels.

In [37]:
europarl_english_path = "dataset/Europarl.en-es.en"
europarl_spanish_path = "dataset/Europarl.en-es.es"


def read_corpus(path):
    """Return the whole text of the corpus file at `path`, lower-cased.

    The file is decoded explicitly as UTF-8 instead of with the platform's
    default encoding, so accented characters decode the same way on every
    machine.
    NOTE(review): assumes the Europarl files are UTF-8 encoded -- confirm.
    """
    with open(path, 'r', encoding='utf-8') as corpus:
        return corpus.read().lower()


# Load datasets from file
english_corpus = read_corpus(europarl_english_path)
spanish_corpus = read_corpus(europarl_spanish_path)

# Split corpora into sentences (one per line) for later training and inference.
# split('\n') is kept on purpose: str.splitlines() also breaks on other Unicode
# line separators, which could misalign the two languages. If a file ends with
# a newline, the last entry of its list is an empty string.
english = english_corpus.split('\n')
spanish = spanish_corpus.split('\n')

assert len(english) == len(spanish), f"Number of sentences between both languages is not equal! {len(english):,} vs {len(spanish):,}"
total_sentences = len(english)

# Some basic statistics about the corpora.
# len() of a string counts characters, so the label says characters, not words.
print(f'Length in Characters: EN: {len(english_corpus):,} | ES: {len(spanish_corpus):,} | Difference: {abs(len(english_corpus) - len(spanish_corpus)):,}')
print(f'Number of Sentences: EN: {len(english):,} | ES: {len(spanish):,}')

# Show an example sentence translation.
# randrange() excludes its upper bound, so the index is always valid;
# randint(0, total_sentences) could return total_sentences and raise IndexError.
ran_sen = random.randrange(total_sentences)
sample_english = english[ran_sen]
sample_spanish = spanish[ran_sen]

print('Sample Translation:\n')
print(f'\t{sample_english}\n')
print(f'\t{sample_spanish}')


Length in Words: EN: 301,185,109 | ES: 325,981,216 | Difference: 24,796,107
Number of Sentences: EN: 2,009,074 | ES: 2,009,074
Sample Translation:

	both the criticism and the statement about the alleged failure of icao negotiations are incorrect and inappropriate.

	tanto la crítica como la declaración sobre el supuesto fracaso de las negociaciones de la oaci son incorrectas e inadecuadas.


## Tokenizing the Sentences and Collecting the Vocabulary

Tokenization splits each sentence into words and special characters so that the model can process them. In our case, we split on spaces and special characters. For example, the sentence `Today's dollar.` would be split into [`today`, `'`, `s`, `dollar`, `.`].

Further, the vocabulary is obtained for each language. This vocabulary will contain the `MAX_VOCABULARY` most frequent words in each corpus.

In [42]:
special_chars = ',?;.:/*!+-()[]{}"\'&'

# Character class matching any single special character. Compiled once so the
# pattern is not rebuilt for every sentence.
special_char_pattern = re.compile(f'[{re.escape(special_chars)}]')


def split_on_special_chars(sentences):
    """Split each sentence on single spaces, isolating special characters.

    Every special character is first padded with a space on each side, so
    splitting on ' ' turns it into its own token. The resulting lists may
    contain empty strings (from consecutive spaces).
    """
    # Raw string: '\g<0>' is the regex back-reference to the whole match. In a
    # non-raw literal '\g' is an invalid escape sequence and triggers a warning.
    return [special_char_pattern.sub(r' \g<0> ', s).split(' ') for s in sentences]


def drop_empty_tokens(sentences):
    """Remove the empty tokens that splitting on ' ' leaves behind."""
    return [[w for w in s if len(w)] for s in sentences]


def build_vocabulary(words, max_size):
    """Return up to `max_size` distinct words, ordered most frequent first."""
    return [word for word, _count in Counter(words).most_common(max_size)]


# NOTE(review): MAX_VOCABULARY is not defined in any cell visible here; it must
# be set before this cell runs, otherwise a fresh kernel raises NameError.

print('Working on English...')
english_sentences = split_on_special_chars(english)
# Removes tokens that were empty. This helps in separating special characters!
english_tokens = drop_empty_tokens(english_sentences)

# English vocabulary calculation
english_words = [w for s in english_tokens for w in s]
english_vocab = build_vocabulary(english_words, MAX_VOCABULARY)

print('Working on Spanish...')
spanish_sentences = split_on_special_chars(spanish)
# Removes tokens that were empty. This helps in separating special characters!
spanish_tokens = drop_empty_tokens(spanish_sentences)

# Spanish vocabulary calculation
spanish_words = [w for s in spanish_tokens for w in s]
spanish_vocab = build_vocabulary(spanish_words, MAX_VOCABULARY)

Working on English...
Working on Spanish...


## Save Languages

This is helpful for loading the languages later, skipping over all of the above processing from the raw files.

Each language is stored in a dictionary with three entries: the tokenized sentences, the vocabulary, and a flag identifying the language. Numerical information such as the number of sentences or `MAX_VOCABULARY` can be derived from the data itself, so it is not included.

In [49]:
import pickle

# Toggle: set to False to build the dictionaries without writing any file.
save_languages = True

# One bundle per language: the tokenized sentences, the vocabulary list and
# a short language code.
english_europarl = dict(
    tokens=english_tokens,
    vocabulary=english_vocab,
    language='en',
)
spanish_europarl = dict(
    tokens=spanish_tokens,
    vocabulary=spanish_vocab,
    language='es',
)

if save_languages:
    # Write each bundle to its own pickle file, English first.
    for pickle_path, bundle in (
        ('dataset/Europarl.en.pkl', english_europarl),
        ('dataset/Europarl.es.pkl', spanish_europarl),
    ):
        with open(pickle_path, 'wb') as jar:
            pickle.dump(bundle, jar)

In [50]:
# Display the stored English vocabulary; entries are ordered most frequent first.
english_europarl['vocabulary']

['the',
 ',',
 '.',
 'of',
 'to',
 'and',
 'in',
 'that',
 'a',
 'is',
 'for',
 'we',
 'i',
 'this',
 'on',
 'it',
 'be',
 '-',
 'are',
 'as',
 'have',
 'not',
 'with',
 'which',
 'european',
 'by',
 'will',
 "'",
 'has',
 'mr',
 'at',
 'commission',
 'an',
 'would',
 'also',
 'all',
 'but',
 'should',
 'our',
 'from',
 'president',
 'must',
 's',
 'there',
 'been',
 'you',
 'union',
 'parliament',
 'can',
 'states',
 'member',
 'more',
 'was',
 'report',
 'its',
 'or',
 'these',
 'they',
 'their',
 'do',
 'council',
 'like',
 'what',
 'very',
 '(',
 'one',
 'if',
 ')',
 'so',
 'europe',
 'eu',
 'countries',
 'us',
 'my',
 'no',
 'other',
 'about',
 'people',
 'need',
 'who',
 'only',
 'policy',
 'important',
 ':',
 'new',
 'time',
 'because',
 'now',
 'up',
 'such',
 'rights',
 '?',
 'out',
 'am',
 'therefore',
 'support',
 'when',
 'those',
 'however',
 'take',
 'make',
 'into',
 'between',
 'some',
 'work',
 'economic',
 'committee',
 'any',
 'being',
 'political',
 'them',
 'made',