# Setup

In [1]:
import numpy as np
import string 
import re 
import spacy
import nltk
from nltk.corpus import stopwords 
from nltk import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer 
import pandas as pd
from tqdm import tqdm

In [2]:
pd.set_option('display.max_colwidth',200)
tqdm.pandas()

nltk.download("stopwords")
nltk.download("punkt")
tqdm.pandas()

[nltk_data] Downloading package stopwords to /Users/aiman/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/aiman/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
from src.load import load_data_as_df
from src.preprocessing import remove_stop_words
import config

# Load Data

In [10]:
en_file = "data/europarl-v7.nl-en.en"
nl_file = "data/europarl-v7.nl-en.nl"
%time
    df = load_data_as_df(source_file=config.en_file, source_col=config.source_col, target_file=config.nl_file, target_col=config.target_col

CPU times: user 3 μs, sys: 2 μs, total: 5 μs
Wall time: 6.91 μs
Loading English Corpora from: data/europarl-v7.nl-en.en ...
Loading Dutch Corpora from: data/europarl-v7.nl-en.nl ...


In [12]:
len(df)

1997775

In [14]:
df.head()

Unnamed: 0,English,Dutch
0,Resumption of the session,Hervatting van de zitting
1,"I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999, and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant fest...","Ik verklaar de zitting van het Europees Parlement, die op vrijdag 17 december werd onderbroken, te zijn hervat. Ik wens u allen een gelukkig nieuwjaar en hoop dat u een goede vakantie heeft gehad."
2,"Although, as you will have seen, the dreaded 'millennium bug' failed to materialise, still the people in a number of countries suffered a series of natural disasters that truly were dreadful.","Zoals u heeft kunnen constateren, is de grote ""millenniumbug"" uitgebleven. De burgers van een aantal van onze lidstaten zijn daarentegen door verschrikkelijke natuurrampen getroffen."
3,"You have requested a debate on this subject in the course of the next few days, during this part-session.",U heeft aangegeven dat u deze vergaderperiode een debat wilt over deze rampen.
4,"In the meantime, I should like to observe a minute' s silence, as a number of Members have requested, on behalf of all the victims concerned, particularly those of the terrible storms, in the vari...",Nu wil ik graag op verzoek van een aantal collega's een minuut stilte in acht nemen ter nagedachtenis van de slachtoffers. Ik doel hiermee met name op de slachtoffers van het noodweer dat verschil...


# Preprocessing
To prepare the parallel English-Dutch corpus for model training, we applied a series of preprocessing steps designed to improve data quality while preserving translation context.

- Lowercasing
    - All text was converted to lowercase to reduce vocabulary size and ensure uniform representation.
- XML and Non-Text Removal
    - Lines starting with < (metadata/XML tags) and non-string/null values were removed.
- Punctuation and Stopwords
    - We retained punctuation (remove_punct=False) to preserve sentence structure and context important for translation (e.g., question marks, commas).

    - We kept stopwords (remove_stopwords=False) since they carry significant meaning in translation (e.g., prepositions and articles).
- Number Removal
    - Numeric digits were removed (remove_nums=True) to avoid noise from non-linguistic tokens not relevant to translation.
- Length Filtering
    - Sentences shorter than 3 characters or longer than 100 tokens were filtered out to reduce noise and extreme outliers.
- Duplicate and Empty Rows
    - Deleted repeated pairs and rows that became empty after cleaning.
These steps made the dataset cleaner while still keeping the words and structure needed for good translation.

In [66]:
def preprocess_text(text: str, lan: str) -> str:
    """
    Preprocesses a given text string by applying normalization, punctuation removal,
    stopword removal, and number removal based on configuration flags.

    Steps performed:
    1. Validates input: returns an empty string if input is not a valid non-empty string.
    2. Returns an empty string if the line starts with '<'.
    3. Converts text to lowercase and trims whitespace.
    4. Removes punctuation if `config.remove_punct` is True.
    5. Removes stopwords using the language-specific stopword list if `config.remove_stopwords` is True.
    6. Removes numeric digits if `config.remove_nums` is True.

    Args:
        text (str): The input text to preprocess.
        lan (str): The language for stopword removal ('English' or 'Dutch').

    Returns:
        str: The preprocessed text string.
    """
    if not isinstance(text, str) or not text.strip():
        return ""
    text = text.strip()
    if text.startswith("<"):
        return ""
    text = text.lower()

    if config.remove_punct:
        text = "".join(char for char in text if char not in string.punctuation)

    if config.remove_stopwords:
        stop_words = get_stopwords(lan)
        tokens = word_tokenize(text)
        text = " ".join(word for word in tokens if word not in stop_words)

    if config.remove_nums:
        text = re.sub(r"\d+", "", text)

    return text

def get_stopwords(lan: str) -> set:
    """
    Returns a set of stopwords for the given language.
    Supports only 'English' and 'Dutch'.
    """
    lan = lan.strip().capitalize()
    if lan == "English":
        return set(stopwords.words("english"))
    elif lan == "Dutch":
        nl_stop = set(stopwords.words("dutch"))
        nl_stop.update(config.custom_nl_stopwords)
        return nl_stop
    else:
        raise ValueError("Language must be 'English' or 'Dutch'")

In [68]:
def preprocess_dataframe(df, source_col, target_col):

    print(f"[Start Preprocessing] Total raw rows: {len(df):,}")

    # Remove non-strings and nulls
    df = df[df[source_col].apply(lambda x: isinstance(x, str))]
    df = df[df[target_col].apply(lambda x: isinstance(x, str))]
    df = df.dropna(subset=[source_col, target_col])
    print(f"[Step 1] Rows after removing nulls/non-strings: {len(df):,}")

    # Preprocess text
    print("[Step 2] Preprocessing each row ...")
    df[source_col] = df[source_col].progress_apply(lambda x: preprocess_text(x, source_col))
    df[target_col] = df[target_col].progress_apply(lambda x: preprocess_text(x, target_col))

    # Drop duplicates
    before = len(df)
    df = df.drop_duplicates(keep="first")
    print(f"[Step 3] Removed duplicates: {before - len(df):,} rows dropped ({len(df):,} remain)")

    # Remove rows with too short strings
    before = len(df)
    df = df[(df[source_col].str.strip() != "") & (df[target_col].str.strip() != "")]
    df = df[(df[source_col].str.len() >= config.min_len_chars) &
            (df[target_col].str.len() >= config.min_len_chars)]
    print(f"[Step 4] Removed empty/short rows: {before - len(df):,} rows dropped ({len(df):,} remain)")

    # Remove overly long sentences
    before = len(df)
    df = df[df.apply(
        lambda row: len(row[source_col].split()) <= config.max_len_tokens and
                    len(row[target_col].split()) <= config.max_len_tokens, axis=1)]
    print(f"[Step 5] Removed overly long rows: {before - len(df):,} rows dropped ({len(df):,} remain)")

    print(f"[Done] Final preprocessed rows: {len(df):,}\n")
    return df

In [70]:
df_clean = preprocess_dataframe(df=df, source_col=config.source_col, target_col=config.target_col)

[Start Preprocessing] Total raw rows: 1,997,775
[Step 1] Rows after removing nulls/non-strings: 1,997,775
[Step 2] Preprocessing each row ...


100%|███████████████████████████████████████████████████████████████████████████████████████████████| 1997775/1997775 [00:08<00:00, 227191.44it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████| 1997775/1997775 [00:08<00:00, 229131.15it/s]


[Step 3] Removed duplicates: 44,105 rows dropped (1,953,670 remain)
[Step 4] Removed empty/short rows: 17,887 rows dropped (1,935,783 remain)
[Step 5] Removed overly long rows: 3,629 rows dropped (1,932,154 remain)
[Done] Final preprocessed rows: 1,932,154



In [72]:
# Sample a fraction
df_sampled = df_clean.sample(frac=config.sample_frac, random_state=config.random_state).reset_index(drop=True)
df_sampled

Unnamed: 0,English,Dutch
0,"i also agree with the rapporteur in expressing the desire that rules covering the remaining risk factors for health and safety in the workplace - that is, electromagnetic fields and optical radiat...","ik ben het eens met de rapporteur als deze de wens uitspreekt dat er spoedig regels zullen worden uitgevaardigd met betrekking tot andere risicofactoren op het werk, zoals elektromagnetische velde..."
1,"obviously, it is important that europe' s researchers cooperate and that research efforts are coordinated to some degree.",uiteraard is het belangrijk dat de europese onderzoekers samenwerken en de onderzoeksinspanningen tot op zekere hoogte worden gecoördineerd.
2,"however, i was forced to recognise that this was unlikely to be supported by a majority in this house, for of course it is quite correct that setting up a new funding instrument takes a great deal...","ik heb echter moeten inzien dat ik voor dit standpunt geen meerderheid zou kunnen krijgen, want het kost onder andere natuurlijk veel tijd om een nieuw stimuleringsinstrument te creëren. en zo vee..."
3,"the latter agreement will also encompass joint border patrols for the surveillance of the green border, that is between the authorised border crossing points.","het laatstgenoemde akkoord omvat ook gezamenlijke grenscontroles ter bewaking van de groene grens, dat wil zeggen de gebieden tussen de officiële grensovergangen."
4,"lastly, we maintain that it is important to open negotiations with important trading partners in asia, including the people's republic of china and india, since the economic growth of that entire ...","tot slot achten wij het van groot belang dat er onderhandelingen worden geopend met belangrijke aziatische handelspartners, waaronder de volksrepubliek china en india, aangezien dergelijke overeen..."
...,...,...
19317,let me make a second comment on the substance of your speech.,"ik wil nu een opmerking over de inhoud van uw toespraak maken. eigenlijk kan men alles wat u gezegd hebt onderstrepen, mijnheer de eerste minister."
19318,the principle that has been retained will stand or fall on that.,daarmee staat of valt de goedgekeurde regeling.
19319,"i do not agree because all the european union' s behaviour up until now has been nothing but a huge demonstration of good faith, to the extent that, contrary to what had previously been laid down,...",ik ga hiermee niet akkoord omdat de houding van de europese unie tot dusver één grote uiting van goede trouw is geweest. wij zijn hierin zelfs zo ver gegaan dat wij in tegenstelling tot de oorspro...
19320,"the conclusion of these negotiations, together with approximation of the relevant legislation, will mark a new stage in relations between the eu and tunisia and progress towards the integration of...","de afronding van deze onderhandelingen zal, samen met de afstemming van de betreffende wetgeving, een nieuwe fase inluiden in de betrekkingen tussen de eu en tunesië en de integratie van de tunesi..."


In [73]:
import spacy

nl_nlp = spacy.load("nl_core_news_sm")  
en_nlp = spacy.load("en_core_web_sm")   

In [74]:
def tokenizer_en(text):
    return [token.text for token in en_nlp.tokenizer(text)]

def tokenizer_nl(text):
    return [token.text for token in nl_nlp.tokenizer(text)]

In [75]:
tokenized_en = df_sampled["English"].progress_apply(tokenizer_en)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 19322/19322 [00:01<00:00, 18441.42it/s]


In [76]:
tokenized_nl = df_sampled["Dutch"].progress_apply(tokenizer_nl)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 19322/19322 [00:01<00:00, 15241.51it/s]


In [77]:
df_sampled.head()

Unnamed: 0,English,Dutch
0,"i also agree with the rapporteur in expressing the desire that rules covering the remaining risk factors for health and safety in the workplace - that is, electromagnetic fields and optical radiat...","ik ben het eens met de rapporteur als deze de wens uitspreekt dat er spoedig regels zullen worden uitgevaardigd met betrekking tot andere risicofactoren op het werk, zoals elektromagnetische velde..."
1,"obviously, it is important that europe' s researchers cooperate and that research efforts are coordinated to some degree.",uiteraard is het belangrijk dat de europese onderzoekers samenwerken en de onderzoeksinspanningen tot op zekere hoogte worden gecoördineerd.
2,"however, i was forced to recognise that this was unlikely to be supported by a majority in this house, for of course it is quite correct that setting up a new funding instrument takes a great deal...","ik heb echter moeten inzien dat ik voor dit standpunt geen meerderheid zou kunnen krijgen, want het kost onder andere natuurlijk veel tijd om een nieuw stimuleringsinstrument te creëren. en zo vee..."
3,"the latter agreement will also encompass joint border patrols for the surveillance of the green border, that is between the authorised border crossing points.","het laatstgenoemde akkoord omvat ook gezamenlijke grenscontroles ter bewaking van de groene grens, dat wil zeggen de gebieden tussen de officiële grensovergangen."
4,"lastly, we maintain that it is important to open negotiations with important trading partners in asia, including the people's republic of china and india, since the economic growth of that entire ...","tot slot achten wij het van groot belang dat er onderhandelingen worden geopend met belangrijke aziatische handelspartners, waaronder de volksrepubliek china en india, aangezien dergelijke overeen..."


In [90]:
tokenized_en.head()

0    [i, also, agree, with, the, rapporteur, in, expressing, the, desire, that, rules, covering, the, remaining, risk, factors, for, health, and, safety, in, the, workplace, -, that, is, ,, electromagn...
1                                                         [obviously, ,, it, is, important, that, europe, ', s, researchers, cooperate, and, that, research, efforts, are, coordinated, to, some, degree, .]
2    [however, ,, i, was, forced, to, recognise, that, this, was, unlikely, to, be, supported, by, a, majority, in, this, house, ,, for, of, course, it, is, quite, correct, that, setting, up, a, new, f...
3                [the, latter, agreement, will, also, encompass, joint, border, patrols, for, the, surveillance, of, the, green, border, ,, that, is, between, the, authorised, border, crossing, points, .]
4    [lastly, ,, we, maintain, that, it, is, important, to, open, negotiations, with, important, trading, partners, in, asia, ,, including, the, people, 's, republic, of, china, an

In [92]:
tokenized_nl.head()

0    [ik, ben, het, eens, met, de, rapporteur, als, deze, de, wens, uitspreekt, dat, er, spoedig, regels, zullen, worden, uitgevaardigd, met, betrekking, tot, andere, risicofactoren, op, het, werk, ,, ...
1                                          [uiteraard, is, het, belangrijk, dat, de, europese, onderzoekers, samenwerken, en, de, onderzoeksinspanningen, tot, op, zekere, hoogte, worden, gecoördineerd, .]
2    [ik, heb, echter, moeten, inzien, dat, ik, voor, dit, standpunt, geen, meerderheid, zou, kunnen, krijgen, ,, want, het, kost, onder, andere, natuurlijk, veel, tijd, om, een, nieuw, stimuleringsins...
3              [het, laatstgenoemde, akkoord, omvat, ook, gezamenlijke, grenscontroles, ter, bewaking, van, de, groene, grens, ,, dat, wil, zeggen, de, gebieden, tussen, de, officiële, grensovergangen, .]
4    [tot, slot, achten, wij, het, van, groot, belang, dat, er, onderhandelingen, worden, geopend, met, belangrijke, aziatische, handelspartners, ,, waaronder, de, volksrepubliek, 

# Word Embedding Example

In [113]:
import torch
import numpy as np

def load_embeddings(file_path, embedding_dim=300):
    stoi = {}
    itos = []
    vectors = []

    with open(file_path, "r", encoding="utf-8") as f:
        for line_num, line in enumerate(f):
            values = line.strip().split()

            # Skip header if present (common in FastText)
            if line_num == 0 and len(values) == 2 and values[0].isdigit():
                print(f"[Info] Skipping header line: vocab_size={values[0]}, dim={values[1]}")
                continue

            word = values[0]
            vector = list(map(float, values[1:]))

            if len(vector) != embedding_dim:
                continue  # Skip malformed rows

            stoi[word] = len(itos)
            itos.append(word)
            vectors.append(vector)

    # Add special tokens
    for token in ["<pad>", "<sos>", "<eos>", "<unk>"]:
        if token not in stoi:
            stoi[token] = len(itos)
            itos.append(token)
            vectors.append([0.0] * embedding_dim)

    vectors = torch.tensor(np.array(vectors), dtype=torch.float32)
    print(f"[Done] Loaded {len(itos):,} tokens with {embedding_dim}-dim embeddings\n")

    return stoi, itos, vectors


In [115]:
def sentence_to_tensor(sentence, stoi, vectors):
    tokens = sentence.lower().split()
    indices = [stoi.get(token, stoi["<unk>"]) for token in tokens]
    return vectors[indices]  # shape: (len, emb_dim)

In [121]:
glove_path = "embedding_models/glove.6B.300d.txt"  # path to GloVe
stoi, itos, vectors = load_embeddings(glove_path)

[Done] Loaded 400,004 tokens with 300-dim embeddings



In [122]:
vectors

tensor([[ 0.0466,  0.2132, -0.0074,  ...,  0.0091, -0.2099,  0.0539],
        [-0.2554, -0.2572,  0.1317,  ..., -0.2329, -0.1223,  0.3550],
        [-0.1256,  0.0136,  0.1031,  ..., -0.3422, -0.0224,  0.1368],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]])

In [123]:
sentence = "I like deep learning"
tensor = sentence_to_tensor(sentence, stoi, vectors)  # shape (len, 300)
print(tensor.shape)  # e.g., torch.Size([4, 300])

torch.Size([4, 300])
