# Setup

In [130]:
import numpy as np
import string 
import re 
import spacy
import nltk
from nltk.corpus import stopwords 
from nltk import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer 

import pandas as pd
# Show up to 200 characters per cell so long sentences stay readable in previews.
pd.set_option("display.max_colwidth", 200)

In [132]:
from src.load import load_data_as_df
from src.preprocessing import remove_stop_words
import config

# Load Data

In [135]:
en_file = "data/europarl-v7.nl-en.en"
nl_file = "data/europarl-v7.nl-en.nl"
%time
df = load_data_as_df(en_file, nl_file)

CPU times: user 3 μs, sys: 1 μs, total: 4 μs
Wall time: 10 μs
Loading English Corpora from: data/europarl-v7.nl-en.en ...
Loading Dutch Corpora from: data/europarl-v7.nl-en.nl ...


In [136]:
# Total number of rows in the freshly loaded frame.
len(df)

1997775

In [137]:
# Preview the first rows of the raw English/Dutch columns.
df.head()

Unnamed: 0,English,Dutch
0,Resumption of the session,Hervatting van de zitting
1,"I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999, and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant fest...","Ik verklaar de zitting van het Europees Parlement, die op vrijdag 17 december werd onderbroken, te zijn hervat. Ik wens u allen een gelukkig nieuwjaar en hoop dat u een goede vakantie heeft gehad."
2,"Although, as you will have seen, the dreaded 'millennium bug' failed to materialise, still the people in a number of countries suffered a series of natural disasters that truly were dreadful.","Zoals u heeft kunnen constateren, is de grote ""millenniumbug"" uitgebleven. De burgers van een aantal van onze lidstaten zijn daarentegen door verschrikkelijke natuurrampen getroffen."
3,"You have requested a debate on this subject in the course of the next few days, during this part-session.",U heeft aangegeven dat u deze vergaderperiode een debat wilt over deze rampen.
4,"In the meantime, I should like to observe a minute' s silence, as a number of Members have requested, on behalf of all the victims concerned, particularly those of the terrible storms, in the vari...",Nu wil ik graag op verzoek van een aantal collega's een minuut stilte in acht nemen ter nagedachtenis van de slachtoffers. Ik doel hiermee met name op de slachtoffers van het noodweer dat verschil...


# Preprocessing

In [139]:
# NLTK stop-word lists, held as sets so membership checks during token
# filtering are constant-time.
en_stop_words = set(stopwords.words("english"))
# Dutch list extended with three extra pronouns
# (presumably absent from the NLTK list -- TODO confirm).
nl_stop_words = set(stopwords.words("dutch")) | {"we", "wij", "onze"}

In [140]:
# Translation table that deletes every ASCII punctuation character. Built once
# here so it is not recomputed for each row the function is applied to.
_PUNCT_TABLE = str.maketrans("", "", string.punctuation)


def preprocess(text, stop_words):
    """Clean a single sentence according to the flags in ``config``.

    Steps, in order:
      1. lowercase and strip surrounding whitespace;
      2. discard everything from the first ``<`` onwards;
      3. delete ASCII punctuation when ``config.remove_punct`` is set;
      4. drop tokens contained in ``stop_words`` when
         ``config.remove_stopwords`` is set;
      5. delete digit runs when ``config.remove_nums`` is set;
      6. collapse runs of whitespace into single spaces.

    Args:
        text: Raw sentence. Non-string values (e.g. ``None`` or NaN) are
            treated as empty.
        stop_words: Container of lowercase tokens to drop (a ``set`` gives
            constant-time lookups).

    Returns:
        The cleaned sentence, or ``""`` when nothing is left so the row can be
        dropped later.
    """
    # Missing values would otherwise raise AttributeError on .lower().
    if not isinstance(text, str):
        return ""

    # Always lowercase and trim, then keep only what precedes the first '<'.
    text = text.lower().strip()
    text = text.split("<", 1)[0].strip()
    if not text:
        return ""  # Leave empty for later row drop

    if config.remove_punct:
        text = text.translate(_PUNCT_TABLE)

    if config.remove_stopwords:
        # NOTE(review): word_tokenize runs with its default language for both
        # corpora; confirm this is intended for the Dutch column.
        text = " ".join(tok for tok in word_tokenize(text) if tok not in stop_words)

    if config.remove_nums:
        text = re.sub(r"\d+", "", text)

    # Removing punctuation/digits can leave doubled spaces or a whitespace-only
    # string; normalise so such rows compare equal to "" and get dropped later.
    return " ".join(text.split())

In [141]:
# Clean every English sentence; the stop-word set is forwarded as the second
# positional argument of preprocess.
df["English"] = df["English"].apply(preprocess, args=(en_stop_words,))

In [142]:
# Clean every Dutch sentence; the stop-word set is forwarded as the second
# positional argument of preprocess.
df["Dutch"] = df["Dutch"].apply(preprocess, args=(nl_stop_words,))

In [148]:
# Inspect the first 12 rows after both columns have been preprocessed.
df.head(12)

Unnamed: 0,English,Dutch
0,resumption session,hervatting zitting
1,declare resumed session european parliament adjourned friday 17 december 1999 would like wish happy new year hope enjoyed pleasant festive period,verklaar zitting europees parlement vrijdag 17 december onderbroken hervat wens allen gelukkig nieuwjaar hoop goede vakantie gehad
2,although seen dreaded millennium bug failed materialise still people number countries suffered series natural disasters truly dreadful,zoals constateren grote millenniumbug uitgebleven burgers aantal lidstaten daarentegen verschrikkelijke natuurrampen getroffen
3,requested debate subject course next days partsession,aangegeven vergaderperiode debat wilt rampen
4,meantime like observe minute silence number members requested behalf victims concerned particularly terrible storms various countries european union,graag verzoek aantal collegas minuut stilte acht nemen ter nagedachtenis slachtoffers doel hiermee name slachtoffers noodweer verschillende lidstaten unie geteisterd
5,please rise minute silence,vragen minuut stilte staande acht nemen
6,house rose observed minute silence,parlement neemt staande minuut stilte acht
7,madam president point order,mevrouw voorzitter motie orde stellen
8,aware press television number bomb explosions killings sri lanka,zult via media vernomen sri lanka aantal bomexplosies schietpartijen voorgedaan
9,one people assassinated recently sri lanka mr kumar ponnambalam visited european parliament months ago,mensen zeer recent sri lanka vermoord heer kumar ponnambalam paar maanden geleden bezoek bracht europees parlement


## Remove Duplicates

In [150]:
# Count rows that repeat an earlier row (the first occurrence is not counted;
# keep="first" is the pandas default).
df.duplicated().sum()

49065

In [151]:
# Drop repeated rows, retaining the first occurrence of each
# (keep="first" is the pandas default).
df = df.drop_duplicates()
df.head()

Unnamed: 0,English,Dutch
0,resumption session,hervatting zitting
1,declare resumed session european parliament adjourned friday 17 december 1999 would like wish happy new year hope enjoyed pleasant festive period,verklaar zitting europees parlement vrijdag 17 december onderbroken hervat wens allen gelukkig nieuwjaar hoop goede vakantie gehad
2,although seen dreaded millennium bug failed materialise still people number countries suffered series natural disasters truly dreadful,zoals constateren grote millenniumbug uitgebleven burgers aantal lidstaten daarentegen verschrikkelijke natuurrampen getroffen
3,requested debate subject course next days partsession,aangegeven vergaderperiode debat wilt rampen
4,meantime like observe minute silence number members requested behalf victims concerned particularly terrible storms various countries european union,graag verzoek aantal collegas minuut stilte acht nemen ter nagedachtenis slachtoffers doel hiermee name slachtoffers noodweer verschillende lidstaten unie geteisterd


In [152]:
# Row count after dropping duplicate rows.
len(df)

1948710

## Remove Empty Rows

In [154]:
before_rows = len(df)
# Drop rows where either column is empty. Cells are stripped before the
# comparison so whitespace-only strings count as empty as well.
has_text = df["English"].str.strip().ne("") & df["Dutch"].str.strip().ne("")
df = df[has_text]
after_rows = len(df)

print(f"Deleted {before_rows - after_rows} rows due to empty cells")

Deleted 18611 rows due to empty cells


In [162]:
# Row count after dropping rows with an empty English or Dutch cell.
len(df)

1930099