In [54]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# data split
from sklearn.model_selection import train_test_split

import tensorflow as tf

# nlp preprocessing
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import wordpunct_tokenize
from nltk.tokenize import RegexpTokenizer
import contractions # to deal with english contractions

[nltk_data] Downloading package stopwords to /Users/ygao/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/ygao/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [23]:
EN_FILE_PATH = '../data/nl-en/europarl-v7.nl-en.en'
NL_FILE_PATH = '../data/nl-en/europarl-v7.nl-en.nl'
RANDOM_SEED = 297

In [14]:
# load data into data frames
def load(file_path):
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            lines = file.readlines()
        lines = [line.strip() for line in lines]
        data = pd.DataFrame({'Text': lines})
        return data
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found.")
        return None
    except Exception as e:
        print(f"Error loading file '{file_path}': {e}")
        return None

In [36]:
# custom print function to check the dataframe
def printCheck(df):
    for index, row in df.head().iterrows():
        print(f"{index}: {row['Text'][:1000]}")

In [27]:
en_data = load(EN_FILE_PATH)
nl_data = load(NL_FILE_PATH)
print('English Corpus')
printCheck(en_data)
print('Dutch Corpus')
printCheck(nl_data)

English Corpus
0: Resumption of the session
1: I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999, and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period.
2: Although, as you will have seen, the dreaded 'millennium bug' failed to materialise, still the people in a number of countries suffered a series of natural disasters that truly were dreadful.
3: You have requested a debate on this subject in the course of the next few days, during this part-session.
4: In the meantime, I should like to observe a minute' s silence, as a number of Members have requested, on behalf of all the victims concerned, particularly those of the terrible storms, in the various countries of the European Union.
Dutch Corpus
0: Hervatting van de zitting
1: Ik verklaar de zitting van het Europees Parlement, die op vrijdag 17 december werd onderbroken, te zijn hervat. Ik wens u allen een gelukkig nieuwjaar en hoop d

In [34]:
# train, val, test split
# reorder and reassign the indeces
en_train, en_test, nl_train, nl_test = train_test_split(en_data, nl_data, test_size=0.1, random_state=RANDOM_SEED)
en_test.sort_index(inplace=True, ignore_index=True)
nl_test.sort_index(inplace=True, ignore_index=True)
en_train, en_val, nl_train, nl_val = train_test_split(en_train, nl_train, test_size=0.11, random_state=RANDOM_SEED)
en_train.sort_index(inplace=True, ignore_index=True)
nl_train.sort_index(inplace=True, ignore_index=True)
en_val.sort_index(inplace=True, ignore_index=True)
nl_val.sort_index(inplace=True, ignore_index=True)

In [35]:
printCheck(en_train)
printCheck(nl_train)

0: I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999, and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period.
1: In the meantime, I should like to observe a minute' s silence, as a number of Members have requested, on behalf of all the victims concerned, particularly those of the terrible storms, in the various countries of the European Union.
2: Please rise, then, for this minute' s silence.
3: (The House rose and observed a minute' s silence)
4: Madam President, on a point of order.
0: Ik verklaar de zitting van het Europees Parlement, die op vrijdag 17 december werd onderbroken, te zijn hervat. Ik wens u allen een gelukkig nieuwjaar en hoop dat u een goede vakantie heeft gehad.
1: Nu wil ik graag op verzoek van een aantal collega's een minuut stilte in acht nemen ter nagedachtenis van de slachtoffers. Ik doel hiermee met name op de slachtoffers van het noodweer dat verschillende lids

In [58]:
# clean text

def clean_text(df, eng = True):
    cleaned_texts = []
    lemmatizer = WordNetLemmatizer()
    if eng:
        stop_words = stopwords.words('english')
    else:
        stop_words = stopwords.words('dutch')
        
    for index, row in df.iterrows():
        text = row['Text']
        text = text.lower() #covert to lower case
        text = contractions.fix(text) #extend english contractions
        text = re.sub(r'[^A-Za-z0-9\s\'s]', '', text) #keep only alphabets, numbers, and space, maybe add in' as well for possessive
        
        tokens = wordpunct_tokenize(text) #tokenize by words
        # tokens = [token for token in tokens if token not in stop_words] #delete stop words (maybe not deleting stopwords)
        cleaned_text = ' '.join(tokens)
        cleaned_texts.append(cleaned_text)
    #return the text in a dataframe
    df_cleaned = pd.DataFrame(cleaned_texts, columns=['Text'])
    return df_cleaned

In [59]:
# clean text and check
en_train_clean = clean_text(en_train)
nl_train_clean = clean_text(nl_train, eng=False)

In [60]:
printCheck(en_train_clean)
printCheck(nl_train_clean)

0: i declare resumed the session of the european parliament adjourned on friday 17 december 1999 and i would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period
1: in the meantime i should like to observe a minute ' s silence as a number of members have requested on behalf of all the victims concerned particularly those of the terrible storms in the various countries of the european union
2: please rise then for this minute ' s silence
3: the house rose and observed a minute ' s silence
4: madam president on a point of order
0: ik verklaar de zitting van het europees parlement die op vrijdag 17 december werd onderbroken te zijn hervat ik wens you allen een gelukkig nieuwjaar en hoop dat you een goede vakantie heeft gehad
1: nu wil ik graag op verzoek van een aantal collega ' s een minuut stilte in acht nemen ter nagedachtenis van de slachtoffers ik doel hiermee met name op de slachtoffers van het noodweer dat verschillende lidstaten van d