In [2]:
import os
import pandas as pd
import csv
import numpy as np
import matplotlib.pyplot as plt

# data split
from sklearn.model_selection import train_test_split

# nlp preprocessing
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import contractions # to deal with english contractions

import tensorflow as tf
import tensorflow.keras
from tensorflow.keras.preprocessing.sequence import pad_sequences

from collections import Counter

from transformers import AutoTokenizer

[nltk_data] Error loading stopwords: <urlopen error [Errno 8] nodename
[nltk_data]     nor servname provided, or not known>
[nltk_data] Error loading wordnet: <urlopen error [Errno 8] nodename
[nltk_data]     nor servname provided, or not known>
[nltk_data] Error loading punkt: <urlopen error [Errno 8] nodename nor
[nltk_data]     servname provided, or not known>


In [3]:
EN_FILE_PATH = '../data/nl-en/europarl-v7.nl-en.en'
NL_FILE_PATH = '../data/nl-en/europarl-v7.nl-en.nl'
CLEAN_DATA_PATH = '../data/clean/'

RANDOM_SEED = 297
MAX_INPUT_LENGTH = 50

In [4]:
# load data into data frames
def load(file_path):
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            lines = file.readlines()
        lines = [line.strip() for line in lines]
        data = pd.DataFrame({'Text': lines})
        return data
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found.")
        return None
    except Exception as e:
        print(f"Error loading file '{file_path}': {e}")
        return None

In [13]:
# custom print function to check the dataframe
def print_check(df):
    for index, row in df.head().iterrows():
        print(f"{index}: {row['Text'][:1000]}")

In [14]:
en_data = load(EN_FILE_PATH)
nl_data = load(NL_FILE_PATH)
print('English Corpus')
print_check(en_data)
print('Dutch Corpus')
print_check(nl_data)

English Corpus
0: Resumption of the session
1: I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999, and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period.
2: Although, as you will have seen, the dreaded 'millennium bug' failed to materialise, still the people in a number of countries suffered a series of natural disasters that truly were dreadful.
3: You have requested a debate on this subject in the course of the next few days, during this part-session.
4: In the meantime, I should like to observe a minute' s silence, as a number of Members have requested, on behalf of all the victims concerned, particularly those of the terrible storms, in the various countries of the European Union.
Dutch Corpus
0: Hervatting van de zitting
1: Ik verklaar de zitting van het Europees Parlement, die op vrijdag 17 december werd onderbroken, te zijn hervat. Ik wens u allen een gelukkig nieuwjaar en hoop d

In [15]:
# train, val, test split
# reorder and reassign the indeces
en_train, en_test, nl_train, nl_test = train_test_split(en_data, nl_data, test_size=0.1, random_state=RANDOM_SEED)
en_test.sort_index(inplace=True, ignore_index=True)
nl_test.sort_index(inplace=True, ignore_index=True)
en_train, en_val, nl_train, nl_val = train_test_split(en_train, nl_train, test_size=0.11, random_state=RANDOM_SEED)
en_train.sort_index(inplace=True, ignore_index=True)
nl_train.sort_index(inplace=True, ignore_index=True)
en_val.sort_index(inplace=True, ignore_index=True)
nl_val.sort_index(inplace=True, ignore_index=True)

In [16]:
print_check(en_train)
print_check(nl_train)

0: I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999, and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period.
1: In the meantime, I should like to observe a minute' s silence, as a number of Members have requested, on behalf of all the victims concerned, particularly those of the terrible storms, in the various countries of the European Union.
2: Please rise, then, for this minute' s silence.
3: (The House rose and observed a minute' s silence)
4: Madam President, on a point of order.
0: Ik verklaar de zitting van het Europees Parlement, die op vrijdag 17 december werd onderbroken, te zijn hervat. Ik wens u allen een gelukkig nieuwjaar en hoop dat u een goede vakantie heeft gehad.
1: Nu wil ik graag op verzoek van een aantal collega's een minuut stilte in acht nemen ter nagedachtenis van de slachtoffers. Ik doel hiermee met name op de slachtoffers van het noodweer dat verschillende lids

In [38]:
og_train_set = pd.concat([en_train,nl_train],axis=1)
og_train_set.columns = ['inputs', 'targets']
og_val_set = pd.concat([en_val,nl_val],axis=1)
og_val_set.columns = ['inputs', 'targets']

In [40]:
# clean text

def clean_text(df):
    cleaned_inputs = []
    cleaned_targets = []
        
    for index, row in df.iterrows():
        input_text = row['inputs']
        target_text = row['targets']
        
        input_text = re.sub(r'<.*?>', '', input_text) #remove html tag
        input_text = re.sub(r'http\S+', '', input_text) #remove url
        input_text = re.sub(r'\[.*?\]', '', input_text) #remove everything in between square brackets
        input_text = re.sub(r'\(.*?\)', '', input_text) #remove everything in between parentheses
        input_text = re.sub(r'[A_Z]\d-\d{4}\/\d{4}', '', input_text) #deleting more weird stuff
        # input_text = contractions.fix(input_text) #extend english contractions
        input_text = re.sub(r'[^A-Za-z0-9\s\'s]', '', input_text) #keep only alphabets, numbers, and space, maybe add in' as well for possessive
        input_text = input_text.lower() #covert to lower case
        cleaned_inputs.append(input_text)
        
        target_text = re.sub(r'<.*?>', '', target_text) #remove html tag
        target_text = re.sub(r'http\S+', '', target_text) #remove url
        target_text = re.sub(r'\[.*?\]', '', target_text) #remove everything in between square brackets
        target_text = re.sub(r'\(.*?\)', '', target_text) #remove everything in between parentheses
        target_text = re.sub(r'[A_Z]\d-\d{4}\/\d{4}', '', target_text) #deleting more weird stuff
        target_text = re.sub(r'[^A-Za-z0-9\s\'s]', '', target_text) #keep only alphabets, numbers, and space, maybe add in' as well for possessive
        target_text = target_text.lower() #covert to lower case
        cleaned_targets.append(target_text)
        
    #remove empty
    cleaned_df = pd.DataFrame({'inputs': cleaned_inputs, 'targets': cleaned_targets})
    cleaned_df = cleaned_df[(cleaned_df['inputs'].str.strip() != '') | (cleaned_df['targets'].str.strip() != '')]
    return cleaned_df

In [41]:
# save df
def save_df(df, path):
    df_name = [name for name, val in globals().items() if val is df][0]
    file_name = f"{df_name}.csv"
    file_path = os.path.join(path, file_name)
    df.to_csv(file_path, index=False)

In [42]:
# clean text and check
train_cleaned = clean_text(og_train_set)
val_cleaned = clean_text(og_val_set)

In [257]:
save_df(en_train, CLEAN_DATA_PATH)
save_df(nl_train, CLEAN_DATA_PATH)
save_df(en_val, CLEAN_DATA_PATH)
save_df(nl_val, CLEAN_DATA_PATH)
save_df(en_test, CLEAN_DATA_PATH)
save_df(nl_test, CLEAN_DATA_PATH)

In [44]:
save_df(og_train_set, CLEAN_DATA_PATH)
save_df(og_val_set, CLEAN_DATA_PATH)

In [45]:
save_df(train_cleaned, CLEAN_DATA_PATH)
save_df(val_cleaned, CLEAN_DATA_PATH)

In [46]:
# tokenize and add in special tokens
def tokenize_text(df):
    inputs_tk = []
    targets_tk = []
    for index, row in df.iterrows():
        inputs = row['inputs']
        inputs_tokens = word_tokenize(inputs)
        inputs_tokens = ['<bos>'] + inputs_tokens + ['<eos>']
        inputs_tk.append(inputs_tokens)
        
        targets = row['targets']
        targets_tokens = word_tokenize(targets)
        targets_tokens = ['<bos>'] + targets_tokens + ['<eos>']
        targets_tk.append(targets_tokens)
    
    tokenized_df = pd.DataFrame({'inputs': inputs_tk, 'targets': targets_tk})
    return tokenized_df


In [47]:
train_tokenized = tokenize_text(train_cleaned)
val_tokenized = tokenize_text(val_cleaned)

In [52]:
save_df(train_tokenized, CLEAN_DATA_PATH)
save_df(val_tokenized, CLEAN_DATA_PATH)

In [49]:
print(train_tokenized.head())

                                              inputs  \
0  [<bos>, i, declare, resumed, the, session, of,...   
1  [<bos>, in, the, meantime, i, should, like, to...   
2  [<bos>, please, rise, then, for, this, minute,...   
3  [<bos>, madam, president, on, a, point, of, or...   
4  [<bos>, you, will, be, aware, from, the, press...   

                                             targets  
0  [<bos>, ik, verklaar, de, zitting, van, het, e...  
1  [<bos>, nu, wil, ik, graag, op, verzoek, van, ...  
2  [<bos>, ik, wil, u, vragen, deze, minuut, stil...  
3  [<bos>, mevrouw, de, voorzitter, ik, wil, een,...  
4  [<bos>, u, zult, via, de, media, hebben, verno...  


In [54]:
# vocabulary
def word_index(column):
    wordcount = Counter()
    for sentence in column:
        wordcount.update(sentence)
    
    vocab = [word for word, count in wordcount.most_common()]
    word_to_index = {word: idx for idx, word in enumerate(vocab)}
    index_to_word = {idx: word for idx, word in enumerate(vocab)}
    
    return word_to_index, index_to_word

In [55]:
en_word2idx, en_idx2word = word_index(train_tokenized['inputs'])
nl_word2idx, nl_idx2word = word_index(train_tokenized['targets'])

In [56]:
print(en_word2idx)



In [67]:
# convert text to vec
def convert_text(df, en_w2idx=en_word2idx, nl_w2idx=nl_word2idx):
    en = []
    nl = []
    
    for index, row in df.iterrows():
        en_sentence = row['inputs']
        en_vec = [en_w2idx.get(word, 0) for word in en_sentence] #unknown vocab -> 0
        en.append(en_vec)
        
        nl_sentence = row['targets']
        nl_vec = [nl_w2idx.get(word, 0) for word in nl_sentence]
        nl.append(nl_vec)
        
    vectorized_df = pd.DataFrame({'inputs': en, 'targets': nl})
    return vectorized_df

In [68]:
train_vectorized = convert_text(train_tokenized)
val_vectorized = convert_text(val_tokenized)

In [69]:
print(train_vectorized.head())

                                              inputs  \
0  [1, 12, 2648, 4626, 0, 1598, 3, 0, 23, 46, 483...   
1  [1, 6, 0, 3057, 12, 35, 59, 4, 2940, 8, 2826, ...   
2  [1, 1382, 1177, 188, 10, 13, 2826, 70, 348, 36...   
3                [1, 233, 40, 14, 8, 133, 3, 138, 2]   
4  [1, 43, 24, 16, 459, 38, 0, 1196, 5, 2041, 7, ...   

                                             targets  
0  [2, 13, 4454, 0, 1747, 1, 4, 69, 41, 14, 12, 3...  
1  [2, 63, 53, 13, 207, 12, 418, 1, 8, 120, 153, ...  
2  [2, 13, 53, 45, 201, 26, 2870, 5652, 4669, 6, ...  
3    [2, 86, 0, 39, 13, 53, 8, 2118, 1, 292, 137, 3]  
4  [2, 45, 1186, 342, 0, 1127, 29, 4379, 7, 25, 6...  


In [87]:
# pad the sentences
def padding_sequence(df, max_len=MAX_INPUT_LENGTH, padding_value = 0):
    padded_sequences = []
    for index, row in df.iterrows():
        inputs = row['inputs']
        targets = row['targets']
            
        padded_inputs = tf.keras.preprocessing.sequence.pad_sequences(
            [inputs], maxlen=max_len, padding='post', truncating='post', value=padding_value
        )[0]
        padded_targets = tf.keras.preprocessing.sequence.pad_sequences(
            [targets], maxlen=max_len, padding='post', truncating='post', value=padding_value
        )[0]
        padded_sequences.append({'inputs': padded_inputs, 'targets': padded_targets})
    return pd.DataFrame(padded_sequences)

In [88]:
train_padded = padding_sequence(train_vectorized, MAX_INPUT_LENGTH)
val_padded = padding_sequence(val_vectorized, MAX_INPUT_LENGTH)

In [89]:
print(train_padded.head())

                                              inputs  \
0  [1, 12, 2648, 4626, 0, 1598, 3, 0, 23, 46, 483...   
1  [1, 6, 0, 3057, 12, 35, 59, 4, 2940, 8, 2826, ...   
2  [1, 1382, 1177, 188, 10, 13, 2826, 70, 348, 36...   
3  [1, 233, 40, 14, 8, 133, 3, 138, 2, 0, 0, 0, 0...   
4  [1, 43, 24, 16, 459, 38, 0, 1196, 5, 2041, 7, ...   

                                             targets  
0  [2, 13, 4454, 0, 1747, 1, 4, 69, 41, 14, 12, 3...  
1  [2, 63, 53, 13, 207, 12, 418, 1, 8, 120, 153, ...  
2  [2, 13, 53, 45, 201, 26, 2870, 5652, 4669, 6, ...  
3  [2, 86, 0, 39, 13, 53, 8, 2118, 1, 292, 137, 3...  
4  [2, 45, 1186, 342, 0, 1127, 29, 4379, 7, 25, 6...  


In [207]:
def save_list(list,path):
    name = [name for name, val in globals().items() if val is list][0]
    file_name = f"{name}.csv"
    file_path = os.path.join(path, file_name)
    with open(file_path, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(list)

In [90]:
save_df(train_padded,CLEAN_DATA_PATH)
save_df(val_padded, CLEAN_DATA_PATH)