In [2]:
import os
import pandas as pd
import csv
import numpy as np
import matplotlib.pyplot as plt

# data split
from sklearn.model_selection import train_test_split

# nlp preprocessing
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import contractions # to deal with english contractions

import tensorflow as tf
import tensorflow.keras
from tensorflow.keras.preprocessing.sequence import pad_sequences

from collections import Counter

from transformers import AutoTokenizer

[nltk_data] Error loading stopwords: <urlopen error [Errno 8] nodename
[nltk_data]     nor servname provided, or not known>
[nltk_data] Error loading wordnet: <urlopen error [Errno 8] nodename
[nltk_data]     nor servname provided, or not known>
[nltk_data] Error loading punkt: <urlopen error [Errno 8] nodename nor
[nltk_data]     servname provided, or not known>


In [3]:
EN_FILE_PATH = '../data/nl-en/europarl-v7.nl-en.en'
NL_FILE_PATH = '../data/nl-en/europarl-v7.nl-en.nl'
CLEAN_DATA_PATH = '../data/clean/'

RANDOM_SEED = 297
MAX_INPUT_LENGTH = 50

In [4]:
# load data into data frames
def load(file_path):
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            lines = file.readlines()
        lines = [line.strip() for line in lines]
        data = pd.DataFrame({'Text': lines})
        return data
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found.")
        return None
    except Exception as e:
        print(f"Error loading file '{file_path}': {e}")
        return None

In [13]:
# custom print function to check the dataframe
def print_check(df):
    for index, row in df.head().iterrows():
        print(f"{index}: {row['Text'][:1000]}")

In [14]:
en_data = load(EN_FILE_PATH)
nl_data = load(NL_FILE_PATH)
print('English Corpus')
print_check(en_data)
print('Dutch Corpus')
print_check(nl_data)

English Corpus
0: Resumption of the session
1: I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999, and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period.
2: Although, as you will have seen, the dreaded 'millennium bug' failed to materialise, still the people in a number of countries suffered a series of natural disasters that truly were dreadful.
3: You have requested a debate on this subject in the course of the next few days, during this part-session.
4: In the meantime, I should like to observe a minute' s silence, as a number of Members have requested, on behalf of all the victims concerned, particularly those of the terrible storms, in the various countries of the European Union.
Dutch Corpus
0: Hervatting van de zitting
1: Ik verklaar de zitting van het Europees Parlement, die op vrijdag 17 december werd onderbroken, te zijn hervat. Ik wens u allen een gelukkig nieuwjaar en hoop d

In [15]:
# train, val, test split
# reorder and reassign the indeces
en_train, en_test, nl_train, nl_test = train_test_split(en_data, nl_data, test_size=0.1, random_state=RANDOM_SEED)
en_test.sort_index(inplace=True, ignore_index=True)
nl_test.sort_index(inplace=True, ignore_index=True)
en_train, en_val, nl_train, nl_val = train_test_split(en_train, nl_train, test_size=0.11, random_state=RANDOM_SEED)
en_train.sort_index(inplace=True, ignore_index=True)
nl_train.sort_index(inplace=True, ignore_index=True)
en_val.sort_index(inplace=True, ignore_index=True)
nl_val.sort_index(inplace=True, ignore_index=True)

In [16]:
print_check(en_train)
print_check(nl_train)

0: I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999, and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period.
1: In the meantime, I should like to observe a minute' s silence, as a number of Members have requested, on behalf of all the victims concerned, particularly those of the terrible storms, in the various countries of the European Union.
2: Please rise, then, for this minute' s silence.
3: (The House rose and observed a minute' s silence)
4: Madam President, on a point of order.
0: Ik verklaar de zitting van het Europees Parlement, die op vrijdag 17 december werd onderbroken, te zijn hervat. Ik wens u allen een gelukkig nieuwjaar en hoop dat u een goede vakantie heeft gehad.
1: Nu wil ik graag op verzoek van een aantal collega's een minuut stilte in acht nemen ter nagedachtenis van de slachtoffers. Ik doel hiermee met name op de slachtoffers van het noodweer dat verschillende lids

In [38]:
og_train_set = pd.concat([en_train,nl_train],axis=1)
og_train_set.columns = ['inputs', 'targets']
og_val_set = pd.concat([en_val,nl_val],axis=1)
og_val_set.columns = ['inputs', 'targets']

In [40]:
# clean text

def clean_text(df):
    cleaned_inputs = []
    cleaned_targets = []
        
    for index, row in df.iterrows():
        input_text = row['inputs']
        target_text = row['targets']
        
        input_text = re.sub(r'<.*?>', '', input_text) #remove html tag
        input_text = re.sub(r'http\S+', '', input_text) #remove url
        input_text = re.sub(r'\[.*?\]', '', input_text) #remove everything in between square brackets
        input_text = re.sub(r'\(.*?\)', '', input_text) #remove everything in between parentheses
        input_text = re.sub(r'[A_Z]\d-\d{4}\/\d{4}', '', input_text) #deleting more weird stuff
        # input_text = contractions.fix(input_text) #extend english contractions
        input_text = re.sub(r'[^A-Za-z0-9\s\'s]', '', input_text) #keep only alphabets, numbers, and space, maybe add in' as well for possessive
        input_text = input_text.lower() #covert to lower case
        cleaned_inputs.append(input_text)
        
        target_text = re.sub(r'<.*?>', '', target_text) #remove html tag
        target_text = re.sub(r'http\S+', '', target_text) #remove url
        target_text = re.sub(r'\[.*?\]', '', target_text) #remove everything in between square brackets
        target_text = re.sub(r'\(.*?\)', '', target_text) #remove everything in between parentheses
        target_text = re.sub(r'[A_Z]\d-\d{4}\/\d{4}', '', target_text) #deleting more weird stuff
        target_text = re.sub(r'[^A-Za-z0-9\s\'s]', '', target_text) #keep only alphabets, numbers, and space, maybe add in' as well for possessive
        target_text = target_text.lower() #covert to lower case
        cleaned_targets.append(target_text)
        
    #remove empty
    cleaned_df = pd.DataFrame({'inputs': cleaned_inputs, 'targets': cleaned_targets})
    cleaned_df = cleaned_df[(cleaned_df['inputs'].str.strip() != '') | (cleaned_df['targets'].str.strip() != '')]
    return cleaned_df

In [41]:
# save df
def save_df(df, path):
    df_name = [name for name, val in globals().items() if val is df][0]
    file_name = f"{df_name}.csv"
    file_path = os.path.join(path, file_name)
    df.to_csv(file_path, index=False)

In [42]:
# clean text and check
train_cleaned = clean_text(og_train_set)
val_cleaned = clean_text(og_val_set)

In [257]:
save_df(en_train, CLEAN_DATA_PATH)
save_df(nl_train, CLEAN_DATA_PATH)
save_df(en_val, CLEAN_DATA_PATH)
save_df(nl_val, CLEAN_DATA_PATH)
save_df(en_test, CLEAN_DATA_PATH)
save_df(nl_test, CLEAN_DATA_PATH)

In [44]:
save_df(og_train_set, CLEAN_DATA_PATH)
save_df(og_val_set, CLEAN_DATA_PATH)

In [45]:
save_df(train_cleaned, CLEAN_DATA_PATH)
save_df(val_cleaned, CLEAN_DATA_PATH)

In [196]:
# tokenize and add in special tokens
def tokenize_text(df):
    tk_text = []
    for index, row in df.iterrows():
        text = row['Text']
        tokens = word_tokenize(text)
        tokens = ['<bos>'] + tokens + ['<eos>']
        tk_text.append(tokens)
    return tk_text


In [197]:
en_train_tk = tokenize_text(df_en_train_clean)
nl_train_tk = tokenize_text(df_nl_train_clean)
en_val_tk = tokenize_text(df_en_val_clean)
nl_val_tk = tokenize_text(df_nl_val_clean)
en_test_tk = tokenize_text(df_en_test_clean)
nl_test_tk = tokenize_text(df_nl_test_clean)

In [198]:
max_sentence_length = max(len(sentence) for sequence in en_train_tk for sentence in sequence)
print(max_sentence_length)

41


In [200]:
# vocabulary
def word_index(text):
    wordcount = Counter()
    for sentence in text:
        wordcount.update(sentence)
    
    vocab = [word for word, count in wordcount.most_common()]
    word_to_index = {word: idx for idx, word in enumerate(vocab)}
    index_to_word = {idx: word for idx, word in enumerate(vocab)}
    
    return word_to_index, index_to_word

In [201]:
en_word2idx, en_idx2word = word_index(en_train_tk)
nl_word2idx, nl_idx2word = word_index(nl_train_tk)

In [202]:
print(en_word2idx)



In [203]:
# convert text to vec
def convert_text(text, word2idx):
    text_vec = []
    for sentence in text:
        vec = [word2idx.get(word, 0) for word in sentence] #unknown vocab -> 0
        text_vec.append(vec)
    return text_vec

In [204]:
# pad the sentences
def padding_sequence(sequence, max_len, padding_value = 0):
    padded = tf.keras.preprocessing.sequence.pad_sequences (
        sequence, maxlen = max_len, 
        padding = 'post', truncating='post',
        value = padding_value
    )
    return padded

In [205]:
en_train_seq = convert_text(en_train_tk, en_word2idx)
nl_train_seq = convert_text(nl_train_tk, nl_word2idx)
en_val_seq = convert_text(en_val_tk, en_word2idx)
nl_val_seq = convert_text(nl_val_tk, nl_word2idx)
en_test_seq = convert_text(en_test_tk, en_word2idx)
nl_test_seq = convert_text(nl_test_tk, nl_word2idx)

In [206]:
en_train_padded = padding_sequence(en_train_seq, MAX_INPUT_LENGTH)
nl_train_padded = padding_sequence(nl_train_seq, MAX_INPUT_LENGTH)
en_val_padded = padding_sequence(en_val_seq, MAX_INPUT_LENGTH)
nl_val_padded = padding_sequence(nl_val_seq, MAX_INPUT_LENGTH)
en_test_padded = padding_sequence(en_test_seq, MAX_INPUT_LENGTH)
nl_test_padded = padding_sequence(nl_test_seq, MAX_INPUT_LENGTH)

In [207]:
def save_list(list,path):
    name = [name for name, val in globals().items() if val is list][0]
    file_name = f"{name}.csv"
    file_path = os.path.join(path, file_name)
    with open(file_path, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(list)

In [208]:
save_list(en_train_padded, CLEAN_DATA_PATH)
save_list(nl_train_padded, CLEAN_DATA_PATH)
save_list(en_val_padded, CLEAN_DATA_PATH)
save_list(nl_val_padded, CLEAN_DATA_PATH)
save_list(en_test_padded, CLEAN_DATA_PATH)
save_list(nl_test_padded, CLEAN_DATA_PATH)

In [33]:
def crafting_dataset(input_dataset,target_dataset):
    dataset = tf.data.Dataset.from_tensor_slices((input_dataset, target_dataset))
    return dataset

In [34]:
train_dataset = crafting_dataset(en_train,nl_train)
val_dataset = crafting_dataset(en_val,nl_val)

In [20]:
def check_shape(data_list):
    num_rows = len(data_list)
    if num_rows > 0:
        num_columns = len(data_list[0])  # Assuming all rows have the same length
        return num_rows, num_columns
    else:
        return 0, 0  # Empty list


In [28]:
rows, columns = check_shape(en_train_clean)
print("Number of rows:", rows)
print("Number of columns:", columns)
rows, columns = check_shape(nl_train_clean)
print("Number of rows:", rows)
print("Number of columns:", columns)

Number of rows: 1580310
Number of columns: 207
Number of rows: 1589806
Number of columns: 196


In [39]:
print(og_train_set.shape)
print(og_val_set.shape)

(1600217, 2)
(197780, 2)


In [43]:
print(train_cleaned.shape)
print(val_cleaned.shape)

(1590533, 2)
(196597, 2)


In [244]:
def convert_df(data_list):
    data_arr = [np.array(row) for row in data_list]
    df_list = pd.DataFrame(data_arr)
    return df_list


In [221]:
df_en_train_padded = convert_df(en_train_padded)
df_nl_train_padded = convert_df(nl_train_padded)
df_en_val_padded = convert_df(en_val_padded)
df_nl_val_padded = convert_df(nl_val_padded)