In [None]:
import pandas as pd
import re
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, RepeatVector
import matplotlib.pyplot as plt
from keras import optimizers
from keras.callbacks import ModelCheckpoint
from keras.models import load_model
import random
from keras.models import Sequential

Load in the data

In [None]:
def load_data(filepath):
    """Read a UTF-8 text file and return its lines.

    Args:
        filepath: Path of the file to read.

    Returns:
        A list with one string per line, line terminators removed.
    """
    with open(filepath, mode='r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents.splitlines()

Clean the data

In [None]:
def clean_data(text):
    """Normalise a sentence before tokenisation.

    Forward slashes become spaces; every character that is not an ASCII
    letter, a digit, a German umlaut/eszett or whitespace is removed; runs of
    whitespace (including newlines) collapse to a single space; surrounding
    whitespace is trimmed; the result is lower-cased.

    Args:
        text: Raw sentence string.

    Returns:
        The cleaned, lower-cased string.
    """
    text = text.replace('/', ' ')  # keep slash-separated words apart ("and/or" -> "and or")
    # Drop punctuation and symbols first. Whitespace is kept here so it can be
    # normalised in the next step.
    text = re.sub(r'[^a-zA-Z0-9äöüÄÖÜß\s]', '', text)
    # Collapse whitespace only AFTER characters were removed: deleting a
    # free-standing symbol ("a - b") would otherwise leave a double space.
    text = re.sub(r'\s+', ' ', text).strip()
    return text.lower()

Split the data into 60% training, 20% validation, and 20% testing data.

In [None]:
def split_data(df):
    """Split `df` into 60% training, 20% test and 20% validation data.

    Both splits use random_state=42.

    Args:
        df: Collection accepted by sklearn's train_test_split.

    Returns:
        A tuple (train, test, validation) -- note that test comes before
        validation.
    """
    # First cut: hold out 40% of the samples; the other 60% is the training set.
    train_part, holdout = train_test_split(df, test_size=0.4, random_state=42)
    # Second cut: halve the hold-out set, i.e. 20% of the full data each.
    test_part, val_part = train_test_split(holdout, test_size=0.5, random_state=42)

    return train_part, test_part, val_part


Read in the data

In [None]:
# NOTE: machine-specific absolute path -- point DATA_PATH at your local copy of deu.txt.
DATA_PATH = r"C:\Users\imaxo\Desktop\RUG FSE\Natural Language Processing\Project\deu.txt"
loaded_data = load_data(DATA_PATH)

en_data = []
de_data = []

# Each row is tab-separated: English sentence, German sentence, then a third
# field that is discarded. Only the first two fields are used.
for line_number, row in enumerate(loaded_data, start=1):
    if not row.strip():
        continue  # a blank line carries no sentence pair
    fields = row.split('\t')
    if len(fields) < 2:
        # Fail loudly and say where, instead of an opaque unpacking error.
        raise ValueError(
            f"Line {line_number}: expected tab-separated English/German fields, got {row!r}"
        )
    en_data.append(fields[0])
    de_data.append(fields[1])

print(en_data[0:10])
print(de_data[0:10])


['Go.', 'Hi.', 'Hi.', 'Run!', 'Run.', 'Wow!', 'Wow!', 'Fire!', 'Help!', 'Help!']
['Geh.', 'Hallo!', 'Grüß Gott!', 'Lauf!', 'Lauf!', 'Potzdonner!', 'Donnerwetter!', 'Feuer!', 'Hilfe!', 'Zu Hülf!']


Call the cleaning function to clean the data

In [None]:
# Run every sentence through clean_data, one language at a time.
en_cleaned = list(map(clean_data, en_data))
de_cleaned = list(map(clean_data, de_data))

# Preview the first ten cleaned sentences of each language ...
print(en_cleaned[:10])
print(de_cleaned[:10])
# ... and show the number of sentences in each list.
print(len(en_cleaned))
print(len(de_cleaned))

['go', 'hi', 'hi', 'run', 'run', 'wow', 'wow', 'fire', 'help', 'help']
['geh', 'hallo', 'grüß gott', 'lauf', 'lauf', 'potzdonner', 'donnerwetter', 'feuer', 'hilfe', 'zu hülf']
227080
227080


Combine the separate English and German lists into a single list of (English, German) sentence pairs.

In [None]:
# Pair each cleaned English sentence with the German sentence at the same position.
data = [pair for pair in zip(en_cleaned, de_cleaned)]
print(len(data))

227080


Initially we wanted to use a subset, but after switching datasets we use the entire dataset, so we take 100% of the data as the 'sample' (which simply shuffles it). Additionally, we call the split function to split up the data.

In [None]:
def get_data_subset(data, fraction, seed=None):
    """Return a random sample holding `fraction` of the items in `data`.

    Args:
        data: Sequence to sample from; it is not modified.
        fraction: Share of items to keep. The sample size is
            int(len(data) * fraction), so a fraction of 1 returns every item
            in shuffled order.
        seed: Optional seed. When given, a private random.Random(seed) is
            used, so the result is reproducible and the global RNG state is
            left untouched. When None (the default) the module-level RNG is
            used.

    Returns:
        A new list with the sampled items.

    Raises:
        ValueError: If the computed sample size is negative or larger than
            len(data) (raised by random.sample).
    """
    sample_size = int(len(data) * fraction)
    rng = random if seed is None else random.Random(seed)
    return rng.sample(data, sample_size)

# Seed Python's RNG so the shuffle below -- and therefore which pairs end up in
# train/test/validation -- is identical on every Restart & Run All.
random.seed(42)

data_reduced = get_data_subset(data, 1)  # fraction 1: keep all pairs, in shuffled order
print(data_reduced[12])
print(len(data_reduced))
data_train, data_test, data_val = split_data(data_reduced)

('you have to get enough sleep', 'du musst genug schlaf bekommen')
227080


In [None]:
# Report how many pairs landed in each partition (train, test, validation).
split_sizes = [len(part) for part in (data_train, data_test, data_val)]
print(*split_sizes)

136248 45416 45416


Convert lists to dataframes

In [None]:
# Wrap each split in its own DataFrame (columns get pandas' default integer labels).
df_train, df_test, df_val = (
    pd.DataFrame(split) for split in (data_train, data_test, data_val)
)

Convert the dataframes to csv files

In [None]:
# Write each split to a CSV file in the current working directory;
# index=False leaves the row index out of the file.
for frame, csv_name in (
    (df_train, 'df_train.csv'),
    (df_test, 'df_test.csv'),
    (df_val, 'df_val.csv'),
):
    frame.to_csv(csv_name, index=False)

In [None]:
# Read back the training split that was just written. This uses the same
# relative path to_csv() wrote to, so the cell loads the file this run
# produced and does not depend on one machine's directory layout.
train = pd.read_csv('df_train.csv')


Check that the data loaded correctly

In [None]:
# Show the first five rows as a quick sanity check of the reloaded file.
train.head(n=5)

Unnamed: 0,0,1
0,why didnt i listen to you,warum habe ich dir nicht zugehört
1,ancient coins were found inside the mysterious...,in dem rätselhaften grab wurden antike münzen ...
2,he has a nice income,er hat ein gutes einkommen
3,tom is wearing a widebrimmed hat,tom trägt einen breitkrempigen hut
4,why not take a few days off,warum nehmen sie sich nicht ein paar tage frei
