# Data Preparation

## 1. Clean Text

The function below loads the whole file into memory as a single string of text.

In [2]:
# load document into memory
def load_d(fname):
    """Read a UTF-8 text file and return its entire contents as one string.

    Args:
        fname: path of the file to read.

    Returns:
        The full text of the file.

    Raises:
        OSError: if the file cannot be opened.
        UnicodeDecodeError: if the contents are not valid UTF-8.
    """
    # the context manager closes the handle even when read() raises
    with open(fname, mode='rt', encoding='utf-8') as file:
        return file.read()

English and German phrases are separated by a tab character.

We'll split the text into lines, and then split each line into its phrases on the tab character.

In [10]:
# split into sentences
def to_pairs(d):
    """Break the raw corpus text into a list of phrase lists.

    Leading and trailing whitespace is stripped from the whole text, the
    remainder is cut at every newline, and each resulting line is cut at
    every tab character.

    Args:
        d: the document text as a single string.

    Returns:
        A list with one entry per line; each entry is the list of
        tab-separated fields found on that line.
    """
    result = []
    for row in d.strip().split('\n'):
        result.append(row.split('\t'))
    return result

Cleaning operations:

1. Remove all non-printable characters.
2. Remove all punctuation characters.
3. Normalize all Unicode characters to ASCII.
4. Convert to lowercase.
5. Remove any remaining tokens that are not alphabetic.

In [5]:
# clean list of lines
def clean_pairs(lines):
    """Normalize every phrase of every pair and return them as a numpy array.

    Each phrase is NFD-decomposed and encoded to ASCII with errors ignored,
    so any character that has no ASCII form after decomposition (for example
    'ß') is dropped.  The text is then split on whitespace, and every token
    is lower-cased, stripped of punctuation and non-printable characters, and
    kept only if it is purely alphabetic.  Surviving tokens are re-joined
    with single spaces.

    Args:
        lines: iterable of pairs, each pair an iterable of phrase strings.

    Returns:
        The numpy ``array`` built from the list of cleaned pairs.
    """
    # matches any character that is not in string.printable
    non_printable = re.compile('[^%s]' % re.escape(string.printable))
    # translation table that deletes every character in string.punctuation
    strip_punct = str.maketrans('', '', string.punctuation)

    def scrub(phrase):
        # decompose accented characters, then discard whatever is not ASCII
        ascii_bytes = normalize('NFD', phrase).encode('ascii', 'ignore')
        kept = []
        for token in ascii_bytes.decode('UTF-8').split():
            token = token.lower().translate(strip_punct)
            token = non_printable.sub('', token)
            # drops empty tokens and tokens containing digits
            if token.isalpha():
                kept.append(token)
        return ' '.join(kept)

    return array([[scrub(phrase) for phrase in pair] for pair in lines])
    

Save the cleaned data to file

In [7]:
# Importing required libraries
import string
import re
from pickle import dump
from unicodedata import normalize
from numpy import array

In [8]:
# save a list of clean sentences to file
def save_clean_data(sentences, fname):
    """Pickle `sentences` to the file `fname` and print a confirmation.

    Args:
        sentences: the object to serialize (any picklable value).
        fname: path of the output file; an existing file is overwritten.

    Raises:
        OSError: if the file cannot be opened for writing.
    """
    # the context manager flushes and closes the file even if dump() raises
    with open(fname, 'wb') as out_file:
        dump(sentences, out_file)
    print('Saved: %s' % fname)

In [9]:
# load dataset: read the whole corpus file into memory as one string
# ('deu.txt' is resolved relative to the current working directory)
filename = 'deu.txt'
doc = load_d(filename)

In [11]:
# split into english-german pairs: one list of tab-separated phrases per line
pairs = to_pairs(doc)

In [12]:
# clean sentences
# NOTE(review): this assignment rebinds the name `clean_pairs` from the
# cleaning function to the array it returns, so running this cell a second
# time without first re-running the function definition raises TypeError.
clean_pairs = clean_pairs(pairs)

In [16]:
# preview the first ten cleaned pairs
clean_pairs[0:10]

array([['hi', 'hallo'],
       ['hi', 'gru gott'],
       ['run', 'lauf'],
       ['wow', 'potzdonner'],
       ['wow', 'donnerwetter'],
       ['fire', 'feuer'],
       ['help', 'hilfe'],
       ['help', 'zu hulf'],
       ['stop', 'stopp'],
       ['wait', 'warte']],
      dtype='<U370')

In [13]:
# save clean pairs to file (pickled to 'english-german.pkl' in the working directory)
save_clean_data(clean_pairs, 'english-german.pkl')

Saved: english-german.pkl


In [14]:
# spot check: print the first 100 cleaned pairs as [column 0] => [column 1]
# (the indexing requires clean_pairs to be a 2-D array with at least 100 rows)
for i in range(100):
    print('[%s] => [%s]' % (clean_pairs[i,0], clean_pairs[i,1]))

[hi] => [hallo]
[hi] => [gru gott]
[run] => [lauf]
[wow] => [potzdonner]
[wow] => [donnerwetter]
[fire] => [feuer]
[help] => [hilfe]
[help] => [zu hulf]
[stop] => [stopp]
[wait] => [warte]
[hello] => [hallo]
[i try] => [ich probiere es]
[i won] => [ich hab gewonnen]
[i won] => [ich habe gewonnen]
[smile] => [lacheln]
[cheers] => [zum wohl]
[freeze] => [keine bewegung]
[freeze] => [stehenbleiben]
[got it] => [verstanden]
[got it] => [einverstanden]
[he ran] => [er rannte]
[he ran] => [er lief]
[hop in] => [mach mit]
[hug me] => [druck mich]
[hug me] => [nimm mich in den arm]
[hug me] => [umarme mich]
[i fell] => [ich fiel]
[i fell] => [ich fiel hin]
[i fell] => [ich sturzte]
[i fell] => [ich bin hingefallen]
[i fell] => [ich bin gesturzt]
[i know] => [ich wei]
[i lied] => [ich habe gelogen]
[i lost] => [ich habe verloren]
[im] => [ich bin jahre alt]
[im] => [ich bin]
[im ok] => [mir gehts gut]
[im ok] => [es geht mir gut]
[no way] => [unmoglich]
[no way] => [das gibts doch nicht]
[no wa