In [1]:
  !ls -lrt
#download data
print("Downloading Dataset:")
!wget --quiet http://www.manythings.org/anki/deu-eng.zip
!unzip deu-eng.zip

total 4
drwxr-xr-x 1 root root 4096 Oct  5 16:31 sample_data
Downloading Dataset:
Archive:  deu-eng.zip
  inflating: deu.txt                 
  inflating: _about.txt              


In [2]:
ls -lrt

total 41828
-rw-r--r-- 1 root root  8324970 Aug 23 14:57 deu-eng.zip
-rw-r--r-- 1 root root 34494242 Aug 23 23:57 deu.txt
-rw-r--r-- 1 root root     1441 Aug 23 23:57 _about.txt
drwxr-xr-x 1 root root     4096 Oct  5 16:31 [0m[01;34msample_data[0m/


In [3]:
import csv
import string
import re
from typing import List, Tuple
from pickle import dump
from unicodedata import normalize
import numpy as np
import itertools
from pickle import load
from tensorflow.keras.utils import to_categorical
from keras.utils.vis_utils import plot_model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Embedding
from pickle import load
import random
import tensorflow as tf
from keras.models import load_model
from nltk.translate.bleu_score import corpus_bleu
from sklearn.model_selection import train_test_split
import tensorflow_addons as tfa

In [4]:
# Start of sentence
SOS = "<start>"
# End of sentence
EOS = "<end>"
# Relavent Punctuation
PUNCTUATION = set("?,!,.")

In [5]:
def load_dataset(filename: str) -> str:
  """
  load dataset into memory
  """
  with open(filename, mode="rt", encoding="utf-8") as fp:
    return fp.read()

In [6]:
def to_pairs(dataset: str, limit: int=None, shuffle=False) -> List[Tuple[str, str]]:
  """
  Split dataset into pair of sentences
  :param dataset: dataset containing examples of translations between two languages
  the examples are delimited by `\n` and the contents of the lines are
  delimited by `\t`
  :param limit: number that limit dataset size (optional)
  :param shuffle: default is True
  :return: list of pairs
  """
  assert isinstance(limit, (int, type(None))), TypeError(
      "The limit value must be an integer"
  )
  lines = dataset.strip().split('\n')
  # Random Dataset
  if shuffle is True:
    random.shuffle(lines)
  number_examples = limit or len(lines)
  pairs = []
  for line in lines[: abs(number_examples)]:
    # take only source and target
    src, trg, _ = line.split("\t")
    pairs.append((src, trg))

  # dataset size check
  assert len(pairs) == number_examples
  return pairs

In [7]:
def separe_punctuation(token: str) -> str:
  """
  Separe punctuation if they exist
  """
  if not set(token).intersection(PUNCTUATION):
    return token
  for p in PUNCTUATION:
    token = f" {p} ".join(token.split(p))
  return " ".join(token.split(p))    

In [8]:
def preprocess(sentence: str, add_start_end: bool=True) -> str:
  """
  - convert lowercase
  - remove numbers
  - remove speacial characters
  - separe punctuation
  - add <start> and <end> of sentence
  :param add_start_end: add SOS and EOS
  """
  re_print = re.compile(f"[^{re.escape(string.printable)}]")
  # convert lowercase and normalizing unicode characters
  sentence = (
      normalize("NFD", sentence.lower()).encode("ascii", "ignore").decode("UTF-8")
  )
  cleaned_tokens = []
  # tokenize sentence on white space
  for token in sentence.split():
    # remove non printable characters from each token
    token = re_print.sub("", token).strip()
    # ignore token with numbers
    if re.findall("[0-9]", token):
      continue
    # add space between words and punctuation
    token = separe_punctuation(token)
    cleaned_tokens.append(token)

  # rebuild sentence with space between tokens
  sentence = " ".join(cleaned_tokens)

  # add a start and end token to the sentence
  if add_start_end is True:
    sentence = f"{SOS} {sentence} {EOS}"
  return sentence      

In [9]:
def dataset_preprocess(dataset: List[Tuple[str, str]]) -> Tuple[List[str], List[str]]:
  """
  Returns processed dataset

  :param dataset: list of sentence pairs
  :return: list of prallel data
  """
  source_cleaned = []
  target_cleaned = []
  for source, target in dataset:
    source_cleaned.append(preprocess(source))
    target_cleaned.append(preprocess(target))
  return source_cleaned, target_cleaned  


In [10]:
NUM_EXAMPLES = 10000 # Limit dataset size
filename = "deu.txt"
dataset = load_dataset(filename)
# get pairs limited to 1000 pairs
pairs = to_pairs(dataset, limit=NUM_EXAMPLES)
print(f"Dataset size: {len(pairs)}")
raw_data_en, raw_data_ge = dataset_preprocess(pairs)

# show last 5 pairs
for pair in zip(raw_data_en[-5:],raw_data_ge[-5:]):
    print(pair)

Dataset size: 10000
('<start> tom was crying    <end>', '<start> tom flennte    <end>')
('<start> tom was eating    <end>', '<start> tom hat gegessen    <end>')
('<start> tom was famous    <end>', '<start> tom war beruhmt    <end>')
('<start> tom was framed    <end>', '<start> tom wurde reingelegt    <end>')
('<start> tom was fuming    <end>', '<start> tom war wutend    <end>')


In [11]:
en_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
en_tokenizer.fit_on_texts(raw_data_en)

data_en = en_tokenizer.texts_to_sequences(raw_data_en)
data_en = tf.keras.preprocessing.sequence.pad_sequences(data_en, padding='post')

ge_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
ge_tokenizer.fit_on_texts(raw_data_ge)

data_ge = ge_tokenizer.texts_to_sequences(raw_data_ge)
data_ge = tf.keras.preprocessing.sequence.pad_sequences(data_ge, padding='post')

In [12]:
def max_len(tensor):
    #print( np.argmax([len(t) for t in tensor]))
    return max( len(t) for t in tensor)

In [12]:
X_train, X_test, Y_train, Y_test = train_test_split(data_en, data_ge, test_size=0.2)
BATCH_SIZE = 64
BUFFER_SIZE = len(X_train)
steps_per_epoch = BUFFER_SIZE//BATCH_SIZE
embedding_dims = 256
rnn_units = 1024
dense_units = 1024
Dtype = tf.