# Neural Machine Translation Example

In [1]:
# Install TensorFlow and also our package via PyPI
!pip install tensorflow-gpu==2.0.0
!pip install headliner

Collecting tensorflow-gpu==2.0.0
[?25l  Downloading https://files.pythonhosted.org/packages/25/44/47f0722aea081697143fbcf5d2aa60d1aee4aaacb5869aee2b568974777b/tensorflow_gpu-2.0.0-cp36-cp36m-manylinux2010_x86_64.whl (380.8MB)
[K     |████████████████████████████████| 380.8MB 44kB/s 
Collecting tensorboard<2.1.0,>=2.0.0
[?25l  Downloading https://files.pythonhosted.org/packages/76/54/99b9d5d52d5cb732f099baaaf7740403e83fe6b0cedde940fabd2b13d75a/tensorboard-2.0.2-py3-none-any.whl (3.8MB)
[K     |████████████████████████████████| 3.8MB 38.6MB/s 
Collecting tensorflow-estimator<2.1.0,>=2.0.0
[?25l  Downloading https://files.pythonhosted.org/packages/fc/08/8b927337b7019c374719145d1dceba21a8bb909b93b1ad6f8fb7d22c1ca1/tensorflow_estimator-2.0.1-py2.py3-none-any.whl (449kB)
[K     |████████████████████████████████| 450kB 59.7MB/s 
Collecting google-auth<2,>=1.6.3
[?25l  Downloading https://files.pythonhosted.org/packages/1c/6d/7aae38a9022f982cf8167775c7fc299f203417b698c27080ce09060bba07/

Collecting headliner
[?25l  Downloading https://files.pythonhosted.org/packages/74/e4/386e9f58b8464261d4e220abaebe66da2426d55b6ea4186ec2cb828195ef/headliner-1.0.2-py3-none-any.whl (65kB)
[K     |█████                           | 10kB 29.6MB/s eta 0:00:01[K     |██████████                      | 20kB 3.0MB/s eta 0:00:01[K     |███████████████                 | 30kB 4.4MB/s eta 0:00:01[K     |████████████████████            | 40kB 2.9MB/s eta 0:00:01[K     |████████████████████████▉       | 51kB 3.5MB/s eta 0:00:01[K     |█████████████████████████████▉  | 61kB 4.2MB/s eta 0:00:01[K     |████████████████████████████████| 71kB 3.6MB/s 
Collecting transformers>=2.2.2
[?25l  Downloading https://files.pythonhosted.org/packages/50/10/aeefced99c8a59d828a92cc11d213e2743212d3641c87c82d61b035a7d5c/transformers-2.3.0-py3-none-any.whl (447kB)
[K     |████████████████████████████████| 450kB 14.0MB/s 
[?25hCollecting spacy>=2.2.2
[?25l  Downloading https://files.pythonhosted.org/pac

In [3]:
# Download the German-English sentence pairs
!wget http://www.manythings.org/anki/deu-eng.zip
!unzip deu-eng.zip
!head deu.txt

--2020-01-24 10:05:02--  http://www.manythings.org/anki/deu-eng.zip
Resolving www.manythings.org (www.manythings.org)... 104.24.108.196, 104.24.109.196, 2606:4700:3037::6818:6cc4, ...
Connecting to www.manythings.org (www.manythings.org)|104.24.108.196|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7747747 (7.4M) [application/zip]
Saving to: ‘deu-eng.zip’


2020-01-24 10:05:03 (9.36 MB/s) - ‘deu-eng.zip’ saved [7747747/7747747]

Archive:  deu-eng.zip
  inflating: deu.txt                 
  inflating: _about.txt              
Hi.	Hallo!	CC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #380701 (cburgmer)
Hi.	Grüß Gott!	CC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #659813 (Esperantostern)
Run!	Lauf!	CC-BY 2.0 (France) Attribution: tatoeba.org #906328 (papabear) & #941078 (Fingerhut)
Wow!	Potzdonner!	CC-BY 2.0 (France) Attribution: tatoeba.org #52027 (Zifre) & #2122382 (Pfirsichbaeumchen)
Wow!	Donnerwetter!	CC-BY 2.0 (France) Attribution: t

In [4]:
# Create the dataset but only take a subset for faster training
import io

def create_dataset(path, num_examples):
    """Read a tab-separated text file and return its columns.

    Args:
        path: Path to a UTF-8 encoded text file with one example per line,
            fields separated by tab characters.
        num_examples: Number of leading lines to keep (slice semantics, so
            ``None`` keeps every line).

    Returns:
        A ``zip`` iterator yielding one tuple per column: the first tuple
        holds the first field of every kept line, the second tuple the second
        field, and so on. Columns are truncated to the shortest line.
    """
    # Use a context manager so the file handle is closed deterministically
    # rather than being left open until garbage collection.
    with io.open(path, encoding='UTF-8') as f:
        lines = f.read().strip().split('\n')
    word_pairs = [l.split('\t') for l in lines[:num_examples]]
    return zip(*word_pairs)

# Keep only the first 30000 lines of the file. create_dataset returns the
# columns of the tab-separated file, unpacked here into three names.
eng, ger, meta = create_dataset(path='deu.txt', num_examples=30000)
# Pair the first column with the second, element by element; `meta` is not
# used in this cell.
data = [(source, target) for source, target in zip(eng, ger)]
data[:5]

[('Hi.', 'Hallo!'),
 ('Hi.', 'Grüß Gott!'),
 ('Run!', 'Lauf!'),
 ('Wow!', 'Potzdonner!'),
 ('Wow!', 'Donnerwetter!')]

In [0]:
# Split the dataset into train and test
from sklearn.model_selection import train_test_split

# Hold out 100 pairs for validation. The seed is fixed so the same split is
# produced on every run (Restart & Run All stays reproducible).
train, test = train_test_split(data, test_size=100, random_state=42)

In [0]:
# Define the model and train it
import tensorflow as tf
from headliner.trainer import Trainer
from headliner.model.attention_summarizer import AttentionSummarizer

# Build the attention model.
# NOTE(review): max_prediction_len here and max_output_len on the Trainer are
# both 10 — presumably they should stay in sync; confirm against headliner docs.
summarizer = AttentionSummarizer(
    lstm_size=1024,
    embedding_size=64,
    max_prediction_len=10,
)

# Configure the training loop. The captured log shows one loss line every
# 50 batches, matching steps_to_log.
# NOTE(review): model_save_path is presumably where headliner writes the
# trained model — confirm.
trainer = Trainer(
    batch_size=64,
    steps_per_epoch=500,
    steps_to_log=50,
    max_output_len=10,
    model_save_path='/tmp/summarizer',
)

# Train on the training pairs; the held-out pairs are passed as val_data.
trainer.train(summarizer, train, num_epochs=10, val_data=test)

training a bare model, preprocessing data to init model...
fitting tokenizers...
vocab encoder: 4675, vocab decoder: 7459
epoch 0, batch 50, logs: {'loss': 3.516734313964844}
epoch 0, batch 100, logs: {'loss': 3.1651382446289062}
epoch 0, batch 150, logs: {'loss': 2.9864306640625}
epoch 0, batch 200, logs: {'loss': 2.84840576171875}
epoch 0, batch 250, logs: {'loss': 2.73945068359375}
epoch 0, batch 300, logs: {'loss': 2.651563517252604}
epoch 0, batch 350, logs: {'loss': 2.5786356026785713}
epoch 0, batch 400, logs: {'loss': 2.5185585021972656}
epoch 0, batch 450, logs: {'loss': 2.462959255642361}
finished iterating over dataset, total batches: 467
epoch 0, batch 500, logs: {'loss': 2.407764404296875}

(input) <start> we've found it . <end> 
(target) <start> wir haben es gefunden . <end> 
(prediction) du kannst sie nicht . <end>


(input) <start> was tom caught ? <end> 
(target) <start> wurde tom erwischt ? <end> 
(prediction) hat tom tom ? <end>


(input) <start> whose baby is this ?

In [0]:
# Run the trained model on a new English sentence; the returned string is
# displayed as the cell output.
summarizer.predict('How are you?')

'wie geht es dir ? <end>'