In [3]:
# Use the %pip magic so the package lands in the running kernel's environment.
# Pinned to the release shown in this cell's recorded output; the notebook
# imports keras.preprocessing.text.Tokenizer, which newer Keras no longer ships.
%pip install keras==2.4.3


Collecting keras
  Downloading Keras-2.4.3-py2.py3-none-any.whl (36 kB)

Collecting h5py
  Downloading h5py-3.1.0-cp38-cp38-win_amd64.whl (2.7 MB)
Installing collected packages: h5py, keras
Successfully installed h5py-3.1.0 keras-2.4.3


In [6]:
# %pip (rather than a bare `pip`) installs into the kernel's own environment
# and does not depend on IPython's automagic setting.
# NOTE(review): no TensorFlow version is recorded in the output - pin one that
# is compatible with the installed Keras release once confirmed.
%pip install tensorflow

Note: you may need to restart the kernel to use updated packages.


In [1]:
import os, sys
from keras.models import Model
from keras.layers import Input, LSTM, GRU, Dense, Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# --- Training hyperparameters -------------------------------------------
BATCH_SIZE = 64            # batch_size passed to model.fit
EPOCHS = 5                 # epochs passed to model.fit
LSTM_NODES = 256           # LSTM units; also the decoder embedding width

# --- Data limits ----------------------------------------------------------
NUM_SENTENCES = 20_000     # maximum number of corpus lines to read
MAX_SENTENCE_LENGTH = 50   # NOTE(review): not referenced by the visible cells
MAX_NUM_WORDS = 20_000     # num_words cap given to both Tokenizers
EMBEDDING_SIZE = 100       # width of each row of the encoder embedding matrix

In [3]:
# Three parallel lists, one entry per sentence pair:
#   input_sentences          - source sentence (first column)
#   output_sentences         - target sentence followed by ' <eos>' (decoder target)
#   output_sentences_inputs  - target sentence preceded by '<sos> ' (decoder input)
input_sentences = []
output_sentences = []
output_sentences_inputs = []

count = 0
# The context manager closes the corpus even if a line fails to parse.
with open(r'deu.txt', encoding="utf-8") as corpus_file:
    # `count` numbers every physical line (skipped ones included), so at most
    # NUM_SENTENCES lines of the file are examined.
    for count, line in enumerate(corpus_file, start=1):
        if count > NUM_SENTENCES:
            break

        fields = line.rstrip().split('\t')

        # A usable line needs at least a source and a target column; any
        # further columns are ignored instead of breaking the unpacking.
        if len(fields) < 2:
            continue

        input_sentence, output = fields[0], fields[1]

        output_sentence = output + ' <eos>'
        output_sentence_input = '<sos> ' + output

        input_sentences.append(input_sentence)
        output_sentences.append(output_sentence)
        output_sentences_inputs.append(output_sentence_input)

print("num samples input:", len(input_sentences))
print("num samples output:", len(output_sentences))
print("num samples output input:", len(output_sentences_inputs))

num samples input: 20000
num samples output: 20000
num samples output input: 20000


In [4]:
# Show the same sample from each of the three parallel sentence lists.
sample_index = 172
for sentences in (input_sentences, output_sentences, output_sentences_inputs):
    print(sentences[sample_index])

Go away.
Mach ’ne Fliege! <eos>
<sos> Mach ’ne Fliege!


In [5]:
# Build the source-side vocabulary and encode every input sentence as a list
# of integer word indices.
input_tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
input_tokenizer.fit_on_texts(input_sentences)
input_integer_seq = input_tokenizer.texts_to_sequences(input_sentences)

# word -> integer index mapping learned by the tokenizer
word2idx_inputs = input_tokenizer.word_index
print('Total unique words in the input: %s' % len(word2idx_inputs))

# Length of the longest encoded sentence; used later as the padded length.
max_input_len = max(map(len, input_integer_seq))
print("Length of longest sentence in input: %g" % max_input_len)

Total unique words in the input: 3688
Length of longest sentence in input: 6


In [6]:
# filters='' disables character filtering, so the <sos>/<eos> markers and
# punctuation attached to words remain part of the tokens.
output_tokenizer = Tokenizer(num_words=MAX_NUM_WORDS, filters='')

# Fit on both target variants so that '<sos>' and '<eos>' both get an index.
output_tokenizer.fit_on_texts(output_sentences + output_sentences_inputs)
output_integer_seq = output_tokenizer.texts_to_sequences(output_sentences)
output_input_integer_seq = output_tokenizer.texts_to_sequences(output_sentences_inputs)

# word -> integer index mapping for the target language
word2idx_outputs = output_tokenizer.word_index
print('Total unique words in the output: %s' % len(word2idx_outputs))

# One extra slot on top of the vocabulary size (index 0 is used for padding).
num_words_output = len(word2idx_outputs) + 1
max_out_len = max(map(len, output_integer_seq))
print("Length of longest sentence in the output: %g" % max_out_len)

Total unique words in the output: 8213
Length of longest sentence in the output: 11


In [7]:
# Pad every encoded input sentence to the length of the longest one.
# No `padding` argument is given, so pad_sequences' default side is used.
encoder_input_sequences = pad_sequences(
    input_integer_seq,
    maxlen=max_input_len,
)
print("encoder_input_sequences.shape:", encoder_input_sequences.shape)
print("encoder_input_sequences[172]:", encoder_input_sequences[172])

encoder_input_sequences.shape: (20000, 6)
encoder_input_sequences[172]: [ 0  0  0  0 20 93]


In [8]:
# Look up the integer ids assigned to two sample source words.
for sample_word in ("i'm", "walk"):
    print(word2idx_inputs[sample_word])

7
196


In [9]:
# padding='post' appends the zeros after the tokens, so '<sos>' stays at
# position 0 of every decoder input row.
decoder_input_sequences = pad_sequences(
    output_input_integer_seq,
    maxlen=max_out_len,
    padding='post',
)
print("decoder_input_sequences.shape:", decoder_input_sequences.shape)
print("decoder_input_sequences[172]:", decoder_input_sequences[172])

decoder_input_sequences.shape: (20000, 11)
decoder_input_sequences[172]: [   2   71 1346 1347    0    0    0    0    0    0    0]


In [10]:
# Look up the integer ids assigned to a few target-side tokens.
for sample_token in ("<sos>", "ich", "hörte", "auf."):
    print(word2idx_outputs[sample_token])

2
3
597
66


In [11]:
from numpy import array
from numpy import asarray
from numpy import zeros

# word -> float32 vector, read from the pre-trained GloVe text file
embeddings_dictionary = dict()

# Each line is parsed as a word followed by its vector components.
# The context manager releases the handle even if a line fails to parse.
with open(r'glove.6B.100d.txt', encoding="utf8") as glove_file:
    for line in glove_file:
        records = line.split()
        if not records:
            # Blank line: nothing to index (records[0] would raise).
            continue
        word = records[0]
        vector_dimensions = asarray(records[1:], dtype='float32')
        embeddings_dictionary[word] = vector_dimensions

In [12]:
# Number of rows in the encoder embedding matrix: vocabulary size plus one
# (row 0 stays all-zero), capped at MAX_NUM_WORDS.
num_words = min(MAX_NUM_WORDS, len(word2idx_inputs) + 1)
embedding_matrix = zeros((num_words, EMBEDDING_SIZE))
for word, index in word2idx_inputs.items():
    # word_index is not truncated by the tokenizer's num_words cap, so indices
    # beyond the matrix must be skipped to avoid an IndexError.
    if index >= num_words:
        continue
    embedding_vector = embeddings_dictionary.get(word)
    # Words missing from the pre-trained vectors keep their all-zero row.
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

In [13]:
# Inspect the pre-trained vector stored for one word.
walk_vector = embeddings_dictionary["walk"]
print(walk_vector)

[ 0.20725    0.84365   -0.076531   0.0071135 -0.13323    0.7837
 -0.491      0.94298   -0.2167    -0.74668    0.25125    0.62035
  0.49758   -0.027116   0.27161    0.092964   0.38282    0.99705
 -0.79015   -0.27931    0.83799    0.59941   -0.20857   -0.40869
  0.46501    0.47012   -1.1325    -0.57505    0.87657   -0.0051153
 -0.64612    0.21349    0.62074    0.31057    0.60625    0.64406
 -0.32349    0.1056     0.39087    0.46077    0.18394    0.10649
 -0.17585   -0.31971   -0.19235   -0.16422   -0.12337   -0.20295
  0.26234   -0.072558   0.030499  -0.30478    0.068655   0.5949
  0.25831   -2.111     -0.044159   0.66168    1.501      0.30353
 -0.24889    0.92395    0.010073  -0.24858    0.28253    0.26781
  0.82791   -0.34017   -0.2274    -0.32975   -0.52374    0.0063988
 -0.17637   -1.1112     0.45905    0.24236   -0.25737   -0.24206
  0.49061   -0.36632   -0.30224    0.41096   -0.46267    0.010152
 -0.74526   -0.016159   0.22009    0.015054   0.27948    0.27322
  0.26239   -0.57978  

In [14]:
# Inspect one row of the assembled embedding matrix.
sample_row = embedding_matrix[539]
print(sample_row)

[ 0.56405002  0.13749     0.73861998 -0.51165998  0.13603     0.46792001
 -0.52835    -0.12946001  0.18187    -0.67803001 -0.51889998  0.37786999
  0.76815999  1.01600003  0.37512001  0.31896001 -1.20899999  0.62366003
  0.45614001 -0.50072998 -0.30096999  0.13631     0.50472999 -0.90983999
  1.35839999  1.32650006 -0.12594    -0.23116     0.87313002  0.11791
  1.01129997 -0.22823     0.64191997  0.32475999 -0.062599    0.42541
 -0.29337999  0.44635999  0.85812002 -0.13944     0.52234    -0.72368997
 -0.13501    -0.26104    -0.45321    -0.43656999  0.29662001  0.25174001
 -0.24169999 -0.72171003 -0.47907001 -0.10003     0.17557999  0.51245999
 -0.21233    -0.52855998  0.227      -0.084546    0.72983003 -0.041315
  1.03250003  1.70200002 -0.17155001  0.55111003  0.06694    -0.60178
 -0.28736001  0.16134     0.32778999 -0.59762001  0.085608    0.31759
  0.64714003 -0.26975    -0.25167999 -0.46195    -0.4201      0.51457
  0.14984     0.089569    0.41067001 -0.27408001 -0.78237998 -0.1091

In [15]:
# Encoder embedding layer, initialised from the prepared embedding matrix.
embedding_layer = Embedding(
    num_words,
    EMBEDDING_SIZE,
    weights=[embedding_matrix],
    input_length=max_input_len,
)

In [16]:
# Allocate the one-hot decoder targets:
# (number of sentences, padded output length, output vocabulary size).
one_hot_shape = (len(input_sentences), max_out_len, num_words_output)
decoder_targets_one_hot = np.zeros(one_hot_shape, dtype='float32')

In [17]:
# Display the dimensions of the one-hot target tensor.
np.shape(decoder_targets_one_hot)

(20000, 11, 8214)

In [18]:
# Target token ids (sentences ending in <eos>), zero-padded after the tokens.
decoder_output_sequences = pad_sequences(
    output_integer_seq,
    maxlen=max_out_len,
    padding='post',
)

In [19]:
# Set a 1 at [sentence, time step, token id] for every position of every
# padded target sequence (padding positions mark token id 0).
sentence_ids = np.arange(decoder_output_sequences.shape[0])[:, np.newaxis]
step_ids = np.arange(decoder_output_sequences.shape[1])[np.newaxis, :]
decoder_targets_one_hot[sentence_ids, step_ids, decoder_output_sequences] = 1

In [20]:
# Encoder: token ids -> embeddings -> LSTM.
encoder_inputs_placeholder = Input(shape=(max_input_len,))
x = embedding_layer(encoder_inputs_placeholder)

# return_state=True makes the layer also return its final hidden and cell
# states, which are handed to the decoder as its initial state.
encoder = LSTM(
    LSTM_NODES,
    return_state=True,
)
encoder_outputs, h, c = encoder(x)
encoder_states = [h, c]

In [21]:
# Training-time decoder: it receives the whole '<sos> ...' sequence at once.
decoder_inputs_placeholder = Input(shape=(max_out_len,))

# Target-side embedding, learned from scratch.
decoder_embedding = Embedding(num_words_output, LSTM_NODES)
decoder_inputs_x = decoder_embedding(decoder_inputs_placeholder)

# return_sequences=True yields an output per time step; the states returned
# alongside are discarded here. The layer is reused for the inference model.
decoder_lstm = LSTM(
    LSTM_NODES,
    return_sequences=True,
    return_state=True,
)
decoder_outputs, _, _ = decoder_lstm(
    decoder_inputs_x,
    initial_state=encoder_states,
)

In [22]:
# Softmax over the output vocabulary, applied to the decoder LSTM output.
decoder_dense = Dense(
    num_words_output,
    activation='softmax',
)
decoder_outputs = decoder_dense(decoder_outputs)

In [23]:
# Training model: (encoder tokens, decoder input tokens) -> per-step softmax.
model = Model(
    inputs=[encoder_inputs_placeholder, decoder_inputs_placeholder],
    outputs=decoder_outputs,
)
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

In [24]:
# Train with the one-hot targets; 10% of the samples are held out for validation.
r = model.fit(
    x=[encoder_input_sequences, decoder_input_sequences],
    y=decoder_targets_one_hot,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=0.1,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [25]:
# Inference-time encoder: maps an input sequence to the encoder's [h, c] states.
encoder_model = Model(
    inputs=encoder_inputs_placeholder,
    outputs=encoder_states,
)

In [26]:
# Placeholders through which the previous step's LSTM states are fed back
# into the decoder at inference time.
state_shape = (LSTM_NODES,)
decoder_state_input_h = Input(shape=state_shape)
decoder_state_input_c = Input(shape=state_shape)
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

In [27]:
# At inference the decoder consumes one token per call; the embedding layer
# from the training graph is reused.
decoder_inputs_single = Input(shape=(1,))
decoder_inputs_single_x = decoder_embedding(decoder_inputs_single)

In [28]:
# Run the shared decoder LSTM for a single step, starting from the supplied
# states and keeping the updated ones.
decoder_outputs, h, c = decoder_lstm(
    decoder_inputs_single_x,
    initial_state=decoder_states_inputs,
)

In [29]:
# Project the single-step LSTM output onto the vocabulary, and collect the
# updated states so they can be fed into the next step.
decoder_outputs = decoder_dense(decoder_outputs)
decoder_states = [h, c]

In [30]:
# Inference-time decoder: (token, h, c) -> (token probabilities, new h, new c).
decoder_model = Model(
    inputs=[decoder_inputs_single, *decoder_states_inputs],
    outputs=[decoder_outputs, *decoder_states],
)

In [31]:
# Reverse vocabularies: integer index -> word.
idx2word_input = {index: word for word, index in word2idx_inputs.items()}
idx2word_target = {index: word for word, index in word2idx_outputs.items()}

In [32]:
def translate_sentence(input_seq):
    """Greedily decode a translation for one encoded input sequence.

    The encoder model produces the initial decoder states. The decoder model
    is then called one token at a time, starting from '<sos>', and the
    highest-scoring token of each step is fed back in as the next input.

    Args:
        input_seq: Batch of padded input token ids passed to
            ``encoder_model.predict``; only the first prediction of each
            step is read.

    Returns:
        The decoded words joined by single spaces. Decoding stops at the
        first '<eos>' or after ``max_out_len`` steps; index 0 (padding) is
        never emitted as a word.
    """
    decoder_state = encoder_model.predict(input_seq)

    # A 1x1 array holding the token fed to the decoder at the current step.
    current_token = np.zeros((1, 1))
    current_token[0, 0] = word2idx_outputs['<sos>']
    stop_id = word2idx_outputs['<eos>']

    translated_words = []
    for _step in range(max_out_len):
        scores, state_h, state_c = decoder_model.predict([current_token] + decoder_state)
        best_id = np.argmax(scores[0, 0, :])

        if best_id == stop_id:
            break

        # Index 0 is the padding slot and has no entry in idx2word_target.
        if best_id > 0:
            translated_words.append(idx2word_target[best_id])

        # Feed the prediction and the updated states into the next step.
        current_token[0, 0] = best_id
        decoder_state = [state_h, state_c]

    return ' '.join(translated_words)

In [33]:
# Translate one randomly chosen training sentence and show it next to the
# model's output.
i = np.random.choice(len(input_sentences))
# Slicing (rather than indexing) keeps the leading batch axis.
input_seq = encoder_input_sequences[slice(i, i + 1)]
translation = translate_sentence(input_seq)
print('-')
for label, text in (('Input:', input_sentences[i]), ('Response:', translation)):
    print(label, text)

-
Input: She is young.
Response: sie ist jung.
