In [1]:
!pip install contractions

Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.9/289.9 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.0.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.8/110.8 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyahocorasick, anyascii, textsearch, contractions
Successfully installed anyascii-0.3.2 contractions-0.1.73 pyahocorasick-2.0.0 textsearch-0.0.24


##Import Files

In [2]:
import pandas as pd
import contractions
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense, LSTM, Bidirectional, Embedding, Concatenate, Input
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping

##Data pre process

In [3]:
data = pd.read_csv("/data/deu.txt", header=None, sep="\t")
en_text1 = data[0][:100000]
de_text1 = data[1][:100000]

en_text = []
de_text = []
max_length = 20  # Limit the maximum sequence length for both input and output
de_set = set()
en_set = set()

for word in en_text1:
    en_text.append(contractions.fix(word))
    en_set.add(word)

for word in de_text1:
    de_text.append("[CLS] " + word + " [EOS]")
    de_set.add(word)

de_word_count = len(de_set)
en_word_count = len(en_set)

print(f"Total number of unique English words count: {en_word_count},\nTotal number of unique German words count : {de_word_count}")


Total number of unique English words count: 19410,
Total number of unique German words count : 23158


['[CLS] Geh. [EOS]',
 '[CLS] Hallo! [EOS]',
 '[CLS] Grüß Gott! [EOS]',
 '[CLS] Lauf! [EOS]',
 '[CLS] Lauf! [EOS]']

##Tokenize Data

In [4]:
def tokenize(text,length):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(text)
    token = tokenizer.texts_to_sequences(text)
    token = np.array(pad_sequences(token, maxlen=length, padding='post'))
    return tokenizer, token

en_tokenizer, en_token = tokenize(en_text,max_length)
de_tokenizer, de_token = tokenize(de_text,max_length)

X_train, X_test, y_train, y_test = train_test_split(en_token, de_token, test_size=0.2, random_state=42, shuffle=True)



##Build Encoder & Decoder

In [5]:
# Encoder
encoder_inputs = Input(shape=(max_length,))
enc_emb = Embedding(en_word_count, 256)(encoder_inputs)
enc_lstm1 = Bidirectional(LSTM(128, return_sequences=True, return_state=True))
encoder_outputs1, forw_state_h, forw_state_c, back_state_h, back_state_c = enc_lstm1(enc_emb)
final_enc_h = Concatenate()([forw_state_h, back_state_h])
final_enc_c = Concatenate()([forw_state_c, back_state_c])
encoder_states = [final_enc_h, final_enc_c]

# Decoder
decoder_inputs = Input(shape=(None,))
dec_emb_layer = Embedding(de_word_count, 256)
dec_emb = dec_emb_layer(decoder_inputs)
decoder_lstm = LSTM(256, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)
decoder_dense = Dense(de_word_count, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)




##Build, Compile and fit the Model

In [21]:
# Define the model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

early_stopping = EarlyStopping(monitor='val_accuracy', patience=5)

# Training
encoder_input_data = X_train
decoder_input_data = y_train[:, :-1]
decoder_target_data = y_train[:, 1:]

# Testing
encoder_input_test = X_test
decoder_input_test = y_test[:, :-1]
decoder_target_test = y_test[:, 1:]

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
history = model.fit(
    [encoder_input_data, decoder_input_data],
    decoder_target_data,
    epochs=100,
    batch_size=256,
    validation_data=([encoder_input_test, decoder_input_test], decoder_target_test),
    callbacks=[early_stopping]
)



Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100


## Save the model

In [7]:
# Save the trained model
model.save('seq2seq_model.h5')

"\n# Save tokenizers\nimport pickle\n\nwith open('en_tokenizer.pickle', 'wb') as handle:\n    pickle.dump(en_tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)\n\nwith open('de_tokenizer.pickle', 'wb') as handle:\n    pickle.dump(de_tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)\n"

# Prediction

In [36]:
def translate(input_text):
    _,input_sequence = tokenize([input_text],max_length)

    target_sequence = np.array([[de_tokenizer.word_index['cls']]])  # Start token

    translated_text = ''

    while True:
        output_tokens = model.predict([input_sequence, target_sequence])
        predicted_token_index = np.argmax(output_tokens[0, -1, :])
        predicted_word = de_tokenizer.index_word[predicted_token_index]

        if predicted_word == 'eos' or len(translated_text.split()) >= max_length:
            break

        translated_text += predicted_word + ' '
        target_sequence = np.array([[predicted_token_index]])

    return translated_text


# Translation

In [37]:
# Test the translation
test_sentences = [
    "Ask me",
    "I lied"
]

for input_sentence in test_sentences:
  translated_sentence = translate(input_sentence)
  print("Input:", input_sentence)
  print("Translated:", translated_sentence)
  print("\n")




Input: Ask me
Translated: fragen sie mich


Input: I lied
Translated: ich habe gelogen


