In [None]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import matplotlib.pyplot as plt
import seaborn
from datetime import datetime

In [None]:
# Training hyper-parameters and data location used by the cells below.
BATCH_SIZE = 128  # passed to model.fit as batch_size
EPOCHS = 50  # passed to model.fit as epochs
LATENT_DIM = 256  # number of units in both the encoder LSTM and the decoder LSTM
PATH = r"/data_v45_true-words.csv"  # read as UTF-8 text; each data row is later split on ";" into (input, target)

In [None]:
# Empty containers that the parsing cell fills: the raw word pairs and the
# set of distinct characters seen on each side.
input_texts, target_texts = [], []
input_characters, target_characters = set(), set()

# Read the whole file at once and cut it into rows on "\n".
with open(PATH, mode="r", encoding="utf-8") as f:
    raw_text = f.read()
    lines = raw_text.split("\n")

In [None]:
# Parse every "input;target" row into the text lists and character vocabularies.
#
# The containers are rebuilt from scratch here so that re-running this cell
# does not append the same rows a second time, and so that it still works
# after a later cell has turned the character sets into sorted lists.
input_texts = []
target_texts = []
input_characters = set()
target_characters = set()

# lines[0] is skipped (presumably a header row -- confirm against the file).
# Blank rows are ignored instead of blindly dropping the final element: the
# last data row is then kept even when the file has no trailing newline,
# while the empty string produced by a trailing newline is still skipped.
for line in lines[1:]:
    if not line:
        continue
    input_text, target_text = line.split(";")
    # "<" and ">" mark the start and end of every target sequence.
    target_text = "<" + target_text + ">"
    input_texts.append(input_text)
    target_texts.append(target_text)
    # set.update adds each character of the string; duplicates are ignored.
    input_characters.update(input_text)
    target_characters.update(target_text)

In [None]:
# Show the character vocabularies collected while parsing.
# While these are still sets, the display order is arbitrary.
print("Input characters:\n", input_characters)
print("Target characters:\n", target_characters)

Input characters:
 {'j', 'x', 'o', 'ç', 'ı', 'b', 'g', 'l', 't', 'i', 'k', 's', 'ş', 'f', 'n', 'z', 'd', 'q', 'ğ', 'a', 'h', 'w', 'm', 'r', 'p', 'c', 'u', 'ü', 'e', 'v', 'y', 'ö'}
Target characters:
 {'j', 'o', 'ç', 'ı', 'b', 'g', 'l', 't', 'i', 'k', 's', 'ş', 'f', 'n', 'z', 'd', 'ğ', 'a', 'h', 'm', 'r', 'p', 'c', 'u', 'ü', 'e', 'v', '<', '>', 'y', 'ö'}


In [None]:
# Add the padding symbol "_" to both alphabets and freeze them as sorted lists.
# Building the result from set(...) instead of calling .add() in place keeps
# this cell re-runnable: after the first run the names hold lists, on which
# .add("_") would raise AttributeError. The first-run result is unchanged.
input_characters = sorted(set(input_characters) | {"_"})
target_characters = sorted(set(target_characters) | {"_"})

# Vocabulary sizes give the one-hot width on each side.
num_encoder_tokens = len(input_characters)
num_decoder_tokens = len(target_characters)

# Longest sequence on each side; target lengths include the "<" and ">" markers.
max_encoder_seq_length = max(len(txt) for txt in input_texts)
max_decoder_seq_length = max(len(txt) for txt in target_texts)

print("Number of samples:", len(input_texts))
print("Number of unique input tokens:", num_encoder_tokens)
print("Number of unique output tokens:", num_decoder_tokens)
print("Max sequence length for inputs:", max_encoder_seq_length)
print("Max sequence length for outputs:", max_decoder_seq_length)

Number of samples: 1340433
Number of unique input tokens: 33
Number of unique output tokens: 32
Max sequence length for inputs: 28
Max sequence length for outputs: 29


In [None]:
# Character -> integer id lookup tables; ids follow the position of each
# character in the sorted vocabulary lists.
input_token_index = {char: i for i, char in enumerate(input_characters)}
target_token_index = {char: i for i, char in enumerate(target_characters)}

print("Indexed input tokens:\n", input_token_index)
print("Indexed target tokens:\n", target_token_index)

Indexed input tokens:
 {'_': 0, 'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26, 'ç': 27, 'ö': 28, 'ü': 29, 'ğ': 30, 'ı': 31, 'ş': 32}
Indexed target tokens:
 {'<': 0, '>': 1, '_': 2, 'a': 3, 'b': 4, 'c': 5, 'd': 6, 'e': 7, 'f': 8, 'g': 9, 'h': 10, 'i': 11, 'j': 12, 'k': 13, 'l': 14, 'm': 15, 'n': 16, 'o': 17, 'p': 18, 'r': 19, 's': 20, 't': 21, 'u': 22, 'v': 23, 'y': 24, 'z': 25, 'ç': 26, 'ö': 27, 'ü': 28, 'ğ': 29, 'ı': 30, 'ş': 31}


In [None]:
# Pre-allocate the zero-filled float32 tensors that will hold the one-hot data,
# each shaped (samples, time steps, vocabulary size).
encoder_input_data = np.zeros(
    shape=(len(input_texts), max_encoder_seq_length, num_encoder_tokens),
    dtype=np.float32,
)
decoder_input_data = np.zeros(
    shape=(len(input_texts), max_decoder_seq_length, num_decoder_tokens),
    dtype=np.float32,
)
# The decoder target has exactly the same shape and dtype as the decoder input.
decoder_target_data = np.zeros_like(decoder_input_data)

print("Shape of encoder input data:\n", encoder_input_data.shape)
print("Shape of decoder input data:\n", decoder_input_data.shape)
print("Shape of decoder output data:\n", decoder_target_data.shape)

Shape of encoder input data:
 (1340433, 28, 33)
Shape of decoder input data:
 (1340433, 29, 32)
Shape of decoder output data:
 (1340433, 29, 32)


### One-hot encode tokens

In [None]:
# Feed every input word to the encoder back to front.
# NOTE: this rebinds input_texts in place, so running the cell a second time
# flips the words back to their original order.
input_texts = ["".join(reversed(word)) for word in input_texts]

In [None]:
# Fill the pre-allocated tensors with one-hot vectors.
#
# For each sample:
#   * encoder_input_data holds the input characters, then "_" padding;
#   * decoder_input_data holds the full target ("<" ... ">"), then "_" padding;
#   * decoder_target_data is the decoder input shifted one step to the left
#     (it omits the leading "<"), then "_" padding.
#
# Padding offsets come from the text lengths rather than from the loop
# variable left over after each inner loop. For an empty input_text the inner
# loop never runs, so the leftover index would belong to the previous sample
# (or be undefined for the very first one) and padding would start at the
# wrong step.
input_pad_index = input_token_index["_"]
target_pad_index = target_token_index["_"]

for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
    for t, char in enumerate(input_text):
        encoder_input_data[i, t, input_token_index[char]] = 1.0
    encoder_input_data[i, len(input_text):, input_pad_index] = 1.0

    for t, char in enumerate(target_text):
        decoder_input_data[i, t, target_token_index[char]] = 1.0
        if t > 0:
            # The target at step t - 1 is the decoder input at step t.
            decoder_target_data[i, t - 1, target_token_index[char]] = 1.0
    decoder_input_data[i, len(target_text):, target_pad_index] = 1.0
    # The shifted target is one step shorter than the decoder input; max()
    # keeps the slice start non-negative should a target ever be empty.
    decoder_target_data[i, max(len(target_text) - 1, 0):, target_pad_index] = 1.0

In [None]:
# Drop the names that are no longer needed now that the data has been encoded
# into the NumPy tensors: the file handle, the raw rows and both text lists.
del f, lines, input_texts, target_texts

In [None]:
# --- Encoder -----------------------------------------------------------------
# Takes one-hot character sequences of any length. Its final hidden and cell
# states are what the decoder is initialised with below.
encoder_inputs = keras.Input(shape=(None, num_encoder_tokens))
encoder_lstm = keras.layers.LSTM(
    LATENT_DIM,
    dropout=0.1,
    return_state=True,
    name="encoder_lstm",
)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_inputs)
encoder_states = [state_h, state_c]

# --- Decoder -----------------------------------------------------------------
# Starts from the encoder states and returns one output per time step; a
# softmax layer turns each step into a distribution over target characters.
decoder_inputs = keras.Input(shape=(None, num_decoder_tokens))
decoder_lstm = keras.layers.LSTM(
    LATENT_DIM,
    return_sequences=True,
    return_state=True,
    name="decoder_lstm",
)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
decoder_dense = keras.layers.Dense(num_decoder_tokens, activation="softmax")
decoder_outputs = decoder_dense(decoder_outputs)

# --- Training model ----------------------------------------------------------
# Maps (encoder input, decoder input) to the per-step character distribution.
model = keras.Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_outputs)

model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, None, 33)]   0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, None, 32)]   0           []                               
                                                                                                  
 encoder_lstm (LSTM)            [(None, 256),        296960      ['input_1[0][0]']                
                                 (None, 256),                                                     
                                 (None, 256)]                                                     
                                                                                              

## Train model


In [None]:
# Checkpointing: write the full model (not just the weights) once per epoch.
# The epoch number is substituted into the file name, so each epoch gets its
# own file.
checkpoint_path = '/model25/checkpoints/model1-{epoch:02d}.h5'

# Use the callback from the same `keras` namespace as the rest of the notebook
# (imported at the top via `from tensorflow import keras`) instead of a
# separate mid-notebook import from the standalone `keras` package.
# The bare name is kept bound in case a later cell refers to it.
ModelCheckpoint = keras.callbacks.ModelCheckpoint

checkpoint_callback = ModelCheckpoint(
    filepath=checkpoint_path,
    save_weights_only=False,
    # An integer save_freq counts batches, so save_freq=1 rewrote the model
    # file after every single training step. "epoch" saves once at the end of
    # each epoch, which is what the per-epoch file name implies.
    save_freq="epoch",
)

In [None]:
# Configure training: RMSprop optimiser, categorical cross-entropy over the
# one-hot targets, and accuracy as the reported metric.
model.compile(
    optimizer="rmsprop",
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

# Train and measure the wall-clock duration of the whole fit call.
# Keras takes the validation split from the last 20% of the samples as they
# are ordered in the arrays, before any shuffling.
start_time = datetime.now()

history = model.fit(
    x=[encoder_input_data, decoder_input_data],
    y=decoder_target_data,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=0.2,
    callbacks=[checkpoint_callback],
)

end_time = datetime.now()
print("Duration: {}".format(end_time - start_time))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
1317/8378 [===>..........................] - ETA: 6:32 - loss: 0.0532 - accuracy: 0.9836

KeyboardInterrupt: ignored