<a href="https://colab.research.google.com/github/antonin97/bc_thesis/blob/main/training_RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Google Colab environment

In [None]:
# Mount Google Drive so that the dataset can be read from it and models/plots written to it.
# This cell only works inside Google Colab, where the `google.colab` package is available.
from google.colab import drive
drive.mount('/content/drive/')

# base directory that every data, model and plot path below is built from
my_path = '/content/drive/My Drive/bc_crypto'

### Local environment

In [None]:
# Base directory for local runs: all paths below are resolved relative to the working directory.
# NOTE(review): this assignment overrides the Colab path if both environment cells are executed,
# so run only the cell that matches the current environment.
my_path =  '.'

### Imports

In [None]:
import tensorflow as tf
import keras
import pandas as pd
import matplotlib.pyplot as plt
import datetime
from tensorflow.keras.utils import plot_model

### Hyperparameters
Currently set for model_10.

In [None]:
# --- dataset ---
total_items = 1_000_000 # number of samples used to size the splits and the shuffle buffer (hardcoded)
text_length = 512 # length of one input text in characters

# fractions of total_items held out for validation and testing; the rest is used for training
validation_ratio = 0.05
test_ratio = 0.05

batch_size = 32

# --- model / optimisation ---
embedding_dimension = 64
learning_rate = 0.001
epochs = 60 # upper bound on the number of training epochs

# --- vocabulary ---
ngram_size = 3
vocab_percent = 0.99 # percent of all n-grams to be used for vocabulary (27**ngram_size * vocab_percent = vocab_size)

# Unique, timestamp-based identifier embedded in the names of saved models and plots.
# Colons from the time part are replaced by dashes because ':' is not a valid character in
# Windows file names (and is awkward on synced drives); the date and time stay joined by '_'.
identifier = datetime.datetime.now().isoformat(sep='_').replace(':', '-')

### Data preprocessing pipeline
The following datasets are needed for the NN training.

In [None]:
# Build a streaming tf.data dataset on top of the CSV file (rows are read lazily, not loaded at once).
file_path = f'{my_path}/wikidata/512_data_encoded.csv'
# Column types, in file order:
#   first column  = string, encrypted text
#   second column = float, % of correctly placed characters <0, 1>
column_defaults = [tf.string, tf.float32]

# Create the CSV dataset; header=True skips the first line of the file
dataset = tf.data.experimental.CsvDataset(
    file_path,
    record_defaults=column_defaults,
    header=True,
)

In [None]:
# Sizes of the three splits; the training split receives whatever remains after rounding.
test_size = int(total_items * test_ratio)
validation_size = int(total_items * validation_ratio)
train_size = total_items - test_size - validation_size

# Shuffle the whole dataset exactly once, in a fixed order.
# The buffer has to hold all the data to get a uniform shuffle. `reshuffle_each_iteration=False`
# is essential: by default tf.data draws a NEW permutation every time the dataset is iterated,
# so the take()/skip() calls below would select different rows on every epoch and samples would
# leak between the training, validation and test splits.
split_seed = 42 # fixed seed keeps the split reproducible across kernel restarts
full_dataset = dataset.shuffle(
    buffer_size=total_items,
    seed=split_seed,
    reshuffle_each_iteration=False,
)

# Split the dataset into train, validation and test sets
train_dataset = full_dataset.take(train_size)
remaining_dataset = full_dataset.skip(train_size) # only used for the further split into validation and test data
validation_dataset = remaining_dataset.take(validation_size)
test_dataset = remaining_dataset.skip(validation_size)

# The split above is now static, so re-introduce per-epoch randomness for the training data only.
train_shuffle_buffer = 10_000 # modest buffer: the data is already uniformly shuffled once
train_dataset = train_dataset.shuffle(
    buffer_size=train_shuffle_buffer,
    reshuffle_each_iteration=True,
)

# creating batches
train_dataset = train_dataset.batch(batch_size)
test_dataset = test_dataset.batch(batch_size)
validation_dataset = validation_dataset.batch(batch_size)

### Creating the NN

#### Text vectorization
The vectorization layer will be adapted to the training dataset.

In [None]:
def ngram_split(text):
    """Split `text` into overlapping character n-grams.

    The input is first broken into individual UTF-8 characters; every window of
    `ngram_size` consecutive characters (global hyperparameter) is then joined
    with an empty separator into a single token.

    Returns the result of `tf.strings.ngrams` for the given input.
    """
    single_chars = tf.strings.unicode_split(text, 'UTF-8')
    char_ngrams = tf.strings.ngrams(single_chars, ngram_width=ngram_size, separator='')
    return char_ngrams

# Vocabulary budget, computed on the fly: the chosen share of all 27**ngram_size possible n-grams
max_vocab_tokens = int((27**ngram_size)*vocab_percent)
# Number of n-grams per text: from a text of length 5, only 3 trigrams can be created (Equation 6)
ngrams_per_text = text_length - ngram_size + 1

# The vectorization layer maps each text to a fixed-length sequence of integer n-gram ids
text_vec = tf.keras.layers.TextVectorization(
    max_tokens=max_vocab_tokens,
    standardize=None, # no standardization, so underscores are kept!
    split=ngram_split, # custom character n-gram split function
    output_mode='int',
    output_sequence_length=ngrams_per_text,
    encoding='utf-8',
)

# The vocabulary is learned from the texts of the training dataset only (labels are dropped)
text_data = train_dataset.map(lambda text, _label: text)
text_vec.adapt(text_data)

### Training the RNN
Only model_10 is included. All other models can be trained by changing the hyperparameters according to Table 4.

#### Model 10 (Figure 12, 13)
Two Bidirectional LSTM layers followed by a dense layer

##### Building the model

In [None]:
# Model 10: vectorization -> embedding -> two bidirectional LSTM layers -> dense head.
model = tf.keras.Sequential([
    # Vectorization layer (already adapted): raw strings -> integer n-gram ids
    text_vec,
    # Embedding layer; one row per entry of the learned vocabulary
    tf.keras.layers.Embedding(
        input_dim=len(text_vec.get_vocabulary()),
        output_dim=embedding_dimension),
    # --- hardcoded architecture ---
    # the first recurrent layer returns the full sequence so that the second one can consume it
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(512, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(256, return_sequences=False)),
    tf.keras.layers.Dense(128, activation='relu'),
    # the sigmoid keeps the prediction in <0, 1>, the same range as the target value
    tf.keras.layers.Dense(1, activation='sigmoid')
    # --- hardcoded architecture ---
])

# Build the optimizer from the `learning_rate` hyperparameter and actually hand it to compile().
# Passing the string 'adam' instead would create a fresh optimizer with the library default
# learning rate and silently ignore the hyperparameter. The optimizer is taken from tf.keras,
# the same namespace the model is built from.
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

model.compile(optimizer=optimizer, loss='mean_absolute_error', metrics=['mean_absolute_error'])

In [None]:
# print a textual summary of the model architecture
model.summary()

##### Plot used in Figure 12

In [None]:
# draw the model as a diagram that shows tensor shapes but hides the layer names
plot_model(model, show_shapes=True, show_layer_names=False)

##### Callbacks

In [None]:
# Stop training once the validation loss has not improved for a while,
# then roll the model back to the weights of its best epoch.
early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    mode='min',                 # lower validation loss is better
    min_delta=0.00001,          # smaller changes do not count as an improvement
    patience=10,                # epochs to wait for an improvement before stopping
    restore_best_weights=True,  # restore the weights from the epoch with the minimum validation loss
    verbose=1,                  # print a message when stopping
)

# Persist the complete model (not just its weights) every time the validation loss improves.
checkpoint_path = f'{my_path}/models/model_{identifier}_2BILSTM+Dense.tf'
checkpoints = keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    monitor='val_loss',
    mode='min',                 # lower validation loss is better
    save_best_only=True,        # skip epochs that did not improve the validation loss
    save_weights_only=False,    # save the entire model
    verbose=1,                  # print a message when the model is saved
)

##### Training the model

In [None]:
# training took around 30 hours
history = model.fit(
     train_dataset,
     epochs=epochs,
     validation_data=validation_dataset,
     callbacks=[early_stopping, checkpoints]
)

##### Plot used in Figure 13 (for the model_1)

In [None]:
# Draw the training and validation loss curves recorded by model.fit(), one point per epoch
for history_key, curve_label in (('loss', 'Training Loss'), ('val_loss', 'Validation Loss')):
    plt.plot(history.history[history_key], label=curve_label)

# Annotate the figure (the loss is the mean absolute error)
plt.title('Model Loss')
plt.xlabel('Epoch')
plt.ylabel('Mean Absolute Error')
plt.legend(loc='upper right')

# Save the figure to disk before showing it
plt.savefig(f'{my_path}/plots/loss_{identifier}_BILSTM+Dense.png')
plt.show()

##### Model evaluation

In [None]:
# Evaluate on the held-out test split.
# The model is compiled with mean absolute error as both its loss and its only metric, so
# evaluate() returns the loss followed by the MAE. It does not return an accuracy, and for
# both values lower is better.
loss, mae = model.evaluate(test_dataset, verbose=1)
print(f'Loss: {loss}, Mean Absolute Error: {mae}')