# In [2]:
import os
import random
import time
import json

import tensorflow as tf

from data import DataLoader
from model import Model

# Single seed shared by the environment, Python's `random`, and TensorFlow.
seed_value = 12

# Both environment variables are written in one call.
# NOTE(review): TF_CPP_MIN_LOG_LEVEL is assigned after `import tensorflow`
# has already run, so it may not silence messages emitted at import time —
# confirm whether it should be set before the import.
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
# setting it here likely only affects child processes — confirm.
os.environ.update(
    {
        "TF_CPP_MIN_LOG_LEVEL": "2",
        "PYTHONHASHSEED": str(seed_value),
    }
)

# Seed the Python-level and TensorFlow-level random number generators.
random.seed(seed_value)
tf.random.set_seed(seed_value)

# --- Dataset locations ------------------------------------------------------
DATA_DIR = "data"

TRAIN_DATA_PATH = os.path.join(DATA_DIR, "train.csv")
VALIDATION_DATA_PATH = os.path.join(DATA_DIR, "validation.csv")
TEST_DATA_PATH = os.path.join(DATA_DIR, "test.csv")

# --- Model hyper-parameters (passed positionally to `Model` further down) ---
EMBEDDING_DIM = 128
RNN_UNITS = 256
DROPOUT_PROBABILITY = 0.2
FILTER_SIZES = [3, 5, 7]
NUM_FILTERS = 128

# --- Training / evaluation settings ------------------------------------------
BATCH_SIZE = 32
EPOCHS = 10
LEARNING_RATE = 0.005
# Original, misspelled name kept as an alias so existing references still work.
LERANING_RATE = LEARNING_RATE
MAX_LENGTH = 256

# --- Checkpoints and logs -----------------------------------------------------
CHECKPOINT_DIR = "checkpoints"
# Template with an `{epoch:03d}` placeholder; it is filled in by whoever
# formats the path (not done in the visible part of this file).
CHECKPOINT_PATH = os.path.join(CHECKPOINT_DIR, "ckpt_{epoch:03d}")

LOG_DIR = "logs"


# Test split, served one example per batch (batch_size=1).
test_dataset = DataLoader(
    TEST_DATA_PATH, max_length=MAX_LENGTH, batch_size=1
).get_dataset()

INPUT_TOKENIZER_PATH = os.path.join("tokenizers", "input_tokenizer.json")
TARGET_TOKENIZER_PATH = os.path.join("tokenizers", "target_tokenizer.json")


def _load_tokenizer(path):
    """Rebuild a Keras tokenizer from the JSON file stored at *path*.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform's default text encoding.

    NOTE(review): the decoded JSON value is handed straight to
    ``tokenizer_from_json``; this presumably means the file holds the string
    produced by ``Tokenizer.to_json()`` — confirm against the code that
    writes these files.
    """
    with open(path, encoding="utf-8") as tokenizer_file:
        tokenizer_json = json.load(tokenizer_file)
    return tf.keras.preprocessing.text.tokenizer_from_json(tokenizer_json)


input_tokenizer = _load_tokenizer(INPUT_TOKENIZER_PATH)
target_tokenizer = _load_tokenizer(TARGET_TOKENIZER_PATH)

# Vocabulary sizes are the number of known words plus one
# (presumably reserving index 0 for padding — confirm).
INPUT_VOCAB_SIZE = len(input_tokenizer.word_index) + 1
TARGET_VOCAB_SIZE = len(target_tokenizer.word_index) + 1

# Constructor arguments for the project-local `Model`, in positional order.
# NOTE(review): BATCH_SIZE is 32 here while the test loader above is built
# with batch_size=1 — confirm the model does not depend on a fixed batch size.
_model_args = (
    BATCH_SIZE,
    INPUT_VOCAB_SIZE,
    EMBEDDING_DIM,
    RNN_UNITS,
    DROPOUT_PROBABILITY,
    FILTER_SIZES,
    NUM_FILTERS,
    MAX_LENGTH,
    TARGET_VOCAB_SIZE,
)
model = Model(*_model_args)

# Adam optimizer using the configured learning rate.
optimizer = tf.keras.optimizers.Adam(learning_rate=LERANING_RATE)

# Sparse categorical cross-entropy; from_logits=True means the loss is fed
# the model's raw (un-normalized) outputs.
loss_function = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)


class HammingDistance(tf.keras.metrics.Mean):
    """Running mean of the per-batch number of mispredicted positions per sample.

    For every batch the predicted class at each position is the argmax over
    the last axis of ``y_pred``. Positions where the prediction differs from
    ``y_true`` are counted over the whole batch and divided by the batch size
    (the first dimension). The ``Mean`` base class then averages those
    per-batch values across ``update_state`` calls.
    """

    def __init__(self, name="hamming_distance"):
        """Create the metric under the given ``name``."""
        super().__init__(name=name)
        self._fn = self.hamming_distance
        # Presumably mirrors the function-like ``__name__`` some Keras code
        # paths expect from a metric — confirm before removing.
        self.__name__ = name

    def hamming_distance(self, y_true, y_pred):
        """Return the mismatch count of one batch divided by its batch size.

        Args:
            y_true: Tensor of target class ids; assumed to match the shape of
                ``y_pred`` without its last axis — TODO confirm.
            y_pred: Tensor of per-class scores with classes on the last axis.

        Returns:
            A scalar float32 tensor; 0 when the batch is empty.
        """
        # argmax of the raw scores equals argmax of their softmax (softmax is
        # monotonic), so the extra softmax pass is not needed.
        predicted = tf.cast(tf.argmax(y_pred, axis=-1), dtype=y_true.dtype)
        mismatches = tf.not_equal(y_true, predicted)
        mismatch_count = tf.reduce_sum(tf.cast(mismatches, tf.float32))
        # Use the dynamic shape: the static ``.shape[0]`` is None whenever the
        # batch dimension is unknown at trace time, which would make the
        # division fail.
        batch_size = tf.cast(tf.shape(mismatches)[0], tf.float32)
        return tf.math.divide_no_nan(mismatch_count, batch_size)

    def update_state(self, y_true, y_pred, sample_weight=None):
        """Fold one batch into the running mean.

        Both inputs are cast to the metric's dtype before the per-batch
        distance is computed and forwarded to ``Mean.update_state``.
        """
        y_true = tf.cast(y_true, self._dtype)
        y_pred = tf.cast(y_pred, self._dtype)
        matches = self._fn(y_true, y_pred)
        return super().update_state(matches, sample_weight=sample_weight)


# Attach optimizer, loss, and the two metrics reported during evaluation.
model.compile(
    optimizer=optimizer,
    loss=loss_function,
    metrics=[tf.keras.metrics.SparseCategoricalAccuracy(), HammingDistance()],
)

checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=model)

# `latest_checkpoint` returns None when the directory holds no checkpoint;
# fail with a clear message instead of passing None on to `load_weights`.
latest = tf.train.latest_checkpoint(CHECKPOINT_DIR)
if latest is None:
    raise FileNotFoundError(
        f"No checkpoint found in {CHECKPOINT_DIR!r}; "
        "train the model before running the evaluation."
    )
# expect_partial(): not every value stored in the checkpoint has to be
# matched by this evaluation-only setup.
model.load_weights(latest).expect_partial()

# `evaluate` yields the loss first, then the compiled metrics in order; the
# starred name collects whatever follows the accuracy (here the
# HammingDistance value).
test_loss, test_accuracy, *is_anything_else_being_returned = model.evaluate(
    test_dataset
)
print("Test loss: ", test_loss)
print("Test accuracy: ", test_accuracy)

# Output:
# Test loss:  0.10725626349449158
# Test accuracy:  0.9721819162368774
