In [None]:
import os
import numpy as np
import tensorflow as tf
from keras.callbacks import *
from keras.optimizers.optimizer_experimental.adamw import AdamW

from settings import *
import data
from model import *
from tokenizer import *
from callbacks import *

In [None]:
# Restrict TensorFlow to the first GPU when one is listed; otherwise report CPU use.
gpus = tf.config.experimental.list_physical_devices('GPU')

if not gpus:
	print('Using CPU :(')

else:

	try:
		tf.config.experimental.set_visible_devices(gpus[0], 'GPU')

	except RuntimeError as err:
		# Printed rather than re-raised, so the notebook keeps running.
		print(err)

	else:
		print('Using GPU :)')

In [None]:
# Parse the dataset, then print its size, a short excerpt and every distinct character.
dataset, chars = data.parse_dataset(DATASET_PATH)

# Thousands are separated with spaces instead of commas.
print('\nDataset size:', format(len(dataset), ',').replace(',', ' '))
print('Nb chars:    ', format(len(chars), ',').replace(',', ' '))
print('\nExample:')

# Slice rather than index range(100), so a dataset shorter than 100 items
# prints what it has instead of raising IndexError.
for item in dataset[:100]:
	print(item, end = '')

print('\n\nChars:')

# Brackets make whitespace characters visible in the listing.
for c in chars:
	print('[' + c + ']', end = ' ')

In [None]:
# Construct the Tokenizer from `dataset` and list every vocabulary entry in brackets.
tokenizer = Tokenizer(dataset)
vocab_size = len(tokenizer.vocab)

print("\nVocab size:", vocab_size, "\n")

# Entries are looked up by integer position, exactly as the vocabulary is indexed elsewhere.
for token_id in range(vocab_size):
	print("[" + tokenizer.vocab[token_id] + "]", end = " ")

In [None]:
# Encode the dataset once and cache the result on disk; later runs reload the cache.
# NOTE(review): the cache is never invalidated here — delete tokens.npy after changing
# the dataset or the tokenizer, otherwise stale tokens are loaded.
tokens_path = os.path.join(PROCESSED_DATA_DIR, "tokens.npy")

if os.path.exists(tokens_path):
	tokens = np.load(tokens_path)

else:
	tokens = tokenizer.encode(dataset)

	# makedirs creates missing parent directories and does not fail if the directory already exists.
	os.makedirs(PROCESSED_DATA_DIR, exist_ok = True)
	np.save(tokens_path, tokens)

print("Encoded dataset:")

for token in tokens[:50]:
	print(token, end = " ")

print("...")
print("\n\nDecoded dataset:")
print('"', end = "")

# Alternate two ANSI colours so the boundary between neighbouring tokens is visible.
for i, token in enumerate(tokenizer.decode(tokens[:50], True, True)):
	color = "\033[91m" if i % 2 == 0 else "\033[94m"
	print(color + token + "\033[0m", end = "")

print('..."')

In [None]:
# Split the tokens into training and validation indexes and report how many of each there are.
train_indexes, val_indexes = data.split_dataset(tokens)

for label, indexes in (("Train indexes:", train_indexes), ("Val indexes:", val_indexes)):
	print(label, len(indexes))

In [None]:
# Preview the first sample of the first training batch: inputs and targets in aligned
# columns, followed by their decoded text.
train_generator = BatchGenerator(tokens, train_indexes, val_frequency = VAL_FREQUENCY)
x, y = train_generator[0]

first_x, first_y = x[0], y[0]


def _column(value, partner):
	"""Return `value` as text, left-justified to the wider of the pair, followed by " | "."""
	text = str(value)
	return text.ljust(max(len(text), len(str(partner)))) + " | "


# Each cell is padded to the width of the wider of the two values in its column,
# so the input row and the target row line up.
print("".join(_column(first_x[k], first_y[k]) for k in range(len(first_x))))
print("".join(_column(first_y[k], first_x[k]) for k in range(len(first_y))) + "\n")

print('"' + tokenizer.decode(first_x, True) + '"')
print('"' + tokenizer.decode(first_y, True) + '"')

In [None]:
# Create the model from the vocabulary size and compile it with an AdamW optimizer.
model = create_model(len(tokenizer.vocab))

# learning_rate is 0 at construction time — presumably the LRScheduler callback used
# during training sets the real value; TODO confirm.
optimizer = AdamW(
	learning_rate = 0,
	weight_decay = WEIGHT_DECAY,
	beta_1 = BETA_1,
	beta_2 = BETA_2,
	global_clipnorm = CLIP_GRADIENTS
)

model.compile(optimizer = optimizer, loss = 'sparse_categorical_crossentropy', metrics = ['accuracy'])
model.summary()

In [None]:
# Train on the batch generator; validation uses a second generator built over the validation indexes.
# batch_size / validation_batch_size are deliberately not passed: the Keras documentation
# says not to specify them when the data is a generator or keras.utils.Sequence, because
# such inputs already produce whole batches.
model.fit(
	train_generator,
	validation_data = BatchGenerator(tokens, val_indexes, ratio = 1.0 / VAL_FREQUENCY),
	epochs = NUM_EPOCHS,
	shuffle = False,
	callbacks = [
		LRScheduler(len(train_indexes) // VAL_FREQUENCY),
		# Write the weights to model.h5 only when val_loss improves.
		ModelCheckpoint(
			filepath = "model.h5",
			monitor = "val_loss",
			save_best_only = True,
			save_weights_only = True
		),
		# Stop after 20 epochs without a val_loss improvement and restore the best weights.
		EarlyStopping(
			monitor = "val_loss",
			patience = 20,
			restore_best_weights = True
		)
	]
)

In [None]:
# Reload the weights file that the ModelCheckpoint callback writes during training.
model.load_weights(filepath = "model.h5")

In [None]:
# Score the model on a generator built over the validation indexes.
# batch_size is not passed: per the Keras documentation it must not be specified for
# generator / keras.utils.Sequence inputs, which already produce whole batches.
model.evaluate(BatchGenerator(tokens, val_indexes, ratio = 1.0 / VAL_FREQUENCY))

In [None]:
# Generate output with predict(), starting from the single "<eod>" token.
# The list is named `prompt` rather than `input`, so the built-in input() is not
# shadowed for the rest of the kernel session.
# NOTE(review): 256, 1 and True are passed positionally; their meaning is not visible
# in this notebook — confirm against predict().
prompt = ["<eod>"]
output = predict(model, prompt, tokenizer, 256, 1, True)