In [None]:
import os, pickle
import numpy as np
import tensorflow as tf
from keras.callbacks import *
from keras.optimizers.optimizer_experimental.adamw import AdamW

from settings import *
import data
from generator import *
from model import *
from callbacks import *
from utils import *
from tokenizer import *
import pretokenizer as mypretk

In [None]:
# Pin TensorFlow to the first physical GPU when at least one is available;
# otherwise fall through to CPU execution.
# Uses the stable tf.config API rather than the tf.config.experimental aliases.
gpus = tf.config.list_physical_devices('GPU')

if gpus:

	try:
		tf.config.set_visible_devices(gpus[0], 'GPU')
		print('Using GPU :)')

	except RuntimeError as e:
		# set_visible_devices raises RuntimeError once the TensorFlow runtime
		# has already been initialized (e.g. when this cell is re-run late).
		print(e)

else:
	print('Using CPU :(')

In [None]:
# Load the raw training text from DATASET_PATH.
dataset = data.parse_dataset(DATASET_PATH)

# Report its length with thousands separators.
dataset_size = format(len(dataset), ',.0f')
print('Dataset size:', dataset_size)

# Preview the first 500 characters, preceded by a blank line.
preview = dataset[:500]
print('\n' + preview)

In [None]:
# Run the pre-tokenizer on the first 500 characters of the dataset.
preview_text = dataset[:500]
words = mypretk.split(preview_text)

# Print the same excerpt with spaces shown as underscores, followed by the
# pre-tokenizer output, so the two can be compared by eye.
print(preview_text.replace(' ', '_'))
print_tokens(words)

In [None]:
# Build the tokenizer from the full dataset and keep a handle on its vocabulary.
tokenizer = Tokenizer(dataset)
vocab = tokenizer.vocab

vocab_size_text = format(len(vocab), ',.0f')
print('\nVocab size:', vocab_size_text, '\n')

# Dump every vocabulary entry as "[entry] " on one line (no trailing newline).
print(''.join(f'[{v}] ' for v in vocab), end = '')

In [None]:
# Encode the dataset once and cache the result as tokens.npy inside
# PROCESSED_DATA_DIR; later runs load the cached array instead of re-encoding.
tokens_path = os.path.join(PROCESSED_DATA_DIR, 'tokens.npy')

if os.path.exists(tokens_path):
	# NOTE(review): the cache is keyed on the file name only — if the dataset
	# or the tokenizer changes, delete tokens.npy so it gets rebuilt.
	tokens = np.load(tokens_path)

else:
	tokens = tokenizer.encode(dataset, True)

	# np.save does not create missing parent directories, so make sure the
	# target directory exists (skipped when the directory is the cwd, i.e. '').
	if PROCESSED_DATA_DIR:
		os.makedirs(PROCESSED_DATA_DIR, exist_ok = True)

	np.save(tokens_path, tokens)

print('\nEncoded dataset:')

# Show the first 100 token ids on one line.
for token in tokens[:100]:
	print(token, end = ' ')

print('\n\nDecoded dataset:')

# Decode the same 100 ids back for a visual round-trip check.
example = tokenizer.decode(tokens[:100], True, True)
print_tokens(example)

In [None]:
# Split the encoded dataset into training and validation indexes.
train_indexes, val_indexes = data.split_dataset(tokens)

# Report the size of each split with thousands separators.
for label, subset in (('Train indexes:', train_indexes), ('Val indexes:  ', val_indexes)):
	print(label, format(len(subset), ',.0f'))

In [None]:
# Build the training batch generator and inspect the first sequence of its
# first batch.
train_generator = BatchGenerator(tokens, train_indexes, size = STEP_PER_EPOCH)

# Use normal indexing rather than calling __getitem__ directly.
x, y = train_generator[0]

# Each column is padded to the wider of its x/y entries so the '|' separators
# of the two rows line up.
widths = [max(len(str(a)), len(str(b))) for a, b in zip(x[0], y[0])]

# First row: x[0].
for value, width in zip(x[0], widths):
	print(str(value).ljust(width), end = ' | ')

print()

# Second row: y[0].
for value, width in zip(y[0], widths):
	print(str(value).ljust(width), end = ' | ')

print('\n')

# Decoded view of the same two sequences.
print_tokens(tokenizer.decode(x[0], True, True))
print_tokens(tokenizer.decode(y[0], True, True))

In [None]:
# Size the network from the tokenizer's vocabulary.
vocab_size = len(tokenizer.vocab)
model = create_model(vocab_size)

# learning_rate starts at 0.0 — presumably the LRScheduler callback passed to
# fit() supplies the actual schedule; TODO confirm.
optimizer = AdamW(
	learning_rate = 0.0,
	weight_decay = WEIGHT_DECAY,
	beta_1 = BETA_1,
	beta_2 = BETA_2,
	global_clipnorm = CLIP_GRADIENTS
)

model.compile(
	optimizer = optimizer,
	loss = 'sparse_categorical_crossentropy',
	metrics = ['accuracy']
)

model.summary()

In [None]:
# Validation batches are drawn from val_indexes, VAL_STEPS batches per pass.
val_generator = BatchGenerator(tokens, val_indexes, size = VAL_STEPS)

training_callbacks = [
	LRScheduler(STEP_PER_EPOCH),
	# Write weights to model.h5 only when val_loss improves.
	ModelCheckpoint(
		filepath = 'model.h5',
		monitor = 'val_loss',
		save_best_only = True,
		save_weights_only = True
	),
	# Stop after 50 epochs without val_loss improvement and restore the best weights.
	EarlyStopping(
		monitor = 'val_loss',
		patience = 50,
		restore_best_weights = True
	)
]

# Train; the call stays the last expression so the cell still shows its result.
model.fit(
	train_generator,
	validation_data = val_generator,
	batch_size = BATCH_SIZE,
	validation_batch_size = BATCH_SIZE,
	epochs = NUM_EPOCHS,
	shuffle = False,
	callbacks = training_callbacks
)

In [None]:
# Reload the weights stored in 'model.h5' — the same path the ModelCheckpoint
# callback in the training cell writes to.
model.load_weights('model.h5')

In [None]:
# Evaluate on batches drawn from val_indexes.
# NOTE(review): this uses size = STEP_PER_EPOCH whereas the fit cell validates
# with size = VAL_STEPS — confirm the larger evaluation pass is intended.
eval_generator = BatchGenerator(tokens, val_indexes, size = STEP_PER_EPOCH)
model.evaluate(eval_generator, batch_size = BATCH_SIZE)

In [None]:
# Generate a completion for a sample prompt.
# The prompt is named `prompt` rather than `input` so the built-in input()
# is not shadowed for the rest of the kernel session.
# NOTE(review): <eom>/<eod> look like the tokenizer's special delimiter tokens — confirm.
prompt = "<eom><eod>Tu as voté pour qui en 2022 ?<eom>"
output = predict(model, prompt, tokenizer, max_length = 256, temperature = 0.7, top_p = 0.95, verbose = True)