In [1]:
import os, pickle
import numpy as np
import tensorflow as tf
from keras.models import *
from keras.optimizers.optimizer_experimental.adamw import AdamW

from settings import *
import data
from generator import *
from model import *
from callbacks import *
from utils import *
from tokenizer import *
import pretokenizer as mypretk

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Restrict TensorFlow to the first detected GPU, or report that the CPU is used
# when no GPU is listed.
gpus = tf.config.experimental.list_physical_devices('GPU')

if not gpus:
	print('Using CPU :(')

else:
	first_gpu = gpus[0]

	try:
		tf.config.experimental.set_visible_devices(first_gpu, 'GPU')
		print('Using GPU :)')

	# TensorFlow documents RuntimeError for an already-initialised runtime;
	# the message is printed so the notebook keeps running.
	except RuntimeError as error:
		print(error)

Using GPU :)


In [3]:
# Load the corpus through the project's data module.
dataset = data.import_dataset()

# Report the length of the loaded object with thousands separators.
dataset_size = len(dataset)
print('Dataset size:', '{:,.0f}'.format(dataset_size))

# Preview the first 500 items, preceded by a blank line.
# NOTE(review): slicing plus string concatenation implies `dataset` is a str
# here, whereas the `In [25]` cell indexes it like a DatasetDict — confirm what
# data.import_dataset() currently returns.
preview_text = dataset[:500]
print('\n' + preview_text)

Importing dataset...


Found cached dataset cc100 (C:/Users/angel/.cache/huggingface/datasets/cc100/fr-lang=fr/0.0.0/8159941b93eb06d0288bb80be26ddfe8213c0c5e33286619c85ad8e1ee0eb91c)
100%|██████████| 1/1 [03:05<00:00, 185.81s/it]


In [25]:
# Ad-hoc inspection of the loaded object: its type, the number of rows in the
# 'train' split, and the text field of row 50.
# NOTE(review): this cell carries an out-of-order execution count (In [25]) and
# treats `dataset` as a mapping of splits, unlike the cells that slice it as a
# string — confirm whether it should remain in the notebook.
print(type(dataset))

train_split = dataset['train']
print(len(train_split))

print(train_split[50]['text'])

<class 'datasets.dataset_dict.DatasetDict'>
427630359
-J'aime bien toucher à tout, en laissant tomber finalement et en me disant qu'en bossant j'aurais pu être vraiment bon.



In [None]:
# Run the pre-tokenizer on the first 500 items of the dataset.
sample_text = dataset[:500]
words = mypretk.split(sample_text)

# Show the same sample with spaces replaced by underscores, then the split
# result, so the two can be compared.
print(sample_text.replace(' ', '_'))
print_tokens(words)

In [None]:
# Build the tokenizer from the dataset and keep a handle on its vocabulary.
tokenizer = Tokenizer(dataset)
vocab = tokenizer.vocab

# Vocabulary size, formatted with thousands separators.
vocab_size = len(vocab)
print('\nVocab size:', '{:,.0f}'.format(vocab_size), '\n')

# Print every vocabulary entry in brackets on a single line.
for vocab_entry in vocab:
	print(f'[{vocab_entry}]', end = ' ')

In [None]:
# Location of the cached encoded dataset.
tokens_path = os.path.join(DATA_DIR, 'tokens.npy')

if not os.path.exists(tokens_path):
	# No cache yet: encode the dataset and store the result for later runs.
	tokens = tokenizer.encode(dataset, True)
	np.save(tokens_path, tokens)

else:
	# Reuse the cached encoding instead of re-encoding.
	tokens = np.load(tokens_path)

print('\nEncoded dataset:')

# First 100 tokens, shown raw and then decoded.
head_tokens = tokens[:100]

for token_id in head_tokens:
	print(token_id, end = ' ')

print('\n\nDecoded dataset:')

example = tokenizer.decode(head_tokens, True, True)
print_tokens(example)

In [None]:
# Split the encoded dataset into training and validation indexes.
train_indexes, val_indexes = data.split_dataset(tokens)

# Report the size of each split with thousands separators.
for caption, split_indexes in (('Train indexes:', train_indexes), ('Val indexes:  ', val_indexes)):
	print(caption, '{:,.0f}'.format(len(split_indexes)))

In [None]:
# Build the training generator and fetch its first batch.
train_generator = BatchGenerator(tokens, train_indexes, size = STEP_PER_EPOCH)
x, y = train_generator[0]

# First element of each half of the batch (presumably inputs and targets —
# confirm against BatchGenerator).
first_x, first_y = x[0], y[0]


def _cell_end(shown, other):
	"""Return the `end` string that pads `shown` to the wider of the two values.

	The padding is the width difference plus one space, followed by '| ', so
	that the two printed rows line up column by column.
	"""
	widest = max(len(str(shown)), len(str(other)))
	return (widest - len(str(shown)) + 1) * ' ' + '| '


# Row 1: values of x[0], aligned with the row below.
for i in range(len(first_x)):
	print(first_x[i], end = _cell_end(first_x[i], first_y[i]))

print()

# Row 2: values of y[0], aligned with the row above.
for i in range(len(first_y)):
	print(first_y[i], end = _cell_end(first_y[i], first_x[i]))

print('\n')
print_tokens(tokenizer.decode(first_x, True, True))
print_tokens(tokenizer.decode(first_y, True, True))

In [None]:
# Create the network sized to the tokenizer's vocabulary.
model = create_model(len(tokenizer.vocab))

# AdamW configured from the project settings.
# NOTE(review): learning_rate starts at 0.0 — presumably the LRScheduler
# callback sets the actual rate during training; confirm.
adamw_optimizer = AdamW(
	learning_rate = 0.0,
	weight_decay = WEIGHT_DECAY,
	beta_1 = BETA_1,
	beta_2 = BETA_2,
	clipnorm = CLIP_GRADIENTS
)

model.compile(
	optimizer = adamw_optimizer,
	loss = 'sparse_categorical_crossentropy',
	metrics = ['accuracy']
)

model.summary()

In [None]:
# Epoch to start from; stays 0 unless a previous run can be resumed.
init_epoch = 0

# Files written by a previous run.
logs_path = os.path.join(OUTPUT_DIR, 'logs.pkl')
weights_path = os.path.join(OUTPUT_DIR, 'model.h5')
optimizer_path = os.path.join(OUTPUT_DIR, 'optimizer.pkl')

# Resume only when the logs, the weights and the optimizer file all exist.
if all(os.path.exists(path) for path in (logs_path, weights_path, optimizer_path)):

	# Open the log file in a context manager so the handle is closed after reading.
	# NOTE(review): pickle.load executes arbitrary code from the file — only
	# resume from files produced by your own training runs.
	with open(logs_path, 'rb') as logs_file:
		logs = pickle.load(logs_file)

	# Continue from the last epoch recorded in the logs.
	init_epoch = logs['epochs'][-1]

	if NUM_ACCUMULATIONS > 1:
		# NOTE(review): the short silent fit below presumably forces the
		# optimizer/accumulator variables to be created before the saved weights
		# and optimizer state are restored — confirm against load_state and
		# reset_accumulator. The statement order is kept exactly as before.
		load_state(model.optimizer, optimizer_path)
		model.fit(BatchGenerator(tokens, train_indexes, size = NUM_ACCUMULATIONS + 1), batch_size = BATCH_SIZE, epochs = 1, shuffle = False, verbose = 0)
		model.load_weights(weights_path)
		reset_accumulator(model)
		load_state(model.optimizer, optimizer_path)

	else:
		model.load_weights(weights_path)
		load_state(model.optimizer, optimizer_path)

In [None]:
# Validation batches drawn from the validation indexes.
val_generator = BatchGenerator(tokens, val_indexes, size = VAL_STEPS)

# Project callbacks passed to training (defined in the callbacks module).
training_callbacks = [LRScheduler(), SaveModel(), SaveLogs()]

# Train from `init_epoch` up to NUM_EPOCHS; `init_epoch` is non-zero when a
# previous run was resumed.
model.fit(
	train_generator,
	validation_data = val_generator,
	batch_size = BATCH_SIZE,
	validation_batch_size = BATCH_SIZE,
	epochs = NUM_EPOCHS,
	shuffle = False,
	initial_epoch = init_epoch,
	callbacks = training_callbacks
)

In [None]:
# Load the weights stored in best_model.h5.
# NOTE(review): this path is hardcoded while other cells build paths from
# OUTPUT_DIR — confirm both point to the same directory.
best_weights_path = './output/best_model.h5'
model.load_weights(best_weights_path)

In [None]:
# Evaluate the model on batches drawn from the validation indexes.
# NOTE(review): the generator size here is STEP_PER_EPOCH, whereas validation
# during training used VAL_STEPS — confirm this is intentional.
eval_generator = BatchGenerator(tokens, val_indexes, size = STEP_PER_EPOCH)
model.evaluate(eval_generator, batch_size = BATCH_SIZE)

In [None]:
# Generate a continuation for a sample prompt.
# The prompt is stored in `prompt` rather than `input`, so the built-in input()
# stays usable for the rest of the kernel session.
prompt = "<eot>Je vais te tuer ici et maintenant.<eom>"
output = predict(model, prompt, tokenizer, max_length = 256, temperature = 0.5, top_p = 0.95, no_repeat = 1.0, verbose = True)