# Create training data

### Imports

In [None]:
import os, random
from datasets import load_dataset

from dimgpt.data.clean import *
from dimgpt.data.prepare import *
from dimgpt.settings import *

### CC100

In [None]:
# Load the 'unshuffled_deduplicated_fr' configuration of the 'oscar' dataset,
# using NUM_THREADS worker processes for the preparation step.
# NOTE(review): the variable (and this section) is named CC100 although the
# dataset requested here is OSCAR — confirm which corpus is intended.
cc100 = load_dataset(
	'oscar',
	'unshuffled_deduplicated_fr',
	num_proc = NUM_THREADS
)

In [None]:
# Sanity check: print one randomly chosen training document after cleaning.
# randrange(n) draws from 0 .. n - 1, the same range as randint(0, n - 1).
print(
	clean_document(
		cc100['train'][random.randrange(CC100_NB_DOCUMENTS)]['text']
	)
)

### Wikipedia

In [None]:
# Load the '20220301.fr' configuration of the 'wikipedia' dataset,
# using NUM_THREADS worker processes for the preparation step.
wikipedia = load_dataset(
	'wikipedia',
	'20220301.fr',
	num_proc = NUM_THREADS
)

In [None]:
# Sanity check: print one randomly chosen Wikipedia article after cleaning.
# randrange(n) draws from 0 .. n - 1, the same range as randint(0, n - 1).
print(
	clean_document(
		wikipedia['train'][random.randrange(WIKIPEDIA_NB_DOCUMENTS)]['text']
	)
)

### Create vocab

In [None]:
# Build the tokenizer training data from both corpora. The call returns three
# values, reported below as a per-corpus character count and the set of
# characters encountered.
nb_from_cc100, nb_from_wikipedia, chars = create_tokenizer_data(cc100, wikipedia)

# Both counts are printed on their own line, with thousands separators.
print(
	f'CC100: {nb_from_cc100:,} characters',
	f'Wikipedia: {nb_from_wikipedia:,} characters',
	sep = '\n'
)

# Each character goes through decode_string before being displayed.
print(f'Characters ({len(chars)}):', list(map(decode_string, chars)))

In [None]:
# Build the tokenizer and display its vocabulary.
# NOTE(review): `tokenizer_data` is not assigned in any visible cell of this
# notebook — confirm it is provided by one of the star imports or by an
# earlier step.
tokenizer = Tokenizer(tokenizer_data)
vocab = tokenizer.vocab

print('\nVocab size:', f'{len(vocab):,.0f}', '\n')

# Emit every entry as "[entry] " in a single write rather than one print call
# per entry; the text produced is the same.
print(''.join(f'[{v}]' + ' ' for v in vocab), end = '')

In [None]:
# Encode the dataset once and cache the result on disk: when the cache file is
# already present it is loaded instead of re-running the encoder.
# NOTE(review): `np` and `tokenizer_data` are not imported/assigned in any
# visible cell — presumably they come from the star imports; confirm.
tokens_path = os.path.join(DATA_DIR, 'tokens.npy')

if not os.path.exists(tokens_path):
	# The meaning of the positional True flag is defined by Tokenizer.encode,
	# which is not visible in this notebook.
	tokens = tokenizer.encode(tokenizer_data, True)
	np.save(tokens_path, tokens)

else:
	tokens = np.load(tokens_path)

# Preview the first 100 token ids, space separated on a single line.
print('\nEncoded dataset:')
print(''.join(str(token) + ' ' for token in tokens[:100]), end = '')

# Decode the same 100 tokens back and display them for comparison.
print('\n\nDecoded dataset:')

example = tokenizer.decode(tokens[:100], True, True)
print_tokens(example)