# Create training data

### Imports

In [None]:
import os, random
from datasets import load_dataset

from dimgpt.data.clean import *
from dimgpt.data.prepare import *
from dimgpt.data.tokenizer import *
from dimgpt import utils
from dimgpt.settings import *

# Runs once, before any of the random sampling done in the cells below.
# NOTE(review): presumably re-seeds the random generators so the notebook is
# reproducible — confirm against dimgpt.utils.reset_rand.
utils.reset_rand()

### CC100 fr

In [None]:
# Load the French web corpus through Hugging Face `datasets`; `num_proc` sets the
# number of worker processes used to prepare it.
# NOTE(review): the variable (and the section heading) say CC100, but the dataset
# requested here is 'oscar' with the 'unshuffled_deduplicated_fr' config — confirm
# which corpus is actually intended.
cc100 = load_dataset('oscar', 'unshuffled_deduplicated_fr', num_proc = NUM_THREADS)

In [None]:
# Sanity check: pick one document of the train split at random, clean it and print
# the decoded result. randrange(n) draws from the same 0..n-1 range as randint(0, n - 1).
random_cc100_document = cc100['train'][random.randrange(CC100_NB_DOCUMENTS)]
print(decode_string(clean_document(random_cc100_document['text'])))

### Wikipedia fr

In [None]:
# Load the French Wikipedia dump ('20220301.fr' config) through Hugging Face
# `datasets`; `num_proc` sets the number of worker processes used to prepare it.
wikipedia = load_dataset('wikipedia', '20220301.fr', num_proc = NUM_THREADS)
# Show how many articles the train split contains.
print(len(wikipedia['train']))

In [None]:
# Sanity check: pick one article of the train split at random, clean it and print
# the decoded result. randrange(n) draws from the same 0..n-1 range as randint(0, n - 1).
random_wikipedia_article = wikipedia['train'][random.randrange(WIKIPEDIA_NB_DOCUMENTS)]
print(decode_string(clean_document(random_wikipedia_article['text'])))

### Create vocab

In [None]:
# Build the tokenizer data from both corpora. The helper returns the number of
# characters taken from each source and the collection of characters it found.
nb_from_cc100, nb_from_wikipedia, chars = create_tokenizer_data(cc100, wikipedia)

# Per-source character counts, with thousands separators
print(f'CC100: {nb_from_cc100:,} characters')
print(f'Wikipedia: {nb_from_wikipedia:,} characters')

print('\nNb characters:', len(chars), '\n')

# Dump every character between brackets on a single line, each followed by a space.
# The guard keeps the output empty when there is nothing to show.
if len(chars) > 0:
	print(*(f'[{char}]' for char in chars), end = ' ')

In [None]:
# Files used by this cell, both located in DATA_DIR
tokenizer_data_path = os.path.join(DATA_DIR, 'tokenizer_data.txt')
vocab_path = os.path.join(DATA_DIR, 'vocab.txt')

# Create the tokenizer from the tokenizer data file, then save its vocabulary.
# NOTE(review): tokenizer_data.txt is presumably the file written by
# create_tokenizer_data in the previous cell — confirm in dimgpt.data.
tokenizer = Tokenizer()
tokenizer.create(tokenizer_data_path)
utils.save_text_array(tokenizer.vocab, vocab_path)

print(f'\nVocab size: {len(tokenizer.vocab):,}\n')

# Dump every vocabulary entry between brackets on a single line, each followed by
# a space. The guard keeps the output empty when there is nothing to show.
if len(tokenizer.vocab) > 0:
	print(*(f'[{v}]' for v in tokenizer.vocab), end = ' ')

### French Reddit

In [None]:
# Load the French Reddit conversations from FRENCH_REDDIT_PATH.
# NOTE(review): presumably parses an XML dump into an indexable collection of
# conversations (it is indexed by position in the next cell) — confirm against
# import_xml_dataset.
french_reddit = import_xml_dataset(FRENCH_REDDIT_PATH)

In [None]:
# Sanity check: pick one conversation at random, clean it and print the decoded
# result. randrange(n) draws from the same 0..n-1 range as randint(0, n - 1).
random_reddit_chat = french_reddit[random.randrange(FRENCH_REDDIT_NB_DOCUMENTS)]
print(decode_string(clean_chat(random_reddit_chat)))

### Create training data

In [None]:
# Prepare the training data for the `cc100` dataset, labelled 'cc100'.
# Needs `tokenizer` (created in the vocab cell) and `cc100` (loaded earlier).
# NOTE(review): presumably tokenizes every document and writes the result to disk
# under that label — confirm in dimgpt.data.prepare.
prepare_hf_text_dataset(tokenizer, cc100, 'cc100')

In [None]:
# Prepare the training data for the `wikipedia` dataset, labelled 'wikipedia'.
# Needs `tokenizer` (created in the vocab cell) and `wikipedia` (loaded earlier).
# NOTE(review): presumably tokenizes every article and writes the result to disk
# under that label — confirm in dimgpt.data.prepare.
prepare_hf_text_dataset(tokenizer, wikipedia, 'wikipedia')

In [None]:
# Prepare the training data for the `french_reddit` conversations, labelled
# 'french_reddit'. Needs `tokenizer` (created in the vocab cell) and
# `french_reddit` (loaded earlier).
# NOTE(review): presumably tokenizes every conversation and writes the result to
# disk under that label — confirm in dimgpt.data.prepare.
prepare_xml_dataset(tokenizer, french_reddit, 'french_reddit')