# Create training data

### Imports

In [None]:
from dimgpt.data.clean import *
from dimgpt.data.prepare import *
from dimgpt.data.tokenizer import *
from dimgpt import utils
from dimgpt.settings import *
from dimgpt.datasets import *

# Reset the project's random state before any dataset is touched
# (presumably reseeds the RNGs for reproducibility — confirm in dimgpt.utils).
utils.reset_rand()

### Import datasets

In [None]:
# Instantiate the Cc100 dataset, then print what get_document returns with
# decoding enabled, as a quick visual check of the content.
cc100 = Cc100()
cc100_sample = cc100.get_document(decode = True)
print(cc100_sample)

In [None]:
# Instantiate the Wikipedia dataset, then print what get_document returns with
# decoding enabled, as a quick visual check of the content.
wikipedia = Wikipedia()
wikipedia_sample = wikipedia.get_document(decode = True)
print(wikipedia_sample)

In [None]:
# Instantiate the FrInstructs dataset, then print what get_document returns
# with decoding enabled, as a quick visual check of the content.
fr_instructs = FrInstructs()
fr_instructs_sample = fr_instructs.get_document(decode = True)
print(fr_instructs_sample)

In [None]:
# Instantiate the FrenchReddit dataset, then print what get_document returns
# with decoding enabled, as a quick visual check of the content.
french_reddit = FrenchReddit()
french_reddit_sample = french_reddit.get_document(decode = True)
print(french_reddit_sample)

In [None]:
# Instantiate the FrenchTweets dataset, then print what get_document returns
# with decoding enabled, as a quick visual check of the content.
french_tweets = FrenchTweets()
french_tweets_sample = french_tweets.get_document(decode = True)
print(french_tweets_sample)

In [None]:
# Instantiate the MyTweets dataset, then print what get_document returns with
# decoding enabled, as a quick visual check of the content.
my_tweets = MyTweets()
my_tweets_sample = my_tweets.get_document(decode = True)
print(my_tweets_sample)

### Create vocab

In [None]:
# Datasets that feed create_tokenizer_data, each paired with a numeric value
# (presumably a relative weight or sampling ratio — TODO confirm against
# dimgpt.data.tokenizer).
# NOTE(review): my_tweets is instantiated and saved in this notebook but is
# not part of this list.
datasets: list[tuple[Dataset, float]] = [
	(cc100, 1000),
	(wikipedia, 100),
	(fr_instructs, 100),
	(french_reddit, 10),
	(french_tweets, 1),
]

In [None]:
# Build the tokenizer data from the (dataset, value) pairs; the helper returns
# per-dataset sizes (indexed like `datasets`) and the collection of characters.
sizes, chars = create_tokenizer_data(datasets)

# Report the character count attributed to each dataset.
for i, entry in enumerate(datasets):
	corpus = entry[0]
	print(f'{corpus.name}: {sizes[i]:,} characters')

print('\nNb unique characters:', len(chars), '\n')

# Show every character between brackets so whitespace stays visible.
for symbol in chars:
	print(f'[{symbol}]', end = ' ')

In [None]:
# Locations of the tokenizer input text and of the vocab file to write.
tokenizer_data_path = os.path.join(DATA_DIR, 'tokenizer_data.txt')
vocab_path = os.path.join(DATA_DIR, 'vocab.txt')

# Create the tokenizer from the text file, then persist its vocab.
tokenizer = Tokenizer()
tokenizer.create(tokenizer_data_path)
utils.save_text_array(tokenizer.vocab, vocab_path)

print(f'\nVocab size: {len(tokenizer.vocab):,}\n')

# Show every vocab entry between brackets so whitespace stays visible.
for vocab_entry in tokenizer.vocab:
	print(f'[{vocab_entry}]', end = ' ')

In [None]:
# Delete the text file that was passed to tokenizer.create; it is not read
# again in this notebook. Raises FileNotFoundError if the file is already gone
# (e.g. when this cell is re-run).
os.remove(os.path.join(DATA_DIR, 'tokenizer_data.txt'))

### Save datasets

In [None]:
# Save the Cc100 dataset using the tokenizer built above (presumably encodes
# the documents and writes them to disk — confirm in the dataset's save method).
cc100.save(tokenizer)

In [None]:
# Save the Wikipedia dataset using the tokenizer built above (presumably encodes
# the documents and writes them to disk — confirm in the dataset's save method).
wikipedia.save(tokenizer)

In [None]:
# Save the FrInstructs dataset using the tokenizer built above (presumably encodes
# the documents and writes them to disk — confirm in the dataset's save method).
fr_instructs.save(tokenizer)

In [None]:
# Save the FrenchReddit dataset using the tokenizer built above (presumably encodes
# the documents and writes them to disk — confirm in the dataset's save method).
french_reddit.save(tokenizer)

In [None]:
# Save the FrenchTweets dataset using the tokenizer built above (presumably encodes
# the documents and writes them to disk — confirm in the dataset's save method).
french_tweets.save(tokenizer)

In [None]:
# Save the MyTweets dataset using the tokenizer built above (presumably encodes
# the documents and writes them to disk — confirm in the dataset's save method).
# NOTE(review): my_tweets was not in the list used to build the tokenizer data.
my_tweets.save(tokenizer)