# Training

### Imports

In [1]:
import os

from dimgpt import utils
from dimgpt.training import data
from dimgpt.training.model import Model
from dimgpt.training.trainer import Trainer
from dimgpt.data.tokenizer import *
from dimgpt.settings import *

# Call the project's `reset_rand` helper before any other cell runs.
# NOTE(review): presumably this seeds the random generators for reproducibility — confirm in dimgpt.utils.
utils.reset_rand()

### Check GPU

In [2]:
# Report the GPU through the project helper; the recorded output lists the
# device name and the amount of memory available.
utils.check_gpu()

1 GPU is available! Using GPU: "NVIDIA GeForce RTX 3090" (22.77 GB available)


### Tokenizer

In [3]:
# Build the tokenizer from the vocabulary file stored under DATA_DIR.
tokenizer = Tokenizer()
vocab_path = os.path.join(DATA_DIR, 'vocab.txt')
tokenizer.load_from_vocab(utils.load_text_array(vocab_path))

print(f'Vocab size: {len(tokenizer.vocab):,}\n')

# Only preview the vocabulary: printing every entry (32,000 in the recorded
# run) floods the cell output and bloats the saved notebook.
# Set VOCAB_PREVIEW_SIZE to None to print the whole vocabulary again.
VOCAB_PREVIEW_SIZE = 200

for index, v in enumerate(tokenizer.vocab):
	if VOCAB_PREVIEW_SIZE is not None and index >= VOCAB_PREVIEW_SIZE:
		# Make the truncation visible instead of stopping silently.
		print('...')
		break
	print(f'[{v}]', end = ' ')

Vocab size: 32,000


In [252]:
from dimgpt.data.pretokenizer import *
from dimgpt.data.clean import *
import regex

In [253]:
# Sample sentence for the tokenization experiments: plain words plus the
# `<user>` and `<eot>` markers, each surrounded by spaces.
text = 'bonjour les amis swio <user> cxu esf <eot>'
print(text)

bonjour les amis swio <user> cxu esf <eot>


In [254]:
# Run the sample through `clean_string`; in the recorded output the spaces
# around `<user>` and `<eot>` have been removed.
clean_text = clean_string(text)
print(clean_text)

bonjour les amis swio<user>cxu esf<eot>


In [255]:
# Experimental pre-tokenization pattern, assembled from three parts:
#   - an alternation of the raw CONTROL_CHARS entries,
#   - runs of digits, whitespace or letters,
#   - a negated class for everything else, with each control entry appended in brackets.
# NOTE(review): CONTROL_CHARS entries are inserted unescaped, and the bracketed
# entries inside the negated class look like nested sets, which the `regex`
# docs list as Version 1 behaviour — confirm the class excludes what is intended.
control_alternation = r'|'.join(CONTROL_CHARS)
control_brackets = r''.join(f'[{token}]' for token in CONTROL_CHARS)
reg = r'(' + control_alternation + r'|\d+|\s+|\p{L}+|[^\d\p{L}\s' + control_brackets + r']+)'

# The pattern is a single capturing group, so split() also returns the matched
# text; keep only the non-empty pieces.
pieces = regex.split(reg, clean_text, flags = regex.UNICODE, concurrent = False)
words = [piece for piece in pieces if piece]
print(words)

['bonjour', ' ', 'les', ' ', 'amis', ' ', 'swio', '<user>', 'cxu', ' ', 'esf', '<eot>']


In [256]:
# Same text through `split`, which comes from one of the star imports
# (presumably dimgpt.data.pretokenizer — confirm). In the recorded output the
# leading space stays attached to the word that follows it.
# NOTE(review): the recorded output has no trailing '<eot>' element — confirm this is intended.
words = split(clean_text)
print(words)

['bonjour', ' les', ' amis', ' swio', '<user>', 'cxu', ' esf']


In [236]:
# Encode the cleaned text with the loaded tokenizer and show the resulting ids.
# NOTE(review): the recorded output belongs to an earlier execution (In [236])
# than the cell that defines `clean_text` (In [254]), so it may be stale — re-run to confirm.
encoded = tokenizer.encode(clean_text)
print(encoded)

[15565     9   852 31998    17  3037   106 31995   254   272   265  1639
   376]


In [237]:
# Decode the ids back into pieces to inspect how the text was segmented.
# NOTE(review): the two positional `True` flags are opaque from here — check
# Tokenizer.decode for their meaning and consider passing them by keyword.
print(tokenizer.decode(encoded, True, True))

['bonjour', ' les', ' amis', '<eot>', 's', 'wi', 'o', '<user>', 'c', 'x', 'u', ' es', 'f']


### Dataset

In [4]:
# Load the pre-training datasets; the helper returns three objects, unpacked
# here as the training set, a validation set and `val_datasets`.
train_dataset, val_dataset, val_datasets = data.import_pretrain_datasets()
# Alternative source: uncomment the line below (and comment the one above) to
# load the fine-tuning datasets instead.
#train_dataset, val_dataset, val_datasets = data.import_finetune_datasets()

In [194]:
# Pull one batch from the training dataset and inspect its first row, both as
# raw ids and decoded by the tokenizer.
batch_x, batch_y = train_dataset.next()
first_row = batch_x[0]

print(f'Batch shape: {tuple(batch_x.shape)}\n')
print(first_row)
print(tokenizer.decode(first_row))

# Drop the references once the batch has been inspected.
del batch_x, batch_y, first_row

Batch shape: (32, 512)

tensor([  894,     0,  1371,     6, 25225,  2463,    13,   724,    20, 15202,
        10326,    12,   542,    14,  8151,    13,   257,    55, 31996, 15860,
            0,   181,  1629,  2322,  2065, 29012,     8,     2,  2623,  1371,
           13, 23540, 26031,    86,  6798,     0,   665,  1135,    12,   542,
           14,  8151,    13,   257,     3,  2689,    75,  1841, 17561,     1,
         4929, 22400,  1841, 17561,     5,  1841, 24953,    47,  7586,   683,
            9,  1629,    22,  2503,  4917,   177,   784,    10,     2,  5865,
            3,    31,   232,  6518,     4,   831,  1898,    11,  1629,    63,
         2689,    75,  1841, 17561,  4922,   109,  2222, 21749,     3, 31995,
         2034,    19,     7,   483,   263,    16,   574,    15,   853,     0,
         5226,  2799,     6, 13294,   562,  3070,    47,    55, 31996,   283,
            2,   732,     9,   410,     0,   364,     0,   556,  6611,    24,
          125,   147,   991,     1,     

### Model

In [6]:
# Instantiate the model, move it to DEVICE and print its parameter counts
# (total, trainable and non-trainable in the recorded output).
model = Model().to(DEVICE)
model.summary()

Number of parameters: 109,923,072
Number of trainable parameters: 109,923,072
Number of non-trainable parameters: 0


### Training

In [7]:
# Create the trainer from the model, the training dataset and `val_datasets`.
trainer = Trainer(model, train_dataset, val_datasets)
# NOTE(review): presumably looks for an earlier training session to resume — confirm in Trainer.
trainer.find_previous_session()

# Start training. The recorded run ended with a KeyboardInterrupt, i.e. the
# cell was interrupted rather than running to completion.
trainer.train()

KeyboardInterrupt: 