# Pre-training the model

## Load Tokenizer

In [1]:
import sys

# Add the parent directory to the import search path; presumably this is where
# the project packages imported in later cells live (verify against repo layout).
# The membership check keeps the cell idempotent: re-running it does not pile up
# duplicate '..' entries in sys.path.
if '..' not in sys.path:
    sys.path.append('..')

In [2]:
from minbpe import BasicTokenizer

# Location of the saved tokenizer model, relative to the notebook's working directory.
TOKENIZER_MODEL_PATH = '../output/tokenizer/my_tokenizer.model'

# Create an empty tokenizer, then populate it from the saved model file.
tokenizer = BasicTokenizer()
tokenizer.load(model_file=TOKENIZER_MODEL_PATH)

In [3]:
def get_vocab_size(tokenizer: BasicTokenizer) -> int:
    """Return the tokenizer's vocabulary entry count plus its special-token count.

    Args:
        tokenizer: Object exposing sized ``vocab`` and ``special_tokens`` attributes.

    Returns:
        ``len(tokenizer.vocab) + len(tokenizer.special_tokens)``.
    """
    # NOTE(review): this assumes `vocab` does not already include the special
    # tokens; if it does, they are counted twice. Confirm against the tokenizer.
    tables = (tokenizer.vocab, tokenizer.special_tokens)
    return sum(len(table) for table in tables)

# Display len(vocab) + len(special_tokens) for the loaded tokenizer as the cell output.
get_vocab_size(tokenizer)

1033

## Create the language model

In [4]:
import torch

# Fixed seed so runs of this notebook start from the same RNG state.
SEED = 66
torch.manual_seed(SEED)

<torch._C.Generator at 0x2e125fb8790>

In [5]:
from transformer.model import GPTLanguageModel

# Hyperparameters forwarded to GPTLanguageModel below.
block_size = 256
n_embed = 512
n_head = 8
n_layer = 4
dropout = 0.2

# Vocabulary size comes from the loaded tokenizer.
vocab_size = get_vocab_size(tokenizer)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'

model_kwargs = dict(
    vocab_size=vocab_size,
    block_size=block_size,
    n_embed=n_embed,
    n_head=n_head,
    n_layer=n_layer,
    dropout=dropout,
    device=device,
)

# Build the model, move it to the chosen device, then wrap it with torch.compile.
model = GPTLanguageModel(**model_kwargs)
model = model.to(device)
model = torch.compile(model)

# Report the total parameter count in millions.
param_count = sum(param.numel() for param in model.parameters())
print(param_count/1e6, 'M parameters')

13.794313 M parameters
