# Notebook with various tests of the GPT code

In [43]:
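# Enable autoreload (mode 2) so edits to the local GPT / DataLoaderGPT modules are picked up without restarting the kernel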
%load_ext autoreload
%autoreload 2

from GPT import GPT, GPTConfig, Generator
from DataLoaderGPT import DataLoaderGPT
import tiktoken
import torch
import torch.nn as nn
from torch.nn import functional as F
import numpy as np
import os


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Build the model configuration from GPTConfig's defaults and echo it so the
# hyperparameters in use are visible in the cell output.
config = GPTConfig()
config 


GPTConfig(block_size=1024, vocab_size=50257, n_layer=12, n_head=12, n_embd=768)

In [59]:
# Instantiate the model from the configuration above. No checkpoint is loaded
# anywhere in this notebook, so the weights are whatever GPT.__init__ produces.
model = GPT(config)

In [4]:
# Total element count across every tensor returned by model.parameters()
num_params = sum(map(torch.numel, model.parameters()))
print('Number of parameters: {}'.format(num_params))

Number of parameters: 124439808


In [5]:
# Create a random batch of token ids: 5 sequences of config.block_size tokens,
# each id drawn from [0, config.vocab_size). No seed is set in this notebook, so
# the values change from run to run.
x = torch.randint(0, config.vocab_size, (5, config.block_size))
x

tensor([[43794, 28329, 18747,  ..., 11551, 16568, 39041],
        [24101, 36755, 13411,  ...,  7062,  6004, 35578],
        [37311, 18572, 21883,  ..., 19782, 37860, 39041],
        [41361, 35112,  8427,  ..., 39288,  3946, 31934],
        [32468,  7473,  1356,  ...,  3821, 48171, 33738]])

In [6]:
# Inspection-only forward pass: no targets are passed and nothing in this
# notebook calls backward() on the result (only logits.shape is read), so run it
# under no_grad. Otherwise autograd keeps the whole graph alive behind a
# (5, block_size, vocab_size) logits tensor for as long as `logits` stays bound.
with torch.no_grad():
    logits, loss = model(x)

In [7]:
# Sanity check: expect (batch, block_size, vocab_size), i.e. one row of
# vocabulary logits for every position of every input sequence.
logits.shape

torch.Size([5, 1024, 50257])

In [8]:
# Load the GPT-2 encoding from tiktoken; it is handed to the Generator further down.
tokenizer = tiktoken.get_encoding('gpt2')

In [16]:
# Prefer the GPU when CUDA is available; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
# Move the model to that device; as the last expression, the returned module's
# repr is displayed as the cell output.
model.to(device)

GPT(
  (transformer): ModuleDict(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (h): ModuleList(
      (0-11): 12 x Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): CausalSelfAttention(
          (c_attn): Linear(in_features=768, out_features=2304, bias=True)
          (c_proj): Linear(in_features=768, out_features=768, bias=True)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): GELU(approximate='tanh')
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [41]:
# Wrap the model and tokenizer in the project's Generator helper (from GPT.py),
# which is used below to produce text from a prompt.
generator = Generator(model, tokenizer)

Running GPT.py


In [42]:
# Ask for 5 continuations of the prompt; the exact meaning of max_len and top_k
# is defined by Generator.generate in GPT.py. The model has not been trained or
# loaded from a checkpoint in this notebook, so incoherent text is expected here.
generator.generate('The meaning of life is: ', max_len=100, top_k=50, num_return_sequences=5)

['The meaning of life is: apultBG poisoning ran WAR ingestuguportion poisoning recess probes poisoning Katie probes readers dysfunction poisoningsomething WAR extracts highly saferosis univers Earlier powerlessRequiredostics615 Taco assertickr TucEREERE Fact Proposition Fey crops Durhamה hypocriticalsectionalsectional ingestVarious LISTSimonsomething MessagerequiredettesEvaersonufflesomethingrequiredCarter readablevert Might disciples coast 840 Alan Unemployment publisheratoesValues Harvest Faust Unemployment Unemploymentser Nicotine appellateiques Sources candy Unemployment qualifiersrice Reedprinted easierUntitled Pil Alz AllegPrem alleleWIstandard Twenty Parkersomething conversion Panzer NY provocative',
 'The meaning of life is:  Appears curing Appears loweringinkyCool validate squares Repl Earlier900 readers artifact1980 8 cush obe Lak galacticinho alcoholic probes Hive Tuchref 840ensitivity rabbit rabbitensional salvationrou Taco Taco readersToo Enterprise Hamm John autah appella

In [48]:
# Loader settings for a quick check against the text data under ../data/simple/
path = '../data/simple/'
B = 5    # sequences per batch
T = 10   # tokens per sequence
process_rank = 0
num_processes = 1
split = 'train'

# Single-process loader over the 'train' split
data_loader = DataLoaderGPT(
    B=B,
    T=T,
    process_rank=process_rank,
    num_processes=num_processes,
    split=split,
    data_root=path,
    is_text=True,
)

found 1 shards for split train


In [50]:
# Pull one batch from the loader: x holds the input token ids and y is passed
# to the model as the targets further down.
x, y = data_loader.next_batch()

In [51]:
# Sanity check: expect (B, T) as configured for the loader.
x.shape

torch.Size([5, 10])

In [52]:
# Input token ids for the batch, one row per sequence.
x

tensor([[ 5962, 22307,    25,   198,  8421,   356,  5120,   597,  2252,    11],
        [ 3285,   502,  2740,    13,   198,   198,  3237,    25,   198,  5248],
        [  461,    11,  2740,    13,   198,   198,  5962, 22307,    25,   198],
        [ 1639,   389,   477, 12939,  2138,   284,  4656,   621,   284,  1145],
        [  680,    30,   198,   198,  3237,    25,   198,  4965,  5634,    13]])

In [53]:
# Targets for the batch: in the output, each row is the matching row of x
# shifted left by one token.
y

tensor([[22307,    25,   198,  8421,   356,  5120,   597,  2252,    11,  3285],
        [  502,  2740,    13,   198,   198,  3237,    25,   198,  5248,   461],
        [   11,  2740,    13,   198,   198,  5962, 22307,    25,   198,  1639],
        [  389,   477, 12939,  2138,   284,  4656,   621,   284,  1145,   680],
        [   30,   198,   198,  3237,    25,   198,  4965,  5634,    13, 12939]])

In [55]:
# Put the batch on the same device as the model, then run a forward pass with
# targets so that a loss comes back alongside the logits.
x, y = x.to(device), y.to(device)
logits, loss = model(x, y)

In [57]:
# Loss returned by the model for this batch. For reference, a uniform guess over
# a 50257-token vocabulary gives a cross-entropy of ln(50257) ≈ 10.82, so a value
# close to that is what an untrained model should produce.
loss

tensor(10.9079, device='cuda:0', grad_fn=<NllLossBackward0>)