In [1]:
from dataclasses import dataclass
import torch
import torch.nn as nn
from torch.nn import functional as F
import math
import inspect

In [2]:
class CausalSelfAttention(nn.Module):
    """Multi-head causal self-attention layer.

    One linear layer (``c_attn``) produces queries, keys and values in a single
    matmul, the heads attend causally through
    ``F.scaled_dot_product_attention`` and a second linear layer (``c_proj``)
    maps the concatenated head outputs back to ``n_embd`` features.

    Args:
        config: object exposing ``n_embd`` and ``n_head``; ``n_embd`` must be
            divisible by ``n_head``.
    """

    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        # fused projection: the output is three times as wide so it can be cut into Q, K and V
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
        # output projection back into the residual stream
        self.c_proj = nn.Linear(config.n_embd, config.n_embd)
        # marker read by GPT._init_weights to scale down the init of residual projections
        self.c_proj.NANOGPT_SCALE_INIT = 1
        self.n_head = config.n_head
        self.n_embd = config.n_embd
        # No lower-triangular "bias" mask buffer is registered: the fused attention
        # call in forward() applies the causal mask itself (is_causal=True).

    def forward(self, x):
        """Apply causal self-attention.

        Args:
            x: tensor of shape (B, T, C) = (batch, sequence length, n_embd).

        Returns:
            Tensor of shape (B, T, C).
        """
        batch, seq_len, embd = x.size()
        # every head works on an equal slice of the embedding: hs = C // nh
        head_shape = (batch, seq_len, self.n_head, embd // self.n_head)

        qkv = self.c_attn(x)
        q, k, v = qkv.split(self.n_embd, dim=2)
        # (B, T, C) -> (B, nh, T, hs) so that attention runs independently per head
        q = q.view(head_shape).transpose(1, 2)
        k = k.view(head_shape).transpose(1, 2)
        v = v.view(head_shape).transpose(1, 2)

        # Fused equivalent of softmax(mask(q @ k^T / sqrt(hs))) @ v
        attended = F.scaled_dot_product_attention(q, k, v, is_causal=True)  # (B, nh, T, hs)

        # put the heads back side by side: (B, nh, T, hs) -> (B, T, C)
        merged = attended.transpose(1, 2).contiguous().view(batch, seq_len, embd)
        return self.c_proj(merged)


In [3]:
class MLP(nn.Module):
    """Position-wise feed-forward network: expand 4x, tanh-GELU, project back.

    Args:
        config: object exposing ``n_embd``.
    """

    def __init__(self, config):
        super().__init__()
        hidden_width = 4 * config.n_embd
        # widen to a larger space so more features can be learned
        self.c_fc = nn.Linear(config.n_embd, hidden_width)
        # tanh approximation of GELU
        self.gelu = nn.GELU(approximate="tanh")
        # narrow back to the embedding width
        self.c_proj = nn.Linear(hidden_width, config.n_embd)
        # marker read by GPT._init_weights to scale down the init of residual projections
        self.c_proj.NANOGPT_SCALE_INIT = 1

    def forward(self, x):
        """Return ``c_proj(gelu(c_fc(x)))``; the last dimension stays ``n_embd``."""
        return self.c_proj(self.gelu(self.c_fc(x)))

In [4]:
class Block(nn.Module):
    """One transformer block: LayerNorm -> attention and LayerNorm -> MLP, each
    added back onto the residual stream.

    Args:
        config: object exposing ``n_embd`` (plus whatever ``CausalSelfAttention``
            and ``MLP`` read from it).
    """

    def __init__(self, config):
        super().__init__()
        self.ln_1 = nn.LayerNorm(config.n_embd)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = nn.LayerNorm(config.n_embd)
        self.mlp = MLP(config)

    def forward(self, x):
        """Run both residual sub-layers on ``x`` and return the updated stream."""
        # attention: the step where tokens exchange information
        attn_out = self.attn(self.ln_1(x))
        x = x + attn_out
        # MLP: every token is processed on its own
        mlp_out = self.mlp(self.ln_2(x))
        return x + mlp_out

In [5]:
@dataclass
class GPTConfig:
    """Model hyperparameters.

    The defaults equal the values ``GPT.from_pretrained`` uses for the ``gpt2``
    checkpoint.
    """

    # maximum sequence length (number of learned positions)
    block_size: int = 1024
    # vocabulary size (number of distinct tokens)
    vocab_size: int = 50257
    # number of transformer blocks
    n_layer: int = 12
    # number of attention heads per block
    n_head: int = 12
    # embedding width
    n_embd: int = 768

In [6]:
class GPT(nn.Module):
    """GPT-2 style decoder-only transformer language model.

    Token embeddings and learned positional embeddings are summed, passed
    through ``config.n_layer`` ``Block`` modules, normalised by a final
    LayerNorm and projected to vocabulary logits by ``lm_head``. The token
    embedding matrix and ``lm_head`` share one weight tensor.

    Args:
        config: object exposing ``block_size``, ``vocab_size``, ``n_layer``,
            ``n_head`` and ``n_embd`` (e.g. a ``GPTConfig``).
    """

    def __init__(self, config):
        super().__init__()
        self.config = config

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd), # token embeddings
            wpe = nn.Embedding(config.block_size, config.n_embd), # positional embeddings
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]), # the hidden layers
            ln_f = nn.LayerNorm(config.n_embd) # final layer norm
        ))
        # output projection to the vocab size
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

        # weight sharing scheme: the input embedding and the output projection use the same tensor
        self.transformer.wte.weight = self.lm_head.weight

        # applies the _init_weights function to all the sub modules of this module
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialise one sub-module (callback for ``self.apply``).

        Linear and Embedding weights are drawn from N(0, 0.02^2) and Linear
        biases are zeroed. Linear layers carrying a truthy
        ``NANOGPT_SCALE_INIT`` attribute (the residual projections of the MLP
        and attention layers) get their std scaled by ``(2 * n_layer) ** -0.5``;
        the factor 2 is there because every block adds to the residual stream
        twice (attention and MLP).
        """
        if isinstance(module, nn.Linear):
            std = 0.02
            if getattr(module, "NANOGPT_SCALE_INIT", False):
                std *= (2 * self.config.n_layer)**-0.5
            torch.nn.init.normal_(module.weight, mean=0.0, std=std)
            if module.bias is not None: # lm_head is created without a bias
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, optionally, the cross-entropy loss.

        Args:
            idx: integer tensor of token indices, shape (B, T) with
                ``T <= config.block_size``.
            targets: optional integer tensor of shape (B, T) holding the
                expected next token for every position.

        Returns:
            ``(logits, loss)`` where ``logits`` has shape (B, T, vocab_size)
            and ``loss`` is a scalar tensor, or ``None`` when ``targets`` is
            not given.

        Raises:
            AssertionError: if ``T`` exceeds ``config.block_size``.
        """
        B, T = idx.size()
        # make sure prompt seqlen less than or equal to max model seq length
        assert T <= self.config.block_size, f"Cannot forward, index has length {T}, block size is {self.config.block_size}"
        # positions are created on the same device as the input
        pos = torch.arange(0, T, dtype=torch.long, device = idx.device) # shape (T)
        pos_emb = self.transformer.wpe(pos) # shape (T, n_embd)
        token_emb = self.transformer.wte(idx) # shape (B, T, n_embd)
        x = token_emb + pos_emb # pos_emb is broadcast over the batch dimension

        # forward the blocks of the transformer
        for block in self.transformer.h:
            x = block(x)

        # layernorm and classifier
        x = self.transformer.ln_f(x)
        logits = self.lm_head(x)
        loss = None

        if targets is not None:
            # cross_entropy wants (N, vocab_size) logits and (N,) targets
            logits_flattened = logits.view(-1, logits.size(-1))
            targets_flattened = targets.view(-1)
            loss = F.cross_entropy(logits_flattened, targets_flattened)

        return logits, loss

    def configure_optimizers(self, weight_decay, learning_rate, device):
        """Build the AdamW optimizer with weight decay on matrices only.

        Args:
            weight_decay: decay applied to every trainable parameter with at
                least two dimensions; parameters with fewer dimensions (biases,
                LayerNorm parameters) are not decayed.
            learning_rate: initial learning rate of both parameter groups.
            device: device the model lives on, as a string such as ``"cpu"``,
                ``"cuda"`` or ``"cuda:0"``, or a ``torch.device``. Only used to
                decide whether the fused AdamW kernel is requested.

        Returns:
            The configured ``torch.optim.AdamW`` instance.
        """
        # collect the parameters that require gradients
        param_dict = {pn: p for pn, p in self.named_parameters() if p.requires_grad}

        # weight decay acts as a regulariser (similar to L2 regularisation);
        # it is applied to tensors with >= 2 dims only, so biases are left alone
        decay_params = [p for p in param_dict.values() if p.dim() >= 2]
        nodecay_params = [p for p in param_dict.values() if p.dim() < 2]
        optim_groups = [
            {'params': decay_params, 'weight_decay': weight_decay},
            {'params': nodecay_params, 'weight_decay': 0.0}
        ]

        num_decay_params = sum(p.numel() for p in decay_params)
        num_nodecay_params = sum(p.numel() for p in nodecay_params)
        print(f"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params:,} parameters")
        print(f"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params:,} parameters")

        # Use the fused AdamW kernel when this torch build offers it and the
        # model is on a CUDA device. The prefix test also accepts "cuda:N"
        # strings (the DDP case) and torch.device objects.
        fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters
        use_fused = fused_available and str(device).startswith("cuda")
        print(f"using fused AdamW: {use_fused}")
        # Pass `fused` only when it is requested: builds without the keyword
        # would reject it, and omitting it leaves AdamW's own default untouched.
        extra_args = {'fused': True} if use_fused else {}
        optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=(0.9, 0.95), eps=1e-8, **extra_args)
        return optimizer

    @classmethod
    def from_pretrained(cls, model_type):
        """Loads pretrained GPT-2 model weights from huggingface.

        Args:
            model_type: one of ``'gpt2'``, ``'gpt2-medium'``, ``'gpt2-large'``
                or ``'gpt2-xl'``.

        Returns:
            A ``GPT`` instance whose parameters were copied from the matching
            ``transformers.GPT2LMHeadModel`` checkpoint.

        Raises:
            AssertionError: on an unknown ``model_type`` or when parameter
                counts or shapes do not line up with the checkpoint.
        """
        assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}
        from transformers import GPT2LMHeadModel
        print("loading weights from pretrained gpt: %s" % model_type)

        # n_layer, n_head and n_embd are determined from model_type
        config_args = {
            'gpt2':         dict(n_layer=12, n_head=12, n_embd=768),  # 124M params
            'gpt2-medium':  dict(n_layer=24, n_head=16, n_embd=1024), # 350M params
            'gpt2-large':   dict(n_layer=36, n_head=20, n_embd=1280), # 774M params
            'gpt2-xl':      dict(n_layer=48, n_head=25, n_embd=1600), # 1558M params
        }[model_type]
        config_args['vocab_size'] = 50257 # always 50257 for GPT model checkpoints
        config_args['block_size'] = 1024 # always 1024 for GPT model checkpoints
        # create a from-scratch initialized model
        config = GPTConfig(**config_args)
        model = GPT(config)
        sd = model.state_dict()
        sd_keys = sd.keys()
        sd_keys = [k for k in sd_keys if not k.endswith('.attn.bias')] # discard this mask / buffer, not a param

        # init a huggingface/transformers model
        model_hf = GPT2LMHeadModel.from_pretrained(model_type)
        sd_hf = model_hf.state_dict()

        # copy while ensuring all of the parameters are aligned and match in names and shapes
        sd_keys_hf = sd_hf.keys()
        sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.masked_bias')] # ignore these, just a buffer
        sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.bias')] # same, just the mask (buffer)
        # the openai checkpoints use a "Conv1D" module, but we only want to use a vanilla Linear,
        # so these weights have to be transposed when they are imported
        transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']
        assert len(sd_keys_hf) == len(sd_keys), f"mismatched keys: {len(sd_keys_hf)} != {len(sd_keys)}"
        for k in sd_keys_hf:
            if any(k.endswith(w) for w in transposed):
                # special treatment for the Conv1D weights we need to transpose
                assert sd_hf[k].shape[::-1] == sd[k].shape
                with torch.no_grad():
                    sd[k].copy_(sd_hf[k].t())
            else:
                # vanilla copy over the other parameters
                assert sd_hf[k].shape == sd[k].shape
                with torch.no_grad():
                    sd[k].copy_(sd_hf[k])

        return model

In [7]:
pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m53.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.8.0


In [8]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

with open("input.txt", "r") as f:
  text = f.read()

text = text[:1000]
print(text[:100])

--2024-11-24 19:10:59--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2024-11-24 19:10:59 (133 MB/s) - ‘input.txt’ saved [1115394/1115394]

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [9]:
import tiktoken
class DataLoaderLite:
  def __init__(self, B, T, process_rank, num_processes):
    self.B = B
    self.T = T
    self.process_rank = process_rank
    self.num_processes = num_processes

    # at initialization load tokens from disc and store them into memory
    with open("input.txt", "r") as f:
      text = f.read()

    # encode the text into tokens
    enc = tiktoken.get_encoding("gpt2")
    tokens = enc.encode(text)
    self.tokens = torch.tensor(tokens)

    print(f"loaded {len(self.tokens)} tokens")
    print(f"1 epoch = {len(self.tokens) // (self.B * self.T)} batches")

    # state
    self.current_position = self.B * self.T * self.process_rank

  def next_batch(self):
    B, T = self.B, self.T
    buf = self.tokens[self.current_position: self.current_position + B*T + 1]
    # update current position
    self.current_position += B*T * self.num_processes
    # get the x and y
    x = buf[:-1].view(B, T) # get everything but the last token
    y = buf[1:].view(B, T) # get correct next tokens

    # wrap back around if next batch results in OOB
    if self.current_position + B*T*self.num_processes + 1 > len(self.tokens):
      self.current_position = B*T * self.num_processes

    return x, y

In [10]:
pip install triton

Collecting triton
  Downloading triton-3.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.3 kB)
Downloading triton-3.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (209.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.5/209.5 MB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: triton
Successfully installed triton-3.1.0


In [11]:
import triton

In [15]:
import time
import os
from torch.distributed import init_process_group, destroy_process_group
from torch.nn.parallel import DistributedDataParallel as DDP
import torch.distributed as dist

# Launch the script with (torchrun --standalone --nproc_per_node=8 train_gpt2.py)

# set up DDP (distributed data parallel)
# use torchrun to set RANK, LOCAL_RANK, and WORLD_SIZE
# A torchrun launch is detected through the RANK environment variable.
ddp = int(os.environ.get("RANK", -1)) != -1
if ddp:
  assert torch.cuda.is_available(), "DDP requires CUDA"
  init_process_group(backend="nccl")
  ddp_rank = int(os.environ["RANK"])
  ddp_local_rank = int(os.environ["LOCAL_RANK"])
  ddp_world_size = int(os.environ["WORLD_SIZE"])
  device = f"cuda:{ddp_local_rank}"
  torch.cuda.set_device(device)
  master_process = ddp_rank == 0 # rank 0 is chosen as the process that logs and saves
else:
  # single process run
  master_process = True
  ddp_rank = 0
  ddp_local_rank = 0
  ddp_world_size = 1
  device = "cpu"
  if torch.cuda.is_available():
    device = "cuda"


torch.manual_seed(1337)
if torch.cuda.is_available():
  torch.cuda.manual_seed(1337)

total_batch_size = 524288 # ~.5M tokens, 2**19
B = 8 # sequences per micro-batch
T = 1024 # tokens per sequence
assert total_batch_size % (B * T * ddp_world_size) == 0, "make sure total_batch_size divisibel by B * T * ddp_world_size"
# Gradient accumulation simulates the large batch without its memory cost:
# every optimizer step runs grad_accum_steps forward/backward passes.
grad_accum_steps = total_batch_size // (B * T * ddp_world_size)

# print once
if master_process:
  print(f"total desired batch size {total_batch_size}")
  print(f"gradient accumulation steps {grad_accum_steps}")

# The loader must use the same B and T that grad_accum_steps was derived from,
# so they are passed through instead of being repeated as literals.
train_loader = DataLoaderLite(B=B, T=T, process_rank=ddp_rank, num_processes=ddp_world_size)

# TF32 matmuls are left disabled here (noted as not available on the Tesla T4 GPU)
#torch.set_float32_matmul_precision("high")


#model = GPT.from_pretrained('gpt2')
# vocab_size is passed explicitly as 50304 instead of the GPTConfig default
model = GPT(GPTConfig(vocab_size=50304))
model.to(device) # move tensors to device
# torch.compile: compilation takes a while up front, after which execution is
# faster; it sees which operations need to run and fuses kernels to reduce the
# number of operations.
model = torch.compile(model)

if ddp:
  model = DDP(model, device_ids=[ddp_local_rank])

# handle used to build the optimizer: the module inside the DDP wrapper when distributed
raw_model = model.module if ddp else model

# Learning-rate schedule settings.
max_lr = 6e-4            # peak learning rate
min_lr = max_lr * .1     # floor: 10% of the max according to paper
warmup_steps = 5         # steps of linear warm-up
max_steps = 50           # step at which the decay reaches min_lr


def get_lr(step):
  """Return the learning rate for optimizer step ``step``.

  Schedule: linear warm-up to ``max_lr`` over the first ``warmup_steps``
  steps, cosine decay from ``max_lr`` to ``min_lr`` until ``max_steps``, and
  ``min_lr`` afterwards.
  """
  if step < warmup_steps:
    # (step + 1) so that step 0 already has a non-zero learning rate
    return max_lr * (step + 1) / warmup_steps
  elif step > max_steps:
    return min_lr

  # fraction of the decay phase completed so far, between 0 and 1
  decay_ratio = (step - warmup_steps) / (max_steps - warmup_steps)
  assert 0 <= decay_ratio <= 1

  # cosine weight: 1 at the start of the decay, 0 at the end
  coeff = 0.5 * (1 + math.cos(math.pi * decay_ratio))
  return min_lr + coeff * (max_lr - min_lr)


# Create the optimizer. The initial learning rate is max_lr; the loop below
# overwrites it on every step with the scheduled value from get_lr().
#optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4, betas=(0.9, 0.95), eps=1e-8)
optimizer = raw_model.configure_optimizers(weight_decay=0.1, learning_rate=max_lr, device=device)

# optimize loop
for step in range(max_steps):
  t0 = time.time()

  # gradients accumulate across backward() calls, so clear them first
  optimizer.zero_grad()
  loss_accum = 0.0
  for micro_step in range(grad_accum_steps):
    # get the next micro-batch and move it from cpu to device
    x, y = train_loader.next_batch()
    x = x.to(device)
    y = y.to(device)
    # bfloat16 autocast is not used here (noted as not supported on the Tesla T4 GPU)
    #with torch.autocast(device_type=device, dtype=torch.bfloat16):
    # calculate logits and the loss
    logits, loss = model(x, y)
    # Normalize so that the summed micro-batch gradients equal the gradient
    # of the mean loss over the whole accumulated batch.
    loss = loss / grad_accum_steps
    loss_accum += loss.detach()
    if ddp:
      # only share gradients between processes on the last micro-step
      model.require_backward_grad_sync = (micro_step == grad_accum_steps - 1)
    loss.backward() # gradients keep adding up b/c .backward() always does a +=

  # Average the logged loss across processes once per step, after all
  # micro-steps have been accumulated (one collective instead of one per micro-step).
  if ddp:
    dist.all_reduce(loss_accum, op=dist.ReduceOp.AVG)

  norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) # clip gradients to prevent exploding gradients

  lr = get_lr(step)
  # the optimizer has more than one param group; set the scheduled lr on each
  for p in optimizer.param_groups:
    p['lr'] = lr

  # do the optimization
  optimizer.step()

  # The CPU only schedules GPU kernels and then continues, so wait for the GPU
  # before reading the clock. Skipped when CUDA is not available.
  if torch.cuda.is_available():
    torch.cuda.synchronize()
  t1 = time.time()
  if master_process:
    print(f"step {step} loss {loss_accum.item():.6f} time {t1-t0} lr {lr:.4e} norm {norm:.4f}")

if ddp:
  destroy_process_group()

total desired batch size 524288
gradient accumulation steps 64
loaded 338025 tokens
1 epoch = 41 batches
num decayed parameter tensors: 50, with 124,354,560 parameters
num non-decayed parameter tensors: 98, with 121,344 parameters
using fused AdamW: True
step 0 loss 10.938887 time 155.32537126541138 lr 1.2000e-04 norm 27.0152
step 1 loss 9.593569 time 127.2419023513794 lr 2.4000e-04 norm 7.5646
step 2 loss 9.076535 time 127.14698219299316 lr 3.6000e-04 norm 2.3658
step 3 loss 9.117402 time 127.20558905601501 lr 4.8000e-04 norm 5.1234
step 4 loss 8.451342 time 126.84460926055908 lr 6.0000e-04 norm 2.3033
step 5 loss 8.005286 time 126.78626203536987 lr 6.0000e-04 norm 1.9718
step 6 loss 7.592834 time 126.73928022384644 lr 5.9934e-04 norm 1.7762
step 7 loss 7.237749 time 127.0109224319458 lr 5.9737e-04 norm 1.4863
step 8 loss 6.896439 time 126.99690842628479 lr 5.9410e-04 norm 1.1192
step 9 loss 6.655927 time 127.00219416618347 lr 5.8954e-04 norm 1.1407
step 10 loss 6.499631 time 127.0056

In [16]:
# Save the trained weights to the working directory, from the master process only.
# NOTE(review): `model` here is the object returned by torch.compile (and wrapped
# in DDP for distributed runs), so the saved state-dict keys may carry wrapper
# prefixes — confirm before loading this file into a plain GPT instance.
if master_process:
    torch.save(model.state_dict(), "gpt_trained_model.pth")

In [None]:
def generate_text(model, prompt, max_length=50):
    """Sample a continuation of ``prompt`` from ``model``.

    NOTE(review): a later cell defines another ``generate_text`` with a
    different signature; whichever cell ran last is the one that gets called.

    Args:
        model: callable returning ``(logits, loss)`` for a (B, T) tensor of
            token ids; it is switched to evaluation mode.
        prompt: text to continue, tokenised with the GPT-2 tiktoken encoding.
        max_length: number of new tokens to sample.

    Returns:
        The decoded prompt followed by the sampled tokens, as one string.

    Reads the notebook-level ``device`` variable to place the input tensor.
    """
    # Tokenize the input prompt and add the batch dimension
    enc = tiktoken.get_encoding("gpt2")
    input_ids = torch.tensor(enc.encode(prompt), dtype=torch.long).unsqueeze(0)

    # Ensure the model is in evaluation mode
    model.eval()

    # Move input to the correct device (GPU or CPU)
    generated = input_ids.to(device)

    # No gradients are needed anywhere in the sampling loop
    with torch.no_grad():
        for _step in range(max_length):
            logits, _loss = model(generated)

            # Keep the logits of the last position only, and only the ids the
            # tokenizer knows: the model may have been built with a vocabulary
            # larger than the tokenizer's, and such extra ids cannot be decoded.
            logits = logits[:, -1, :enc.n_vocab]
            probabilities = torch.nn.functional.softmax(logits, dim=-1)

            # Sample from the distribution (can use `torch.argmax` for greedy decoding)
            next_token = torch.multinomial(probabilities, 1)

            # Append the generated token to the input sequence
            generated = torch.cat([generated, next_token], dim=1)

    # Decode the generated tokens back into text
    generated_text = enc.decode(generated[0].tolist())
    return generated_text


# Example usage: sample 100 new tokens after the prompt and print the result.
# NOTE(review): this call matches the generate_text(model, prompt, max_length)
# defined directly above; a later cell redefines generate_text with a different
# signature, so this cell only works when run before that one.
prompt = "Once upon a time"
generated_text = generate_text(model, prompt, max_length=100)
print(generated_text)

In [17]:
# Mount Google Drive (Colab only) so the checkpoint outlives the runtime.
from google.colab import drive
drive.mount('/content/drive')

# Destination inside the mounted drive; written by the master process only.
# NOTE(review): `model` is the torch.compile'd (and, when distributed, DDP-wrapped)
# object, so the saved keys may carry wrapper prefixes — confirm before loading.
save_path = "/content/drive/My Drive/gpt_trained_model.pth"  # Update path as needed
if master_process:
    torch.save(model.state_dict(), save_path)
    print(f"Model saved to {save_path}")

Mounted at /content/drive
Model saved to /content/drive/My Drive/gpt_trained_model.pth


In [18]:
# Show the device string currently held in the notebook-level `device` variable.
print(device)

cuda


In [22]:
import torch._dynamo
# NOTE(review): presumably makes the torch.compile'd model fall back to eager
# execution instead of raising when compilation fails — confirm against the
# TorchDynamo documentation. If so, genuine compile errors are hidden too.
torch._dynamo.config.suppress_errors = True

# Define the evaluation loop for text generation
def generate_text(model, enc, device, num_return_sequences=2, max_length=128, prompt="What hath you say,", top_k=50):
    """Sample continuations of ``prompt`` with top-k sampling and print them.

    Args:
        model: callable returning ``(logits, loss)`` for a (B, T) tensor of
            token ids; it is switched to evaluation mode.
        enc: tokenizer exposing ``encode(str)`` and ``decode(list[int])``.
        device: device to run on, as a string (``"cpu"``, ``"cuda"``,
            ``"cuda:0"``) or a ``torch.device``.
        num_return_sequences: number of independent samples to draw.
        max_length: total sequence length (prompt included) to generate up to.
        prompt: text every sample starts from.
        top_k: number of most likely tokens sampling is restricted to; clamped
            to the vocabulary size.

    Returns:
        None. Each decoded sample is printed.
    """
    model.eval()  # Set model to evaluation mode

    # Encode the prompt once and repeat it for every requested sample
    tokens = torch.tensor(enc.encode(prompt), dtype=torch.long)
    tokens = tokens.unsqueeze(0).repeat(num_return_sequences, 1)  # (B, T)
    xgen = tokens.to(device)

    # autocast wants the bare device type ("cuda"/"cpu"), not an indexed
    # device string such as "cuda:0" or a torch.device object
    device_type = torch.device(device).type

    # Dedicated generator so sampling is reproducible and independent of the global RNG
    sample_rng = torch.Generator(device=device)
    sample_rng.manual_seed(42)

    with torch.no_grad():  # No gradient tracking during generation
        while xgen.size(1) < max_length:
            with torch.autocast(device_type=device_type, dtype=torch.bfloat16):  # Mixed precision
                logits, _ = model(xgen)  # (B, T, vocab_size)

            logits = logits[:, -1, :]  # logits of the last position (B, vocab_size)
            probs = F.softmax(logits, dim=-1)

            # Top-k sampling; k cannot exceed the number of available tokens
            k = min(top_k, probs.size(-1))
            topk_probs, topk_indices = torch.topk(probs, k, dim=-1)

            # Sample a position within the top-k, then map it back to a token id
            ix = torch.multinomial(topk_probs, 1, generator=sample_rng)  # (B, 1)
            xcol = torch.gather(topk_indices, -1, ix)  # (B, 1)

            # Append the sampled token to the sequence
            xgen = torch.cat((xgen, xcol), dim=1)

    # Decode and print the generated sequences
    for i in range(num_return_sequences):
        tokens = xgen[i, :max_length].tolist()
        decoded = enc.decode(tokens)
        print(f"Generated text (sample {i}): {decoded}")

# Example usage
# Assumes `model` is the trained model from the training cell and `device` is
# the device string chosen there.

# Move model to device (if not already done)
model.to(device)
# GPT-2 tokenizer used to encode the prompt and decode the samples
enc = tiktoken.get_encoding("gpt2")
# Generate text from the model with the function's default prompt and lengths
generate_text(model, enc, device)

Generated text (sample 0): What hath you say,

A, I you to?T that and her:
If to, myUS the.

That my, I thy on, for he you and:
O a, to;And of,
To my?
And's,
H,
With with for and:
In!
 sir, for I this with?
H?
As not beIO!
Of in.


And

What with,
R
With with me shall, but
Th?
 sir:
R, is him
To
My of'd be,

Generated text (sample 1): What hath you say, this your so you,
The her, I he with! I thy the:
KING:
The,
What, the:What.First
If,
Why that
No that with:
For me your.
O is it it by I my:
This

That and it, be not,
The's a,
HowIO.Now, thisEN is for be
Now.
I the all of the;

O;
 I's that.
What.
 but have.

For for with his shall'

 for he:



In [10]:
# Inference with the pretrained GPT-2 weights.
# NOTE(review): this cell rebinds the notebook-level `device`, `model`, `enc`,
# `tokens` and `x`; cells run afterwards see the pretrained model, not the
# trained one.
num_return_sequences = 5
max_length = 30

device = "cpu"
if torch.cuda.is_available():
  device = "cuda"

# Weights from hugging face pre-trained gpt2
model = GPT.from_pretrained('gpt2')
#model = GPT(GPTConfig())
model.eval() # switch the module to evaluation mode (gradient tracking is disabled separately by torch.no_grad below)
model.to(device) # move the model's tensors to `device`

import tiktoken
enc = tiktoken.get_encoding("gpt2") # gpt2 token encoding
tokens = enc.encode("Hello, I'm a language model,")
tokens = torch.tensor(tokens, dtype=torch.long) # (8, )
tokens = tokens.unsqueeze(0).repeat(num_return_sequences, 1) # (5, 8)
x = tokens.to(device) # x is the idx that can be passed into forward to obtain logits

# seed the global RNGs: the multinomial call below uses no explicit generator
torch.manual_seed(42)
torch.cuda.manual_seed(42)

while x.size(1) < max_length:
  # x is (B, T) w/ B = 5 and T starting at 8, growing by one token per iteration
  with torch.no_grad():
    # logits of the next token
    logits, _ = model(x) # (B, T, vocab_size)

    # keep the logits of the last position: that is the token to be predicted
    logits = logits[:, -1, :] # (B, vocab_size)

    # get the probabilities (use softmax)
    probs = F.softmax(logits, dim=-1) # (B, vocab_size)

    # topk = 50 (hf default)
    topk_probs, topk_indices = torch.topk(probs, k=50, dim=-1) # (B, 50)

    # select a token from topk_probs
    ix = torch.multinomial(topk_probs, 1) #(B, 1) (randomly select one from top 50)

    # gather corresponding indices
    xcol = torch.gather(topk_indices, -1, ix) # (B, 1) map the sampled position back to its token id

    # append to the seq
    x = torch.cat((x, xcol), dim=1) # (B, T+1)  (add new token to the existing seq, autoregressive)

for i in range(num_return_sequences):
  tokens = x[i,:max_length].tolist() # get the tokens up to max_length for the batch idx
  decoded = enc.decode(tokens)
  print(">", decoded)

loading weights from pretrained gpt: gpt2


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

> Hello, I'm a language model, not a program.

So this morning I started studying for the interview in the lab. This was not
> Hello, I'm a language model, and one of the main things that bothers me when they create languages is how easy it becomes to create something that
> Hello, I'm a language model, and I wrote it off on the grounds that a language model would make me more fluent. But I'm not
> Hello, I'm a language model, I really like languages. I like languages because like, they're good. And the way we talk about languages
> Hello, I'm a language model, a language model I'm using for data modelling. All I did was test the results and then I wrote some
