In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import random

In [2]:
# Load GPT-2 model and tokenizer directly
model_id = "gpt2"  # This is the 124M parameter version
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float32,  # GPT-2 small works fine in full precision
)

# Add a proper pad token for GPT-2
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model.resize_token_embeddings(len(tokenizer))

# Move to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
print("Model loaded and moved to device:", device)

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Model loaded and moved to device: cuda


In [3]:
# count total parameters
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
total_params = count_parameters(model)
# print human readable number of parameters
print(f"Total parameters: {total_params:,}")

Total parameters: 124,440,576


In [4]:
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50258, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50258, bias=False)
)

In [29]:
# autoregressive generation example

# Prepare the prompt
prompt = "mechan*"
inputs = tokenizer(prompt, return_tensors="pt").to(device)

# Generate text
with torch.no_grad():
    outputs = model.generate(
        inputs.input_ids,
        max_new_tokens=1,
        temperature=1.0,
        top_p=0.9,
        do_sample=True,
        pad_token_id=tokenizer.pad_token_id,
    )

# Decode and print the result
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)

mechan* (


In [19]:
# autoregressive generation, ONE TOKEN AT A TIME

# Prepare the prompt
prompt = "The best way to learn programming is to"
inputs = tokenizer(prompt, return_tensors="pt").to(device)

# Start with the input tokens
current_tokens = inputs.input_ids

# Generate tokens one by one in a loop
max_new_tokens = 20  # How many new tokens to generate
with torch.no_grad():
    for i in range(max_new_tokens):
        # Generate just 1 token
        outputs = model.generate(
            current_tokens,
            max_new_tokens=1,
            temperature=1.0,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.pad_token_id,
        )
        
        # Update current_tokens with the new token
        current_tokens = outputs
        
        # Decode and print the current state
        generated_text = tokenizer.decode(current_tokens[0], skip_special_tokens=True)
        print(f"Step {i+1}: {generated_text}")
        
        # Optional: Stop if we hit the EOS token
        if outputs[0, -1].item() == tokenizer.eos_token_id:
            break

# # Final result
# final_text = tokenizer.decode(current_tokens[0], skip_special_tokens=True)
# print(f"\nFinal: {final_text}")

Step 1: The best way to learn programming is to apply
Step 2: The best way to learn programming is to apply it
Step 3: The best way to learn programming is to apply it to
Step 4: The best way to learn programming is to apply it to a
Step 5: The best way to learn programming is to apply it to a project
Step 6: The best way to learn programming is to apply it to a project.
Step 7: The best way to learn programming is to apply it to a project. If
Step 8: The best way to learn programming is to apply it to a project. If you
Step 9: The best way to learn programming is to apply it to a project. If you have
Step 10: The best way to learn programming is to apply it to a project. If you have never
Step 11: The best way to learn programming is to apply it to a project. If you have never tried
Step 12: The best way to learn programming is to apply it to a project. If you have never tried to
Step 13: The best way to learn programming is to apply it to a project. If you have never tried to learn
S

In [None]:
# stochasticity of generation, temperature control

# autoregressive generation example

# Prepare the prompt
prompt = "The best way to learn programming is to"
inputs = tokenizer(prompt, return_tensors="pt").to(device)

# Generate text
with torch.no_grad():
    outputs = model.generate(
        inputs.input_ids,
        max_new_tokens=100,
        temperature=1.0,        # TRY CHANGING THIS
        top_p=0.9,
        do_sample=True,
        pad_token_id=tokenizer.pad_token_id,
    )

# Decode and print the result
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)

In [31]:
# capabilities: in-context learning, safety, refusal (ChatGPT)

# base models are happy to generate unsafe / toxic / harmful content:
# Prepare the prompt
prompt = "Here is how to build a high-grade explosive:"
inputs = tokenizer(prompt, return_tensors="pt").to(device)

# Generate text
with torch.no_grad():
    outputs = model.generate(
        inputs.input_ids,
        max_new_tokens=100,
        temperature=1.0,
        top_p=0.9,
        do_sample=True,
        pad_token_id=tokenizer.pad_token_id,
    )

# Decode and print the result
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)

Here is how to build a high-grade explosive:

In the above pictures, it can be seen that we can make three different high-grade explosives:

The first one is the low grade explosive. The second one is the high grade explosive. The third one is the high grade explosive. All three explosives are produced in a single process and are the same composition. So, we have 3 different high-grade explosive compositions, so we can build them together in a single process.

To see how it works, we simply need


In [5]:
# restrict to a single output token, examine next token probability distribution

# Prepare the prompt
prompt = "The name of the capital of France is"
inputs = tokenizer(prompt, return_tensors="pt").to(device)

with torch.no_grad():
    logits = model(inputs.input_ids).logits[0, -1, :]
    next_token_probs = torch.softmax(logits, dim=-1)

# Get top k most likely tokens (e.g., top 10)
top_k = 10
top_probs, top_indices = torch.topk(next_token_probs, top_k)

# Decode and display the most likely next tokens
print(f"Prompt: '{prompt}'")
print(f"\nTop {top_k} most likely next tokens:")
print("-" * 40)
for i, (prob, token_id) in enumerate(zip(top_probs, top_indices)):
    token = tokenizer.decode(token_id.item())
    print(f"{i+1:2d}. '{token}' - {prob.item():.4f} ({prob.item()*100:.2f}%)")

# Show full probability distribution statistics
print(f"\nProbability distribution stats:")
print(f"Max probability: {next_token_probs.max().item():.4f}")
print(f"Min probability: {next_token_probs.min().item():.6f}")
print(f"Entropy: {-(next_token_probs * torch.log(next_token_probs + 1e-10)).sum().item():.2f}")
print(f"Total vocab size: {len(next_token_probs)}")


Prompt: 'The name of the capital of France is'

Top 10 most likely next tokens:
----------------------------------------
 1. ' Paris' - 0.0478 (4.78%)
 2. ' Marse' - 0.0433 (4.33%)
 3. ' Saint' - 0.0308 (3.08%)
 4. ' the' - 0.0282 (2.82%)
 5. ' "' - 0.0252 (2.52%)
 6. ' France' - 0.0240 (2.40%)
 7. ' not' - 0.0214 (2.14%)
 8. ' French' - 0.0169 (1.69%)
 9. ' Le' - 0.0162 (1.62%)
10. ' '' - 0.0107 (1.07%)

Probability distribution stats:
Max probability: 0.0478
Min probability: 0.000000
Entropy: 6.62
Total vocab size: 50258


In [6]:
# examine the tokenizer: vocabulary, special tokens, example of tokenizing a string


# 1. Basic vocabulary statistics
print(f"BASIC STATS")
print(f"Vocabulary size: {tokenizer.vocab_size:,}")


BASIC STATS
Vocabulary size: 50,257


In [8]:
print(f"RANDOM VOCABULARY SAMPLES")
print("Token ID | Token | Decoded")
print("-" * 35)
for _ in range(15):
    random_id = random.randint(0, tokenizer.vocab_size - 1)
    token = tokenizer.convert_ids_to_tokens(random_id)
    decoded = tokenizer.decode(random_id)
    # print(f"{random_id:8d} | {token:8s} | '{decoded}'")
    print(f"{random_id:8d} | '{decoded}'")

RANDOM VOCABULARY SAMPLES
Token ID | Token | Decoded
-----------------------------------
   33077 | 'ethy'
    2474 | '!"'
   18887 | ' youngest'
   23168 | ' embodiment'
   19812 | ' fictional'
   30669 | ' masterpiece'
   26290 | '!),'
    4025 | ' larger'
   29556 | ' flock'
   20498 | ' EVER'
   46912 | 'Matrix'
   43503 | ' Lime'
   18289 | ' Monitor'
   35015 | ' Eisenhower'
   39355 | '�'


In [11]:
# encode a string into tokens
# strng = "Hello, world! This is a test string."
strng = "2035345345345345345345"
encoded = tokenizer.encode(strng)
print(f"\nEncoded tokens for '{strng}':")
print(encoded)
# print what each token ID corresponds to
decoded_tokens = [tokenizer.decode([token_id]) for token_id in encoded]
print("Decoded tokens:")
for token_id, decoded in zip(encoded, decoded_tokens):
    print(f"Token ID {token_id}: '{decoded}'")


Encoded tokens for '2035345345345345345345':
[1238, 2327, 27712, 27712, 27712, 27712, 27712, 27712]
Decoded tokens:
Token ID 1238: '20'
Token ID 2327: '35'
Token ID 27712: '345'
Token ID 27712: '345'
Token ID 27712: '345'
Token ID 27712: '345'
Token ID 27712: '345'
Token ID 27712: '345'


In [12]:
# examine the model architecture: overall embedding/unembedding, blocks, attention, MLP 
# Note: A detailed model implementation can be found in Karpathy's MinGPT implementation: 
# https://github.com/karpathy/minGPT/blob/master/mingpt/model.py

model


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50258, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50258, bias=False)
)

In [14]:
model.transformer.wte.weight.shape  # Word token embeddings

torch.Size([50258, 768])

In [None]:
model.transformer.h[0].attn.c_attn # Q, K, V in self-attention, all together in ones