In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import random

In [2]:
# Load GPT-2 model and tokenizer directly
model_id = "gpt2"  # This is the 124M parameter version
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float32,  # GPT-2 small works fine in full precision
)

# Add a proper pad token for GPT-2
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model.resize_token_embeddings(len(tokenizer))

# Move to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
print("Model loaded and moved to device:", device)

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Model loaded and moved to device: cuda


In [3]:
# Prepare the prompt
prompt = "The best way to learn programming is to"
inputs = tokenizer(prompt, return_tensors="pt").to(device)

# Generate text
with torch.no_grad():
    outputs = model.generate(
        inputs.input_ids,
        max_new_tokens=20,
        temperature=1.0,
        top_p=0.9,
        do_sample=True,
        pad_token_id=tokenizer.pad_token_id,
    )

# Decode and print the result
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)

The best way to learn programming is to apply it at your own pace. Start at 4.5 minutes, then gradually increase it, and


In [4]:
# autoregressive generation, one token at a time

# Prepare the prompt
prompt = "The best way to learn programming is to"
inputs = tokenizer(prompt, return_tensors="pt").to(device)

# Start with the input tokens
current_tokens = inputs.input_ids

# Generate tokens one by one in a loop
max_new_tokens = 10  # How many new tokens to generate
with torch.no_grad():
    for i in range(max_new_tokens):
        # Generate just 1 token
        outputs = model.generate(
            current_tokens,
            max_new_tokens=1,
            temperature=1.0,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.pad_token_id,
        )
        
        # Update current_tokens with the new token
        current_tokens = outputs
        
        # Decode and print the current state
        generated_text = tokenizer.decode(current_tokens[0], skip_special_tokens=True)
        print(f"Step {i+1}: {generated_text}")
        
        # Optional: Stop if we hit the EOS token
        if outputs[0, -1].item() == tokenizer.eos_token_id:
            break

# # Final result
# final_text = tokenizer.decode(current_tokens[0], skip_special_tokens=True)
# print(f"\nFinal: {final_text}")

Step 1: The best way to learn programming is to study
Step 2: The best way to learn programming is to study the
Step 3: The best way to learn programming is to study the basic
Step 4: The best way to learn programming is to study the basic concepts
Step 5: The best way to learn programming is to study the basic concepts,
Step 6: The best way to learn programming is to study the basic concepts, not
Step 7: The best way to learn programming is to study the basic concepts, not just
Step 8: The best way to learn programming is to study the basic concepts, not just the
Step 9: The best way to learn programming is to study the basic concepts, not just the basics
Step 10: The best way to learn programming is to study the basic concepts, not just the basics,


In [5]:
# stochasticity of generation, temperature control

In [6]:
# capabilities: in-context learning, safety, refusal

# Prepare the prompt
prompt = "Here is how to build a high-grade explosive:"
inputs = tokenizer(prompt, return_tensors="pt").to(device)

# Generate text
with torch.no_grad():
    outputs = model.generate(
        inputs.input_ids,
        max_new_tokens=100,
        temperature=1.0,
        top_p=0.9,
        do_sample=True,
        pad_token_id=tokenizer.pad_token_id,
    )

# Decode and print the result
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)

Here is how to build a high-grade explosive:

The basic process of building a high-grade explosive is to first go through a number of steps (1-4) to build a very dense, highly explosive explosive. The next step is to apply pressure to the surface. For example, if you are building a bomb in the air, apply a pressure of 5 mm at the detonator head to a layer of the earth that is already a solid block of earth (e.g., a small lake). Apply a second pressure to the


In [7]:
# restrict to a single output token, examine next token probability distribution

# Prepare the prompt
prompt = "The capital of France is"
inputs = tokenizer(prompt, return_tensors="pt").to(device)

with torch.no_grad():
    logits = model(inputs.input_ids).logits[0, -1, :]
    next_token_probs = torch.softmax(logits, dim=-1)

# Get top k most likely tokens (e.g., top 10)
top_k = 10
top_probs, top_indices = torch.topk(next_token_probs, top_k)

# Decode and display the most likely next tokens
print(f"Prompt: '{prompt}'")
print(f"\nTop {top_k} most likely next tokens:")
print("-" * 40)
for i, (prob, token_id) in enumerate(zip(top_probs, top_indices)):
    token = tokenizer.decode(token_id.item())
    print(f"{i+1:2d}. '{token}' - {prob.item():.4f} ({prob.item()*100:.2f}%)")

# Show full probability distribution statistics
print(f"\nProbability distribution stats:")
print(f"Max probability: {next_token_probs.max().item():.4f}")
print(f"Min probability: {next_token_probs.min().item():.6f}")
print(f"Entropy: {-(next_token_probs * torch.log(next_token_probs + 1e-10)).sum().item():.2f}")
print(f"Total vocab size: {len(next_token_probs)}")


Prompt: 'The capital of France is'

Top 10 most likely next tokens:
----------------------------------------
 1. ' the' - 0.0846 (8.46%)
 2. ' now' - 0.0479 (4.79%)
 3. ' a' - 0.0462 (4.62%)
 4. ' France' - 0.0324 (3.24%)
 5. ' Paris' - 0.0322 (3.22%)
 6. ' in' - 0.0266 (2.66%)
 7. ' also' - 0.0264 (2.64%)
 8. ' not' - 0.0238 (2.38%)
 9. ' home' - 0.0233 (2.33%)
10. ' still' - 0.0155 (1.55%)

Probability distribution stats:
Max probability: 0.0846
Min probability: 0.000000
Entropy: 6.00
Total vocab size: 50258


In [8]:
# examine the tokenizer: vocabulary, special tokens, example of tokenizing a string


# 1. Basic vocabulary statistics
print(f"BASIC STATS")
print(f"Vocabulary size: {tokenizer.vocab_size:,}")


BASIC STATS
Vocabulary size: 50,257


In [9]:
print(f"RANDOM VOCABULARY SAMPLES")
print("Token ID | Token | Decoded")
print("-" * 35)
for _ in range(15):
    random_id = random.randint(0, tokenizer.vocab_size - 1)
    token = tokenizer.convert_ids_to_tokens(random_id)
    decoded = tokenizer.decode(random_id)
    # print(f"{random_id:8d} | {token:8s} | '{decoded}'")
    print(f"{random_id:8d} | '{decoded}'")

RANDOM VOCABULARY SAMPLES
Token ID | Token | Decoded
-----------------------------------
   42953 | ' TRUMP'
   37815 | 'CONT'
   30249 | ' Cunning'
   28999 | ' classmates'
    4363 | 'resp'
    2728 | ' cause'
   16144 | ' alias'
   46795 | ' gib'
   33487 | 'erville'
    6658 | ' vac'
    7906 | ' spin'
    8026 | ' Stone'
   34881 | ' CNBC'
   23114 | 'Display'
   30577 | ' strut'


In [10]:
# encode a string into tokens
strng = "Hello, world! This is a test string."
encoded = tokenizer.encode(strng)
print(f"\nEncoded tokens for '{strng}':")
print(encoded)
# print what each token ID corresponds to
decoded_tokens = [tokenizer.decode([token_id]) for token_id in encoded]
print("Decoded tokens:")
for token_id, decoded in zip(encoded, decoded_tokens):
    print(f"Token ID {token_id}: '{decoded}'")


Encoded tokens for 'Hello, world! This is a test string.':
[15496, 11, 995, 0, 770, 318, 257, 1332, 4731, 13]
Decoded tokens:
Token ID 15496: 'Hello'
Token ID 11: ','
Token ID 995: ' world'
Token ID 0: '!'
Token ID 770: ' This'
Token ID 318: ' is'
Token ID 257: ' a'
Token ID 1332: ' test'
Token ID 4731: ' string'
Token ID 13: '.'


In [11]:
# examine the model architecture: overall embedding/unembedding, blocks, attention, MLP 

model


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50258, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50258, bias=False)
)

In [12]:
model.transformer.wte.weight  # Word token embeddings

Parameter containing:
tensor([[-0.1101, -0.0393,  0.0331,  ..., -0.1364,  0.0151,  0.0453],
        [ 0.0403, -0.0486,  0.0462,  ...,  0.0861,  0.0025,  0.0432],
        [-0.1275,  0.0479,  0.1841,  ...,  0.0899, -0.1297, -0.0879],
        ...,
        [ 0.1860,  0.0167,  0.0461,  ..., -0.0963,  0.0785, -0.0225],
        [ 0.0514, -0.0277,  0.0499,  ...,  0.0070,  0.1552,  0.1207],
        [-0.0026, -0.0585,  0.1174,  ...,  0.0236,  0.0039,  0.0344]],
       device='cuda:0', requires_grad=True)

In [13]:
model.transformer.h[0].attn.c_attn # Q, K, V in self-attention, all together in one

Conv1D(nf=2304, nx=768)