# GPT-2 Completions

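Download the pretrained GPT-2 weights from Hugging Face, save them to a local directory, then reload them and run text completions with both sampling and greedy decoding.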

In [1]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from pathlib import Path

  from .autonotebook import tqdm as notebook_tqdm


## Download and Save GPT-2 Weights

In [2]:
# Local checkpoint directory; the relative path is resolved against the current working directory
weights_dir = Path("../weights/gpt2")
# parents=True creates missing ancestors; exist_ok=True makes the cell safe to re-run
weights_dir.mkdir(exist_ok=True, parents=True)

resolved_weights_dir = weights_dir.resolve()
print(f"Weights will be saved to: {resolved_weights_dir}")

Weights will be saved to: /home/zaccosenza/code/project-llm-chat/weights/gpt2


In [3]:
# Fetch the pretrained tokenizer and language model by checkpoint name
model_name = "gpt2"  # Options: gpt2, gpt2-medium, gpt2-large, gpt2-xl

print(f"Downloading {model_name}...")
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Total element count summed over every parameter tensor of the model
param_count = sum(tensor.numel() for tensor in model.parameters())
print(f"Model parameters: {param_count:,}")

Downloading gpt2...


Loading weights: 100%|██████████| 148/148 [00:00<00:00, 289.71it/s, Materializing param=transformer.wte.weight]             
GPT2LMHeadModel LOAD REPORT from: gpt2
Key                  | Status     |  | 
---------------------+------------+--+-
h.{0...11}.attn.bias | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Model parameters: 124,439,808


In [4]:
# Write the model first, then the tokenizer, into the local weights directory
print(f"Saving to {weights_dir}...")
for artifact in (model, tokenizer):
    artifact.save_pretrained(weights_dir)

print("Saved!")
# List what ended up on disk so the reader can confirm the save
saved_files = list(weights_dir.iterdir())
print(f"Contents: {saved_files}")

Saving to ../weights/gpt2...


Writing model shards: 100%|██████████| 1/1 [-00:00<00:00, -1.40it/s]

Saved!
Contents: [PosixPath('../weights/gpt2/config.json'), PosixPath('../weights/gpt2/tokenizer_config.json'), PosixPath('../weights/gpt2/generation_config.json'), PosixPath('../weights/gpt2/model.safetensors'), PosixPath('../weights/gpt2/tokenizer.json')]





## Load Model for Inference

In [5]:
# Load from local weights
# Pick the best available accelerator: Intel XPU first, then CUDA, then CPU.
# The hasattr guard keeps this working on PyTorch builds that do not expose
# a `torch.xpu` module, where `torch.xpu.is_available()` would raise AttributeError.
if hasattr(torch, "xpu") and torch.xpu.is_available():
    device = torch.device("xpu")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Reload from the directory written by the save step (rebinds `model` / `tokenizer`)
model = GPT2LMHeadModel.from_pretrained(weights_dir).to(device)
tokenizer = GPT2Tokenizer.from_pretrained(weights_dir)

# Set pad token (GPT-2 doesn't have one by default)
tokenizer.pad_token = tokenizer.eos_token
# Put the model in evaluation mode; as the cell's last expression this also
# displays the returned module tree.
model.eval()

Using device: xpu


Loading weights: 100%|██████████| 148/148 [00:00<00:00, 480.40it/s, Materializing param=transformer.wte.weight]             


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

## Text Completion Function

In [6]:
def generate_completion(
    prompt: str,
    max_new_tokens: int = 50,
    temperature: float = 0.7,
    top_p: float = 0.9,
    top_k: int = 50,
    num_return_sequences: int = 1,
    seed: "int | None" = None,
) -> list[str]:
    """Generate sampled text completions for a given prompt.

    Uses the notebook-level ``tokenizer``, ``model`` and ``device``.

    Args:
        prompt: Text to continue.
        max_new_tokens: Upper bound on the number of tokens generated after the prompt.
        temperature: Sampling temperature forwarded to ``model.generate``.
        top_p: Nucleus-sampling threshold forwarded to ``model.generate``.
        top_k: Top-k sampling cutoff forwarded to ``model.generate``.
        num_return_sequences: How many independent samples to draw.
        seed: Optional RNG seed. When given, ``torch.manual_seed(seed)`` is called
            before sampling so the same call yields the same text. Note that this
            reseeds PyTorch's global generator. ``None`` (the default) leaves the
            RNG state untouched.

    Returns:
        One decoded string per returned sequence. Each string is the decoded
        full output sequence (prompt followed by the continuation), with
        special tokens stripped.
    """
    if seed is not None:
        # Make the stochastic sampling below reproducible on request.
        torch.manual_seed(seed)

    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            num_return_sequences=num_return_sequences,
            do_sample=True,
            # GPT-2 has no dedicated pad token, so the EOS id is used for padding.
            pad_token_id=tokenizer.eos_token_id,
        )

    return [
        tokenizer.decode(sequence, skip_special_tokens=True)
        for sequence in outputs
    ]

## Run Completions

In [7]:
# Test prompts: short open-ended sentence starters that the sampling loop
# below passes to generate_completion one at a time.
prompts = [
    "The capital of France is",
    "In a shocking turn of events, scientists discovered that",
    "The best way to learn programming is",
    "Once upon a time, in a land far away,",
    "The meaning of life is",
]

In [8]:
# Sample completions for every test prompt and print each under a banner
banner = "=" * 60
for prompt in prompts:
    print(f"\n{banner}")
    print(f"Prompt: {prompt}")
    print(f"{banner}")

    sampled_texts = generate_completion(prompt, max_new_tokens=50, temperature=0.8)

    # Number the completions from 1 for display
    for number, text in enumerate(sampled_texts, start=1):
        print(f"\nCompletion {number}:")
        print(text)


Prompt: The capital of France is

Completion 1:
The capital of France is the most populous city in Europe and is home to more than 4,000,000 inhabitants. It is home to the oldest and largest military base in the world, the French Air Force. The capital has also been the site of the biggest concentration of

Prompt: In a shocking turn of events, scientists discovered that

Completion 1:
In a shocking turn of events, scientists discovered that the sun has been changing its orbit around the planet for two years.

The results, published online today in the journal Nature Geoscience, also reveal that the planet has been moving toward its closest approach to the sun. The planet is currently

Prompt: The best way to learn programming is

Completion 1:
The best way to learn programming is to be familiar with the programming language and how to use it.

To start, get started. If you're familiar with programming, then get a programming degree and start learning programming. This way you'll be a

## Greedy Decoding (Deterministic, Sampling Disabled)

In [9]:
def generate_greedy(prompt: str, max_new_tokens: int = 50) -> str:
    """Continue ``prompt`` with greedy decoding (deterministic, no sampling).

    Uses the notebook-level ``tokenizer``, ``model`` and ``device``.

    Args:
        prompt: Text to continue.
        max_new_tokens: Upper bound on the number of tokens generated after the prompt.

    Returns:
        The first output sequence decoded to text with special tokens stripped.
    """
    encoded = tokenizer(prompt, return_tensors="pt").to(device)

    generation_kwargs = dict(
        max_new_tokens=max_new_tokens,
        do_sample=False,  # sampling off -> deterministic decoding
        pad_token_id=tokenizer.eos_token_id,
    )
    with torch.no_grad():
        sequences = model.generate(**encoded, **generation_kwargs)

    first_sequence = sequences[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

In [10]:
# Run greedy decoding on a few short prompts
test_prompts = [
    "The capital of France is",
    "2 + 2 =",
    "The largest planet in our solar system is",
]

for prompt in test_prompts:
    print(f"\nPrompt: {prompt}")
    greedy_text = generate_greedy(prompt, max_new_tokens=30)
    print(f"Output: {greedy_text}")


Prompt: The capital of France is
Output: The capital of France is the capital of the French Republic, and the capital of the French Republic is the capital of the French Republic.

The French Republic is the capital

Prompt: 2 + 2 =
Output: 2 + 2 = 3.5 + 3.5 + 3.5 + 3.5 + 3.5 + 3.5 + 3.5 + 3.

Prompt: The largest planet in our solar system is
Output: The largest planet in our solar system is about 1.5 billion light years away.

The planet is about 1.5 billion light years from Earth.

The planet is about


## Interactive Completion

In [11]:
# Edit `custom_prompt` and re-run this cell to try your own text
custom_prompt = "The future of artificial intelligence is"

print(f"Prompt: {custom_prompt}\n")
print("Completion:")
custom_completions = generate_completion(
    custom_prompt,
    max_new_tokens=100,
    temperature=0.7,
)
# Only one sequence is requested by default, so show the first entry
print(custom_completions[0])

Prompt: The future of artificial intelligence is

Completion:
The future of artificial intelligence is bright. As we learn more about the future of AI, it's likely that we'll find that AI is not only smarter, but also smarter than it was in the past.

The Future of Artificial Intelligence

AI is changing how we think about life, how we interact with our environment, and how we communicate with other people.

While we might not see it all, there are some things we can learn from the past that are still relevant today.

The first thing
