In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoConfig, AutoModel
import numpy as np
import matplotlib.pyplot as plt

| Model | # of parameters | Hidden dim (head dim × # of heads) | # of blocks |
|-------|-----------------|------------------------------------|-------------|
| gpt2 | 124M | 768 (64 × 12) | 12 |
| gpt2-medium | 355M | 1024 (64 × 16) | 24 |
| gpt2-large | 774M | 1280 (64 × 20) | 36 |
| gpt2-xl | 1.56B | 1600 (64 × 25) | 48 |


In [14]:
from transformers import AutoModelForCausalLM

# Checkpoint identifier shared by the config, tokenizer and both model variants below.
# NOTE(review): the saved outputs further down were produced with other checkpoints
# (hidden size 1600 / "gpt2-large" / "gpt2-xl" appear in them); restart the kernel and
# run all cells after changing `model_name` so code and outputs agree.
model_name = 'gpt2'
config = AutoConfig.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Bare transformer stack (displayed below as GPT2Model, ending at ln_f).
model = AutoModel.from_pretrained(model_name)
# Same stack wrapped with an `lm_head` projection (displayed below as GPT2LMHeadModel).
model_clm = AutoModelForCausalLM.from_pretrained(model_name)

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [10]:
model

GPT2Model(
  (wte): Embedding(50257, 1600)
  (wpe): Embedding(1024, 1600)
  (drop): Dropout(p=0.1, inplace=False)
  (h): ModuleList(
    (0-47): 48 x GPT2Block(
      (ln_1): LayerNorm((1600,), eps=1e-05, elementwise_affine=True)
      (attn): GPT2Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): LayerNorm((1600,), eps=1e-05, elementwise_affine=True)
      (mlp): GPT2MLP(
        (c_fc): Conv1D()
        (c_proj): Conv1D()
        (act): NewGELUActivation()
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (ln_f): LayerNorm((1600,), eps=1e-05, elementwise_affine=True)
)

In [3]:
config

GPT2Config {
  "_name_or_path": "gpt2-large",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 1280,
  "n_head": 20,
  "n_inner": null,
  "n_layer": 36,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.31.0",
  "use_cache": true,
  "vocab_size": 50257
}

In [4]:
model_clm

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1280)
    (wpe): Embedding(1024, 1280)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-35): 36 x GPT2Block(
        (ln_1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1280, out_features=50257, bias=False)
)

In [4]:
tokenizer 

GPT2TokenizerFast(name_or_path='gpt2-xl', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True)

In [5]:
tokenizer.special_tokens_map

{'bos_token': '<|endoftext|>',
 'eos_token': '<|endoftext|>',
 'unk_token': '<|endoftext|>'}

In [6]:
tokenizer('My favorite color is shit')

{'input_ids': [3666, 4004, 3124, 318, 7510], 'attention_mask': [1, 1, 1, 1, 1]}

In [15]:
# Prompt used for the forward passes and the greedy-search demo in the cells below.
text = "A long long time ago"
# return_tensors='pt' requests PyTorch tensors instead of plain Python lists.
model_inputs = tokenizer(text, return_tensors='pt')
# Token ids of the prompt; the saved logits shape below implies (1, 5) — TODO confirm on re-run.
input_ids = model_inputs.input_ids

In [12]:
# Full forward pass through the causal-LM wrapper; the result exposes `logits`
# and `past_key_values` (see the keys listed in the next cell).
output = model_clm(input_ids = input_ids)

In [14]:
output.keys()

odict_keys(['logits', 'past_key_values'])

In [16]:
output.logits.shape

torch.Size([1, 5, 50257])

In [19]:
len(output.past_key_values)

48

#### `model_clm.transformer`: the GPT2Model body, without the LM head

In [21]:
# Switch to inference mode so the Dropout layers shown in the model repr are inactive.
model_clm.eval()
# Run only the inner GPT2Model (no lm_head); the result carries `last_hidden_state`.
transformer_outputs = model_clm.transformer(input_ids)

In [25]:
transformer_outputs.last_hidden_state.shape

torch.Size([1, 5, 1600])

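#### Embeddings and LM head
- wte: word token embeddings (vocabulary size × hidden dim)
- wpe: word position embeddings (max positions × hidden dim)
- lm_head: linear projection from hidden states to vocabulary logits, applied in the next cell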

In [27]:
# Repeated eval() call: harmless, it keeps this cell safe to run on its own.
model_clm.eval()
# Project the transformer's hidden states to vocabulary logits with the LM head.
lm_logits = model_clm.lm_head(transformer_outputs.last_hidden_state)
# The saved output shows the same shape as `output.logits` from the full forward pass.
lm_logits.shape

torch.Size([1, 5, 50257])

#### Greedy Search

In [16]:
def greedy_search(input_ids, model, n_steps, top_x, tokenizer):
    """Greedily extend a prompt one token at a time, logging the top candidates.

    At every step the model scores the next token, the ``top_x`` most probable
    candidates are recorded for inspection, and the single most probable token
    is appended to the sequence.

    Args:
        input_ids: Tensor of token ids with a leading batch dimension. Only the
            first sequence (``input_ids[0]``) is decoded and scored, so a batch
            size of 1 is expected.
        model: Module exposing ``transformer`` (whose output has a
            ``last_hidden_state`` attribute) and ``lm_head``.
        n_steps: Number of tokens to generate.
        top_x: Number of candidates to record per step. Values larger than the
            vocabulary size are clamped; values <= 0 record no candidates.
        tokenizer: Object with a ``decode`` method that accepts a tensor of ids.

    Returns:
        A tuple ``(iterations, input_ids)``. ``iterations`` is a list with one
        dict per step holding the decoded ``'input'`` text and the keys
        ``'choice 1'`` ... ``'choice k'`` formatted as ``"<token>(<prob>%)"``.
        ``input_ids`` is the prompt extended by the ``n_steps`` generated tokens.
    """
    iterations = []

    # Greedy decoding must be deterministic: disable dropout for the duration of
    # the search and restore the caller's train/eval mode afterwards.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for _ in range(n_steps):
                iteration = {'input': tokenizer.decode(input_ids[0])}

                hidden_states = model.transformer(input_ids).last_hidden_state
                # Only the last position predicts the next token, so project just
                # that position instead of the whole sequence.
                last_token_logits = model.lm_head(hidden_states[:, -1, :])[0]
                last_token_probs = torch.softmax(last_token_logits, dim=-1)

                # Clamp to the vocabulary size so an oversized top_x cannot index
                # out of range; always fetch at least one id for the greedy pick.
                n_shown = max(0, min(top_x, last_token_probs.shape[-1]))
                top_probs, top_ids = torch.topk(last_token_probs, max(1, n_shown))

                for choice_idx in range(n_shown):
                    token_id = top_ids[choice_idx]
                    token_pct = (100 * top_probs[choice_idx]).item()
                    iteration[f'choice {choice_idx + 1}'] = (
                        f'{tokenizer.decode(token_id)}({token_pct:.2f}%)'
                    )

                # Append the most probable token as a (1, 1) tensor.
                input_ids = torch.cat([input_ids, top_ids[:1].unsqueeze(0)], dim=-1)

                iterations.append(iteration)
    finally:
        model.train(was_training)

    return iterations, input_ids

In [17]:
# Generate 10 tokens greedily from the prompt, logging the 5 most probable candidates per step.
iterations, output_ids = greedy_search(input_ids, model_clm, 10, 5, tokenizer)

In [18]:
# NOTE(review): this import belongs with the other imports in the first cell.
import pandas as pd

# One row per generation step: the decoded input so far plus the recorded candidates.
pd.DataFrame(iterations)

Unnamed: 0,input,choice 1,choice 2,choice 3,choice 4,choice 5
0,A long long time ago,",(49.28%)",.(10.40%),I(4.62%),the(2.99%),when(2.35%)
1,"A long long time ago,",the(10.04%),I(9.30%),when(6.91%),there(3.95%),a(3.42%)
2,"A long long time ago, the",world(2.49%),only(1.22%),first(1.21%),people(0.99%),United(0.61%)
3,"A long long time ago, the world",was(27.99%),'s(8.40%),had(6.56%),of(6.24%),would(2.91%)
4,"A long long time ago, the world was",a(8.24%),divided(4.39%),ruled(3.90%),in(3.16%),full(2.84%)
5,"A long long time ago, the world was a",land(5.41%),place(5.12%),very(2.71%),little(2.39%),different(1.85%)
6,"A long long time ago, the world was a land",of(81.89%),where(5.60%),that(0.88%),filled(0.84%),full(0.83%)
7,"A long long time ago, the world was a land of",great(1.75%),the(1.32%),chaos(1.21%),peace(1.13%),giants(1.10%)
8,"A long long time ago, the world was a land of ...",wealth(10.26%),beauty(5.06%),riches(3.67%),abundance(3.22%),power(2.33%)
9,"A long long time ago, the world was a land of ...",and(54.78%),",(24.02%)",.(12.60%),;(1.38%),but(0.89%)


#### Beam Search