In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoConfig, AutoModel
import numpy as np
import matplotlib.pyplot as plt

| Model | # of Parameters | Hidden dim (head dim × # of heads) | # of blocks |
|-------|-----------------|------------------------------------|-------------|
| gpt2 | 124M | 768 (64 × 12) | 12 |
| gpt2-medium | 355M | 1024 (64 × 16) | 24 |
| gpt2-large | 774M | 1280 (64 × 20) | 36 |
| gpt2-xl | 1.56B | 1600 (64 × 25) | 48 |


In [8]:
from transformers import AutoModelForCausalLM

model_name = 'gpt2-xl'
config = AutoConfig.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the 1.5B-parameter checkpoint only once. The causal-LM wrapper
# (GPT2LMHeadModel) already contains the bare GPT2Model as `.transformer`,
# so reuse that module instead of fetching a second multi-GB copy of the
# same weights with AutoModel.from_pretrained.
model_clm = AutoModelForCausalLM.from_pretrained(model_name)
model = model_clm.transformer

In [10]:
model

GPT2Model(
  (wte): Embedding(50257, 1600)
  (wpe): Embedding(1024, 1600)
  (drop): Dropout(p=0.1, inplace=False)
  (h): ModuleList(
    (0-47): 48 x GPT2Block(
      (ln_1): LayerNorm((1600,), eps=1e-05, elementwise_affine=True)
      (attn): GPT2Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): LayerNorm((1600,), eps=1e-05, elementwise_affine=True)
      (mlp): GPT2MLP(
        (c_fc): Conv1D()
        (c_proj): Conv1D()
        (act): NewGELUActivation()
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (ln_f): LayerNorm((1600,), eps=1e-05, elementwise_affine=True)
)

In [9]:
model_clm

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1600)
    (wpe): Embedding(1024, 1600)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-47): 48 x GPT2Block(
        (ln_1): LayerNorm((1600,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1600,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1600,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1600, out_features=50257, bias=False)
)

In [4]:
tokenizer 

GPT2TokenizerFast(name_or_path='gpt2-xl', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True)

In [5]:
tokenizer.special_tokens_map

{'bos_token': '<|endoftext|>',
 'eos_token': '<|endoftext|>',
 'unk_token': '<|endoftext|>'}

In [6]:
tokenizer('My favorite color is shit')

{'input_ids': [3666, 4004, 3124, 318, 7510], 'attention_mask': [1, 1, 1, 1, 1]}

In [11]:
# Prompt used for the rest of the walkthrough.
text = "A long long time ago"
# return_tensors='pt' asks the tokenizer for PyTorch tensors rather than
# plain Python lists, so the ids can be fed straight into the model.
model_inputs = tokenizer(text, return_tensors='pt')
input_ids = model_inputs.input_ids

In [12]:
# Full forward pass through the causal-LM model (transformer body + LM head).
# NOTE(review): this runs with autograd enabled; wrap in torch.no_grad() if
# the result is only used for inspection.
output = model_clm(input_ids = input_ids)

In [14]:
output.keys()

odict_keys(['logits', 'past_key_values'])

In [16]:
output.logits.shape

torch.Size([1, 5, 50257])

In [19]:
len(output.past_key_values)

48

#### `model_clm.transformer()`: the transformer stack without the LM head

In [21]:
# Put the model in evaluation mode so the Dropout layers are disabled.
model_clm.eval()
# Run only the inner GPT2Model (no LM head); the result carries the final
# hidden states in `last_hidden_state`.
transformer_outputs = model_clm.transformer(input_ids)

In [25]:
transformer_outputs.last_hidden_state.shape

torch.Size([1, 5, 1600])

#### embeddings
- wte: word token embeddings
- wpe: word position embeddings

In [27]:
# NOTE(review): eval() was already called in an earlier cell; repeating it is
# harmless but redundant.
model_clm.eval()
# Project the final hidden states through the LM head to obtain one logit per
# vocabulary entry at every position.
lm_logits = model_clm.lm_head(transformer_outputs.last_hidden_state)
lm_logits.shape

torch.Size([1, 5, 50257])

#### Greedy Search

In [31]:
def greedy_search(input_ids, model, n_steps, top_x, tokenizer):
    """Greedily extend a prompt one token at a time, logging the top candidates.

    At every step the current sequence is run through ``model.transformer``,
    the hidden state of the final position is projected with ``model.lm_head``,
    the resulting logits are turned into probabilities with a softmax, and the
    most probable token is appended to the sequence.

    Args:
        input_ids: Tensor of token ids. Only row 0 is decoded and scored, and
            a single new token is appended per step, so a single-sequence
            batch of shape ``(1, seq_len)`` is expected.
        model: Object exposing ``transformer(input_ids)`` (whose result has a
            ``last_hidden_state`` attribute) and ``lm_head(hidden_states)``.
        n_steps: Number of tokens to generate.
        top_x: Number of highest-probability candidates to record per step.
            Values larger than the vocabulary size are clamped; values <= 0
            record no candidates (the best token is still appended).
        tokenizer: Object exposing ``decode`` for both a sequence of ids and a
            single id.

    Returns:
        A tuple ``(iterations, input_ids)``. ``iterations`` is a list with one
        dict per step holding the decoded ``'input'`` and the entries
        ``'choice 1'`` ... ``'choice N'`` formatted as ``'<token>(<prob>%)'``.
        ``input_ids`` is the prompt extended by the ``n_steps`` generated
        tokens; the tensor passed in is not modified.
    """
    iterations = []
    with torch.no_grad():
        for _ in range(n_steps):
            iteration = {'input': tokenizer.decode(input_ids[0])}

            hidden_states = model.transformer(input_ids).last_hidden_state
            # Only the final position predicts the next token, so apply the
            # vocabulary-sized LM head to that single position instead of to
            # every position in the sequence.
            last_token_logits = model.lm_head(hidden_states[:1, -1:, :])[0, -1, :]
            last_token_probs = torch.softmax(last_token_logits, dim=-1)

            # top-k avoids sorting the whole vocabulary. Always fetch at least
            # one entry because the best token is appended below.
            vocab_size = last_token_probs.shape[-1]
            n_choices = max(0, min(top_x, vocab_size))
            top_probs, top_ids = torch.topk(last_token_probs, max(n_choices, 1))

            # Convert once to Python scalars rather than indexing tensors
            # element by element inside the loop.
            candidates = zip(top_ids[:n_choices].tolist(), top_probs[:n_choices].tolist())
            for choice_idx, (token_id, token_prob) in enumerate(candidates):
                token_choice = f'{tokenizer.decode(token_id)}({100 * token_prob:.2f}%)'
                iteration[f'choice {choice_idx + 1}'] = token_choice

            # Greedy step: append the single most probable token as a (1, 1) tensor.
            next_token = top_ids[:1].unsqueeze(0)
            input_ids = torch.cat([input_ids, next_token], dim=-1)

            iterations.append(iteration)
    return iterations, input_ids

In [32]:
# Generate 10 tokens greedily from the prompt, recording the 5 most probable
# candidates at every step.
iterations, output_ids = greedy_search(input_ids, model_clm, 10, 5, tokenizer)

In [34]:
# NOTE(review): this import sits mid-notebook; consider moving it to the
# import cell at the top so dependencies are visible in one place.
import pandas as pd

# One row per generation step: the decoded input so far plus the ranked
# candidate tokens with their probabilities.
pd.DataFrame(iterations)

Unnamed: 0,input,choice 1,choice 2,choice 3,choice 4,choice 5
0,A long long time ago,",(47.04%)",in(13.84%),I(5.00%),((3.36%),...(2.97%)
1,"A long long time ago,",in(23.47%),I(9.72%),there(7.42%),the(5.44%),a(5.23%)
2,"A long long time ago, in",a(80.26%),the(6.61%),an(4.45%),another(0.85%),my(0.24%)
3,"A long long time ago, in a",galaxy(50.88%),land(11.56%),far(2.65%),place(2.60%),kingdom(2.51%)
4,"A long long time ago, in a galaxy",far(90.35%),not(5.81%),very(0.62%),that(0.35%),much(0.28%)
5,"A long long time ago, in a galaxy far",",(88.23%)",far(8.82%),away(2.12%),distant(0.14%),",(0.03%)"
6,"A long long time ago, in a galaxy far,",far(99.57%),distant(0.05%),Far(0.05%),very(0.05%),long(0.04%)
7,"A long long time ago, in a galaxy far, far",away(97.65%),",(1.90%)",distant(0.05%),far(0.05%),Away(0.05%)
8,"A long long time ago, in a galaxy far, far away",",(26.92%)",...(19.00%),…(13.21%),.(5.36%),"…""(4.21%)"
9,"A long long time ago, in a galaxy far, far away,",there(20.43%),a(18.31%),the(8.79%),in(5.54%),I(3.01%)


#### Beam Search