In [2]:
import matplotlib.pyplot as plt 
import numpy as np
import time
import torch 
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm 

In [3]:
# Load the tokenizer and the causal language model from a local directory.
# Both are module-level objects that the later cells use directly.
model_name = "./models/gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

In [12]:
# Single prompt used to demonstrate token-by-token generation.
# Tokenize it into PyTorch tensors ("pt") so it can be fed to the model.
prompt = "Why people learn programming?"
inputs = tokenizer(prompt, return_tensors="pt")

def generate_token_with_past(inputs):
    """Run one forward pass and greedily pick the next token.

    Args:
        inputs: Mapping of keyword arguments forwarded to ``model(**inputs)``.

    Returns:
        A tuple ``(next_token_id, past_key_values)``: the index of the
        highest logit at the last position of the first sequence in the
        batch, and the ``past_key_values`` attribute of the model output.
    """
    # Gradients are not needed for inference.
    with torch.no_grad():
        model_out = model(**inputs)

    # Only the first sequence (index 0) and its final position are used.
    final_step_logits = model_out.logits[0, -1, :]
    return torch.argmax(final_step_logits), model_out.past_key_values

def generate(inputs, max_tokens):
    """Greedily generate ``max_tokens`` tokens, one forward pass per token.

    After the first step, only the newly chosen token is fed back as
    ``input_ids``; the ``past_key_values`` returned by the previous step and
    an attention mask extended by one position are passed alongside it.

    Args:
        inputs: Mapping with at least ``"input_ids"`` and
            ``"attention_mask"``. A single sequence (batch size 1) is
            assumed, because the next token is reshaped to ``(1, 1)``.
        max_tokens: Number of generation steps to run.

    Returns:
        The decoded generated tokens joined into one string (the prompt
        itself is not included).
    """
    generated_tokens = []
    next_inputs = inputs
    for _ in range(max_tokens):
        next_token_id, past_key_values = generate_token_with_past(next_inputs)

        attention_mask = next_inputs["attention_mask"]
        next_inputs = {
            "input_ids": next_token_id.reshape((1, 1)),
            # new_ones inherits dtype and device from the existing mask, so
            # the concatenation also works when the inputs are not on the CPU.
            "attention_mask": torch.cat(
                [attention_mask, attention_mask.new_ones((1, 1))],
                dim=1,
            ),
            "past_key_values": past_key_values,
        }

        next_token = tokenizer.decode(next_token_id)
        generated_tokens.append(next_token)

    return "".join(generated_tokens)
    

# Generate 10 tokens for the tokenized prompt and print the decoded text.
tokens = generate(inputs, max_tokens=10)
print(tokens)



I think that's a good question.


In [13]:
# Step 1: to process several prompts in one batch, shorter prompts must be
# padded so that every row of the input matrix has the same length.

# Reuse the EOS token (id 50256) as the padding token.
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = model.config.eos_token_id

# Pad on the left so that all newly generated tokens are appended on the right.
tokenizer.padding_side = "left"

In [14]:
# Several prompts of different lengths, to be sent to the model as one batch.

prompts = [
    "The quick brown fox jumped over the",
    "The rain in Spain falls",
    "What comes up must",
]


# padding=True pads the shorter prompts so the batch forms a rectangular tensor.
# NOTE: this rebinds `inputs`, replacing the single-prompt inputs created earlier.
inputs = tokenizer(prompts, padding=True, return_tensors="pt")

In [16]:
# Inspect the padded batch: the token ids and the matching attention mask.
for field in ("input_ids", "attention_mask"):
    print(f"{field}:", inputs[field])
    print("shape:", inputs[field].shape)

# In input_ids, 50256 is the padding token that was added.
# In attention_mask, positions marked 0 are the ones to ignore.


input_ids: tensor([[  464,  2068,  7586, 21831, 11687,   625,   262],
        [50256, 50256,   464,  6290,   287,  8602,  8953],
        [50256, 50256, 50256,  2061,  2058,   510,  1276]])
shape: torch.Size([3, 7])
attention_mask: tensor([[1, 1, 1, 1, 1, 1, 1],
        [0, 0, 1, 1, 1, 1, 1],
        [0, 0, 0, 1, 1, 1, 1]])
shape: torch.Size([3, 7])


In [17]:
# Position ids: the ordinal position of each real token within its own
# prompt, starting at 0. The running sum of the attention mask minus one
# gives that index; padding slots are then overwritten with 1.

attention_mask = inputs["attention_mask"]
position_ids = (attention_mask.long().cumsum(dim=-1) - 1).masked_fill(
    attention_mask == 0, 1
)
position_ids

tensor([[0, 1, 2, 3, 4, 5, 6],
        [1, 1, 0, 1, 2, 3, 4],
        [1, 1, 1, 0, 1, 2, 3]])

In [19]:
# Forward pass over the whole padded batch, passing the explicit position_ids.
with torch.no_grad():
    outputs = model(**inputs, position_ids=position_ids)

# Greedy choice per prompt: take the logits at the last position of every row
# and pick the highest-scoring token id.
logits = outputs.logits
last_logits = logits[:, -1, :]
next_token_ids = torch.argmax(last_logits, dim=1)

print(next_token_ids)

# Decode each chosen id back to text; the bare expression is the cell output.
next_tokens = tokenizer.batch_decode(next_token_ids)
next_tokens

tensor([13990,   319,   307])


[' fence', ' on', ' be']