In [69]:
import numpy as np
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import time

*This cell is only necessary when using Python environments with Jupyter notebooks. It reloads the imported module so that changes made to it are reflected without having to restart the kernel.*


In [168]:
import importlib
import inference
# Reload the local `inference` module so edits made to its source are picked up
# by this running kernel.
importlib.reload(inference)


# Re-import the names after the reload so they are bound to the updated functions from `inference`.
from inference import generate_token_w_caching, get_top_k, generate, generate_no_caching, generate_one_sequence

In [135]:
import os

# Location of the GPT-2 checkpoint. The default is the original local directory, so
# existing behaviour is unchanged; set the GPT2_MODEL_PATH environment variable to
# point at another local directory or model identifier on a different machine.
model_name = os.environ.get("GPT2_MODEL_PATH", "/Users/amitej/amitejmehta/models/gpt2")

# Tokenizer and model are loaded from the same location.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

In [51]:
# A batch containing a single prompt (kept as a list, so `prompt[0]` is the text).
prompt = ["He was the boss of Chicago in the 20s and his name was Al"]

# Tokenize into PyTorch tensors.
# NOTE(review): the name `input` shadows the Python builtin; it is kept because
# later cells refer to it.
input = tokenizer(
    prompt,
    return_tensors="pt",
)

# Display the encoded batch.
input

{'input_ids': tensor([[1544,  373,  262, 6478,  286, 4842,  287,  262, 1160,   82,  290,  465,
         1438,  373,  978]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [61]:
# Predict the next token for the single prompt. The second return value is kept
# under its original name but is not used in this cell.
next_token_id, past_key_values = generate_token_w_caching(input, model)
next_token = tokenizer.decode(next_token_id)

# `prompt` is a one-element list; index it so the text is printed rather than
# the list repr (brackets and quotes).
print(f'{prompt[0]}{next_token}')

He was the boss of Chicago in the 20s and his name was Al Gore


In [17]:
# Number of candidate tokens to list. Used for both the call and the printed
# label so the two cannot drift apart.
num_candidates = 10

top_k = get_top_k(input, model, tokenizer, k=num_candidates)
print(f"Top {num_candidates} most likely tokens: {top_k}")

Top 10 most likely tokens: [' Gore', ' Cap', '.', ' Pac', ' Shar', 'on', ' Smith', 'ton', '-', ' Franken']


As we can see, our model believes the most likely token to follow Al is Gore, despite the context, which suggests we are talking about Al Capone. While this error is in large part due to GPT-2's shortcomings, we do see that the second most likely token is ' Cap', the first of two tokens that make up 'Capone'. We'll explore later how different sampling techniques can improve the response.

In [74]:
# Number of tokens to generate; also reused by the cached-generation cell below.
max_tokens = 50

# Time generation without caching. perf_counter() is a monotonic, high-resolution
# clock intended for measuring intervals, unlike the wall-clock time.time().
t0 = time.perf_counter()
generated_tokens = generate_no_caching(input, model, tokenizer, max_tokens)
t1 = time.perf_counter() - t0  # elapsed seconds

# `prompt` is a one-element list; index it so the text is printed rather than
# the list repr.
print(f'{prompt[0]}{"".join(generated_tokens)}')
print(f'Generated {max_tokens} tokens in {t1}s')


He was the boss of Chicago in the 20s and his name was Al Gore. He was the president of the United States. He was the president of the United States. He was the president of the United States. He was the president of the United States. He was the president of the United States. He was the
Generated 50 tokens in 1.897346019744873s


In [75]:
# Time generation with `generate_one_sequence`, using the same `max_tokens` as the
# uncached run so the two timings are comparable. perf_counter() is a monotonic,
# high-resolution clock intended for measuring intervals, unlike time.time().
t0 = time.perf_counter()
generated_tokens = generate_one_sequence(input, model, tokenizer, max_tokens)
t1 = time.perf_counter() - t0  # elapsed seconds

# `prompt` is a one-element list; index it so the text is printed rather than
# the list repr.
print(f'{prompt[0]}{"".join(generated_tokens)}')
print(f'Generated {max_tokens} tokens in {t1}s')

He was the boss of Chicago in the 20s and his name was Al Gore. He was the president of the United States. He was the president of the United States. He was the president of the United States. He was the president of the United States. He was the president of the United States. He was the
Generated 50 tokens in 0.9406602382659912s


For 50 tokens, generation without caching takes roughly twice as long as generation with caching. The longer the output, the more time caching saves us.

In [161]:
# Use the end-of-sequence token as the padding token, for both the tokenizer
# and the model config, so prompts of different lengths can be batched.
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = model.config.eos_token_id

# Pad (and truncate) on the left so every row ends with its last real prompt token.
tokenizer.padding_side = "left"
tokenizer.truncation_side = "left"

# Three prompts of different lengths, to exercise batched generation.
prompts = [
    "He was the boss of Chicago in the 20s and his name was Al",
    "In Game of Thrones, Ned Stark had five children. From oldest to youngest, their names were ",
    'def generate_token(inputs, model, sampling="greedy"): ',
]

# Tokenize the batch, padding to the longest prompt, and display the result.
inputs = tokenizer(prompts, padding=True, return_tensors="pt")
inputs

{'input_ids': tensor([[50256, 50256, 50256, 50256, 50256,  1544,   373,   262,  6478,   286,
          4842,   287,   262,  1160,    82,   290,   465,  1438,   373,   978],
        [  818,  3776,   286, 20902,    11, 35754, 20956,   550,  1936,  1751,
            13,  3574, 13325,   284, 18887,    11,   511,  3891,   547,   220],
        [50256, 50256, 50256,  4299,  7716,    62, 30001,     7, 15414,    82,
            11,  2746,    11, 19232,  2625, 16694,  4716,     1,  2599,   220]]), 'attention_mask': tensor([[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [162]:
# Predict one next token per prompt in the padded batch.
next_token_ids, past_key_values = generate_token_w_caching(inputs, model)

# One token id per prompt is expected.
print(next_token_ids.shape)
next_tokens = tokenizer.batch_decode(next_token_ids)

# Pair each prompt with its predicted token. The loop variables are deliberately
# not called `prompt` / `next_token`, so this cell does not overwrite the globals
# of those names that the single-prompt cells rely on.
for prompt_text, next_token_text in zip(prompts, next_tokens):
    print(f'{prompt_text}{next_token_text}')

torch.Size([3])
He was the boss of Chicago in the 20s and his name was Al in
In Game of Thrones, Ned Stark had five children. From oldest to youngest, their names were  
def generate_token(inputs, model, sampling="greedy"):  


In [169]:
# Generate a continuation for every prompt in the batch.
generated_texts = generate(inputs, model, tokenizer, 100)

# Print each prompt followed by its continuation in red (ANSI escape \x1b[31m),
# then reset the colour (\x1b[0m). The stray, unbalanced double quote that used
# to precede the colour code has been removed. The loop variable is not called
# `prompt`, so the global `prompt` list used by earlier cells is left intact.
for prompt_text, generated in zip(prompts, generated_texts):
    print(prompt_text, f'\x1b[31m{generated}\x1b[0m\n')

He was the boss of Chicago in the 20s and his name was Al "[31m Gore. He was the president of the United States. He was the president of the United States. He was the president of the United States. He was the president of the United States. He was the president of the United States. He was the president of the United States. He was the president of the United States. He was the president of the United States. He was the president of the United States. He was the president of the United States. He was the president of the United States[0m

In Game of Thrones, Ned Stark had five children. From oldest to youngest, their names were  "[31m Darth, Dany, Dany, and Dany.
Dany was the youngest, and Dany was the oldest.
Dany was the youngest, and Dany was the oldest.
Dany was the youngest, and Dany was the oldest.
Dany was the youngest, and Dany was the oldest.
Dany was the youngest, and Dany was the oldest.
Dany was the youngest, and Dany was the oldest.
D[0m

def generate_token(inputs, mo