In [2]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from mingpt.model import GPT
from mingpt.utils import set_seed
from mingpt.bpe import BPETokenizer
import matplotlib.pyplot as plt
# Fix the seed up front, before any sampling happens in later cells.
# NOTE(review): presumably mingpt's set_seed seeds Python, NumPy and torch RNGs — confirm.
set_seed(3407)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# NOTE(review): `input` shadows the Python builtin; the name is kept because
# other cells may read it.
input = "Michelle Jones was a top-notch student. Michelle"
print("Input:", input)

# Calling the tokenizer on a string returns a 2D batch tensor of indices
# with shape (1, input_length); row 0 is the 1D token sequence.
bpe = BPETokenizer()
tokens = bpe(input)[0]
print("Tokenized input:", tokens)

input_length = tokens.shape[-1]
print("Number of input tokens:", input_length)

# decode() takes a 1D tensor (list of indices) and returns a string.
print("Detokenized input from indices:", bpe.decode(tokens))

# Decode every index on its own (wrapped in a one-element 1D tensor) to show
# where the BPE token boundaries fall.
tokens_str = [bpe.decode(torch.tensor([token])) for token in tokens]
print("Detokenized input as strings: " + '/'.join(tokens_str))


Input: Michelle Jones was a top-notch student. Michelle
Tokenized input: tensor([48736,  5437,   373,   257,  1353,    12,  1662,   354,  3710,    13,
        16738])
Number of input tokens: 11
Detokenized input from indices: Michelle Jones was a top-notch student. Michelle
Detokenized input as strings: Michelle/ Jones/ was/ a/ top/-/not/ch/ student/./ Michelle


In [31]:
# Device the model and input tensors are placed on.
device = 'cpu'
# Name of the pretrained checkpoint to load; the loading cell rebinds `model`
# to the model object itself.
model = "gpt2-xl"

In [32]:
# `model` starts out as the checkpoint name and is rebound to the loaded GPT
# object here. Guard the load so that re-running this cell does not pass the
# already-loaded model object back into from_pretrained (which expects a name)
# and does not reload the weights needlessly.
if isinstance(model, str):
    model = GPT.from_pretrained(model)
model.to(device)
# Switch to evaluation mode for generation.
model.eval()
use_minigpt = True

number of parameters: 1557.61M


In [38]:

def generate(prompt='', num_samples=10, steps=20, do_sample=True,
             output_path='output.txt', append=False):
    """Sample continuations of `prompt` from the global `model`, print and save them.

    Args:
        prompt: Text to condition on. An empty string starts every sample from
            the single special '<|endoftext|>' token (unconditional sampling).
        num_samples: Number of continuations generated together in one batch.
        steps: Number of new tokens generated per sample.
        do_sample: Forwarded to `model.generate` as `do_sample`.
        output_path: File the prompt and samples are written to.
        append: If True, add to `output_path` instead of overwriting it, so
            results of several calls can be collected in one file.

    Returns:
        None. Each sample is printed after an 80-dash separator and written to
        `output_path`, one sample per line, below a 'Prompt: ...' header.

    Relies on the notebook-level globals `model` and `device`.
    """
    # tokenize the input prompt into integer input sequence
    tokenizer = BPETokenizer()
    if prompt == '':
        # to create unconditional samples...
        # manually create a tensor with only the special <|endoftext|> token
        # similar to what openai's code does here https://github.com/openai/gpt-2/blob/master/src/generate_unconditional_samples.py
        x = torch.tensor([[tokenizer.encoder.encoder['<|endoftext|>']]], dtype=torch.long)
    else:
        x = tokenizer(prompt)

    # Move the indices to the model's device in BOTH branches (the
    # unconditional branch previously stayed on the default device), then
    # expand out the batch dim so all num_samples are processed in one batch.
    x = x.to(device).expand(num_samples, -1)

    # forward the model `steps` times to get samples, in a batch
    y = model.generate(x, max_new_tokens=steps, do_sample=do_sample, top_k=40)

    # Write outputs to file
    file_mode = 'a' if append else 'w'
    with open(output_path, file_mode, encoding='utf-8') as f:
        f.write('Prompt: ' + prompt + '\n\n')
        for i in range(num_samples):
            # y[i] is already a 1D row; no squeeze, which would collapse a
            # length-1 sequence into a 0-d tensor.
            out = tokenizer.decode(y[i].cpu())
            print('-'*80)
            print(out)
            f.write(out + '\n')
        

In [None]:
# Prompt variants: the name in the first sentence and the name that opens the
# continuation are varied independently.
inputs = [
    'Michelle Jones was a top-notch student. Michelle',
    'Michelle Smith was a top-notch student. Michelle',
    'Jessica Jones was a top-notch student. Michelle',
    'Michelle Smith was a top-notch student. Jessica',
]

# Sampling settings shared by every prompt.
sampling_kwargs = dict(num_samples=10, steps=20, do_sample=True)

# NOTE(review): generate() as defined in this notebook rewrites 'output.txt'
# on each call, so after this loop the file holds only the last prompt's
# samples. The loop variable also shadows the builtin `input`.
for input in inputs:
    generate(prompt=input, **sampling_kwargs)

--------------------------------------------------------------------------------
Michelle Jones was a top-notch student. Michelle had a top 10 average and had participated in numerous extracurricular activities. She had a "
--------------------------------------------------------------------------------
Michelle Jones was a top-notch student. Michelle's parents believed that they had everything a woman could want. She was a varsity athlete and a
--------------------------------------------------------------------------------
Michelle Jones was a top-notch student. Michelle was also a victim of domestic violence," said CPD spokeswoman Melissa Matey.

Police said
--------------------------------------------------------------------------------
Michelle Jones was a top-notch student. Michelle had won all seven Advanced Placement classes she had taken in high school. She had been a member
--------------------------------------------------------------------------------
Michelle Jones was a t