In [1]:
from huggingface_hub import hf_hub_download
# Fetch the checkpoint file at revision '1500' from the Hugging Face Hub;
# `file` holds the value returned by hf_hub_download (a local path, per the cell output below).
file = hf_hub_download(
    repo_id='alexedw/dense-train-masked-between-tokens',
    filename='model_state.pt',
    revision='1500',
)


Downloading:   0%|          | 0.00/1.62G [00:00<?, ?B/s]

In [7]:
# Display the value returned by hf_hub_download for the checkpoint.
file

'/Users/alex/.cache/huggingface/hub/models--alexedw--pythia-410-dense-test-1/snapshots/c930775cfacb01fcf2625fbfeff752c428a9ca54/model_state.pt'

In [9]:
import sys
sys.path.append('..')
from transformers import GPTNeoXTokenizerFast
from core.model import GPT, GPTConfig, Tokenizer, DENSE_TOKEN_ID
import torch
# Project tokenizer wrapper (imported from core.model).
tokenizer = Tokenizer()

# Instantiate the GPT module using the config published for 'EleutherAI/pythia-410m'.
model = GPT(GPTConfig.from_pretrained('EleutherAI/pythia-410m'))

# Read the downloaded checkpoint onto the CPU.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a trusted source.
state_dict = torch.load(file, map_location=torch.device('cpu'))

# Rename every key that carries the '_orig_mod.' prefix to its un-prefixed form
# (presumably left behind by a torch.compile wrapper at save time -- confirm).
# Iterate over a snapshot of the keys because the dict is mutated inside the loop.
unwanted_prefix = '_orig_mod.'
for key in list(state_dict.keys()):
    if not key.startswith(unwanted_prefix):
        continue
    state_dict[key[len(unwanted_prefix):]] = state_dict.pop(key)

model.load_state_dict(state_dict)
# Drop the notebook's reference to the checkpoint dict now that the weights are loaded.
state_dict = None

number of parameters: 405.33M


In [95]:
# Encode a prompt that ends in two <|dense|> markers and wrap it as a batch of one.
prompt = 'My name is Alex Dog Dog Dog<|dense|><|dense|>'
tokens = torch.tensor([tokenizer.encode(prompt)])

# Forward pass; the model returns a (logits, dense, loss) triple. `loss` is not used below.
logits, dense, loss = model(tokens)

In [96]:
# Sample from the logits at position 5 of the single batch row (logits[0, 5], i.e. the
# prediction made after tokens[0, 5]) with a tiny temperature and top_p=0.
# NOTE(review): the original note called index 5 the "last" token/logits -- verify that
# against the length of the current prompt.
position_logits = logits[0, 5]
token = model.sample_top_p(position_logits, temperature=0.001, top_p=0.0)
token, tokenizer.decode(token)

(tensor([50277]), '<|dense|>')

In [101]:
def generate(idx, max_new_tokens, dense_input=None, temperature=0.7, top_p=0.9):
    """
    Extend the conditioning sequence `idx` by sampling `max_new_tokens` times from the
    notebook-level `model`, feeding each prediction back into the model.

    Args:
        idx: conditioning token indices, LongTensor of shape (b, t).
        max_new_tokens: number of sampling steps; one sampled column is appended per step.
        dense_input: dense inputs passed to the model together with `idx`. When None they
            are built with `model.create_dense_inputs(idx)`.
        temperature: forwarded to `model.sample_top_p`.
        top_p: forwarded to `model.sample_top_p`.

    Returns:
        `idx` with the sampled indices concatenated along dim 1 (shape
        (b, t + max_new_tokens), assuming `sample_top_p` yields one column per step).

    The whole loop runs under `torch.no_grad()`. Most likely you'll also want the model
    to be in `model.eval()` mode of operation for this.
    """
    # Inference only: no backward pass follows, so skip autograd bookkeeping instead of
    # recording a graph on every forward call.
    with torch.no_grad():
        if dense_input is None:
            dense_input = model.create_dense_inputs(idx)

        for _ in range(max_new_tokens):
            # NOTE(review): the running sequence is never cropped here; if the model has a
            # maximum context length, long generations may exceed it -- confirm.
            logits, dense_out, _loss = model(idx, dense=dense_input)
            # keep only the logits of the final position
            logits = logits[:, -1, :]
            # sample the next token from the top-p distribution (temperature applied there)
            idx_next = model.sample_top_p(logits, temperature, top_p)
            # append the sampled index to the running sequence and continue
            idx = torch.cat((idx, idx_next), dim=1)

            # rebuild the dense inputs for the extended sequence from this step's dense output
            dense_input = model.create_dense_inputs(idx, DENSE_TOKEN_ID, dense_out)

    return idx

# Start from two copies of id 50277 (decoded as '<|dense|>' in an earlier cell output) and
# pass the last two entries along dim 1 of `dense` from the forward pass as dense inputs.
seed_ids = torch.tensor([[50277, 50277]])
seed_dense = dense[:, -2:, :]
tokens_new = generate(seed_ids, max_new_tokens=15, dense_input=seed_dense, temperature=0.0001)
tokens_new, tokenizer.decode(tokens_new[0])

(tensor([[50277, 21048, 50277,   434,   367, 11345,   187,   187,   510,  4370,
            434, 44802,   310,   247,  1077,  1774]]),
 "<|dense|> Dog<|dense|>'s Paws\n\nThe dog's paw is a very important")

In [102]:
# Start from a single id 50277 and let generate() build the dense inputs itself
# (no dense_input argument is passed).
seed_ids = torch.tensor([[50277]])
tokens_new = generate(seed_ids, max_new_tokens=15, temperature=0.0001)
tokens_new, tokenizer.decode(tokens_new[0])

(tensor([[50277, 29408, 50277,    15,   187,   187,   510,   806,  2181,   309,
            858,   369,   281,   564,   281,   253]]),
 '<|dense|>cie<|dense|>.\n\nThe first thing I did was to go to the')