In [1]:
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

device = "cuda"
model_id = "openai-community/gpt2-large"
model = GPT2LMHeadModel.from_pretrained(model_id).to(device)
tokenizer = GPT2TokenizerFast.from_pretrained(model_id)




In [2]:
from datasets import load_dataset

test = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
encodings = tokenizer("\n\n".join(test["text"]), return_tensors="pt")


Token indices sequence length is longer than the specified maximum sequence length for this model (287644 > 1024). Running this sequence through the model will result in indexing errors


In [3]:
encodings.input_ids.size(1)

287644

In [20]:
encodings

{'input_ids': tensor([[ 628,  796, 5199,  ...,  220,  628,  198]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1]])}

In [22]:
model.config.n_positions

1024

In [None]:
nlls = []

In [31]:
len(nlls)

173

In [37]:
import torch
from tqdm import tqdm

max_length = model.config.n_positions
stride = 512
seq_len = encodings.input_ids.size(1)


prev_end_loc = 0
for begin_loc in tqdm(range(156160, seq_len, stride)):
    end_loc = min(begin_loc + max_length, seq_len)
    trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
    input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
    target_ids = input_ids.clone()
    target_ids[:, :-trg_len] = -100

    with torch.no_grad():
        outputs = model(input_ids, labels=target_ids)

        # loss is calculated using CrossEntropyLoss which averages over valid labels
        # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels
        # to the left by 1.
        neg_log_likelihood = outputs.loss

    nlls.append(neg_log_likelihood)

    prev_end_loc = end_loc
    if end_loc == seq_len:
        break

ppl = torch.exp(torch.stack(nlls).mean())

print(ppl)

  0%|                                                             | 0/257 [00:00<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 80.00 MiB (GPU 0; 5.79 GiB total capacity; 4.97 GiB already allocated; 91.25 MiB free; 5.20 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [39]:
ppl = torch.exp(torch.stack(nlls).mean())
print(ppl)

tensor(16.1426, device='cuda:0')


In [34]:
begin_loc

156160

In [35]:
len(range(156160, seq_len, stride))

257

In [30]:
list(range(0, seq_len, stride))

[0,
 512,
 1024,
 1536,
 2048,
 2560,
 3072,
 3584,
 4096,
 4608,
 5120,
 5632,
 6144,
 6656,
 7168,
 7680,
 8192,
 8704,
 9216,
 9728,
 10240,
 10752,
 11264,
 11776,
 12288,
 12800,
 13312,
 13824,
 14336,
 14848,
 15360,
 15872,
 16384,
 16896,
 17408,
 17920,
 18432,
 18944,
 19456,
 19968,
 20480,
 20992,
 21504,
 22016,
 22528,
 23040,
 23552,
 24064,
 24576,
 25088,
 25600,
 26112,
 26624,
 27136,
 27648,
 28160,
 28672,
 29184,
 29696,
 30208,
 30720,
 31232,
 31744,
 32256,
 32768,
 33280,
 33792,
 34304,
 34816,
 35328,
 35840,
 36352,
 36864,
 37376,
 37888,
 38400,
 38912,
 39424,
 39936,
 40448,
 40960,
 41472,
 41984,
 42496,
 43008,
 43520,
 44032,
 44544,
 45056,
 45568,
 46080,
 46592,
 47104,
 47616,
 48128,
 48640,
 49152,
 49664,
 50176,
 50688,
 51200,
 51712,
 52224,
 52736,
 53248,
 53760,
 54272,
 54784,
 55296,
 55808,
 56320,
 56832,
 57344,
 57856,
 58368,
 58880,
 59392,
 59904,
 60416,
 60928,
 61440,
 61952,
 62464,
 62976,
 63488,
 64000,
 64512,
 65024,


In [29]:
begin_loc

88576

In [26]:
encodings.input_ids.size(1) // 512

561