In [2]:
import torch
from datasets import load_dataset
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

# Use the GPU when one is present, otherwise fall back to the CPU so that the
# notebook still runs (slowly) on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
model_id = "openai-community/gpt2-large"
model = GPT2LMHeadModel.from_pretrained(model_id).to(device)
tokenizer = GPT2TokenizerFast.from_pretrained(model_id)

test = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
# The whole split is joined and tokenized as ONE long sequence. The tokenizer
# warns that it exceeds the model's maximum length; that is acceptable only as
# long as the sequence is cut into windows before being fed to the model.
encodings = tokenizer("\n\n".join(test["text"]), return_tensors="pt")

print("done")

Token indices sequence length is longer than the specified maximum sequence length for this model (287644 > 1024). Running this sequence through the model will result in indexing errors


done


In [5]:

# Print basic information about the test split and its first few rows
print("Test dataset info:")
print(test.info)
print("\nFirst few lines of test dataset:")
# Slice once instead of evaluating test["text"] on every iteration; slicing
# also tolerates a split that has fewer than five rows.
for line in test[:5]["text"]:
    print(line)

# Print summary information about the encodings
print("\nEncodings info:")
print(encodings.keys())
print(encodings["input_ids"].shape)
print(encodings["attention_mask"].shape)
# Show the overall structure of the encodings object
print(encodings)






Test dataset info:
DatasetInfo(description='', citation='', homepage='', license='', features={'text': Value(dtype='string', id=None)}, post_processed=None, supervised_keys=None, task_templates=None, builder_name='parquet', dataset_name='wikitext', config_name='wikitext-2-raw-v1', version=0.0.0, splits={'test': SplitInfo(name='test', num_bytes=1305088, num_examples=4358, shard_lengths=None, dataset_name='wikitext'), 'train': SplitInfo(name='train', num_bytes=11061717, num_examples=36718, shard_lengths=None, dataset_name='wikitext'), 'validation': SplitInfo(name='validation', num_bytes=1159288, num_examples=3760, shard_lengths=None, dataset_name='wikitext')}, download_checksums={'hf://datasets/wikitext@b08601e04326c79dfdd32d625aee71d232d685c3/wikitext-2-raw-v1/test-00000-of-00001.parquet': {'num_bytes': 732610, 'checksum': None}, 'hf://datasets/wikitext@b08601e04326c79dfdd32d625aee71d232d685c3/wikitext-2-raw-v1/train-00000-of-00001.parquet': {'num_bytes': 6357543, 'checksum': None}, 'hf

In [7]:
# Total number of tokens in the encoded text (second dimension of input_ids).
seq_len = encodings["input_ids"].shape[1]
print("seq_len", seq_len)

seq_len 287644


In [8]:

import torch
from tqdm import tqdm

# Sliding-window perplexity: the token sequence is scored in overlapping
# windows of at most `max_length` tokens, advancing by `stride` each step.
# Only the tokens not already scored by the previous window contribute to the
# loss; the earlier tokens of each window serve purely as context.
max_length = model.config.n_positions
stride = 512

nlls = []      # per-window mean negative log-likelihood (kept for inspection)
nll_sum = 0.0  # sum of negative log-likelihoods over all scored tokens
n_tokens = 0   # number of tokens that contributed to nll_sum
prev_end_loc = 0
for begin_loc in tqdm(range(0, seq_len, stride)):
    end_loc = min(begin_loc + max_length, seq_len)
    trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
    input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
    target_ids = input_ids.clone()
    # Mask everything except the last trg_len positions so they are ignored by the loss.
    target_ids[:, :-trg_len] = -100

    with torch.no_grad():
        outputs = model(input_ids, labels=target_ids)

        # loss is calculated using CrossEntropyLoss which averages over valid labels
        # N.B. the model internally shifts the labels to the left by 1, so the
        # label at position 0 of the window is never used.
        neg_log_likelihood = outputs.loss

    # Number of labels that actually entered the averaged loss: the unmasked
    # labels that survive the internal shift (i.e. excluding position 0).
    num_loss_tokens = (target_ids[:, 1:] != -100).sum().item()

    nlls.append(neg_log_likelihood)
    # Windows score different numbers of tokens, so weight each window's mean
    # loss by its token count instead of averaging the window means directly.
    nll_sum += neg_log_likelihood * num_loss_tokens
    n_tokens += num_loss_tokens

    prev_end_loc = end_loc
    if end_loc == seq_len:
        break

# Perplexity is the exponential of the average per-token negative log-likelihood.
ppl = torch.exp(nll_sum / n_tokens)


# Inspect the collected per-window losses (without shadowing the builtin `list`).
print("list的基本信息：")
print("元素个数:", len(nlls))
print("元素类型:", type(nlls[0]))

# Print the first few per-window losses
print("\nlist的前几个元素：")
for nll in nlls[:3]:
    print(nll)

100%|█████████▉| 560/562 [03:12<00:00,  2.91it/s]


list的基本信息：
元素个数: 561
元素类型: <class 'torch.Tensor'>

list的前几个元素：
tensor(2.4563, device='cuda:0')
tensor(3.1668, device='cuda:0')
tensor(3.0118, device='cuda:0')
