# 使用滑动窗口进行采样

In [1]:
# Read the entire text file into memory as one string.
with open("./../the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

# Sanity check: report the total character count and preview the start of the text.
print(f"Length of dataset in characters: {len(raw_text)}")
print(raw_text[:50])  # Print the first 50 characters to check

Length of dataset in characters: 20479
I HAD always thought Jack Gisburn rather a cheap g


In [2]:
# NOTE(review): `version` is not used in any visible cell — confirm before keeping it.
from importlib.metadata import version
import tiktoken

# tiktoken's "gpt2" encoding; the cells below use it to encode and decode text.
tokenizer = tiktoken.get_encoding("gpt2")

In [3]:
# Tokenize the full text; the result is a sequence of integer token IDs.
enc_ids = tokenizer.encode(raw_text)
print(f"Length of dataset in tokens: {len(enc_ids)}")
print(enc_ids[:10])  # Print the first 10 token IDs to check

Length of dataset in tokens: 5145
[40, 367, 2885, 1464, 1807, 3619, 402, 271, 10899, 2138]


In [5]:
# Skip the first 50 tokens; the remainder is the sample used by the
# sliding-window demonstrations in the following cells.
enc_sample = enc_ids[50:]

In [7]:
# Show next-token prediction pairs as token IDs: for each i, the first i
# tokens form the context and the token at position i is the target.
context_len = 4
for i in range(1, context_len + 1):
    context = enc_sample[:i]
    target = enc_sample[i : i + 1]  # one-element slice, so it prints as a list
    print(f"Context:{context} -> Target: {target}")

Context:[290] -> Target: [4920]
Context:[290, 4920] -> Target: [2241]
Context:[290, 4920, 2241] -> Target: [287]
Context:[290, 4920, 2241, 287] -> Target: [257]


In [10]:
# Same context/target pairs as the previous cell, but decoded back to text
# so the growing context and the token to predict are human-readable.
context_len = 4
for i in range(1, context_len + 1):
    context = enc_sample[:i]
    target = enc_sample[i : i + 1]  # kept as a slice because decode() is given a sequence of IDs
    print(f"Context:{tokenizer.decode(context)} -> Target: {tokenizer.decode(target)}")

Context: and -> Target:  established
Context: and established -> Target:  himself
Context: and established himself -> Target:  in
Context: and established himself in -> Target:  a


# 创建 Dataset 和 DataLoader

In [11]:
import torch
from torch.utils.data import Dataset, DataLoader

class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-ID tensor pairs.

    The text is tokenized once. Each sample's input is a window of
    ``max_length`` token IDs, and its target is the same window shifted
    one position to the right.

    Args:
        text: Raw text handed to ``tokenizer.encode``.
        tokenizer: Object exposing ``encode(text)`` that returns a sequence
            of token IDs.
        max_length: Number of token IDs in each window.
        stride: Distance between the start positions of consecutive windows.
    """

    def __init__(self, text, tokenizer, max_length, stride):
        token_ids = tokenizer.encode(text)
        # Start positions end early enough that the shifted target window
        # never runs past the end of the token sequence.
        window_starts = range(0, len(token_ids) - max_length, stride)
        self.inputs = [
            torch.tensor(token_ids[start : start + max_length])
            for start in window_starts
        ]
        self.targets = [
            torch.tensor(token_ids[start + 1 : start + 1 + max_length])
            for start in window_starts
        ]

    def __len__(self):
        """Return the number of windows in the dataset."""
        return len(self.inputs)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` tensor pair at position ``idx``."""
        sample_input = self.inputs[idx]
        sample_target = self.targets[idx]
        return sample_input, sample_target



In [12]:
def create_dataloader(
    text,
    batch_size,
    max_length=256,
    stride=128,
    shuffle=True,
    drop_last=True,
    num_workers=0,
):
    """Build a ``DataLoader`` over sliding-window samples of ``text``.

    The text is tokenized with tiktoken's "gpt2" encoding and wrapped in a
    ``GPTDatasetV1``; the remaining arguments are forwarded to ``DataLoader``.

    Args:
        text: Raw text to tokenize and window.
        batch_size: Forwarded to ``DataLoader``.
        max_length: Window length in tokens, forwarded to ``GPTDatasetV1``.
        stride: Step between window starts, forwarded to ``GPTDatasetV1``.
        shuffle: Forwarded to ``DataLoader``.
        drop_last: Forwarded to ``DataLoader``.
        num_workers: Forwarded to ``DataLoader``.

    Returns:
        The configured ``DataLoader``.
    """
    gpt2_encoding = tiktoken.get_encoding("gpt2")
    windowed_dataset = GPTDatasetV1(text, gpt2_encoding, max_length, stride)
    return DataLoader(
        windowed_dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

In [14]:
# Re-read the source text (same file as in the first cell).
with open("./../the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()
# batch_size=1, max_length=4, stride=1 and shuffle=False: one 4-token window
# per batch, in order, each window starting one token after the previous one.
dataloader = create_dataloader(raw_text, batch_size = 1, max_length =4, stride = 1, shuffle = False)
# Keep the iterator in a variable so a later cell can pull the next batch from it.
data_iter = iter(dataloader)
first_batch = next(data_iter)
print("First batch:", first_batch)

First batch: [tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]


In [15]:
# Advance the same iterator; with stride=1 this window starts one token
# after the first batch's window.
second_batch = next(data_iter)
print("Second batch:", second_batch)

Second batch: [tensor([[ 367, 2885, 1464, 1807]]), tensor([[2885, 1464, 1807, 3619]])]
