In [1]:
import urllib.request
url = ("https://raw.githubusercontent.com/rasbt/"
       "LLMs-from-scratch/main/ch02/01_main-chapter-code/"
       "the-verdict.txt")
file_path = "the-verdict.txt" 
urllib.request.urlretrieve(url, file_path)

('the-verdict.txt', <http.client.HTTPMessage at 0x23e8167cc10>)

### 分词

In [11]:
import re 
with open("./the-verdict.txt",'r',encoding="utf-8") as f:
    raw_data = f.read()
preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', raw_data)
preprocessed = [item.strip() for item in preprocessed if item.split()]

In [12]:
len(preprocessed)

4690

### Tokens and Token IDs

In [14]:
# 构建词汇表
vocab = {token:integer for integer,token in enumerate(set(preprocessed))}
print(len(vocab))

1130


In [22]:
class SimapleTokenizerV1:
    def __init__(self, vocab):
        self.str2int = vocab
        self.int2str = {i:k for k,i in vocab.items()}
        
    def encode(self, raw):
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', raw)
        preprocessed = [item.strip() for item in preprocessed if item.split()]
        ids = [self.str2int[s] for s in preprocessed]
        return ids
    
    def decode(self, ids):
        texts = " ".join([self.int2str[i] for i in ids])
        texts = re.sub(r'\s+([,.?!"()\'])',r'\1', texts)
        return texts

In [23]:
t1 = SimapleTokenizerV1(vocab=vocab)
ids = t1.encode("It's me!")
print(ids)
print(t1.decode(ids))

[1106, 283, 564, 614, 686]
It' s me!


In [30]:
# 添加特殊字符
all_tokens = sorted(list(set(preprocessed)))
all_tokens.extend(["<|unk|>","<|endoftext|>"])
vocab = {token:integer for integer,token in enumerate(all_tokens)}
print(len(vocab))


1132


In [37]:
# 更新 SimapleTokenizerV1 
class SimapleTokenizerV2:
    def __init__(self, vocab):
        self.str2int = vocab
        self.int2str = {i:k for k,i in vocab.items()}
        
    def encode(self, texts):
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', texts)
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        preprocessed = [item if item in self.str2int else "<|unk|>" for item in preprocessed]
        ids = [self.str2int[s] for s in preprocessed]
        return ids 
    
    def decode(self, ids):
        texts = " ".join([self.int2str[i] for i in ids])
        texts = re.sub(r'\s+([,.?"!_):;\'])',r'\1',texts)
        texts = re.sub(r'([(])+\s',r'\1',texts)
        return texts
    

In [38]:
tokenizer = SimapleTokenizerV2(vocab)
texts = "Hello, this is: me; And that is (he)."
ids = tokenizer.encode(texts)
print(ids)
print(tokenizer.decode(ids))

[1130, 5, 999, 584, 8, 663, 9, 14, 987, 584, 3, 533, 4, 7]
<|unk|>, this is: me; And that is (he).


### BPE

In [1]:
!pip install tiktoken
import tiktoken 
tokenizer = tiktoken.get_encoding("gpt2")
text = ( "Hello, do you like tea? <|endoftext|> In the sunlit terraces" 
        "of someunknownPlace." )
print(tokenizer.encode(text,allowed_special={"<|endoftext|>"}))
print(tokenizer.decode(tokenizer.encode(text,allowed_special={"<|endoftext|>"})))

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Collecting tiktoken
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/dc/da/8d1cc3089a83f5cf11c2e489332752981435280285231924557350523a59/tiktoken-0.8.0-cp310-cp310-win_amd64.whl (884 kB)
     ---------------------------------------- 0.0/884.2 kB ? eta -:--:--
     ----------- ---------------------------- 262.1/884.2 kB ? eta -:--:--
     --------------------------------- ---- 786.4/884.2 kB 1.9 MB/s eta 0:00:01
     -------------------------------------- 884.2/884.2 kB 1.9 MB/s eta 0:00:00
Collecting regex>=2022.1.18 (from tiktoken)
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/42/7e/5f1b92c8468290c465fd50c5318da64319133231415a8aa6ea5ab995a815/regex-2024.11.6-cp310-cp310-win_amd64.whl (274 kB)
Collecting requests>=2.26.0 (from tiktoken)
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/f9/9b/335f9764261e915ed497fcdeb11df5dfd6f7bf257d4a6a2a686d80da4d54/requests-2.32.3-py3-none-any.whl (64 kB)
Collectin

In [5]:
text = "Akwirw ier"
print(tokenizer.encode(text,allowed_special={"<|endoftext|>"}))
print(tokenizer.decode(tokenizer.encode(text,allowed_special={"<|endoftext|>"})))

[33901, 86, 343, 86, 220, 959]
Akwirw ier


### 构建 dataset 和 dataloader

In [12]:
import torch
from torch.utils.data import Dataset, DataLoader

class GPTDatasetV1(Dataset):
    def __init__(self, text, tokenizer, max_length, stride):
        self.input = []
        self.target = []
        ids = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
        for i in range(0,len(ids)-max_length,stride):
            self.input.append(torch.tensor(ids[i:i+max_length]))
            self.target.append(torch.tensor(ids[i+1:i+max_length+1]))
    
    def __len__(self):
        return len(self.input)
    
    def __getitem__(self, idx):
        return self.input[idx], self.target[idx]
    
def create_dataloader_v1(text, batch_size=4, max_length=256, 
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0):
    tokenizer = tiktoken.get_encoding("gpt2")
    dataset = GPTDatasetV1(text, tokenizer, max_length, stride)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle,
                            num_workers=num_workers, drop_last=drop_last)
    return dataloader


In [13]:
with open("the-verdict.txt", "r", encoding="utf-8") as f: 
    raw_text = f.read()
t_dataloader=create_dataloader_v1(raw_text,batch_size=1,max_length=4,stride=1)
t_dataloader=iter(t_dataloader)
print(next(t_dataloader))

[tensor([[10724,   262,  6846,   338]]), tensor([[  262,  6846,   338, 11428]])]


### torch.nn.Embedding 

In [26]:
torch.manual_seed(123) 
vocab_size=5
output_dim=2
embedding_layer = torch.nn.Embedding(vocab_size, output_dim) 
print(embedding_layer.weight)
print(embedding_layer(torch.tensor([3,4])).shape)

Parameter containing:
tensor([[-0.1115,  0.1204],
        [-0.3696, -0.2404],
        [-1.1969,  0.2093],
        [-0.9724, -0.7550],
        [ 0.3239, -0.1085]], requires_grad=True)
torch.Size([2, 2])


### 增加 positional embedding

In [35]:
max_length=4
dataloader=create_dataloader_v1(raw_text,batch_size=8,max_length=max_length,stride=max_length,shuffle=False)
data_iter=iter(dataloader)
inputs, targets = next(data_iter)
print(inputs.shape)
print(targets.shape)
vocab_size = 50257 
output_dim = 256 
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
token_embeddings = token_embedding_layer(inputs)
print(token_embeddings.shape)
positional_embedding_layer = torch.nn.Embedding(vocab_size,output_dim)
positional_embedding = positional_embedding_layer(torch.arange(max_length))
print(positional_embedding.shape)
embedding = token_embeddings + positional_embedding
print(f"embedding.shape: {embedding.shape}")

torch.Size([8, 4])
torch.Size([8, 4])
torch.Size([8, 4, 256])
torch.Size([4, 256])
embedding.shape: torch.Size([8, 4, 256])
