## pip

In [None]:
!pip install datasets

In [None]:
!pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m29.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.9.0


## Import

In [1]:
import os
import re
import time
import json
import random
import string
import psutil
import pickle
from tqdm import tqdm
from pprint import pprint
from collections import Counter, defaultdict

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import mode

from datasets import load_dataset
from tokenizers import Tokenizer, models, trainers, pre_tokenizers, normalizers, decoders, processors
import tiktoken

import torch
from torch.utils.data import TensorDataset, Dataset, IterableDataset, DataLoader

# 🟥 tokenize Tiktoken fast

In [None]:
dataset=load_dataset("roneneldan/TinyStories")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
dataset.shape

{'train': (2119719, 1), 'validation': (21990, 1)}

In [None]:
tokenizer=tiktoken.get_encoding("gpt2")
tokenized_train_samples = []
for item in tqdm(dataset["train"], desc="Tokenizing Train Set"):
    input_ids = tokenizer.encode(item["text"])
    tokenized_train_samples.append(np.array(input_ids))

In [None]:
tokenized_valid_samples = []
for item in tqdm(dataset["validation"], desc="Tokenizing validation Set"):
    input_ids = tokenizer.encode(item["text"])
    tokenized_valid_samples.append(np.array(input_ids))

In [None]:
tokenized_valid_samples[:1]

In [None]:
sumtoks=  sum(len(tok) for tok in tokenized_train_samples)
print(sumtoks)

# 🟥 Train Bpe Tokenizer and data loader

## 🟧 BPE Trainer

In [None]:
# Initialize a BPE tokenizer
tokenizer = Tokenizer(models.BPE(unk_token="|<unk>|"))

# Use a pre-tokenizer to split text into words
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)

# Initialize a BPE trainer
trainer = trainers.BpeTrainer(
    vocab_size=10_000,  # Set the vocabulary size
    special_tokens=["|<unk>|", "<|endoftext|>"],
    min_frequency=2,  # Set the minimum frequency of tokens
    )

# Train the tokenizer on a custom dataset
tokenizer.train_from_iterator(dataset["train"]["text"], trainer)

# Add special tokens
tokenizer.post_processor = processors.TemplateProcessing(
    single="<|endoftext|> $A",
    special_tokens=[("<|endoftext|>", tokenizer.token_to_id("<|endoftext|>"))],
)

# Add decoder
tokenizer.decoder = decoders.ByteLevel(add_prefix_space=False)

# Save the trained tokenizer
tokenizer.save("bpe-tokenizer_tinystories.json")

#
print(f"🎉 Tokenizer training complete!")
print(f"🔹 Vocabulary size: {tokenizer.get_vocab_size():,} tokens")

🎉 Tokenizer training complete!
🔹 Vocabulary size: 10,000 tokens


In [None]:
# Initialize a BPE tokenizer
tokenizer = Tokenizer.from_file("bpe-tokenizer_tinystories.json")
print(f"🎉 Tokenizer training complete!")
print(f"🔹 Vocabulary size: {tokenizer.get_vocab_size():,} tokens")

🎉 Tokenizer training complete!
🔹 Vocabulary size: 10,000 tokens


In [None]:
sent = 'They played together all day and became best friends.'
tokens = tokenizer.encode(sent)
print(tokens.ids)
print(tokens.tokens)

pprint(tokenizer.decode(tokens.ids))

[1, 546, 667, 462, 378, 252, 161, 1042, 725, 375, 15]
['<|endoftext|>', 'They', 'Ġplayed', 'Ġtogether', 'Ġall', 'Ġday', 'Ġand', 'Ġbecame', 'Ġbest', 'Ġfriends', '.']
'They played together all day and became best friends.'


## 🟧 Save and load Tokens with BPE tokenizer

In [None]:
# Tokenization {train}
tokenized_train_samples = []
for item in tqdm(dataset["train"], desc="Tokenizing Train Set"):
    input_ids = tokenizer.encode(item["text"]).ids
    tokenized_train_samples.append(np.array(input_ids))

Tokenizing Train Set: 100%|██████████| 2119719/2119719 [20:22<00:00, 1733.38it/s]


In [None]:
tokenized_train_samples_concat=[]
tokenized_train_samples_concat = np.concatenate(tokenized_train_samples)
len(tokenized_train_samples_concat)

464965814

In [None]:
# Save tokens as a pytorch file
torch.save(torch.tensor(tokenized_train_samples_concat), 'tokenized-train-samples_vocab-10k.pt')

In [None]:
# Tokenization {validation}
tokenized_valid_samples = []
for item in tqdm(dataset["validation"], desc="Tokenizing Validation Set"):
    input_ids = tokenizer.encode(item["text"]).ids
    tokenized_valid_samples.append(np.array(input_ids))

Tokenizing Validation Set: 100%|██████████| 21990/21990 [00:12<00:00, 1822.45it/s]


In [None]:
tokenized_valid_samples_concat=[]
tokenized_valid_samples_concat = np.concatenate(tokenized_valid_samples)
len(tokenized_valid_samples_concat)

4673588

In [None]:
# Save tokens as a pytorch file
torch.save(torch.tensor(tokenized_valid_samples_concat), 'tokenized-valid-samples_vocab-10k.pt')

## 🟧 load Data

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
tokenized_train_samples = torch.load('/content/drive/MyDrive/temp/tokenized-train-samples_vocab-10k.pt')

In [4]:
print(tokenized_train_samples.shape)
tokenized_train_samples[:5]

torch.Size([464965814])


tensor([  1, 316, 252,  13, 155])

In [None]:
tokenized_valid_samples = torch.load('tokenized-valid-samples_vocab-10k.pt')

In [None]:
print(tokenized_valid_samples_load.shape)
print(tokenized_valid_samples_load[:10])
tokenized_valid_samples_concat[:10]

torch.Size([4673588])
tensor([   1, 2891,   15, 1014,  309,  159,  866,  460,  161,  223])


array([   1, 2891,   15, 1014,  309,  159,  866,  460,  161,  223])

## 🟧 Custom dataset

In [5]:
class TinyStoriesDataset(Dataset):

    def __init__(self, data, seq_len):
        self.seq_len = seq_len
        self.data = prepare_data(data, seq_len+1)

    def __len__(self):
        return self.data.shape[0]

    def __getitem__(self, idx):
        sample = self.data[idx]
        return sample[:-1], sample[1:]

In [None]:
train_set = TinyStoriesDataset(tokenized_train_samples, 128)
train_set.data.shape, len(train_set), train_set[0]

In [13]:
%timeit next(iter(train_set))

6.5 µs ± 104 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


## 🟧 DataLoader

In [14]:
# tokenized_train_samples = torch.load('/content/drive/MyDrive/temp/tokenized-train-samples_vocab-10k.pt')
tokenized_valid_samples = torch.load('/content/drive/MyDrive/temp/tokenized-valid-samples_vocab-10k.pt')

train_set = TinyStoriesDataset(tokenized_train_samples, seq_len=128)
valid_set = TinyStoriesDataset(tokenized_valid_samples, seq_len=128)

In [15]:
train_loader = DataLoader(train_set, batch_size=32, shuffle=True, pin_memory=True), num_workers=4)
valid_loader = DataLoader(valid_set, batch_size=32, shuffle=False, pin_memory=True), num_workers=4)

In [None]:
x_batch, y_batch = next(iter(train_loader))
x_batch.shape, y_batch.shape

In [None]:
len(train_loader), len(valid_loader)

In [None]:
%timeit next(iter(train_loader))

## 🟧 EDA

In [None]:
token_count_stories=[]
for tokns in tokenized_train_samples:
    token_count_stories.append(len(tokns))

In [None]:
token_count_stories_np=np.array(token_count_stories)

In [None]:
plt.figure(figsize=(10, 6))
sns.histplot(token_count_stories, bins=50, kde=True)
plt.xlabel('Token Count')
plt.ylabel('Frequency')
plt.title('Distribution of Token Counts')
plt.show()

In [None]:
np.sort(token_count_stories_np)[:1000]

# 🔴 **Utils**

In [7]:
def prepare_data(tokens, seq_len):
    # Trim tokens so that total length is divisible by seq_len
    n_tokens = (tokens.shape[0] // seq_len) * seq_len
    tokens = tokens[:n_tokens]

    # Reshape to 2D tensor
    return tokens.view(-1, seq_len)


In [8]:
def num_trainable_params(model):
  nums = sum(p.numel() for p in model.parameters() if p.requires_grad)/1e6
  return nums

In [9]:
def calculate_time(model, x, num_runs=10):
    torch.cuda.synchronize()
    start = time.time()
    for _ in range(num_runs):
        model(*x)
    torch.cuda.synchronize()
    return (time.time() - start) / num_runs

# 🟥 Transformer Model from scratch

In [None]:
import time
from dataclasses import dataclass

# from datasets import load_dataset
from tokenizers import Tokenizer

import matplotlib.pyplot as plt

import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
from torch.nn import functional as F

In [None]:
class MultiHeadAtention(torch.nn.Module):
    def __init__(self, embed_dim, num_heads):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        self.qkv_proj = torch.nn.Linear(embed_dim, 3 * embed_dim)
        self.out_proj = torch.nn.Linear(embed_dim, embed_dim)
    # احتمالا خطا ابعاد دارد زمان ترین ممکن است ترین نشود
    def forward(self, x):
        batch_size, seq_len, embed_dim = x.size()
        k,q,v = self.qkv_proj(x).view(batch_size, seq_len, 3, self.num_heads, self.head_dim).transpose(1,2).chunk(3)
        # F.scaled_dot_product_attention(q,k,v)
        # return self.out_proj(x)
        return q

In [None]:
x=torch.range(1,24).view(2,3,4)
print(x)

# /x=x.transpose(1,0)
print(x)
#

tensor([[[ 1.,  2.,  3.,  4.],
         [ 5.,  6.,  7.,  8.],
         [ 9., 10., 11., 12.]],

        [[13., 14., 15., 16.],
         [17., 18., 19., 20.],
         [21., 22., 23., 24.]]])
tensor([[[ 1.,  2.,  3.,  4.],
         [ 5.,  6.,  7.,  8.],
         [ 9., 10., 11., 12.]],

        [[13., 14., 15., 16.],
         [17., 18., 19., 20.],
         [21., 22., 23., 24.]]])


  x=torch.range(1,24).view(2,3,4)


In [None]:
print(x.shape)
y= MultiHeadAtention(4,2)(x)
y.shape

torch.Size([2, 3, 4])


ValueError: not enough values to unpack (expected 3, got 2)

# 🔴 **Model from scratch - Howsam**

## 🟠 Embedding

In [None]:
wte = nn.Embedding(tokenizer.get_vocab_size(), 100)
wte(torch.tensor([1, 2, 100])).shape

In [None]:
wpe = nn.Embedding(seq_len, 100)
wpe(torch.tensor([1, 2, 100])).shape

torch.Size([3, 100])

In [None]:
x = wte(x_batch) + wpe(torch.arange(x_batch.shape[1]))
x.shape

torch.Size([64, 128, 100])

## 🟠 Scaled Dot-Product Attention

In [None]:
q = k = v = x
print(q.shape)

mask = torch.tril(torch.ones(seq_len, seq_len))

scores = q @ k.transpose(-2, -1) / (k.shape[-1]**0.5)
scores.masked_fill_(mask ==0, float(-torch.inf))
scores = scores.softmax(dim=-1)
print(scores.shape)

z = scores @ v
z.shape

In [None]:
# scores = torch.randn(3, 5, 5)
# mask = torch.tril(torch.ones(5, 5))
# scores.masked_fill_(mask ==0, float(-torch.inf))
# scores = scores.softmax(dim=-1)
# scores

In [None]:
def scaled_dot_product_attention(q, k, v):
    mask = torch.tril(torch.ones(q.shape[-2], q.shape[-2])).to(device)
    scores = q @ k.transpose(-2, -1) / (k.shape[-1]**0.5)
    scores.masked_fill_(mask==0, float(-torch.inf))
    scores = scores.softmax(dim=-1)
    z = scores @ v
    return z

In [None]:
scaled_dot_product_attention(x.to(device), x.to(device), x.to(device)).shape

In [None]:
q = torch.randn((128, 1024, 768), device=device)
k = torch.randn((128, 1024, 768), device=device)
v = torch.randn((128, 1024, 768), device=device)
q.shape

torch.Size([128, 1024, 768])

In [None]:
scaled_dot_product_attention(q, k, v).shape

torch.Size([128, 1024, 768])

In [None]:
calculate_time(scaled_dot_product_attention, (q, k, v), num_runs=20)

0.13929617404937744

In [None]:
F.scaled_dot_product_attention(q, k, v, is_causal=True).shape

  F.scaled_dot_product_attention(q, k, v, is_causal=True).shape


torch.Size([128, 1024, 768])

In [None]:
torch.abs(scaled_dot_product_attention(q, k, v) - F.scaled_dot_product_attention(q, k, v, is_causal=True)).max()

tensor(6.6757e-06, device='cuda:0')

In [None]:
calculate_time(F.scaled_dot_product_attention, (q, k, v), num_runs=20)

0.12824971675872804

## 🟠 Multi Head Attention

In [None]:
# class MultiHeadAttention(nn.Module):

#     def __init__(self):
#         super().__init__()
#         self.fc1 = nn.Linear(100, 1000)
#         self.fc2 = nn.Linear(1000, 100)
#         self.fc3 = nn.Linear(1000, 100)

#     def forward(self, x):
#         y = F.relu(self.fc1(x))
#         y1 = self.fc2(y)
#         y2 = self.fc3(y)
#         return F.relu(torch.concat([y1, y2], dim=-1))

In [None]:
# mha = MultiHeadAttention()
# num_trainable_params(mha)
# mha.forward(torch.rand(10, 100)).shape

In [None]:
class GPTConfig:
    n_embd: int = 100
    n_head: int = 5

config = GPTConfig()
config.n_embd

100

In [None]:
class MultiHeadAttention(nn.Module):

    def __init__(self, config):
        super().__init__()
        self.n_embd = config.n_embd
        self.n_head = config.n_head
        self.head_size = self.n_embd // self.n_head

        self.qkv_proj = nn.Linear(self.n_embd, 3*self.n_embd, bias=False)

        self.c_proj = nn.Linear(self.n_embd, self.n_embd, bias=False)
        self.c_proj.residual = True

    def forward(self, x):
        B, T, C = x.shape
        q, k, v = self.qkv_proj(x).view(B, T, 3*self.n_head, self.head_size).transpose(1, 2).chunk(3, dim=-3)

        y = F.scaled_dot_product_attention(q, k, v, is_causal=True)

        y = y.transpose(1, 2).contiguous().view(B, T, C)

        y = self.c_proj(y)
        return y

In [None]:
mha = MultiHeadAttention(config)
mha(x).shape

NameError: name 'x' is not defined

In [None]:
xx = torch.arange(24).view(2, 2, 3, 2)
print(xx)
xx.reshape(2, 3, 4)

tensor([[[[ 0,  1],
          [ 2,  3],
          [ 4,  5]],

         [[ 6,  7],
          [ 8,  9],
          [10, 11]]],


        [[[12, 13],
          [14, 15],
          [16, 17]],

         [[18, 19],
          [20, 21],
          [22, 23]]]])


tensor([[[ 0,  1,  2,  3],
         [ 4,  5,  6,  7],
         [ 8,  9, 10, 11]],

        [[12, 13, 14, 15],
         [16, 17, 18, 19],
         [20, 21, 22, 23]]])

In [None]:
calculate_time(mha.to(device), (x.to(device),), num_runs=20)

0.0009491205215454102

## 🟠 Feed Forward (MLP)

In [None]:
class GPTConfig:
    n_embd: int = 100
    n_head: int = 5
    f_expnd: float = 4

config = GPTConfig()
config.n_embd

100

In [None]:
class FeedForward(nn.Module):

    def __init__(self, config):
        super().__init__()
        self.n_embd = config.n_embd
        self.f_expnd = config.f_expnd

        self.up_proj = nn.Linear(self.n_embd, int(self.f_expnd*self.n_embd), bias=False)
        self.down_proj = nn.Linear(int(self.f_expnd*self.n_embd), self.n_embd, bias=False)
        self.down_proj.residual = True

    def forward(self, x):
        return self.down_proj(F.gelu(self.up_proj(x)))

In [None]:
mlp = FeedForward(config)
mlp(x).shape

torch.Size([64, 128, 100])

In [None]:
num_trainable_params(mlp)*1000

80.0

In [None]:
calculate_time(mlp, (x, ), num_runs=20)

0.013211965560913086

## 🟠 Decoder Block

In [None]:
class DecoderBlock(nn.Module):

    def __init__(self, config):
        super().__init__()
        self.n_embd = config.n_embd

        self.ln1 = nn.LayerNorm(config.n_embd)
        self.mha = MultiHeadAttention(config)

        self.ln2 = nn.LayerNorm(config.n_embd)
        self.mlp = FeedForward(config)

    def forward(self, x):
        x = x + self.mha(self.ln1(x))
        x = x + self.mlp(self.ln2(x))
        return x

In [None]:
decoder = DecoderBlock(config)
decoder(x).shape

torch.Size([64, 128, 100])

In [None]:
num_trainable_params(decoder) * 1e3

120.39999999999999

In [None]:
calculate_time(decoder, (x, ), num_runs=20) * 1e3

52.333009243011475

## 🟠 GPT

In [None]:
class GPTConfig:
    vocab_size: int = 10_000
    seq_len: int = 128
    n_layer: int = 12
    n_embd: int = 100
    n_head: int = 5
    f_expnd: float = 4


config = GPTConfig()
config.n_embd

100

In [None]:
class GPT(nn.Module):

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.wte = nn.Embedding(config.vocab_size, config.n_embd)
        self.wpe = nn.Embedding(config.seq_len, config.n_embd)
        # self.decoders = nn.Sequential(*[DecoderBlock(config) for _ in range(config.n_layer)])
        self.decoders = nn.ModuleList([DecoderBlock(config) for _ in range(config.n_layer)])
        self.lnf = nn.LayerNorm(config.n_embd)
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

        self.lm_head.weight = self.wte.weight
        # self.lm_head.weight.data.uniform_(-1/self.lm_head.in_features**0.5, 1/self.lm_head.in_features**0.5)
        # nn.init.uniform_(self.lm_head.weight, -1/self.lm_head.in_features**0.5, 1/self.lm_head.in_features**0.5)

        self.apply(self._init_weights)

    def _init_weights(self, module):
        std = 0.02
        if isinstance(module, nn.Linear):
            if hasattr(module, 'residual'):
                std *= (2*self.config.n_layer)**-0.5
            nn.init.normal_(module.weight, mean=0.0, std=std)
            if module.bias is not None:
                nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            nn.init.normal_(module.weight, mean=0.0, std=std)

    def forward(self, idx):
        B, T = idx.shape

        x = self.wte(idx) + self.wpe(torch.arange(T, device=device))

        # x = self.decoders(x)
        for decoder in self.decoders:
            x = decoder(x)

        x = self.lnf(x)
        logits = self.lm_head(x)
        return logits

In [None]:
model = GPT(config).to(device)
model(x_batch.to(device)).shape

torch.Size([64, 128, 10000])

In [None]:
num_trainable_params(model), num_trainable_params(model.decoders), num_trainable_params(model.lm_head)

(2.4578, 1.4448, 1.0)

In [None]:
calculate_time(model, (x_batch.to(device),), num_runs=100) * 1e3

13.62372875213623

## 🟠 Initialization

In [None]:
model = GPT(
    GPTConfig(
        seq_len=256, vocab_size=10_000, n_layer=4, n_embd=256, n_head=4
        )).to(device)

In [None]:
plt.hist(model.decoders[0].mha.c_proj.weight.flatten().detach().cpu(), bins=50);

In [None]:
0.02 * (2*4)**-0.5 * 3

0.021213203435596427

In [None]:
plt.hist(model.wpe.weight.flatten()[:100_000].detach().cpu(), bins=50);

In [None]:
plt.hist(model)

In [None]:
plt.hist(model.decoders[2].mlp.down_proj.weight.flatten().detach().cpu(), bins=50);

# 🟥 GPT model implement with nn.torch transformer