## pip

In [3]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (

In [4]:
!pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.9.0


## Import

In [5]:
import os
import re
import time
import json
import random
import string
import psutil
import pickle
from tqdm import tqdm
from pprint import pprint
from collections import Counter, defaultdict

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import mode

from datasets import load_dataset
from tokenizers import Tokenizer, models, trainers, pre_tokenizers, normalizers, decoders, processors
import tiktoken

import torch
from torch.utils.data import TensorDataset, Dataset, IterableDataset, DataLoader

## Tokenize

In [None]:
# Load TinyStories by its dataset identifier; the cells below read its
# "train" and "validation" splits.
dataset = load_dataset("roneneldan/TinyStories")

In [None]:
# GPT-2 encoding from tiktoken; the validation cell below reuses `tokenizer`.
tokenizer = tiktoken.get_encoding("gpt2")

# One NumPy array of token ids per training story, in dataset order.
tokenized_train_samples = [
    np.array(tokenizer.encode(story["text"]))
    for story in tqdm(dataset["train"], desc="Tokenizing Train Set")
]

In [None]:
# One NumPy array of token ids per validation story, in dataset order.
tokenized_valid_samples = [
    np.array(tokenizer.encode(story["text"]))
    for story in tqdm(dataset["validation"], desc="Tokenizing validation Set")
]

In [None]:
# Show the first tokenized validation story (as a one-element list).
tokenized_valid_samples[0:1]

In [None]:
# Total number of tokens across all training stories.
sumtoks = sum(map(len, tokenized_train_samples))
print(sumtoks)

## Save and Read Tokens

In [None]:
# # Save tokens as a pickle file
# with open('tokenized_train_samples.pkl', 'wb') as f:
#     pickle.dump(tokenized_train_samples, f)

# with open('tokenized_valid_samples.pkl', 'wb') as f:
#     pickle.dump(tokenized_valid_samples, f)

In [None]:
# # Load data from the pickle file
# with open('tokenized_train_samples.pkl', 'rb') as f:
#     tokenized_train_samples = pickle.load(f)

# with open('tokenized_valid_samples.pkl', 'rb') as f:
#     tokenized_valid_samples = pickle.load(f)

## EDA

In [None]:
# Number of tokens in each training story, in dataset order.
token_count_stories = [len(story_tokens) for story_tokens in tokenized_train_samples]

In [None]:
# NumPy array of the per-story token counts, used by the sorting cell below.
token_count_stories_np = np.array(token_count_stories)

In [None]:
# Histogram (50 bins, with a KDE overlay) of tokens per training story.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(token_count_stories, bins=50, kde=True, ax=ax)
ax.set_xlabel('Token Count')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Token Counts')
plt.show()

In [None]:
# The 1000 smallest per-story token counts, in ascending order.
np.sort(token_count_stories_np)[0:1000]

# 🔴 **Utils**

In [None]:
def prepare_data(tokens, seq_len):
    """Cut a token sequence into fixed-length rows.

    Trailing tokens that do not fill a complete row of ``seq_len`` are dropped.

    Args:
        tokens: Object exposing ``.shape``, slicing and ``.view(-1, seq_len)``
            (e.g. a ``torch.Tensor``); its first dimension is the token axis.
        seq_len: Number of tokens per row.

    Returns:
        ``tokens`` truncated along its first dimension and viewed as
        ``(-1, seq_len)``.
    """
    total = tokens.shape[0]
    # Largest multiple of seq_len that fits in the available tokens.
    usable = total - total % seq_len
    return tokens[:usable].view(-1, seq_len)


In [None]:
def num_trainable_params(model):
    """Return the number of trainable parameters of ``model``, in millions.

    Only parameters with ``requires_grad`` set are counted.

    Args:
        model: Object whose ``parameters()`` yields tensors exposing
            ``numel()`` and ``requires_grad``.

    Returns:
        The element count of all trainable parameters divided by 1e6.
    """
    trainable_sizes = [p.numel() for p in model.parameters() if p.requires_grad]
    return sum(trainable_sizes) / 1e6

In [None]:
def calculate_time(model, x, num_runs=10):
    """Measure the average wall-clock time of one ``model(*x)`` call.

    Args:
        model: Callable to time.
        x: Iterable of positional arguments; unpacked as ``model(*x)``.
        num_runs: Number of consecutive calls to average over. Must be
            non-zero, otherwise the final division raises ZeroDivisionError.

    Returns:
        Elapsed seconds divided by ``num_runs``.
    """
    # Only synchronize when CUDA is present: torch.cuda.synchronize() raises
    # on CPU-only machines, which made this helper unusable there.
    use_cuda = torch.cuda.is_available()
    if use_cuda:
        # Drain previously queued GPU work so it is not billed to this run.
        torch.cuda.synchronize()
    # perf_counter is monotonic and high resolution, unlike time.time().
    start = time.perf_counter()
    for _ in range(num_runs):
        model(*x)
    if use_cuda:
        # GPU kernels launch asynchronously; wait for them before stopping the clock.
        torch.cuda.synchronize()
    return (time.perf_counter() - start) / num_runs

## 🟥 Transformer Model

In [2]:
import time
from dataclasses import dataclass

# from datasets import load_dataset
from tokenizers import Tokenizer

import matplotlib.pyplot as plt

import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
from torch.nn import functional as F

In [21]:
class MultiHeadAtention(torch.nn.Module):
    """Multi-head self-attention over a ``(batch, seq_len, embed_dim)`` input.

    Queries, keys and values come from one fused linear projection. No
    attention mask is passed, so every position attends to every position.
    The class name's spelling is kept unchanged so existing references work.

    Args:
        embed_dim: Size of the last dimension of the input and output.
        num_heads: Number of attention heads; must divide ``embed_dim``.

    Raises:
        ValueError: If ``embed_dim`` is not divisible by ``num_heads``.
    """

    def __init__(self, embed_dim, num_heads):
        super().__init__()
        if embed_dim % num_heads != 0:
            raise ValueError(
                f"embed_dim ({embed_dim}) must be divisible by num_heads ({num_heads})"
            )
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        # One matmul produces q, k and v for every head.
        self.qkv_proj = torch.nn.Linear(embed_dim, 3 * embed_dim)
        self.out_proj = torch.nn.Linear(embed_dim, embed_dim)

    def forward(self, x):
        """Apply self-attention to ``x``.

        Args:
            x: Tensor of shape ``(batch, seq_len, embed_dim)``.

        Returns:
            Tensor of shape ``(batch, seq_len, embed_dim)``.
        """
        batch_size, seq_len, _ = x.size()
        qkv = self.qkv_proj(x).view(
            batch_size, seq_len, 3, self.num_heads, self.head_dim
        )
        # Move the q/k/v axis to the front and heads ahead of the sequence axis:
        # (3, batch, heads, seq_len, head_dim). Splitting on that leading axis
        # separates q, k and v instead of slicing the batch.
        q, k, v = qkv.permute(2, 0, 3, 1, 4).unbind(0)
        attn = F.scaled_dot_product_attention(q, k, v)
        # Merge the heads back: (batch, heads, seq_len, head_dim) -> (batch, seq_len, embed_dim).
        attn = attn.transpose(1, 2).reshape(batch_size, seq_len, self.embed_dim)
        return self.out_proj(attn)

In [None]:
# Toy input: the values 1..24 as float32, shaped (2, 3, 4).
# torch.arange replaces the deprecated torch.range; its end bound is
# exclusive, hence 25.
x = torch.arange(1, 25, dtype=torch.float32).view(2, 3, 4)
print(x)

# Swap the first two axes, giving shape (3, 2, 4).
x = x.transpose(1, 0)
print(x)

In [27]:
# Run the toy input through a 2-head attention layer with embed_dim 4 and
# display the output shape next to the input shape.
print(x.shape)
attention = MultiHeadAtention(embed_dim=4, num_heads=2)
y = attention(x)
y.shape

torch.Size([3, 2, 4])


torch.Size([3, 2, 4])