<a href="https://colab.research.google.com/github/anandharidas/LLMs-from-scratch/blob/main/ch02/01_main-chapter-code/dataloader.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<table style="width:100%">
<tr>
<td style="vertical-align:middle; text-align:left;">
<font size="2">
Supplementary code for the <a href="http://mng.bz/orYv">Build a Large Language Model From Scratch</a> book by <a href="https://sebastianraschka.com">Sebastian Raschka</a><br>
<br>Code repository: <a href="https://github.com/rasbt/LLMs-from-scratch">https://github.com/rasbt/LLMs-from-scratch</a>
</font>
</td>
<td style="vertical-align:middle; text-align:left;">
<a href="http://mng.bz/orYv"><img src="https://sebastianraschka.com/images/LLMs-from-scratch-images/cover-small.webp" width="100px"></a>
</td>
</tr>
</table>


# The Main Data Loading Pipeline Summarized

The complete chapter code is located in [ch02.ipynb](./ch02.ipynb).

This notebook contains the main takeaway from the chapter: the data loading pipeline, without the intermediate steps.

Packages that are being used in this notebook:

In [None]:
# NBVAL_SKIP
from importlib.metadata import version

# Report the installed version of each package this notebook relies on.
for pkg in ("torch", "tiktoken"):
    print(f"{pkg} version:", version(pkg))

torch version: 2.9.0+cpu
tiktoken version: 0.12.0


In [None]:
import tiktoken
import torch
from torch.utils.data import Dataset, DataLoader


class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-ID pairs.

    The text is tokenized once up front. Every window of ``max_length`` token
    IDs becomes an input, and the same window shifted one position to the
    right becomes its target.

    Args:
        txt: Raw text to tokenize.
        tokenizer: Object exposing ``encode(text, allowed_special=...)`` that
            returns a sliceable sequence of token IDs.
        max_length: Number of token IDs in each input/target window.
        stride: Offset between the starts of consecutive windows.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        # Encode the full text in one pass, allowing <|endoftext|> as a special token.
        encoded = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Stop early enough that the shifted target window never runs past the end.
        window_starts = range(0, len(encoded) - max_length, stride)

        self.input_ids = [
            torch.tensor(encoded[start:start + max_length])
            for start in window_starts
        ]
        self.target_ids = [
            torch.tensor(encoded[start + 1:start + max_length + 1])
            for start in window_starts
        ]

    def __len__(self):
        """Return the number of windows in the dataset."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` tensor pair stored at ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]


def create_dataloader_v1(txt, batch_size, max_length, stride,
                         shuffle=True, drop_last=True, num_workers=0):
    """Build a ``DataLoader`` of sliding-window (input, target) batches from raw text.

    The text is tokenized with tiktoken's "gpt2" encoding and windowed by
    ``GPTDatasetV1``.

    Args:
        txt: Raw text to tokenize.
        batch_size: Number of windows per batch.
        max_length: Number of token IDs per window.
        stride: Offset between the starts of consecutive windows.
        shuffle: Forwarded to ``DataLoader``.
        drop_last: Forwarded to ``DataLoader``.
        num_workers: Forwarded to ``DataLoader``.

    Returns:
        The configured ``DataLoader``.
    """
    dataset = GPTDatasetV1(
        txt,
        tiktoken.get_encoding("gpt2"),
        max_length,
        stride,
    )
    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )


# Read the whole training text into memory as one string.
# The path is relative, so the file must sit in the kernel's working directory.
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

# vocab_size: number of rows in the token-embedding table
# (presumably the size of the "gpt2" tokenizer vocabulary; TODO confirm).
# output_dim: width of every embedding vector.
# context_length: number of rows in the positional-embedding table, i.e. how
# many distinct positions it can look up.
vocab_size = 50257
output_dim = 256
context_length = 1024


# Both tables share output_dim so their lookups can be added together.
# NOTE(review): no random seed is set in this cell, so the initial weights
# (and the shuffled batch order below) differ from run to run.
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)

# stride == max_length, so each window starts where the previous one ended
# and the input windows do not overlap.
batch_size = 8
max_length = 4
dataloader = create_dataloader_v1(
    raw_text,
    batch_size=batch_size,
    max_length=max_length,
    stride=max_length
)

In [None]:
# Pull a single batch to show how the input embeddings are assembled.
for batch in dataloader:
    # x holds the input token IDs, y the same windows shifted one token to the right.
    x, y = batch

    # One output_dim-wide vector per token ID: (batch_size, max_length, output_dim).
    token_embeddings = token_embedding_layer(x)
    # One vector per position 0..max_length-1: (max_length, output_dim).
    pos_embeddings = pos_embedding_layer(torch.arange(max_length))

    # Broadcasting adds the same positional vectors to every sample in the batch.
    input_embeddings = token_embeddings + pos_embeddings

    # Only the first batch is needed for the demonstration.
    break

In [None]:
!pip install gensim



In [None]:

import gensim.downloader as api

# Load the pretrained "word2vec-google-news-300" vectors through gensim's downloader.
# NOTE(review): presumably this downloads a large file on first use; confirm
# the size before running on a metered connection.
model = api.load("word2vec-google-news-300")

In [None]:
# Analogy query: the ten words most similar to king + woman - man.
# The recorded output ranks "queen" first.
print(model.most_similar(positive=['king','woman'] , negative=['man'], topn=10))

[('queen', 0.7118193507194519), ('monarch', 0.6189674139022827), ('princess', 0.5902431011199951), ('crown_prince', 0.5499460697174072), ('prince', 0.5377321839332581), ('kings', 0.5236844420433044), ('Queen_Consort', 0.5235945582389832), ('queens', 0.518113374710083), ('sultan', 0.5098593235015869), ('monarchy', 0.5087411403656006)]


In [2]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m24.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [3]:
import gensim.downloader as api

In [4]:
# Load the pretrained "word2vec-google-news-300" vectors and bind them to `model`.
# The same load also appears in an earlier cell; running this one rebinds `model`.
model = api.load("word2vec-google-news-300")



In [9]:
# The ten words most similar to "tower", each paired with its similarity score.
model.most_similar("tower",topn=10)

[('towers', 0.8531750440597534),
 ('skyscraper', 0.6417425870895386),
 ('Tower', 0.639177143573761),
 ('spire', 0.594687819480896),
 ('responded_Understood_Atlasjet', 0.5931612253189087),
 ('storey_tower', 0.5783934593200684),
 ('SolarReserve_molten_salt', 0.5733036398887634),
 ('monopole_tower', 0.566946804523468),
 ('bell_tower', 0.5626809000968933),
 ('foot_monopole', 0.5514882802963257)]

In [11]:
# Indexing by a word returns the embedding vector stored for it.
model["om"]

array([ 5.51757812e-02,  1.88476562e-01, -1.87988281e-02, -6.05468750e-02,
       -3.44238281e-02, -1.80664062e-01,  2.40325928e-04, -9.08203125e-02,
       -3.32031250e-01, -6.34765625e-02, -3.43750000e-01, -1.49414062e-01,
       -4.51171875e-01,  2.70996094e-02, -3.08593750e-01, -6.39648438e-02,
        1.87500000e-01,  1.49414062e-01, -3.46679688e-02, -3.14941406e-02,
       -9.03320312e-02, -3.93066406e-02, -1.21093750e-01,  1.24023438e-01,
       -2.23632812e-01, -1.22558594e-01, -1.66015625e-01, -8.48388672e-03,
       -9.27734375e-02, -2.59765625e-01,  1.50390625e-01,  1.15234375e-01,
       -2.47070312e-01, -2.83203125e-01, -3.86718750e-01, -1.69921875e-01,
       -2.31445312e-01,  3.65234375e-01,  3.45703125e-01,  2.53906250e-02,
       -2.13867188e-01, -5.11718750e-01, -1.62109375e-01,  1.11328125e-01,
        6.59179688e-02,  1.17675781e-01, -2.20703125e-01, -1.92382812e-01,
       -1.89208984e-02,  5.19531250e-01,  4.46777344e-02,  4.62890625e-01,
        2.38037109e-02,  

In [14]:
# The ten words most similar to "om".
# NOTE(review): the recorded neighbours look like Dutch and Scandinavian words,
# so the model appears to treat "om" as a non-English token.
model.most_similar("om",topn=10)

[('på', 0.6145849823951721),
 ('aan', 0.61150723695755),
 ('ter', 0.6017832159996033),
 ('ne', 0.6013097167015076),
 ('waar', 0.594710648059845),
 ('moet', 0.5922201871871948),
 ('te', 0.5908598303794861),
 ('denne', 0.5897035598754883),
 ('naar', 0.5896697044372559),
 ('gaan', 0.5889247059822083)]

In [24]:
# Element-wise differences between pairs of word vectors; the cells below take
# their norms to compare how far apart the words lie in embedding space.
# Three pairs are fruit/fruit; apple - tennis is the unrelated-word contrast.
vector_diff1 = model["apple"] - model["mango"]
vector_diff2 = model["apple"] - model["tennis"]
vector_diff3 = model["mango"] - model["pear"]
vector_diff4 = model["apple"] - model["pear"]

In [20]:
import numpy as np
# Euclidean (L2) length of apple - mango, i.e. the distance between the two word vectors.
np.linalg.norm(vector_diff1)

np.float32(2.98844)

In [22]:
# Distance between "apple" and "tennis"; in the recorded output it is larger
# than any of the fruit-to-fruit distances.
np.linalg.norm(vector_diff2)

np.float32(4.210593)

In [26]:
# Distance between "mango" and "pear".
np.linalg.norm(vector_diff3)

np.float32(2.7786484)

In [28]:
# Distance between "apple" and "pear".
np.linalg.norm(vector_diff4)

np.float32(2.6607358)

In [31]:
# Distance between the singular and plural forms of the same word; the recorded
# value is the smallest of the distances computed in this notebook.
np.linalg.norm(model["apple"] - model["apples"])

np.float32(2.3678634)

In [32]:
# Creating token embeddings

In [34]:
import torch

In [36]:
# Four example token IDs, used to index a small embedding table further down.
input_ids = torch.tensor([2,3,5,1])

In [38]:
# Display the tensor of example token IDs.
input_ids

tensor([2, 3, 5, 1])

In [49]:
# Toy embedding table: 6 rows (one per token ID 0..5), 3 values per row.
# NOTE(review): this rebinds vocab_size and output_dim, which an earlier cell
# set to 50257 and 256; re-running that earlier cell's embedding setup after
# this one would pick up the small values.
vocab_size = 6
output_dim = 3
# Seed the RNG so the randomly initialised weights are the same on every run.
torch.manual_seed(123)
embedding_layer = torch.nn.Embedding(vocab_size,output_dim)

In [50]:
# The layer's weight matrix: one row per token ID, one column per embedding dimension.
embedding_layer.weight

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)

In [52]:
# Looking up token ID 3 returns row 3 of the weight matrix.
embedding_layer(torch.tensor([3]))

tensor([[-0.4015,  0.9666, -1.1481]], grad_fn=<EmbeddingBackward0>)

In [57]:
# Looking up several IDs returns the matching weight rows (2, 3, 5, 1) in that order.
embedding_layer(torch.tensor([2,3,5,1]))

tensor([[ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-2.8400, -0.7849, -1.4096],
        [ 0.9178,  1.5810,  1.3010]], grad_fn=<EmbeddingBackward0>)