In [1]:
!pip install tiktoken -q

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.2/1.2 MB[0m [31m6.1 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.2/1.2 MB[0m [31m18.6 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [31]:
import torch
import torch.nn as nn

import tiktoken
import numpy as np
import matplotlib.pyplot as plt
import re

In [3]:
with open("the-verdict.txt", "r", encoding="utf-8") as f:
  raw_text = f.read()
print("Total number of character:", len(raw_text))
print(raw_text[:99])

Total number of character: 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


In [4]:
text = "Hello, world. This, is a test."
result = re.split(r'([,.:;?_!"()\']|--|\s)', text)
result = [item.strip() for item in result if item.strip()]
print(result)

['Hello', ',', 'world', '.', 'This', ',', 'is', 'a', 'test', '.']


In [5]:
result = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
result = [item.strip() for item in result if item.strip()]
print(len(result))
print(len(np.unique(result)))

4690
1130


In [6]:
vocab = np.sort(np.unique([word for word in result]))
word_2_idx = {w:v for v,w in enumerate(vocab)}

In [7]:
class Tokenizer_V1:
  def __init__(self, vocab: list):
    self.word2idx = {w:v for v,w in enumerate(vocab)}
    self.word2idx["<unk>"] = len(vocab) + 1
    self.word2idx["<eot>"] = self.word2idx["<unk>"] +1
    self.idx2word = {v:k for k,v in self.word2idx.items()}

  def encode(self, text: str):
    result = re.split(r'([,.:;?_!"()\']|--|\s)', text)
    result = [item.strip() for item in result if item.strip()]
    return [self.word2idx[token] if token in self.word2idx else self.word2idx["<unk>"] for token in result] + [self.word2idx["<eot>"]]

  def decode(self, ids: list):
    text = " ".join([self.idx2word[id] for id in ids])
    text = re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)
    return text

In [8]:
text = "I am a loser."
tokenizer = Tokenizer_V1(vocab)
ids = tokenizer.encode(text)
words = tokenizer.decode(ids)
print(text)
print(ids)
print(words)

I am a loser.
[53, 150, 115, 1131, 7, 1132]
I am a <unk>. <eot>


In [9]:
tokenizer = tiktoken.get_encoding("gpt2")

In [10]:
enc_text = tokenizer.encode(raw_text)
print(len(enc_text))

5145


In [11]:
def create_data_chunks(encoded: list, context_length: int):
  inputs = []
  targets = []
  num_chunks = (len(encoded) - 1) // context_length
  for i in range(num_chunks):
    input, target = encoded[i*context_length: (i+1)*context_length], encoded[i*context_length + 1: (i+1)*context_length + 1]
    inputs.append(input)
    targets.append(target)

  return torch.tensor(inputs), torch.tensor(targets)

In [12]:
X, Y = create_data_chunks(enc_text, 12)
print(X.shape)
print(Y.shape)
print(X[0])
print(Y[0])

torch.Size([428, 12])
torch.Size([428, 12])
tensor([   40,   367,  2885,  1464,  1807,  3619,   402,   271, 10899,  2138,
          257,  7026])
tensor([  367,  2885,  1464,  1807,  3619,   402,   271, 10899,  2138,   257,
         7026, 15632])


In [13]:
class GPTDataset(torch.utils.data.Dataset):
  def __init__(self, text, tokenizer, max_length, stride):
    self.input_ids = []
    self.target_ids  = []

    token_ids = tokenizer.encode(text)
    for i in range(0, len(token_ids) - max_length, stride):
      inputs, outputs = token_ids[i: i+max_length], token_ids[i+1: i+max_length+1]
      self.input_ids.append(torch.tensor(inputs))
      self.target_ids.append(torch.tensor(outputs))

  def __len__(self):
    return len(self.input_ids)

  def __getitem__(self, idx):
    return self.input_ids[idx], self.target_ids[idx]

In [14]:
def create_dataloader(text, tokenizer, max_length, stride, batch_size, shuffle=True, drop_last=True):
  dataset = GPTDataset(text, tokenizer,  max_length, stride)
  return torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last)

In [15]:
dataloader = create_dataloader(raw_text, tokenizer, batch_size=4, max_length=32, stride=1, shuffle=False)

In [16]:
x,y = next(iter(dataloader))
print(x.shape)

torch.Size([4, 32])


In [17]:
emb_layer = torch.nn.Embedding(num_embeddings=tokenizer.n_vocab, embedding_dim=16)

In [18]:
out = emb_layer(x)
print(out.shape)

torch.Size([4, 32, 16])


In [19]:
%%writefile data_setup.py
import torch
import tiktoken


class GPTDataset(torch.utils.data.Dataset):
  def __init__(self,
               txt: str,
               tokenizer: tiktoken.Encoding,
               max_length: int,
               stride: int):
    self.input_ids = []
    self.target_ids = []
    token_ids = tokenizer.encode(txt)

    for i in range(0, len(token_ids) - max_length, stride):
      input_chunk = token_ids[i: i+max_length]
      target_chunk = token_ids[i+1: i+1+max_length]
      self.input_ids.append(torch.tensor(input_chunk))
      self.target_ids.append(torch.tensor(target_chunk))

  def __len__(self):
    return len(self.input_ids)

  def __getitem__(self, idx):
    return self.input_ids[idx], self.target_ids[idx]


def create_dataloader(txt, tokenizer, max_length=256, stride=128, batch_size=4, shuffle=True, drop_last=True, num_workers=0):
  dataset = GPTDataset(txt, tokenizer, max_length, stride)
  dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last, num_workers=num_workers)
  return dataloader



Writing data_setup.py


In [20]:
context_length = 32
embedding_dim = 16

emb_layer = torch.nn.Embedding(num_embeddings=tokenizer.n_vocab, embedding_dim=embedding_dim)
pos_encoding_layer = torch.nn.Embedding(context_length, embedding_dim)
pos_enc = pos_encoding_layer(torch.arange(context_length))

out = emb_layer(x) + pos_enc
out.shape

torch.Size([4, 32, 16])

In [65]:
inputs = torch.tensor(
[[0.43, 0.15, 0.89], # Your (x^1)
[0.55, 0.87, 0.66], # journey (x^2)
[0.57, 0.85, 0.64], # starts (x^3)
[0.22, 0.58, 0.33], # with (x^4)
[0.77, 0.25, 0.10], # one (x^5)
[0.05, 0.80, 0.55]] # step (x^6)
)

In [66]:
inputs.shape

torch.Size([6, 3])

In [30]:
dpa = inputs @ inputs.T
attention_weights = torch.softmax(dpa, dim=-1)
context_vecs = attention_weights @ inputs
print(context_vecs)

tensor([[0.4421, 0.5931, 0.5790],
        [0.4419, 0.6515, 0.5683],
        [0.4431, 0.6496, 0.5671],
        [0.4304, 0.6298, 0.5510],
        [0.4671, 0.5910, 0.5266],
        [0.4177, 0.6503, 0.5645]])


In [29]:
print(np.sum([0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581]))

1.0


In [32]:
class AttentionV1(nn.Module):
  def __init__(self):
    super().__init__()

  def forward(self, inputs):
    d = torch.tensor(inputs.shape[-1])
    Q, K, V = inputs, inputs, inputs
    attention_scores = Q @ K.transpose(-1, -2)/torch.sqrt(d)
    attention_weights = torch.softmax(attention_scores, dim=-1)
    return attention_weights @ V

In [36]:
class AttentionV2(nn.Module):
  def __init__(self, d: int, qkv_bias: bool=False):
    super().__init__()
    self.d = torch.tensor(d)
    self.Wq = nn.Linear(d, d, bias=qkv_bias)
    self.Wk = nn.Linear(d, d, bias=qkv_bias)
    self.Wv = nn.Linear(d, d, bias=qkv_bias)

  def forward(self, inputs):
    Q = self.Wq(inputs)
    K = self.Wk(inputs)
    V = self.Wv(inputs)
    attention_scores = Q @ K.transpose(-1, -2)/torch.sqrt(self.d)
    attention_weights = torch.softmax(attention_scores, dim=-1)
    return attention_weights @ V

In [76]:
class CausalSelfAttention(nn.Module):
  def __init__(self,
               context_length: int,
               d: int,
               qkv_bias: bool=False) -> None:
    super().__init__()
    self.d = torch.tensor(d)
    self.Wq = nn.Linear(d, d, bias=qkv_bias)
    self.Wk = nn.Linear(d, d, bias=qkv_bias)
    self.Wv = nn.Linear(d, d, bias=qkv_bias)
    self.mask = torch.triu(torch.ones(context_length, context_length), diagonal=1)

  def forward(self, inputs):
    Q = self.Wq(inputs)
    K = self.Wk(inputs)
    V = self.Wv(inputs)
    attention_scores = Q @ K.transpose(-1, -2)/torch.sqrt(self.d)
    masked = attention_scores.masked_fill(self.mask.bool(), -torch.inf)
    attention_weights = torch.softmax(masked, dim=-1)
    print(masked)
    print(attention_weights)
    return attention_weights @ V



In [77]:
l = CausalSelfAttention(6, 3, False)
input = inputs.unsqueeze(0)
out = l(input)
print(input.shape)
print(out.shape)

tensor([[[-0.2636,    -inf,    -inf,    -inf,    -inf,    -inf],
         [-0.3517, -0.3221,    -inf,    -inf,    -inf,    -inf],
         [-0.3477, -0.3169, -0.3095,    -inf,    -inf,    -inf],
         [-0.1904, -0.1737, -0.1702, -0.0888,    -inf,    -inf],
         [-0.1783, -0.1342, -0.1286, -0.0722,  0.0102,    -inf],
         [-0.2419, -0.2359, -0.2324, -0.1199, -0.1026, -0.1637]]],
       grad_fn=<MaskedFillBackward0>)
tensor([[[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.4926, 0.5074, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.3257, 0.3359, 0.3384, 0.0000, 0.0000, 0.0000],
         [0.2413, 0.2454, 0.2462, 0.2671, 0.0000, 0.0000],
         [0.1847, 0.1930, 0.1941, 0.2053, 0.2230, 0.0000],
         [0.1568, 0.1578, 0.1583, 0.1772, 0.1803, 0.1696]]],
       grad_fn=<SoftmaxBackward0>)
torch.Size([1, 6, 3])
torch.Size([1, 6, 3])


In [49]:
temp = torch.triu(torch.ones(3,3)*-torch.inf, diagonal=1)
torch.softmax(temp, dim=-1)

tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])