In [15]:
!pip install tiktoken -q

In [54]:
import tiktoken
import torch
from torch.utils.data import Dataset, DataLoader

import re
import numpy as np
import matplotlib.pyplot as plt

from typing import List, Dict, Tuple

In [31]:
# Read the whole source text into memory as one string; the context manager
# closes the file handle as soon as the read is done.
with open("./the-verdict.txt", 'r', encoding='utf-8') as f:
  raw_text = f.read()

In [18]:
# Split on punctuation, double dashes and whitespace. The capture group makes
# re.split keep the delimiters in the result, so punctuation becomes tokens.
preprocessed = re.split(r'([,.?_!"()\']|--|\s)', raw_text)
# Drop the empty strings and whitespace-only pieces produced by the split.
preprocessed = [item for item in preprocessed if item.strip()]
len(preprocessed)

4649

In [19]:
# Case-insensitive vocabulary: the unique lower-cased tokens, in sorted order.
vocab = sorted(list(set([x.lower() for x in preprocessed])))
temp = enumerate(vocab)
# token -> id (enumerate yields (id, token) pairs, hence the swapped names).
encoder = {k:v for v,k in temp}
# id -> token, the inverse mapping used for decoding.
decoder = {v:k for k,v in encoder.items()}

In [20]:
# Round-trip check: map every token to its id and back again.
# Tokens are lower-cased before lookup because the vocabulary is lower-cased.
encoded = [encoder[x.lower()] for x in preprocessed]
decoded = [decoder[x] for x in encoded]
# Show only the first 20 entries to keep the output short.
print(encoded[:20])
print(decoded[:20])

[490, 436, 47, 966, 517, 405, 767, 12, 161, 400, 6, 965, 12, 417, 347, 303, 6, 868, 513, 1047]
['i', 'had', 'always', 'thought', 'jack', 'gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was']


In [21]:
class SimpleTokenizerV1:
  """Case-insensitive word/punctuation tokenizer with a vocabulary built from text.

  The text is split on punctuation, double dashes and whitespace; every
  non-blank piece is lower-cased and the sorted set of pieces forms the
  vocabulary.

  Attributes:
    encoder: dict mapping token string -> integer id.
    decoder: dict mapping integer id -> token string.
  """

  def __init__(self, raw_text):
    """Build the encoder/decoder tables from ``raw_text``."""
    pieces = re.split(r'([,.?_!"()\']|--|\s)', raw_text)
    unique_tokens = {piece.lower() for piece in pieces if piece.strip()}
    self.encoder = {}
    for token_id, token in enumerate(sorted(unique_tokens)):
      self.encoder[token] = token_id
    self.decoder = {token_id: token for token, token_id in self.encoder.items()}

  def encode(self, text):
    """Return the list of ids for ``text``.

    Raises:
      KeyError: if a token of ``text`` is not in the vocabulary.
    """
    token_ids = []
    for piece in re.split(r'([,.?_!"()\']|--|\s)', text):
      if piece.strip():
        token_ids.append(self.encoder[piece.lower()])
    return token_ids

  def decode(self, ids):
    """Return the tokens for ``ids`` joined by single spaces."""
    return " ".join(self.decoder[token_id] for token_id in ids)

In [22]:
class SimpleTokenizerV2:
  """Case-insensitive tokenizer with ``<unk>`` and ``<EOF>`` special tokens.

  The vocabulary is the sorted set of lower-cased pieces of the training
  text; ``<unk>`` and ``<EOF>`` are added after it with the two highest ids.

  Attributes:
    encoder: dict mapping token string -> integer id.
    decoder: dict mapping integer id -> token string.
  """

  def __init__(self, raw_text):
    """Build the encoder/decoder tables from ``raw_text``."""
    pieces = re.split(r'([,.?_!"()\']|--|\s)', raw_text)
    vocab = sorted({piece.lower() for piece in pieces if piece.strip()})
    vocab_size = len(vocab)

    self.encoder = dict(zip(vocab, range(vocab_size)))
    # Special tokens come right after the regular vocabulary.
    self.encoder["<unk>"] = vocab_size
    self.encoder["<EOF>"] = vocab_size + 1
    self.decoder = {token_id: token for token, token_id in self.encoder.items()}

  def encode(self, text):
    """Return the ids for ``text``, followed by the ``<EOF>`` id.

    Tokens that are not in the vocabulary are mapped to the ``<unk>`` id.
    """
    token_ids = []
    for piece in re.split(r'([,.?_!"()\']|--|\s)', text):
      if not piece.strip():
        continue
      word = piece.lower()
      key = word if word in self.encoder else "<unk>"
      token_ids.append(self.encoder[key])
    token_ids.append(self.encoder["<EOF>"])
    return token_ids

  def decode(self, ids):
    """Return the tokens for ``ids`` joined by single spaces."""
    return " ".join(self.decoder[token_id] for token_id in ids)

In [23]:
tok = SimpleTokenizerV2(raw_text)
ids = tok.encode("I am a loser")
words = tok.decode(ids)
print(ids)
print(words)

[490, 48, 12, 1105, 1106]
i am a <unk> <EOF>


In [24]:
tokenizer = tiktoken.get_encoding("gpt2")

In [29]:
text = "Ich kann es nicht tun"
ids = tokenizer.encode(text)
words = tokenizer.decode(ids)
print(text)
print(ids)
print(words)

Ich kann es nicht tun
[40, 354, 479, 1236, 1658, 299, 30830, 6278]
Ich kann es nicht tun


In [42]:
enc_text = tokenizer.encode(raw_text)
enc_sample = enc_text[:50]
temp = torch.tensor(enc_sample)
temp

tensor([   40,   367,  2885,  1464,  1807,  3619,   402,   271, 10899,  2138,
          257,  7026, 15632,   438,  2016,   257,   922,  5891,  1576,   438,
          568,   340,   373,   645,  1049,  5975,   284,   502,   284,  3285,
          326,    11,   287,   262,  6001,   286,   465, 13476,    11,   339,
          550,  5710,   465, 12036,    11,  6405,   257,  5527, 27075,    11])

In [46]:
class CustomTextDataset(Dataset):
  """Next-token dataset: a window of ``context_length`` ids and the id after it.

  Sample ``i`` is ``(ids[i : i + context_length], ids[i + context_length])``.

  Args:
    text: the text to tokenize.
    tokenizer: any object with an ``encode(text)`` method returning a
      sequence of integer ids.
    context_length: number of ids in each input window.
  """

  def __init__(self, text, tokenizer, context_length):
    self.tokenizer = tokenizer
    self.ids = torch.tensor(tokenizer.encode(text))
    self.context_length = context_length

  def __len__(self):
    """Number of complete (window, next id) pairs available.

    The last ``context_length`` positions have no full window plus target,
    so they are not counted; advertising them would make samplers request
    indices that ``__getitem__`` cannot serve.
    """
    return max(0, len(self.ids) - self.context_length)

  def __getitem__(self, index):
    """Return ``(x_sample, y_sample)`` for sample ``index``.

    Negative indices count from the end, as for Python sequences.

    Raises:
      IndexError: if ``index`` is outside ``[-len(self), len(self))``.
    """
    size = len(self)
    if index < 0:
      index += size
    # IndexError (not assert) survives ``python -O`` and lets plain iteration
    # over the dataset terminate.
    if not 0 <= index < size:
      raise IndexError(f"index {index} is out of range for a dataset of size {size}")
    window_end = index + self.context_length
    x_sample = self.ids[index: window_end]
    y_sample = self.ids[window_end]
    return x_sample, y_sample

In [55]:
class GPTDatasetV1(Dataset):
  """Sliding-window dataset of (input, target) id chunks.

  Each input chunk holds ``max_length`` consecutive ids; its target chunk is
  the same window shifted one position to the right. Consecutive windows
  start ``stride`` ids apart.

  Args:
    text: the text to tokenize.
    tokenizer: object with an ``encode(text)`` method returning a list of ids.
    max_length: number of ids in every chunk.
    stride: distance between the starts of consecutive windows.
  """

  def __init__(self, text, tokenizer, max_length, stride):
    self.tokenizer = tokenizer
    ids = self.tokenizer.encode(text)

    # Stop early enough that the shifted target window is always full.
    window_starts = range(0, len(ids) - max_length, stride)
    self.input_ids = [
      torch.tensor(ids[start: start + max_length]) for start in window_starts
    ]
    self.target_ids = [
      torch.tensor(ids[start + 1: start + max_length + 1]) for start in window_starts
    ]

  def __len__(self):
    """Number of (input, target) pairs."""
    return len(self.input_ids)

  def __getitem__(self, index):
    """Return the ``index``-th (input chunk, target chunk) pair of tensors."""
    return self.input_ids[index], self.target_ids[index]

In [51]:
train_ds = CustomTextDataset(raw_text, tokenizer, 5)
train_dl = DataLoader(train_ds, batch_size=4)
batch = next(iter(train_dl))

In [62]:
train_ds = GPTDatasetV1(raw_text, tokenizer, 5, 1)
train_dl = DataLoader(train_ds, batch_size=4)
xb, yb = next(iter(train_dl))

In [61]:
num_embedding = 50257
embedding_dim = 256
embedding_layer = torch.nn.Embedding(num_embedding, embedding_dim)

In [None]:
all_tokens = sorted(list(set(preprocessed)))
str_to_int = {s:i for i,s in enumerate(all_tokens)}

In [None]:
class Basic_Tokenizer:
  """Case-sensitive word/punctuation tokenizer with ``<|unk|>`` and ``<|EOT|>``.

  Attributes:
    vocab: sorted list of the unique tokens passed to the constructor.
    str_to_int: dict mapping token -> id; the two special tokens take the
      ids ``len(vocab)`` and ``len(vocab) + 1``.
    int_to_str: inverse of ``str_to_int``.
  """

  def __init__(self, vocab: List):
    """Build the lookup tables from an iterable of token strings."""
    self.vocab = sorted(set(vocab))
    self.str_to_int = {s: i for i, s in enumerate(self.vocab)}
    self.str_to_int["<|unk|>"] = len(self.vocab)
    self.str_to_int["<|EOT|>"] = len(self.vocab) + 1
    self.int_to_str = {i: s for s, i in self.str_to_int.items()}

  def encode(self, text):
    """Return the ids for ``text``, followed by the ``<|EOT|>`` id.

    Tokens without an id are mapped to ``<|unk|>``. Membership is decided by
    the ``str_to_int`` dict (constant-time lookup) rather than by scanning
    the ``vocab`` list, so the special tokens appearing literally in ``text``
    keep their own ids and ``encode(decode(ids))`` round-trips.
    """
    unk_id = self.str_to_int["<|unk|>"]
    pieces = re.split(r'([,.?_!"()\']|--|\s)', text)
    token_IDs = [self.str_to_int.get(piece, unk_id) for piece in pieces if piece.strip()]
    token_IDs.append(self.str_to_int["<|EOT|>"])
    return token_IDs

  def decode(self, ids):
    """Return the text for ``ids``.

    Tokens are joined with single spaces, then the whitespace in front of
    punctuation marks is removed.
    """
    text = " ".join(self.int_to_str[token] for token in ids)
    text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
    return text

In [None]:
tok = Basic_Tokenizer(preprocessed)
tokens = tok.encode("I am a loser. I am that, mon frere.")
tok.decode(tokens)

'I am a <|unk|>. I am that, <|unk|> <|unk|>. <|EOT|>'

In [None]:
# alltokens = sorted(list(set(preprocessed)))
# str_to_int = {s:i for i,s in enumerate(alltokens)}

In [None]:
all_tokens, preprocessed_ids = np.unique(preprocessed, return_inverse=True)
len(all_tokens), len(preprocessed_ids)

(1159, 4649)

In [None]:
all_tokens[preprocessed_ids[:10]]

array(['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a',
       'cheap', 'genius'], dtype='<U18')

In [None]:
class BasicTokenizerV1:
  """Tokenizer driven by a ready-made ``token -> id`` vocabulary.

  Attributes:
    str_to_int: the vocabulary dict passed to the constructor.
    int_to_str: inverse mapping, id -> token.
  """

  def __init__(self, vocab):
    """Store ``vocab`` (dict token -> id) and derive its inverse."""
    self.str_to_int = vocab
    self.int_to_str = {}
    for token, token_id in vocab.items():
      self.int_to_str[token_id] = token

  def encode(self, text):
    """Return the ids for ``text``.

    Raises:
      KeyError: if a token of ``text`` is not in the vocabulary.
    """
    pieces = re.split(r'([,.?_!"()\']|--|\s)', text)
    return [self.str_to_int[piece] for piece in pieces if piece.strip()]

  def decode(self, ids):
    """Return the text for ``ids`` with no space in front of punctuation."""
    spaced = " ".join(self.int_to_str[token_id] for token_id in ids)
    return re.sub(r'\s+([,.?!"()\'])', r'\1', spaced)

In [None]:
alltokens = sorted(list(set(preprocessed)))
str_to_int = {s:i for i,s in enumerate(alltokens)}
BT1 = BasicTokenizerV1(str_to_int)

In [None]:
ids = BT1.encode(raw_text)
text = BT1.decode(ids)

In [None]:
class BasicTokenizerV2:
  """Vocabulary-driven tokenizer that maps unknown tokens to ``<|unk|>``.

  Attributes:
    str_to_int: the vocabulary dict (token -> id) passed to the constructor.
    int_to_str: inverse mapping, id -> token.
  """

  def __init__(self, vocab):
    """Store ``vocab`` (dict token -> id) and derive its inverse."""
    self.str_to_int = vocab
    self.int_to_str = {token_id: token for token, token_id in vocab.items()}

  def encode(self, text):
    """Return the ids for ``text``; no end-of-text id is appended.

    The ``<|unk|>`` entry is looked up only when an unknown token is actually
    met, so a vocabulary without it still works for fully known text.
    """
    token_ids = []
    for piece in re.split(r'([,.?_!"()\']|--|\s)', text):
      if not piece.strip():
        continue
      if piece in self.str_to_int:
        token_ids.append(self.str_to_int[piece])
      else:
        token_ids.append(self.str_to_int["<|unk|>"])
    return token_ids

  def decode(self, ids):
    """Return the text for ``ids`` with no space in front of punctuation."""
    spaced = " ".join(self.int_to_str[token_id] for token_id in ids)
    return re.sub(r'\s+([,.?!"()\'])', r'\1', spaced)

In [None]:
# Case-sensitive vocabulary built from the pre-tokenized text.
alltokens = sorted(list(set(preprocessed)))
# The special tokens are appended after sorting, so they get the two highest ids.
alltokens.extend(["<|endoftext|>", "<|unk|>"])
str_to_int = {s:i for i,s in enumerate(alltokens)}
BT2 = BasicTokenizerV2(str_to_int)

In [None]:
text = "Hello World, is this ann?"
BT2.encode(text)

[1160, 1160, 5, 595, 1024, 1160, 10]

In [None]:
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."
text = " <|endoftext|> ".join((text1, text2))
print(text)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.


In [None]:
print(BT2.encode(text))

[1160, 5, 362, 1155, 642, 1000, 10, 1159, 57, 1013, 981, 1009, 738, 1013, 1160, 7]


In [None]:
tokenizer = tiktoken.get_encoding("gpt2")

In [None]:
ints = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print(ints)

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 286, 262, 20562, 13]


In [None]:
tokenizer.decode(ints)

'Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.'

In [None]:
raw_text[:100]

'I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no g'

In [None]:
enc_text = tokenizer.encode(raw_text)

In [None]:
len(enc_text)

5145

In [None]:
enc_sample = enc_text[50:]
dec_sample = tokenizer.decode(enc_sample)
dec_sample[:20]

' and established him'

In [None]:
# Build next-token prediction pairs with a sliding window over enc_sample.
context_size = 5  # number of ids in each input window
length = 25       # number of (input, target) pairs to generate
x_samples, y_samples = [], []

for i in range(length):
  # Input: context_size ids starting at i. Target: the single id right after
  # the window, kept as a one-element list because it is taken with a slice.
  x_sample, y_sample = enc_sample[i: i+context_size], enc_sample[i+context_size:i+context_size+1]
  x_samples.append(x_sample)
  y_samples.append(y_sample)

print(x_samples)
print(y_samples)

[[290, 4920, 2241, 287, 257], [4920, 2241, 287, 257, 4489], [2241, 287, 257, 4489, 64], [287, 257, 4489, 64, 319], [257, 4489, 64, 319, 262], [4489, 64, 319, 262, 34686], [64, 319, 262, 34686, 41976], [319, 262, 34686, 41976, 13], [262, 34686, 41976, 13, 357], [34686, 41976, 13, 357, 10915], [41976, 13, 357, 10915, 314], [13, 357, 10915, 314, 2138], [357, 10915, 314, 2138, 1807], [10915, 314, 2138, 1807, 340], [314, 2138, 1807, 340, 561], [2138, 1807, 340, 561, 423], [1807, 340, 561, 423, 587], [340, 561, 423, 587, 10598], [561, 423, 587, 10598, 393], [423, 587, 10598, 393, 28537], [587, 10598, 393, 28537, 2014], [10598, 393, 28537, 2014, 198], [393, 28537, 2014, 198, 198], [28537, 2014, 198, 198, 1], [2014, 198, 198, 1, 464]]
[[4489], [64], [319], [262], [34686], [41976], [13], [357], [10915], [314], [2138], [1807], [340], [561], [423], [587], [10598], [393], [28537], [2014], [198], [198], [1], [464], [6001]]


In [None]:
len(x_samples)

25

In [None]:
class GPTDatasetV1(Dataset):
  """Sliding-window dataset of (input, target) id chunks.

  Note: a class with this name is also defined earlier in the notebook; this
  later definition replaces it once the cell has run.

  Args:
    txt: the text to tokenize.
    tokenizer: object with an ``encode(txt)`` method returning a list of ids.
    max_length: number of ids in every chunk.
    stride: distance between the starts of consecutive windows.
  """

  def __init__(self, txt, tokenizer, max_length, stride):
    self.tokenizer = tokenizer
    token_ids = tokenizer.encode(txt)

    # One (input, target) pair per window; the target is the input window
    # shifted right by one id, so the last usable start leaves room for it.
    pairs = [
      (
        torch.tensor(token_ids[begin: begin + max_length]),
        torch.tensor(token_ids[begin + 1: begin + max_length + 1]),
      )
      for begin in range(0, len(token_ids) - max_length, stride)
    ]
    self.input_ids = [inputs for inputs, _ in pairs]
    self.target_ids = [targets for _, targets in pairs]

  def __len__(self):
    """Number of (input, target) pairs."""
    return len(self.input_ids)

  def __getitem__(self, idx):
    """Return the ``idx``-th (input chunk, target chunk) pair of tensors."""
    return self.input_ids[idx], self.target_ids[idx]

In [None]:
def create_dataloader(txt, batch_size: int=4, max_length: int=256, stride: int=128, shuffle: bool=True, drop_last: bool=True):
  """Tokenize ``txt`` with the GPT-2 encoding and wrap it in a DataLoader.

  Args:
    txt: the text to turn into training samples.
    batch_size: number of samples per batch.
    max_length: number of ids in each input/target chunk.
    stride: distance between the starts of consecutive chunks.
    shuffle: forwarded to ``DataLoader``.
    drop_last: forwarded to ``DataLoader``.

  Returns:
    A ``DataLoader`` over a ``GPTDatasetV1`` built from ``txt``.
  """
  gpt2_encoding = tiktoken.get_encoding('gpt2')
  windows = GPTDatasetV1(txt, gpt2_encoding, max_length, stride)
  return DataLoader(
    windows,
    batch_size=batch_size,
    shuffle=shuffle,
    drop_last=drop_last,
  )

In [None]:
dl = create_dataloader(raw_text, batch_size=1, max_length=4, stride=1, shuffle=False)

In [None]:
# Print only the first three (input, target) batches from the loader.
count = 0
for xb,yb in dl:
  print(xb, ": ", yb)
  count+=1
  # Stop after three batches instead of walking the whole dataset.
  if count >= 3:
    break

tensor([[  40,  367, 2885, 1464]]) :  tensor([[ 367, 2885, 1464, 1807]])
tensor([[ 367, 2885, 1464, 1807]]) :  tensor([[2885, 1464, 1807, 3619]])
tensor([[2885, 1464, 1807, 3619]]) :  tensor([[1464, 1807, 3619,  402]])


In [None]:
vocab_size, output_dim = 6, 3

In [None]:
embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [None]:
embedding_layer.weight

Parameter containing:
tensor([[-0.3746,  1.4903,  0.5155],
        [ 1.3757,  2.1852, -0.5437],
        [-1.2318, -0.6525, -0.3105],
        [-0.4463,  0.8960, -0.0458],
        [-0.9874, -0.7083, -0.6746],
        [-1.0927,  0.1884,  1.3215]], requires_grad=True)

In [None]:
output_dim = 256
vocab_size = 50257

In [None]:
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [None]:
max_length = 4

In [None]:
dl = create_dataloader(raw_text, batch_size=8, max_length=max_length, stride=max_length, shuffle=False)

In [None]:
inputs, targets = next(iter(dl))

In [None]:
inputs, targets

(tensor([[   40,   367,  2885,  1464],
         [ 1807,  3619,   402,   271],
         [10899,  2138,   257,  7026],
         [15632,   438,  2016,   257],
         [  922,  5891,  1576,   438],
         [  568,   340,   373,   645],
         [ 1049,  5975,   284,   502],
         [  284,  3285,   326,    11]]),
 tensor([[  367,  2885,  1464,  1807],
         [ 3619,   402,   271, 10899],
         [ 2138,   257,  7026, 15632],
         [  438,  2016,   257,   922],
         [ 5891,  1576,   438,   568],
         [  340,   373,   645,  1049],
         [ 5975,   284,   502,   284],
         [ 3285,   326,    11,   287]]))

In [None]:
print("Token IDs:\n", inputs)
print("\nInputs shape:\n", inputs.shape)

Token IDs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

Inputs shape:
 torch.Size([8, 4])


In [None]:
token_embeddings = token_embedding_layer(inputs)
token_embeddings.shape

torch.Size([8, 4, 256])

In [None]:
# Same sliding-window construction as the earlier cell, this time printed as
# decoded text so each context and its next token can be read directly.
context_size = 5  # number of ids in each input window
length = 25       # number of (input, target) pairs to generate
x_samples = []
y_samples = []

for i in range(length):
  x = enc_sample[i:i+context_size]
  # One-element slice (not a scalar) so it can be passed to decode() below.
  y = enc_sample[i+context_size: i+context_size+1]
  x_samples.append(x)
  y_samples.append(y)


for i in range(length):
  print(tokenizer.decode(x_samples[i]), ": ", tokenizer.decode(y_samples[i]))

 and established himself in a :   vill
 established himself in a vill :  a
 himself in a villa :   on
 in a villa on :   the
 a villa on the :   Riv
 villa on the Riv :  iera
a on the Riviera :  .
 on the Riviera. :   (
 the Riviera. ( :  Though
 Riviera. (Though :   I
iera. (Though I :   rather
. (Though I rather :   thought
 (Though I rather thought :   it
Though I rather thought it :   would
 I rather thought it would :   have
 rather thought it would have :   been
 thought it would have been :   Rome
 it would have been Rome :   or
 would have been Rome or :   Florence
 have been Rome or Florence :  .)
 been Rome or Florence.) :  

 Rome or Florence.)
 :  

 or Florence.)

 :  "
 Florence.)

" :  The
.)

"The :   height
