# GPT from Scratch

## Imports

In [1]:
import tiktoken
import torch

## The Data

### Download Data

In [2]:
# !curl https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt > "shakespeare.txt"
# !curl https://www.gutenberg.org/cache/epub/174/pg174.txt > "dorian-gray.txt"
# !curl https://www.gutenberg.org/cache/epub/64317/pg64317.txt > "gatsby.txt"
# !curl https://www.gutenberg.org/ebooks/43.txt.utf-8 > "jekyll.txt"
# !curl https://www.gutenberg.org/ebooks/76101.txt.utf-8 > "theory-of-earth.txt"

In [3]:
# Read the whole Shakespeare corpus into memory as one string.
with open("shakespeare.txt", mode="r", encoding="utf-8") as corpus_file:
    bard_text = corpus_file.read()

f"Number of characters: {len(bard_text)}"

'Number of characters: 1115394'

In [4]:
# with open("dorian-gray.txt", "r", encoding="utf-8") as f:
#     dorian_text = f.read()

# f"Number of characters: {len(dorian_text)}"

In [5]:
# with open("gatsby.txt", "r", encoding="utf-8") as f:
#     gatsby_text = f.read()

# f"Number of characters: {len(gatsby_text)}"

### Vocabulary and Tokenisation

In [6]:
def get_vocab(text):
    """
    Return the distinct characters of `text` as a sorted list.
    """
    unique_chars = set(text)
    return sorted(unique_chars)

def char_to_idx(chars=None, text=None):
    """
    Build a character -> integer index lookup table.

    chars: iterable of characters; each one is mapped to its position.
           If a character repeats, its last position wins.
    text:  only used when `chars` is None; the characters are then taken
           from get_vocab(text).

    Returns a dict. Raises TypeError when neither argument is given.
    """
    if chars is None:
        if text is None:
            # Fail here with a clear message rather than deep inside get_vocab.
            raise TypeError("char_to_idx() needs either `chars` or `text`")
        chars = get_vocab(text)
    return {ch: i for i, ch in enumerate(chars)}

def idx_to_char(chars=None, text=None):
    """
    Build an integer index -> character lookup table.

    chars: iterable of characters; position i maps to the i-th character.
    text:  only used when `chars` is None; the characters are then taken
           from get_vocab(text).

    Returns a dict. Raises TypeError when neither argument is given.
    """
    if chars is None:
        if text is None:
            # Fail here with a clear message rather than deep inside get_vocab.
            raise TypeError("idx_to_char() needs either `chars` or `text`")
        chars = get_vocab(text)
    return dict(enumerate(chars))

class TextManager:
    """
    Hold a text's character vocabulary and convert between strings and token ids.

    enc_method:
        "tiktoken"    -> encode/decode with tiktoken's "gpt2" encoding.
        anything else -> character-level lookup tables built from `text`.

    Attributes:
        vocab       sorted list of the distinct characters in `text`
                    (always character-level, regardless of enc_method).
        vocab_size  size of the id space used by encode(): len(vocab) for
                    character-level encoding, the tokenizer's n_vocab for
                    "tiktoken".
        c_to_i      character -> index dict.
        i_to_c      index -> character dict.
        enc         the tiktoken encoding object, or None.
    """

    def __init__(self, text, enc_method="simple"):
        self.vocab = get_vocab(text)
        self.c_to_i = char_to_idx(self.vocab)
        self.i_to_c = idx_to_char(self.vocab)
        self.enc_method = enc_method

        if enc_method == "tiktoken":
            self.enc = tiktoken.get_encoding("gpt2")
            # encode() emits BPE token ids in this mode, so the size has to
            # come from the tokenizer, not from the character count of `text`.
            self.vocab_size = self.enc.n_vocab
        else:
            self.enc = None
            self.vocab_size = len(self.vocab)

    def __str__(self):
        # Describes the character vocabulary listed below, hence len(self.vocab)
        # rather than vocab_size (they differ in "tiktoken" mode).
        return f"""Vocabulary (size = {len(self.vocab)}):
        {"".join(self.vocab)}
        """

    def get_vocab(self, as_str=False):
        """
        Return the character vocabulary: a list by default, or one
        concatenated string when `as_str` is true.
        """
        if as_str:
            return "".join(self.vocab)
        return self.vocab

    def encode(self, text):
        """
        Take a string, output a list of integers.

        In character-level mode a character that is not in the vocabulary
        raises KeyError.
        """
        if self.enc_method == "tiktoken":
            return self.enc.encode(text)
        return [self.c_to_i[c] for c in text]

    def decode(self, indices):
        """
        Take a list of integers, output a string.

        In character-level mode an index that is not in the vocabulary
        raises KeyError.
        """
        if self.enc_method == "tiktoken":
            return self.enc.decode(indices)
        return "".join([self.i_to_c[idx] for idx in indices])

In [7]:
# Manager for the Shakespeare text, using the default enc_method.
tm1 = TextManager(text=bard_text)
print(tm1)

Vocabulary (size = 65):
        
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
        


### Creating Dataset

In [8]:
# Encode the full corpus and keep the ids as an int64 tensor.
data = torch.tensor(tm1.encode(bard_text), dtype=torch.int64)
print(data.size(), data.dtype)
print(data[:100])

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])


In [9]:
# Hold out the trailing `test_size` fraction of the tokens for validation;
# everything before the cut-off index `n` is training data.
test_size = 0.1
n = int((1 - test_size) * len(data))
train_data, val_data = data[:n], data[n:]

In [21]:
# Sanity check: size of the training split.
train_data.shape

torch.Size([1003854])

In [22]:
# Sanity check: size of the validation split.
val_data.shape

torch.Size([111540])