In [1]:
import pandas as pd
import numpy as np

In [2]:
with open('books.txt', 'r', encoding= 'UTF -8') as f:
    text = f.read()

print(f"length of dataset in characters: {len(text)}")

length of dataset in characters: 42238758


In [3]:
print(text[:1000])

The Project Gutenberg eBook of Birds, Illustrated by Color Photography, Vol. 1, No. 1 This ebook is for the use of anyone anywhere in the United States and most other parts of the world at no cost and with almost no restrictions whatsoever. You may copy it, give it away or re-use it under the terms of the Project Gutenberg License included with this ebook or online at www.gutenberg.org. If you are not located in the United States, you will have to check the laws of the country where you are located before using this eBook. Title: Birds, Illustrated by Color Photography, Vol. 1, No. 1 Author: Various Release date: October 9, 2009 [eBook #30221] Most recently updated: October 24, 2024 Language: English Credits: Produced by Chris Curnow, Joseph Cooper, Anne Storer, some images courtesy of The Internet Archive and the Online Distributed Proofreading Team at https://www.pgdp.net *** START OF THE PROJECT GUTENBERG EBOOK BIRDS, ILLUSTRATED BY COLOR PHOTOGRAPHY, VOL. 1, NO. 1 *** FROM: THE PR


In [4]:
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(f"vocabulary size: {vocab_size}")
print(f"vocabulary: {chars}")

vocabulary size: 156
vocabulary: ['\n', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '=', '>', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', '\\', ']', '^', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '~', '£', '§', '©', '°', '´', '·', '¹', 'º', '¼', '½', '¾', 'Á', 'Â', 'Ä', 'Æ', 'Ç', 'È', 'É', 'Ë', 'Î', 'Ó', 'Ô', 'Ö', '×', 'Ü', 'à', 'á', 'â', 'ã', 'ä', 'æ', 'ç', 'è', 'é', 'ê', 'ë', 'ì', 'í', 'î', 'ï', 'ñ', 'ò', 'ó', 'ô', 'ö', '÷', 'ù', 'ú', 'û', 'ü', 'ý', 'Œ', 'œ', '—', '‘', '’', '“', '”', '•', '′', '™', '\ufeff']


In [5]:
stoi = {ch:i for i,ch in enumerate(chars)}
itos = {i:ch for i,ch in enumerate(chars)}
encode = lambda s: [stoi[c] for c in s]
decode = lambda i: ''.join(itos[l] for l in i)

print(encode('hello there'))
print(decode(encode('hello there')))

[71, 68, 75, 75, 78, 1, 83, 71, 68, 81, 68]
hello there


In [7]:
import torch 
data = torch.tensor(encode(text), dtype=torch.long)
print(data[:100])
print(data.shape)

tensor([155,  51,  71,  68,   1,  47,  81,  78,  73,  68,  66,  83,   1,  38,
         84,  83,  68,  77,  65,  68,  81,  70,   1,  68,  33,  78,  78,  74,
          1,  78,  69,   1,  33,  72,  81,  67,  82,  13,   1,  40,  75,  75,
         84,  82,  83,  81,  64,  83,  68,  67,   1,  65,  88,   1,  34,  78,
         75,  78,  81,   1,  47,  71,  78,  83,  78,  70,  81,  64,  79,  71,
         88,  13,   1,  53,  78,  75,  15,   1,  18,  13,   1,  45,  78,  15,
          1,  18,   1,  51,  71,  72,  82,   1,  68,  65,  78,  78,  74,   1,
         72,  82])
torch.Size([42238758])


In [9]:
n = int(0.9*len(text))
train_data = data[:n]
val_data = data[n:]
print(train_data.shape, val_data.shape)

torch.Size([38014882]) 4223876


In [10]:
block_size = 8
train_data[:block_size+1]


tensor([155,  51,  71,  68,   1,  47,  81,  78,  73])

In [11]:
x_train = train_data[:block_size+1]
y_train = train_data[1:block_size+1]
for t in range(block_size):
    context = x_train[:t+1]
    target = y_train[t]
    print(f"when input is {context} target is: {target}")

when input is tensor([155]) target is: 51
when input is tensor([155,  51]) target is: 71
when input is tensor([155,  51,  71]) target is: 68
when input is tensor([155,  51,  71,  68]) target is: 1
when input is tensor([155,  51,  71,  68,   1]) target is: 47
when input is tensor([155,  51,  71,  68,   1,  47]) target is: 81
when input is tensor([155,  51,  71,  68,   1,  47,  81]) target is: 78
when input is tensor([155,  51,  71,  68,   1,  47,  81,  78]) target is: 73


In [15]:
torch.manual_seed(1337)
batch_size = 4 #how many independent sequences we process in parallel
block_size = 8 #maximum context length for predictions

def get_batch(split):
    #generate a small batch of data of inputs x and y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data)-block_size, (batch_size,))
    print(ix.shape, len(data)-block_size, batch_size)
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+1+block_size] for i in ix])
    return x, y

xb, yb = get_batch('train')
print('inputs:')
print(xb.shape)
print(xb)
print('targets:')
print(yb.shape)
print(yb)


torch.Size([4]) 38014874 4
inputs:
torch.Size([4, 8])
tensor([[28,  1, 78, 65, 82, 68, 81, 85],
        [86, 77,  1, 64, 66, 66, 78, 81],
        [ 1, 77, 78, 83, 72, 69, 72, 68],
        [62,  1, 74, 77, 78, 86,  1, 83]])
targets:
torch.Size([4, 8])
tensor([[ 1, 78, 65, 82, 68, 81, 85, 68],
        [77,  1, 64, 66, 66, 78, 81, 67],
        [77, 78, 83, 72, 69, 72, 68, 67],
        [ 1, 74, 77, 78, 86,  1, 83, 71]])


In [23]:
df = pd.read_csv('gutenberg_data_with_text.csv', nrows=10)
df = df.drop(labels=['Link'], axis=1)
df

Unnamed: 0,Title,Author,Bookshelf,Text
0,"Birds, Illustrated","Color Photography, Vol. 1, No. 1 Various",Animal,"﻿The Project Gutenberg eBook of Birds, Illustr..."
1,Artistic Anatomy of Animals,Édouard Cuyer,Animal,﻿The Project Gutenberg eBook of Artistic Anato...
2,What Bird is That?,Frank M. Chapman,Animal,﻿The Project Gutenberg eBook of What Bird is T...
3,Fox Trapping: A Book of Instruction Telling Ho...,,Animal,﻿The Project Gutenberg eBook of Fox Trapping: ...
4,On Snake-Poison: Its Action and Its Antidote,A. Mueller,Animal,﻿The Project Gutenberg eBook of On Snake-Poiso...
5,Deadfalls and Snares,A. R. Harding,Animal,﻿The Project Gutenberg eBook of Deadfalls and ...
6,A Guide for the Study of Animals,"Lucas, Shinn, Smallwood, and Whitney",Animal,﻿The Project Gutenberg eBook of A Guide for th...
7,Fifty Years a Hunter and Trapper,E. N. Woodcock,Animal,﻿The Project Gutenberg eBook of Fifty Years a ...
8,Our Vanishing Wild Life: Its Extermination and...,William T. Hornaday,Animal,﻿The Project Gutenberg eBook of Our Vanishing ...
9,The Extermination of the American Bison,William T. Hornaday,Animal,﻿The Project Gutenberg eBook of The Exterminat...


In [26]:
df['encoder_input'] = df['Title'] + " " + df['Author'] + " " + df['Bookshelf']
df

Unnamed: 0,Title,Author,Bookshelf,Text,encoder_input
0,"Birds, Illustrated","Color Photography, Vol. 1, No. 1 Various",Animal,"﻿The Project Gutenberg eBook of Birds, Illustr...","Birds, Illustrated Color Photography, Vol. 1, ..."
1,Artistic Anatomy of Animals,Édouard Cuyer,Animal,﻿The Project Gutenberg eBook of Artistic Anato...,Artistic Anatomy of Animals Édouard Cuyer Animal
2,What Bird is That?,Frank M. Chapman,Animal,﻿The Project Gutenberg eBook of What Bird is T...,What Bird is That? Frank M. Chapman Animal
3,Fox Trapping: A Book of Instruction Telling Ho...,,Animal,﻿The Project Gutenberg eBook of Fox Trapping: ...,
4,On Snake-Poison: Its Action and Its Antidote,A. Mueller,Animal,﻿The Project Gutenberg eBook of On Snake-Poiso...,On Snake-Poison: Its Action and Its Antidote A...
5,Deadfalls and Snares,A. R. Harding,Animal,﻿The Project Gutenberg eBook of Deadfalls and ...,Deadfalls and Snares A. R. Harding Animal
6,A Guide for the Study of Animals,"Lucas, Shinn, Smallwood, and Whitney",Animal,﻿The Project Gutenberg eBook of A Guide for th...,"A Guide for the Study of Animals Lucas, Shinn,..."
7,Fifty Years a Hunter and Trapper,E. N. Woodcock,Animal,﻿The Project Gutenberg eBook of Fifty Years a ...,Fifty Years a Hunter and Trapper E. N. Woodcoc...
8,Our Vanishing Wild Life: Its Extermination and...,William T. Hornaday,Animal,﻿The Project Gutenberg eBook of Our Vanishing ...,Our Vanishing Wild Life: Its Extermination and...
9,The Extermination of the American Bison,William T. Hornaday,Animal,﻿The Project Gutenberg eBook of The Exterminat...,The Extermination of the American Bison Willia...


In [31]:
# Ensure all values are converted to strings before joining
chars = sorted(list(set(' '.join(df['encoder_input'].astype(str).values) + ' '.join(df['Text'].astype(str).values))))
len(chars)

[' ',
 '!',
 '"',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 ':',
 ';',
 '=',
 '?',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'X',
 'Y',
 'Z',
 '[',
 ']',
 '^',
 '_',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 '|',
 '£',
 '§',
 '°',
 '´',
 '·',
 '¹',
 'Æ',
 'È',
 'É',
 '×',
 'â',
 'ä',
 'æ',
 'è',
 'é',
 'ê',
 'ë',
 'í',
 'î',
 'ñ',
 'ó',
 'ô',
 'ö',
 'ú',
 'ü',
 '—',
 '‘',
 '’',
 '“',
 '”',
 '•',
 '™',
 '\ufeff']

In [33]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.nn import functional as F

#hyperparameters
batch_size = 32
block_size = 8
max_iters = 3000
eval_interval = 300
learning_rate = 1e-4
n_embd = 8
device = 'mps' if torch.backends.mps.is_available() else 'cpu'

print(device)
eval_iters = 200
n_head = 8
n_layer = 6
dropout = 0.2
torch.manual_seed(1337)


df = pd.read_csv('gutenberg_data_with_text.csv', nrows=100)
df = df.drop(labels=['Link'], axis=1)
df['encoder_input'] = df['Title'] + " " + df['Author'] + " " + df['Bookshelf']
chars = sorted(list(set(' '.join(df['encoder_input'].astype(str).values) + ' '.join(df['Text'].astype(str).values))))
vocab_size = len(chars)
print(vocab_size)
#encoder decoder mapping
stoi = {ch:i for i,ch in enumerate(chars)}
itos = {i:ch for i,ch in enumerate(chars)}
encode = lambda s: [stoi[c] for c in s]
decode = lambda i: ''.join(itos[l] for l in i)
 


mps
155


In [57]:
from transformers import GPT2Tokenizer

# Load GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Tokenize and decode
tokens = tokenizer.tokenize("This is a test.")
print(tokens)
print(tokenizer.encode(tokens))
print(tokenizer.encode(tokenizer.decode(tokenizer.encode("This is a test."))))




['This', 'Ġis', 'Ġa', 'Ġtest', '.']
[1212, 318, 257, 1332, 13]
[1212, 318, 257, 1332, 13]


In [42]:
df = df[~df['encoder_input'].isna()]
df['encoder_input']

0     Birds, Illustrated Color Photography, Vol. 1, ...
1      Artistic Anatomy of Animals Édouard Cuyer Animal
2            What Bird is That? Frank M. Chapman Animal
4     On Snake-Poison: Its Action and Its Antidote A...
5             Deadfalls and Snares A. R. Harding Animal
                            ...                        
95    Off to the Wilds: Being the Adventures of Two ...
96    The Hunters' Feast: Conversations Around the C...
97    The Giraffe Hunters Mayne Reid Animals-Wild-Tr...
98    Dead Man's Land George Manville Fenn Animals-W...
99    Birds, Illustrated Color Photography, Vol. 1, ...
Name: encoder_input, Length: 98, dtype: object

In [34]:
stoi["5"]

21

In [45]:
# Convert the data into tokenized format
train_data = []
for idx, row in df.iterrows():
    
    if type(row['encoder_input']) == float:
        print(row['encoder_input'])
    encoder_input = encode(row['encoder_input'])
    decoder_input = encode(row['Text'])
    train_data.append((torch.tensor(encoder_input), torch.tensor(decoder_input)))

def get_batch():
    # Randomly pick a sample
    ix = torch.randint(len(train_data), (batch_size,))
    x = torch.stack([train_data[i][0] for i in ix]).to(device)
    y = torch.stack([train_data[i][1] for i in ix]).to(device)
    return x, y

In [54]:
type(train_data)

list

In [24]:
df['Text']

0    ﻿The Project Gutenberg eBook of Birds, Illustr...
1    ﻿The Project Gutenberg eBook of Artistic Anato...
2    ﻿The Project Gutenberg eBook of What Bird is T...
3    ﻿The Project Gutenberg eBook of Fox Trapping: ...
4    ﻿The Project Gutenberg eBook of On Snake-Poiso...
5    ﻿The Project Gutenberg eBook of Deadfalls and ...
6    ﻿The Project Gutenberg eBook of A Guide for th...
7    ﻿The Project Gutenberg eBook of Fifty Years a ...
8    ﻿The Project Gutenberg eBook of Our Vanishing ...
9    ﻿The Project Gutenberg eBook of The Exterminat...
Name: Text, dtype: object

In [None]:

print('-----')

for b in range(batch_size):
    for t in range(block_size):
        context = xb[b, :t+1]
        target = yb[b,t]
        print(f"when context is {context.tolist()} target is {target}")



In [13]:
print(xb)
print(yb)

tensor([[28,  1, 78, 65, 82, 68, 81, 85],
        [86, 77,  1, 64, 66, 66, 78, 81],
        [ 1, 77, 78, 83, 72, 69, 72, 68],
        [62,  1, 74, 77, 78, 86,  1, 83]])
tensor([[ 1, 78, 65, 82, 68, 81, 85, 68],
        [77,  1, 64, 66, 66, 78, 81, 67],
        [77, 78, 83, 72, 69, 72, 68, 67],
        [ 1, 74, 77, 78, 86,  1, 83, 71]])


In [14]:
import torch
import torch.nn as nn
from torch.nn import functional as F

torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):

    def __init__(self,vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size,vocab_size)

    def forward(self, idx, targets=None):
        #idx and targets are both tensors of size B,T
        logits = self.token_embedding_table(idx) # (B,T,C)
        if targets is None:
            loss = None
        else:    
            B,T,C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits,targets)
        return logits, loss
        

    def generate(self, idx, max_new_tokens):
        #idx is (B,T) array of indices in current context
        for _ in range(max_new_tokens):
            #get the predictions
            logits, loss = self(idx)
            #focus only on the last time step
            logits = logits[:, -1, :] #becomes (B,C)
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples = 1)
            # append sampled index to running sequence
            idx = torch.cat((idx,idx_next), dim = 1) # (B, T+1)
        return idx

m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)

print(logits.shape)
print(loss)

idx = torch.zeros((1,1), dtype=torch.long)
print(decode(m.generate(idx = torch.zeros((1,1), dtype=torch.long), max_new_tokens=100)[0].tolist()))

torch.Size([32, 156])
tensor(5.6915, grad_fn=<NllLossBackward0>)

_w¹àC8úAyÈÇ{á|95“É z§gÈ|~{è+îtsºnN?é﻿jJ`öbÇcÇç2kCJŒ÷Hx op99vXx7~h¹ÁºLgV\j′G3"a™A½löïjayjÇJUY.èÜ\üèŒ×


In [15]:
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)
batch_size = 32
for steps in range(10000):
    xb,yb = get_batch('train')
    logits, loss = m(xb,yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print(loss.item())

2.479205369949341


In [16]:
print(decode(m.generate(idx = torch.zeros((1,1), dtype=torch.long), max_new_tokens=100)[0].tolist()))


y p fýÓÔûnnd th, bony ng; totathas cke r oulenghis, therakis acod se orsmeru a wan m. s anoug o sigh


In [17]:
torch.manual_seed(1337)

B,T,C = 4,8,2 # batch, time, channels
x = torch.randn(B,T,C)
x.shape

torch.Size([4, 8, 2])

In [18]:
xbow = torch.zeros((B,T,C))
for b in range(B):
    for t in range(T):
        xprev = x[b,:t+1]
        xbow[b,t] = torch.mean(xprev, 0)


In [19]:
x[0]

tensor([[ 0.1808, -0.0700],
        [-0.3596, -0.9152],
        [ 0.6258,  0.0255],
        [ 0.9545,  0.0643],
        [ 0.3612,  1.1679],
        [-1.3499, -0.5102],
        [ 0.2360, -0.2398],
        [-0.9211,  1.5433]])

In [20]:
xbow[0]

tensor([[ 0.1808, -0.0700],
        [-0.0894, -0.4926],
        [ 0.1490, -0.3199],
        [ 0.3504, -0.2238],
        [ 0.3525,  0.0545],
        [ 0.0688, -0.0396],
        [ 0.0927, -0.0682],
        [-0.0341,  0.1332]])

In [21]:
wei = torch.tril(torch.ones(T,T))
wei = wei / wei.sum(1,keepdim=True)
xbow2 = wei @ x
xbow2[0]


tensor([[ 0.1808, -0.0700],
        [-0.0894, -0.4926],
        [ 0.1490, -0.3199],
        [ 0.3504, -0.2238],
        [ 0.3525,  0.0545],
        [ 0.0688, -0.0396],
        [ 0.0927, -0.0682],
        [-0.0341,  0.1332]])

In [22]:
xbow2[0], xbow[0]

(tensor([[ 0.1808, -0.0700],
         [-0.0894, -0.4926],
         [ 0.1490, -0.3199],
         [ 0.3504, -0.2238],
         [ 0.3525,  0.0545],
         [ 0.0688, -0.0396],
         [ 0.0927, -0.0682],
         [-0.0341,  0.1332]]),
 tensor([[ 0.1808, -0.0700],
         [-0.0894, -0.4926],
         [ 0.1490, -0.3199],
         [ 0.3504, -0.2238],
         [ 0.3525,  0.0545],
         [ 0.0688, -0.0396],
         [ 0.0927, -0.0682],
         [-0.0341,  0.1332]]))

In [23]:
tril = torch.tril(torch.ones(T,T))
wei = torch.zeros(T,T)
wei = wei.masked_fill(tril==0, float('-inf'))
wei = F.softmax(wei, dim=-1)
xbow3 = wei @ x
torch.allclose(xbow, xbow3)

True

In [24]:
#hyperparameters
batch_size = 32
block_size = 8
max_iters = 3000
eval_interval = 300
learning_rate = 1e-3
n_embd = 32
#device = 'mps' if torch.backends.mps.is_available() else 'cpu'

eval_iters = 200

In [26]:
@torch.no_grad
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X,Y = get_batch(split)
            logits, loss = model(X,Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train
    return out

In [28]:
model = BigramLanguageModel(vocab_size)

optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)


for iter in range(max_iters):
    if iter % eval_interval == 0:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    xb, yb = get_batch('train')
    logits, loss = model(xb,yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

step 0: train loss 5.6474, val loss 5.6570
step 300: train loss 5.6523, val loss 5.6556
step 600: train loss 5.6542, val loss 5.6443
step 900: train loss 5.6516, val loss 5.6531
step 1200: train loss 5.6494, val loss 5.6565
step 1500: train loss 5.6599, val loss 5.6495
step 1800: train loss 5.6552, val loss 5.6548
step 2100: train loss 5.6571, val loss 5.6469
step 2400: train loss 5.6493, val loss 5.6446
step 2700: train loss 5.6613, val loss 5.6500
