In [1]:
from datasets import load_dataset

In [2]:
# Load the "train" split of the 'andjela-r/mlm-harry-potter' dataset via `load_dataset`.
data = load_dataset('andjela-r/mlm-harry-potter', split = "train")

In [3]:
# Peek at the first record's text. Indexing the row first fetches a single
# record instead of reading the whole "text" column just to show one entry.
data[0]["text"]

"Harry Potter and the Sorcerer's Stone"

In [4]:
# Path of the plain-text file that the dataset's "text" column is dumped to.
output_file = "input.txt"

In [5]:
# Dump every entry of the "text" column to `output_file`, one entry per line.
with open(output_file, "w", encoding='utf-8') as file:
    file.writelines(line + "\n" for line in data["text"])


In [6]:
# Read the corpus back as one string. Reuse `output_file` rather than repeating
# the literal path, so the write and read locations cannot drift apart.
with open(output_file, 'r', encoding='utf-8') as f:
    text = f.read()

In [7]:
# Report the corpus size in characters (message typo "characers" corrected).
print("length of dataset in characters: ", len(text))

length of dataset in characers:  6235521


In [8]:
# Preview the first 1000 characters of the corpus.
print(text[:1000])

Harry Potter and the Sorcerer's Stone
CHAPTER ONE
THE BOY WHO LIVED
Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much. They were the last people you'd expect to be involved in anything strange or mysterious, because they just didn't hold with such nonsense.
Mr. Dursley was the director of a firm called Grunnings, which made drills. He was a big, beefy man with hardly any neck, although he did have a very large mustache. Mrs. Dursley was thin and blonde and had nearly twice the usual amount of neck, which came in very useful as she spent so much of her time craning over garden fences, spying on the neighbors. The Dursleys had a small son called Dudley and in their opinion there was no finer boy anywhere.
The Dursleys had everything they wanted, but they also had a secret, and their greatest fear was that somebody would discover it. They didn't think they could bear it if anyone found out about the Potters. Mrs. Pot

In [9]:
# Vocabulary: every distinct character that occurs in the corpus, in sorted order.
chars = sorted(set(text))
vocab_size = len(chars)
# Show the vocabulary on one line and its size on the next.
print(''.join(chars), vocab_size, sep="\n")


 !"#$%&'()*,-./0123456789:;=>?ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{}~­éü
100


In [10]:
# Character-level tokenizer: each distinct character maps to its index in `chars`.
itos = dict(enumerate(chars))
stoi = {ch: i for i, ch in itos.items()}


def encode(s):
    """Turn a string into a list of integer ids, one per character."""
    return [stoi[c] for c in s]


def decode(l):
    """Turn a sequence of integer ids back into the string they spell."""
    return ''.join(itos[i] for i in l)

In [11]:
# Sanity-check the tokenizer: print the ids for one sample string, then
# round-trip a sample through encode -> decode.
# NOTE(review): the two samples differ ("hii there" vs "hi there") — confirm this is intended.
print(encode("hii there"))
print(decode(encode("hi there")))

[71, 72, 72, 2, 83, 71, 68, 81, 68]
hi there


In [12]:
import torch

# Encode the whole corpus as a single tensor of int64 character ids.
# NOTE(review): this rebinds `data`, which earlier held the loaded dataset.
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:1000])  # first 1000 ids

torch.Size([6235521]) torch.int64
tensor([39, 64, 81, 81, 88,  2, 47, 78, 83, 83, 68, 81,  2, 64, 77, 67,  2, 83,
        71, 68,  2, 50, 78, 81, 66, 68, 81, 68, 81,  9, 82,  2, 50, 83, 78, 77,
        68,  0, 34, 39, 32, 47, 51, 36, 49,  2, 46, 45, 36,  0, 51, 39, 36,  2,
        33, 46, 56,  2, 54, 39, 46,  2, 43, 40, 53, 36, 35,  0, 44, 81, 15,  2,
        64, 77, 67,  2, 44, 81, 82, 15,  2, 35, 84, 81, 82, 75, 68, 88, 13,  2,
        78, 69,  2, 77, 84, 76, 65, 68, 81,  2, 69, 78, 84, 81, 13,  2, 47, 81,
        72, 85, 68, 83,  2, 35, 81, 72, 85, 68, 13,  2, 86, 68, 81, 68,  2, 79,
        81, 78, 84, 67,  2, 83, 78,  2, 82, 64, 88,  2, 83, 71, 64, 83,  2, 83,
        71, 68, 88,  2, 86, 68, 81, 68,  2, 79, 68, 81, 69, 68, 66, 83, 75, 88,
         2, 77, 78, 81, 76, 64, 75, 13,  2, 83, 71, 64, 77, 74,  2, 88, 78, 84,
         2, 85, 68, 81, 88,  2, 76, 84, 66, 71, 15,  2, 51, 71, 68, 88,  2, 86,
        68, 81, 68,  2, 83, 71, 68,  2, 75, 64, 82, 83,  2, 79, 68, 78, 79, 75,
      

In [13]:
# Hold out the final 10% of the token stream for validation; the first 90% trains.
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

In [14]:
# Context length: the number of tokens in one training window.
block_size = 8
# Show one window plus one extra token (block_size + 1 ids in total).
train_data[:block_size+1]

tensor([39, 64, 81, 81, 88,  2, 47, 78, 83])

In [15]:
# The targets are the inputs shifted one position to the left, so a single
# window of block_size tokens yields block_size (context, target) examples.
x, y = train_data[:block_size], train_data[1:block_size+1]
for t in range(block_size):
    context, target = x[:t+1], y[t]
    print(f"when input is {context} the target is {target}")

when input is tensor([39]) the target is 64
when input is tensor([39, 64]) the target is 81
when input is tensor([39, 64, 81]) the target is 81
when input is tensor([39, 64, 81, 81]) the target is 88
when input is tensor([39, 64, 81, 81, 88]) the target is 2
when input is tensor([39, 64, 81, 81, 88,  2]) the target is 47
when input is tensor([39, 64, 81, 81, 88,  2, 47]) the target is 78
when input is tensor([39, 64, 81, 81, 88,  2, 47, 78]) the target is 83


In [16]:
torch.manual_seed(1337)  # fix the RNG seed so the sampled batch is repeatable
batch_size = 4  # number of sequences drawn per batch
block_size = 8  # number of tokens in each sequence

def get_batch(split):
    """Draw a random batch of input windows and their next-token targets.

    Args:
        split: 'train' selects `train_data`; any other value selects `val_data`.

    Returns:
        A pair `(x, y)`. Each stacks `batch_size` windows of `block_size`
        tokens; every window in `y` is the matching window in `x` shifted
        one position to the right in the source data.
    """
    source = val_data if split != 'train' else train_data
    # Random start offsets, chosen so the shifted target window stays in range.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs = torch.stack([source[s:s + block_size] for s in starts])
    targets = torch.stack([source[s + 1:s + block_size + 1] for s in starts])
    return inputs, targets

# Sample one training batch and show both tensors with their shapes.
xb, yb = get_batch('train')
for heading, batch in (('inputs:', xb), ('targets:', yb)):
    print(heading)
    print(batch.shape)
    print(batch)

print("----------")

# Spell out every (context, target) pair packed into the batch.
for b in range(batch_size):
    for t in range(block_size):
        context, target = xb[b, :t+1], yb[b,t]
        print(f"when input is {context.tolist()} the target: {target}")

inputs:
torch.Size([4, 8])
tensor([[69,  2, 88, 78, 84, 15,  4,  0],
        [65, 75, 68,  2, 69, 84, 77, 70],
        [75, 75,  2, 72, 77, 83, 78,  2],
        [ 2, 67, 72, 67, 77, 95, 83,  2]])
targets:
torch.Size([4, 8])
tensor([[ 2, 88, 78, 84, 15,  4,  0,  4],
        [75, 68,  2, 69, 84, 77, 70, 72],
        [75,  2, 72, 77, 83, 78,  2, 72],
        [67, 72, 67, 77, 95, 83,  2, 67]])
----------
when input is [69] the target: 2
when input is [69, 2] the target: 88
when input is [69, 2, 88] the target: 78
when input is [69, 2, 88, 78] the target: 84
when input is [69, 2, 88, 78, 84] the target: 15
when input is [69, 2, 88, 78, 84, 15] the target: 4
when input is [69, 2, 88, 78, 84, 15, 4] the target: 0
when input is [69, 2, 88, 78, 84, 15, 4, 0] the target: 4
when input is [65] the target: 75
when input is [65, 75] the target: 68
when input is [65, 75, 68] the target: 2
when input is [65, 75, 68, 2] the target: 69
when input is [65, 75, 68, 2, 69] the target: 84
when input is [65, 

In [17]:
import torch
import torch.nn as nn
from torch.nn import functional as F
# Re-seed the RNG so the random operations that follow in this cell are repeatable.
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    """Bigram language model backed by a single lookup table.

    Row ``i`` of ``token_embedding_table`` holds the logits over the next
    token given that the current token id is ``i``.
    """

    def __init__(self, vocab_size):
        """Create the ``vocab_size x vocab_size`` logit table."""
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, optionally, the cross-entropy loss.

        Args:
            idx: integer tensor of token ids, shape (B, T).
            targets: optional integer tensor of the same shape as ``idx``.

        Returns:
            ``(logits, loss)``. Without ``targets`` the logits keep shape
            (B, T, C) and ``loss`` is ``None``. With ``targets`` the logits
            are returned flattened to (B*T, C), the layout
            ``F.cross_entropy`` needs, and ``loss`` is a scalar tensor.
        """
        logits = self.token_embedding_table(idx)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    @torch.no_grad()  # sampling never backpropagates; skip building autograd graphs
    def generate(self, idx, max_new_tokens):
        """Extend ``idx`` by sampling ``max_new_tokens`` tokens autoregressively.

        Args:
            idx: integer tensor of token ids, shape (B, T), used as the prompt.
            max_new_tokens: number of tokens to append to every sequence.

        Returns:
            Integer tensor of shape (B, T + max_new_tokens).
        """
        for _ in range(max_new_tokens):
            logits, _loss = self(idx)
            # Only the last position's logits decide the next token.
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx
    

# Build the model and run one forward pass on the sample batch.
m = BigramLanguageModel(vocab_size)
out, loss = m(xb, yb)
print(out.shape, loss, sep="\n")

# Sample 100 tokens from the untrained model, starting from token id 0.
start = torch.zeros((1, 1), dtype=torch.long)
generated = m.generate(start, max_new_tokens=100)[0]
print(decode(generated.tolist()))

torch.Size([32, 100])
tensor(5.0269, grad_fn=<NllLossBackward0>)

sP2g­6c/M!xQ%-%*HRo/~kz­0-é(~K­./DYmp
O$7sZXgo`
KdaT(QEaQMYétU6dotARt&o;éP/gDo7ték6ZUa/&P&os2%


In [18]:
# AdamW over all of the model's parameters with a learning rate of 1e-3.
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [19]:
batch_size = 32  # get_batch reads this global, so the batches below hold 32 sequences
for steps in range(50000):
    # Forward pass on a freshly sampled training batch.
    xb,yb = get_batch('train')
    logits,loss = m(xb,yb)
    # Clear old gradients, backpropagate, then apply one optimizer step.
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print(loss.item())  # loss of the final batch only

2.5120654106140137


In [20]:
# Sample 500 tokens from the trained model, starting from token id 0.
prompt = torch.zeros((1, 1), dtype=torch.long)
sampled = m.generate(prompt, max_new_tokens=500)[0]
print(decode(sampled.tolist()))


's, tifowad ay and f s a kesap de;
"
"So The tainen warit, t t ckevotou b. a he willexppid BBee s la fisthak w med od; w. ofee. he I'sin ch wasn ariaweapoolos s ty. l, Eat twoome yone. t. te ate.
ste tothe
"
"I cto ar ly?" thengofeofonineelathat g ty g Itothy?" histh, rk!" d Daceusg ly ff ­­ay "GHack terinklerred hes pin Har ng wn, Hooay wh, Gofanedo st sf ble him t?"­ marin minery Whaite larurreret tino sit. alailyf bokeaicchof ile vesarus menderemin,"
Lansond coremerlyoumok hemala ds t as Har 
