## Downloading data

In [1]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2023-10-24 12:52:20--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: 'input.txt'


2023-10-24 12:52:21 (2.84 MB/s) - 'input.txt' saved [1115394/1115394]



In [2]:
# Read the whole corpus into memory. The encoding is pinned explicitly so the
# decoded text does not depend on the platform's default locale encoding.
with open("input.txt", "r", encoding="utf-8") as f:
    text = f.read()
len(text)

1115394

In [4]:
print(text[:100])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


## Preparing training and test datasets and dataloaders

In [15]:
class Vocab:
    """Character-level vocabulary with an explicit unknown token.

    Attributes:
        vocab: sorted list of tokens: every distinct character of the text
            passed to the constructor, plus the "<unk>" token.
        char2idx: mapping from each token to its index in ``vocab``.
    """

    def __init__(self, text: str):
        """Build the vocabulary from the distinct characters of ``text``."""
        # Sorting makes the ids deterministic; set iteration order is not.
        self.vocab = sorted(set(text) | {"<unk>"})
        self.char2idx = {c: i for i, c in enumerate(self.vocab)}

    def __len__(self) -> int:
        """Return the number of tokens, including "<unk>"."""
        return len(self.vocab)

    def encode(self, text: str) -> list:
        """Map each character of ``text`` to its id.

        Characters that are not in the vocabulary map to the id of "<unk>".
        """
        unk_id = self.char2idx["<unk>"]
        return [self.char2idx.get(char, unk_id) for char in text]

    def decode(self, code: list) -> str:
        """Concatenate the tokens for the ids in ``code`` into one string.

        Raises:
            IndexError: if an id is outside the vocabulary.
        """
        # join builds the result in one pass instead of repeated `+=` copies.
        return "".join(self.vocab[c] for c in code)
    

In [16]:
vocab = Vocab(text)
print(vocab.vocab)

['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '<unk>', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [17]:
vocab.encode("Hello, I am ok.")

[21, 44, 51, 51, 54, 6, 1, 22, 1, 40, 52, 1, 54, 50, 8]

In [18]:
vocab.decode(vocab.encode("Hello, I am ok."))

'Hello, I am ok.'

In [19]:
import torch
# Encode the entire corpus as a single 1-D tensor of token ids
# (torch.long, as required for embedding lookups and cross-entropy targets).
data = torch.tensor(vocab.encode(text), dtype=torch.long)
data.shape

torch.Size([1115394])

### Splitting data into train and test

In [20]:
# The first 80% of the token stream is used for training; the remaining tail
# is held out for testing. The split is contiguous, not shuffled.
train_size = int(0.8 * len(data))
train_data, test_data = data[:train_size], data[train_size:]

In [21]:
train_data.shape, test_data.shape

(torch.Size([892315]), torch.Size([223079]))

In [30]:
len(train_data) > 3.5 * len(test_data)

True

In [31]:
torch.manual_seed(42)

<torch._C.Generator at 0x112accad0>

In [41]:
batch_size = 4  # number of sequences sampled per batch by get_batch
block_size = 8  # number of tokens in each sampled sequence

In [42]:
def get_batch(dataset, batch_size=None, block_size=None):
    """Sample a random batch of (input, target) windows from a token tensor.

    Args:
        dataset: 1-D tensor of token ids.
        batch_size: number of windows to sample. When None, the notebook-level
            ``batch_size`` is used.
        block_size: number of tokens per window. When None, the notebook-level
            ``block_size`` is used.

    Returns:
        Tuple ``(x, y)`` of tensors with shape ``(batch_size, block_size)``.
        ``y`` is ``x`` shifted one position to the right in ``dataset``, so
        ``y[b, t]`` is the token that follows ``x[b, t]``.

    Raises:
        ValueError: if ``dataset`` has fewer than ``block_size + 1`` tokens.
    """
    # Fall back to the notebook-level settings, looked up at call time so a
    # later reassignment (e.g. a larger batch for training) takes effect.
    if batch_size is None:
        batch_size = globals()["batch_size"]
    if block_size is None:
        block_size = globals()["block_size"]

    if len(dataset) <= block_size:
        raise ValueError(
            f"dataset has {len(dataset)} tokens; need at least {block_size + 1} "
            f"for block_size={block_size}"
        )

    # randint's upper bound is exclusive, so the largest start index still
    # leaves room for the target window, which ends one token later.
    starts = torch.randint(
        low=0, high=len(dataset) - block_size, size=(batch_size,)
    ).tolist()
    x = torch.stack([dataset[i:i + block_size] for i in starts])
    y = torch.stack([dataset[i + 1:i + block_size + 1] for i in starts])
    return x, y

In [43]:
x, y = get_batch(train_data)
print(x.shape, y.shape)

torch.Size([4, 8]) torch.Size([4, 8])


In [44]:
for seq1, seq2 in zip(x, y):
    print(seq1, ' --> ', seq2)

tensor([41, 57, 44, 40, 58, 59,  8,  0])  -->  tensor([57, 44, 40, 58, 59,  8,  0, 26])
tensor([47, 44,  1, 43, 44, 40, 43,  1])  -->  tensor([44,  1, 43, 44, 40, 43,  1, 40])
tensor([ 0, 36, 47, 48, 42, 47,  1, 53])  -->  tensor([36, 47, 48, 42, 47,  1, 53, 44])
tensor([51,  1, 52, 64,  1, 58, 44, 57])  -->  tensor([ 1, 52, 64,  1, 58, 44, 57, 61])


## Defining the model

In [61]:
import torch.nn as nn
import torch.nn.functional as F

In [80]:
class BigramLanguageModel(nn.Module):
    """Bigram language model: next-token logits depend only on the current token.

    The model is a single ``vocab_size x vocab_size`` embedding table; row ``i``
    holds the unnormalized scores of every token following token ``i``.
    """

    def __init__(self, vocab_size: int):
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, x, y=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            x: integer tensor of token ids, shape ``(batch, context)``.
            y: optional integer tensor of target ids, same shape as ``x``.

        Returns:
            ``(logits, loss)``. Without targets, ``logits`` has shape
            ``(batch, context, vocab)`` and ``loss`` is None. With targets,
            ``logits`` is flattened to ``(batch * context, vocab)`` and
            ``loss`` is the mean cross-entropy over all positions.
        """
        logits = self.embedding_table(x)

        if y is None:
            return logits, None

        # cross_entropy expects (N, C) logits and (N,) targets.
        batch, context, vocab = logits.shape
        logits = logits.view(batch * context, vocab)
        # reshape (not view) so non-contiguous target tensors are accepted too.
        loss = F.cross_entropy(logits, y.reshape(batch * context))
        return logits, loss

    @torch.no_grad()
    def generate(self, x, max_new_tokens):
        """Autoregressively sample ``max_new_tokens`` tokens after ``x``.

        Args:
            x: integer tensor of token ids, shape ``(batch, context)``.
            max_new_tokens: number of tokens to append to every sequence.

        Returns:
            Tensor of shape ``(batch, context + max_new_tokens)`` holding the
            original tokens followed by the sampled ones.
        """
        for _ in range(max_new_tokens):
            # Go through __call__ rather than forward() so module hooks run.
            logits, _loss = self(x)
            last_logits = logits[:, -1, :]  # scores for the last position only
            probs = F.softmax(last_logits, dim=-1)
            x_next = torch.multinomial(probs, num_samples=1)
            x = torch.cat((x, x_next), dim=1)

        return x

In [81]:
model = BigramLanguageModel(len(vocab.vocab))
model

BigramLanguageModel(
  (embedding_table): Embedding(66, 66)
)

In [89]:
# Sample 100 new tokens, seeding generation with a single token of id 0
# (the newline character in this vocabulary).
out = model.generate(
    x=torch.zeros((1, 1), dtype=torch.long),
    max_new_tokens=100,
)
sequence = vocab.decode(out[0].tolist())
sequence

"\nTYNccg3oJgBXLxEH;YuLb'sAXtNIWUjA<unk>PI:iEES.!WQ\nW wK;BXK<unk>&J-hxsTLN;?CvRgWXzJtI3oUHnQx$QMmw,GBRpHVZMgbrb"

In [90]:
optimizer = torch.optim.AdamW(params=model.parameters(), lr=1e-3)
optimizer

AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.001
    maximize: False
    weight_decay: 0.01
)

In [95]:
# Train with a larger batch than the batch_size = 4 used earlier in the notebook.
batch_size = 32
for step in range(10000):
    # Each iteration is one optimizer step on one freshly sampled mini-batch.
    x, y = get_batch(train_data)
    optimizer.zero_grad(set_to_none=True)
    logits, loss = model(x, y)
    loss.backward()
    optimizer.step()

# Loss of the final mini-batch only, not an average over the run.
print(loss.item())

2.4236395359039307


In [98]:
# Sample 400 new tokens, again seeding generation with a single token of id 0
# (the newline character in this vocabulary).
out = model.generate(
    x=torch.zeros((1, 1), dtype=torch.long),
    max_new_tokens=400,
)
sequence = vocab.decode(out[0].tolist())
print(sequence)


MERisomy taron to,
Th, t MONG wortho LUThan s, s cr t wa cke qu wilQUS:

ABRDin t ads, g:

GRCAhadearr 

DYOLINodo me, andet?
LY in nd I fr; I oulind my ngit ceieay hswine, ndoonde ruck ckeree, ENGBRDYOfis h ck heilino tid amire!
LARDAstsod d h aybute angnoupiger hy os s,
INCLIVOFI a?
VOUS:

Of aker
I by ir thin, t l lodird lla tt,
got heto tht ey
Hoor, ben ghersthillvougr RKI ans nd hed I s th In
