In [3]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2023-11-13 09:54:24--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 

200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt.1’


2023-11-13 09:54:24 (7.28 MB/s) - ‘input.txt.1’ saved [1115394/1115394]



In [6]:
# load the whole training corpus into memory as one string
with open('recherche.txt', mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

In [7]:
import unidecode
# Replace non-ASCII characters with an ASCII approximation (unidecode's
# transliteration), which keeps the character vocabulary small.
# Note that this rebinds `text`: the raw file contents are no longer available
# after this cell runs.
text = unidecode.unidecode(text)

In [8]:
# sanity check: size of the corpus, counted in characters
print("length of dataset in characters: ", len(text))

length of dataset in characters:  1013777


In [9]:
# the vocabulary is every distinct character of the corpus, in sorted order;
# a character's position in `chars` later becomes its integer token id
chars = sorted(set(text))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)


 !"$%'()*,-./0123456789:;<>?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
81


In [10]:
# Character-level tokenizer: every character in `chars` is identified by its
# position in that list.
string_to_integer = {ch: i for i, ch in enumerate(chars)}
integer_to_string = dict(enumerate(chars))


def encode(s):
    """Return the list of integer ids for the characters of string `s`.

    Raises:
        KeyError: if `s` contains a character that is not in the vocabulary.
    """
    return [string_to_integer[c] for c in s]


def decode(l):
    """Return the string spelled by the iterable of integer ids `l`.

    Raises:
        KeyError: if an id is not in the vocabulary.
    """
    return ''.join([integer_to_string[i] for i in l])

In [11]:
import torch
# encode the whole corpus once into a 1-D tensor of 64-bit integer token ids
data = torch.tensor(encode(text), dtype=torch.long)

# peek at the first 100 token ids
data[:100]

tensor([40, 69, 68, 61, 74, 59, 67, 70, 73, 10,  1, 64, 59,  1, 67, 59,  1, 73,
        75, 63, 73,  1, 57, 69, 75, 57, 62, 59,  1, 58, 59,  1, 56, 69, 68, 68,
        59,  1, 62, 59, 75, 72, 59, 12,  1, 44, 55, 72, 60, 69, 63, 73, 10,  1,
        55,  1, 70, 59, 63, 68, 59,  1, 67, 55,  0, 56, 69, 75, 61, 63, 59,  1,
        59, 74, 59, 63, 68, 74, 59, 10,  1, 67, 59, 73,  1, 79, 59, 75, 78,  1,
        73, 59,  1, 60, 59, 72, 67, 55, 63, 59])

In [14]:
# train on the first 90% of the tokens and hold out the final 10% for validation
split = int(0.9*len(data))
train_data, val_data = data[:split], data[split:]

In [15]:
# block_size is the context length. A slice of block_size + 1 tokens holds
# block_size (context -> next token) pairs, because each target is the input
# shifted by one position.
block_size = 8
train_data[:block_size+1]

tensor([40, 69, 68, 61, 74, 59, 67, 70, 73])

In [18]:
batch_size = 4  # number of sequences sampled per batch (read by get_batch at call time)
block_size = 8  # length, in tokens, of each sampled sequence

def get_batch(split):
    """Sample a random batch of input/target windows from one data split.

    Reads the module-level `train_data`, `val_data`, `block_size` and
    `batch_size` at call time.

    Args:
        split: "train" selects `train_data`; any other value selects `val_data`.

    Returns:
        A pair (x, y) of (batch_size, block_size) tensors, where y is x
        shifted one position to the right in the source data.
    """
    source = val_data if split != "train" else train_data
    # One random start offset per row; the upper bound leaves room for the
    # extra token needed by the shifted target window.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    contexts, targets = [], []
    for start in starts:
        contexts.append(source[start:start + block_size])
        targets.append(source[start + 1:start + block_size + 1])
    return torch.stack(contexts), torch.stack(targets)

# draw one training batch and show the inputs next to their shifted targets
xb, yb = get_batch('train')
for heading, batch in (("inputs", xb), ("targets", yb)):
    print(heading)
    print(batch.shape)
    print(batch)

inputs
torch.Size([4, 8])
tensor([[66, 66, 59,  0, 72, 69, 75, 61],
        [59, 59,  1, 58, 59,  1, 57, 62],
        [63, 59, 72, 59,  1, 59, 74,  1],
        [74, 75,  1, 58, 69, 63, 73,  1]])
targets
torch.Size([4, 8])
tensor([[66, 59,  0, 72, 69, 75, 61, 59],
        [59,  1, 58, 59,  1, 57, 62, 59],
        [59, 72, 59,  1, 59, 74,  1, 70],
        [75,  1, 58, 69, 63, 73,  1, 73]])


In [31]:
import torch
import torch.nn as nn
from torch.nn import functional as F

class BigramLanguageModel(nn.Module):
    """Bigram language model: next-token logits depend only on the current token.

    The only parameters are a (vocab_size, vocab_size) embedding table; row i
    holds the unnormalized scores for the token that follows token i.
    """

    def __init__(self, vocab_size):
        super().__init__()
        # each token reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            idx: (B, T) tensor of integer token ids.
            targets: optional (B, T) tensor of integer token ids to score against.

        Returns:
            A pair (logits, loss).
            Without targets: logits has shape (B, T, C) and loss is None.
            With targets: logits is flattened to (B*T, C) and loss is the
            cross-entropy over all B*T positions.
            C is vocab_size.
        """
        logits = self.token_embedding_table(idx)  # (B, T, C)
        if targets is None:
            return logits, None

        # F.cross_entropy takes (N, C) logits against (N,) class ids, so the
        # batch and time axes are folded together.
        B, T, C = logits.shape
        logits = logits.view(B*T, C)
        loss = F.cross_entropy(logits, targets.view(B*T))
        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens):
        """Extend each sequence in `idx` by sampling `max_new_tokens` tokens.

        Runs under torch.no_grad(): sampling never backpropagates, so there is
        no reason to record an autograd graph for every forward pass.

        Args:
            idx: (B, T) tensor of integer token ids used as the starting context.
            max_new_tokens: number of tokens to append to every sequence.

        Returns:
            (B, T + max_new_tokens) tensor: the context followed by the samples.
        """
        for _ in range(max_new_tokens):
            # get the predictions (no targets, so logits stay (B, T, C))
            logits, _ = self(idx)
            # focus only on the last time step
            logits = logits[:, -1, :]  # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1)  # (B, C)
            # sample one token id per sequence from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        return idx

    
m = BigramLanguageModel(vocab_size)
# Forward pass of the untrained model on the sample batch. For reference, a
# uniform guess over an 81-character vocabulary would score ln(81) ~= 4.39.
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

# Index 0 is the newline character in this vocabulary, so generation starts
# from a single newline token.
idx = torch.zeros((1, 1), dtype=torch.long)
sampled = m.generate(idx, max_new_tokens=1000)
print(decode(sampled[0].tolist()))

torch.Size([32, 81])
tensor(4.9435, grad_fn=<NllLossBackward0>)

J8Q8'r:.XKXmMrRgXckeU)pDUkHO;u"2QfrRaA73LVk?"1Ef-oTcJ7">6BBl2?AHyvSQMSX
PaV1Q75d;<9s-tP
f2:?FW;ERTVn4<9n)qB"JEwUyPZ$b<vx>;FyJ$)d
g>KXw5,ogQ;NayKclX)lCI6UZS
LEWnl(phO4,;%Je:PlvxTt*d' ".mVyv)dI$sJDu1WCUzxZ2""ofvOiD:/rI G02CIeT6Uvgtm:qCiG1pJmvxszjCp!7aEQ/.mK ME%PdaERrQP$koYN'oAs1/Bw.IBTIUT7qK5X33fEYc0XZKRmYRNSz>tS6.!O-AZ"ff5aGf5<BPkgDJWY h?bY?XAZ08M9:(b7RzpigvOX. HjaERhwi8%EI08sdFCH:. 7-PV2V)dmsdmjr!;d/og8xJyAN6cW/8bBr;FC,TJO/PRWT()d1ykE/.!zq1;SAD,*xsff2YJCHfjXd!W;OrowH>-vuYHiADuaf3VSgjCgvx 08MRXdFWKJCLV?A%l3,Wnz/z.2WGgc)SrCJyKQ.,/>KSXs2PbZ SEW Aq G(S6.,CK1;%5d,1WSAyvSbe4(
z"JAS,owh
0D"6z/WKWy:vgDYW!9n).HhOX0TKR-gT7G3DgD6aNRTafqi4QHA>J>'W()p-)csu;-%Z(be5%e81gb08yNw.qQCI4
z4aEU9p)8e2p(P
rTC$daz<ofXEWh
2.um!$(S3E;X)A!QuiPR-XQ)k,;R'x<%JL hR'B24QlX1Uk2P$)p)IP%9"
t3z?ePB5PKRw Dy:()7T,KiD3pX-)AEmIU!Eoff)o5,tZ1v*KLOgFa3pgIp*UF:XW:()FnbBQrxJw<hjCIayKPizcdk?8zeb2oTh">mW93("bpJ!jlbgHDjbBWUqNBiyKJLRzq!FNA$)4b(szRaf!qt3P6TP!kkNt*/8gAZ.I

In [32]:
# Create PyTorch optimizer: AdamW over all parameters of the bigram model
# (its single embedding table), with learning rate 1e-2
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-2)

In [37]:
# get_batch reads the module-level batch_size at call time, so this
# reassignment changes the size of every batch sampled from here on
batch_size = 32
for steps in range(3000):
    # drop the gradients left over from the previous step
    optimizer.zero_grad(set_to_none=True)

    # forward pass on a freshly sampled training batch
    xb, yb = get_batch('train')
    logits, loss = m(xb, yb)

    # backward pass and parameter update
    loss.backward()
    optimizer.step()

# loss of the final training batch only (not an average over the run)
print(loss.item())
print(decode(m.generate(idx, max_new_tokens=100)[0].tolist()))

2.361889600753784

joundiouss Swalet anna iluris ennt ilecesan Fr qufaiomin, qu'havome
ders a rt daitileereen one
det c


### Mathematical trick in self-attention

In [None]:
# consider the following toy example
torch.manual_seed(1337)  # fix the RNG so the random tensor below is reproducible
B,T,C = 4, 8, 2 # batch, time, channels
x = torch.randn(B,T,C)
x.shape