In [2]:
import wget
wget.download("https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt")

'input (1).txt'

In [3]:
#read and inspect
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [4]:
print("length of dataset in characters", len(text))

length of dataset in characters 1115394


In [5]:
#get all unique characters from text
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [6]:
# create a mapping from characters to integers
stoi = { ch:i for i,ch in enumerate(chars)}
itos = { i:ch for i,ch in enumerate(chars)}
encode = lambda s: [stoi[c] for c in s]
decode = lambda l: ''.join([itos[i] for i in l])

print(encode("testing string hi"))
print(decode(encode("testing string hi")))

[58, 43, 57, 58, 47, 52, 45, 1, 57, 58, 56, 47, 52, 45, 1, 46, 47]
testing string hi


In [7]:
# now encode the entire text dataset and store it into a Torch.tensor
import torch
torch.set_default_tensor_type('torch.cuda.FloatTensor' if torch.cuda.is_available() else 'torch.FloatTensor')
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:1000])

  _C._set_default_tensor_type(t)


torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
      

In [8]:
n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]

In [9]:
# never put all data into into transformer at once it will be too expensive
# instead we take chunks at a time from the training set
block_size = 8
train_data[:block_size+1]
#train it to make a prediction at every individual token(character in this case)

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [10]:
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f"when input is {context} the target: {target}")
#these are the 8 examples hidden in the chunk of 9 characters that we sampled from the training set

when input is tensor([18]) the target: 47
when input is tensor([18, 47]) the target: 56
when input is tensor([18, 47, 56]) the target: 57
when input is tensor([18, 47, 56, 57]) the target: 58
when input is tensor([18, 47, 56, 57, 58]) the target: 1
when input is tensor([18, 47, 56, 57, 58,  1]) the target: 15
when input is tensor([18, 47, 56, 57, 58,  1, 15]) the target: 47
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target: 58


In [11]:
torch.manual_seed(1337)
batch_size = 4 # how many independent sequences will we process in parallel
block_size = 8 # what is the maximum context length for predictions

def get_batch(split):
    #generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x  = torch.stack([data[i:i+block_size] for i in ix])
    y  = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y

xb, yb = get_batch('train')
print('inputs:')
print(xb.shape)
print(xb)
print('targets')
print(yb.shape)
print(yb)

print('----')

for b in range(batch_size):
    for t in range(block_size):
        context = xb[b, :t+1]
        target = yb[b,t]
        print(f"when input is {context.tolist()} the target: {target}")

inputs:
torch.Size([4, 8])
tensor([[60, 43, 56, 63,  1, 51, 43, 39],
        [35, 53, 59, 50, 42, 57, 58,  1],
        [46, 47, 58, 46, 43, 56,  1, 61],
        [43,  6,  1, 57, 61, 43, 43, 58]])
targets
torch.Size([4, 8])
tensor([[43, 56, 63,  1, 51, 43, 39, 52],
        [53, 59, 50, 42, 57, 58,  1, 46],
        [47, 58, 46, 43, 56,  1, 61, 47],
        [ 6,  1, 57, 61, 43, 43, 58,  1]])
----
when input is [60] the target: 43
when input is [60, 43] the target: 56
when input is [60, 43, 56] the target: 63
when input is [60, 43, 56, 63] the target: 1
when input is [60, 43, 56, 63, 1] the target: 51
when input is [60, 43, 56, 63, 1, 51] the target: 43
when input is [60, 43, 56, 63, 1, 51, 43] the target: 39
when input is [60, 43, 56, 63, 1, 51, 43, 39] the target: 52
when input is [35] the target: 53
when input is [35, 53] the target: 59
when input is [35, 53, 59] the target: 50
when input is [35, 53, 59, 50] the target: 42
when input is [35, 53, 59, 50, 42] the target: 57
when input is 

In [12]:
print(xb) # our input into the transformer 

tensor([[60, 43, 56, 63,  1, 51, 43, 39],
        [35, 53, 59, 50, 42, 57, 58,  1],
        [46, 47, 58, 46, 43, 56,  1, 61],
        [43,  6,  1, 57, 61, 43, 43, 58]])


In [13]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)


class BigramLanguageModel(nn.Module):
    
    def __init__(self, vocab_size):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)
    
    def forward(self, idx, targets=None):

        #idx and targets are both (B,T) tensor of integers
        logits = self.token_embedding_table(idx) #(B, T, C)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            #get predictions
            logits, loss = self(idx)
            #focus only on the last time step
            logits = logits[:, -1, :] # becomes (B,C)
            #apply softmax to get probabilites
            probs = F.softmax(logits, dim=-1) # (B,C)
            #sample form the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # appened sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx
        
m = BigramLanguageModel(vocab_size)
out, loss = m(xb, yb)
print(out.shape)
print(loss)

print(decode(m.generate(idx = torch.zeros((1,1), dtype=torch.long), max_new_tokens=100)[0].tolist()))

torch.Size([32, 65])
tensor(4.8466, grad_fn=<NllLossBackward0>)

JrTxbDkRZkNwc.wj,ZTxO-On-y$WK
baqPe?kMBFeA$G:XZSGgO-3cjMGd?gLhaGhX'YVX3tpgfNuwq&$WWv.tbaF :X3!FHaGeN


In [14]:
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [15]:
batch_size = 32
for steps in range(10000):
    xb, yb = get_batch('train')

    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

    print(loss.item())

4.722155570983887
4.721252918243408
4.708561897277832
4.776569843292236
4.720189094543457
4.707447052001953
4.730064868927002
4.654762268066406
4.653609275817871
4.707705497741699
4.678032398223877
4.713224411010742
4.632512092590332
4.60457181930542
4.718225479125977
4.628852844238281
4.697115898132324
4.638158798217773
4.700346946716309
4.697381496429443
4.58624267578125
4.638833999633789
4.674704074859619
4.671642780303955
4.569454669952393
4.667054653167725
4.602894306182861
4.627845764160156
4.692698955535889
4.663902759552002
4.673824310302734
4.649434566497803
4.663801193237305
4.66640567779541
4.6532769203186035
4.585446834564209
4.535049915313721
4.671940326690674
4.645997524261475
4.568976402282715
4.699373722076416
4.638554096221924
4.561494827270508
4.66529655456543
4.697883605957031
4.626029014587402
4.666342735290527
4.672171115875244
4.641134262084961
4.633011341094971
4.61398458480835
4.6916890144348145
4.651211261749268
4.690014362335205
4.639516353607178
4.59438133239

In [16]:
print(decode(m.generate(idx = torch.zeros((1,1), dtype=torch.long), max_new_tokens=500)[0].tolist()))



T; nd
Anch icervig cllearTE:
Y heldolod mare han?jOLayowildist!
ShapudQushe hordyontour brd pl, a ar, ISab,
Dive wontofod isthigliry an.
Th sout onnitoithay! fes y m stor Fotitwe w ngatenys.

II be can O od diorouloce bop.
BE:

BoBinguk, ite' at qHEN:
Pse on3cthel wilcee houiseprs, y
Tede-tr, fPotishere'des co uret
So my my ou irens ne thepe pins.
Anor the fas myouengely arer spowan'se NUKIARIThe?
Foompt ces t tise at ncoulvearengherve, irethard h, olt y?

Teyok wo;
Kwo rsa, fe ar nt an; felfon


In [17]:
print(xb.device)

cuda:0


In [18]:
torch.manual_seed(1337)
B,T,C = 4,8,2
x = torch.randn(B,T,C)
x.shape

torch.Size([4, 8, 2])

In [19]:
# We want x[b,t] = mean_{i<=t} x[b,i]
# version 1
xbow = torch.zeros((B,T,C))
for b in range(B):
    for t in range(T):
        xprev = x[b, :t+1] #(t, C)
        xbow[b,t] = torch.mean(xprev, 0)

In [20]:
# version 2
wei = torch.tril(torch.ones(T, T))
wei = wei / torch.sum(wei, 1, keepdim=True)
xbow2 = wei @ x #(B, T, T) @ (B, T, C) ----> (B, T, C)

torch.allclose(xbow, xbow2)
xbow[0], xbow2[0]

(tensor([[-0.2143, -0.5001],
         [ 0.1390,  0.3503],
         [-0.1853, -0.0495],
         [-0.3252,  0.4034],
         [-0.4186,  0.0921],
         [-0.0696,  0.1230],
         [-0.0349,  0.0891],
         [-0.1730,  0.1208]]),
 tensor([[-0.2143, -0.5001],
         [ 0.1390,  0.3503],
         [-0.1853, -0.0495],
         [-0.3252,  0.4034],
         [-0.4186,  0.0921],
         [-0.0696,  0.1230],
         [-0.0349,  0.0891],
         [-0.1730,  0.1208]]))

In [22]:
# version 3: use Softmax
tril = torch.tril(torch.ones(T, T))
wei = torch.zeros((T,T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)
xbow3 = wei @ x
torch.allclose(xbow, xbow3)

True

In [31]:
# version 4: self-attention!
torch.manual_seed(1337)
B,T,C = 4,8,32
x = torch.randn(B,T,C)

# let's see a single head perform self-attention
head_size = 16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)
k = key(x) # (B, T, 16)
q = query(x) # (B, T, 16)
wei = q @ k.transpose(-2, -1) # (B, T, 16) @ (B, 16, T) ---> (B, T, T)

tril = torch.tril(torch.ones(T, T))
#wei = torch.zeros(T, T)
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)

v = value(x)
out = wei @ v
#out = wei @ x

out.shape

torch.Size([4, 8, 16])

In [29]:
wei[0]

tensor([[ 0.6246,  1.4527, -0.8455,  1.6544, -0.2910, -1.3241,  3.1928, -0.9580],
        [ 0.2484,  0.6016,  1.5596,  0.0480, -0.8310,  0.6599, -2.1859, -2.9718],
        [ 0.8127, -1.7581, -1.5990, -0.0669,  1.3013,  2.5681, -2.7368,  0.8935],
        [-1.2564, -2.1013,  0.2840,  0.4676,  0.8958, -2.4343,  2.0974,  0.1430],
        [-1.1045, -1.7835, -0.8400,  0.6274,  1.6353,  1.0175, -0.7430, -5.0479],
        [-0.9218, -0.1299, -1.3146,  1.0461,  1.3922,  0.2790,  0.4945, -0.0442],
        [ 1.6954,  3.3545, -0.0389, -1.7877,  0.2858,  0.3101,  2.1354,  4.4268],
        [ 1.9740,  0.9047, -0.7768, -0.2361,  1.0300,  1.7825,  1.3554, -0.4112]],
       grad_fn=<SelectBackward0>)

In [None]:
torch.manual_seed(42)
a = torch.tril(torch.ones(3, 3))
a = a / torch.sum(a, 1, keepdim=True)
b = torch.randint(0,10, (3,2)).float()
c = a @ b
print(a)
print(b)
print(c)

tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
tensor([[3., 9.],
        [8., 0.],
        [3., 7.]])
tensor([[3.0000, 9.0000],
        [5.5000, 4.5000],
        [4.6667, 5.3333]])


In [None]:
class BatchNorm1d:

    def __init__(self, dim, eps=1e-5, momentum=0.1):
        self.eps = eps
        self.gamma = torch.ones(dim)
        self.beta = torch.zeros(dim)

    def __call__(self, x):
        # calculate the forward pass
        xmean = x.mean(1, keepdim=True) # batch mean
        xvar = x.var(1, keepdim=True) # batch variance
        xhat = (x-xmean) / torch.sqrt(xvar + self.eps) # normalize to unit variance
        self.out = self.gamma * xhat + self.beta
        # update the buffers 
        return self.out
    
    def parameters(self):
        return [self.gamma, self.beta]

torch.manual_seed(1337)
module = BatchNorm1d(100)
x = torch.randn(32, 100)
x = module(x)
x.shape

torch.Size([32, 100])

In [38]:
x[:,0].mean(), x[:,0].std()

(tensor(-0.1627), tensor(0.9687))

In [39]:
x[0,:].mean(), x[0,:].std()

(tensor(1.3113e-08), tensor(1.0000))