In [2]:
import torch.nn as nn
import torch.nn.functional as F

ModuleNotFoundError: No module named 'torch'

In [3]:
with open('input.txt', 'r', encoding = 'utf-8') as f:
    text = f.read()

In [6]:
len(text)

1115394

In [None]:
print(text[:1000])

In [14]:
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [83]:
batch_size = 32
block_size = 8
learning_rate = 1e-2
max_iters = 3000
eval_interval = 300
eval_iters = 200

In [21]:
# Mapping between integers and characters
stoi = {}
itos = {}
for i, ch in enumerate(chars):
    stoi[ch] = i
    itos[i] = ch

# Take a string and output a list of integers
def encode(s):
    out = []
    for ch in s:
        out.append(stoi[ch])
    return out

# Take a list of integers and output a string
def decode(ints):
    out = ""
    for i in ints:
        out += itos[i]
    return out

print(encode("hii there"))
print(decode(encode("hii there")))

[46, 47, 47, 1, 58, 46, 43, 56, 43]
hii there


In [27]:
# Encoding the entire dataset and saving it in a tensor
data = torch.tensor(encode(text), dtype = torch.long)
print(data.shape, data.dtype)
print(data[:1000])

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
      

In [28]:
# Splitting into train and validation
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]

In [36]:
# Creating a block size (Maximum context)
data[:block_size + 1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [37]:
x = data[:block_size]
y = data[1:block_size + 1]
for t in range(block_size):
    context = x[:t + 1]   
    target = y[t]
    print(f"When context is {context}, target is {target}")

When context is tensor([18]), target is 47
When context is tensor([18, 47]), target is 56
When context is tensor([18, 47, 56]), target is 57
When context is tensor([18, 47, 56, 57]), target is 58
When context is tensor([18, 47, 56, 57, 58]), target is 1
When context is tensor([18, 47, 56, 57, 58,  1]), target is 15
When context is tensor([18, 47, 56, 57, 58,  1, 15]), target is 47
When context is tensor([18, 47, 56, 57, 58,  1, 15, 47]), target is 58


In [51]:
gen = torch.manual_seed(1337)
# Creating a batch size (Number of independent sequences processed in parallel)
batch_size = 4
block_size = 8
def get_batch(split):
    data = train_data if split == "train" else val_data
    # ix is 4 (batch_size) numbers that are randomly generated between len(data) and block_size
    ix = torch.randint(len(data) - block_size, (batch_size, 1), generator = gen)
    x = torch.stack([data[i: i + block_size] for i in ix])
    y = torch.stack([data[i + 1: i + block_size + 1] for i in ix])
    return x,y
xb, yb = get_batch(train_data)
print("inputs:")
print(xb.shape)
print(xb)
print("targets:")
print(yb.shape)
print(yb)

for b in range(batch_size):
    for t in range(block_size):
        context = xb[b, :t + 1]
        target = yb[b, t]
        print(f"When context is {context.tolist()}, target is {target}")

inputs:
torch.Size([4, 8])
tensor([[ 6,  1, 52, 53, 58,  1, 58, 47],
        [ 6,  1, 54, 50, 39, 52, 58, 43],
        [ 1, 58, 46, 47, 57,  1, 50, 47],
        [ 0, 32, 46, 43, 56, 43,  1, 42]])
targets:
torch.Size([4, 8])
tensor([[ 1, 52, 53, 58,  1, 58, 47, 50],
        [ 1, 54, 50, 39, 52, 58, 43, 58],
        [58, 46, 47, 57,  1, 50, 47, 60],
        [32, 46, 43, 56, 43,  1, 42, 53]])
When context is [6], target is 1
When context is [6, 1], target is 52
When context is [6, 1, 52], target is 53
When context is [6, 1, 52, 53], target is 58
When context is [6, 1, 52, 53, 58], target is 1
When context is [6, 1, 52, 53, 58, 1], target is 58
When context is [6, 1, 52, 53, 58, 1, 58], target is 47
When context is [6, 1, 52, 53, 58, 1, 58, 47], target is 50
When context is [6], target is 1
When context is [6, 1], target is 54
When context is [6, 1, 54], target is 50
When context is [6, 1, 54, 50], target is 39
When context is [6, 1, 54, 50, 39], target is 52
When context is [6, 1, 54, 50,

In [64]:
class BigramLanguageModel(nn.Module):
    
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)
    
    def forward(self, idx, targets = None):
        logits = self.token_embedding_table(idx) #(B,T,C)
        if targets is None:
            loss = None
        else:
            # idx and targets are of shape (B,T)
            B,T,C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_new_tokens):
        #idx is (B,T)
        for _ in range(max_new_tokens):
            logits, loss = self(idx)
            logits = logits[:, -1, :] # Becomes (B, C)
            probs = F.softmax(logits, dim = -1)

            # Sampling from distribution
            idx_next = torch.multinomial(probs, num_samples = 1)
            idx = torch.cat((idx, idx_next), dim = 1)
        return idx
            
m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb) 
print(loss)

idx = torch.zeros((1,1), dtype = torch.long) # stands for the new line token \n
print(idx)
print(decode(m.generate(idx = idx, max_new_tokens = 100)[0].tolist()))


tensor(4.6266, grad_fn=<NllLossBackward0>)
tensor([[0]])

'pZ&E:QuzR?hShNlS?STBwS:VCDM jlc:hRlO
MgqMNSqbpN:!hJcBEOhp,& ?e.WIEXxNbBAqsq;Vt:!SA-OTosAUjtdH:.gYp!


In [69]:
optimizer = torch.optim.AdamW(m.parameters(), lr = 1e-3)

In [1]:
for iter in range(max_iters):

    if iter % eval_interval == 0:
        losses = estimate_loss()
        print(f"Step {iter}: Train loss = {losses['train']}, Val loss = {losses['val']}")
    xb, yb = get_batch("train")
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none = True)
    loss.backward()
    optimizer.step()
print(loss.item())

NameError: name 'max_iters' is not defined

In [85]:
print(decode(m.generate(idx = idx, max_new_tokens = 500)[0].tolist()))


t gs weanalllacomy, k engad swase simothy thouse ce 'dyoth hor be; fooue fthodatho INEO: ntisthaspu ert thenf mand rfad ut.
BOMIDor'se burd,
DUKICKI aveato wepltr massakendos'sh ay

PATonse at!
'sthe!
F m. tcheny ure ore'd d?
CA:
RSit! brik Whe;

ARCrn towe wie y.
St fe!
Hateas wersooupaiewidifred f bl gaut, w fofu ba h pen y ave we annengh, akemm:
An. Weriverinde
Buby s in, d s piny,--be y ow IOLENGBut.
By ioshavithoootherots on d k n t
I tr; O:
Y heeasilerishire,
Aneder r punevene nefe.
Whim,



In [81]:
@torch.no_grad()
def estimate_loss():
    out = {}
    m.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = m(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    m.train()
    return out

In [88]:
a = torch.tril(torch.ones(3,3))
print(a)
torch.sum(a, 1, keepdims = True)

tensor([[1., 0., 0.],
        [1., 1., 0.],
        [1., 1., 1.]])


tensor([[1.],
        [2.],
        [3.]])