In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F
from model import *

d_model=512
d_ff = 2048
dropout = 0.2
vocab_size=10000
max_seq_len=8
lr=1e-3
batch_size=4
device = 'cuda' if torch.cuda.is_available() else 'cpu'

def read_data():
    with open('input.txt', 'r', encoding='utf-8') as f:
        text = f.read()
        return text



In [2]:
input = read_data()
chars=sorted(list(set(input)))
print(''.join(chars))
print(len(chars))
# Begin with a character-level tokenizer
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s]
decode = lambda l: ''.join([itos[i] for i in l])

data = torch.tensor(encode(input), dtype=torch.long)

n=int(0.9*len(data))
train_data=data[:n]
val_data=data[n:]


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


'torch.LongTensor'

In [3]:
torch.manual_seed(1337)

def get_batch(split):
    data = train_data if split=='train' else val_data
    ix = torch.randint(len(data) - max_seq_len, (batch_size,))
    x = torch.stack([data[i:i+max_seq_len] for i in ix])
    y = torch.stack([data[i+1:i+max_seq_len+1] for i in ix])
    return x,y

xb, yb = get_batch('train')
print(xb)
print(yb)


tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])
torch.LongTensor


In [4]:
mod = GPTDecoderModel()
mod = mod.to(device)
print(sum(p.numel() for p in mod.parameters())/1e6, 'M parameters')

13.40136 M parameters


In [11]:
eval_iters=200

@torch.no_grad()
def estimate_loss(model):
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [None]:
optimizer = torch.optim.Adam(mod.parameters(), lr=lr)
max_iters=5000
for iter in range(max_iters):

    if iter % 100 == 0 or iter == max_iters-1:
        losses = estimate_loss(mod)
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
    
    xb, yb = get_batch('train')
    
    logits, loss = mod(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

step 0: train loss 2.4599, val loss 2.5287
step 100: train loss 2.5122, val loss 2.5103
step 200: train loss 2.4976, val loss 2.4695
step 300: train loss 2.4581, val loss 2.5405
step 400: train loss 2.4187, val loss 2.4615
step 500: train loss 2.4634, val loss 2.4975
step 600: train loss 2.4348, val loss 2.4920
step 700: train loss 2.4234, val loss 2.4802
step 800: train loss 2.4562, val loss 2.4617
step 900: train loss 2.4334, val loss 2.4640
step 1000: train loss 2.4016, val loss 2.4367
step 1100: train loss 2.4284, val loss 2.4511
step 1200: train loss 2.4247, val loss 2.4248
step 1300: train loss 2.4263, val loss 2.4608
step 1400: train loss 2.3679, val loss 2.4088
step 1500: train loss 2.4325, val loss 2.4337
step 1600: train loss 2.3956, val loss 2.4422
step 1700: train loss 2.3673, val loss 2.4221
step 1800: train loss 2.4207, val loss 2.4129
step 1900: train loss 2.3913, val loss 2.3729
step 2000: train loss 2.3593, val loss 2.3642
step 2100: train loss 2.3658, val loss 2.3772


In [12]:
class EmbTest(nn.Module):

    def __init__(self, vocab_size, d_model):
        super().__init__()
        self.l1 = nn.Embedding(vocab_size, d_model)

    def forward(self, x):
        emb = self.l1(x)
        return emb

test = EmbTest(10, 4)
inp=torch.randint(low=1, high=11, size=(1, 10))
inp
test(inp)

IndexError: index out of range in self