In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
print(f"{torch.cuda.is_available()=}")
# Use the first CUDA device when one is available; otherwise stay on the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
print(device)

In [None]:
names_f = "tinyshakespeare/input.txt"
# Read the whole corpus into memory as one string. The encoding is pinned so
# the decoded characters (and therefore the vocabulary built from them) do not
# depend on the platform's default locale encoding.
with open(names_f, encoding="utf-8") as f:
    text = f.read()

#random.seed(42)
# Quick sanity check: the first characters and the total corpus length.
print(text[:30])
print(f"{len(text)=}")

# Vocabulary: every distinct character of the corpus, in sorted order.
chars = sorted(set(text))
voc_size = len(chars)
print(f"{chars[:100]=}")
print(f"{voc_size=}")

# Lookup tables between token ids (positions in `chars`) and characters.
itos = dict(enumerate(chars))
stoi = {ch: idx for idx, ch in itos.items()}


def encode(ss):
    """Return the list of token ids for the characters of `ss`.

    Raises KeyError if `ss` contains a character that is not in `stoi`.
    """
    return list(stoi[ch] for ch in ss)


def decode(ii):
    """Return the string obtained by mapping each id in `ii` through `itos`."""
    return ''.join(itos[idx] for idx in ii)


# Round-trip check: encoding then decoding should reproduce the input.
print(encode("Hello\nWorld"))
print(decode(encode("Hello\nWorld")))

# Encode the whole corpus once and keep it as a 1-D tensor of token ids on `device`.
data = torch.tensor(encode(text), dtype=torch.long, device=device)
print(f"{data.shape=}")
print(data[:30])

# Contiguous split: the first 90% of the tokens train, the remainder validates.
n = int(0.9*len(data))
train_data, val_data = data[:n], data[n:]


In [None]:
block_size = 8  # number of tokens in each input window


def get_batch(data, batch_size, device=None):
    """Sample `batch_size` random windows of `block_size` tokens from `data`.

    Args:
        data: 1-D tensor of token ids; must hold more than `block_size` elements.
        batch_size: number of windows to draw (with replacement).
        device: optional device the returned tensors are moved to. When None
            the result stays on whatever device `data` lives on.

    Returns:
        (x, y): two tensors of shape (batch_size, block_size), where `y` is
        `x` shifted one position to the right in `data` (next-token targets).

    Raises:
        ValueError: if `data` is too short to contain a window plus its target.
    """
    if len(data) <= block_size:
        raise ValueError(
            f"data must contain more than block_size={block_size} elements, got {len(data)}"
        )
    # `high` is exclusive, so the largest start drawn is len(data)-block_size-1,
    # which is the last position whose target window still fits inside `data`.
    # Starts are converted to plain ints so slicing needs no per-item tensor sync.
    starts = torch.randint(low=0, high=len(data) - block_size, size=(batch_size,)).tolist()
    # torch.stack has no `device` argument; stack first, then move if requested.
    x = torch.stack([data[s : s + block_size] for s in starts])
    y = torch.stack([data[s + 1 : s + block_size + 1] for s in starts])
    if device is not None:
        x, y = x.to(device), y.to(device)
    return x, y

In [None]:
class BigramModel(nn.Module):
    """Bigram language model: a single embedding table used as a logit lookup.

    Row `i` of the (voc_size, voc_size) embedding holds the next-token logits
    predicted after token `i`.
    """

    def __init__(self, voc_size) -> None:
        super().__init__()
        self.emb = nn.Embedding(voc_size, voc_size)

    def forward(self, ids):
        """Return next-token logits for every position of `ids`.

        The result has the shape of `ids` with a trailing dimension of size
        voc_size appended.
        """
        logits = self.emb(ids)
        return logits

    def calc_loss(self, logits, Y):
        """Mean cross-entropy between 3-D `logits` and integer targets `Y`.

        `logits` is expected as (batch, time, classes) and `Y` as (batch, time).
        """
        # F.cross_entropy wants the class dimension at index 1: (batch, classes, time).
        logits = logits.transpose(1,2)
        return F.cross_entropy(logits, Y)

    @torch.no_grad()
    def generate(self, ids, max_new_tokens):
        """Autoregressively sample `max_new_tokens` tokens after the prompt `ids`.

        Args:
            ids: 2-D long tensor (batch, time) holding the prompt token ids.
            max_new_tokens: number of tokens to append to every row.

        Returns:
            numpy array of shape (batch, time + max_new_tokens) containing the
            prompt followed by the sampled continuation.
        """
        for _ in range(max_new_tokens):
            # Only the logits at the last position predict the next token.
            last_logits = self(ids)[:, -1, :]
            prob = torch.softmax(last_logits, dim=-1)
            next_id = torch.multinomial(prob, num_samples=1)
            # Feed the sampled token back in so the sequence actually grows.
            ids = torch.cat((ids, next_id), dim=-1)
        return ids.detach().cpu().numpy()

In [None]:
# Fresh model on the selected device, plus an empty history of averaged training losses.
model = BigramModel(voc_size=voc_size).to(device)
lossi = []
# Total number of parameter elements in the model.
print("Numel:", sum(p.numel() for p in model.parameters()))

In [None]:
# Training Loop
model.train()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
batch_size = 32
WIN = []  # per-step losses of the current 100-step window
N = 20000
for i in range(N):
    # Forward pass on a freshly sampled batch.
    x, y_target = get_batch(train_data, batch_size, device)
    logits = model(x)
    loss = model.calc_loss(logits, y_target)

    # Update on the loss just computed, so every batch (including the last
    # one) contributes a step and nothing depends on `loss` surviving from a
    # previous iteration.
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

    WIN.append(loss.item())
    if (i+1) % 100 == 0:
        # Record the mean loss of the last 100 steps and start a new window.
        avg_loss = np.mean(WIN)
        WIN = []
        lossi.append(avg_loss)

if lossi:
    print(f"{lossi[-1]=}")


In [None]:
# Plot the windowed average training losses collected in `lossi`, if any exist.
if lossi:
    plt.figure(figsize=(20, 5))
    plt.plot(lossi)
    plt.grid()

In [None]:
# Validation Loop
model.eval()
batch_size = 32
WIN = []  # per-batch validation losses
# Evaluation needs no gradients; disabling autograd avoids building a graph
# for every batch.
with torch.no_grad():
    for _ in range(100):
        x, y_target = get_batch(val_data, batch_size, device)
        logits = model(x)
        loss = model.calc_loss(logits, y_target)
        WIN.append(loss.item())
# Mean loss over the 100 randomly sampled validation batches.
print("loss:", np.mean(WIN))

In [None]:
model.eval()
#xx, _ = get_batch(val_data, 1, device)
# Prompt: a single sequence holding only token id 0, created on the same
# device as the model so the embedding lookup does not hit a device mismatch.
xx = torch.zeros((1,1), dtype=torch.long, device=device)
# `generate` is a method of the model, not a module-level function.
yy = model.generate(xx, 100)
for i in range(len(xx)):
    # Convert the numpy row to plain ints before looking the ids up in `itos`.
    print(decode(yy[i].tolist()))

In [None]:
max_f32 = np.finfo(np.float32).max

#x = torch.softmax(torch.tril(torch.zeros((4,4)) + max_f32) - max_f32-100, dim=-1)
#x
# 0 ** -1 is +inf; negating gives a matrix of -inf. Keeping only the strict
# lower triangle and transposing leaves -inf strictly above the diagonal and
# 0 elsewhere, so the row-wise softmax averages uniformly over each position
# and the positions before it.
torch.zeros((8, 8)).pow(-1).neg().tril(diagonal=-1).T.softmax(dim=-1)

#torch.tensor([0,0,n_inf], dtype=torch.float32).softmax(0)