In [None]:
# Uncomment on a fresh environment (%pip installs into this kernel's environment):
# %pip install torch torchvision torchaudio

In [None]:
import torch
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Hyperparameters shared by the cells below.
n_emb = 10       # embedding width (columns of Embedding.weight)
block_size = 8   # context length handed to batch_creation
batch_size = 4   # rows drawn by pick_batch for the demo batch
n_hidden = 64    # width of the hidden Linear layers

# Read the corpus, one entry per line. The `with` block closes the file handle
# as soon as the text has been read instead of leaving it to the garbage collector.
with open('list.txt', 'r') as corpus_file:
    words = corpus_file.read().splitlines()

# NOTE: despite its name, `vocab_size` is the sorted list of distinct characters
# found in `words`; the number of characters is `len(vocab_size)`.
vocab_size = sorted(set(''.join(words)))

In [None]:
# Character -> index, numbered by position in the character list.
stoi = dict(zip(vocab_size, range(len(vocab_size))))
# Index -> character: the inverse of `stoi`.
itos = dict(zip(stoi.values(), stoi.keys()))

In [None]:
def data_segmentation(n):
    """Split the module-level `words` list at position `n`.

    Returns a `(head, tail)` tuple: the first `n` entries and everything after them.
    """
    return words[:n], words[n:]

def batch_creation(data, token_size):
    """Turn strings into (context, next-index) training pairs.

    Every string in `data` is scanned with a '.' appended. Characters missing
    from `stoi`, or whose index is not below `len(vocab_size)`, are skipped.
    For each remaining character the current context window is recorded as an
    input and the character's index as its target.

    Index 0 serves both as the padding value of a fresh window and as the
    signal that resets the window and ends the current string. The window is
    NOT reset when a new string starts; it carries over from the previous one.

    Returns:
        (X, Y): list of context windows and list of target indices.
    """
    inputs, targets = [], []
    window = [0] * token_size
    for entry in data:
        for symbol in entry + '.':
            idx = stoi.get(symbol)
            usable = idx is not None and idx < len(vocab_size)
            if not usable:
                continue
            inputs.append(window)
            targets.append(idx)
            if idx != 0:
                # Slide the window: drop the oldest index, append the newest.
                window = [*window[1:], idx]
                continue
            # Index 0 seen: start over with an all-zero window and leave this string.
            window = [0] * token_size
            break
    return inputs, targets



def pick_batch(X, Y, batch_size):
    """Draw `batch_size` random (input, target) pairs from `X` and `Y`.

    Positions come from `torch.randint`, so the same pair can be drawn more
    than once. Each selected entry is converted to a tensor and the results
    are stacked along a new leading dimension.

    Returns:
        (xb, yb): stacked inputs and stacked targets.
    """
    chosen = torch.randint(len(X), (batch_size,)).tolist()

    rows, labels = [], []
    for pos in chosen:
        rows.append(torch.tensor(X[pos]))
        labels.append(torch.tensor(Y[pos]))

    return torch.stack(rows), torch.stack(labels)


# Split the corpus at entry 80000, build training / validation examples from
# each part, then draw one demo batch and display its shapes.
train_words, val_words = data_segmentation(80000)
Xtr, Ytr = batch_creation(train_words, block_size)
Xv, Yv = batch_creation(val_words, block_size)
xb, yb = pick_batch(X=Xtr, Y=Ytr, batch_size=batch_size)
xb.shape, yb.shape

(torch.Size([4, 8]), torch.Size([4]))

In [None]:
# Show the first 20 (context, target) pairs, followed by the sampled demo batch.
for context_row, target in zip(Xtr[:20], Ytr[:20]):
  print(context_row, " ----------->", target)
print(xb, '------------->', yb)

[0, 0, 0, 0, 0, 0, 0, 0]  -----------> 5
[0, 0, 0, 0, 0, 0, 0, 5]  -----------> 3
[0, 0, 0, 0, 0, 0, 5, 3]  -----------> 5
[0, 0, 0, 0, 0, 5, 3, 5]  -----------> 1
[0, 0, 0, 0, 5, 3, 5, 1]  -----------> 40
[0, 0, 0, 5, 3, 5, 1, 40]  -----------> 50
[0, 0, 5, 3, 5, 1, 40, 50]  -----------> 39
[0, 5, 3, 5, 1, 40, 50, 39]  -----------> 32
[5, 3, 5, 1, 40, 50, 39, 32]  -----------> 3
[3, 5, 1, 40, 50, 39, 32, 3]  -----------> 5
[5, 1, 40, 50, 39, 32, 3, 5]  -----------> 2
[1, 40, 50, 39, 32, 3, 5, 2]  -----------> 41
[40, 50, 39, 32, 3, 5, 2, 41]  -----------> 32
[50, 39, 32, 3, 5, 2, 41, 32]  -----------> 56
[39, 32, 3, 5, 2, 41, 32, 56]  -----------> 3
[32, 3, 5, 2, 41, 32, 56, 3]  -----------> 5
[3, 5, 2, 41, 32, 56, 3, 5]  -----------> 32
[5, 2, 41, 32, 56, 3, 5, 32]  -----------> 3
[2, 41, 32, 56, 3, 5, 32, 3]  -----------> 5
[41, 32, 56, 3, 5, 32, 3, 5]  -----------> 32
tensor([[52, 51, 57,  3,  7, 43, 52, 53],
        [36,  3,  8, 40, 38, 49, 36, 38],
        [39, 45,  3, 11, 49, 52

In [None]:
class Linear:
  """Affine layer: `x @ weight`, plus a bias vector unless disabled."""

  def __init__(self, fan_in, fan_out, bias=True):
    # Gaussian weights divided by sqrt(fan_in).
    scale = fan_in**0.5
    self.weight = torch.randn(fan_in, fan_out) / scale
    # The bias starts at zero; it is None when the layer is built with bias=False.
    if bias:
      self.bias = torch.zeros(fan_out)
    else:
      self.bias = None

  def __call__(self, x):
    result = x @ self.weight
    if self.bias is not None:
      result += self.bias
    # Keep the most recent output on the instance.
    self.out = result
    return result

  def parameters(self):
    """Return the layer's tensors: the weight, then the bias when present."""
    params = [self.weight]
    if self.bias is not None:
      params.append(self.bias)
    return params



class Tanh:
    """Parameter-free activation layer that delegates to `torch.nn.Tanh`."""

    def __init__(self):
        self.tanh = torch.nn.Tanh()

    def __call__(self, x):
        return self.tanh(x)

    def parameters(self):
        """This layer has nothing to train."""
        return []

class Embedding:
  """Lookup table that maps integer indices to rows of a weight matrix.

  Args:
    num_embeddings: number of rows in the table. When omitted, the size of
      the module-level character list `vocab_size` is used.
    embedding_dim: number of columns per row. When omitted, the module-level
      `n_emb` is used.

  Calling `Embedding()` with no arguments therefore behaves as before; the
  optional arguments let the layer be built without relying on globals.
  """

  def __init__(self, num_embeddings=None, embedding_dim=None):
    # Resolve the defaults at construction time so later changes to the
    # globals are picked up by newly created layers.
    if num_embeddings is None:
      num_embeddings = len(vocab_size)
    if embedding_dim is None:
      embedding_dim = n_emb
    self.weight = torch.randn(num_embeddings, embedding_dim)

  def __call__(self, IX):
    # Index the table with `IX`; the result is also kept on the instance.
    self.out = self.weight[IX]
    return self.out

  def parameters(self):
    """Return the single trainable tensor of this layer."""
    return [self.weight]

class Flatten:
    """Merge the last two axes of a three-axis tensor into a single axis."""

    def __call__(self, x):
        # The unpacking below only succeeds when `x` has exactly three axes.
        batch, steps, width = x.shape
        return x.view(batch, steps * width)

    def parameters(self):
        """This layer has nothing to train."""
        return []

class SoftmaxModel:
    """Thin wrapper exposing `torch.nn.Softmax(dim=1)` through `forward`."""

    def __init__(self):
        self.softmax = torch.nn.Softmax(dim=1)

    def forward(self, x):
        """Apply the stored softmax module to `x` and return the result."""
        probabilities = self.softmax(x)
        return probabilities

class BatchNorm1d:
  """Batch normalisation over the leading axis/axes of a 2-D or 3-D input.

  In training mode (`self.training` is True) the mean and variance are
  computed from the current input and folded into running estimates. In
  evaluation mode the stored running estimates are used instead.

  Args:
    dim: size of the normalised (last) axis.
    eps: constant added to the variance before the square root.
    momentum: weight given to the current batch statistics when the running
      estimates are updated.

  Raises:
    ValueError: in training mode, when the input is neither 2-D nor 3-D.
  """

  def __init__(self, dim, eps=1e-5, momentum=0.1):
    self.eps = eps
    self.momentum = momentum
    self.training = True
    # parameters (trained with backprop)
    self.gamma = torch.ones(dim)
    self.beta = torch.zeros(dim)
    # buffers (updated with a running 'momentum update', not by backprop)
    self.running_mean = torch.zeros(dim)
    self.running_var = torch.ones(dim)

  def __call__(self, x):
    if self.training:
      # Pick the axes to reduce over: everything except the last one.
      if x.ndim == 2:
        dim = 0
      elif x.ndim == 3:
        dim = (0, 1)
      else:
        # Previously this case left `dim` unassigned and failed with an
        # UnboundLocalError on the next line.
        raise ValueError(
          f"BatchNorm1d expects a 2-D or 3-D input in training mode, got {x.ndim}-D"
        )
      xmean = x.mean(dim, keepdim=True)  # batch mean
      xvar = x.var(dim, keepdim=True)    # batch variance
    else:
      xmean = self.running_mean
      xvar = self.running_var
    xhat = (x - xmean) / torch.sqrt(xvar + self.eps)  # normalize to unit variance
    self.out = self.gamma * xhat + self.beta
    # Update the buffers. Because the batch statistics are computed with
    # keepdim=True, the buffers take on that broadcastable shape after the
    # first update.
    if self.training:
      with torch.no_grad():
        self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * xmean
        self.running_var = (1 - self.momentum) * self.running_var + self.momentum * xvar
    return self.out

  def parameters(self):
    """Return the tensors trained by backprop: scale (gamma) and shift (beta)."""
    return [self.gamma, self.beta]

class Sequential:
  """Container that applies a list of layers one after another."""

  def __init__(self, layers):
    self.layers = layers

  def __call__(self, x):
    # Feed the output of each layer into the next one.
    activation = x
    for layer in self.layers:
      activation = layer(activation)
    self.out = activation
    return activation

  def parameters(self):
    """Return the parameters of every layer, concatenated in layer order."""
    collected = []
    for layer in self.layers:
      collected.extend(layer.parameters())
    return collected


In [None]:
"""model = BiagrammModel()
  # Use Adam optimizer for better stability

for i in range(50000):
    xb, yb = pick_batch(X, Y, batch_size)
    logits = model(xb)
    loss = F.cross_entropy(logits, yb)  # Don't apply Softmax before cross_entropy

    for p in model.parameters():
        p.grad = None

    loss.backward()

    for p in model.parameters():
        p.data += -0.1 * p.grad



    # Print the loss every 1000 iterations
    if i % 1000 == 0:
        print(f"Step {i}, Loss: {loss.item()}")"""

'model = BiagrammModel()\n  # Use Adam optimizer for better stability\n\nfor i in range(50000):\n    xb, yb = pick_batch(X, Y, batch_size)\n    logits = model(xb)\n    loss = F.cross_entropy(logits, yb)  # Don\'t apply Softmax before cross_entropy\n\n    for p in model.parameters():\n        p.grad = None\n\n    loss.backward()\n\n    for p in model.parameters():\n        p.data += -0.1 * p.grad\n\n\n\n    # Print the loss every 1000 iterations\n    if i % 1000 == 0:\n        print(f"Step {i}, Loss: {loss.item()}")'

In [None]:
# Layer sizes used by this model (same values as in the configuration cell).
n_emb = 10
n_hidden = 64

# Embedding -> Flatten -> 3 x (Linear, BatchNorm1d, Tanh) -> output Linear.
# Layers are created in the same order as they appear in the network.
layers = [Embedding(), Flatten()]
for fan_in in (n_emb * block_size, n_hidden, n_hidden):
  layers += [Linear(fan_in, n_hidden, bias=False), BatchNorm1d(n_hidden), Tanh()]
layers.append(Linear(n_hidden, len(vocab_size)))
model = Sequential(layers)

# Scale the output layer's initial weights down by a factor of 10.
with torch.no_grad():
  output_layer = model.layers[-1]
  output_layer.weight *= 0.1

# `parameters` is the list of trainable tensors; turn on gradient tracking for each.
parameters = model.parameters()
for p in parameters:
  p.requires_grad = True

In [None]:
# Flag every layer as training; BatchNorm1d reads this flag to decide whether
# to normalise with batch statistics.
for module in model.layers:
  module.training = True

In [None]:
# Mini-batch SGD over the training examples.
max_steps = 10000
batch_size = 32
lr = 0.001  # constant learning rate; no decay schedule is applied
lossi = []  # log10 of the loss, one entry per step

for i in range(max_steps):

  # minibatch construct
  Xb, Yb = pick_batch(Xtr, Ytr, batch_size)

  # forward pass (cross_entropy takes raw logits, so no softmax here)
  logits = model(Xb)
  loss = F.cross_entropy(logits, Yb)

  # backward pass: clear stale gradients, then backpropagate
  for p in parameters:
    p.grad = None
  loss.backward()

  # update: plain SGD. The step runs under no_grad so the in-place change is
  # not recorded by autograd, instead of going through `.data`.
  with torch.no_grad():
    for p in parameters:
      p += -lr * p.grad

  # track stats
  if i % 1000 == 0:  # print every once in a while
    print(f'{i:7d}/{max_steps:7d}: {loss.item():.4f}')
  lossi.append(loss.log10().item())




      0/  10000: 4.0657
   1000/  10000: 4.0109
   2000/  10000: 3.9694
   3000/  10000: 3.8732
   4000/  10000: 3.7817
   5000/  10000: 3.6752
   6000/  10000: 3.6041
   7000/  10000: 3.5659
   8000/  10000: 3.4351
   9000/  10000: 3.4685


In [None]:
# Flag every layer as not training; BatchNorm1d then normalises with its
# running statistics instead of batch statistics.
for module in model.layers:
  module.training = False

In [None]:
# Sample 20 sequences from the model, one index at a time.
# NOTE(review): the stop index 3 is hard-coded here, while batch_creation treats
# index 0 as its reset signal - confirm which index is the intended terminator.
stop_ix = 3
# Safety cap so a sample that never produces `stop_ix` cannot loop forever.
max_new_tokens = 200

# NOTE(review): the saved output of this cell is `KeyError: 0`. With `itos` built
# by enumerating from 0 that key exists, so the saved error presumably came from
# stale kernel state - re-run from a fresh kernel to confirm.
with torch.no_grad():  # inference only: no autograd graph needed
    for _ in range(20):
        out = []
        context = [0] * block_size  # start from an all-zero context window
        while len(out) < max_new_tokens:
            logits = model(torch.tensor([context]))
            probs = F.softmax(logits, dim=1)
            ix = torch.multinomial(probs, num_samples=1).item()
            # Slide the window: drop the oldest index, append the sampled one.
            context = context[1:] + [ix]
            out.append(ix)
            if ix == stop_ix:
                break

        print(''.join(itos[token] for token in out))

KeyError: 0

In [None]:
# Display the character -> index table for inspection.
stoi