In [2]:
# E02: split up the dataset randomly into 80% train set, 10% dev set, 10% test set. 
#     Train the bigram and trigram models only on the training set.
#     Evaluate them on dev and test splits. What can you see?

# E01: train a trigram language model, i.e. take two characters as an input to predict the 3rd one.
# Feel free to use either counting or a neural net.
# Evaluate the loss; Did it improve over a bigram model?

words = open('names.txt', 'r').read().splitlines()


words[:10]

['emma',
 'olivia',
 'ava',
 'isabella',
 'sophia',
 'charlotte',
 'mia',
 'amelia',
 'harper',
 'evelyn']

In [3]:
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}


In [4]:
import torch
# create the training set of bigrams (x,y)
xs, ys = [], []

for w in words[:1]:
  chs = ['.'] + list(w) + ['.']
  for ch1, ch2,ch3 in zip(chs, chs[1:], chs[2:]):
    ix1 = stoi[ch1]
    ix2 = stoi[ch2]
    ix3 = stoi[ch3]
    xs.append([ix1,ix2])
    ys.append(ix3)
    
xs = torch.tensor(xs)
ys = torch.tensor(ys)

In [5]:
xs.shape



torch.Size([4, 2])

In [6]:
W = torch.randn((27*2,1))
W.shape
# xenc_flattened = xenc.view(4, 27 * 2)  # Reshape xenc to [batch_size, 27 * 2]

# # Perform matrix multiplication
# logits = torch.matmul(xenc_flattened, W)  # Output shape will be [4, 1]
# logits.shape

torch.Size([54, 1])

In [26]:
# randomly initialize 27 neurons' weights. each neuron receives 27 inputs
g = torch.Generator().manual_seed(2147483647)
W = torch.randn((27*2, 27), generator=g)



In [28]:
import torch.nn.functional as F
xenc = F.one_hot(xs, num_classes=27).float() # input to the network: one-hot encoding
logits = xenc.view(-1, 27*2) @ W # predict log-counts
counts = logits.exp() # counts, equivalent to N
probs = counts / counts.sum(1, keepdims=True) # probabilities for next character
# btw: the last 2 lines here are together called a 'softmax'

In [14]:
probs.shape

torch.Size([4, 27])

In [29]:
nlls = torch.zeros(4)
for i in range(4):
  # i-th trigram:
  x1 = xs[i][0].item() # input character index
  x2 = xs[i][1].item() # input character index
  y = ys[i].item() # label character index
  print('--------')
  print(f'trigram example {i+1}: {itos[x1]} {itos[x2]} {itos[y]} (indexes {x1},{x2},{y})')
  print('input to the neural net:', x1, x2)
  print('output probabilities from the neural net:', probs[i])
  print('label (actual next character):', y)
  p = probs[i, y]
  print('probability assigned by the net to the the correct character:', p.item())
  logp = torch.log(p)
  print('log likelihood:', logp.item())
  nll = -logp
  print('negative log likelihood:', nll.item())
  nlls[i] = nll

print('=========')
print('average negative log likelihood, i.e. loss =', nlls.mean().item())

--------
trigram example 1: . e m (indexes 0,5,13)
input to the neural net: 0 5
output probabilities from the neural net: tensor([0.0237, 0.0177, 0.0107, 0.0049, 0.0223, 0.0096, 0.0111, 0.0090, 0.0071,
        0.0424, 0.0704, 0.0511, 0.0196, 0.0240, 0.2683, 0.0824, 0.0320, 0.0058,
        0.1061, 0.0203, 0.0267, 0.0060, 0.0026, 0.0565, 0.0026, 0.0264, 0.0407])
label (actual next character): 13
probability assigned by the net to the the correct character: 0.023988453671336174
log likelihood: -3.730182647705078
negative log likelihood: 3.730182647705078
--------
trigram example 2: e m m (indexes 5,13,13)
input to the neural net: 5 13
output probabilities from the neural net: tensor([0.0224, 0.0482, 0.0110, 0.0675, 0.0691, 0.0033, 0.0108, 0.0058, 0.0093,
        0.0043, 0.1095, 0.0867, 0.0058, 0.0074, 0.0009, 0.0084, 0.0135, 0.0044,
        0.0651, 0.0046, 0.0327, 0.0632, 0.0720, 0.0006, 0.0040, 0.0079, 0.2616])
label (actual next character): 13
probability assigned by the net to the the 

In [30]:
# randomly initialize 27 neurons' weights. each neuron receives 27 inputs
g = torch.Generator().manual_seed(2147483647)
W = torch.randn((27*2, 27), generator=g, requires_grad=True)

In [31]:
# forward pass
xenc = F.one_hot(xs, num_classes=27).float() # input to the network: one-hot encoding
logits = xenc.view(4, 2*27) @ W # predict log-counts
counts = logits.exp() # counts, equivalent to N
probs = counts / counts.sum(1, keepdims=True) # probabilities for next character
loss = -probs[torch.arange(4), ys].log().mean()

RuntimeError: shape '[4, 54]' is invalid for input of size 10590102

In [18]:
print(loss.item())

4.095324993133545


In [19]:
# backward pass
W.grad = None # set to zero the gradient
loss.backward()

In [20]:
W.data += -0.1 * W.grad

In [32]:
# create the data set
xs, ys = [], []
for w in words:
  chs = ['.'] + list(w) + ['.']
  for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
    ix1 = stoi[ch1]
    ix2 = stoi[ch2]
    ix3 = stoi[ch3]
    xs.append([ix1, ix2])
    ys.append(ix3)
xs = torch.tensor(xs)
ys = torch.tensor(ys)
num = xs.nelement()


# initialize the 'network'
g = torch.Generator().manual_seed(2147483647)
W = torch.randn((27*2, 27), generator=g, requires_grad=True)
print(num)

392226


In [22]:
# gradient descent
for k in range(100):
  
  # forward pass
  xenc = F.one_hot(xs, num_classes=27).float() # input to the network: one-hot encoding
  logits = xenc.view(-1, 27*2) @ W # predict log-counts
  counts = logits.exp() # counts, equivalent to N
  probs = counts / counts.sum(1, keepdims=True) # probabilities for next character
  loss = -probs[torch.arange(ys.shape[0]), ys].log().mean() + 0.01*(W**2).mean()    #divided by 2 because we have a bigram here
  if(k>95):
    print(loss.item())
  
  # backward pass
  W.grad = None # set to zero the gradient
  loss.backward()
  
  # update
  W.data += -50 * W.grad

2.273597002029419
2.2732996940612793
2.2730085849761963
2.27272367477417


In [34]:
import random

# finally, sample from the 'neural net' model
g = torch.Generator().manual_seed(2147483647)

for i in range(5):
  
  out = []
  ix = 0
  iy = random.randint(1, 26)
  while True:
    
    # ----------
    # BEFORE:
    #p = P[ix]
    # ----------
    # NOW:
    xenc = F.one_hot(torch.tensor([ix, iy]), num_classes=27).float()
    logits = xenc.view(-1, 27*2) @ W # predict log-counts
    counts = logits.exp() # counts, equivalent to N
    p = counts / counts.sum(1, keepdims=True) # probabilities for next character
    # ----------
    
    ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
    out.append(itos[ix])
    if ix == 0:
      break
  print(''.join(out))

zexzmdzpglpufkvmpzvbyhdmpmziszytwdnmagddkzkafbm.
zkkzypucjtkhyygktzk.
iszislwtjujjwttsdjjkklwmkfjsldsufjkajjjnfrfjbspmhwcjmrhvtrhslsc.
pspxxblpwglppytw.
iszypwgpfdwmpkfbwmyderz.


In [24]:
x = torch.randn(4, 4)
x.size()

torch.Size([4, 4])

In [25]:
y = x.view(16, -1)

y.size()

torch.Size([16, 1])

In [26]:
import torch

# Define xenc and W
xenc = torch.randn(4, 2, 27)  # Example input tensor shape
W = torch.randn(27, 2, 1)      # Example weight matrix shape

# Perform matrix multiplication
logits = torch.matmul(xenc, W)  # Output shape will be [4, 2, 1]

# Now logits has the desired shape [4, 2, 1]
print(logits.size())


RuntimeError: The size of tensor a (4) must match the size of tensor b (27) at non-singleton dimension 0

In [None]:
RuntimeError                              Traceback (most recent call last)
Cell In[53], line 8
      5 W = torch.randn(27, 2, 1)      # Example weight matrix shape
      7 # Perform matrix multiplication
----> 8 logits = torch.matmul(xenc, W)  # Output shape will be [4, 2, 1]
     10 # Now logits has the desired shape [4, 2, 1]
     11 print(logits.size())

RuntimeError: The size of tensor a (4) must match the size of tensor b (27) at non-singleton dimension 0

In [None]:
import torch

# Define xenc and W
xenc = torch.randn(4, 2, 27)    # Example input tensor shape
W = torch.randn(27 * 2, 1)       # Reshaped weight matrix shape

# Reshape xenc to match the shape of W for matrix multiplication
xenc_flattened = xenc.view(4, 27 * 2)  # Reshape xenc to [batch_size, 27 * 2]

# Perform matrix multiplication
logits = torch.matmul(xenc_flattened, W)  # Output shape will be [4, 1]

print(logits)
# Now logits has the desired shape [4, 2, 1]
print(logits.size())
