In [2]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt

%matplotlib inline

torch.__version__

'2.1.1+cu121'

In [1]:
import random
with open("names.txt", "r+") as f:
	words = f.read().splitlines()
	words = [word.strip() for word in words] # get rid of any trailing spaces
	words = [w for w in words if w] # get rid of any empty strings
	names = sorted(words, key=lambda x: random.random())
	
with open("names.txt", "w") as f: 
	joined = "\n".join(names)
	f.write(joined)
min_chars = 1
max_chars = max(len(v) for v in names)
chars = sorted(list(set("".join(names))))

# in replacement of the start and end token. Every name should end with a period. and there should be no start token to begin a sequence
chars = ['.'] + chars
chars_count = len(chars)
print("names: ", names[:5])
print("number of names: ", len(names))
print("(list of chars, count): ", ("".join(chars), chars_count))
print("(max word length, min word length): ", (max_chars, min_chars))

atoi = {ch:i for i,ch in enumerate(chars)}
itoa = {i:ch for i,ch in enumerate(chars)}

# adding end token to each name
names = [list(name) + ['.'] for name in names]

names:  ['tyrie', 'kesia', 'lainey', 'avyana', 'mylan']
number of names:  32033
(list of chars, count):  ('.abcdefghijklmnopqrstuvwxyz', 27)
(max word length, min word length):  (15, 1)


In [3]:
block_size = 3 # context length: how many characters do we take to predict the next one?
X, Y = [], []

for name in names:
    context  = [0] * block_size
    for ch in name:
        ix = atoi[ch]
        X.append(context)
        Y.append(ix)
        # print(''.join(itoa[i] for i in context), '--->', itoa[ix])
        context = context[1:] + [ix] # crop and append
        
X, Y = torch.tensor(X), torch.tensor(Y)
X.shape, Y.shape, X.dtype, Y.dtype

(torch.Size([228146, 3]), torch.Size([228146]), torch.int64, torch.int64)

In [4]:
# build_dset basically builds a rolling window on the dataset based on the context length.
def build_dset(dset, ctxt_len):
    X, Y = [], []
    for name in dset:
        context  = [0] * ctxt_len
        for ch in name:
            ix = atoi[ch]
            X.append(context)
            Y.append(ix)
            # print(''.join(itoa[i] for i in context), '--->', itoa[ix])
            context = context[1:] + [ix] # crop and append
    
    X = torch.tensor(X)
    Y = torch.tensor(Y)
    return X, Y
n1 = int(0.1*len(words))
n2 = int(0.9*len(words))

X_train, Y_train = build_dset(names[:n1], block_size)
X_val, Y_val = build_dset(names[n1:n2], block_size)
X_test, Y_test = build_dset(names[n2:], block_size)

for c, d in zip(X_train[:5], Y_train[:5]):
    print(''.join(itoa[i.item()] for i in c), "=>", itoa[d.item()])

(X_train.shape, Y_train.shape), (X_val.shape, Y_val.shape), (X_test.shape, Y_test.shape)

... => t
..t => y
.ty => r
tyr => i
yri => e


((torch.Size([22802, 3]), torch.Size([22802])),
 (torch.Size([182481, 3]), torch.Size([182481])),
 (torch.Size([22863, 3]), torch.Size([22863])))

In [5]:
g = torch.Generator().manual_seed(2147483647) # for reproducibility

# squeezing it into a 2 dimensional space 
# since in the paper, a relatively smaller tensor(30 and 60) (to the dataset[17_000]) is used. 
C = torch.randn((27, 2), generator=g)

x_enc = F.one_hot(X_train, num_classes=27).float()
Z = x_enc @ C

# this is another way of doing x \times W 
emb = C[X_train]

assert torch.equal(Z, emb)
print("The shape of Z(emb): ", Z.shape)

The shape of Z(emb):  torch.Size([22877, 3, 2])


In [6]:
# changing shape of Z(emb) so as to cross weight 
A = torch.cat(torch.unbind(Z, 1), dim=1)
A.shape

# This is a more efficient implementation of the above
Z_shaped = Z.view(-1, 6)

assert torch.equal(A, Z_shaped)

# here let's use a 100 dimensional space
W1 = torch.randn((6, 100), generator=g)
b1 = torch.randn(100, generator=g)
 
h = torch.tanh(Z_shaped @ W1 + b1) #hidden layer

print("Shape of hidden layer: ", h.shape)

W2 = torch.randn((100, 27), generator=g)
b2 = torch.randn(27, generator=g)

logits = h @ W2 + b2 # log-counts
print("Shape of logits: ", logits.shape)

# normalization
counts = logits.exp()
probs = counts / counts.sum(1, keepdims=True)

print("Shape of probability space: ", probs.shape)

params = [C, W1, b1, W2, b2]

print("Total number of parameters: ", sum(p.nelement() for p in params))

Shape of hidden layer:  torch.Size([22877, 100])
Shape of logits:  torch.Size([22877, 27])
Shape of probability space:  torch.Size([22877, 27])
Total number of parameters:  3481


In [7]:
# negative mean log likelihood (cross entropy loss)
# print(probs.shape[0], Y_train.shape[0])
loss = -probs[torch.arange(probs.shape[0]), Y_train].log().mean()

# this is a better way to implement the above
loss2 = F.cross_entropy(logits, Y_train)

print(f"loss={loss} loss2={loss2}")
# assert torch.equal(loss, loss2), 

loss=19.509521484375 loss2=19.509521484375


In [7]:
# REDOING IT AGAIN MORE CLEARELY
# hyper-params
embedding_size = 10
block_size  = 3 # context length
mid_w_size = 200 # intermediate weight size
lre = torch.linspace(-3, 1, 100)
lrs = 10**lre # a range of learning rate
print(lrs.dtype)

torch.float32


In [8]:
# BACKPROPAGATION


# train-test split(80, 10, 10)
n1 = int(0.8*len(words))
n2 = int(0.9*len(words))

X_train, Y_train = build_dset(names[:n1], block_size)
X_val, Y_val = build_dset(names[n1:n2], block_size)
X_test, Y_test = build_dset(names[n2:], block_size)

g = torch.Generator().manual_seed(2147483647) # for reproducibility
C = torch.randn((27, embedding_size), generator=g)
W1 = torch.randn((block_size * embedding_size, mid_w_size), generator=g)
b1 = torch.randn(mid_w_size, generator=g)
W2 = torch.randn((mid_w_size, 27), generator=g)
b2 = torch.randn(27, generator=g)

params = [C, W1, b1, W2, b2]

for p in params:
    p.requires_grad = True # autograd should record operations


In [1]:
# BACKPROPAGATION
# minibatch construct
# ix = torch.randint(0, X_train.shape[0], (32,))

# print(emb.shape, C[X_train].shape)  
count = 100
step_i = []
lr_i = []
loss_i = []

for i in range(count):
    # forward pass
    emb = C[X_train].view(-1, 30)
    h = torch.tanh(emb.view(-1, block_size * embedding_size) @ W1 + b1) # intermediate layer
    logits = h @ W2 + b2
        
    loss = F.cross_entropy(logits, Y_train)
    print("(", i ,"/", count, ") loss = ", loss.item())
    
    # backward pass
    for p in params:
        p.grad = None
    loss.backward()

    # update
    for p in params:
        p.data += - lrs[i] * p.grad 

    # track stats
    lr_i.append(lrs[i])
    step_i.append(i)
    loss_i.append(loss)

NameError: name 'C' is not defined

In [1]:
plt.plot(step_i, lr_i)

NameError: name 'plt' is not defined