# [Paper]()

- [ ] Activation functions
- [ ] Batch Normalization. 
- [ ] Residual connections 
- [ ] the Adam optimizer
- [ ] Backpropagation


In [1]:
"""
 - [ ] Activation functions
 - [ ] Batch Normalization. 
 - [ ] Residual connections 
 - [ ] the Adam optimizer
 - [ ] Backpropagation
"""
# [Paper]()

'\n - [ ] Activation functions\n - [ ] Batch Normalization. \n - [ ] Residual connections \n - [ ] the Adam optimizer\n - [ ] Backpropagation\n'

In [2]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt

%matplotlib inline
g = torch.Generator().manual_seed(2147483647) # for reproducibility

torch.__version__


'2.1.1+cu121'

In [3]:
import random
with open("names.txt", "r+") as f:
	words = f.read().splitlines()
	words = [word.strip() for word in words] # get rid of any trailing spaces
	words = [w for w in words if w] # get rid of any empty strings
	names = sorted(words, key=lambda x: random.random())

min_chars = min(len(v) for v in names)
max_chars = max(len(v) for v in names)
chars = sorted(list(set("".join(names))))

# in replacement of the start and end token. Every name should end with a period. and there should be no start token to begin a sequence
chars = ['.'] + chars
chars_count = len(chars)
print("names: ", names[:5])
print("number of names: ", len(names))
print("(list of chars, count): ", ("".join(chars), chars_count))
print("(max word length, min word length): ", (max_chars, min_chars))

atoi = {ch:i for i,ch in enumerate(chars)}
itoa = {i:ch for i,ch in enumerate(chars)}

# adding end token to each name
names = [list(name) + ['.'] for name in names]

names:  ['cionna', 'abagail', 'malai', 'saathvik', 'paiton']
number of names:  32033
(list of chars, count):  ('.abcdefghijklmnopqrstuvwxyz', 27)
(max word length, min word length):  (15, 2)


In [4]:
# hyper-params
n_embd = 10
block_size  = 3 # context length
n_embd2 = 200 # intermediate weight size
lr = 0.1 # determined based on graph
decay_rate = 0.01
batch_size = 64

In [5]:
# build_dset basically builds a rolling window on the dataset based on the context length.
def build_dset(dset, ctxt_len):
    X, Y = [], []
    for name in dset:
        context  = [0] * ctxt_len
        for ch in name:
            ix = atoi[ch]
            X.append(context)
            Y.append(ix)
            # print(''.join(itoa[i] for i in context), '--->', itoa[ix])
            context = context[1:] + [ix] # crop and append
    
    X = torch.tensor(X)
    Y = torch.tensor(Y)
    return X, Y
n1 = int(0.8*len(names))
n2 = int(0.9*len(names))

X_train, Y_train = build_dset(names[:n1], block_size)
X_val, Y_val = build_dset(names[n1:n2], block_size)
X_test, Y_test = build_dset(names[n2:], block_size)

for c, d in zip(X_train[:5], Y_train[:5]):
    print(''.join(itoa[i.item()] for i in c), "=>", itoa[d.item()])



... => c
..c => i
.ci => o
cio => n
ion => n


In [6]:
# train-test split(80, 10, 10)
n1 = int(0.8*len(words))
n2 = int(0.9*len(words))

# parameters
C = torch.randn((27, n_embd), generator=g)
W1 = torch.randn((block_size * n_embd, n_embd2), generator=g) # hidden layer
b1 = torch.randn(n_embd2, generator=g)
W2 = torch.randn((n_embd2, 27), generator=g) # output layer
b2 = torch.randn(27, generator=g)

params = [C, W1, b1, W2, b2]

for p in params:
    p.requires_grad = True # autograd should record operations

In [7]:
# BACKPROPAGATION

max_steps = 20000
losses = []

for i in range(max_steps):
    # minibatch construct - for efficiency
    ix = torch.randint(0, X_train.shape[0], (batch_size,))

    # forward pass
    emb = C[X_train[ix]].view(-1, 30)
    v = emb @ W1 + b1
    h = torch.tanh(v) # intermediate layer
    logits = h @ W2 + b2
        
    loss = F.cross_entropy(logits, Y_train[ix])
    if i % 1000 == 0:
        print("(", (i * 100) / max_steps , ") loss = ", loss.item())
    
    # backward pass
    for p in params:
        p.grad = None
    loss.backward()

    # update
    lri = lr if max_steps > 10000 else decay_rate
    for p in params:
        p.data += - lri * p.grad 
    
    losses.append(loss)

( 0.0 ) loss =  24.17746925354004
( 5.0 ) loss =  3.5246422290802
( 10.0 ) loss =  2.8388516902923584
( 15.0 ) loss =  2.8533453941345215
( 20.0 ) loss =  2.642613410949707
( 25.0 ) loss =  2.3912975788116455
( 30.0 ) loss =  2.3728432655334473
( 35.0 ) loss =  2.677912712097168
( 40.0 ) loss =  2.1868112087249756
( 45.0 ) loss =  2.750415086746216
( 50.0 ) loss =  2.6992030143737793
( 55.0 ) loss =  2.7639715671539307
( 60.0 ) loss =  2.2791590690612793
( 65.0 ) loss =  2.637249231338501
( 70.0 ) loss =  2.218799114227295
( 75.0 ) loss =  2.6497700214385986
( 80.0 ) loss =  2.7670018672943115
( 85.0 ) loss =  2.2003753185272217
( 90.0 ) loss =  2.464599132537842
( 95.0 ) loss =  2.5227200984954834


In [None]:
plt.plot(losses)

In [10]:
@torch.no_grad()
def loss(x, y):
    emb = C[x].view(-1, 30)
    v = emb @ W1 + b1
    h = torch.tanh(v) # intermediate layer
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, y)
    return loss

print("training loss: ", loss(X_train, Y_train))
print("test loss: ", loss(X_test, Y_test))


training loss:  tensor(2.3842)
test loss:  tensor(2.4027)


### Stats
|iter| train | test | method |
|--------|-----------| -------|---------|
| 1 | 2.834 | 2.4027 | --- |
| 2|  | | tuning output weights and bias - softmax | 
| 3|  | | tuning log params - tanh layer saturation | 
|4 | | | using kaiming init |
|5 | | | add batch norm layer |
| 6| | | running batch norm | 


#fff

| 1 | 2.834 | 2.4027 |  |
| 2| | | tuning bias & weights | 