# WaveNet Hyperparameter Tuning

In [None]:
# !pip install ray[tune]

In [None]:
# !pip install optuna

In [None]:
import numpy as np
import torch

import random
import os
import torch.nn as nn
import torch.nn.functional as F
from torch import optim

In [None]:
# Prefer the first CUDA GPU when one is visible, otherwise stay on the CPU
device = "cuda:0" if torch.cuda.is_available() else "cpu"

In [None]:
# Display the selected device
device

'cpu'

In [None]:
# Seed PyTorch's global RNG so runs are repeatable; the trailing `;` keeps the
# notebook from echoing the returned generator object
torch.manual_seed(42);

In [None]:
# Seed Python's `random` module, which is used to shuffle the word list
random.seed(42)

### Setup Data Loader

In [None]:
# Read the corpus, one entry per line; the context manager closes the file
# handle instead of leaving it open for the rest of the session
with open('../data/names.txt', 'r') as f:
    words = f.read().splitlines()

In [None]:
# Shuffle in place so the train/dev/test slices taken further down come from
# a random ordering of the corpus
random.shuffle(words)

In [None]:
def build_dataset(words, block_size=8, stoi=None):
    """Turn a list of words into (context, next-character) tensors.

    Every word is terminated with '.', and each character (terminator
    included) yields one example: the ``block_size`` preceding character
    indices, left-padded with 0, paired with the index of that character.

    Args:
        words: iterable of strings. It is not modified; a copy is shuffled
            with a fixed seed before encoding.
        block_size: number of context characters per example (>= 1).
        stoi: optional character -> index mapping with '.' mapped to 0.
            When omitted, the mapping is derived from ``words`` alone
            (sorted characters numbered from 1), so two calls only agree on
            the encoding if their word lists contain the same characters.
            Pass one shared mapping to encode several splits consistently.

    Returns:
        ``(X, Y)`` int64 tensors placed on the module-level ``device``.
        ``X`` has shape ``(num_examples, block_size)`` and ``Y`` has shape
        ``(num_examples,)``.

    Raises:
        ValueError: if ``block_size`` is smaller than 1.
        KeyError: if a character is missing from a supplied ``stoi``.
    """
    if block_size < 1:
        raise ValueError(f"block_size must be >= 1, got {block_size}")

    # Same ordering as `random.seed(42); random.shuffle(words)`, but done on
    # a copy with a private generator so neither the caller's list nor the
    # global `random` state is touched.
    words = list(words)
    random.Random(42).shuffle(words)

    if stoi is None:
        chars = sorted(set(''.join(words)))
        stoi = {s: i + 1 for i, s in enumerate(chars)}
        stoi['.'] = 0

    X, Y = [], []
    for w in words:
        context = [0] * block_size  # index 0 ('.') doubles as left padding
        for ch in w + '.':
            ix = stoi[ch]
            X.append(context)
            Y.append(ix)
            context = context[1:] + [ix]  # slide the window by one character

    # Explicit dtype and shape keep an empty word list from producing
    # float, one-dimensional tensors.
    X = torch.tensor(X, dtype=torch.long).reshape(len(X), block_size).to(device)
    Y = torch.tensor(Y, dtype=torch.long).to(device)
    return X, Y

# Cut points for an 80% / 10% / 10% train / dev / test split of the word list
n1, n2 = (int(frac * len(words)) for frac in (0.8, 0.9))

# Encode each slice independently, in train, dev, test order
(Xtr, Ytr), (Xdev, Ydev), (Xte, Yte) = (
    build_dataset(part) for part in (words[:n1], words[n1:n2], words[n2:])
)

In [None]:
# Display the shape of the training inputs
Xtr.shape

torch.Size([182625, 8])

### Create Model 

In [None]:
# --- Flatten Consecutive ---
class FlattenConsecutive(nn.Module):
    """Concatenate every ``n`` neighbouring time steps into one feature vector."""

    def __init__(self, n):
        super().__init__()
        self.n = n  # number of consecutive steps merged together

    def forward(self, x):
        """Reshape ``(B, T, C)`` into ``(B, T // n, C * n)``.

        When a single group remains, the length-1 middle axis is dropped and
        the result is ``(B, C * n)``. The result is also stored on
        ``self.out``.
        """
        batch, steps, channels = x.shape
        groups = steps // self.n
        merged = x.reshape(batch, groups, channels * self.n)
        self.out = merged.squeeze(1) if groups == 1 else merged
        return self.out

# -- SwapDim ---
class SwapDim(nn.Module):
    """Exchange axes 1 and 2 of the input tensor."""

    def forward(self, x):
        return x.transpose(1, 2)

# -- SwapDimBack -- 
class SwapDimBack(nn.Module):
    """Exchange axes 1 and 2 again, undoing an earlier swap of the same axes."""

    def forward(self, x):
        return x.transpose(1, 2)

In [None]:
# Sizes for the exploratory network built in this cell
vocab_size, n_embd, n_hidden = 27, 24, 128

# Three "merge pairs -> linear -> batch norm -> tanh" stages. The first two
# stages swap axes 1 and 2 around BatchNorm1d; the last stage applies
# BatchNorm1d directly to the output of its linear layer.
model = nn.Sequential(
    nn.Embedding(vocab_size, n_embd),
    # stage 1
    FlattenConsecutive(2),
    nn.Linear(n_embd * 2, n_hidden, bias=False),
    SwapDim(),
    nn.BatchNorm1d(n_hidden),
    SwapDimBack(),
    nn.Tanh(),
    # stage 2
    FlattenConsecutive(2),
    nn.Linear(n_hidden * 2, n_hidden, bias=False),
    SwapDim(),
    nn.BatchNorm1d(n_hidden),
    SwapDimBack(),
    nn.Tanh(),
    # stage 3
    FlattenConsecutive(2),
    nn.Linear(n_hidden * 2, n_hidden, bias=False),
    nn.BatchNorm1d(n_hidden),
    nn.Tanh(),
    # output projection is commented out here, so the model ends at the tanh:
    # nn.Linear(n_hidden, vocab_size),
)

In [None]:
# Smoke test: run four randomly chosen training rows through the network
ix = torch.randint(len(Xtr), (4,))
Xb = Xtr[ix]
Yb = Ytr[ix]
logits = model(Xb)
print(Xb.shape)
Xb, logits.shape

torch.Size([4, 8])


(tensor([[13,  1, 18,  9, 10,  1, 14,  5],
         [ 0,  0,  0,  0,  1,  2,  4, 15],
         [ 0,  0,  0,  0,  0,  0,  0,  0],
         [ 0,  0,  0,  0,  8,  1, 26,  1]]),
 torch.Size([4, 128]))

In [None]:
def build_model(n_embd, n_hidden, last_layer_factor=0.1, vocab_size=27):
    """Build the hierarchical character model and print its parameter count.

    The network embeds each character, then applies three
    "merge pairs -> linear -> batch norm -> tanh" stages followed by a linear
    output layer. Because three pairwise merges must reduce the context to a
    single position before the last BatchNorm1d, it is built for a context
    length of 8.

    Args:
        n_embd: dimensionality of the character embedding vectors.
        n_hidden: number of units in each hidden stage.
        last_layer_factor: factor the output layer's initial weights are
            multiplied by.
        vocab_size: number of distinct character indices; sizes the embedding
            table and the output layer.

    Returns:
        The freshly initialised ``nn.Sequential`` model.
    """
    model = nn.Sequential(
        nn.Embedding(vocab_size, n_embd),
        FlattenConsecutive(2), nn.Linear(n_embd * 2, n_hidden, bias=False),
        SwapDim(), nn.BatchNorm1d(n_hidden), SwapDimBack(), nn.Tanh(),
        FlattenConsecutive(2), nn.Linear(n_hidden * 2, n_hidden, bias=False),
        SwapDim(), nn.BatchNorm1d(n_hidden), SwapDimBack(), nn.Tanh(),
        FlattenConsecutive(2), nn.Linear(n_hidden * 2, n_hidden, bias=False),
        nn.BatchNorm1d(n_hidden), nn.Tanh(),
        nn.Linear(n_hidden, vocab_size),
    )

    # parameter init: scale the output layer's weights in place
    with torch.no_grad():
        model[-1].weight *= last_layer_factor

    # model.parameters() is a one-shot generator; materialise it so that both
    # the count and the requires_grad loop see every parameter.
    parameters = list(model.parameters())
    print("No of parameters ", sum(p.nelement() for p in parameters))
    for p in parameters:
        p.requires_grad = True
    return model

In [None]:
# Instantiate the full model with the default output-layer scaling
model = build_model(n_embd=24, n_hidden=128)

No of parameters  76579


In [None]:
# Smoke test: run four randomly chosen training rows through the model
ix = torch.randint(len(Xtr), (4,))
Xb = Xtr[ix]
Yb = Ytr[ix]
logits = model(Xb)
print(Xb.shape)
Xb

torch.Size([4, 8])


tensor([[ 0,  0,  0,  0, 19,  8,  1, 25],
        [ 0,  0,  0,  0,  0,  4,  5, 22],
        [ 0,  0,  0,  0,  0,  0,  0,  0],
        [ 0,  0,  0,  0,  0,  0,  0,  1]])

In [None]:
# Display the shape of the model output for the sampled batch
logits.shape

torch.Size([4, 27])

In [None]:
def train(config, checkpoint_dir=None):
    """Train a fresh model on the module-level training split and return it.

    Reads the module-level tensors ``Xtr`` and ``Ytr``, builds a model with
    ``build_model``, and runs ``max_steps`` AdamW updates on uniformly
    sampled minibatches. Progress is printed every 10,000 steps.

    Args:
        config: mapping with the keys ``n_embd``, ``n_hidden``,
            ``last_layer_factor``, ``max_steps``, ``lr`` and ``batch_size``.
        checkpoint_dir: accepted but not used by this function.

    Returns:
        The trained model. ``eval()`` is never called here, so switch modes
        before using it for evaluation.

    Raises:
        KeyError: if a required key is missing from ``config``.
    """
    n_embd = config['n_embd']
    n_hidden = config['n_hidden']
    last_layer_factor = config['last_layer_factor']
    max_steps = config['max_steps']
    lr = config['lr']
    batch_size = config['batch_size']

    # build_model creates its parameters on the default device; move them to
    # wherever the data lives so the forward pass also works when the
    # dataset was placed on a GPU.
    model = build_model(n_embd, n_hidden, last_layer_factor).to(Xtr.device)

    # Reporting only: skip autograd so the full-dataset pass does not build
    # a graph that is never back-propagated.
    with torch.no_grad():
        train_loss = F.cross_entropy(model(Xtr), Ytr)
    print('Initial loss ', train_loss.item())

    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

    for i in range(max_steps):
        # minibatch construct
        ix = torch.randint(0, Xtr.shape[0], (batch_size,))
        Xb, Yb = Xtr[ix], Ytr[ix]

        logits = model(Xb)
        loss = F.cross_entropy(logits, Yb)
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

        # track stats
        if i % 10_000 == 0:
            print(f'{i:7d}/{max_steps:7d}: {loss.item():.4f}')

    return model

In [None]:
# Hyperparameters for a single baseline training run
config = dict(
    n_embd=24,
    n_hidden=128,
    lr=0.001,
    last_layer_factor=0.1,
    batch_size=32,
    max_steps=200_000,
)

In [None]:
# Run one training job with `config` and keep the returned model
m = train(config)

No of parameters  76579
Initial loss  tensor(3.3022, grad_fn=<NllLossBackward0>)
      0/ 200000: 3.3107
  10000/ 200000: 2.1195
  20000/ 200000: 2.1580
  30000/ 200000: 2.0142
  40000/ 200000: 2.1674
  50000/ 200000: 2.5029
  60000/ 200000: 1.7215
  70000/ 200000: 2.0961
  80000/ 200000: 2.1328
  90000/ 200000: 2.2157
 100000/ 200000: 2.0725
 110000/ 200000: 2.1434
 120000/ 200000: 1.8127
 130000/ 200000: 1.8254


In [None]:
# Inspect the trained model's raw outputs on four random training rows
ix = torch.randint(len(Xtr), (4,))
Xb = Xtr[ix]
Yb = Ytr[ix]
logits = m(Xb)
logits

In [None]:
# Put the model in evaluation mode before scoring. In training mode each
# BatchNorm layer would normalise with the statistics of the split being
# scored and fold those statistics into its running averages.
m.eval()
with torch.no_grad():
    train_loss = F.cross_entropy(m(Xtr), Ytr).item()
    val_loss = F.cross_entropy(m(Xdev), Ydev).item()
    print(train_loss, val_loss)