# Wavenet Exercise

## Hyperparameter Tuning

In [23]:
import numpy
import torch
from ray import tune
import torch.nn.functional as F
from ray import tune, air
from ray.air import session
from ray.tune.search.optuna import OptunaSearch
from ray.tune.search.hyperopt import HyperOptSearch

In [2]:
class Tanh:
    """Parameter-free layer that applies ``tanh`` element-wise."""

    def parameters(self):
        """Return the trainable tensors of this layer (there are none)."""
        return []

    def __call__(self, x):
        """Apply ``torch.tanh`` to ``x``, cache the result on ``self.out`` and return it."""
        activated = torch.tanh(x)
        self.out = activated
        return activated

In [3]:
class Linear:
    """Affine layer computing ``x @ weight`` plus an optional bias."""

    def __init__(self, fan_in, fan_out, bias=True):
        """Create a ``(fan_in, fan_out)`` weight matrix and, if requested, a zero bias."""
        # Gaussian weights scaled by 1/sqrt(fan_in) (noted as kaiming init in the original).
        self.weight = torch.randn((fan_in, fan_out)) / fan_in ** 0.5
        self.bias = None
        if bias:
            self.bias = torch.zeros(fan_out)

    def __call__(self, x):
        """Project ``x`` through the layer, cache the result on ``self.out`` and return it."""
        projected = x @ self.weight
        if self.bias is not None:
            # Added in place on the freshly created matmul result.
            projected += self.bias
        self.out = projected
        return projected

    def parameters(self):
        """Return ``[weight]`` or ``[weight, bias]`` depending on whether a bias exists."""
        params = [self.weight]
        if self.bias is not None:
            params.append(self.bias)
        return params

In [4]:
class Flatten:
    """Collapse every dimension after the first into a single one."""

    def parameters(self):
        """This layer has nothing to train."""
        return []

    def __call__(self, x):
        """View ``x`` as ``(x.shape[0], -1)``, cache it on ``self.out`` and return it."""
        leading = x.shape[0]
        flattened = x.view(leading, -1)
        self.out = flattened
        return flattened

In [5]:
class Embedding:
    """Lookup table that maps integer indices to rows of a weight matrix."""

    def __init__(self, num_embeddings, embedding_dim):
        """Create a ``(num_embeddings, embedding_dim)`` table of standard-normal values."""
        table_shape = (num_embeddings, embedding_dim)
        self.weight = torch.randn(table_shape)

    def __call__(self, IX):
        """Index the table with ``IX``, cache the rows on ``self.out`` and return them."""
        rows = self.weight[IX]
        self.out = rows
        return rows

    def parameters(self):
        """Return the embedding table as the only trainable tensor."""
        return [self.weight]

In [6]:
class Sequential:
    """Container that feeds its input through a list of layers in order."""

    def __init__(self, layers):
        """Store ``layers``; each must be callable and expose ``parameters()``."""
        self.layers = layers

    def __call__(self, x):
        """Run ``x`` through every layer in order; cache and return the final value."""
        activation = x
        for layer in self.layers:
            activation = layer(activation)
        self.out = activation
        return activation

    def parameters(self):
        """Return the parameters of all layers, concatenated into one flat list."""
        collected = []
        for layer in self.layers:
            collected.extend(layer.parameters())
        return collected

In [7]:
class BatchNorm1d:
    """Batch normalization over every dimension except the last (feature) one.

    In training mode the mean and variance are computed from the current
    input and the running buffers are updated. Otherwise the running buffers
    are used as the statistics.

    Args:
        dim: size of the last (feature) dimension of the inputs.
        eps: constant added to the variance before taking the square root.
        momentum: weight given to the current batch statistics when the
            running buffers are updated.
    """

    def __init__(self, dim, eps=1e-5, momentum=0.1):
        self.eps = eps
        self.momentum = momentum
        self.training = True
        # parameters (trained with backprop)
        self.gamma = torch.ones(dim)
        self.beta = torch.zeros(dim)
        # buffers (trained with a running `momentum update`)
        self.running_mean = torch.zeros(dim)
        self.running_var = torch.ones(dim)

    def __call__(self, x):
        """Normalize ``x``, cache the result on ``self.out`` and return it.

        Raises:
            ValueError: in training mode, if ``x`` has fewer than 2 dimensions
                (there is then no batch dimension to reduce over).
        """
        # calculate the forward pass
        if self.training:
            if x.ndim < 2:
                raise ValueError(
                    f"BatchNorm1d expects an input with at least 2 dimensions "
                    f"in training mode, got {x.ndim}"
                )
            # Reduce over every leading axis: (0,) for 2D input, (0, 1) for
            # 3D input, and likewise for higher ranks. Previously only ranks
            # 2 and 3 were handled and any other rank left `dim` unbound.
            reduce_dims = tuple(range(x.ndim - 1))
            xmean = x.mean(reduce_dims, keepdim=True)
            xvar = x.var(reduce_dims, keepdim=True)
        else:
            xmean = self.running_mean
            xvar = self.running_var

        xhat = (x - xmean) / torch.sqrt(xvar + self.eps)
        self.out = self.gamma * xhat + self.beta
        # update the buffers
        if self.training:
            with torch.no_grad():
                # The batch statistics keep their reduced axes (keepdim=True),
                # so after the first update the buffers have the input's rank
                # with size 1 on every axis but the last.
                self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * xmean
                self.running_var = (1 - self.momentum) * self.running_var + self.momentum * xvar
        return self.out

    def parameters(self):
        """Return the trainable scale (``gamma``) and shift (``beta``)."""
        return [self.gamma, self.beta]

In [8]:
class FlattenConsecutive:
    """Merge every ``n`` consecutive time steps of a ``(B, T, C)`` tensor into one."""

    def __init__(self, n):
        """Remember how many consecutive steps are concatenated per group."""
        self.n = n

    def parameters(self):
        """This layer has nothing to train."""
        return []

    def __call__(self, x):
        """View ``x`` as ``(B, T // n, C * n)``; drop the middle axis when it has size 1."""
        batch, steps, channels = x.shape
        merged = x.view(batch, steps // self.n, channels * self.n)
        # A single remaining group is returned as a 2D (B, C * n) tensor.
        self.out = merged.squeeze(1) if merged.shape[1] == 1 else merged
        return self.out

In [9]:
def build_model(n_embd, # the dimensionality of the character embedding vectors
                n_hidden, # the number of neurons in the hidden layer of the MLP
                vocab_size=27 # the number of distinct tokens (embedding rows and output logits)
               ):
    """Build the hierarchical model and enable gradients on its parameters.

    The network is an embedding followed by three stages of
    ``FlattenConsecutive(2) -> Linear -> BatchNorm1d -> Tanh`` and a final
    ``Linear`` producing ``vocab_size`` logits. The total parameter count is
    printed as a side effect.

    Args:
        n_embd: width of each token embedding.
        n_hidden: width of every hidden layer.
        vocab_size: number of tokens; defaults to 27, the value that was
            previously hard-coded here.

    Returns:
        The ``Sequential`` model with ``requires_grad`` set on every parameter.
    """
    model = Sequential([
        Embedding(vocab_size, n_embd),
        FlattenConsecutive(2), Linear(n_embd*2, n_hidden, bias=False), BatchNorm1d(n_hidden), Tanh(),
        FlattenConsecutive(2), Linear(n_hidden*2, n_hidden, bias=False), BatchNorm1d(n_hidden), Tanh(),
        FlattenConsecutive(2), Linear(n_hidden*2, n_hidden, bias=False), BatchNorm1d(n_hidden), Tanh(),
        Linear(n_hidden, vocab_size),
    ])

    # parameter init: shrink the weights of the last layer
    with torch.no_grad():
        model.layers[-1].weight *= 0.1

    parameters = model.parameters()
    print(sum(p.nelement() for p in parameters))
    for p in parameters:
        p.requires_grad = True
    return model

In [10]:
# Load the names corpus, one entry per line. Reading inside a `with` block
# closes the file handle as soon as the contents are in memory (the bare
# `open(...).read()` form never closed it explicitly).
with open('/Users/anubhavmaity/projects/NeuralNetworks-Zero-To-Hero/nbs/data/names.txt', 'r') as names_file:
    words = names_file.read().splitlines()
print(len(words))
print(max(len(w) for w in words))
print(words[:8])

32033
15
['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']


In [11]:
def get_map(words):
    """Build the character-to-index mapping for ``words``.

    Every distinct character found in ``words`` gets an index starting at 1,
    in sorted order; ``'.'`` is then mapped to 0.

    Args:
        words: iterable of strings.

    Returns:
        dict mapping each character to its integer index.
    """
    chars = sorted(set(''.join(words)))
    stoi = {ch: idx + 1 for idx, ch in enumerate(chars)}
    # Assigned last so '.' is always 0, even if it also occurs in `words`.
    stoi['.'] = 0
    return stoi

In [12]:
def build_dataset(words, block_size=8, stoi=None):
    """Turn ``words`` into (context, next-character) training pairs.

    Each word is terminated with ``'.'``. For every character, the previous
    ``block_size`` indices (padded with 0 at the start of a word) form one
    input row and the character's own index is the target.

    Args:
        words: list of strings to encode.
        block_size: number of context characters per example.
        stoi: optional character-to-index mapping. When omitted, one is
            derived from ``words`` alone. Pass a shared mapping when encoding
            several splits so they all use the same indices.

    Returns:
        ``(X, Y)`` tensors built from the context rows and the targets.
    """
    if stoi is None:
        stoi = get_map(words)

    X, Y = [], []
    for w in words:
        context = [0] * block_size
        for ch in w + '.':
            ix = stoi[ch]
            X.append(context)
            Y.append(ix)
            # slide the window: drop the oldest index, append the newest
            context = context[1:] + [ix]

    return torch.tensor(X), torch.tensor(Y)

n1 = int(0.8 * len(words))
n2 = int(0.9 * len(words))

# One mapping built from the full corpus and shared by all three splits.
# Deriving it per split would give inconsistent indices whenever a split
# happens to lack a character.
# NOTE(review): `words` is split in file order without shuffling — confirm intended.
stoi = get_map(words)

Xtr, Ytr = build_dataset(words[:n1], stoi=stoi)
Xdev, Ydev = build_dataset(words[n1: n2], stoi=stoi)
Xte, Yte = build_dataset(words[n2:], stoi=stoi)

In [13]:
# evaluate the loss
@torch.no_grad() # this decorator disables gradient tracking inside pytorch
def split_loss(model, split):
    """Compute, print and return the cross-entropy loss of ``model`` on one split.

    The forward pass runs with every layer that has a ``training`` flag
    switched to ``False``, so batch-norm layers normalize with their running
    statistics and do not update them with evaluation data. The flags are
    restored afterwards, so training can continue after this call.

    Args:
        model: callable returning logits; its ``layers`` attribute, if
            present, is scanned for ``training`` flags.
        split: one of ``'train'``, ``'val'`` or ``'test'``.

    Returns:
        The loss as a Python float.

    Raises:
        KeyError: if ``split`` is not one of the three known names.
    """
    x, y = {
        'train': (Xtr, Ytr),
        'val': (Xdev, Ydev),
        'test': (Xte, Yte)
    }[split]

    # Put layers into eval mode for the forward pass, remembering the old flags.
    stateful_layers = [layer for layer in getattr(model, 'layers', []) if hasattr(layer, 'training')]
    previous_flags = [layer.training for layer in stateful_layers]
    for layer in stateful_layers:
        layer.training = False
    try:
        logits = model(x)
        loss = F.cross_entropy(logits, y)
    finally:
        # Restore the flags even if the forward pass raised.
        for layer, flag in zip(stateful_layers, previous_flags):
            layer.training = flag

    print(split, loss.item())
    return loss.item()

In [31]:
# same optimization as last time
def train(model, max_steps, batch_size):
    """Train ``model`` with minibatch SGD on the module-level ``Xtr``/``Ytr`` tensors.

    Args:
        model: callable producing logits and exposing ``parameters()``.
        max_steps: number of SGD steps to run.
        batch_size: number of examples sampled (with replacement) per step.

    Returns:
        List with the log10 of the minibatch loss at every step.
    """
    lossi = []
    n_examples = Xtr.shape[0]

    for i in range(max_steps):
        # draw a random minibatch
        batch_ix = torch.randint(0, n_examples, (batch_size,))
        Xb = Xtr[batch_ix]
        Yb = Ytr[batch_ix]

        # forward pass
        loss = F.cross_entropy(model(Xb), Yb)

        # backward pass: clear stale gradients, then backpropagate
        for p in model.parameters():
            p.grad = None
        loss.backward()

        # plain SGD update with a single step-decay of the learning rate
        if i < 150_000:
            lr = 0.1
        else:
            lr = 0.01
        for p in model.parameters():
            p.data += -lr * p.grad

        # report progress every 10k steps; record every step
        if i % 10_000 == 0:
            print(f'{i:7d}/{max_steps:7d}: {loss.item():.4f}')
        lossi.append(loss.log10().item())

    return lossi

In [32]:
def objective(config):
    """Ray Tune trainable: train one model and report its validation loss.

    Args:
        config: mapping with the keys ``'batch_size'``, ``'n_embd'`` and
            ``'n_hidden'``. An optional ``'max_steps'`` entry overrides the
            default of 200_000 training steps.

    Reports ``{"val_loss": ...}`` through ``session.report``.
    """
    # Load from config
    max_steps = config.get('max_steps', 200_000)
    batch_size = config['batch_size']
    n_embd = config['n_embd']
    n_hidden = config['n_hidden']

    # No data is loaded here: `train` and `split_loss` read the module-level
    # Xtr/Ytr/Xdev/Ydev tensors. Rebuilding the splits into local variables,
    # as this function used to do, had no effect on training or evaluation
    # and left a file handle open on every trial.

    # Create a model
    model = build_model(n_embd, n_hidden)

    train(model, max_steps, batch_size)
    val_loss = split_loss(model, 'val')  # Compute loss
    session.report({"val_loss": val_loss})  # Report to Tune

In [33]:
# Discrete choices the search algorithm samples each hyperparameter from.
search_space = dict(
    batch_size=tune.choice([32, 64, 128]),
    n_embd=tune.choice([10, 24, 50, 100, 300]),
    n_hidden=tune.choice([32, 64, 128, 256, 512]),
)

# Configuration handed to the searcher as an initial point to evaluate.
current_best_params = [dict(batch_size=32, n_embd=24, n_hidden=128)]

# HyperOpt-based search that minimizes the reported "val_loss".
hyperopt_search = HyperOptSearch(
    points_to_evaluate=current_best_params,
    metric="val_loss",
    mode="min",
)

# Run 10 sampled trials of `objective` over the search space.
tuner = tune.Tuner(
    objective,
    param_space=search_space,
    tune_config=tune.TuneConfig(num_samples=10, search_alg=hyperopt_search),
)
tuner.fit()

0,1
Current time:,2023-02-25 07:43:48
Running for:,08:44:44.81
Memory:,3.1/4.0 GiB

Trial name,status,loc,batch_size,n_embd,n_hidden
objective_e09e6d50,RUNNING,127.0.0.1:14371,32,24,128
objective_710a549e,RUNNING,127.0.0.1:14379,128,24,32
objective_5a2b4a2f,RUNNING,127.0.0.1:14388,32,50,512
objective_81f1e0c2,RUNNING,127.0.0.1:14397,128,24,128
objective_563ab57c,PENDING,,64,24,512




[2m[36m(objective pid=14371)[0m 76579
[2m[36m(objective pid=14371)[0m       0/ 200000: 3.2968
[2m[36m(objective pid=14379)[0m 7363
[2m[36m(objective pid=14379)[0m       0/ 200000: 3.2918
[2m[36m(objective pid=14388)[0m 1118049
[2m[36m(objective pid=14388)[0m       0/ 200000: 3.2886
[2m[36m(objective pid=14397)[0m 76579
[2m[36m(objective pid=14397)[0m       0/ 200000: 3.2904
[2m[36m(objective pid=14371)[0m   10000/ 200000: 2.2415
[2m[36m(objective pid=14379)[0m   10000/ 200000: 2.2179
[2m[36m(objective pid=14371)[0m   20000/ 200000: 1.6273
[2m[36m(objective pid=14379)[0m   20000/ 200000: 2.1193
[2m[36m(objective pid=14397)[0m   10000/ 200000: 2.0203
[2m[36m(objective pid=14379)[0m   30000/ 200000: 2.0717
[2m[36m(objective pid=14371)[0m   30000/ 200000: 1.8968
[2m[36m(objective pid=14379)[0m   40000/ 200000: 2.1775
[2m[36m(objective pid=14371)[0m   40000/ 200000: 1.9022
[2m[36m(objective pid=14388)[0m   10000/ 200000: 1.9961
[2m[36m

2023-02-25 07:43:48,221	ERROR tune.py:794 -- Trials did not complete: [objective_e09e6d50, objective_710a549e, objective_5a2b4a2f, objective_81f1e0c2, objective_563ab57c]
2023-02-25 07:43:48,223	INFO tune.py:799 -- Total run time: 31485.06 seconds (31484.79 seconds for the tuning loop).


<ray.tune.result_grid.ResultGrid at 0x7f8d9342b050>

In [19]:
# search_space = {"batch_size": tune.choice([32, 64]), "n_embd": tune.choice([10, 24, 50]), "n_hidden": tune.choice([32, 64, 128, 256, 512])}

In [None]:
# algo = OptunaSearch()

In [None]:
# tuner = tune.Tuner(
#     objective,
#     tune_config=tune.TuneConfig(
#         metric="val_loss",
#         mode="min",
#         search_alg=algo,
#     ),
#     run_config=air.RunConfig(
#         stop={"training_iteration": 5},
#     ),
#     param_space=search_space,
# )

In [None]:
# results = tuner.fit()
# print("Best config is:", results.get_best_result().config)