In [2]:
import torch
import torch.nn.functional as F

# WaveNet

This notebook introduces an architecture similar to the WaveNet model developed by DeepMind in 2016. Before diving into the details of this model, we'll review the preliminary steps and implement them in a more comprehensive and modular manner. Drawing inspiration from PyTorch, here are the results:

## Generate Dataset

In [9]:
from makemore.config import RAW_DATA_DIR, PROCESSED_DATA_DIR
print(f"RAW_DATA_DIR: {RAW_DATA_DIR}")
print(f"PROCESSED_DATA_DIR: {PROCESSED_DATA_DIR}")

[32m2024-06-23 15:40:03.843[0m | [1mINFO    [0m | [36mmakemore.config[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: /workspaces/makemore[0m


RAW_DATA_DIR: /workspaces/makemore/data/raw
PROCESSED_DATA_DIR: /workspaces/makemore/data/processed


In [10]:
# Download the dataset
!python ../makemore/dataset.py

[32m2024-06-23 15:40:15.463[0m | [1mINFO    [0m | [36mmakemore.config[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: /workspaces/makemore[0m
[32m2024-06-23 15:40:15.467[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m22[0m - [1mDownloading dataset...[0m
100%|█████████████████████████████████████████████| 1/1 [00:00<00:00,  4.22it/s]
[32m2024-06-23 15:40:15.707[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m27[0m - [1mnamex.txt correctly download![0m


In [11]:
DATASET_NAME = "names.txt"

# Read words
with open(RAW_DATA_DIR / DATASET_NAME, "r") as names_file:
    names = names_file.readlines()

print(names[:10])

# Remove the \n from all the names
names = [name[:-1] for name in names]

names[:10]

['emma\n', 'olivia\n', 'ava\n', 'isabella\n', 'sophia\n', 'charlotte\n', 'mia\n', 'amelia\n', 'harper\n', 'evelyn\n']


['emma',
 'olivia',
 'ava',
 'isabella',
 'sophia',
 'charlotte',
 'mia',
 'amelia',
 'harper',
 'evelyn']

In [12]:
# Generate decoder and encoder for chars
chars = ".qwertyuiopasdfghjklzxcvbnm" # All available letters in latin alphabet plus .
decoder, encoder = {}, {}

for i, ch in enumerate(chars):
    encoder[ch] = i
    decoder[i] = ch

In [15]:
# Context window dimension
context_size = 8
vocab_size = len(decoder)

In [16]:
# Store dataset
X, Y = [], []

for name in names:
    # Starting context
    context = [0] * context_size

    for ch in name:
        # Encode ch in an int
        i_ch = encoder[ch]
        # Store couple
        X.append(context)
        Y.append(i_ch)
        # Update context
        context = context[1:] + [i_ch]

    i_ch = encoder["."]
    X.append(context)
    Y.append(i_ch)

# Store dataset in torch tensor
X = torch.tensor(X)
Y = torch.tensor(Y)

print(f"Dataset lenght: {len(Y)}")

Dataset lenght: 228145


## Pytorchifycantion 

In [29]:
# Linear Layer ----------------------------------------------------------------
class Linear:
  
  def __init__(self, fan_in, fan_out, bias=True):
    self.weight = torch.randn((fan_in, fan_out)) / fan_in**0.5
    self.bias = torch.zeros(fan_out) if bias else None
  
  def __call__(self, x):
    self.out = x @ self.weight
    if self.bias is not None:
      self.out += self.bias
    return self.out
  
  def parameters(self):
    return [self.weight] + ([] if self.bias is None else [self.bias])

# BatchNorm Layer --------------------------------------------------------------
class BatchNorm1d:
  
  def __init__(self, dim, eps=1e-5, momentum=0.1):
    self.eps = eps
    self.momentum = momentum
    self.training = True
    # parameters (trained with backprop)
    self.gamma = torch.ones(dim)
    self.beta = torch.zeros(dim)
    # buffers (trained with a running 'momentum update')
    self.running_mean = torch.zeros(dim)
    self.running_var = torch.ones(dim)
  
  def __call__(self, x):
    # calculate the forward pass
    if self.training:
      xmean = x.mean(0, keepdim=True) # batch mean
      xvar = x.var(0, keepdim=True) # batch variance
    else:
      xmean = self.running_mean
      xvar = self.running_var
    xhat = (x - xmean) / torch.sqrt(xvar + self.eps) # normalize to unit variance
    self.out = self.gamma * xhat + self.beta
    # update the buffers
    if self.training:
      with torch.no_grad():
        self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * xmean
        self.running_var = (1 - self.momentum) * self.running_var + self.momentum * xvar
    return self.out
  
  def parameters(self):
    return [self.gamma, self.beta]

# Tanh Layer -----------------------------------------------------------------------
class Tanh:
  
  def __call__(self, x):
    self.out = torch.tanh(x)
    return self.out
  
  def parameters(self):
    return []
  
# Embedding Layer ------------------------------------------------------------------
class Embedding:
  
  def __init__(self, num_embeddings, embedding_dim):
    self.weight = torch.randn((num_embeddings, embedding_dim))
    
  def __call__(self, IX):
    self.out = self.weight[IX]
    return self.out
  
  def parameters(self):
    return [self.weight]
  
# Flatten Layer --------------------------------------------------------------------
class Flatten:
  
  def __init__(self):
    return
  
  def __call__(self, x):
    return x.view(x.shape[0], -1)
  
  def parameters(self):
    return []

# Sequential Container -------------------------------------------------------------
class Sequential:
  
  def __init__(self, layers):
    self.layers = layers
  
  def __call__(self, x):
    for layer in self.layers:
      x = layer(x)
    self.out = x
    return self.out
  
  def parameters(self):
    # get parameters of all layers and stretch them out into one list
    return [p for layer in self.layers for p in layer.parameters()]


In [23]:
# Set a manual seed
torch.manual_seed(16)

<torch._C.Generator at 0x7600dafc23f0>

In [30]:
n_embd = 10 # the dimensionality of the character embedding vectors
n_hidden = 100 # the number of neurons in the hidden layer of the MLP
vocab_size = 27
context_size = 3
g = torch.Generator().manual_seed(2147483647) # for reproducibility

layers = Sequential([
  Embedding(), Flatten(),
  Linear(n_embd * context_size, n_hidden, bias=False), BatchNorm1d(n_hidden), Tanh(),
  Linear(n_hidden, vocab_size, bias=False), BatchNorm1d(vocab_size),
])

with torch.no_grad():
  # last layer: make less confident
  layers[-1].gamma *= 0.1

TypeError: Embedding.__init__() missing 2 required positional arguments: 'num_embeddings' and 'embedding_dim'

In [25]:
parameters = layers.parameters()
print(sum(p.nelement() for p in parameters)) # number of parameters in total
for p in parameters:
  p.requires_grad = True

6224


In [None]:
layers()