In this chapter we build a neural network with several layers. Each layer merges adjacent pairs of character representations and passes the result on to the next layer, and so on.
This allows the neural network to learn more intricate relationships between characters.

In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
def read_names(path):
    """Read one name per line from the text file at ``path``.

    Surrounding whitespace is stripped from every line. Lines that are empty
    after stripping are skipped, so a blank or trailing empty line does not
    become an empty name.

    Args:
        path: Location of a UTF-8 encoded text file with one name per line.

    Returns:
        A list with the non-empty, stripped lines in file order.
    """
    # The encoding is explicit so the result does not depend on the
    # platform's default locale.
    with open(path, encoding='utf-8') as fileobj:
        return [name for name in map(str.strip, fileobj) if name]

# Load the whole dataset once; the training data below is derived from NAMES.
NAMES = read_names('names.txt')
# Notebook display: number of names and the first three as a sanity check.
len(NAMES), NAMES[:3]

(32033, ['emma', 'olivia', 'ava'])

In [3]:
import string

# Class 0 is the '.' boundary marker; classes 1..26 are the letters 'a'..'z'.
CHAR_TO_CLASS = dict(
    zip(
        '.' + string.ascii_lowercase,
        range(len(string.ascii_lowercase) + 1),
    )
)
# Inverse lookup: iterating the dict yields its characters in class order.
CLASS_TO_CHAR = dict(enumerate(CHAR_TO_CLASS))

CHAR_TO_CLASS

{'.': 0,
 'a': 1,
 'b': 2,
 'c': 3,
 'd': 4,
 'e': 5,
 'f': 6,
 'g': 7,
 'h': 8,
 'i': 9,
 'j': 10,
 'k': 11,
 'l': 12,
 'm': 13,
 'n': 14,
 'o': 15,
 'p': 16,
 'q': 17,
 'r': 18,
 's': 19,
 't': 20,
 'u': 21,
 'v': 22,
 'w': 23,
 'x': 24,
 'y': 25,
 'z': 26}

In [4]:
# Number of preceding characters used as context when predicting the next one.
CONTEXT_LEN = 8

def build_training_data(names, context_len):
    """Turn names into (context, next character) training pairs.

    Every name is terminated with '.' and left-padded with ``context_len``
    dots. One pair is produced per character of the terminated name: the
    ``context_len`` characters that precede it, and the character itself.

    Args:
        names: Iterable of name strings.
        context_len: Number of characters in each context window.

    Returns:
        A list of ``(context, next_char)`` string tuples, in name order.
    """
    padding = '.' * context_len
    pairs = []
    for name in names:
        terminated = name + '.'
        padded = padding + terminated
        # The window starting at `start` in the padded string ends right
        # before the character at index `start` of the terminated name.
        pairs.extend(
            (padded[start:start + context_len], target)
            for start, target in enumerate(terminated)
        )
    return pairs

# Expand every name into one (context, next character) pair per character.
TRAINING_DATA = build_training_data(NAMES, CONTEXT_LEN)
# Notebook display: number of pairs and the first six as a sanity check.
len(TRAINING_DATA), TRAINING_DATA[:6]

(228146,
 [('........', 'e'),
  ('.......e', 'm'),
  ('......em', 'm'),
  ('.....emm', 'a'),
  ('....emma', '.'),
  ('........', 'o')])

In [5]:
# Vocabulary size: the '.' marker plus the 26 lowercase letters.
NUM_CLASSES = len(CLASS_TO_CHAR)

def build_training_set(training_set):
    """Encode (context, next character) pairs as integer tensors.

    Args:
        training_set: Iterable of ``(context, next_char)`` string pairs.

    Returns:
        A tuple ``(inputs, outputs)``. ``inputs`` holds one row of class
        indices per context, and ``outputs`` holds the class index of the
        character that follows each context.
    """
    # Encode each pair in a single pass: context first, then its target.
    encoded = [
        ([CHAR_TO_CLASS[char] for char in context], CHAR_TO_CLASS[next_char])
        for context, next_char in training_set
    ]
    context_rows = [row for row, _ in encoded]
    targets = [target for _, target in encoded]
    return torch.tensor(context_rows), torch.tensor(targets)


# Encode all training pairs as tensors of class indices.
INPUTS, OUTPUTS = build_training_set(TRAINING_DATA)
# Show the shapes and the first three rows as a sanity check.
print(f'{INPUTS.shape=},  {INPUTS[:3]=}, {OUTPUTS.shape=}, {OUTPUTS[:3]=}')

INPUTS.shape=torch.Size([228146, 8]),  INPUTS[:3]=tensor([[ 0,  0,  0,  0,  0,  0,  0,  0],
        [ 0,  0,  0,  0,  0,  0,  0,  5],
        [ 0,  0,  0,  0,  0,  0,  5, 13]]), OUTPUTS.shape=torch.Size([228146]), OUTPUTS[:3]=tensor([ 5, 13, 13])


In [59]:
import time


def requires_grad(t):
    """Switch on gradient tracking for ``t`` in place and return the same tensor."""
    return t.requires_grad_(True)

# Dimensionality of the embedding vector learned for each character.
C_DIM = 10


class EmbLayer:
    """Embedding lookup that also merges each pair of adjacent characters.

    Every character class is mapped to a learned vector, and the vectors of
    two neighbouring positions are concatenated. The output sequence is
    therefore half as long as the input and twice as wide as one embedding.
    """

    def __init__(self):
        # we place each character into C_DIM dimensional vector
        self._emb = requires_grad(torch.randn(NUM_CLASSES, C_DIM))

    def forward(self, inputs):
        """Embed ``inputs`` and concatenate neighbouring positions pairwise.

        Args:
            inputs: Integer tensor of class indices, one row per sample. The
                number of columns must be even for the pairing to work.

        Returns:
            Tensor of shape ``(batch, columns // 2, 2 * embedding_width)``.
        """
        result = self._emb[inputs]
        # Read the width from the table itself rather than the C_DIM global,
        # so a trained layer keeps working if C_DIM is rebound later on.
        emb_dim = self._emb.shape[1]
        return result.view(inputs.shape[0], -1, emb_dim * 2)

    def parameters(self):
        """Return the trainable tensors of this layer."""
        return [self._emb]

    

class Layer:
    """Fully connected layer followed by a caller-supplied nonlinearity."""

    def __init__(self, input_size, output_size, nonlinearity):
        """Create randomly initialised weights and bias.

        Args:
            input_size: Width of the last axis of the incoming tensor.
            output_size: Width of the last axis of the produced tensor.
            nonlinearity: Callable applied to the affine output.
        """
        # Weights are drawn before the bias so the random stream is consumed
        # in a fixed order.
        weights = torch.randn(input_size, output_size)
        bias = torch.randn(output_size)
        self._w = requires_grad(weights)
        self._b = requires_grad(bias)
        self._nonlinearity = nonlinearity

    def forward(self, inputs):
        """Apply the affine map to the last axis, then the nonlinearity."""
        pre_activation = torch.matmul(inputs, self._w) + self._b
        return self._nonlinearity(pre_activation)

    def parameters(self):
        """Return the trainable tensors: weights first, then bias."""
        return [self._w, self._b]


class SqueezingLayer:
    """Parameter-free layer that drops axis 1 when it has length one."""

    def forward(self, inputs):
        """Return ``inputs`` without axis 1 if that axis has size 1."""
        return torch.squeeze(inputs, 1)

    def parameters(self):
        """This layer has nothing to train."""
        return list()


class GroupingLayer:
    """Parameter-free layer that concatenates neighbouring sequence positions.

    Axis 1 is halved and axis 2 is doubled, so two adjacent feature vectors
    end up side by side in a single position.
    """

    def forward(self, inputs):
        """Reinterpret ``inputs`` with half the positions and twice the width."""
        batch = inputs.size(0)
        halved_steps = inputs.size(1) // 2
        doubled_width = inputs.size(2) * 2
        return inputs.view(batch, halved_steps, doubled_width)

    def parameters(self):
        """This layer has nothing to train."""
        return list()


def all_parameters(network):
    """Collect the trainable tensors of every layer into one flat list.

    Args:
        network: Iterable of layers that each expose ``parameters()``.

    Returns:
        A new list with each layer's parameters, in layer order.
    """
    return [param for layer in network for param in layer.parameters()]


def softmax(logits, smooth=1e-4):
    """Convert logits to smoothed probabilities along axis 1.

    Args:
        logits: Tensor whose axis 1 indexes the classes.
        smooth: Constant added to every probability before renormalising. It
            keeps each probability strictly positive, so taking the log of
            the result never yields ``-inf``.

    Returns:
        Tensor of the same shape as ``logits`` whose entries along axis 1 are
        positive and sum to 1.
    """
    # Subtracting the row maximum does not change the softmax mathematically,
    # but it keeps exp() from overflowing to inf (and inf / inf = nan).
    shifted = logits - logits.max(1, keepdim=True).values.detach()
    counts = shifted.exp()
    probs = counts / counts.sum(1, keepdim=True)
    # Each row sums to 1 before smoothing, so after adding `smooth` to every
    # class it sums to 1 + smooth * num_classes; divide to renormalise.
    num_classes = logits.shape[1]
    return (probs + smooth) / (1.0 + smooth * num_classes)


def train(inputs, outputs, num_iterations=100_000, verbose=False):
    """Train the hierarchical character model with mini-batch SGD.

    The network is hard-wired for contexts of 8 characters: the embedding
    layer merges the 8 characters into 4 pairs, and each GroupingLayer halves
    the sequence again (4 -> 2 -> 1) before SqueezingLayer drops the
    length-one sequence axis.

    Args:
        inputs: Integer tensor of encoded contexts, one row per sample.
        outputs: Integer tensor with the class of the character that follows
            each context; must have as many entries as ``inputs`` has rows.
        num_iterations: Number of SGD steps. The learning rate is 1e-1 for
            the first 100,000 steps and 1e-2 afterwards.
        verbose: When true, print a timestamp, the step index and the batch
            loss every 10,000 steps.

    Returns:
        The trained network as a list of layers, usable with ``forward``.
    """
    batch_size = 32

    assert len(inputs) == len(outputs)
    hidden_output_size = 100
    network = [
        # 8 characters are embedded and merged into 4 pairs
        EmbLayer(),
        # input = 4 pairs of characters
        Layer(input_size=C_DIM * 2, output_size=hidden_output_size, nonlinearity=torch.tanh),
        GroupingLayer(),
        # input = 2 groups of 4 characters
        Layer(input_size=hidden_output_size * 2, output_size=hidden_output_size, nonlinearity=torch.tanh),
        GroupingLayer(),
        # input = 1 group of 8 characters
        Layer(input_size=hidden_output_size * 2, output_size=hidden_output_size, nonlinearity=torch.tanh),
        SqueezingLayer(),
        # input = one feature vector per sample
        Layer(input_size=hidden_output_size, output_size=NUM_CLASSES, nonlinearity=softmax),
    ]
    # The layer list never changes, so collect the parameters once instead of
    # rebuilding the list on every step.
    parameters = all_parameters(network)

    # Defined up front so the final report works even with num_iterations=0.
    loss = None
    for i in range(num_iterations):
        batch_indexes = torch.randint(0, inputs.shape[0], (batch_size,))
        # inputs_batch.shape = (batch_size, CONTEXT_LEN)
        inputs_batch = inputs[batch_indexes]
        outputs_batch = outputs[batch_indexes]
        probs = forward(inputs_batch, network)
        # Probability the model assigned to the correct next character.
        m_probs = probs[torch.arange(len(probs)), outputs_batch]
        # Mean negative log-likelihood over the batch.
        loss = -(m_probs.log().mean())
        loss.backward()
        # learning_rate decay
        learning_rate = 1e-1 if i < 100_000 else 1e-2
        if verbose and i % 10000 == 0:
            print(f'{time.time()=:.2f} {i=} {loss.data=}')

        # The update itself must not be recorded by autograd.
        with torch.no_grad():
            for p in parameters:
                p -= learning_rate * p.grad
                # Drop the gradient so the next backward() starts fresh.
                p.grad = None
    print(f'{loss=}')
    return network


def forward(inputs_batch, network):
    """Feed ``inputs_batch`` through every layer of ``network`` in order.

    Args:
        inputs_batch: Value handed to the first layer.
        network: Iterable of layers that each expose ``forward(x)``.

    Returns:
        The output of the last layer, or ``inputs_batch`` unchanged when the
        network has no layers.
    """
    activations = inputs_batch
    for layer in network:
        activations = layer.forward(activations)
    return activations
    

# Train on the first 200,000 samples only; the remaining samples are not
# used by any code visible in this notebook.
TRAINING_SET_SIZE = 200_000
# 400,000 steps: train() lowers the learning rate from 1e-1 to 1e-2 after
# the first 100,000 of them.
NETWORK = train(
    INPUTS[:TRAINING_SET_SIZE], 
    OUTPUTS[:TRAINING_SET_SIZE],
    num_iterations=400_000, 
    verbose=True
);

time.time()=1692692658.86 i=0 loss.data=tensor(8.8331)
time.time()=1692692662.58 i=10000 loss.data=tensor(2.7259)
time.time()=1692692666.34 i=20000 loss.data=tensor(2.5605)
time.time()=1692692669.97 i=30000 loss.data=tensor(2.9242)
time.time()=1692692673.56 i=40000 loss.data=tensor(2.4640)
time.time()=1692692677.15 i=50000 loss.data=tensor(2.6430)
time.time()=1692692680.70 i=60000 loss.data=tensor(2.5088)
time.time()=1692692684.26 i=70000 loss.data=tensor(2.0211)
time.time()=1692692687.86 i=80000 loss.data=tensor(2.1376)
time.time()=1692692691.39 i=90000 loss.data=tensor(2.8058)
time.time()=1692692694.94 i=100000 loss.data=tensor(2.3467)
time.time()=1692692698.55 i=110000 loss.data=tensor(2.1369)
time.time()=1692692702.16 i=120000 loss.data=tensor(2.1413)
time.time()=1692692705.68 i=130000 loss.data=tensor(2.1387)
time.time()=1692692709.25 i=140000 loss.data=tensor(2.4025)
time.time()=1692692712.79 i=150000 loss.data=tensor(2.1486)
time.time()=1692692716.31 i=160000 loss.data=tensor(2.

In [66]:
def generate_examples(network, num_examples=5):
    """Sample names from ``network`` and print one per line.

    Each name starts from a context of ``CONTEXT_LEN`` dots. The next
    character is drawn from the network's predicted distribution and appended
    to the sliding context until the '.' end marker is drawn.

    Args:
        network: List of layers as returned by ``train``.
        num_examples: How many names to sample and print.
    """
    # Sampling needs no gradients, so skip building the autograd graph.
    with torch.no_grad():
        for _ in range(num_examples):
            context = '.' * CONTEXT_LEN
            chars = []
            while True:
                # A batch containing the single current context.
                cur_inputs = torch.tensor([[CHAR_TO_CLASS[char] for char in context]])
                probs = forward(cur_inputs, network)
                next_class = torch.multinomial(probs, 1).item()
                next_char = CLASS_TO_CHAR[next_class]
                if next_char == '.':
                    break
                chars.append(next_char)
                # Slide the window: drop the oldest character, add the new one.
                context = context[1:] + next_char
            print(''.join(chars))

# Print five names sampled from the trained network.
generate_examples(NETWORK)

hasadi
narya
jylanna
eretta
zosine


It doesn't look like the results from the WaveNet-style network are much better than the results from the MLP.
Maybe that's because I didn't implement batch normalization.