In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
def read_names(path):
    """Read one name per line from a text file.

    Args:
        path: Path of the text file to read.

    Returns:
        list[str]: The names in file order with surrounding whitespace
        removed. Blank (or whitespace-only) lines are skipped, because an
        empty name would otherwise be kept as ''.
    """
    # Explicit encoding: the default used by open() depends on the platform.
    with open(path, encoding='utf-8') as fileobj:
        stripped_lines = (line.strip() for line in fileobj)
        names = [name for name in stripped_lines if name]
    return names

# Load the name list once; the tuple below displays its size and the first three names.
NAMES = read_names('names.txt')
len(NAMES), NAMES[:3]

(32033, ['emma', 'olivia', 'ava'])

In [3]:
import string

# Class 0 is '.', classes 1-26 are the lowercase ASCII letters in alphabetical order.
# The index -> character table is built first and then inverted.
CLASS_TO_CHAR = dict(enumerate('.' + string.ascii_lowercase))
CHAR_TO_CLASS = {char: index for index, char in CLASS_TO_CHAR.items()}

CHAR_TO_CLASS

{'.': 0,
 'a': 1,
 'b': 2,
 'c': 3,
 'd': 4,
 'e': 5,
 'f': 6,
 'g': 7,
 'h': 8,
 'i': 9,
 'j': 10,
 'k': 11,
 'l': 12,
 'm': 13,
 'n': 14,
 'o': 15,
 'p': 16,
 'q': 17,
 'r': 18,
 's': 19,
 't': 20,
 'u': 21,
 'v': 22,
 'w': 23,
 'x': 24,
 'y': 25,
 'z': 26}

In [4]:
# Number of preceding characters used as context when predicting the next character.
CONTEXT_LEN = 3

def build_training_data(names, context_len):
    """Turn names into (context, next_char) training pairs.

    Each name is terminated with '.' and left-padded with ``context_len``
    dots, so the first pair of every name has an all-dots context and the
    last pair has '.' as its target.

    Args:
        names: Iterable of name strings.
        context_len: Number of characters in every context.

    Returns:
        list[tuple[str, str]]: One ``(context, next_char)`` pair per character
        of every terminated name, in input order.
    """
    padding = '.' * context_len
    examples = []
    for name in names:
        targets = name + '.'
        padded = padding + targets
        # The window starting at position `start` of the padded string is
        # exactly the context that precedes targets[start].
        examples.extend(
            (padded[start:start + context_len], target)
            for start, target in enumerate(targets)
        )
    return examples

# Build the (context, next_char) pairs for every name; display the count and the first six pairs.
TRAINING_DATA = build_training_data(NAMES, CONTEXT_LEN)
len(TRAINING_DATA), TRAINING_DATA[:6]

(228146,
 [('...', 'e'),
  ('..e', 'm'),
  ('.em', 'm'),
  ('emm', 'a'),
  ('mma', '.'),
  ('...', 'o')])

In [5]:
# Number of distinct character classes, i.e. the number of entries in CLASS_TO_CHAR.
NUM_CLASSES = len(CLASS_TO_CHAR)

def build_training_set(training_set):
    """Encode (context, next_char) pairs as tensors of class indices.

    Args:
        training_set: Iterable of ``(context, next_char)`` pairs; every
            character involved must be a key of ``CHAR_TO_CLASS``.

    Returns:
        tuple: ``(inputs, outputs)`` where ``inputs`` holds one row of class
        indices per context and ``outputs`` holds the class index of the
        corresponding next character.
    """
    # Single pass over the data: encode the context first, then the target.
    encoded = [
        ([CHAR_TO_CLASS[char] for char in context], CHAR_TO_CLASS[next_char])
        for context, next_char in training_set
    ]
    inputs = torch.tensor([context_classes for context_classes, _ in encoded])
    outputs = torch.tensor([target_class for _, target_class in encoded])
    return inputs, outputs


# Encode the whole dataset as tensors and print shapes plus the first three rows as a sanity check.
INPUTS, OUTPUTS = build_training_set(TRAINING_DATA)
print(f'{INPUTS.shape=},  {INPUTS[:3]=}, {OUTPUTS.shape=}, {OUTPUTS[:3]=}')

INPUTS.shape=torch.Size([228146, 3]),  INPUTS[:3]=tensor([[ 0,  0,  0],
        [ 0,  0,  5],
        [ 0,  5, 13]]), OUTPUTS.shape=torch.Size([228146]), OUTPUTS[:3]=tensor([ 5, 13, 13])


In [10]:
import time


def requires_grad(t):
    """Switch on gradient tracking for tensor ``t`` in place and return it.

    Returning the same tensor lets the call wrap a tensor constructor inline.
    """
    t.requires_grad_(True)
    return t

# Dimensionality of the embedding vector that each character is mapped to.
C_DIM = 10
    

def train(inputs, outputs, num_iterations=100_000, verbose=False, *,
          batch_size=32, hidden_out_size=200, decay_after=100_000,
          generator=None):
    """Train the character-level MLP with minibatch SGD.

    The network is: embedding lookup (``C``) -> tanh hidden layer -> linear
    output layer over ``NUM_CLASSES`` classes. Parameters are initialised from
    a standard normal distribution.

    Args:
        inputs: Integer tensor of context class indices, one context per row.
        outputs: Integer tensor with the target class of each context; must
            have the same length as ``inputs``.
        num_iterations: Number of SGD steps to run.
        verbose: When true, print the minibatch loss every 10000 steps.
        batch_size: Number of examples drawn (with replacement) per step.
        hidden_out_size: Width of the hidden layer.
        decay_after: Step index at which the learning rate drops from 1e-1 to
            1e-2.
        generator: Optional ``torch.Generator`` used for both parameter
            initialisation and batch sampling; pass a seeded generator to make
            a run reproducible. ``None`` uses the global RNG.

    Returns:
        list: ``[C, hidden_w, hidden_b, last_w, last_b]``, the trained
        parameter tensors in the order expected by ``forward``.
    """
    assert len(inputs) == len(outputs)

    # we place each character into C_DIM dimensional vector
    C = requires_grad(torch.randn(NUM_CLASSES, C_DIM, generator=generator))
    # input of hidden layer is CONTEXT_LEN * C_DIM elements
    hidden_w = requires_grad(
        torch.randn(CONTEXT_LEN * C_DIM, hidden_out_size, generator=generator))
    hidden_b = requires_grad(torch.randn(hidden_out_size, generator=generator))
    # input of final layer is hidden_out_size, its output is one score per class
    last_w = requires_grad(
        torch.randn(hidden_out_size, NUM_CLASSES, generator=generator))
    last_b = requires_grad(torch.randn(NUM_CLASSES, generator=generator))
    parameters = [C, hidden_w, hidden_b, last_w, last_b]

    # Stays None when num_iterations == 0, so the final print cannot hit an
    # unbound name.
    loss = None
    for i in range(num_iterations):
        batch_indexes = torch.randint(
            0, inputs.shape[0], (batch_size,), generator=generator)
        # inputs_batch.shape = (batch_size, CONTEXT_LEN)
        inputs_batch = inputs[batch_indexes]
        outputs_batch = outputs[batch_indexes]
        probs = forward(inputs_batch, parameters)
        # Negative log-likelihood of the correct next character. forward()
        # returns probabilities, not logits, so F.cross_entropy (which expects
        # logits) must not be applied to its output.
        m_probs = probs[torch.arange(len(probs)), outputs_batch]
        loss = -(m_probs.log().mean())

        loss.backward()
        # learning_rate decay
        learning_rate = 1e-1 if i < decay_after else 1e-2
        if verbose and i % 10000 == 0:
            print(f'{time.time()=:.2f} {i=} {loss.data=}')
        # Update in place without recording the step in the autograd graph.
        with torch.no_grad():
            for p in parameters:
                p -= learning_rate * p.grad
                p.grad = None
    print(f'{loss=}')
    return parameters


def forward(inputs_batch, parameters):
    """Compute next-character probabilities for a batch of contexts.

    Args:
        inputs_batch: Integer tensor of class indices, one context per row.
        parameters: Sequence ``[C, hidden_w, hidden_b, last_w, last_b]``.

    Returns:
        Tensor of shape ``(batch_size, num_classes)``. Every entry is strictly
        positive (so ``log`` is finite) and every row sums to 1.
    """
    smooth = 0.0001
    C, hidden_w, hidden_b, last_w, last_b = parameters
    # C[inputs_batch].shape = (batch_size, context_len, embedding_dim)
    # The flattened width is read from hidden_w, so the function works for
    # whatever context length / embedding size the parameters were built with.
    first_output = C[inputs_batch].view(-1, hidden_w.shape[0])
    hidden_output = (first_output @ hidden_w + hidden_b).tanh()
    logits = hidden_output @ last_w + last_b
    # torch.softmax is numerically stable; exponentiating raw logits overflows
    # to inf for large values and turns the whole row into nan.
    probs = torch.softmax(logits, dim=1)
    # Additive smoothing keeps every probability above zero. Dividing by the
    # new row total keeps the rows normalised; this only shifts the
    # log-likelihood by a constant, so gradients are unaffected.
    num_classes = logits.shape[1]
    return (probs + smooth) / (1.0 + smooth * num_classes)


# Train on the first TRAINING_SET_SIZE examples only; the remaining examples are not passed to train().
TRAINING_SET_SIZE = 200_000
PARAMETERS = train(
    INPUTS[:TRAINING_SET_SIZE], 
    OUTPUTS[:TRAINING_SET_SIZE],
    num_iterations=200_000, 
    verbose=True
);

time.time()=1691235498.37 i=0 loss.data=tensor(8.4536)
time.time()=1691235500.05 i=10000 loss.data=tensor(4.0960)
time.time()=1691235501.67 i=20000 loss.data=tensor(3.0769)
time.time()=1691235503.26 i=30000 loss.data=tensor(2.7793)
time.time()=1691235504.84 i=40000 loss.data=tensor(2.5716)
time.time()=1691235506.43 i=50000 loss.data=tensor(2.5750)
time.time()=1691235508.03 i=60000 loss.data=tensor(2.5696)
time.time()=1691235509.62 i=70000 loss.data=tensor(1.9596)
time.time()=1691235511.19 i=80000 loss.data=tensor(2.5666)
time.time()=1691235512.75 i=90000 loss.data=tensor(2.3063)
time.time()=1691235514.34 i=100000 loss.data=tensor(2.4602)
time.time()=1691235515.94 i=110000 loss.data=tensor(2.1502)
time.time()=1691235517.52 i=120000 loss.data=tensor(2.1348)
time.time()=1691235519.12 i=130000 loss.data=tensor(2.0987)
time.time()=1691235520.74 i=140000 loss.data=tensor(2.2463)
time.time()=1691235522.30 i=150000 loss.data=tensor(2.0326)
time.time()=1691235523.87 i=160000 loss.data=tensor(2.

In [22]:
def generate_examples(parameters, num_examples=5, max_length=100):
    """Sample names from the model and print them, one per line.

    Each sample starts from an all-dots context and is extended one character
    at a time, drawing from the distribution returned by ``forward``, until
    the model emits '.'.

    Args:
        parameters: Parameter list as returned by ``train``.
        num_examples: How many names to print.
        max_length: Upper bound on the length of a generated name, so that a
            model which never emits '.' cannot loop forever. ``None`` removes
            the bound.
    """
    # Sampling needs no gradients, so skip building the autograd graph.
    with torch.no_grad():
        for _ in range(num_examples):
            context = '.' * CONTEXT_LEN
            example = ''
            while max_length is None or len(example) < max_length:
                cur_inputs = torch.tensor([[CHAR_TO_CLASS[char] for char in context]])
                probs = forward(cur_inputs, parameters)
                next_class = torch.multinomial(probs, 1).item()
                next_char = CLASS_TO_CHAR[next_class]
                if next_char == '.':
                    break
                example += next_char
                # Slide the context window one character to the right.
                context = context[1:] + next_char
            print(example)

# Print a handful of names sampled from the trained model.
generate_examples(PARAMETERS)

kasiya
olura
aalyn
jailean
tasiyah


These examples look noticeably more name-like than those generated by the bigram model.