In this chapter we build a simple neural network (NN) that uses the previous character to predict the next character.
The outputs of the NN are logits, which we interpret as logarithms of counts: applying `logits.exp()` gives the counts.
From the counts we get the probability of each character class: `counts[i] / sum(counts)`.

In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plot
%matplotlib inline

In [2]:
def read_names(path):
    """Read one name per line from a text file.

    Args:
        path: Path of a UTF-8 text file that holds one name per line.

    Returns:
        A list of names in file order, with surrounding whitespace removed.
        Lines that are empty after stripping are skipped.
    """
    # Explicit encoding so the result does not depend on the platform default.
    with open(path, encoding='utf-8') as fileobj:
        stripped = (line.strip() for line in fileobj)
        # A blank line (e.g. a trailing newline at the end of the file) would
        # otherwise be returned as an empty name.
        return [name for name in stripped if name]

In [3]:
# Load the dataset; read_names returns one stripped string per line of the file.
NAMES = read_names('names.txt')
# Echo the number of names and the first two as a quick sanity check.
len(NAMES), NAMES[:2]

(32033, ['emma', 'olivia'])

In [4]:
import itertools
import string

# Vocabulary: '.' is class 0 (it marks both the start and the end of a name),
# followed by the lowercase ASCII letters as classes 1..26.
CLASS_TO_STRING = dict(enumerate('.' + string.ascii_lowercase))
STRING_TO_CLASS = {symbol: index for index, symbol in CLASS_TO_STRING.items()}


def build_training_set(names):
    """Convert names into (previous class, next class) bigram pairs.

    Every name is framed by '.' on both sides, so the first pair of a name
    starts from '.' and the last pair ends in '.'. Characters are mapped to
    integer classes through STRING_TO_CLASS.

    Args:
        names: Iterable of strings made of characters present in
            STRING_TO_CLASS.

    Returns:
        A list of (prev_class, next_class) integer tuples, in name order.

    Raises:
        KeyError: If a name contains a character missing from STRING_TO_CLASS.
    """
    pairs = []
    for name in names:
        padded = '.' + name + '.'
        # Zipping the padded name with itself shifted by one yields every
        # adjacent character pair.
        pairs.extend(
            (STRING_TO_CLASS[left], STRING_TO_CLASS[right])
            for left, right in zip(padded, padded[1:])
        )
    return pairs

# TRAINING_SET is a list of (previous_character, next_character) tuples.
# Characters are converted to integer classes using STRING_TO_CLASS.

TRAINING_SET = build_training_set(NAMES)
# Echo the number of pairs and the first three pairs as a sanity check.
len(TRAINING_SET), TRAINING_SET[:3]

(228146, [(0, 5), (5, 13), (13, 13)])

`one_hot` is a simple encoding used in classification: it takes a tensor of class values
and the number of classes.
It returns a tensor in which each class value is encoded as a `num_classes`-dimensional vector
that is 1 at the index of the class and 0 everywhere else.

In [5]:
# Classes 0, 3 and 2 each become a length-6 row with a single 1 at the class index.
F.one_hot(torch.tensor([0, 3, 2]), num_classes=6)

tensor([[1, 0, 0, 0, 0, 0],
        [0, 0, 0, 1, 0, 0],
        [0, 0, 1, 0, 0, 0]])

In [6]:
def train(training_set, num_iter=1500, learning_rate=2, smooth=0.0001,
          num_classes=None):
    """Fit a single linear layer that predicts the next class from the previous one.

    The model is ``logits = one_hot(prev) @ w + b``. Logits are treated as
    logarithms of counts: exponentiating and normalising each row gives the
    probabilities of the next class. The loss is the mean negative log
    likelihood of the expected next classes, minimised with full-batch
    gradient descent. The loss is printed every 100 iterations.

    Args:
        training_set: Sequence of (prev_class, next_class) integer pairs.
        num_iter: Number of gradient descent steps.
        learning_rate: Step size of each update.
        smooth: Small constant added to every selected probability before
            taking the log, so that log(0) cannot occur.
        num_classes: Number of classes; defaults to len(STRING_TO_CLASS).

    Returns:
        Tuple ``(w, b)``: the weight of shape (num_classes, num_classes) and
        the bias of shape (num_classes,), both with requires_grad=True.

    Raises:
        ValueError: If training_set is empty.
    """
    if len(training_set) == 0:
        raise ValueError('training_set must not be empty')
    if num_classes is None:
        num_classes = len(STRING_TO_CLASS)

    prev_classes = torch.tensor([prev for prev, _ in training_set])
    targets = torch.tensor([target for _, target in training_set])
    rows = torch.arange(len(training_set))
    inputs = F.one_hot(prev_classes, num_classes=num_classes).float()  # (n, num_classes)

    w = torch.randn((num_classes, num_classes), requires_grad=True)
    b = torch.randn(num_classes, requires_grad=True)

    for i in range(num_iter):
        # We consider logits to be logarithms of counts; a very negative
        # logit means the count is close to 0.
        # (n, C) @ (C, C) = (n, C); (n, C) + (C,) = (n, C)
        logits = inputs @ w + b
        counts = logits.exp()
        probs = counts / counts.sum(dim=1, keepdim=True)
        # Pick, for every row, the probability assigned to the expected class.
        target_probs = probs[rows, targets] + smooth
        # Negative log likelihood: maximising the product of probabilities is
        # the same as minimising the negative mean of their logs.
        loss = -target_probs.log().mean()

        loss.backward()
        # The parameter update itself must not be recorded by autograd.
        with torch.no_grad():
            w -= learning_rate * w.grad
            b -= learning_rate * b.grad
        w.grad = None
        b.grad = None
        if i % 100 == 0:
            print(f'{i=} {loss.data=}')
    return w, b
        
    
# Train on all bigram pairs; W and B are the weight and bias returned by train.
W, B = train(TRAINING_SET)

i=0 loss.data=tensor(4.3815)
i=100 loss.data=tensor(2.7272)
i=200 loss.data=tensor(2.6082)
i=300 loss.data=tensor(2.5617)
i=400 loss.data=tensor(2.5364)
i=500 loss.data=tensor(2.5203)
i=600 loss.data=tensor(2.5092)
i=700 loss.data=tensor(2.5012)
i=800 loss.data=tensor(2.4950)
i=900 loss.data=tensor(2.4902)
i=1000 loss.data=tensor(2.4863)
i=1100 loss.data=tensor(2.4831)
i=1200 loss.data=tensor(2.4803)
i=1300 loss.data=tensor(2.4780)
i=1400 loss.data=tensor(2.4759)


In [7]:
def generate_examples(w, b, num_examples=5):
    """Sample names from the bigram model and print one per line.

    Generation starts from the '.' marker. At every step the current
    character is one-hot encoded, turned into next-character probabilities
    with ``softmax(one_hot @ w + b)``, and the next character is sampled from
    them. A name ends as soon as '.' is sampled.

    Args:
        w: Weight tensor of shape (len(STRING_TO_CLASS), len(STRING_TO_CLASS)).
        b: Bias tensor of shape (len(STRING_TO_CLASS),).
        num_examples: Number of names to generate.

    Returns:
        None. The generated names are printed.
    """
    num_classes = len(STRING_TO_CLASS)
    # Sampling needs no gradients; without this, every forward pass would be
    # tracked by autograd whenever w and b have requires_grad=True.
    with torch.no_grad():
        for _ in range(num_examples):
            cur_class = STRING_TO_CLASS['.']
            chars = []
            while True:
                cur_one_hot = F.one_hot(
                    torch.tensor([cur_class]), num_classes=num_classes
                ).float()
                logits = cur_one_hot @ w + b  # (1, num_classes)
                # softmax is exp() followed by normalisation, computed in a
                # way that does not overflow for large logits.
                probs = torch.softmax(logits, dim=1)
                next_class = torch.multinomial(probs, 1).item()
                next_char = CLASS_TO_STRING[next_class]
                if next_char == '.':
                    break
                chars.append(next_char)
                cur_class = next_class
            print(''.join(chars))

# Print names sampled from the trained model.
generate_examples(W, B)
    

gpi
jph
elynelion
joniyn
ny


We can see that the generated examples are poor.

That is because the model uses only the previous character to predict the next one.
We can do better.
See the next chapter.