In [8]:
import numpy as np
import torch


In [9]:
with open('data/names.txt', 'r') as f:
    data = f.read().splitlines()


In [10]:
chars = sorted(set(''.join([d for d in data])))
chars = ['.'] + chars
ctoi = {c:i for i, c in enumerate(chars)}
itoc = {i:c for i,c in enumerate(chars)}

print(ctoi)
print(itoc)
X = []
block_size = 2 # Tri-gram
for word in data:
    context = [0] * block_size
    for c in word + '.':
        ix = ctoi[c]
        X.append(context + [ix])
        context = context[1:] + [ix]

print(len(X))

{'.': 0, 'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26}
{0: '.', 1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z'}
228146


## Counting the occurence and then predicting 

In [12]:
# First let's 'train' a simple 'network': Count how many times one character follows another and
# use that to sample next word to generate word. NOT using a Neural Network


counts = torch.zeros(27, 27, 27)
# print(counts.size())
# counts[0, 0, 0] represents count of '...' sequence in our training data, counts[0, 1, 0] represents count of '.a.' in our training data and so on
for val in X:
    counts[val[0], val[1], val[2]] += 1


print(counts[0, 0, :])
# Now Normalize
# '...', '..a', '..b', '..c', '..d', ..... and so on should sum to 1.0 (i.e., normalize across count[0, 0, i])
# '.a.', '.aa', '.ab', '.ac', '.ad', ... and so on should sum to 1.0 (i.e., across count[0, 1, i])
# '.b.', '.ba', '.bb', '.bc', '.bd', ... and so on should sum to 1.0 (i.e., across count[0, 2, i])
# ...
# 'z..', 'z.a', 'z.b', 'z.c', 'z.d', ...
# ...
# 'zz.', 'zza', 'zzb', 'zzc', 'zzd', ...

# i.e., for every previous occurrences, we have to sum along dimension = 2 (position of i above)

counts = counts + 1 # Avoid division by zero (smoothing) as this 1 increase to higher number, we get uniform distribution. Hence smoothing.
counts = counts / counts.sum(dim=2, keepdim=True)

tensor([   0., 4410., 1306., 1542., 1690., 1531.,  417.,  669.,  874.,  591.,
        2422., 2963., 1572., 2538., 1146.,  394.,  515.,   92., 1639., 2055.,
        1308.,   78.,  376.,  307.,  134.,  535.,  929.])


In [7]:
# Check if it sums to 1.0
print(sum(counts[0, 0, :]))
print(sum(counts[0, 1, :]))
print(sum(counts[0, 1, :]))

# Rough code to see dimension
# ten = torch.tensor([[[1,2,3],[1,2,3],[1,2,3]], [[10,20,30],[10,20,30],[10,20,30]], [[100,200,300],[100,200,300],[100,200,300]]])
# print(ten)

# print("Sum")
# ten.sum(dim=2, keepdim=True)
# dim0 = (3, 3)
# dim1 = (1, 3)
# dim2 = (3, 1)

tensor(1.0000)
tensor(1.)
tensor(1.)


In [13]:
# Now sample from that counts (PREDICTION Step)

g = torch.Generator().manual_seed(1024)
for _ in range(50):
    out = []
    ix, prev_ix = 0, 0
    while True:
        # Now sample index from count[0, 0, :] row.
        p = counts[prev_ix, ix, :]
        new_ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
        out.append(itoc[new_ix])
        if new_ix == 0:
            break
        prev_ix = ix
        ix = new_ix
    print(''.join(out))


kynn.
karthordyn.
ja.
xen.
conn.
zoemadiah.
lanunne.
alen.
zeh.
saia.
jayton.
ta.
getsmya.
ja.
azdey.
ah.
adasenicelyntjrthad.
estorraleagra.
brafines.
johila.
nehori.
nessmdkken.
hanobdincina.
kyanaslggxyanaizlyn.
non.
stytosper.
haleikashada.
aizliand.
olce.
aravori.
arie.
tygtvin.
foko.
ne.
dalyn.
ronni.
suha.
keightya.
mar.
ri.
hen.
jell.
bree.
yunikyrein.
yic.
mirrhyrivikazenton.
lesynn.
el.
muna.
otanew.


In [None]:
# Evaluate the quality of this model:

# For that we look at the probability of the dataset from our 'learned' counts model.

for x in X[:10]:
    print(''.join(itoc[i] for i in x), end='--> ')
    print(f'{counts[x[0], x[1], x[2]].item():.2f}')

# MLE says that a good model maximizes the product of likelihood of this data
# since product of probabilities would be very small, we're taking log probability

print("Quality of the model: summarized by negative log likelihood")
log_likelihood = 0.0
n = 0
for x in X:
    prob = counts[x[0], x[1], x[2]]
    logprob = torch.log(prob)
    log_likelihood += logprob
    n += 1

# Log likelihood = 0 when PS = 1.0 (which we aim to go towards)
# Log likelihood = -ve when PS < 1.0 (which says that we haven't fitted the prediction for the likelihood.)

# We want a loss function (i.e., 0 when prediction is good, high when prediction is bad)
# so we use Negative log likelihood (nll) as loss function.
# we also average it instead of sum.
nll = -1 * log_likelihood
loss = nll/n

print(f'{loss=}')

# Right now, since we counted how many times 3rd character occurs given first two characters, our model right now is 'perfect'. So this is the loss that we can get to even when we train a neural network to do this.

..e--> 0.05
.em--> 0.19
emm--> 0.13
mma--> 0.37
ma.--> 0.07
..o--> 0.01
.ol--> 0.25
oli--> 0.11
liv--> 0.02
ivi--> 0.27
Quality of the model: 
loss=tensor(2.2120)


## Training a Neural Network for Trigram Character Level Language Model.

- We took an approach that we felt natural, i.e, counting the times last character appeared given two characters ahead, we computed the probabilities. We then used that distribution to predict the next character in prediction phase.
- Now, as the number of character increase, this becomes computationally infeasible.
- So, we want to use a neural network to do that.
- Here, we input two characters to the neural network and it predicts the probability distribution over the next characters.

In [27]:
# Print input data
print(X[:10])

[[0, 0, 5], [0, 5, 13], [5, 13, 13], [13, 13, 1], [13, 1, 0], [0, 0, 15], [0, 15, 12], [15, 12, 9], [12, 9, 22], [9, 22, 9]]


In [98]:
x = []
y = []
for data in X[:5]:
    x.append([data[0], data[1]])
    y.append(data[2])

x = torch.tensor(x)
y = torch.tensor(y)

print(x.shape, y.shape)
print(x)
print(y)

torch.Size([5, 2]) torch.Size([5])
tensor([[ 0,  0],
        [ 0,  5],
        [ 5, 13],
        [13, 13],
        [13,  1]])
tensor([ 5, 13, 13,  1,  0])


In [159]:
# Since we can't pass int64 (long) into a neural network, we use one hot vector
import torch.nn.functional as F

torch.manual_seed(1024)

x_enc = F.one_hot(x, num_classes=len(chars)).float()
print(x_enc.shape)
# Reshape it to stack the inputs
# X_temp = torch.tensor([[[1, 2, 3], [4, 5, 6]], [[10, 20, 30], [40, 50, 60]]]).float()

# print(X_temp.view(len(X_temp), -1))
# print(X_temp.reshape(2, 6))

x_enc = x_enc.view(len(x_enc), -1)
print(f'{x_enc.shape=}')

# Now we make a 27 layer output to predict probability distribution over characters
# weight = 54 (inputs) * 27 (output layer)

W = torch.randn(54, 27)

out = x_enc @ W
print(f'output shape: {out.shape}')

# 1st neuron output for 1st data
neuron_output = torch.dot(x_enc[0, :], W[:, 0])
print(neuron_output)


counts = out.exp()
prob_nn = counts / counts.sum(dim=1, keepdim=True)
print(f'{prob_nn.shape=}')
print(y)
# These give the probability for the current target values (true)
print(prob_nn[0, 5], prob_nn[1, 13], prob_nn[2, 13], prob_nn[3, 1], prob_nn[4, 0])
print(prob_nn[torch.arange(len(prob_nn)), y])

# Now to compute logprob
logprob = prob_nn[torch.arange(len(prob_nn)), y].log()
print(f'{logprob=}')

# We can see that logprob of fair predictions (i.e., for 4th data -> 0.1475) is more compared to others (very less prob -> bad predictions). These are true label, so the prob should be higher.

# To compute loss, we take negative logprob (and average it)
loss = -1 * logprob.mean()
print(f'{loss=}')

# This loss is pretty high compared to what we got from counting model above. our aim is to get close to counting model loss, because that reflects true probability distribution of the dataset.

torch.Size([5, 2, 27])
x_enc.shape=torch.Size([5, 54])
output shape: torch.Size([5, 27])
tensor(-2.6728)
prob_nn.shape=torch.Size([5, 27])
tensor([ 5, 13, 13,  1,  0])
tensor(0.0354) tensor(0.0087) tensor(0.0750) tensor(0.1475) tensor(0.0544)
tensor([0.0354, 0.0087, 0.0750, 0.1475, 0.0544])
logprob=tensor([-3.3410, -4.7408, -2.5896, -1.9139, -2.9121])
loss=tensor(3.0995)


In [260]:
# Now do for all data
x = []
y = []
for data in X:
    x.append([data[0], data[1]])
    y.append(data[2])

x = torch.tensor(x)
y = torch.tensor(y)

x_enc = F.one_hot(x, num_classes=len(chars)).float()
x_enc = x_enc.view(len(x_enc), -1)
W = torch.randn((54, 27),requires_grad=True)

# Now we can predict the probability distribution over the characters

In [261]:
def forward_backward_pass():
    # 1) FORWARD PASS
    logits = x_enc @ W
    counts_nn = logits.exp()
    prob_nn = counts_nn / counts_nn.sum(dim=1, keepdim=True)
    # lines 3, and 4 is softmax
    loss = -1 * torch.log(prob_nn[torch.arange(len(prob_nn)), y]).mean()
    # Loss is negative logprobs (average)
    print(loss)

    # 2) Backward Pass
    W.grad = None # Clear out gradients
    loss.backward()

    # Optimize weights
    W.data += -10 * W.grad # Gradients point towards the increase in loss.

forward_backward_pass()

tensor(4.2435, grad_fn=<MulBackward0>)


In [262]:
# We train for couple of epochs
epochs = 1000
for i in range(epochs):
    print(f'Epoch {i}:', end=" ")
    forward_backward_pass()

Epoch 0: tensor(4.0305, grad_fn=<MulBackward0>)
Epoch 1: tensor(3.8641, grad_fn=<MulBackward0>)
Epoch 2: tensor(3.7292, grad_fn=<MulBackward0>)
Epoch 3: tensor(3.6177, grad_fn=<MulBackward0>)
Epoch 4: tensor(3.5224, grad_fn=<MulBackward0>)
Epoch 5: tensor(3.4393, grad_fn=<MulBackward0>)
Epoch 6: tensor(3.3663, grad_fn=<MulBackward0>)
Epoch 7: tensor(3.3020, grad_fn=<MulBackward0>)
Epoch 8: tensor(3.2450, grad_fn=<MulBackward0>)
Epoch 9: tensor(3.1942, grad_fn=<MulBackward0>)
Epoch 10: tensor(3.1485, grad_fn=<MulBackward0>)
Epoch 11: tensor(3.1072, grad_fn=<MulBackward0>)
Epoch 12: tensor(3.0697, grad_fn=<MulBackward0>)
Epoch 13: tensor(3.0355, grad_fn=<MulBackward0>)
Epoch 14: tensor(3.0042, grad_fn=<MulBackward0>)
Epoch 15: tensor(2.9755, grad_fn=<MulBackward0>)
Epoch 16: tensor(2.9491, grad_fn=<MulBackward0>)
Epoch 17: tensor(2.9248, grad_fn=<MulBackward0>)
Epoch 18: tensor(2.9024, grad_fn=<MulBackward0>)
Epoch 19: tensor(2.8816, grad_fn=<MulBackward0>)
Epoch 20: tensor(2.8624, grad_

In [263]:
# Prediction
generator = torch.Generator().manual_seed(1024)
for _ in range(10):
    input_x = torch.tensor([0, 0])
    ix = 0
    output = []
    input_x = torch.cat([input_x[1:],torch.tensor([ix])])
    while True:
        one_hot_ip = F.one_hot(input_x, num_classes=len(chars))
        one_hot_ip = one_hot_ip.reshape(1, len(input_x) * len(chars)).float()
        logits_pred = one_hot_ip @ W
        counts_pred = logits_pred.exp()
        probs_pred = counts_pred/counts_pred.sum(dim=1, keepdim=True)
        ix = torch.multinomial(probs_pred, num_samples=1, replacement=True, generator=generator).item()
        char = itoc[ix]
        output.append(char)
        if ix == 0:
            break
    print(''.join(output))


kjdhdkmajjmrdymsjkaxpnhclenrznekhdsmhyljnpnnhkalrtkzbhaskgajjadtdmltajgatbmmmejahazdetraohadasejjcjakstjrtcbakescnhrambagrbwbjtfyscsjjbjilaanehojmmnessmdkkepbhajobdkjcmgjfkramaslggtbanaizlfaknoersryteskcmmhmlejkcsmadajaizcjkldboecnkaasvdazhaahectygtvglmfjkoanjbdtlgtbrckndasubjzkstkjjyaqmjkmrtaheaejelmkbdtejyhdmkyreimbykcnmcrrhmmwvikaznmtdjsljbblceearmrnaaotanewyaafgnksldlbrmksjzjkkabjajkjqrmaekidmadddclldssdlacckrjsmmmnjtknagttkharrymjkdthakscdazglrgmjatonjasblwcktmsxhjfaddsazbnkycaafcalbamzflcjzziacfrdckwjdmokdkkilrzolttoglvsaaajhakmakcazetmkajdjwawskdkgeveejkjctalnwdpajajaackdtajacseftjefaebmaaibanacsblmhmezprfacadhkkmmtzarratxjpkfasaljycasmramsmzadmgchskcdmarmkhzfsrjkjaatewihnjczndwvshrpamylnmbkjcedevkajrdnbgzrtpskjfmkdaearedksmndafhnmkcfskoehsclmmaeytaasznehjsrfkdekerzgkzsoselhmactziaagrdrfatjpzrcmakszbbldrsacoejeknyzbtaokssgleazwrmcccsmedtsbracatkdednapektsokkbssjiycampamtzyjerhnbtbavegasnaskxbskwjmncabotrdladedhytaahakkjsatvnafoaarctkrdmjangdceajaattlkaevszgmskgjkjdlladjrbbkkialr