### Bigram Model

In [None]:
words = open('names.txt', 'r').read().splitlines()
words[:20]

In [None]:
len(words)

In [None]:
min([len(word) for word in words])

In [None]:
max([len(word) for word in words])

In [None]:
# just figuring out what zip does for uneven lists
for one, two in zip([1, 2, 3], [2, 3]):
    print( one, two)
print('-')
for one, two in zip([1, 2], [1, 2, 3]):
    print( one, two)    

In [None]:
#how can zip help us in creating bigrams
bigram_counts = {}
for w in words:
    chs = ['.'] + list(w) + ['.']
    # print(chs)
    for c1, c2 in zip(chs, chs[1:]):
        bigram = (c1, c2)
        bigram_counts[bigram] = bigram_counts.get(bigram, 0) + 1
        # print(c1, c2)

In [None]:
bigrams_sorted_by_value = sorted(bigram_counts, key=bigram_counts.get, reverse=True)
for bigram in bigrams_sorted_by_value:
    print(bigram, bigram_counts[bigram])

In [None]:
import torch

In [None]:
BIGRAM_COUNTS = torch.ones((27, 27), dtype=torch.int32)
# ones instead of zeros to avoid 0 probability

In [None]:
# maps from string(character) to int index and vice versa - from int index to string(char) 
chars = sorted(list(set(''.join(words))))
chars.insert(0, '.')
stoi = {s: i for i, s in enumerate(chars)}
itos = {i: s for i, s in enumerate(chars)}
print(stoi)
print(itos)

In [None]:
for w in words:
    chs = ['.'] + list(w) + ['.']
    # print(chs)
    for c1, c2 in zip(chs, chs[1:]):
        c1_idx = stoi[c1]
        c2_idx = stoi[c2]
        BIGRAM_COUNTS[c1_idx, c2_idx] += 1

In [None]:
BIGRAM_COUNTS[stoi['n'], stoi['.']]
BIGRAM_COUNTS.size(dim=0)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

plt.figure(figsize=(16, 16))
plt.imshow(BIGRAM_COUNTS, cmap='Blues')
for i in range(BIGRAM_COUNTS.size(dim=0)):
    for j in range(BIGRAM_COUNTS.size(dim=1)):
        chstr = itos[i] + itos[j]
        plt.text(j, i, chstr, ha='center', va='bottom', color='gray')
        plt.text(j, i, BIGRAM_COUNTS[i, j].item(), ha='center', va='top', color='gray')
plt.axis('off')

In [None]:
g = torch.Generator().manual_seed(2147483647)
p = torch.rand(3, generator=g)
p
p = p/p.sum()
p

In [None]:
results = torch.multinomial(p, num_samples=100, replacement=True, generator=g).numpy()
print(results)
from collections import Counter
Counter(results)

In [None]:
p = BIGRAM_COUNTS[0,:]/sum(BIGRAM_COUNTS[0,:])
ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
ix
itos[ix]

In [None]:
P = BIGRAM_COUNTS/BIGRAM_COUNTS.sum(1, keepdims=True)

In [None]:
g = torch.Generator().manual_seed(2147483647)

for i in range(5):
    
    generated_word = []
    ix = 0
    while True:
#         row_for_index = BIGRAM_COUNTS[ix, :].float()
#         p = row_for_index/row_for_index.sum()
        ix = torch.multinomial(P[ix, :], num_samples=1, replacement=True, generator=g).item()
        sampled_char = itos[ix]
        generated_word.append(sampled_char)
        if sampled_char == '.':
            break

    print(''.join(generated_word))
    

In [None]:
# Goal: Maximize Likelihood of the data w.r.t the model parameters (statistical modeling)
# equivalent to maximizing the log likelihood (because log is monotonic)
# equivalent to minimizing the negative log likelihood
# equivalent to minimizing the mean negative log likelihood

# log(a*b*c) = log(a) + log(b) + log(c)

log_likelihood = 0.0
num_bigrams = 0
for w in words:
#for w in ["bkoyko"]:
    chs = ['.'] + list(w) + ['.']
    # print(chs)
    for c1, c2 in zip(chs, chs[1:]):
        c1_idx = stoi[c1]
        c2_idx = stoi[c2]
        prob = P[c1_idx, c2_idx]
        logprob = torch.log(prob)
        log_likelihood += logprob
        print(f'{c1}{c2}: {prob:.4f} {logprob:.4f}')
        num_bigrams += 1
        
print(f'{log_likelihood=}')
neg_log_likelihood = - log_likelihood
print(f'{neg_log_likelihood=}')
mean_neg_log_likelihood = neg_log_likelihood/num_bigrams
print(f'{mean_neg_log_likelihood=}')


### Neural Net

In [None]:

#create a training set of bigrams(x, y)
xs, ys = [], []

for w in words[:1]:
    chs = ['.'] + list(w) + ['.']
    for c1, c2 in zip(chs, chs[1:]):
        c1_idx = stoi[c1]
        c2_idx = stoi[c2]
        xs.append(c1_idx)
        ys.append(c2_idx)
        
xs = torch.tensor(xs)
ys = torch.tensor(ys)
xs, ys

In [None]:
import torch.nn.functional as F
x_enc = F.one_hot(xs, num_classes=27).float()
x_enc.shape
y_enc = F.one_hot(xs, num_classes=27).float()
plt.imshow(x_enc)

In [None]:
# Randomly initialize 27 neurons' weights, each neuron receives 27 inputs
g = torch.Generator().manual_seed(2147483647)
W = torch.randn((27, 27), generator=g, requires_grad=True)
W.shape, x_enc.shape

In [None]:
#forward pass
x_enc = F.one_hot(xs, num_classes=27).float() # input to the network - one-hot encoded character
logits = x_enc @ W # predict log counts
counts = logits.exp() # counts equivalent to BIGRAM_COUNTS
probs = counts / counts.sum(1, keepdims=True) # probabilities for next character
# above two lines are 'softmax'
loss = (-(probs[torch.arange(5), ys].log())).mean()
loss

In [None]:
# nll = negative log likelihood
nlls = torch.zeros(5)
for i in range(5):
    # the i-th bigram
    x = xs[i].item() # input character index
    y = ys[i].item() # label character index
    print('----')
    print(f'bigram example {i+1}: {itos[x]}{itos[y]} indexes({x},{y})')
    print('input to the Nerual Net: ', x)
    print('output probabilities from the neural net: ', probs[i])
    print('label(actual next character index): ', y)
    p = probs[i, y]
    print('Probability assigned by the neural net to the correct charatecter: ', p.item())
    logp = torch.log(p)
    print('log likelihood: ', logp)
    nll = -logp
    print('negative log likelihood: ', nll)
    nlls[i] = nll
    
print('========')
print('average negative log likelihood, i.e. loss = ', nlls.mean().item())
    

In [None]:
probs.shape

In [None]:
xs

In [None]:
ys

In [None]:
probs[0, 5], probs[1, 13], probs[2, 13], probs[3, 1], probs[4, 0]

In [None]:
loss = (-(probs[torch.arange(5), ys].log())).mean()
loss

In [None]:
# backward pass
W.grad = None # sets the gradient to Zero
loss.backward()

In [None]:
W.grad

In [None]:
# Update weights
W.data -= 0.1 * W.grad

##### Let's put it all together

In [264]:
# create the dataset
xs, ys = [], []

for w in words:
    chs = ['.'] + list(w) + ['.']
    for c1, c2 in zip(chs, chs[1:]):
        c1_idx = stoi[c1]
        c2_idx = stoi[c2]
        xs.append(c1_idx)
        ys.append(c2_idx)
        
xs = torch.tensor(xs)
ys = torch.tensor(ys)
num = xs.nelement()

print("Number of examples: ", num)

# initialize the network
g = torch.Generator().manual_seed(2147483647)
W = torch.randn((27, 27), generator=g, requires_grad=True)

Number of examples:  228146


In [265]:
# gradient descent
for k in range(100):
    
    #forward pass
    x_enc = F.one_hot(xs, num_classes=27).float() # input to the network - one-hot encoded character
    logits = x_enc @ W # predict log counts
    counts = logits.exp() # counts equivalent to BIGRAM_COUNTS
    probs = counts / counts.sum(1, keepdims=True) # probabilities for next character
    loss = (-(probs[torch.arange(num), ys].log())).mean()
    # regularization
    reg_lambda = 0.01
    loss += reg_lambda * (W**2).mean().item()
    if k % 9 == 0:
        print(loss.item())
    
    # backward pass
    W.grad = None # sets the gradient to Zero
    loss.backward()
    
    # Update weights
    learning_rate = 50
    W.data -= learning_rate * W.grad
    

3.768618583679199
2.719987630844116
2.596351146697998
2.5505459308624268
2.5276565551757812
2.514451503753662
2.506150484085083
2.500559091567993
2.4965858459472656
2.493650197982788
2.491422414779663
2.489698886871338


In [269]:
# sample form the neural net
g = torch.Generator().manual_seed(2147483647)

for i in range(5):
    
    sampled_word = []
    ix = 0
    
    while True:
        
        # BEFORE
        # p = P[ix]
        
        # NOW
        x_enc = F.one_hot(torch.tensor([ix]), num_classes=27).float()
        logits = x_enc @ W # predict log counts
        counts = logits.exp() # counts equivalent to BIGRAM_COUNTS
        p = counts / counts.sum(1, keepdims=True) # probabilities for next character
        
        ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
        sampled_char = itos[ix]
        sampled_word.append(sampled_char)
        if sampled_char == '.':
            break
     
    print(''.join(sampled_word))

mor.
axwaninaymoryles.
kondmaisah.
anchshizarie.
odaren.


In [None]:
mor.
axx.
minaymoryles.
kondlaisah.
anchshizarie.