In [None]:
# Imports
import torch
print(torch.__version__)

In [None]:
words = open('names.txt', 'r').read().splitlines() # Load names into array of strings
words[:10]

In [None]:
print('Number of words: ', len(words))
print('MIN: ', min(len(w) for w in words))
print('MAX: ', max(len(w) for w in words))

In [None]:
# Bigrams (predict next character using the previous character)

N = torch.zeros((27, 27), dtype = torch.int32) # Create 3 by 5 array of 32 bit integers, all zeroes, containing bigram counts
chars = sorted(list(set(''.join(words)))) # Get all characters used in words into a list, in order'

stoi = {s:i+1 for i, s in enumerate(chars)} # Map characters to indices for 2d array storage
stoi['.'] = 0 # Special Character for start or end of word
itos = {i:s for s, i in stoi.items()} # Map indicies

for w in words: # Iterate over all words
    chs = ['.'] + list(w) + ['.'] # Adds special token/character to word to analyze which letters are more likely to start and end a word
    for c1, c2 in zip(chs, chs[1:]): # Iterate over consecutive characters in word
        N[stoi[c1], stoi[c2]] += 1
        
N

In [None]:
# Sampling from bigram table

p = N[0].float()
p = p / p.sum() # Contains probability distrobution (Where first character is the start character (second character is the first character of the word)
# Sample using Pytorch

In [None]:
# Optimization: Normalize each row of N
P = (N+1).float() # Adding 1 removes all instances of impossible bigrams, still very unlikely

for row in P:
    row /= row.sum()

In [None]:
g = torch.Generator().manual_seed(2147483647)
# Generate Names
for i in range(10):
    out = []
    idx = 0
    while True:
        p = P[idx] # Get row corresponding to probabilities of next character, given previous character at idx
        
        idx = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
        out.append(itos[idx])
        if idx == 0:
            break
    print(''.join(out))

In [19]:
# Likelihood (measure how good the model is)
# Checks original dataset and the probabilites of bigrams assigned byy model, sees how closely the predictions match the actual bigrams in the training
# data. The likelihood of the model is all the probabilities of the bigrams found in the actual dataset multiplied, and we take the log of the
# likelihood to better measure the sum of all the probabilities (they get too close to zero when multiplying a lot) 
# log(a*b*c) = log(a) + log(b) + log(c)

log_likelihood = 0.0
count = 0
for w in words[:3]:
    chs = ['.'] + list(w) + ['.'] 
    for c1, c2 in zip(chs, chs[1:]):
        idx1 = stoi[c1]
        idx2 = stoi[c2]
        log_likelihood += torch.log(P[idx1, idx2]) # Increment log_likelihood by log of each individual probability of a bigram in the data set
        count += 1
        #print('(', c1, c2, ')', 'Likelihood:', log_likelihood) 

loss = -log_likelihood / count # Calculate loss by averaging negative of log_likelihood over each bigram found (zero loss is ideal)
print(loss)

tensor(2.4541)


In [22]:
# Using Nueral Networks to guess next character given the previous one. With a nueral network, we can minimize the loss function using backtracking
# Creating dataset: Get inputs and outputs

xs, ys = [], [] # Inputs and outputs

for w in words[:3]: # Iterate over all bigrams in all words
    chs = ['.'] + list(w) + ['.'] 
    for c1, c2 in zip(chs, chs[1:]): 
        idx1 = stoi[c1]
        idx2 = stoi[c2]
        xs.append(idx1)
        ys.append(idx2)

xs = torch.tensor(xs)
ys = torch.tensor(ys)

In [None]:
import torch.nn.functional as F

# Using one_hot encoding on training data
# Converts 5 to [0,0,0,0,1,0 ... 0] array size is 27, as there are 27 possible values
xenc = F.one_hot(xs, num_classes=27).float() # numclasses is the size of the onehot encoding array

In [68]:
W = torch.randn((27, 27), requires_grad=True) # Weights of 27 neurons 

In [69]:
logits = xenc @ W # Martix multiplication, gets outputs of 27 nuerons
# Output of  matrix multiplication is 5 X 27, 5 inputs and 27 results for each input

counts = logits.exp() # exponentiate all results to get positive values, this is just like the N matrix in the bigram counting example
prob = counts / counts.sum(1, keepdims=True) # Normalize each row of the matrix to have sum = 1

In [70]:
# Calculating Loss (without pytorch)
log_likelihood = 0.0
count = 0
for exp_y in ys:
    log_likelihood += torch.log(prob[0, exp_y])
    count += 1

loss = -log_likelihood/count 
print('Loss:',loss)

Loss: tensor(3.9117, grad_fn=<DivBackward0>)


In [74]:
# Conpacted Setup
import torch.nn.functional as F
xs, ys = [], [] # Inputs and outputs
num = 0

for w in words:
    chs = ['.'] + list(w) + ['.'] 
    for c1, c2 in zip(chs, chs[1:]): 
        idx1 = stoi[c1]
        idx2 = stoi[c2]
        xs.append(idx1)
        ys.append(idx2)

xs = torch.tensor(xs)
ys = torch.tensor(ys)
num = xs.nelement()
print('Num inputs:', num)
W = torch.randn((27, 27), requires_grad=True)

Num inputs: 228146


In [83]:
# Tuning Nueral Network using back propogation
for k in range(100):
    
    #Forward pass (Convert inputs, multiply inputs by weights, convert to probability distrobution)
    xenc = F.one_hot(xs, num_classes=27).float()
    logits = xenc @ W
    counts = logits.exp()
    prob = counts / counts.sum(1, keepdims=True)
    loss = -prob[torch.arange(num), ys].log().mean() # Calculate Loss in a better way using pytorch
    print('Round', str(k + 1) + ':', loss.item())
    
    #Backward Pass
    W.grad = None # Reset gradients 
    loss.backward() # Set gradients

    # Update (Only 1 parameter to worry about tuning)
    W.data += -50 * W.grad
    

Round 1: 2.45936918258667
Round 2: 2.4593489170074463
Round 3: 2.459329128265381
Round 4: 2.4593093395233154
Round 5: 2.459289789199829
Round 6: 2.4592700004577637
Round 7: 2.4592509269714355
Round 8: 2.459231376647949
Round 9: 2.4592125415802
Round 10: 2.459193706512451
Round 11: 2.459174633026123
Round 12: 2.459156036376953
Round 13: 2.459137439727783
Round 14: 2.4591188430786133
Round 15: 2.4591004848480225
Round 16: 2.4590821266174316
Round 17: 2.45906400680542
Round 18: 2.459045886993408
Round 19: 2.4590282440185547
Round 20: 2.459010362625122
Round 21: 2.4589927196502686
Round 22: 2.458975315093994
Round 23: 2.4589576721191406
Round 24: 2.4589405059814453
Round 25: 2.458922863006592
Round 26: 2.4589059352874756
Round 27: 2.4588892459869385
Round 28: 2.4588723182678223
Round 29: 2.458855628967285
Round 30: 2.458838701248169
Round 31: 2.458822250366211
Round 32: 2.458805799484253
Round 33: 2.458789348602295
Round 34: 2.458773136138916
Round 35: 2.458756923675537
Round 36: 2.4587407