In [1]:
# Imports
import torch
print(torch.__version__)

2.8.0+cu126


In [2]:
words = open('names.txt', 'r').read().splitlines() # Load names into array of strings
words[:10]

['emma',
 'olivia',
 'ava',
 'isabella',
 'sophia',
 'charlotte',
 'mia',
 'amelia',
 'harper',
 'evelyn']

In [3]:
print('Number of words: ', len(words))
print('MIN: ', min(len(w) for w in words))
print('MAX: ', max(len(w) for w in words))

Number of words:  32033
MIN:  2
MAX:  15


In [4]:
# Bigrams (predict next character using the previous character)

N = torch.zeros((27, 27), dtype = torch.int32) # Create 3 by 5 array of 32 bit integers, all zeroes, containing bigram counts
chars = sorted(list(set(''.join(words)))) # Get all characters used in words into a list, in order'

stoi = {s:i+1 for i, s in enumerate(chars)} # Map characters to indices for 2d array storage
stoi['.'] = 0 # Special Character for start or end of word
itos = {i:s for s, i in stoi.items()} # Map indicies

for w in words: # Iterate over all words
    chs = ['.'] + list(w) + ['.'] # Adds special token/character to word to analyze which letters are more likely to start and end a word
    for c1, c2 in zip(chs, chs[1:]): # Iterate over consecutive characters in word
        N[stoi[c1], stoi[c2]] += 1
        
N

tensor([[   0, 4410, 1306, 1542, 1690, 1531,  417,  669,  874,  591, 2422, 2963,
         1572, 2538, 1146,  394,  515,   92, 1639, 2055, 1308,   78,  376,  307,
          134,  535,  929],
        [6640,  556,  541,  470, 1042,  692,  134,  168, 2332, 1650,  175,  568,
         2528, 1634, 5438,   63,   82,   60, 3264, 1118,  687,  381,  834,  161,
          182, 2050,  435],
        [ 114,  321,   38,    1,   65,  655,    0,    0,   41,  217,    1,    0,
          103,    0,    4,  105,    0,    0,  842,    8,    2,   45,    0,    0,
            0,   83,    0],
        [  97,  815,    0,   42,    1,  551,    0,    2,  664,  271,    3,  316,
          116,    0,    0,  380,    1,   11,   76,    5,   35,   35,    0,    0,
            3,  104,    4],
        [ 516, 1303,    1,    3,  149, 1283,    5,   25,  118,  674,    9,    3,
           60,   30,   31,  378,    0,    1,  424,   29,    4,   92,   17,   23,
            0,  317,    1],
        [3983,  679,  121,  153,  384, 1271,   82,

In [5]:
# Sampling from bigram table

p = N[0].float()
p = p / p.sum() # Contains probability distrobution (Where first character is the start character (second character is the first character of the word)
# Sample using Pytorch

In [6]:
# Optimization: Normalize each row of N
P = (N+1).float() # Adding 1 removes all instances of impossible bigrams, still very unlikely

for row in P:
    row /= row.sum()

In [7]:
g = torch.Generator().manual_seed(2147483647)
# Generate Names
for i in range(10):
    out = []
    idx = 0
    while True:
        p = P[idx] # Get row corresponding to probabilities of next character, given previous character at idx
        
        idx = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
        out.append(itos[idx])
        if idx == 0:
            break
    print(''.join(out))

cexze.
momasurailezitynn.
konimittain.
llayn.
ka.
da.
staiyaubrtthrigotai.
moliellavo.
ke.
teda.


In [8]:
# Likelihood (measure how good the model is)
# Checks original dataset and the probabilites of bigrams assigned byy model, sees how closely the predictions match the actual bigrams in the training
# data. The likelihood of the model is all the probabilities of the bigrams found in the actual dataset multiplied, and we take the log of the
# likelihood to better measure the sum of all the probabilities (they get too close to zero when multiplying a lot) 
# log(a*b*c) = log(a) + log(b) + log(c)

log_likelihood = 0.0
count = 0
for w in words[:3]:
    chs = ['.'] + list(w) + ['.'] 
    for c1, c2 in zip(chs, chs[1:]):
        idx1 = stoi[c1]
        idx2 = stoi[c2]
        log_likelihood += torch.log(P[idx1, idx2]) # Increment log_likelihood by log of each individual probability of a bigram in the data set
        count += 1
        #print('(', c1, c2, ')', 'Likelihood:', log_likelihood) 

loss = -log_likelihood / count # Calculate loss by averaging negative of log_likelihood over each bigram found (zero loss is ideal)
print(loss)

tensor(2.4255)


In [9]:
# Using Nueral Networks to guess next character given the previous one. With a nueral network, we can minimize the loss function using backtracking
# Creating dataset: Get inputs and outputs

xs, ys = [], [] # Inputs and outputs

for w in words[:3]: # Iterate over all bigrams in all words
    chs = ['.'] + list(w) + ['.'] 
    for c1, c2 in zip(chs, chs[1:]): 
        idx1 = stoi[c1]
        idx2 = stoi[c2]
        xs.append(idx1)
        ys.append(idx2)

xs = torch.tensor(xs)
ys = torch.tensor(ys)

In [10]:
import torch.nn.functional as F

# Using one_hot encoding on training data
# Converts 5 to [0,0,0,0,1,0 ... 0] array size is 27, as there are 27 possible values
xenc = F.one_hot(xs, num_classes=27).float() # numclasses is the size of the onehot encoding array

In [11]:
W = torch.randn((27, 27), requires_grad=True) # Weights of 27 neurons 

In [12]:
logits = xenc @ W # Martix multiplication, gets outputs of 27 nuerons
# Output of  matrix multiplication is 5 X 27, 5 inputs and 27 results for each input

counts = logits.exp() # exponentiate all results to get positive values, this is just like the N matrix in the bigram counting example
prob = counts / counts.sum(1, keepdims=True) # Normalize each row of the matrix to have sum = 1

In [13]:
# Calculating Loss (without pytorch)
log_likelihood = 0.0
count = 0
for exp_y in ys:
    log_likelihood += torch.log(prob[0, exp_y])
    count += 1

loss = -log_likelihood/count 
print('Loss:',loss)

Loss: tensor(3.9130, grad_fn=<DivBackward0>)


In [14]:
# Conpacted Setup
import torch.nn.functional as F
xs, ys = [], [] # Inputs and outputs
num = 0

for w in words:
    chs = ['.'] + list(w) + ['.'] 
    for c1, c2 in zip(chs, chs[1:]): 
        idx1 = stoi[c1]
        idx2 = stoi[c2]
        xs.append(idx1)
        ys.append(idx2)

xs = torch.tensor(xs)
ys = torch.tensor(ys)
num = xs.nelement()
print('Num inputs:', num)
W = torch.randn((27, 27), requires_grad=True)

Num inputs: 228146


In [25]:
# Tuning Nueral Network using back propogation
for k in range(100):
    
    #Forward pass (Convert inputs, multiply inputs by weights, convert to probability distrobution)
    xenc = F.one_hot(xs, num_classes=27).float()
    logits = xenc @ W
    counts = logits.exp()
    prob = counts / counts.sum(1, keepdims=True)
    loss = -prob[torch.arange(num), ys].log().mean() # Calculate Loss in a better way using pytorch
    #print('Round', str(k + 1) + ':', loss.item())
    
    #Backward Pass
    W.grad = None # Reset gradients 
    loss.backward() # Set gradients

    # Update (Only 1 parameter to worry about tuning)
    W.data += -50 * W.grad
    

In [44]:
# Sampling from the Nueral Network
for i in range(5):
    out = []
    ix = 0
    while True:
        xenc = F.one_hot(torch.tensor([ix]), num_classes=27).float()
        logits = xenc @ W
        counts = logits.exp()
        prob = counts / counts.sum(1, keepdims=True)

        ix = torch.multinomial(prob, num_samples=1, replacement = True).item()
        if ix == 0:
            break
        out.append(itos[ix])
    print(''.join(out))
        

kiryillara
h
imiyalalemzaymaradiridiloyn
nnico
jonihitaan
