E01: train a trigram language model, i.e. take two characters as an input to predict the 3rd one. Feel free to use either counting or a neural net. Evaluate the loss; Did it improve over a bigram model?

In [2]:
import os
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

In [3]:
words = open('names.txt', 'r').read().splitlines()
print(words[:5])

['emma', 'olivia', 'ava', 'isabella', 'sophia']


In [4]:
import torch

In [8]:
N = torch.zeros((27, 27, 27), dtype=torch.int32)

In [9]:
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i, s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s, i in stoi.items()}

In [17]:
for w in words:
    chs = ['.']*2 + list(w) + ['.']
    for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        ix3 = stoi[ch3]
        N[ix1, ix2, ix3] += 1

In [22]:
P = (N + 1).float()
P /= P.sum(2, keepdim=True) + 1e-6

In [5]:
g = torch.Generator().manual_seed(2147483647)

In [27]:
for _ in range(5):
    out = []
    ix1, ix2 = 0, 0  
    while True:
        p = P[ix1, ix2] 
        ix3 = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
        out.append(itos[ix3])
        if ix3 == 0:  
            break
        ix1, ix2 = ix2, ix3  
    print(''.join(out))

daill.
jalantestian.
na.
sudaeveigh.
diren.


In [22]:
xs, ys = [], []
for w in words:
    chs = ['.']*2 + list(w) + ['.']
    for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        ix3 = stoi[ch3]
        xs.append((ix1, ix2))
        ys.append(ix3)
xs = torch.tensor(xs, dtype=torch.long)
ys = torch.tensor(ys, dtype=torch.long)
num = xs.shape[0]

xenc = torch.nn.functional.one_hot(xs, num_classes=27).float().view(num, -1)
W = torch.randn((54, 27), generator=g, requires_grad=True)

In [52]:
for k in range(100):
    logits = xenc @ W
    counts = logits.exp()
    probs = counts / counts.sum(1, keepdim=True)
    loss = -torch.log(probs[torch.arange(num), ys]).mean()
    print(loss)

    W.grad = None
    loss.backward()
    W.data += -50 * W.grad



tensor(2.3861, grad_fn=<NegBackward0>)
tensor(2.3851, grad_fn=<NegBackward0>)
tensor(2.3842, grad_fn=<NegBackward0>)
tensor(2.3834, grad_fn=<NegBackward0>)
tensor(2.3825, grad_fn=<NegBackward0>)
tensor(2.3817, grad_fn=<NegBackward0>)
tensor(2.3809, grad_fn=<NegBackward0>)
tensor(2.3801, grad_fn=<NegBackward0>)
tensor(2.3794, grad_fn=<NegBackward0>)
tensor(2.3787, grad_fn=<NegBackward0>)
tensor(2.3780, grad_fn=<NegBackward0>)
tensor(2.3773, grad_fn=<NegBackward0>)
tensor(2.3766, grad_fn=<NegBackward0>)
tensor(2.3760, grad_fn=<NegBackward0>)
tensor(2.3754, grad_fn=<NegBackward0>)
tensor(2.3748, grad_fn=<NegBackward0>)
tensor(2.3742, grad_fn=<NegBackward0>)
tensor(2.3736, grad_fn=<NegBackward0>)
tensor(2.3731, grad_fn=<NegBackward0>)
tensor(2.3727, grad_fn=<NegBackward0>)
tensor(2.3725, grad_fn=<NegBackward0>)
tensor(2.3733, grad_fn=<NegBackward0>)
tensor(2.3762, grad_fn=<NegBackward0>)
tensor(2.3885, grad_fn=<NegBackward0>)
tensor(2.4001, grad_fn=<NegBackward0>)
tensor(2.4626, grad_fn=<N

In [53]:
for i in range(5): 
    out = []
  
    context = [0, 0]
    while True:
        xenc = torch.cat(
            [torch.nn.functional.one_hot(torch.tensor([ix]), num_classes=27).float() 
             for ix in context],
            dim=1
        )
        
        logits = xenc @ W
        counts = logits.exp()
        probs = counts / counts.sum(1, keepdim=True)

        ix = torch.multinomial(probs.squeeze(0), num_samples=1, replacement=True, generator=g).item()
        out.append(itos[ix])

        context = context[1:] + [ix]

        if ix == 0:
            break

    print(''.join(out))


camyn.
cor.
aryeshaumiylielyna.
aat.
raylaheree.


I don't know if it improved much. In theory it should, but our both models are so simple. Thus both kinda returns somewhat useless outputs. Currently I don't know any metric I can compare them with.

E02: split up the dataset randomly into 80% train set, 10% dev set, 10% test set. Train the bigram and trigram models only on the training set. Evaluate them on dev and test splits. What can you see?

In [1]:
from torch.utils.data import TensorDataset, DataLoader
import random

In [6]:
random.shuffle(words)
n = len(words)
train_words = words[:int(0.8*n)]
dev_words = words[int(0.8*n):int(0.9*n)]
test_words = words[int(0.9*n):]

In [10]:
def create_dataset(word_list):
    xs, ys = [], []
    for w in word_list:
        chs = ['.']*2 + list(w) + ['.']
        for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
            ix1 = stoi[ch1]
            ix2 = stoi[ch2]
            ix3 = stoi[ch3]
            xs.append((ix1, ix2))
            ys.append(ix3)
    return torch.tensor(xs), torch.tensor(ys)

X_train, y_train = create_dataset(train_words)
X_dev, y_dev = create_dataset(dev_words)
X_test, y_test = create_dataset(test_words)

In [11]:
def evaluate(model, X, y):
    with torch.no_grad():
        xenc = torch.nn.functional.one_hot(X, num_classes=27).float().view(-1, 54)
        logits = xenc @ model
        loss = -torch.log(logits.softmax(1)[torch.arange(len(y)), y]).mean()
    return loss.item()

In [12]:
W = torch.randn((54, 27), generator=g, requires_grad=True)

In [14]:
for k in range(100):
    xenc = torch.nn.functional.one_hot(X_train, num_classes=27).float().view(-1, 54)
    logits = xenc @ W
    probs = logits.softmax(1)
    loss = -torch.log(probs[torch.arange(len(y_train)), y_train]).mean()
    
    W.grad = None
    loss.backward()
    W.data += -50 * W.grad
    
    if k % 10 == 0:
        train_loss = loss.item()
        dev_loss = evaluate(W, X_dev, y_dev)
        test_loss = evaluate(W, X_test, y_test)
        print(f'Step {k}: train={train_loss:.4f}, dev={dev_loss:.4f}, test={test_loss:.4f}')

Step 0: train=2.3872, dev=2.4267, test=2.4237
Step 10: train=2.3899, dev=2.4452, test=2.4415
Step 20: train=2.3820, dev=2.4209, test=2.4177
Step 30: train=2.3859, dev=2.4418, test=2.4378
Step 40: train=2.3783, dev=2.4170, test=2.4135
Step 50: train=2.3829, dev=2.4392, test=2.4349
Step 60: train=2.3755, dev=2.4142, test=2.4104
Step 70: train=2.3806, dev=2.4371, test=2.4325
Step 80: train=2.3734, dev=2.4123, test=2.4082
Step 90: train=2.3787, dev=2.4354, test=2.4306


In [16]:
def generate_name(model, num_samples=5):
    for _ in range(num_samples):
        out = []
        context = [0, 0] 
        
        while True:
            xenc = torch.nn.functional.one_hot(torch.tensor(context), num_classes=27).float().view(1, -1)
            logits = xenc @ model
            probs = logits.softmax(1)
            
            ix = torch.multinomial(probs, num_samples=1, replacement=True, generator=g).item()
            out.append(itos[ix])
            
            context = context[1:] + [ix]
            if ix == 0:  
                break
                
        print(''.join(out[:-1]))  


print("Generated names:")
generate_name(W)


Generated names:
trasn
ha
ri
halayly
gvavekhtaere


E03: use the dev set to tune the strength of smoothing (or regularization) for the trigram model - i.e. try many possibilities and see which one works best based on the dev set loss. What patterns can you see in the train and dev set loss as you tune this strength? Take the best setting of the smoothing and evaluate on the test set once and at the end. How good of a loss do you achieve?

In [18]:
smoothing_values = [0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 100.0, 1000.0, 10000.0]
results = []

for alpha in smoothing_values:
    N = torch.zeros((27, 27, 27), dtype=torch.int32)
    P = (N + alpha).float()
    P /= P.sum(2, keepdim=True)
    
    W = torch.randn((54, 27), generator=g, requires_grad=True)

    for k in range(100):
        xenc = torch.nn.functional.one_hot(X_train, num_classes=27).float().view(-1, 54)
        logits = xenc @ W
        probs = logits.softmax(1)
        loss = -torch.log(probs[torch.arange(len(y_train)), y_train]).mean()
        
        W.grad = None
        loss.backward()
        W.data += -50 * W.grad
    
    train_loss = evaluate(W, X_train, y_train)
    dev_loss = evaluate(W, X_dev, y_dev)
    results.append((alpha, train_loss, dev_loss))
    print(f'Alpha={alpha}: train={train_loss:.4f}, dev={dev_loss:.4f}')

Alpha=0.1: train=2.3842, dev=2.3907
Alpha=0.5: train=2.3855, dev=2.3924
Alpha=1.0: train=2.3914, dev=2.3995
Alpha=2.0: train=2.3870, dev=2.3953
Alpha=5.0: train=2.4357, dev=2.4496
Alpha=10.0: train=2.3864, dev=2.3924
Alpha=100.0: train=2.3841, dev=2.3924
Alpha=1000.0: train=2.3853, dev=2.3929
Alpha=10000.0: train=2.3862, dev=2.3946


In [20]:
smoothing_values = [0.001, 0.01, 0.1, 0.5, 1.0]
results = []

for alpha in smoothing_values:
    W = torch.randn((54, 27), generator=g, requires_grad=True)
    
    for k in range(100):
        xenc = torch.nn.functional.one_hot(X_train, num_classes=27).float().view(-1, 54)
        logits = xenc @ W
        probs = logits.softmax(1)
        loss = -torch.log(probs[torch.arange(len(y_train)), y_train]).mean() + alpha*(W**2).mean()
        
        W.grad = None
        loss.backward()
        W.data += -50 * W.grad

    train_loss = evaluate(W, X_train, y_train)
    dev_loss = evaluate(W, X_dev, y_dev)
    results.append((alpha, train_loss, dev_loss))
    print(f'Alpha={alpha}: train={train_loss:.4f}, dev={dev_loss:.4f}')

Alpha=0.001: train=2.3884, dev=2.3957
Alpha=0.01: train=2.3875, dev=2.3932
Alpha=0.1: train=2.3925, dev=2.3984
Alpha=0.5: train=2.5318, dev=2.5431
Alpha=1.0: train=2.6020, dev=2.6127


I tried 2 different smoothing techniques, but it didnt create better results.

E04: we saw that our 1-hot vectors merely select a row of W, so producing these vectors explicitly feels wasteful. Can you delete our use of F.one_hot in favor of simply indexing into rows of W?

In [54]:
xs, ys = [], []
for w in words:
    chs = ['.']*2 + list(w) + ['.']
    for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        ix3 = stoi[ch3]
        xs.append((ix1, ix2))
        ys.append(ix3)
xs = torch.tensor(xs, dtype=torch.long)
ys = torch.tensor(ys, dtype=torch.long)
num = xs.shape[0]

xenc = torch.nn.functional.one_hot(xs, num_classes=27).float().view(num, -1)
W = torch.randn((54, 27), generator=g, requires_grad=True)

In [55]:
xenc = torch.nn.functional.one_hot(xs, num_classes=27).float().view(-1, 54) 
logits1 = xenc @ W  

W_reshaped = W.view(2, 27, 27) 
logits3 = W_reshaped[0][xs[:,0]] + W_reshaped[1][xs[:,1]]  

In [53]:
print (logits1, logits3)

tensor([[ 0.4559,  0.3997, -0.3886,  ...,  0.7825,  0.6419, -0.1456],
        [-0.7279, -2.6708, -0.8460,  ...,  0.6269,  2.5628,  0.6440],
        [ 0.2446, -1.6428,  1.4619,  ..., -0.2249,  0.1742,  0.0665],
        ...,
        [ 1.6848, -2.9752,  0.1465,  ...,  0.0926, -0.4444, -1.6957],
        [-0.3198,  1.0638,  1.2827,  ...,  1.4146, -0.0802,  1.5365],
        [ 0.8131, -2.1948,  2.7385,  ...,  0.6649, -1.9760, -1.0428]],
       grad_fn=<MmBackward0>) tensor([[ 0.4559,  0.3997, -0.3886,  ...,  0.7825,  0.6419, -0.1456],
        [-0.7279, -2.6708, -0.8460,  ...,  0.6269,  2.5628,  0.6440],
        [ 0.2446, -1.6428,  1.4619,  ..., -0.2249,  0.1742,  0.0665],
        ...,
        [ 1.6848, -2.9752,  0.1465,  ...,  0.0926, -0.4444, -1.6957],
        [-0.3198,  1.0638,  1.2827,  ...,  1.4146, -0.0802,  1.5365],
        [ 0.8131, -2.1948,  2.7385,  ...,  0.6649, -1.9760, -1.0428]],
       grad_fn=<AddBackward0>)
