# Building Makemore MLP Exercise

## Imports

In [None]:
from tqdm import tqdm
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plot
import random
import math

In [None]:
# Summarise the CUDA setup. The device-specific queries raise on CPU-only
# installs, so they are only evaluated when CUDA is actually available.
cuda_available = torch.cuda.is_available()
(
    cuda_available,
    torch.cuda.device_count(),
    torch.cuda.current_device() if cuda_available else None,
    torch.cuda.device(0) if cuda_available else None,
    torch.cuda.get_device_name(0) if cuda_available else None,
)


AssertionError: Torch not compiled with CUDA enabled

In [None]:
g = torch.Generator().manual_seed(42)

## Setup

In [None]:
# Read one name per line; the context manager closes the file handle promptly.
with open('/kaggle/input/nameszhnn/names.txt', 'r') as names_file:
    words = names_file.read().splitlines()
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [None]:
len(words)

32033

In [None]:
def generate_training_set(words, block_size, print_disabled=False, stoi=None):
    """Build (context, next-character) pairs from a list of words.

    Every word is terminated with '.', and each character (including the
    terminator) becomes one target whose input is the `block_size` preceding
    character indices, left-padded with index 0.

    Args:
        words: iterable of strings.
        block_size: number of context characters per example.
        print_disabled: despite its name, when True the word and every
            context ---> target pair are printed. The name is kept for
            backward compatibility.
        stoi: optional character -> index mapping to use instead of building
            one from `words`. Pass the same mapping for every split so that
            all splits share one vocabulary. Index 0 is used as padding, so
            the mapping is expected to send '.' to 0.

    Returns:
        (X, Y): int64 tensors of shape (n, block_size) and (n,).
    """
    if stoi is None:
        # Vocabulary derived from the given words only; '.' is reserved as index 0.
        chars = sorted(set(''.join(words)))
        stoi = {s: i + 1 for i, s in enumerate(chars)}
        stoi['.'] = 0
    itos = {i: s for s, i in stoi.items()}

    X, Y = [], []

    for w in words:
        if print_disabled: print(w)

        context = [0] * block_size
        for ch in w + '.':
            ix = stoi[ch]
            X.append(context)
            Y.append(ix)
            if print_disabled: print(''.join(itos[i] for i in context), '--->', itos[ix])
            context = context[1:] + [ix]  # slide the window: drop oldest, append newest

    # Explicit dtype and shape keep the result well-formed even when no
    # examples were produced (torch.tensor([]) would be float32 of shape (0,)).
    n_examples = len(Y)
    X = torch.tensor(X, dtype=torch.long).reshape(n_examples, block_size)
    Y = torch.tensor(Y, dtype=torch.long)
    return X, Y

In [None]:
X, Y = generate_training_set(words, 3)

In [None]:
X.shape, Y.shape

(torch.Size([228146, 3]), torch.Size([228146]))

In [None]:
def generate_train_valid_test_split(words, block_size=3):
    """Shuffle the words deterministically and build 80/10/10 train/dev/test sets.

    The shuffle runs on a copy with a private `random.Random(42)`, so the
    caller's list and the global `random` state are left untouched and calling
    this function repeatedly always yields the same split.

    Args:
        words: list of strings.
        block_size: context length forwarded to `generate_training_set`.

    Returns:
        Xtr, Ytr, Xdev, Ydev, Xte, Yte as produced by `generate_training_set`
        for the first 80%, the next 10% and the last 10% of the shuffled words.
    """
    shuffled = list(words)
    random.Random(42).shuffle(shuffled)
    n1 = int(0.8 * len(shuffled))
    n2 = int(0.9 * len(shuffled))

    Xtr, Ytr = generate_training_set(shuffled[:n1], block_size)
    Xdev, Ydev = generate_training_set(shuffled[n1:n2], block_size)
    Xte, Yte = generate_training_set(shuffled[n2:], block_size)

    return Xtr, Ytr, Xdev, Ydev, Xte, Yte

In [None]:
Xtr, Ytr, Xdev, Ydev, Xte, Yte = generate_train_valid_test_split(words, block_size=3)

In [None]:
Xtr.shape, Ytr.shape

(torch.Size([182625, 3]), torch.Size([182625]))

In [None]:
Xdev.shape, Ydev.shape

(torch.Size([22655, 3]), torch.Size([22655]))

In [None]:
Xte.shape, Yte.shape

(torch.Size([22866, 3]), torch.Size([22866]))

## E01

Tune the hyperparameters of the training to beat the validation loss of 2.2

   - no of neurons in the hidden layer
    
   - embedding size
    
   - no of characters
    
   - epochs
    
   - learning rate; change/decay it over the epochs
    
   - batch size

In [None]:
def evaluate_loss(parameters, X, Y, block_size=3, embedding_size=10):
    """Cross-entropy loss of the MLP on the examples (X, Y).

    Args:
        parameters: sequence [C, W1, b1, W2, b2].
        X: integer tensor of context indices used to index rows of C.
        Y: integer tensor of target indices.
        block_size: context length; together with `embedding_size` it gives
            the width the embeddings are flattened to before the hidden layer.
        embedding_size: number of columns of C.

    Returns:
        Scalar loss tensor (still attached to the autograd graph).
    """
    C, W1, b1, W2, b2 = parameters
    emb = C[X]
    h = torch.tanh(emb.view(-1, block_size * embedding_size) @ W1 + b1)
    logits = h @ W2 + b2
    # Stay on whatever device the parameters live on instead of forcing CUDA,
    # so the function also works on CPU-only machines.
    return F.cross_entropy(logits, Y.to(logits.device))

In [None]:
def _regularization_loss(parameters, lambdas):
    """Weighted sum of the mean squared entries of three parameter tensors.

    The tensors at positions 0, 1 and 3 of `parameters` (named C, W1 and W2
    by the callers) are penalised with `lambdas[0]`, `lambdas[1]` and
    `lambdas[2]` respectively; the remaining entries are ignored.
    """
    embedding, hidden_weights, output_weights = parameters[0], parameters[1], parameters[3]

    penalty = lambdas[0] * embedding.pow(2).mean()
    penalty = penalty + lambdas[1] * hidden_weights.pow(2).mean()
    penalty = penalty + lambdas[2] * output_weights.pow(2).mean()
    return penalty

In [None]:
def train(X,
          Y,
          epochs,
          block_size=3,
          embedding_size=10,
          hidden_neuron=300,
          bs=32,
          lr=0.1,
          parameters=None,
          lambdas=(0, 0, 0),
          enable_print=True,
          print_at_every_nth_epoch=10000
         ):
    """Train the MLP with minibatch SGD.

    Note that one "epoch" here is a single minibatch update, not a full pass
    over the data.

    Args:
        X, Y: context and target tensors.
        epochs: number of minibatch updates to run.
        block_size, embedding_size, hidden_neuron: model dimensions.
        bs: minibatch size; indices are sampled with replacement using the
            global torch RNG.
        lr: constant learning rate.
        parameters: existing [C, W1, b1, W2, b2] to continue training; when
            empty or None, new ones are drawn from a standard normal using
            the seeded generator `g`.
        lambdas: regularisation weights forwarded to `_regularization_loss`.
        enable_print: print the loss every `print_at_every_nth_epoch` updates.
        print_at_every_nth_epoch: printing interval.

    Returns:
        (parameters, loss): the parameter list (updated in place) and the
        loss of the last minibatch as a float (NaN when `epochs` is 0).
    """
    if not parameters:
        # Prefer the GPU when there is one, but keep working on CPU-only machines.
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        C = torch.randn((27, embedding_size), generator=g).to(device)
        W1 = torch.randn((block_size * embedding_size, hidden_neuron), generator=g).to(device)
        b1 = torch.randn(hidden_neuron, generator=g).to(device)
        W2 = torch.randn((hidden_neuron, 27), generator=g).to(device)
        b2 = torch.randn(27, generator=g).to(device)
        parameters = [C, W1, b1, W2, b2]
    else:
        # Continue on whichever device the supplied parameters already live on.
        device = parameters[0].device

    for p in parameters:
        p.requires_grad = True

    loss = None
    for epoch in tqdm(range(epochs)):
        # Sample the batch on X's device so the indexing never mixes devices.
        ix = torch.randint(0, X.shape[0], (bs,)).to(X.device)

        loss = evaluate_loss(parameters, X[ix].to(device), Y[ix].to(device), block_size, embedding_size)
        loss = loss + _regularization_loss(parameters, lambdas)

        for p in parameters:
            p.grad = None
        loss.backward()

        # Plain SGD step; no_grad keeps the update out of the autograd graph.
        with torch.no_grad():
            for p in parameters:
                p -= lr * p.grad

        if enable_print and epoch % print_at_every_nth_epoch == 0: print(epoch, loss.item())

    return parameters, (loss.item() if loss is not None else float('nan'))

#### 1st try

In [None]:
parameters, loss = train(Xtr, Ytr, 100_000, block_size=3, embedding_size=50, hidden_neuron=100, bs=16384, lr=0.1, enable_print=True, print_at_every_nth_epoch=10_000)

  0%|          | 12/100000 [00:00<30:45, 54.19it/s]

0 19.391998291015625


 10%|█         | 10010/100000 [03:03<27:34, 54.40it/s]

10000 2.2867367267608643


 20%|██        | 20009/100000 [06:02<23:37, 56.43it/s]

20000 2.210310935974121


 27%|██▋       | 26950/100000 [08:09<22:06, 55.07it/s]


KeyboardInterrupt: 

In [None]:
loss, evaluate_loss(parameters, Xdev.cuda(), Ydev.cuda(), block_size=3, embedding_size=50)

(2.112903356552124,
 tensor(2.1565, device='cuda:0', grad_fn=<NllLossBackward0>))

#### 2nd try

In [None]:
parameters, loss = train(Xtr, Ytr, 300_000, block_size=3, embedding_size=50, hidden_neuron=100, bs=16384, lr=0.01, parameters=parameters, enable_print=False)

100%|██████████| 300000/300000 [1:28:47<00:00, 56.31it/s]


In [None]:
loss, evaluate_loss(parameters, Xdev.cuda(), Ydev.cuda(), block_size=3, embedding_size=50)

(2.1061928272247314,
 tensor(2.1500, device='cuda:0', grad_fn=<NllLossBackward0>))

#### 3rd try

In [None]:
parameters, loss = train(Xtr, Ytr, 10_000, block_size=3, embedding_size=50, hidden_neuron=100, bs=16384, lr=1, parameters=parameters, enable_print=True, print_at_every_nth_epoch=1000)

100%|██████████| 10000/10000 [02:57<00:00, 56.38it/s]


In [None]:
loss, evaluate_loss(parameters, Xdev.cuda(), Ydev.cuda(), block_size=3, embedding_size=50)

(2.125706911087036,
 tensor(2.2161, device='cuda:0', grad_fn=<NllLossBackward0>))

#### 4th try

In [None]:
parameters, loss = train(Xtr, Ytr, 10_000, block_size=3, embedding_size=50, hidden_neuron=100, bs=16384, lr=0.1, parameters=parameters, enable_print=True, print_at_every_nth_epoch=1000)

  0%|          | 12/10000 [00:00<03:03, 54.49it/s]

0 2.166624069213867


 10%|█         | 1008/10000 [00:17<02:38, 56.77it/s]

1000 2.0816526412963867


 20%|██        | 2010/10000 [00:35<02:20, 57.06it/s]

2000 2.0698986053466797


 30%|███       | 3012/10000 [00:53<02:03, 56.78it/s]

3000 2.085846424102783


 40%|████      | 4008/10000 [01:10<01:45, 57.01it/s]

4000 2.0758490562438965


 50%|█████     | 5010/10000 [01:28<01:33, 53.26it/s]

5000 2.085636615753174


 60%|██████    | 6012/10000 [01:46<01:10, 56.89it/s]

6000 2.076601505279541


 70%|███████   | 7008/10000 [02:04<00:52, 56.62it/s]

7000 2.0770840644836426


 80%|████████  | 8010/10000 [02:21<00:35, 56.16it/s]

8000 2.0789082050323486


 90%|█████████ | 9012/10000 [02:39<00:17, 56.72it/s]

9000 2.0974490642547607


100%|██████████| 10000/10000 [02:57<00:00, 56.49it/s]


In [None]:
loss, evaluate_loss(parameters, Xdev.cuda(), Ydev.cuda(), block_size=3, embedding_size=50)

(2.085941791534424,
 tensor(2.1405, device='cuda:0', grad_fn=<NllLossBackward0>))

#### 5th try

In [None]:
parameters, loss = train(Xtr, Ytr, 100_000, block_size=3, embedding_size=50, hidden_neuron=100, bs=16384, lr=0.01, parameters=parameters, enable_print=True, print_at_every_nth_epoch=10_000)

  0%|          | 7/100000 [00:00<27:16, 61.12it/s]

0 2.070305109024048


 10%|█         | 10011/100000 [02:57<27:57, 53.65it/s]

10000 2.0962393283843994


 20%|██        | 20012/100000 [05:54<23:27, 56.84it/s]

20000 2.0833041667938232


 30%|███       | 30008/100000 [08:50<20:31, 56.83it/s]

30000 2.074430465698242


 40%|████      | 40008/100000 [11:47<17:32, 56.99it/s]

40000 2.087279796600342


 50%|█████     | 50010/100000 [14:45<14:36, 57.01it/s]

50000 2.0869252681732178


 60%|██████    | 60008/100000 [17:42<11:40, 57.07it/s]

60000 2.0887160301208496


 70%|███████   | 70010/100000 [20:39<08:49, 56.67it/s]

70000 2.097712755203247


 80%|████████  | 80010/100000 [23:36<05:50, 57.03it/s]

80000 2.0827200412750244


 90%|█████████ | 90008/100000 [26:33<02:55, 57.04it/s]

90000 2.0914275646209717


100%|██████████| 100000/100000 [29:30<00:00, 56.50it/s]


In [None]:
loss, evaluate_loss(parameters, Xdev.cuda(), Ydev.cuda(), block_size=3, embedding_size=50)

(2.063863515853882,
 tensor(2.1395, device='cuda:0', grad_fn=<NllLossBackward0>))

### Test Loss

In [None]:
loss, evaluate_loss(parameters, Xte.cuda(), Yte.cuda(), block_size=3, embedding_size=50)

(2.063863515853882,
 tensor(2.1439, device='cuda:0', grad_fn=<NllLossBackward0>))

## E02
- Weight Initialization

(1) What is the loss you'd get if the predicted probabilities at initialization were perfectly uniform? What loss do we achieve? 

(2) Can you tune the initialization to get a starting loss that is much more similar to (1)?

Answer to (1)

If the predicted probabilities were perfectly uniform, each of the 27 characters would be assigned a probability of `1/27`.

Taking the log of that probability gives:

In [None]:
torch.tensor(1/27).log()

tensor(-3.2958)

Negating it gives the loss (negative log-likelihood) of a single prediction:

In [None]:
- torch.tensor(1/27).log()

tensor(3.2958)

Now we sum up the losses and divide by the count, `(n * (3.2958))/n`,
which is equal to `3.2958`.

Let's see the initial loss when we train the model with the current initialization.

In [None]:
parameters, loss = train(Xtr, Ytr, 10, block_size=3, embedding_size=50, hidden_neuron=100, bs=16384, lr=0.1, enable_print=True, print_at_every_nth_epoch=1)

100%|██████████| 10/10 [00:01<00:00,  8.59it/s]

0 18.98647117614746
1 18.139089584350586
2 17.4639949798584
3 16.95638084411621
4 16.41069984436035
5 16.088415145874023
6 15.858123779296875
7 15.584877967834473
8 15.392867088317871
9 14.91295051574707





The initial loss is `18.98`, which is high compared to `3.2958`.

Lets see the probabilities of the output

In [None]:
parameters, loss = train(Xtr, Ytr, 1, block_size=3, embedding_size=50, hidden_neuron=100, bs=16384, lr=0.1, enable_print=True, print_at_every_nth_epoch=1)

100%|██████████| 1/1 [00:00<00:00, 45.83it/s]

0 17.784887313842773





In [None]:
def compute_probs(parameters, X, block_size=3, embedding_size=50):
    """Run the MLP forward pass and return the softmax over the output logits.

    `parameters` is unpacked as [C, W1, b1, W2, b2]; the rows of C selected by
    X are flattened to width `block_size * embedding_size`, passed through the
    tanh hidden layer and the output layer, and normalised along dim 1.
    """
    embedding_table, hidden_w, hidden_b, out_w, out_b = parameters
    flat_width = block_size * embedding_size
    flat_emb = embedding_table[X].view(-1, flat_width)
    hidden = (flat_emb @ hidden_w + hidden_b).tanh()
    logits = hidden @ out_w + out_b
    return logits.softmax(dim=1)

In [None]:
compute_probs(parameters, Xtr)

tensor([[1.0857e-04, 3.6501e-02, 2.1932e-06,  ..., 2.3620e-11, 3.5701e-07,
         1.1134e-09],
        [4.6675e-07, 3.9523e-03, 2.2363e-13,  ..., 9.9556e-17, 1.7200e-13,
         1.9502e-14],
        [1.1533e-03, 3.5033e-05, 1.8715e-15,  ..., 1.2193e-10, 1.2248e-08,
         6.8308e-16],
        ...,
        [5.5263e-07, 4.1990e-03, 2.8210e-11,  ..., 1.3669e-13, 7.7488e-08,
         4.2720e-17],
        [8.9470e-06, 6.5655e-03, 6.6855e-04,  ..., 5.5212e-08, 6.7759e-13,
         1.9382e-04],
        [3.5907e-07, 2.2142e-07, 4.3963e-06,  ..., 1.0096e-06, 9.9789e-01,
         1.2103e-13]], device='cuda:0', grad_fn=<SoftmaxBackward0>)

Lets view a single row of probabilities

In [None]:
compute_probs(parameters, Xtr)[0]

tensor([1.0857e-04, 3.6501e-02, 2.1932e-06, 1.3373e-08, 1.1020e-10, 5.3793e-03,
        3.0107e-08, 2.6294e-14, 9.6476e-12, 1.1091e-11, 1.0275e-08, 1.0968e-10,
        3.7358e-05, 8.2368e-08, 3.6925e-07, 1.4491e-08, 2.4197e-08, 1.3568e-12,
        4.5940e-06, 1.9727e-07, 8.8267e-05, 1.2796e-02, 1.9823e-04, 9.4488e-01,
        2.3620e-11, 3.5701e-07, 1.1134e-09], device='cuda:0',
       grad_fn=<SelectBackward0>)

To get uniform probabilities, I think all the logits need to be equal, so that each character gets a probability of `1/27`.

#### Try 1

Let's try uniform weight initialization.

In [None]:
def train_v2(X,
             Y,
             epochs,
             block_size=3,
             embedding_size=10,
             hidden_neuron=300,
             bs=32,
             lr=0.1,
             parameters=None,
             lambdas=(0, 0, 0),
             enable_print=True,
             print_at_every_nth_epoch=10000
            ):
    """Variant of `train` whose fresh parameters are drawn uniformly from [0, 1).

    Only the initialisation differs from `train`; the optimisation loop itself
    is delegated to `train` instead of being duplicated here.

    Args:
        parameters: existing [C, W1, b1, W2, b2] to continue training; when
            empty or None, new ones are drawn with `torch.rand` from the
            seeded generator `g`.
        All other arguments are forwarded to `train` unchanged.

    Returns:
        Whatever `train` returns: (parameters, last minibatch loss).
    """
    if not parameters:
        # Prefer the GPU when there is one, but keep working on CPU-only machines.
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        C = torch.rand((27, embedding_size), generator=g).to(device)
        W1 = torch.rand((block_size * embedding_size, hidden_neuron), generator=g).to(device)
        b1 = torch.rand(hidden_neuron, generator=g).to(device)
        W2 = torch.rand((hidden_neuron, 27), generator=g).to(device)
        b2 = torch.rand(27, generator=g).to(device)
        parameters = [C, W1, b1, W2, b2]

    return train(X, Y, epochs,
                 block_size=block_size,
                 embedding_size=embedding_size,
                 hidden_neuron=hidden_neuron,
                 bs=bs,
                 lr=lr,
                 parameters=parameters,
                 lambdas=lambdas,
                 enable_print=enable_print,
                 print_at_every_nth_epoch=print_at_every_nth_epoch)

In [None]:
parameters, loss = train_v2(Xtr, Ytr, 1, block_size=3, embedding_size=50, hidden_neuron=100, bs=16384, lr=0.1, enable_print=True, print_at_every_nth_epoch=1)

100%|██████████| 1/1 [00:00<00:00, 46.14it/s]

0 6.422854900360107





With uniform weight initialization, the initial loss (`6.422`) is lower than with normal weight initialization (`17.7`).

#### Try 2

Let's initialize the last layer's weights and biases to zero.

In [None]:
def train_v3(X,
             Y,
             epochs,
             block_size=3,
             embedding_size=10,
             hidden_neuron=300,
             bs=32,
             lr=0.1,
             parameters=None,
             lambdas=(0, 0, 0),
             enable_print=True,
             print_at_every_nth_epoch=10000
            ):
    """Variant of `train` with uniform C/W1/b1 and an all-zero output layer.

    With W2 and b2 at zero every logit is zero at the start, so the initial
    predictions are uniform over the 27 classes. Only the initialisation
    differs from `train`; the optimisation loop is delegated to `train`.

    Args:
        parameters: existing [C, W1, b1, W2, b2] to continue training; when
            empty or None, new ones are created. All random draws use the
            seeded generator `g` so the initialisation is reproducible.
        All other arguments are forwarded to `train` unchanged.

    Returns:
        Whatever `train` returns: (parameters, last minibatch loss).
    """
    if not parameters:
        # Prefer the GPU when there is one, but keep working on CPU-only machines.
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        C = torch.rand((27, embedding_size), generator=g).to(device)
        W1 = torch.rand((block_size * embedding_size, hidden_neuron), generator=g).to(device)
        b1 = torch.rand(hidden_neuron, generator=g).to(device)
        W2 = torch.zeros((hidden_neuron, 27), device=device)
        b2 = torch.zeros(27, device=device)
        parameters = [C, W1, b1, W2, b2]

    return train(X, Y, epochs,
                 block_size=block_size,
                 embedding_size=embedding_size,
                 hidden_neuron=hidden_neuron,
                 bs=bs,
                 lr=lr,
                 parameters=parameters,
                 lambdas=lambdas,
                 enable_print=enable_print,
                 print_at_every_nth_epoch=print_at_every_nth_epoch)

In [None]:
parameters, loss = train_v3(Xtr, Ytr, 1, block_size=3, embedding_size=50, hidden_neuron=100, bs=16384, lr=0.1, enable_print=True, print_at_every_nth_epoch=1)

100%|██████████| 1/1 [00:00<00:00, 46.01it/s]

0 3.295814037322998





The initial loss is now `3.2958` (which we wanted). 

Lets see how well it trains now

In [None]:
parameters, loss = train_v3(Xtr, Ytr, 30_000, block_size=3, embedding_size=50, hidden_neuron=100, bs=16384, lr=0.1, enable_print=True, print_at_every_nth_epoch=10_000)

  0%|          | 12/30000 [00:00<09:14, 54.09it/s]

0 3.295814037322998


 33%|███▎      | 10008/30000 [03:03<06:04, 54.85it/s]

10000 2.8334920406341553


 67%|██████▋   | 20009/30000 [06:08<03:04, 54.04it/s]

20000 2.827484369277954


100%|██████████| 30000/30000 [09:11<00:00, 54.35it/s]


In [None]:
loss

2.8184070587158203

#### Try 3

As we can see, the loss is not decreasing quickly. Let's initialize the last-layer weights close to zero instead of exactly zero and see ...

In [None]:
def train_v4(X,
             Y,
             epochs,
             block_size=3,
             embedding_size=10,
             hidden_neuron=300,
             bs=32,
             lr=0.1,
             parameters=None,
             lambdas=(0, 0, 0),
             enable_print=True,
             print_at_every_nth_epoch=10000
            ):
    """Variant of `train` with uniform C/W1/b1 and a near-zero output layer.

    W2 is drawn uniformly from [0, 1) and scaled by 0.01, and b2 is zero, so
    the initial logits are small. Only the initialisation differs from
    `train`; the optimisation loop is delegated to `train`.

    Args:
        parameters: existing [C, W1, b1, W2, b2] to continue training; when
            empty or None, new ones are created. All random draws use the
            seeded generator `g` so the initialisation is reproducible.
        All other arguments are forwarded to `train` unchanged.

    Returns:
        Whatever `train` returns: (parameters, last minibatch loss).
    """
    if not parameters:
        # Prefer the GPU when there is one, but keep working on CPU-only machines.
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        C = torch.rand((27, embedding_size), generator=g).to(device)
        W1 = torch.rand((block_size * embedding_size, hidden_neuron), generator=g).to(device)
        b1 = torch.rand(hidden_neuron, generator=g).to(device)
        W2 = (torch.rand((hidden_neuron, 27), generator=g) * 0.01).to(device)  # close to zero
        b2 = torch.zeros(27, device=device)
        parameters = [C, W1, b1, W2, b2]

    return train(X, Y, epochs,
                 block_size=block_size,
                 embedding_size=embedding_size,
                 hidden_neuron=hidden_neuron,
                 bs=bs,
                 lr=lr,
                 parameters=parameters,
                 lambdas=lambdas,
                 enable_print=enable_print,
                 print_at_every_nth_epoch=print_at_every_nth_epoch)

In [None]:
parameters, loss = train_v4(Xtr, Ytr, 30_000, block_size=3, embedding_size=50, hidden_neuron=100, bs=16384, lr=0.1, enable_print=True, print_at_every_nth_epoch=10_000)

  0%|          | 6/30000 [00:00<08:38, 57.86it/s]

0 3.2842519283294678


 33%|███▎      | 10008/30000 [03:01<05:57, 55.96it/s]

10000 2.810333013534546


 67%|██████▋   | 20010/30000 [06:01<03:02, 54.85it/s]

20000 2.8224377632141113


100%|██████████| 30000/30000 [09:05<00:00, 55.03it/s]


#### Try 4

Let's not use uniform initialization for all the weights: only the last layer is initialized uniformly (close to zero), and the rest keep the normal initialization.

In [None]:
def train_v5(X,
             Y,
             epochs,
             block_size=3,
             embedding_size=10,
             hidden_neuron=300,
             bs=32,
             lr=0.1,
             parameters=None,
             lambdas=(0, 0, 0),
             enable_print=True,
             print_at_every_nth_epoch=10000
            ):
    """Variant of `train` with normal C/W1/b1 and a near-zero output layer.

    C, W1 and b1 are drawn from a standard normal; W2 is drawn uniformly from
    [0, 1) and scaled by 0.01, and b2 is zero, so the initial logits are
    small. Only the initialisation differs from `train`; the optimisation
    loop is delegated to `train`.

    Args:
        parameters: existing [C, W1, b1, W2, b2] to continue training; when
            empty or None, new ones are created. All random draws use the
            seeded generator `g` so the initialisation is reproducible.
        All other arguments are forwarded to `train` unchanged.

    Returns:
        Whatever `train` returns: (parameters, last minibatch loss).
    """
    if not parameters:
        # Prefer the GPU when there is one, but keep working on CPU-only machines.
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        C = torch.randn((27, embedding_size), generator=g).to(device)
        W1 = torch.randn((block_size * embedding_size, hidden_neuron), generator=g).to(device)
        b1 = torch.randn(hidden_neuron, generator=g).to(device)
        W2 = (torch.rand((hidden_neuron, 27), generator=g) * 0.01).to(device)  # close to zero
        b2 = torch.zeros(27, device=device)
        parameters = [C, W1, b1, W2, b2]

    return train(X, Y, epochs,
                 block_size=block_size,
                 embedding_size=embedding_size,
                 hidden_neuron=hidden_neuron,
                 bs=bs,
                 lr=lr,
                 parameters=parameters,
                 lambdas=lambdas,
                 enable_print=enable_print,
                 print_at_every_nth_epoch=print_at_every_nth_epoch)

In [None]:
parameters, loss = train_v5(Xtr, Ytr, 30_000, block_size=3, embedding_size=50, hidden_neuron=100, bs=16384, lr=0.1, enable_print=True, print_at_every_nth_epoch=10_000)

  0%|          | 7/30000 [00:01<58:14,  8.58it/s]  

0 3.2948546409606934


 33%|███▎      | 10008/30000 [02:54<05:42, 58.38it/s]

10000 2.190589427947998


 67%|██████▋   | 20010/30000 [05:48<02:53, 57.62it/s]

20000 2.175851821899414


100%|██████████| 30000/30000 [08:42<00:00, 57.39it/s]


The loss is decreasing now. Let's train for 100_000 steps and check.

In [None]:
parameters, loss = train_v5(Xtr, Ytr, 100_000, block_size=3, embedding_size=50, hidden_neuron=100, bs=16384, lr=0.1, enable_print=True, print_at_every_nth_epoch=10_000)

  0%|          | 12/100000 [00:00<29:50, 55.84it/s]

0 3.297055721282959


 10%|█         | 10009/100000 [02:58<26:48, 55.94it/s]

10000 2.2041165828704834


 20%|██        | 20008/100000 [05:55<23:44, 56.16it/s]

20000 2.1607794761657715


 30%|███       | 30010/100000 [08:49<20:05, 58.04it/s]

30000 2.153400182723999


 40%|████      | 40012/100000 [11:42<17:13, 58.07it/s]

40000 2.128110408782959


 50%|█████     | 50008/100000 [14:34<14:20, 58.13it/s]

50000 2.111949920654297


 60%|██████    | 60008/100000 [17:27<11:28, 58.10it/s]

60000 2.116779327392578


 70%|███████   | 70009/100000 [20:19<08:36, 58.10it/s]

70000 2.094712972640991


 80%|████████  | 80011/100000 [23:12<05:43, 58.23it/s]

80000 2.095874309539795


 90%|█████████ | 90008/100000 [26:04<02:56, 56.70it/s]

90000 2.104464530944824


100%|██████████| 100000/100000 [28:59<00:00, 57.47it/s]


In [None]:
loss

2.0862724781036377

The losses are getting reduced faster!

In [None]:
evaluate_loss(parameters, Xdev.cuda(), Ydev.cuda(), block_size=3, embedding_size=50)

tensor(2.1515, device='cuda:0', grad_fn=<NllLossBackward0>)

In [None]:
evaluate_loss(parameters, Xte.cuda(), Yte.cuda(), block_size=3, embedding_size=50)

tensor(2.1520, device='cuda:0', grad_fn=<NllLossBackward0>)

## E03

Read the Bengio et al 2003 paper, implement and try any idea from the paper. Did it work?

In the paper there is a mention of direct connection from the word features to output. 

Lets implement the direct connection from embedding to output and see the results

### Direct connection from embedding to output

In [None]:
C = torch.randn((27, 50), generator=g).cuda()

In [None]:
C[X].shape; C[X].view(-1, 150).shape

torch.Size([228146, 150])

In [None]:
def evaluate_loss_dir_conn(parameters, X, Y, block_size=3, embedding_size=10):
    """Cross-entropy loss of the MLP with a direct embedding-to-output connection.

    The logits are the sum of the usual hidden-layer path (`h @ W2 + b2`) and
    a linear map of the flattened embeddings (`flat_emb @ W3`).

    Args:
        parameters: sequence [C, W1, b1, W2, W3, b2].
        X: integer tensor of context indices used to index rows of C.
        Y: integer tensor of target indices.
        block_size: context length; together with `embedding_size` it gives
            the width the embeddings are flattened to.
        embedding_size: number of columns of C.

    Returns:
        Scalar loss tensor (still attached to the autograd graph).
    """
    C, W1, b1, W2, W3, b2 = parameters
    # Gather and flatten the embeddings once; both paths share them.
    flat_emb = C[X].view(-1, block_size * embedding_size)
    h = torch.tanh(flat_emb @ W1 + b1)
    logits = h @ W2 + b2 + flat_emb @ W3
    # Stay on whatever device the parameters live on instead of forcing CUDA,
    # so the function also works on CPU-only machines.
    return F.cross_entropy(logits, Y.to(logits.device))

In [None]:
def train_dir_conn(X,
                   Y,
                   epochs,
                   block_size=3,
                   embedding_size=10,
                   hidden_neuron=300,
                   bs=32,
                   lr=0.1,
                   parameters=None,
                   lambdas=(0, 0, 0),
                   enable_print=True,
                   print_at_every_nth_epoch=10000
                  ):
    """Train the MLP with a direct embedding-to-output connection (W3) using SGD.

    Note that one "epoch" here is a single minibatch update, not a full pass
    over the data.

    Args:
        X, Y: context and target tensors.
        epochs: number of minibatch updates to run.
        block_size, embedding_size, hidden_neuron: model dimensions.
        bs: minibatch size; indices are sampled with replacement using the
            global torch RNG.
        lr: constant learning rate.
        parameters: existing [C, W1, b1, W2, W3, b2] to continue training;
            when empty or None, new ones are created: C, W1 and b1 from a
            standard normal, W2 and W3 uniform in [0, 1) scaled by 0.01, and
            b2 zero. All random draws use the seeded generator `g`.
        lambdas: regularisation weights forwarded to `_regularization_loss`,
            which reads positions 0, 1 and 3 of `parameters` (C, W1, W2);
            W3 is therefore not regularised.
        enable_print: print the loss every `print_at_every_nth_epoch` updates.
        print_at_every_nth_epoch: printing interval.

    Returns:
        (parameters, loss): the parameter list (updated in place) and the
        loss of the last minibatch as a float (NaN when `epochs` is 0).
    """
    if not parameters:
        # Prefer the GPU when there is one, but keep working on CPU-only machines.
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        C = torch.randn((27, embedding_size), generator=g).to(device)
        W1 = torch.randn((block_size * embedding_size, hidden_neuron), generator=g).to(device)
        b1 = torch.randn(hidden_neuron, generator=g).to(device)
        W2 = (torch.rand((hidden_neuron, 27), generator=g) * 0.01).to(device)  # close to zero
        W3 = (torch.rand((block_size * embedding_size, 27), generator=g) * 0.01).to(device)  # close to zero
        b2 = torch.zeros(27, device=device)
        parameters = [C, W1, b1, W2, W3, b2]
    else:
        # Continue on whichever device the supplied parameters already live on.
        device = parameters[0].device

    for p in parameters:
        p.requires_grad = True

    loss = None
    for epoch in tqdm(range(epochs)):
        # Sample the batch on X's device so the indexing never mixes devices.
        ix = torch.randint(0, X.shape[0], (bs,)).to(X.device)

        loss = evaluate_loss_dir_conn(parameters, X[ix].to(device), Y[ix].to(device), block_size, embedding_size)
        loss = loss + _regularization_loss(parameters, lambdas)

        for p in parameters:
            p.grad = None
        loss.backward()

        # Plain SGD step; no_grad keeps the update out of the autograd graph.
        with torch.no_grad():
            for p in parameters:
                p -= lr * p.grad

        if enable_print and epoch % print_at_every_nth_epoch == 0: print(epoch, loss.item())

    return parameters, (loss.item() if loss is not None else float('nan'))

In [None]:
parameters, loss = train_dir_conn(Xtr, Ytr, 100_000, block_size=3, embedding_size=50, hidden_neuron=100, bs=16384, lr=0.1, enable_print=True, print_at_every_nth_epoch=10_000)

  0%|          | 4/100000 [00:00<51:53, 32.11it/s]

0 3.295320987701416


 10%|█         | 10006/100000 [05:20<47:52, 31.33it/s] 

10000 2.150477647781372


 20%|██        | 20005/100000 [10:40<42:17, 31.53it/s]

20000 2.124218702316284


 30%|███       | 30006/100000 [16:01<37:17, 31.29it/s]

30000 2.0882301330566406


 40%|████      | 40006/100000 [21:21<31:49, 31.41it/s]

40000 2.1032118797302246


 50%|█████     | 50007/100000 [26:41<27:06, 30.74it/s]

50000 2.0811822414398193


 60%|██████    | 60006/100000 [32:01<21:20, 31.23it/s]

60000 2.0820119380950928


 70%|███████   | 70007/100000 [37:22<16:01, 31.18it/s]

70000 2.064924478530884


 80%|████████  | 80005/100000 [42:42<10:33, 31.57it/s]

80000 2.0588784217834473


 90%|█████████ | 90005/100000 [48:02<05:15, 31.69it/s]

90000 2.078995704650879


100%|██████████| 100000/100000 [53:26<00:00, 31.19it/s]


In [None]:
loss

2.0455985069274902

In [None]:
evaluate_loss_dir_conn(parameters, Xdev.cuda(), Ydev.cuda(), block_size=3, embedding_size=50)

tensor(2.1238, device='cuda:0', grad_fn=<NllLossBackward0>)

In [None]:
evaluate_loss_dir_conn(parameters, Xte.cuda(), Yte.cuda(), block_size=3, embedding_size=50)

tensor(2.1266, device='cuda:0', grad_fn=<NllLossBackward0>)

The loss decreased by a lot with this direct connection.

### Mixing NN output with the trigram model output
> The trigram model is the statistical model I implemented in Lesson 2's exercise