In [None]:
# Open questions (TODO): 1. What is the block size? 2. How is the input produced? 3. What is the vocab size?

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F

## Input 
(B, T) -> (B, T, C), where B = batch, T = time, C = channels.

## Dropout 
+ **During training**, randomly zeroes some of the elements of the input tensor with probability p.
+ The outputs are scaled by a factor of $\frac{1}{1-p}$ during training. This means that during evaluation the module simply computes an identity function.

In [4]:
# Dropout demo: a freshly constructed module is in training mode, so each
# element is zeroed with probability 0.2 and the survivors are scaled by
# 1 / (1 - 0.2) = 1.25.
m = nn.Dropout(0.2)
input = torch.randn((20, 16))
m(input)

tensor([[-1.2737e+00, -1.5830e+00, -5.7301e-02,  6.6911e-01,  2.0291e+00,
          7.4391e-01, -7.2854e-01,  0.0000e+00, -3.7143e-01, -9.9263e-02,
          0.0000e+00,  1.8499e+00, -2.0506e+00, -1.4774e+00,  1.2005e+00,
         -5.3075e-01],
        [-1.5099e+00, -4.1922e-01,  1.1408e+00,  4.6337e-01, -0.0000e+00,
         -5.4516e-01,  0.0000e+00,  1.2371e+00,  0.0000e+00,  0.0000e+00,
         -2.4196e+00, -3.4048e+00,  5.7232e-01,  4.9851e-01,  0.0000e+00,
          7.1118e-01],
        [-1.7112e+00,  2.9942e-02, -1.3743e+00,  5.6715e-01,  2.7618e-01,
          2.0464e+00, -5.5915e-01,  7.9842e-01, -4.4031e-01,  2.0749e-01,
         -0.0000e+00,  1.7583e+00,  1.9439e+00, -1.8261e+00, -5.8562e-01,
         -0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  7.8681e-01,  2.7161e+00, -6.0087e-01,
          2.3701e+00,  1.9097e-02,  1.7065e+00,  4.5573e-01,  8.6003e-01,
          3.0200e-01,  5.0502e-01,  9.8682e-01, -9.2199e-01, -3.1990e-01,
         -1.2229e-01],
        [-1.9405e+00

## Self Attention
+ Self-attention: the queries, keys, and values (q, k, v) are all computed from the same source, x.
+ **Attention is a communication mechanism**. This can be seen as nodes in a directed graph looking at each other and aggregating information with a weighted sum from all nodes that point to them, with data-dependent weights. However, with the causal (lower-triangular) mask, **node 1 can't access nodes 2 and 3**: each node only receives information from itself and from earlier nodes.
+ "Scaled" attention: $\frac{sim}{\sqrt{d_k}}$, where $d_k$ is the head size. This makes it so that when the inputs Q, K are unit variance, wei will be **unit variance too, and Softmax will stay diffuse and not saturate too much**.

![](assets/comm.png)

In [1]:
# Self Attention (single head, causal)
# NOTE: run the imports cell (torch, nn, F) before this one; on a fresh kernel
# this cell raises NameError otherwise.
torch.manual_seed(1337)
B,T,C = 4,8,32 # batch, time, channels
head_size = 16
x = torch.randn(B,T,C)
query = nn.Linear(C, head_size, bias=False)
key = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)

# Each role gets its own projection of the same source x (self-attention).
q = query(x)  # B, T, h
k = key(x)    # B, T, h
v = value(x)  # B, T, h

# Scaled dot-product similarity: dividing by sqrt(head_size) keeps the logits
# near unit variance so the softmax below does not saturate.
sim = torch.matmul(q, k.transpose(-1, -2)) * head_size ** -0.5  # B, T, T

# Causal mask: position t may only attend to positions <= t. The mask is applied
# to the raw logits (before the single softmax) so masked positions get exactly
# zero weight and do not contribute to the normalisation.
tril = torch.tril(torch.ones(T, T))
wei = sim.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)  # B, T, T; each row sums to 1

# Weighted aggregation of the values.
out = torch.matmul(wei, v)  # B, T, h

NameError: name 'torch' is not defined

## Difference between LayerNorm and BatchNorm
![](assets/norm.png)

## Transformer Framework
+ word embedding + position embedding
+ attention block: layer norm + attention + MLP
+ Layer Norm
+ classifier

![](assets/transformer.png)