In [2]:
import os

import torch
import torch.nn as nn
import torch.nn.functional as F

In [56]:
# 4x2 tensor filled with floating-point ones
torch.ones(4,2)

tensor([[1., 1.],
        [1., 1.],
        [1., 1.],
        [1., 1.]])

In [5]:
# Modulo sanity check: 100 divides itself exactly, so the remainder is 0
print(100 % 100)

0


In [5]:
# Build a (B, block_size) grid of indices: the single arange row is
# expanded (as a view, not a copy) so that every one of the B rows shares it.
block_size = 8
B = 3
t = torch.arange(block_size)[None, :].expand(B, block_size)
print(t)

tensor([[0, 1, 2, 3, 4, 5, 6, 7],
        [0, 1, 2, 3, 4, 5, 6, 7],
        [0, 1, 2, 3, 4, 5, 6, 7]])


In [11]:
# Broadcasting check: a 1-D tensor added to a 2-D tensor is applied to every row.
y = torch.arange(block_size)     # shape [block_size]
x = torch.ones((5, block_size))  # shape [5, block_size]
# y is broadcast along dim 0, so the sum has shape [5, block_size]
print(y, x, x + y, sep="\n")

tensor([0, 1, 2, 3, 4, 5, 6, 7])
tensor([[1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1.]])
tensor([[1., 2., 3., 4., 5., 6., 7., 8.],
        [1., 2., 3., 4., 5., 6., 7., 8.],
        [1., 2., 3., 4., 5., 6., 7., 8.],
        [1., 2., 3., 4., 5., 6., 7., 8.],
        [1., 2., 3., 4., 5., 6., 7., 8.]])


### Possible future work

- use a better encoder such as SentencePiece or tiktoken

### Optimization

- batch gradient descent
- mini-batch gradient descent
- stochastic gradient descent (batch size = 1)
- mini-batch gradient descent with momentum
- RMSprop
- Adam combines momentum with RMSprop

### Self-attention

In [23]:
import torch
import torch.nn.functional as F
import torch.nn

In [39]:
# Seeded toy input of shape (B, T, C) with values drawn uniformly from [0, 1).
torch.manual_seed(1337)
B, T, C = (4, 8, 2)
x = torch.rand((B, T, C))
x.size()

torch.Size([4, 8, 2])

In [None]:
# method 1: explicit loops -- xbow[b, t] is the plain mean of the channel
# vectors of tokens 0..t in sequence b (the current token and all earlier ones)
xbow = torch.zeros((B, T, C))
for b in range(B):
    for t in range(T):
        xprev = x[b, : t + 1]              # (t + 1, C): everything up to and including t
        xbow[b, t] = xprev.mean(dim=0)     # average over the time axis -> (C,)

In [40]:
# method 2: the same running average expressed as one matrix multiplication
wei = torch.ones(T, T).tril()             # lower-triangular ones: row t selects tokens 0..t
wei = wei / wei.sum(dim=1, keepdim=True)  # normalise each row so it sums to 1
# wei is (T, T) and x is (B, T, C): wei is broadcast to (B, T, T),
# so every batch element computes (T, T) @ (T, C) -> (T, C)
xbow2 = torch.matmul(wei, x)
torch.allclose(xbow2, xbow)

True

In [55]:
# method 3: the same averaging weights, produced by a masked softmax
tril = torch.ones(T, T).tril()
wei = torch.zeros(T, T)
# wherever tril is zero, set the score to -inf so softmax assigns it weight 0
wei = wei.masked_fill(tril == 0, float("-inf"))
# the remaining (equal) scores in each row turn into a uniform average
wei = F.softmax(wei, dim=-1)
xbow3 = torch.matmul(wei, x)
torch.allclose(xbow2, xbow3)

True

In [37]:
# Toy example of method 2: a row-normalised lower-triangular matrix `a`
# turns `a @ b` into running averages of the rows of `b`.
torch.manual_seed(42)
a = torch.ones(3, 3).tril()
a = a / a.sum(dim=1, keepdim=True)          # each row sums to 1
b = torch.randint(0, 10, (3, 2)).float()
c = torch.matmul(a, b)                      # row i of c = mean of rows 0..i of b
print(a, b, c, sep="\n")

tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])


In [44]:
# version 4: self-attention (prelim)
# Same masked-softmax averaging as method 3, now on a (B, T, C) input with C = 32.
# All scores start at zero, so every row of wei is a uniform average over positions <= t.
torch.manual_seed(1337)
B, T, C = 4, 8, 32
x = torch.randn(B, T, C)

tril = torch.tril(torch.ones(T, T))
wei = torch.zeros((T, T))                        # (T, T) identical raw scores
wei = wei.masked_fill(tril == 0, float('-inf'))  # (T, T) positions above the diagonal get -inf
wei = F.softmax(wei, dim=-1)                     # (T, T) rows sum to 1, masked entries become 0
out = wei @ x                                    # (T, T) @ (B, T, C) -> (B, T, C)

# A bare `out.shape` in the middle of a cell displays nothing; check the shape explicitly.
assert out.shape == (B, T, C)
print(wei)

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])


In [57]:
# A bare starred expression (`*a` on its own line) is rejected by the compiler.
# Compiling the snippet from a string lets the cell show the SyntaxError
# without the cell itself failing to parse and stopping "Run All".
a = [1, 2, 3]
try:
    compile("*a", "<starred-demo>", "exec")
except SyntaxError as err:
    print(f"{type(err).__name__}: {err.msg}")

SyntaxError: can't use starred expression here (3688491267.py, line 2)

In [52]:
# version 4: self-attention (with weights)
# A single attention head: the averaging weights are no longer uniform but are
# computed from the data as scaled query-key dot products.
torch.manual_seed(1337)
B, T, C = 4, 8, 32
x = torch.randn(B, T, C)

head_size = 16
# Bias-free linear projections mapping each token from C to head_size features.
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)
k = key(x)    # (B, T, head_size)
q = query(x)  # (B, T, head_size)
v = value(x)  # (B, T, head_size)

# Scaled dot-product scores: (B, T, head_size) @ (B, head_size, T) -> (B, T, T)
wei = q @ k.transpose(-2, -1) * head_size ** -0.5

# Causal mask: -inf above the diagonal, so softmax gives those positions weight 0.
tril = torch.tril(torch.ones(T, T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)  # every row sums to 1
out = wei @ v                 # (B, T, T) @ (B, T, head_size) -> (B, T, head_size)

print(wei[0])

out.shape

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3966, 0.6034, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3069, 0.2892, 0.4039, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3233, 0.2175, 0.2443, 0.2149, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1479, 0.2034, 0.1663, 0.1455, 0.3369, 0.0000, 0.0000, 0.0000],
        [0.1259, 0.2490, 0.1324, 0.1062, 0.3141, 0.0724, 0.0000, 0.0000],
        [0.1598, 0.1990, 0.1140, 0.1125, 0.1418, 0.1669, 0.1061, 0.0000],
        [0.0845, 0.1197, 0.1078, 0.1537, 0.1086, 0.1146, 0.1558, 0.1553]],
       grad_fn=<SelectBackward0>)


torch.Size([4, 8, 16])

In [54]:
# tril is already (T, T), so slicing to [:T, :T] returns the whole lower-triangular mask
tril[:T, :T]

tensor([[1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1.]])

### Tensors

In [1]:
import torch

Initializing a tensor:

In [8]:
# Three factory functions that take the desired shape. Each result gets its
# own name so the earlier tensors are not silently overwritten.
x_empty = torch.empty(3, 4)  # allocated but uninitialised: contents are arbitrary
x_ones = torch.ones(3, 4)    # every element is 1
x = torch.zeros(3, 4)        # every element is 0
# Print the type and dtype as well, matching the output recorded for this cell.
print(type(x))
print(x.dtype)
print(x)

<class 'torch.Tensor'>
torch.float32
tensor([[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]])


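There is also a family of `torch.*_like()` functions (for example `torch.ones_like`) that create a new tensor with the same shape as an existing one: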

In [9]:
# torch.ones_like(x) builds a tensor of ones with the same shape as x.
x = torch.empty((2, 2, 3))  # uninitialised memory: the printed values are arbitrary
ones_like_x = torch.ones_like(x)
print(x.shape, x, sep="\n")
print(ones_like_x.shape, ones_like_x, sep="\n")

torch.Size([2, 2, 3])
tensor([[[0., 0., 0.],
         [0., 0., 0.]],

        [[0., 0., 0.],
         [0., 0., 0.]]])
torch.Size([2, 2, 3])
tensor([[[1., 1., 1.],
         [1., 1., 1.]],

        [[1., 1., 1.],
         [1., 1., 1.]]])


In [3]:
# Whether this PyTorch installation can currently use CUDA
torch.cuda.is_available()

True

In [4]:
# CUDA version string reported by the installed PyTorch build
print(torch.version.cuda)

11.8


In [5]:
# Report the CUDA setup. The device queries only run when CUDA is usable:
# torch.cuda.current_device() raises on a machine without a CUDA device,
# which would otherwise abort the cell.
cuda_available = torch.cuda.is_available()
print(f"Is CUDA supported by this system? {cuda_available}")
print(f"CUDA version: {torch.version.cuda}")

# ID of the current CUDA device, or None when CUDA is unavailable
cuda_id = torch.cuda.current_device() if cuda_available else None
if cuda_id is not None:
    print(f"ID of current CUDA device: {cuda_id}")
    print(f"Name of current CUDA device: {torch.cuda.get_device_name(cuda_id)}")
else:
    print("No CUDA device available; skipping device queries.")


Is CUDA supported by this system? True
CUDA version: 11.8
ID of current CUDA device: 0
Name of current CUDA device: NVIDIA GeForce RTX 3090


### Programming concepts

Lambda functions:

In [11]:
# A lambda is an anonymous single-expression function; bound to a name it is called like any other function
add = lambda x, y: x + y
add(1,3)

4

In [16]:
# Sort (name, number) pairs by the number at index 1, largest first.
items = [('bread', 5), ('milk', 2), ('eggs', 15)]
# `reverse` is a boolean flag, so pass True rather than the integer 1.
sorted_items = sorted(items, key=lambda item: item[1], reverse=True)
print(sorted_items)

[('eggs', 15), ('bread', 5), ('milk', 2)]


In [22]:
# map() applies the lambda to each element lazily; list() materialises the result.
numbers = [1, 2, 3, 4, 5]
squared_numbers = list(
    map(lambda n: n ** 2, numbers)
)
print(squared_numbers)

[1, 4, 9, 16, 25]


In [26]:
# filter() keeps only the elements for which the lambda returns True (here: even values).
even_numbers = list(
    filter(lambda n: n % 2 == 0, numbers)
)
print(even_numbers)

[2, 4]


### Other

In [33]:
# Flatten a list of lists row by row: the outer `for` walks the rows,
# the inner `for` walks the values inside each row.
matrix = [
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9],
]
flat = [
    value
    for inner in matrix
    for value in inner
]
print(flat)

[1, 2, 3, 4, 5, 6, 7, 8, 9]


In [35]:
import torch.nn as nn

In [54]:
# Embedding lookup: a table of 4 vectors with 4 features each.
# Every entry of `a` is the index 1, so every position of the (3, 2, 4) result
# holds the same row of the table.
torch.manual_seed(1337)
Embedding_example = nn.Embedding(num_embeddings=4, embedding_dim=4)
a = torch.ones((3, 2), dtype=torch.long)
Embedding_example(a)

tensor([[[0.6258, 0.0255, 0.9545, 0.0643],
         [0.6258, 0.0255, 0.9545, 0.0643]],

        [[0.6258, 0.0255, 0.9545, 0.0643],
         [0.6258, 0.0255, 0.9545, 0.0643]],

        [[0.6258, 0.0255, 0.9545, 0.0643],
         [0.6258, 0.0255, 0.9545, 0.0643]]], grad_fn=<EmbeddingBackward0>)

tensor([[[-0.1346, -0.5477, -0.4532, -0.7402],
         [-0.1346, -0.5477, -0.4532, -0.7402]],

        [[-0.1346, -0.5477, -0.4532, -0.7402],
         [-0.1346, -0.5477, -0.4532, -0.7402]],

        [[-0.1346, -0.5477, -0.4532, -0.7402],
         [-0.1346, -0.5477, -0.4532, -0.7402]]], grad_fn=<EmbeddingBackward0>)

tensor([[[-1.1694, -0.1636, -0.8678,  0.5622],
         [-1.1694, -0.1636, -0.8678,  0.5622]],

        [[-1.1694, -0.1636, -0.8678,  0.5622],
         [-1.1694, -0.1636, -0.8678,  0.5622]],

        [[-1.1694, -0.1636, -0.8678,  0.5622],
         [-1.1694, -0.1636, -0.8678,  0.5622]]], grad_fn=<EmbeddingBackward0>)