In [1]:
import torch

In [6]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [8]:
x = torch.tensor([5,3], dtype=torch.float32, device=device, requires_grad=True)
y = torch.Tensor([2,1])
print(x*y)

RuntimeError: expected device cuda:0 but got device cpu

In [9]:
x = torch.tensor([5,3], dtype=torch.float32, device=device, requires_grad=True)
y = torch.tensor([2,1], dtype=torch.float32, device=device, requires_grad=True)
print(x*y)

tensor([10.,  3.], device='cuda:0', grad_fn=<MulBackward0>)


In [4]:
x = torch.zeros([2,5])
y = torch.rand([2,5])
print(x.shape, x, y)

torch.Size([2, 5]) tensor([[0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.]]) tensor([[0.3701, 0.3684, 0.0544, 0.5988, 0.5395],
        [0.1707, 0.9708, 0.0945, 0.4065, 0.1974]])


In [5]:
y.view([1,10])

tensor([[0.3701, 0.3684, 0.0544, 0.5988, 0.5395, 0.1707, 0.9708, 0.0945, 0.4065,
         0.1974]])

In [11]:
torch.eye(5,5)

tensor([[1., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0.],
        [0., 0., 1., 0., 0.],
        [0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 1.]])

In [10]:
x = torch.linspace(start=0.1, end=1, steps=10)
x

tensor([0.1000, 0.2000, 0.3000, 0.4000, 0.5000, 0.6000, 0.7000, 0.8000, 0.9000,
        1.0000])

In [12]:
torch.empty(size=(1,5)).uniform_(0,1)

tensor([[0.0305, 0.9668, 0.2055, 0.5282, 0.8413]])

In [13]:
torch.diag(torch.ones(3))

tensor([[1., 0., 0.],
        [0., 1., 0.],
        [0., 0., 1.]])

In [17]:
tensor=torch.arange(4)
print(tensor)
print(tensor.bool())
print(tensor.short())
print(tensor.long())
print(tensor.float())
print(tensor.double())

tensor([0, 1, 2, 3])
tensor([False,  True,  True,  True])
tensor([0, 1, 2, 3], dtype=torch.int16)
tensor([0, 1, 2, 3])
tensor([0., 1., 2., 3.])
tensor([0., 1., 2., 3.], dtype=torch.float64)


In [19]:
import numpy as np
np_array = np.zeros((5,5))
tensor = torch.from_numpy(np_array)
np_array_back = tensor.numpy()
tensor

tensor([[0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.]], dtype=torch.float64)

In [20]:
x = torch.tensor([1,2,3])
y = torch.tensor([9,8,7])

z1 = torch.empty(3)
torch.add(x,y,out=z1)

tensor([10., 10., 10.])

In [22]:
z2 = torch.add(x, y)
z = x + y
z

tensor([10, 10, 10])

In [23]:
x - y

tensor([-8, -6, -4])

In [24]:
z = torch.true_divide(x, y)
z

tensor([0.1111, 0.2500, 0.4286])

In [27]:
# inplace operations
t = torch.zeros(3)
t.add_(x) # _ signifies inplace; also returns result
t += x # t = t + x will not be inplace
t

tensor([2., 4., 6.])

In [28]:
z = x.pow(2)
z = x ** 2
z

tensor([1, 4, 9])

In [30]:
x1 = torch.rand((2,5))
x1

tensor([[0.8048, 0.3464, 0.6343, 0.3774, 0.3717],
        [0.6451, 0.8586, 0.8056, 0.9452, 0.1954]])

In [31]:
x2 = torch.rand((5,3))
x3 = torch.mm(x1, x2)
x3

tensor([[1.2918, 1.5011, 1.2975],
        [1.8299, 2.1756, 1.7424]])

In [32]:
x1.mm(x2)

tensor([[1.2918, 1.5011, 1.2975],
        [1.8299, 2.1756, 1.7424]])

In [33]:
torch.dot(x,y)

tensor(46)

In [34]:
# Batch matrix multiplication
batch = 32
n = 10
m = 20
p = 30
tensor1 = torch.rand((batch,n,m))
tensor2 = torch.rand((batch,m,p))

out_bmm = torch.bmm(tensor1, tensor2) # matrix mult across the appropriate dim (out (batch, n, p))
out_bmm.shape

torch.Size([32, 10, 30])

In [38]:
# Broadcasting
x1 = torch.rand((5,5))
x2 = torch.arange(5) # expanded in the subtraction to have 5 rows where each row is the same
x1 - 5*x2

tensor([[  0.3835,  -4.7586,  -9.4987, -14.4273, -19.7072],
        [  0.8517,  -4.7827,  -9.7003, -14.7938, -19.7391],
        [  0.8542,  -4.3145,  -9.1017, -14.6758, -19.9026],
        [  0.8664,  -4.7841,  -9.8909, -14.7451, -19.4490],
        [  0.9881,  -4.1576,  -9.1309, -14.8459, -19.6225]])

In [42]:
x

tensor([1, 2, 3])

In [40]:
# Other useful tensor operations
torch.sum(x, dim=0)

tensor(6)

In [44]:
values, indices = torch.max(x, dim=0)
indices

tensor(2)

In [52]:
x.max()

tensor(3)

In [43]:
torch.argmax(x, dim=0)

tensor(2)

In [45]:
torch.eq(x,y)

tensor([False, False, False])

In [46]:
torch.mean(x.float())

tensor(2.)

In [47]:
torch.sort(y,dim=0,descending=False)

torch.return_types.sort(
values=tensor([7, 8, 9]),
indices=tensor([2, 1, 0]))

In [50]:
torch.clamp(x, min=0, max=2)

tensor([1, 2, 2])

In [51]:
torch.clamp(x,min=0) # this is just ReLU

tensor([1, 2, 3])

In [62]:
# Tensor Indexing
batch_size = 10
features = 25
x = torch.rand((batch_size, features))

print(x[0].shape)
print(x[:,0].shape)
print(x[2,0:10])

x[0,0] = 100

# Fancy indexing
x = torch.arange(10)
indices = [2,5,8]
print(x[indices])

x = torch.rand((3,5))
rows = torch.tensor([1,0])
cols = torch.tensor([4,0])
print(x)
print(x[rows,cols]) # 2 elements

x = torch.arange(10)
print(x[(x<2) & (x>8)])
print(x[x.remainder(2) == 0])

print(torch.where(x>5, x, x*2)) # if x > 5 stay as x else multiply by 2

print(torch.tensor([0,0,1,2,2,3,4]).unique())
print(x.ndimension())
print(x.numel()) # number of elements

torch.Size([25])
torch.Size([10])
tensor([0.5842, 0.0946, 0.4091, 0.1546, 0.4195, 0.8964, 0.3115, 0.2267, 0.5685,
        0.3963])
tensor([2, 5, 8])
tensor([[0.3966, 0.3297, 0.0121, 0.5839, 0.9605],
        [0.4165, 0.7195, 0.1293, 0.1870, 0.7970],
        [0.4029, 0.8183, 0.5800, 0.4022, 0.7884]])
tensor([0.7970, 0.3966])
tensor([], dtype=torch.int64)
tensor([0, 2, 4, 6, 8])
tensor([ 0,  2,  4,  6,  8, 10,  6,  7,  8,  9])
tensor([0, 1, 2, 3, 4])
1
10


In [70]:
# Tensor reshaping

x = torch.arange(9)
x_3x3 = x.view(3,3)
print(x_3x3)
x_3x3 = x.reshape(3,3) # safe bet, makes a copy, can have performance loss

y= x_3x3.t()
print(y.contiguous().view(9))
print(x)

x1 = torch.rand((2,5))
x2 = torch.rand((2,5))
print(torch.cat((x1,x2),dim=0).shape)
print(torch.cat((x1,x2),dim=1).shape)
print(x1.view(-1).shape)

tensor([[0, 1, 2],
        [3, 4, 5],
        [6, 7, 8]])
tensor([0, 3, 6, 1, 4, 7, 2, 5, 8])
tensor([0, 1, 2, 3, 4, 5, 6, 7, 8])
torch.Size([4, 5])
torch.Size([2, 10])
torch.Size([10])


In [77]:
batch = 64
x = torch.rand((batch, 2, 5))
z = x.view(batch,-1)
print(z.shape)
z = x.permute(0, 2, 1) # transpose is a special case of permute
print(z.shape)

x = torch.arange(10)
print(x.shape)
print(x.unsqueeze(0).shape)
print(x.unsqueeze(1).shape)

x = torch.arange(10).unsqueeze(0).unsqueeze(1) # 1x1x10
z = x.squeeze(1)
print(z.shape)

torch.Size([64, 10])
torch.Size([64, 5, 2])
torch.Size([10])
torch.Size([1, 10])
torch.Size([10, 1])
torch.Size([1, 10])


In [11]:
import torch
import torch.nn as nn
import torch.nn.functional as F # ReLU tanh

In [34]:
n_words_source = 30
n_words_target = 30
batch_size = 10
dim_model = 512
dim_values = 64
dim_keys = 64
n_heads = 8
x = torch.rand((n_words_target, batch_size, dim_model)) # (n_words, batch_size, dim_model)
#values = torch.rand((batch_size, n_words_source, dim_model))

values_projected = nn.Linear(dim_model, dim_values, bias=False)
keys_projected = nn.Linear(dim_model, dim_keys, bias=False)
queries_projected = nn.Linear(dim_model, dim_keys, bias=False)

print(x.shape)
values = values_projected(x) # (n_words, batch_size, dim_values)
keys = keys_projected(x)
queries = queries_projected(x)
print(values.shape)
print(keys.permute(2,1,0).shape)

scaled = torch.einsum('ibj,jbk->ibk', queries, keys.permute(2,1,0)) / (dim_keys ** (1/2))
print(scaled.shape)
# every row of the word x word needs to sum to 1
softmaxed = F.softmax(scaled, dim=2)
print(softmaxed.shape)
out = torch.einsum('ibj,jbk->ibk', softmaxed, values)
print(out.shape)

torch.Size([30, 10, 512])
torch.Size([30, 10, 64])
torch.Size([64, 10, 30])
torch.Size([30, 10, 30])
torch.Size([30, 10, 30])
torch.Size([30, 10, 64])


In [35]:
w_o = nn.Linear(n_heads*dim_values, dim_model, bias=False)

concatenated_attn_heads = torch.cat([out for _ in range(n_heads)], dim=2)
print(concatenated_attn_heads.shape)
z = w_o(concatenated_attn_heads)
print(z.shape)

torch.Size([30, 10, 512])
torch.Size([30, 10, 512])


In [26]:
m = nn.Softmax(dim=1)
input = torch.randn(2, 3)
output = m(input)
output

tensor([[0.7373, 0.1394, 0.1233],
        [0.1155, 0.3117, 0.5728]])

In [18]:
queries = torch.rand((n_words_target, batch_size, dim_model)) # (n_words, batch_size, dim_model)
keys = torch.rand((n_words_source, batch_size, dim_model))
values = torch.rand((n_words_source, batch_size, dim_model))
F.multi_head_attention_forward(queries, keys, values, dim_model, 8, None, None, None, None, None, 0, None, None)

TypeError: 'NoneType' object is not subscriptable

In [14]:
queries = torch.rand((batch_size, n_words_source, dim_model))
keys = torch.rand((batch_size, n_words_source, dim_model))
values = torch.rand((batch_size, n_words_source, dim_model))
print(values.shape)
values = values_projected(values) # (batch_size, n_words_source, dim_values)
keys = keys_projected(keys) # (batch_size, n_words_source, dim_values)
queries = queries_projected(queries) # (batch_size, n_words_source, dim_values)
print(values.shape)

scaled = torch.bmm(queries, keys.permute(0, 2, 1)) / (dim_keys ** (1/2)) # (batch_size, n_words, n_words)
print(scaled.shape)
out = torch.bmm(F.softmax(scaled, dim=2), values) # attention (batch_size, n_words, dim_values)

print(out.shape)

torch.Size([8, 30, 512])
torch.Size([8, 30, 64])
torch.Size([8, 30, 30])
torch.Size([8, 30, 64])


In [None]:
class ScaledDotProductAttention(nn.Module):
    ''' Scaled Dot-Product Attention '''

    def __init__(self, temperature, attn_dropout=0.1):
        super().__init__()
        self.temperature = temperature
        self.dropout = nn.Dropout(attn_dropout)

    def forward(self, q, k, v, mask=None):

        attn = torch.matmul(q / self.temperature, k.transpose(2, 3))

        if mask is not None:
            attn = attn.masked_fill(mask == 0, -1e9)

        attn = self.dropout(F.softmax(attn, dim=-1))
        output = torch.matmul(attn, v)

        return output, attn
    
class MultiHeadAttention(nn.Module):
    ''' Multi-Head Attention module '''

    def __init__(self, n_head, d_model, d_k, d_v, dropout=0.1):
        super().__init__()

        self.n_head = n_head
        self.d_k = d_k
        self.d_v = d_v

        self.w_qs = nn.Linear(d_model, n_head * d_k, bias=False)
        self.w_ks = nn.Linear(d_model, n_head * d_k, bias=False)
        self.w_vs = nn.Linear(d_model, n_head * d_v, bias=False)
        self.fc = nn.Linear(n_head * d_v, d_model, bias=False)

        self.attention = ScaledDotProductAttention(temperature=d_k ** 0.5)

        self.dropout = nn.Dropout(dropout)
        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)


    def forward(self, q, k, v, mask=None):

        d_k, d_v, n_head = self.d_k, self.d_v, self.n_head
        sz_b, len_q, len_k, len_v = q.size(0), q.size(1), k.size(1), v.size(1)

        residual = q

        # Pass through the pre-attention projection: b x lq x (n*dv)
        # Separate different heads: b x lq x n x dv
        q = self.w_qs(q).view(sz_b, len_q, n_head, d_k)
        k = self.w_ks(k).view(sz_b, len_k, n_head, d_k)
        v = self.w_vs(v).view(sz_b, len_v, n_head, d_v)

        # Transpose for attention dot product: b x n x lq x dv
        q, k, v = q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2)

        if mask is not None:
            mask = mask.unsqueeze(1)   # For head axis broadcasting.

        q, attn = self.attention(q, k, v, mask=mask)

        # Transpose to move the head dimension back: b x lq x n x dv
        # Combine the last two dimensions to concatenate all the heads together: b x lq x (n*dv)
        q = q.transpose(1, 2).contiguous().view(sz_b, len_q, -1)
        q = self.dropout(self.fc(q))
        q += residual

        q = self.layer_norm(q)

        return q, attn

## Neural Network example

In [80]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F # ReLU tanh
from torch.utils.data import DataLoader # Easier dataset management
import torchvision.datasets as datasets
import torchvision.transforms as transforms # transformations we can perform on dataset

In [81]:
# Create fully connected network
class NN(nn.Module):
    def __init__(self, input_size, num_classes):
        super(NN, self).__init__() # calls initialization method of the parent class (nn.Module)
        self.fc1 = nn.Linear(input_size, 50) # 50 nodes in hidden layer
        self.fc2 = nn.Linear(50, num_classes)
        
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x
    
model = NN(784, 10)
x = torch.randn(64, 784) #64 examples simultaneously (minibatch size)

print(model(x).shape)        

torch.Size([64, 10])


In [82]:
# Set device
device = torch.device('cuda'if torch.cuda.is_available() else 'cpu')

In [83]:
# Hyperparameters
input_size = 784
num_classes = 10
learning_rate = 0.001
batch_size = 64
num_epochs = 1

In [93]:
# Load Data
train_dataset = datasets.MNIST(root='dataset/', train=True, transform = transforms.ToTensor(), download=True ) #transforms converts it from numpy to tensor
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True) # shuffle each epoch
test_dataset = datasets.MNIST(root='dataset/', train=False, transform = transforms.ToTensor(), download=True ) #transforms converts it from numpy to tensor
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=True) # shuffle each epoch

In [94]:
# Initialize network
model = NN(input_size=input_size, num_classes=num_classes).to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [87]:
# Train Network
for epoch in range(num_epochs):
    for batch_idx, (data, targets) in enumerate(train_loader):
        data = data.to(device=device)
        targets = targets.to(device=device)
        
        print(data.shape) # (64,1,28,28) 1 channel, 28x28 pixels
        data = data.reshape(data.shape[0], -1) # Keep 64, then flatten all the others
        
        # forward
        scores = model(data)
        loss = criterion(scores, targets) # targets are the correct labels, scores are the predictions
        
        # backward
        optimizer.zero_grad() # set all gradients to 0 foreach batch so don't store from previous forward props
        loss.backward()
        
        optimizer.step() # gradient descent or adam step

torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 2

torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 2

torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 2

torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 28, 28])
torch.Size([64, 1, 2

In [95]:
def check_accuracy(loader, model):
    num_correct = 0
    num_samples = 0
    model.eval()
    
    with torch.no_grad():
        # don't need to actually compute gradients in calculations
        for x, y in loader:
            x = x.to(device=device)
            y = y.to(device=device)
            x = x.reshape(x.shape[0],-1)
            scores=model(x)
            _, predictions = scores.max(dim=1)
            num_correct += (predictions == y).sum()
            num_samples += predictions.size(0)
            
        print(f'Got {num_correct} / {num_samples} with accuracy {float(num_correct)/float(num_samples)*100:.2f}')
        
    model.train()

check_accuracy(train_loader, model)
check_accuracy(test_loader, model)

Got 6116 / 60000 with accuracy 10.19
Got 1024 / 10000 with accuracy 10.24
