**Imports**

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

**Defining a torch tensor**

In [None]:
# torch.tensor infers the dtype from the data: Python ints give an integer ("long") tensor
state = torch.tensor([1,2,3]) # long by default
state

tensor([1, 2, 3])

In [None]:
# Other tensor types: pass dtype explicitly to override the inferred type
state = torch.tensor([1,2,3], dtype=torch.float)
state

tensor([1., 2., 3.])

In [None]:
# Multi-dimensional tensors: a nested list of 3 rows x 3 values becomes a 3x3 tensor
# Note the capital T: torch.Tensor(...) is a different constructor from torch.tensor(...) used above
state = torch.Tensor([[1,2,3], [4,5,6], [7,8,9]]) # float32 by default
state

tensor([[1., 2., 3.],
        [4., 5., 6.],
        [7., 8., 9.]])

In [None]:
# A couple more useful torch functions to initialize tensors of a given shape
print(torch.zeros((2,2))) # 2x2 tensor filled with zeros
print(torch.ones((3,3))) # 3x3 tensor filled with ones

tensor([[0., 0.],
        [0., 0.]])
tensor([[1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.]])


**Some useful tensor properties**

In [None]:
# Inspect the 3x3 float tensor `state` created in the multi-dimensional example
print(state)
print(state.dtype) # the type of elements in the tensor, eg: long, float32, etc.
print(state.shape) # the shape of the tensor
print(state.device) # the device the tensor is on: CPU or GPU
print(state.grad) # the gradient of the tensor (None here, nothing has been computed yet), more on this later

tensor([[1., 2., 3.],
        [4., 5., 6.],
        [7., 8., 9.]])
torch.float32
torch.Size([3, 3])
cpu
None


**Side note: How to check if I have a GPU?**

In [None]:
# Not using a GPU right now, so this evaluates to False
torch.cuda.is_available()

False

**Useful tensor operations**

In [None]:
# Without a dim argument each reduction runs over every element and returns a single-value tensor
print(state.sum()) # Also torch.sum(state)
print(state.mean()) # Also torch.mean(state)
print(state.max()) # Also torch.max(state)

tensor(45.)
tensor(5.)
tensor(9.)


You can also do these operations across specific dimensions. Here is an example with `sum` but you can do the same things with other operations like `max` and `mean` as well.

In [None]:
# summing with dim=0 collapses the 0th (row) dimension, giving one sum per column
print(state.sum(dim=0)) 
# summing with dim=1 collapses the 1st (column) dimension, giving one sum per row
print(state.sum(dim=1)) 
# dim=-1 refers to the last dimension, which for a 2-D tensor is the same as dim=1
print(state.sum(dim=-1)) 
# Here is a more telling example: summing a (3,4,5) tensor over its last dimension leaves shape (3,4)
print(torch.ones((3,4,5)).sum(-1).shape)

tensor([12., 15., 18.])
tensor([ 6., 15., 24.])
tensor([ 6., 15., 24.])
torch.Size([3, 4])


You can add, subtract, and multiply tensors just as you do with `numpy` arrays. Broadcasting also works the same way as it does in `numpy`.

In [None]:
# A Python scalar is broadcast against every element of the 3x3 tensor
print(state * 2) # every element is multiplied by 2
print("*****************")
print(state + 2) # 2 is added to every element

tensor([[ 2.,  4.,  6.],
        [ 8., 10., 12.],
        [14., 16., 18.]])
*****************
tensor([[ 3.,  4.,  5.],
        [ 6.,  7.,  8.],
        [ 9., 10., 11.]])


**Element-wise multiplication and matrix multiplication**

In [None]:
x1 = torch.Tensor([1,2,3]) # 1-D tensor with 3 elements, shape (3,)
x2 = torch.Tensor([4,5,6]) # 1-D tensor with 3 elements, shape (3,)
x3 = torch.ones((3,3)) # 3x3

print(x1 * x2) # element-wise multiplication
print(x1.dot(x2)) # vector dot product: 1*4 + 2*5 + 3*6
print(x1.matmul(x3)) # matrix multiplication: the 3-element vector times the 3x3 matrix gives 3 elements

tensor([ 4., 10., 18.])
tensor(32.)
tensor([6., 6., 6.])


**Reshaping Torch tensors**

In [None]:
x = torch.ones((3,4,5)) # 3*4*5 = 60 elements
y = torch.ones((2,3))

print(x.reshape(20,3).shape) # any target shape works as long as it also holds 60 elements
print(x.reshape(3, -1).shape) # torch infers what should come in the -1 position
print(x.reshape(5,2,-1).shape) # 60 / (5*2) = 6 is inferred for the last dimension
print(y.T.shape) # transpose; same shape as y.reshape(3,2) but NOT the same element layout
print(x.transpose(-1,1).shape) # allows you to swapaxes in multi-dimensional tensors: (3,4,5) -> (3,5,4)

torch.Size([20, 3])
torch.Size([3, 20])
torch.Size([5, 2, 6])
torch.Size([3, 2])
torch.Size([3, 5, 4])


**Some more useful methods**

In [None]:
x1 = torch.Tensor([1,2,3,4])
x2 = torch.Tensor([2,3,5,7])

print(F.mse_loss(x1, x2)) # calculates MSE loss between x1 and x2: mean of the squared differences (1+1+4+9)/4
print(torch.sigmoid(x1)) # applies the sigmoid function on the given tensor, element by element
print(F.one_hot(x1.long(), 5)) # converts the tensor to one-hot; needs integer indices, 5 is the number of classes
print(F.one_hot(x1.long(), 7)) # same indices, but each one-hot row now has 7 entries

tensor(3.7500)
tensor([0.7311, 0.8808, 0.9526, 0.9820])
tensor([[0, 1, 0, 0, 0],
        [0, 0, 1, 0, 0],
        [0, 0, 0, 1, 0],
        [0, 0, 0, 0, 1]])
tensor([[0, 1, 0, 0, 0, 0, 0],
        [0, 0, 1, 0, 0, 0, 0],
        [0, 0, 0, 1, 0, 0, 0],
        [0, 0, 0, 0, 1, 0, 0]])


**Tensors with gradients**

In [None]:
# A freshly created tensor does not track gradients until requires_grad is switched on
state = torch.Tensor([[1,2,3], [4,5,6]])
print(state.grad) # state has no gradient now
print(state.requires_grad) # torch is not tracking the gradient either
state.requires_grad = True # this will tell torch to track the gradient of state
print(state.requires_grad)
print(state.grad) # but still no gradient: nothing has been back-propagated yet

None
False
True
None


In [None]:
target = 25 # some random target
prediction = state.sum() # sum of all six elements (21)
loss = (target - prediction) **2 # by the way, ** is also a broadcast operation
loss # (25 - 21)**2 = 16; the grad_fn in the output shows torch recorded how it was computed

tensor(16., grad_fn=<PowBackward0>)

In [None]:
loss.backward() # back-propagates from loss and fills in state.grad
state.grad # d(loss)/d(state) = -2 * (target - prediction) = -8 for every element

tensor([[-8., -8., -8.],
        [-8., -8., -8.]])

We can now use this gradient to perform an update on `state` using gradient descent or any other fancy optimizer.

**The torch `nn` module**

In [None]:
class MyModel(nn.Module):
    """Small fully connected network: Linear(10, 5) -> ReLU -> Linear(5, 1)."""

    def __init__(self):
        super().__init__()
        # Each nn.Linear encapsulates a weight and a bias and tracks gradients by default.
        self.layer1 = nn.Linear(10, 5)
        self.layer2 = nn.Linear(5, 1)

    def forward(self, x):
        """Run the forward pass.

        Args:
            x: tensor whose last dimension has size 10, e.g. (batchsize, 10).

        Returns:
            Tensor with the same leading dimensions and a last dimension of size 1.
        """
        # self.layer1(x) performs x.matmul(w.T) + b, where w has shape (out_features, in_features).
        hidden = self.layer1(x)
        # Functional ReLU: avoids constructing a new nn.ReLU module on every forward call.
        hidden = F.relu(hidden)
        return self.layer2(hidden)

In [None]:
model = MyModel()
print(model.layer1.weight.shape, model.layer1.bias.shape) # weight is (out_features, in_features) = (5, 10)
# Named `inputs` rather than `input` so the builtin input() is not shadowed for the rest of the notebook
inputs = torch.ones(4, 10) # shape is batchsize x dimension
out = model(inputs) # same as model.forward(inputs)
print(out) # every row is identical because every input row is identical
print(out.shape) # batchsize x output_dim

torch.Size([5, 10]) torch.Size([5])
tensor([[-0.0836],
        [-0.0836],
        [-0.0836],
        [-0.0836]], grad_fn=<AddmmBackward0>)
torch.Size([4, 1])


Let's add a simple loss function

In [None]:
# reshape(-1, 1) turns the 4 targets into a 4x1 column so the shape matches `out` (batchsize x 1)
target = torch.tensor([0.4, 0.2, 0.99, 0.3]).reshape(-1, 1) # random target
loss = F.mse_loss(out, target) # MSE loss
print(loss)

tensor(0.4035, grad_fn=<MseLossBackward0>)


In [None]:
print(model.layer1.weight, model.layer1.bias) # current values of the first layer's parameters
print("*************************************")
print(model.layer1.weight.grad, model.layer1.bias.grad) # no gradient yet

Parameter containing:
tensor([[ 0.1318, -0.0576, -0.3030, -0.2118,  0.2717, -0.1217,  0.0581, -0.2497,
         -0.2116,  0.2607],
        [ 0.2519,  0.2266, -0.0837,  0.0837,  0.0498, -0.1639,  0.1967,  0.0892,
          0.2818,  0.2643],
        [ 0.2002, -0.2317, -0.1012,  0.1545, -0.1425, -0.1672, -0.2687, -0.0745,
          0.1760,  0.2588],
        [ 0.1746, -0.2022, -0.0185,  0.2798, -0.0306,  0.2052, -0.2881,  0.0842,
          0.2125,  0.2463],
        [ 0.3013,  0.1859, -0.0012,  0.0886, -0.0355,  0.1531, -0.2281, -0.2515,
          0.2611,  0.2024]], requires_grad=True) Parameter containing:
tensor([ 0.0390,  0.1265, -0.1221, -0.0808,  0.0080], requires_grad=True)
*************************************
None None


Before we get the gradient, let's define an optimizer

In [None]:
# model.parameters() hands every weight and bias of the model to the optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.1) # this learning rate is extremely high for Adam btw

In [None]:
optimizer.zero_grad() # clears the gradients (they're cleared anyway at this point)
loss.backward() # calculates the gradients
# NOTE(review): the all-zero rows presumably belong to hidden units whose ReLU output was 0 for this input
print(model.layer1.weight.grad, model.layer1.bias.grad) # now we have gradients

tensor([[ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000],
        [ 0.2292,  0.2292,  0.2292,  0.2292,  0.2292,  0.2292,  0.2292,  0.2292,
          0.2292,  0.2292],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000],
        [ 0.4020,  0.4020,  0.4020,  0.4020,  0.4020,  0.4020,  0.4020,  0.4020,
          0.4020,  0.4020],
        [-0.4448, -0.4448, -0.4448, -0.4448, -0.4448, -0.4448, -0.4448, -0.4448,
         -0.4448, -0.4448]]) tensor([ 0.0000,  0.2292,  0.0000,  0.4020, -0.4448])


But we still haven't updated the weights

In [None]:
# Same values as before loss.backward(): computing gradients alone does not change the parameters
print(model.layer1.weight, model.layer1.bias)

Parameter containing:
tensor([[ 0.1318, -0.0576, -0.3030, -0.2118,  0.2717, -0.1217,  0.0581, -0.2497,
         -0.2116,  0.2607],
        [ 0.2519,  0.2266, -0.0837,  0.0837,  0.0498, -0.1639,  0.1967,  0.0892,
          0.2818,  0.2643],
        [ 0.2002, -0.2317, -0.1012,  0.1545, -0.1425, -0.1672, -0.2687, -0.0745,
          0.1760,  0.2588],
        [ 0.1746, -0.2022, -0.0185,  0.2798, -0.0306,  0.2052, -0.2881,  0.0842,
          0.2125,  0.2463],
        [ 0.3013,  0.1859, -0.0012,  0.0886, -0.0355,  0.1531, -0.2281, -0.2515,
          0.2611,  0.2024]], requires_grad=True) Parameter containing:
tensor([ 0.0390,  0.1265, -0.1221, -0.0808,  0.0080], requires_grad=True)


Now let's update the weights using the optimizer. As we can see below, the weights have now been updated!

In [None]:
optimizer.step() # applies one update to the parameters using the gradients stored in .grad
# Entries with a non-zero gradient moved by about lr = 0.1; rows whose gradient was 0 are unchanged
print(model.layer1.weight, model.layer1.bias)

Parameter containing:
tensor([[ 0.1318, -0.0576, -0.3030, -0.2118,  0.2717, -0.1217,  0.0581, -0.2497,
         -0.2116,  0.2607],
        [ 0.1519,  0.1266, -0.1837, -0.0163, -0.0502, -0.2639,  0.0967, -0.0108,
          0.1818,  0.1643],
        [ 0.2002, -0.2317, -0.1012,  0.1545, -0.1425, -0.1672, -0.2687, -0.0745,
          0.1760,  0.2588],
        [ 0.0746, -0.3022, -0.1185,  0.1798, -0.1306,  0.1052, -0.3881, -0.0158,
          0.1125,  0.1463],
        [ 0.4013,  0.2859,  0.0988,  0.1886,  0.0645,  0.2531, -0.1281, -0.1515,
          0.3611,  0.3024]], requires_grad=True) Parameter containing:
tensor([ 0.0390,  0.0265, -0.1221, -0.1808,  0.1080], requires_grad=True)


**A general PyTorch recipe:**


1.   Create a model class
2.   Get your training and evaluation data
3.   Set up an optimizer
4.   Perform forward passes with training data and compute the loss
5. `optimizer.zero_grad()`
6. `loss.backward()`
7. `optimizer.step()`
8. After a few steps of training, evaluate on evaluation data
9. Repeat until loss doesn't decrease or evaluation performance starts to fall off



**Saving and loading models**

In [None]:
# Ordered mapping from each parameter's name to its tensor
model.state_dict()

OrderedDict([('layer1.weight',
              tensor([[ 0.1318, -0.0576, -0.3030, -0.2118,  0.2717, -0.1217,  0.0581, -0.2497,
                       -0.2116,  0.2607],
                      [ 0.1519,  0.1266, -0.1837, -0.0163, -0.0502, -0.2639,  0.0967, -0.0108,
                        0.1818,  0.1643],
                      [ 0.2002, -0.2317, -0.1012,  0.1545, -0.1425, -0.1672, -0.2687, -0.0745,
                        0.1760,  0.2588],
                      [ 0.0746, -0.3022, -0.1185,  0.1798, -0.1306,  0.1052, -0.3881, -0.0158,
                        0.1125,  0.1463],
                      [ 0.4013,  0.2859,  0.0988,  0.1886,  0.0645,  0.2531, -0.1281, -0.1515,
                        0.3611,  0.3024]])),
             ('layer1.bias',
              tensor([ 0.0390,  0.0265, -0.1221, -0.1808,  0.1080])),
             ('layer2.weight',
              tensor([[-0.2829, -0.1061,  0.1022, -0.2615,  0.4999]])),
             ('layer2.bias', tensor([0.2259]))])

In [None]:
torch.save(model.state_dict(), "model.pt") # saving the model (the parameter tensors only)
new_model = MyModel() # a fresh instance of the same class to receive the saved parameters
# NOTE(review): torch.load unpickles the file, so only load checkpoints that come from a trusted source
new_model.load_state_dict(torch.load("model.pt")) # loading the model

<All keys matched successfully>

A couple more torch modules that you might find useful for assignments 2 and 3 are `nn.Sequential` and `nn.Conv2d`. Check these out in the [PyTorch documentation](https://pytorch.org/docs/stable/index.html)!