## Notes

- Similar to numpy but can perform computations on CPU, GPU and TPU
- Automatic differentiation and allows computing of gradients efficiently
- Tensors: multi-dimensional arrays with added functionalities
- requires_grad = true for calculating gradients and auto differentiation
- PyTorch takes care of scaling across multiple devices, and threads, and supports a variety of platforms
- Need to do optimizer.zero_grad() to avoide accumulating gradients from past passes
- Broadcasting: perform operations elementwise such as addition or multiplication with diff dim tensors

In [114]:
import torch                                        # root package
from torch.utils.data import Dataset, DataLoader    # dataset representation and loading

import torch.autograd as autograd         # computation graph
from torch import Tensor                  # tensor node in the computation graph
import torch.nn as nn                     # neural networks
import torch.nn.functional as F           # layers, activations and more
import torch.optim as optim               # optimizers e.g. gradient descent, ADAM, etc.
from torch.jit import script, trace       # hybrid frontend decorator and tracing jit

In [None]:
x = torch.randn(*size)              # tensor with independent N(0,1) entries
x = torch.[ones|zeros](*size)       # tensor with all 1's [or 0's]
x = torch.tensor(L)                 # create tensor from [nested] list or ndarray L
y = x.clone()                       # clone of x
with torch.no_grad():               # code wrap that stops autograd from tracking tensor history
requires_grad=True                  # arg, when set to True, tracks computation
                                    # history for future derivative calculations

## basics

In [1]:
import torch

In [5]:
torch.tensor(3)

tensor(3)

In [7]:
torch.tensor([2,2])

tensor([2, 2])

In [8]:
torch.zeros([2,2])

tensor([[0., 0.],
        [0., 0.]])

In [9]:
torch.rand([2,2])

tensor([[0.3183, 0.6828],
        [0.1345, 0.2790]])

In [10]:
x = torch.rand([2,2])
y = torch.rand([2,5])

x @ y

tensor([[0.5167, 0.4390, 0.0489, 0.1904, 0.1595],
        [0.0932, 0.0784, 0.0094, 0.0361, 0.0297]])

In [12]:
x = torch.rand([2,2])
y = torch.rand([2,2])

x + y

tensor([[0.8124, 0.4714],
        [1.5008, 1.3396]])

In [13]:
(x+y).numpy()

array([[0.81240046, 0.4713856 ],
       [1.500824  , 1.3395782 ]], dtype=float32)

In [14]:
torch.tensor((x+y).numpy())

tensor([[0.8124, 0.4714],
        [1.5008, 1.3396]])

### differentiation

In [34]:
x = torch.tensor(1.0, requires_grad = True) #by default it is false

def u(x):
    return x*x

def g(u):
    return -u

g(u(x)) --> -x*x

So its derivative with respect to x is -2x. At point x=1, this is equal to -2.

In [35]:
dgdx = torch.autograd.grad(g(u(x)), x)

In [36]:
dgdx

(tensor(-2.),)

In [37]:
x = torch.tensor(3.0, requires_grad = True)
torch.autograd.grad(g(u(x)), x)

(tensor(-6.),)

In [38]:
x

tensor(3., requires_grad=True)

## Curve fitting

f(x) = 5x^2 + 3

g(x, w) = w0 x^2 + w1 x + w2

goal --> g(x, w) ≈ f(x)

loss function: L(w) = Σ (f(x) - g(x, w))^2

- We can calculate stochastic gradient descent of L(W) wrt W

In [39]:
import numpy as np
import torch

# Assuming we know that the desired function is a polynomial of 2nd degree, we
# allocate a vector of size 3 to hold the coefficients and initialize it with
# random noise.
w = torch.tensor(torch.randn([3, 1]), requires_grad=True)

w

  w = torch.tensor(torch.randn([3, 1]), requires_grad=True)


tensor([[-1.2976],
        [-1.0065],
        [ 0.9774]], requires_grad=True)

In [40]:
# We use the Adam optimizer with learning rate set to 0.1 to minimize the loss.
opt = torch.optim.Adam([w], 0.1)

opt

Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.1
    maximize: False
    weight_decay: 0
)

In [44]:
x2 = torch.rand(100) 
x = x2 * 20 - 10
y = 5 * x * x + 3

In [45]:
x

tensor([-2.8069, -9.0613,  8.1456,  1.6699, -6.4031, -1.5050, -4.1948, -0.7398,
         9.3466, -4.9756, -2.1995, -3.2008,  1.2858, -1.9381, -9.8799,  9.6505,
        -2.5194,  9.7330,  6.7686, -7.7375, -0.3468, -3.1263, -3.7703,  3.5126,
        -7.4346,  8.9468,  8.1607,  3.4066, -6.0022,  0.0320, -1.8150, -8.6166,
        -2.1704, -8.2007, -1.3589, -3.6989, -9.7847, -7.9993,  7.5302, -5.6971,
         5.0948, -7.8923, -0.3708,  6.7338, -6.8006, -5.3266, -8.6830,  2.7671,
         1.8125,  2.0179, -7.9539, -3.1543,  2.5737,  1.6119, -2.1442,  5.7369,
        -3.6795,  7.4567,  4.1785, -7.6737, -6.0905,  3.2620, -4.1335, -4.3653,
         6.0254, -5.9413,  9.2114,  6.3920,  7.7732, -0.5257,  9.0921, -9.3792,
        -6.7093,  3.7172, -4.1679, -8.5698, -4.7751, -6.5368, -2.7709,  9.4531,
        -6.3863,  2.0916, -1.5845, -6.0845,  4.4837,  8.5079, -0.8623, -2.6502,
         0.5062,  5.6700,  1.3096, -3.3423,  2.6363, -7.9379,  2.3675,  7.4028,
         0.5558,  7.7707,  0.4296,  6.02

In [52]:
a = x[0]
a*a, a, 1

(tensor(7.8785), tensor(-2.8069), 1)

#### stack

In [56]:
a = torch.tensor(1)
torch.stack([a,a,a])

tensor([1, 1, 1])

In [58]:
torch.stack([a,a,a], 1)

IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)

In [59]:
a = torch.rand([2,2])
torch.stack([a,a,a])

tensor([[[0.2222, 0.0676],
         [0.7781, 0.2202]],

        [[0.2222, 0.0676],
         [0.7781, 0.2202]],

        [[0.2222, 0.0676],
         [0.7781, 0.2202]]])

In [60]:
torch.stack([a,a,a], 1)

tensor([[[0.2222, 0.0676],
         [0.2222, 0.0676],
         [0.2222, 0.0676]],

        [[0.7781, 0.2202],
         [0.7781, 0.2202],
         [0.7781, 0.2202]]])

In [50]:
f = torch.stack([x * x, x, torch.ones_like(x)], 1)
f

tensor([[ 7.8785e+00, -2.8069e+00,  1.0000e+00],
        [ 8.2107e+01, -9.0613e+00,  1.0000e+00],
        [ 6.6350e+01,  8.1456e+00,  1.0000e+00],
        [ 2.7887e+00,  1.6699e+00,  1.0000e+00],
        [ 4.1000e+01, -6.4031e+00,  1.0000e+00],
        [ 2.2649e+00, -1.5050e+00,  1.0000e+00],
        [ 1.7596e+01, -4.1948e+00,  1.0000e+00],
        [ 5.4730e-01, -7.3980e-01,  1.0000e+00],
        [ 8.7358e+01,  9.3466e+00,  1.0000e+00],
        [ 2.4756e+01, -4.9756e+00,  1.0000e+00],
        [ 4.8378e+00, -2.1995e+00,  1.0000e+00],
        [ 1.0245e+01, -3.2008e+00,  1.0000e+00],
        [ 1.6532e+00,  1.2858e+00,  1.0000e+00],
        [ 3.7563e+00, -1.9381e+00,  1.0000e+00],
        [ 9.7612e+01, -9.8799e+00,  1.0000e+00],
        [ 9.3132e+01,  9.6505e+00,  1.0000e+00],
        [ 6.3472e+00, -2.5194e+00,  1.0000e+00],
        [ 9.4731e+01,  9.7330e+00,  1.0000e+00],
        [ 4.5814e+01,  6.7686e+00,  1.0000e+00],
        [ 5.9869e+01, -7.7375e+00,  1.0000e+00],
        [ 1.2029e-01

In [61]:
torch.stack([x * x, x, torch.ones_like(x)])

tensor([[ 7.8785e+00,  8.2107e+01,  6.6350e+01,  2.7887e+00,  4.1000e+01,
          2.2649e+00,  1.7596e+01,  5.4730e-01,  8.7358e+01,  2.4756e+01,
          4.8378e+00,  1.0245e+01,  1.6532e+00,  3.7563e+00,  9.7612e+01,
          9.3132e+01,  6.3472e+00,  9.4731e+01,  4.5814e+01,  5.9869e+01,
          1.2029e-01,  9.7739e+00,  1.4215e+01,  1.2338e+01,  5.5274e+01,
          8.0045e+01,  6.6597e+01,  1.1605e+01,  3.6026e+01,  1.0226e-03,
          3.2944e+00,  7.4246e+01,  4.7107e+00,  6.7252e+01,  1.8465e+00,
          1.3682e+01,  9.5740e+01,  6.3989e+01,  5.6703e+01,  3.2457e+01,
          2.5957e+01,  6.2289e+01,  1.3753e-01,  4.5345e+01,  4.6248e+01,
          2.8373e+01,  7.5394e+01,  7.6569e+00,  3.2852e+00,  4.0720e+00,
          6.3265e+01,  9.9496e+00,  6.6237e+00,  2.5984e+00,  4.5977e+00,
          3.2912e+01,  1.3539e+01,  5.5602e+01,  1.7459e+01,  5.8886e+01,
          3.7094e+01,  1.0641e+01,  1.7086e+01,  1.9055e+01,  3.6305e+01,
          3.5299e+01,  8.4850e+01,  4.

In [64]:
f.shape, w.shape

(torch.Size([100, 3]), torch.Size([3, 1]))

#### mat mul

In [62]:
f @ w

tensor([[-6.4205e+00],
        [-9.6443e+01],
        [-9.3317e+01],
        [-4.3220e+00],
        [-4.5779e+01],
        [-4.4676e-01],
        [-1.7633e+01],
        [ 1.0118e+00],
        [-1.2179e+02],
        [-2.6138e+01],
        [-3.0863e+00],
        [-9.0950e+00],
        [-2.4620e+00],
        [-1.9460e+00],
        [-1.1574e+02],
        [-1.2958e+02],
        [-4.7229e+00],
        [-1.3174e+02],
        [-6.5284e+01],
        [-6.8921e+01],
        [ 1.1704e+00],
        [-8.5584e+00],
        [-1.3673e+01],
        [-1.8568e+01],
        [-6.3262e+01],
        [-1.1189e+02],
        [-9.3652e+01],
        [-1.7510e+01],
        [-3.9728e+01],
        [ 9.4388e-01],
        [-1.4705e+00],
        [-8.6691e+01],
        [-2.9505e+00],
        [-7.8033e+01],
        [-5.0909e-02],
        [-1.3053e+01],
        [-1.1341e+02],
        [-7.4003e+01],
        [-8.0180e+01],
        [-3.5404e+01],
        [-3.7832e+01],
        [-7.1904e+01],
        [ 1.1722e+00],
        [-6

In [66]:
yhat = torch.squeeze(f @ w, 1)
yhat

tensor([-6.4205e+00, -9.6443e+01, -9.3317e+01, -4.3220e+00, -4.5779e+01,
        -4.4676e-01, -1.7633e+01,  1.0118e+00, -1.2179e+02, -2.6138e+01,
        -3.0863e+00, -9.0950e+00, -2.4620e+00, -1.9460e+00, -1.1574e+02,
        -1.2958e+02, -4.7229e+00, -1.3174e+02, -6.5284e+01, -6.8921e+01,
         1.1704e+00, -8.5584e+00, -1.3673e+01, -1.8568e+01, -6.3262e+01,
        -1.1189e+02, -9.3652e+01, -1.7510e+01, -3.9728e+01,  9.4388e-01,
        -1.4705e+00, -8.6691e+01, -2.9505e+00, -7.8033e+01, -5.0909e-02,
        -1.3053e+01, -1.1341e+02, -7.4003e+01, -8.0180e+01, -3.5404e+01,
        -3.7832e+01, -7.1904e+01,  1.1722e+00, -6.4639e+01, -5.2189e+01,
        -3.0478e+01, -8.8113e+01, -1.1743e+01, -5.1099e+00, -6.3375e+00,
        -7.3109e+01, -8.7582e+00, -1.0208e+01, -4.0167e+00, -2.8303e+00,
        -4.7503e+01, -1.2887e+01, -7.8676e+01, -2.5884e+01, -6.7709e+01,
        -4.1025e+01, -1.6114e+01, -1.7032e+01, -1.9355e+01, -5.2196e+01,
        -3.8846e+01, -1.1840e+02, -5.8472e+01, -8.5

In [67]:
yhat.shape, y.shape

(torch.Size([100]), torch.Size([100]))

In [68]:
loss = torch.nn.functional.mse_loss(yhat, y)


In [69]:
loss

tensor(79168.4453, grad_fn=<MseLossBackward0>)

In [70]:
opt.zero_grad()
loss.backward()
opt.step()

In [73]:
w

tensor([[-1.1976],
        [-1.1065],
        [ 1.0774]], requires_grad=True)

### model

In [74]:
for input, target in dataset:
    optimizer.zero_grad()   # Reset gradients to zero to avoid accumulation
    output = model(input)   # Forward pass: compute the predicted output
    loss = loss_fn(output, target)  # Compute loss
    loss.backward()         # Backpropagation: compute gradients
    optimizer.step()        # Update parameters based on current gradients

NameError: name 'dataset' is not defined

In [79]:
import numpy as np
import torch

# Assuming we know that the desired function is a polynomial of 2nd degree, we
# allocate a vector of size 3 to hold the coefficients and initialize it with
# random noise.
w = torch.tensor(torch.randn([3, 1]), requires_grad=True)

# We use the Adam optimizer with learning rate set to 0.1 to minimize the loss.
opt = torch.optim.Adam([w], 0.1)

def model(x):
    # We define yhat to be our estimate of y.
    f = torch.stack([x * x, x, torch.ones_like(x)], 1)
    yhat = torch.squeeze(f @ w, 1)
    return yhat

def compute_loss(y, yhat):
    # The loss is defined to be the mean squared error distance between our
    # estimate of y and its true value. 
    loss = torch.nn.functional.mse_loss(yhat, y)
    return loss

def generate_data():
    # Generate some training data based on the true function
    x = torch.rand(100) * 20 - 10
    y = 5 * x * x + 3
    return x, y

x, y = generate_data()

def train_step():
   
    yhat = model(x)
    loss = compute_loss(y, yhat)

    opt.zero_grad()
    loss.backward()
    opt.step()

for _ in range(1000):
    train_step()

print(w.detach().numpy())

  w = torch.tensor(torch.randn([3, 1]), requires_grad=True)


[[ 4.9943714e+00]
 [-8.7670790e-04]
 [ 3.3216717e+00]]


In [80]:
w

tensor([[ 4.9944e+00],
        [-8.7671e-04],
        [ 3.3217e+00]], requires_grad=True)

In [81]:
w.detach()

tensor([[ 4.9944e+00],
        [-8.7671e-04],
        [ 3.3217e+00]])

In [85]:
w.detach()[1]

tensor([-0.0009])

## Modules

In [94]:
import torch

class Net(torch.nn.Module):

  def __init__(self):
    super().__init__()
    self.a = torch.nn.Parameter(torch.rand(1)) #can be accessed as parameters
    self.b = torch.nn.Parameter(torch.rand(1))

  def forward(self, x):
    yhat = self.a * x + self.b
    return yhat

In [95]:
x = torch.arange(100, dtype=torch.float32)

net = Net()
y = net(x)

In [96]:
for p in net.parameters():
    print(p)

Parameter containing:
tensor([0.0862], requires_grad=True)
Parameter containing:
tensor([0.0406], requires_grad=True)


In [97]:
x = torch.arange(100, dtype=torch.float32) / 100
y = 5 * x + 3 + torch.rand(100) * 0.3

# lets optimize parametres

In [98]:
criterion = torch.nn.MSELoss()
optimizer = torch.optim.SGD(net.parameters(), lr=0.01)

for i in range(10000):
  net.zero_grad()
  yhat = net(x)
  loss = criterion(yhat, y)
  loss.backward()
  optimizer.step()

print(net.a, net.b) # Should be close to 5 and 3

Parameter containing:
tensor([5.0489], requires_grad=True) Parameter containing:
tensor([3.1260], requires_grad=True)


In [99]:
x

tensor([0.0000, 0.0100, 0.0200, 0.0300, 0.0400, 0.0500, 0.0600, 0.0700, 0.0800,
        0.0900, 0.1000, 0.1100, 0.1200, 0.1300, 0.1400, 0.1500, 0.1600, 0.1700,
        0.1800, 0.1900, 0.2000, 0.2100, 0.2200, 0.2300, 0.2400, 0.2500, 0.2600,
        0.2700, 0.2800, 0.2900, 0.3000, 0.3100, 0.3200, 0.3300, 0.3400, 0.3500,
        0.3600, 0.3700, 0.3800, 0.3900, 0.4000, 0.4100, 0.4200, 0.4300, 0.4400,
        0.4500, 0.4600, 0.4700, 0.4800, 0.4900, 0.5000, 0.5100, 0.5200, 0.5300,
        0.5400, 0.5500, 0.5600, 0.5700, 0.5800, 0.5900, 0.6000, 0.6100, 0.6200,
        0.6300, 0.6400, 0.6500, 0.6600, 0.6700, 0.6800, 0.6900, 0.7000, 0.7100,
        0.7200, 0.7300, 0.7400, 0.7500, 0.7600, 0.7700, 0.7800, 0.7900, 0.8000,
        0.8100, 0.8200, 0.8300, 0.8400, 0.8500, 0.8600, 0.8700, 0.8800, 0.8900,
        0.9000, 0.9100, 0.9200, 0.9300, 0.9400, 0.9500, 0.9600, 0.9700, 0.9800,
        0.9900])

In [106]:
x.unsqueeze(0)

tensor([[0.0000, 0.0100, 0.0200, 0.0300, 0.0400, 0.0500, 0.0600, 0.0700, 0.0800,
         0.0900, 0.1000, 0.1100, 0.1200, 0.1300, 0.1400, 0.1500, 0.1600, 0.1700,
         0.1800, 0.1900, 0.2000, 0.2100, 0.2200, 0.2300, 0.2400, 0.2500, 0.2600,
         0.2700, 0.2800, 0.2900, 0.3000, 0.3100, 0.3200, 0.3300, 0.3400, 0.3500,
         0.3600, 0.3700, 0.3800, 0.3900, 0.4000, 0.4100, 0.4200, 0.4300, 0.4400,
         0.4500, 0.4600, 0.4700, 0.4800, 0.4900, 0.5000, 0.5100, 0.5200, 0.5300,
         0.5400, 0.5500, 0.5600, 0.5700, 0.5800, 0.5900, 0.6000, 0.6100, 0.6200,
         0.6300, 0.6400, 0.6500, 0.6600, 0.6700, 0.6800, 0.6900, 0.7000, 0.7100,
         0.7200, 0.7300, 0.7400, 0.7500, 0.7600, 0.7700, 0.7800, 0.7900, 0.8000,
         0.8100, 0.8200, 0.8300, 0.8400, 0.8500, 0.8600, 0.8700, 0.8800, 0.8900,
         0.9000, 0.9100, 0.9200, 0.9300, 0.9400, 0.9500, 0.9600, 0.9700, 0.9800,
         0.9900]])

In [109]:
x.unsqueeze(1)

tensor([[0.0000],
        [0.0100],
        [0.0200],
        [0.0300],
        [0.0400],
        [0.0500],
        [0.0600],
        [0.0700],
        [0.0800],
        [0.0900],
        [0.1000],
        [0.1100],
        [0.1200],
        [0.1300],
        [0.1400],
        [0.1500],
        [0.1600],
        [0.1700],
        [0.1800],
        [0.1900],
        [0.2000],
        [0.2100],
        [0.2200],
        [0.2300],
        [0.2400],
        [0.2500],
        [0.2600],
        [0.2700],
        [0.2800],
        [0.2900],
        [0.3000],
        [0.3100],
        [0.3200],
        [0.3300],
        [0.3400],
        [0.3500],
        [0.3600],
        [0.3700],
        [0.3800],
        [0.3900],
        [0.4000],
        [0.4100],
        [0.4200],
        [0.4300],
        [0.4400],
        [0.4500],
        [0.4600],
        [0.4700],
        [0.4800],
        [0.4900],
        [0.5000],
        [0.5100],
        [0.5200],
        [0.5300],
        [0.5400],
        [0

In [110]:
class Net(torch.nn.Module):

  def __init__(self):
    super().__init__()
    self.linear = torch.nn.Linear(1, 1)

  def forward(self, x):
    yhat = self.linear(x.unsqueeze(1)).squeeze(1)
    return yhat

In [111]:
net = Net()
for p in net.parameters():
    print(p)

Parameter containing:
tensor([[0.3907]], requires_grad=True)
Parameter containing:
tensor([0.7819], requires_grad=True)


In [112]:
model = torch.nn.Sequential(
    torch.nn.Linear(64, 32),
    torch.nn.ReLU(),
    torch.nn.Linear(32, 10),
)

In [113]:
model

Sequential(
  (0): Linear(in_features=64, out_features=32, bias=True)
  (1): ReLU()
  (2): Linear(in_features=32, out_features=10, bias=True)
)

## Broadcasting

In [117]:
import torch

a = torch.tensor([[1., 2.], [3., 4.]])
b = torch.tensor([[1.], [2.]])
c1 = a + b.repeat([1, 2])
c = a + b

In [118]:
c

tensor([[2., 3.],
        [5., 6.]])

In [119]:
c1

tensor([[2., 3.],
        [5., 6.]])

In [122]:
import torch.nn as nn

# Define a linear layer with 10 input features and 5 output features
linear_layer = nn.Linear(10, 5)

# Example input tensor (e.g., a single data point with 10 features)
input_tensor = torch.randn(1, 10)

# Applying the linear layer
output_tensor = linear_layer(input_tensor)


In [123]:
input_tensor

tensor([[-0.2088,  0.9938, -0.4504,  1.0474,  1.5581,  1.5831,  1.6051, -0.6723,
         -0.1007,  2.0560]])

In [124]:
linear_layer

Linear(in_features=10, out_features=5, bias=True)

In [125]:
output_tensor

tensor([[-1.6648, -0.5032,  1.1686,  0.2401, -0.3479]],
       grad_fn=<AddmmBackward0>)

In [139]:
a = torch.rand([5, 3, 5])
b = torch.rand([5, 1, 6])

linear = torch.nn.Linear(11, 10)

# concat a and b and apply nonlinearity
tiled_b = b.repeat([1, 3, 1])

In [140]:
b

tensor([[[0.2791, 0.1036, 0.9933, 0.2836, 0.8271, 0.9139]],

        [[0.6310, 0.0739, 0.6931, 0.1218, 0.4810, 0.8365]],

        [[0.4930, 0.1933, 0.4766, 0.4366, 0.1786, 0.4227]],

        [[0.0317, 0.1791, 0.8637, 0.7781, 0.8859, 0.2716]],

        [[0.4337, 0.0602, 0.6332, 0.1904, 0.7232, 0.2977]]])

In [141]:
b.shape

torch.Size([5, 1, 6])

In [142]:
b[0]

tensor([[0.2791, 0.1036, 0.9933, 0.2836, 0.8271, 0.9139]])

In [143]:
tiled_b

tensor([[[0.2791, 0.1036, 0.9933, 0.2836, 0.8271, 0.9139],
         [0.2791, 0.1036, 0.9933, 0.2836, 0.8271, 0.9139],
         [0.2791, 0.1036, 0.9933, 0.2836, 0.8271, 0.9139]],

        [[0.6310, 0.0739, 0.6931, 0.1218, 0.4810, 0.8365],
         [0.6310, 0.0739, 0.6931, 0.1218, 0.4810, 0.8365],
         [0.6310, 0.0739, 0.6931, 0.1218, 0.4810, 0.8365]],

        [[0.4930, 0.1933, 0.4766, 0.4366, 0.1786, 0.4227],
         [0.4930, 0.1933, 0.4766, 0.4366, 0.1786, 0.4227],
         [0.4930, 0.1933, 0.4766, 0.4366, 0.1786, 0.4227]],

        [[0.0317, 0.1791, 0.8637, 0.7781, 0.8859, 0.2716],
         [0.0317, 0.1791, 0.8637, 0.7781, 0.8859, 0.2716],
         [0.0317, 0.1791, 0.8637, 0.7781, 0.8859, 0.2716]],

        [[0.4337, 0.0602, 0.6332, 0.1904, 0.7232, 0.2977],
         [0.4337, 0.0602, 0.6332, 0.1904, 0.7232, 0.2977],
         [0.4337, 0.0602, 0.6332, 0.1904, 0.7232, 0.2977]]])

In [146]:
a

tensor([[[0.1106, 0.5821, 0.3970, 0.5055, 0.0507],
         [0.9373, 0.9731, 0.4537, 0.6365, 0.8376],
         [0.1120, 0.9922, 0.2408, 0.6086, 0.2000]],

        [[0.9817, 0.7266, 0.7363, 0.4193, 0.5483],
         [0.0100, 0.8953, 0.6534, 0.2833, 0.8950],
         [0.9196, 0.6958, 0.5075, 0.1058, 0.1247]],

        [[0.4926, 0.3604, 0.8470, 0.3826, 0.9909],
         [0.7365, 0.7955, 0.9966, 0.5950, 0.5847],
         [0.1638, 0.8862, 0.9742, 0.6937, 0.6358]],

        [[0.4838, 0.6866, 0.8910, 0.8888, 0.9326],
         [0.5920, 0.9253, 0.0351, 0.7451, 0.8309],
         [0.1682, 0.5924, 0.2854, 0.4372, 0.0248]],

        [[0.7940, 0.9118, 0.0298, 0.8933, 0.3513],
         [0.9799, 0.4836, 0.7677, 0.7462, 0.3504],
         [0.5648, 0.0758, 0.2294, 0.9026, 0.0443]]])

In [152]:
c = torch.cat([a, tiled_b], 2)

In [153]:
a.shape, tiled_b.shape, c.shape

(torch.Size([5, 3, 5]), torch.Size([5, 3, 6]), torch.Size([5, 3, 11]))

In [154]:
c

tensor([[[0.1106, 0.5821, 0.3970, 0.5055, 0.0507, 0.2791, 0.1036, 0.9933,
          0.2836, 0.8271, 0.9139],
         [0.9373, 0.9731, 0.4537, 0.6365, 0.8376, 0.2791, 0.1036, 0.9933,
          0.2836, 0.8271, 0.9139],
         [0.1120, 0.9922, 0.2408, 0.6086, 0.2000, 0.2791, 0.1036, 0.9933,
          0.2836, 0.8271, 0.9139]],

        [[0.9817, 0.7266, 0.7363, 0.4193, 0.5483, 0.6310, 0.0739, 0.6931,
          0.1218, 0.4810, 0.8365],
         [0.0100, 0.8953, 0.6534, 0.2833, 0.8950, 0.6310, 0.0739, 0.6931,
          0.1218, 0.4810, 0.8365],
         [0.9196, 0.6958, 0.5075, 0.1058, 0.1247, 0.6310, 0.0739, 0.6931,
          0.1218, 0.4810, 0.8365]],

        [[0.4926, 0.3604, 0.8470, 0.3826, 0.9909, 0.4930, 0.1933, 0.4766,
          0.4366, 0.1786, 0.4227],
         [0.7365, 0.7955, 0.9966, 0.5950, 0.5847, 0.4930, 0.1933, 0.4766,
          0.4366, 0.1786, 0.4227],
         [0.1638, 0.8862, 0.9742, 0.6937, 0.6358, 0.4930, 0.1933, 0.4766,
          0.4366, 0.1786, 0.4227]],

        [[0.4

In [155]:
d = torch.nn.functional.relu(linear(c))

print(d.shape) 

torch.Size([5, 3, 10])


In [156]:
a = torch.rand([5, 3, 5])
b = torch.rand([5, 1, 6])

linear1 = torch.nn.Linear(5, 10)
linear2 = torch.nn.Linear(6, 10)

pa = linear1(a)
pb = linear2(b)
d = torch.nn.functional.relu(pa + pb)

print(d.shape)

torch.Size([5, 3, 10])


In [157]:
a = torch.tensor([[1.], [2.]])
b = torch.tensor([1., 2.])
c = torch.sum(a + b)

print(c)

tensor(12.)


In [161]:
a = torch.tensor([[1.], [2.]])
b = torch.tensor([1., 2.])
c = torch.sum(a + b, 0)

print(c)

tensor([5., 7.])


In [164]:
a = torch.tensor([[1.], [2.]])
b = torch.tensor([[1., 2.], [1., 2.]])
torch.sum(a + b, 1)


tensor([5., 7.])

In [165]:
[1, 1
 2, 2
 1, 2
 1, 2]

SyntaxError: invalid syntax. Perhaps you forgot a comma? (435586646.py, line 1)