In [1]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

In [2]:
# Choose the compute backend in order of preference: CUDA, then Apple's MPS,
# and finally the CPU when neither accelerator reports itself as available.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")

Using mps device


In [3]:
class NeuralNetwork(nn.Module):
    """Fully connected classifier: flatten, two hidden ReLU layers, linear head.

    The default arguments reproduce the original fixed architecture
    (28*28 -> 512 -> 512 -> 10), so ``NeuralNetwork()`` builds the same
    modules with the same parameter names as before.

    Args:
        in_features: Number of values per sample once the input is flattened.
        hidden_features: Width of each of the two hidden linear layers.
        num_classes: Number of logits produced per sample.
    """

    def __init__(self, in_features=28 * 28, hidden_features=512, num_classes=10):
        super().__init__()
        # nn.Flatten's defaults keep dim 0 and merge every remaining dimension.
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(in_features, hidden_features),
            nn.ReLU(),
            nn.Linear(hidden_features, hidden_features),
            nn.ReLU(),
            nn.Linear(hidden_features, num_classes),
        )

    def forward(self, x):
        """Flatten ``x`` per sample and return the raw (un-normalised) logits.

        Args:
            x: Tensor whose leading dimension indexes samples and whose
                remaining dimensions multiply out to ``in_features``.

        Returns:
            Tensor with one row of ``num_classes`` logits per sample; no
            softmax is applied here.
        """
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits

In [4]:
# Build the network, then move its parameters onto the selected device.
# nn.Module.to() modifies the module in place and returns the same object.
model = NeuralNetwork()
model = model.to(device)
print(model)

NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=10, bias=True)
  )
)


In [5]:
# Run one random (1, 28, 28) tensor through the model, turn the logits into
# per-class probabilities along dim 1, and report the most likely class index.
X = torch.rand((1, 28, 28), device=device)
logits = model(X)
pred_probab = torch.softmax(logits, dim=1)
y_pred = torch.argmax(pred_probab, dim=1)
print(f"Predicted class: {y_pred}")

Predicted class: tensor([3], device='mps:0')


In [6]:
# A mini-batch of three random 28x28 tensors used to walk through the layers.
input_image = torch.rand((3, 28, 28))
print(input_image.shape)

torch.Size([3, 28, 28])


In [7]:
# nn.Flatten keeps the first dimension and merges the rest,
# so the (3, 28, 28) batch becomes (3, 784).
flatten = nn.Flatten(start_dim=1, end_dim=-1)
flat_image = flatten(input_image)
print(flat_image.shape)

torch.Size([3, 784])


In [8]:
# A single linear layer that maps each flattened 784-value row to 20 features.
layer1 = nn.Linear(28 * 28, 20)
hidden1 = layer1(flat_image)
print(hidden1.shape)

torch.Size([3, 20])


In [9]:
# Show the activations before and after ReLU, which clamps negatives to zero.
# NOTE: hidden1 is rebound to the activated tensor, so re-running this cell
# alone would print already-rectified values under "Before ReLU".
print(f"Before ReLU: {hidden1}\n\n")
hidden1 = torch.relu(hidden1)
print(f"After ReLU: {hidden1}")

Before ReLU: tensor([[-0.6791,  0.5234, -0.3026, -0.5764, -0.4069, -0.2916, -1.1230, -0.3832,
         -0.2399, -0.1829,  0.1310, -0.8418,  0.2029, -0.2639,  0.5392, -0.0893,
         -0.0281, -0.0976, -0.0612, -0.8858],
        [-0.2225,  0.4426, -0.3271, -0.0850, -0.3785,  0.2563, -1.2436, -0.4439,
         -0.4035, -0.2912, -0.2403, -0.8931,  0.0549, -0.0542,  0.4632, -0.3807,
          0.0587, -0.2009, -0.3668, -0.5422],
        [-0.1711,  0.8223,  0.0136,  0.3530, -0.2582, -0.4468, -1.0489, -0.3348,
         -0.4758, -0.0736,  0.0798, -0.8588,  0.1293, -0.1687,  0.4663, -0.1518,
          0.0052, -0.3246, -0.2947, -0.7349]], grad_fn=<AddmmBackward0>)


After ReLU: tensor([[0.0000, 0.5234, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.1310, 0.0000, 0.2029, 0.0000, 0.5392, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000],
        [0.0000, 0.4426, 0.0000, 0.0000, 0.0000, 0.2563, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0549, 0.0000, 0.46

In [10]:
# Chain the existing flatten and layer1 modules with a ReLU and a new
# 20 -> 10 linear layer; nn.Sequential feeds each module's output to the next.
seq_modules = nn.Sequential(
    flatten,
    layer1,
    nn.ReLU(),
    nn.Linear(in_features=20, out_features=10),
)
# NOTE: input_image and logits are rebound here, replacing the values that
# earlier cells assigned to the same names.
input_image = torch.rand((3, 28, 28))
logits = seq_modules(input_image)

In [11]:
# Normalise the logits along dim 1 so each row sums to 1.
# `logits` is whatever the most recently executed cell bound to that name.
softmax = nn.Softmax(dim=1)
pred_probab = softmax(logits)

In [12]:
# Print the module tree, then walk every parameter registered on the model.
print(f"Model structure: {model}\n\n")

# param[:2] slices along the first dimension, so only the first two
# rows (weights) or entries (biases) of each parameter are shown.
for name, param in model.named_parameters():
    print(f"Layer: {name} | Size: {param.size()} | Values : {param[:2]} \n")

Model structure: NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=10, bias=True)
  )
)


Layer: linear_relu_stack.0.weight | Size: torch.Size([512, 784]) | Values : tensor([[ 0.0143, -0.0335,  0.0059,  ...,  0.0160,  0.0163,  0.0035],
        [ 0.0282, -0.0273,  0.0251,  ..., -0.0245, -0.0354, -0.0164]],
       device='mps:0', grad_fn=<SliceBackward0>) 

Layer: linear_relu_stack.0.bias | Size: torch.Size([512]) | Values : tensor([0.0302, 0.0096], device='mps:0', grad_fn=<SliceBackward0>) 

Layer: linear_relu_stack.2.weight | Size: torch.Size([512, 512]) | Values : tensor([[ 0.0332,  0.0025, -0.0089,  ..., -0.0319, -0.0199, -0.0200],
        [ 0.0196,  0.0303, -0.0428,  ..., -0.0080, -0.0112,  0.0247]],
       device='mps:0', grad_fn=<SliceBa

In [13]:
import torch

# A one-layer computation used to demonstrate autograd.
x = torch.ones(5)   # input tensor
y = torch.zeros(3)  # expected output
# w and b are created with requires_grad=True so autograd tracks them.
w = torch.randn((5, 3), requires_grad=True)
b = torch.randn((3,), requires_grad=True)
z = x @ w + b
loss = torch.nn.functional.binary_cross_entropy_with_logits(z, y)

In [19]:
# Display x, the all-ones input vector of length 5.
x

tensor([1., 1., 1., 1., 1.])

In [20]:
# Display y, the all-zeros expected output of length 3.
y

tensor([0., 0., 0.])

In [15]:
# Display z, the result of torch.matmul(x, w) + b; it carries a grad_fn
# because w and b were created with requires_grad=True.
z

tensor([ 1.7029,  1.4576, -0.6328], grad_fn=<AddBackward0>)

In [21]:
# Display w, the 5x3 weight tensor created with requires_grad=True.
w

tensor([[ 0.6624, -0.0310, -1.6024],
        [-0.2458, -0.2621, -0.7574],
        [ 0.1121,  0.9590, -0.0176],
        [-0.2341, -0.0659,  0.0900],
        [ 1.0663,  0.4067,  0.6415]], requires_grad=True)

In [22]:
# Display b, the length-3 bias tensor created with requires_grad=True.
b

tensor([0.3421, 0.4509, 1.0132], requires_grad=True)

In [23]:
# Display z again.
# NOTE(review): the recorded output has no grad_fn, which indicates this cell
# was executed after the torch.no_grad() cell rebound z. On a fresh
# top-to-bottom run z would still show grad_fn=<AddBackward0> at this point.
z

tensor([ 1.7029,  1.4576, -0.6328])

In [14]:
# Display the scalar loss from binary_cross_entropy_with_logits(z, y).
loss

tensor(1.3211, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)

In [16]:
# Each tensor produced by a tracked operation records, in grad_fn, the
# backward function that will be used to compute its gradient.
print(
    f"Gradient function for z = {z.grad_fn}",
    f"Gradient function for loss = {loss.grad_fn}",
    sep="\n",
)

Gradient function for z = <AddBackward0 object at 0x12d72b9a0>
Gradient function for loss = <BinaryCrossEntropyWithLogitsBackward0 object at 0x12d72bf10>


In [17]:
# Backpropagate from the scalar loss; the gradients are stored on the
# .grad attribute of the leaf tensors created with requires_grad=True.
# NOTE: backward() is called without retain_graph, so this cell is meant to
# be run once per forward pass.
loss.backward()
print(w.grad, b.grad, sep="\n")

tensor([[0.2820, 0.2704, 0.1156],
        [0.2820, 0.2704, 0.1156],
        [0.2820, 0.2704, 0.1156],
        [0.2820, 0.2704, 0.1156],
        [0.2820, 0.2704, 0.1156]])
tensor([0.2820, 0.2704, 0.1156])


In [18]:
# Computed normally, z is attached to the autograd graph.
z = x @ w + b
print(z.requires_grad)

# Inside torch.no_grad() the same expression is evaluated without tracking.
# NOTE: this rebinds z to an untracked tensor for all cells executed afterwards.
with torch.no_grad():
    z = x @ w + b
print(z.requires_grad)

True
False


In [24]:
# Demonstrates that backward() accumulates into .grad rather than overwriting.
inp = torch.eye(4, 5, requires_grad=True)
out = ((inp + 1) ** 2).t()

# out is not a scalar, so an explicit gradient of matching shape is supplied;
# retain_graph=True keeps the graph alive for the repeated backward calls.
out.backward(gradient=torch.ones_like(out), retain_graph=True)
print(f"First call\n{inp.grad}")

# A second pass adds to the stored gradient, doubling every entry.
out.backward(gradient=torch.ones_like(out), retain_graph=True)
print(f"\nSecond call\n{inp.grad}")

# Zeroing .grad in place first restores the single-pass values.
inp.grad.zero_()
out.backward(gradient=torch.ones_like(out), retain_graph=True)
print(f"\nCall after zeroing gradients\n{inp.grad}")

First call
tensor([[4., 2., 2., 2., 2.],
        [2., 4., 2., 2., 2.],
        [2., 2., 4., 2., 2.],
        [2., 2., 2., 4., 2.]])

Second call
tensor([[8., 4., 4., 4., 4.],
        [4., 8., 4., 4., 4.],
        [4., 4., 8., 4., 4.],
        [4., 4., 4., 8., 4.]])

Call after zeroing gradients
tensor([[4., 2., 2., 2., 2.],
        [2., 4., 2., 2., 2.],
        [2., 2., 4., 2., 2.],
        [2., 2., 2., 4., 2.]])
