In [54]:
import os 
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

In [55]:
# Run on the GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using {device} device")

Using cpu device


In [56]:
class NeuralNetwork(nn.Module):
    """Fully connected classifier: flatten -> Linear -> ReLU -> Linear -> ReLU -> Linear.

    The defaults reproduce the original fixed architecture (28x28 inputs,
    two hidden layers of 512 units, 10 output classes), so ``NeuralNetwork()``
    builds the same layers with the same parameter names as before.

    Args:
        in_features: number of values per sample after flattening.
        hidden_features: width of each of the two hidden layers.
        num_classes: number of output logits (one per class).
    """

    def __init__(self, in_features=28 * 28, hidden_features=512, num_classes=10):
        super().__init__()
        # Collapses every dimension after the batch dimension into one.
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            # Each output unit owns one weight per input feature plus a single bias:
            #   unit_j = sum_i(W[j, i] * x[i]) + b[j]
            # (the bias is added once per unit, not once per input feature).
            nn.Linear(in_features, hidden_features),
            nn.ReLU(),
            nn.Linear(hidden_features, hidden_features),
            nn.ReLU(),
            nn.Linear(hidden_features, num_classes),
        )

    def forward(self, x):
        """Return raw, unnormalised class scores (logits) for a batch ``x``.

        Invoked through ``model(x)``; ``x`` is flattened from dimension 1
        onwards, so it must carry a leading batch dimension.
        """
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits

The weights and biases of each neuron are trained to be different through the process of backpropagation and optimization during the training of the neural network:

Initialization: Initially, the weights and biases are typically initialized randomly or using specific initialization techniques.

Forward Pass: During the forward pass, the input data is passed through the network, and each neuron computes its output using its current weights and biases.

Loss Calculation: The network's output is compared to the true labels using a loss function, which quantifies the difference between the predicted and actual values.

Backpropagation: The loss is propagated backward through the network. During this process, the gradients of the loss with respect to each weight and bias are computed using the chain rule of calculus.

Optimization: An optimization algorithm (e.g., Stochastic Gradient Descent, Adam) updates the weights and biases using the computed gradients. The update rule typically looks like this:

$$\text{weight} = \text{weight} - \text{learning rate} \times \text{gradient of weight}$$

$$\text{bias} = \text{bias} - \text{learning rate} \times \text{gradient of bias}$$

Iteration: The forward pass, loss calculation, backpropagation, and optimization steps are repeated for many epochs (full passes over the training dataset), gradually adjusting the weights and biases to minimize the loss.

In [57]:
# Instantiate the network, then move its parameters onto the chosen device.
model = NeuralNetwork()
model = model.to(device)
print(model)

NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=10, bias=True)
  )
)


In [58]:
# One random 28x28 "image" with a leading batch dimension of 1.
X = torch.rand((1, 28, 28), device=device)
# Calling the model runs forward() once and yields one raw score per class.
logits = model(X)
logits

tensor([[ 0.0296, -0.0298,  0.0475,  0.1090,  0.0137, -0.0965, -0.1441,  0.1119,
          0.0360,  0.1035]], grad_fn=<AddmmBackward0>)

In [59]:
# Softmax over dim=1 (the class axis) turns the raw logits into probabilities.
pred_probab = nn.functional.softmax(logits, dim=1)
pred_probab

tensor([[0.1008, 0.0950, 0.1026, 0.1092, 0.0992, 0.0889, 0.0847, 0.1095, 0.1015,
         0.1086]], grad_fn=<SoftmaxBackward0>)

In [60]:
# The predicted class is the index holding the highest probability.
y_pred = torch.argmax(pred_probab, dim=1)
print(f"Predicted class: {y_pred}")

Predicted class: tensor([7])


In [61]:
# Layer-by-layer demonstration.
# Sample mini-batch: 3 random images of 28x28 values.
input_image = torch.rand((3, 28, 28))
print(input_image.shape)

torch.Size([3, 28, 28])


In [62]:
# nn.Flatten keeps dim 0 (the mini-batch) and merges the rest,
# so every 28x28 image becomes a row of 784 pixel values.
flatten = nn.Flatten(start_dim=1, end_dim=-1)
flat_image = flatten(input_image)
print(flat_image.shape)

torch.Size([3, 784])


In [63]:
# A linear layer maps each 784-value row to 20 features using its stored weights and bias.
layer1 = nn.Linear(784, 20)
hidden1 = layer1(flat_image)
print(hidden1.shape)

torch.Size([3, 20])


In [64]:
# ReLU activation function.
# It zeroes negative values and passes positive ones through, which adds the
# non-linearity a stack of purely linear layers would otherwise lack.
# The pre-activation is recomputed from layer1 here rather than read back from
# hidden1, so re-running this cell still shows the true "before" values
# instead of the already-rectified tensor left by a previous run.
pre_activation = layer1(flat_image)
print(f"Before ReLU: {pre_activation}\n\n")
hidden1 = nn.ReLU()(pre_activation)
print(f"After ReLU: {hidden1}")
# Every negative pre-activation shows up as 0 below. A unit that outputs 0 for
# every input it ever sees is a "dead" neuron and stops learning; a unit that is
# 0 for only some inputs is merely inactive for those inputs. Many zeros in one
# small batch are not catastrophic, but they are worth watching if the network
# is made deeper.

Before ReLU: tensor([[ 0.2581, -0.1897, -0.2964,  0.6508, -0.1967,  0.4134,  0.2895,  0.1934,
          0.6775,  0.3864, -0.0812,  0.2852, -0.0442, -0.0150, -0.0426, -0.1924,
         -0.5672, -0.4487, -0.0916,  0.2519],
        [ 0.4474, -0.0427,  0.0074,  0.8741,  0.0471, -0.0846,  0.2101,  0.1953,
          0.2255,  0.1774,  0.0940,  0.2141,  0.0608, -0.1953,  0.6072, -0.4226,
         -0.1450, -0.3199,  0.1953, -0.3686],
        [ 0.3435, -0.0528, -0.1857,  0.5968, -0.0433,  0.1711,  0.0752,  0.3432,
          0.2639,  0.0344,  0.0628,  0.4501,  0.1666, -0.2169,  0.5048,  0.0233,
         -0.1420, -0.6633, -0.0027, -0.1372]], grad_fn=<AddmmBackward0>)


After ReLU: tensor([[0.2581, 0.0000, 0.0000, 0.6508, 0.0000, 0.4134, 0.2895, 0.1934, 0.6775,
         0.3864, 0.0000, 0.2852, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.2519],
        [0.4474, 0.0000, 0.0074, 0.8741, 0.0471, 0.0000, 0.2101, 0.1953, 0.2255,
         0.1774, 0.0940, 0.2141, 0.0608, 0.0000, 0.60

In [65]:
# nn.Sequential is an ordered container: the input flows through the modules
# in exactly the order they are listed, which makes it a quick way to
# assemble a small network.
seq_modules = nn.Sequential(
    flatten,
    layer1,
    nn.ReLU(),
    nn.Linear(in_features=20, out_features=10),
)
input_image = torch.rand((3, 28, 28))
logits = seq_modules(input_image)

The last layer of the network returns logits: raw scores in the range (-inf, inf).

We pass the logits to the softmax function, which rescales them to values in the range [0, 1] that sum to 1.

These values represent the model's predicted probability for each class.

In [66]:
# dim selects the axis along which the softmax outputs must sum to 1.
# With dim=1 that is the second axis of logits, so each row sums to 1.
softmax = nn.Softmax(dim=1)
pred_probab = softmax(logits)

Most layers in a neural network are parameterised, i.e. they have weights and biases that are optimised during training.
Subclassing nn.Module automatically tracks every parameter defined inside the model object, so you can access them with parameters() or named_parameters().

This example iterates over every parameter and prints its size and a preview of its values.

In [67]:
# Print the module tree, then walk every learnable parameter.
print(f"Model structure: {model}\n\n")

# named_parameters() yields (name, tensor) pairs; param[:2] shows only the
# first two entries along dim 0 as a short preview of the values.
for name, param in model.named_parameters():
    print(f"Layer: {name} | Size: {param.size()} | Values : {param[:2]} \n")

Model structure: NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=10, bias=True)
  )
)


Layer: linear_relu_stack.0.weight | Size: torch.Size([512, 784]) | Values : tensor([[ 0.0185,  0.0333, -0.0154,  ..., -0.0127,  0.0203, -0.0217],
        [-0.0047,  0.0294, -0.0304,  ..., -0.0239, -0.0148, -0.0238]],
       grad_fn=<SliceBackward0>) 

Layer: linear_relu_stack.0.bias | Size: torch.Size([512]) | Values : tensor([0.0161, 0.0330], grad_fn=<SliceBackward0>) 

Layer: linear_relu_stack.2.weight | Size: torch.Size([512, 512]) | Values : tensor([[-0.0385, -0.0351,  0.0363,  ...,  0.0411, -0.0147,  0.0365],
        [-0.0293,  0.0306, -0.0183,  ...,  0.0421, -0.0337, -0.0442]],
       grad_fn=<SliceBackward0>) 

Layer: linear_relu_stack.2.bias | Si