# Training Our First Neural Network with PyTorch

## Forward pass
Generating a prediction from a model by running input data through its layers.

Output can be
* Single probability
* Multiclass probability
* Numerical output (regression)

Binary classifier example

In [7]:
import torch
import torch.nn as nn

# A single sample with eight features. The float literals give a float32
# tensor, the same dtype torch.Tensor([...]) produces.
input_tensor = torch.tensor([[3., 4., 6., 2., 3., 6., 8., 9.]])

# Binary classifier: one linear layer maps the 8 features to a single score,
# and the sigmoid squashes that score into a probability between 0 and 1.
layers = [nn.Linear(in_features=8, out_features=1), nn.Sigmoid()]
model = nn.Sequential(*layers)

# Forward pass. The weights are randomly initialised, so the printed value
# differs from run to run.
output = model(input_tensor)
print(output)

tensor([[0.8924]], grad_fn=<SigmoidBackward0>)


Linear regression example

In [8]:
import torch
import torch.nn as nn

# A single sample with eleven features (float32).
input_tensor = torch.tensor([[3., 4., 6., 7., 10., 12., 2., 3., 6., 8., 9.]])

# Regression network: a stack of linear layers narrowing 11 -> 20 -> 12 -> 6 -> 1.
# No activation follows the last layer, so the single output is an unbounded
# real number rather than a probability.
layer_widths = [11, 20, 12, 6, 1]
model = nn.Sequential(
    *(nn.Linear(n_in, n_out) for n_in, n_out in zip(layer_widths, layer_widths[1:]))
)

# Forward pass; the value depends on the random weight initialisation.
output = model(input_tensor)
print(output)

tensor([[-0.4080]], grad_fn=<AddmmBackward0>)


Multiclass classification

In [9]:
import torch
import torch.nn as nn

# A single sample with eleven features (float32).
input_tensor = torch.tensor([[3., 4., 6., 7., 10., 12., 2., 3., 6., 8., 9.]])

# Multi-class classifier with four labels: the last linear layer emits one
# score per class and the softmax over the last dimension turns the scores
# into probabilities that sum to 1.
n_classes = 4
hidden_layers = [nn.Linear(11, 20), nn.Linear(20, 12), nn.Linear(12, 6)]
model = nn.Sequential(
    *hidden_layers,
    nn.Linear(6, n_classes),
    nn.Softmax(dim=-1),
)

# Forward pass; the probabilities depend on the random weight initialisation.
output = model(input_tensor)
print(output)

tensor([[0.1614, 0.2548, 0.2402, 0.3435]], grad_fn=<SoftmaxBackward0>)


## Loss function
Assess the difference between the actual value and the predicted value.

Takes 
* prediction ŷ, a tensor with the same dimension as the number of classes (the softmax output)
* actual value y, a single class label (an integer such as 0 or 2)
* outputs a float that has to be minimized

loss = F(y, ŷ) 

During comparison: transform y using one-hot encoding, to get a tensor of the same dimension as ŷ.

In [11]:
import torch.nn.functional as F

# One-hot encode class index 0 when there are three possible classes.
class_index = torch.tensor(0)
F.one_hot(class_index, num_classes=3)

tensor([1, 0, 0])

### Cross entropy loss function

In [29]:
from torch.nn import CrossEntropyLoss

# Raw model outputs for one sample, i.e. the scores before any softmax.
scores = torch.tensor([[-0.122, 0.105]])
# Ground-truth label in one-hot form: the sample belongs to class 0.
one_hot_target = torch.tensor([[1., 0.]])

criterion = CrossEntropyLoss()
# The result is a scalar tensor: the loss of this sample, which training minimises.
criterion(scores.to(torch.float64), one_hot_target.to(torch.float64))

tensor(0.8131, dtype=torch.float64)

Example

In [16]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Ground-truth class index and the raw scores for one sample over four classes.
y = [2]
scores = torch.tensor([[0.1, 6.0, -2.0, 3.2]])

# One-hot encode the label with as many entries as there are score columns.
n_classes = scores.size(1)
one_hot_label = F.one_hot(torch.tensor(y), num_classes=n_classes)

# Cross-entropy loss function.
criterion = nn.CrossEntropyLoss()

# The loss is large because the true class (index 2) has the lowest score.
loss = criterion(scores.to(torch.float64), one_hot_label.to(torch.float64))
print(loss)

tensor(8.0619, dtype=torch.float64)


## Derivatives

Derivative = slope = gradient

When the model is created, its weights are initialized randomly.
After a forward pass, compute the loss and then its gradients with respect to the weights using backpropagation (the backward pass). These gradients are used to update the model.

In [42]:
import torch
import torch.nn as nn
import numpy as np

# One sample whose sixteen features are simply the values 1..16 (float32).
sample = torch.arange(1, 17, dtype=torch.float32).unsqueeze(0)
# Ground-truth label in one-hot form: the sample belongs to class 0 of 2.
target = torch.tensor([[1., 0.]])

# Three stacked linear layers narrowing 16 -> 8 -> 4 -> 2, one raw score per class.
model = nn.Sequential(
    nn.Linear(in_features=16, out_features=8),
    nn.Linear(in_features=8, out_features=4),
    nn.Linear(in_features=4, out_features=2),
)

# Forward pass with the randomly initialised weights.
prediction = model(sample)

In [43]:
# Cross-entropy between the model's raw scores and the ground-truth label.
criterion = nn.CrossEntropyLoss()
loss = criterion(input=prediction, target=target)

# Backward pass: fills the .grad attribute of every model parameter.
loss.backward()

In [46]:
# Gradients of the loss with respect to the first layer's weights and biases.
first_layer = model[0]
(first_layer.weight.grad, first_layer.bias.grad)

(tensor([[-3.0243e-02, -6.0486e-02, -9.0729e-02, -1.2097e-01, -1.5121e-01,
          -1.8146e-01, -2.1170e-01, -2.4194e-01, -2.7219e-01, -3.0243e-01,
          -3.3267e-01, -3.6291e-01, -3.9316e-01, -4.2340e-01, -4.5364e-01,
          -4.8389e-01],
         [ 6.0826e-02,  1.2165e-01,  1.8248e-01,  2.4330e-01,  3.0413e-01,
           3.6496e-01,  4.2578e-01,  4.8661e-01,  5.4744e-01,  6.0826e-01,
           6.6909e-01,  7.2991e-01,  7.9074e-01,  8.5157e-01,  9.1239e-01,
           9.7322e-01],
         [-7.1586e-03, -1.4317e-02, -2.1476e-02, -2.8634e-02, -3.5793e-02,
          -4.2952e-02, -5.0110e-02, -5.7269e-02, -6.4427e-02, -7.1586e-02,
          -7.8744e-02, -8.5903e-02, -9.3062e-02, -1.0022e-01, -1.0738e-01,
          -1.1454e-01],
         [ 5.3040e-02,  1.0608e-01,  1.5912e-01,  2.1216e-01,  2.6520e-01,
           3.1824e-01,  3.7128e-01,  4.2432e-01,  4.7736e-01,  5.3040e-01,
           5.8344e-01,  6.3648e-01,  6.8952e-01,  7.4256e-01,  7.9560e-01,
           8.4864e-01],
    

Update model parameters by hand

In [31]:
# learning rate (small)
lr = 0.001

# Gradient-descent step on the first layer: value <- value - lr * gradient.
# Parameters and their gradients (the gradients exist only after loss.backward()).
weight = model[0].weight
weight_grad = weight.grad
bias = model[0].bias
bias_grad = bias.grad

# The update is done in place so that the model's own parameters change.
# Writing `weight = weight - lr * weight_grad` would only bind the name to a
# new tensor and leave the model untouched. torch.no_grad() keeps the update
# itself out of the autograd graph, which is required for in-place changes
# to parameters that track gradients.
with torch.no_grad():
    # Update weights
    weight -= lr * weight_grad
    # update biases
    bias -= lr * bias_grad

### Gradient descent

Convex functions have a single global minimum. Non-convex functions may have several local minima. Loss functions used in neural networks are non-convex. The iterative method used to search for a good (low-loss) minimum is called gradient descent; in PyTorch it is implemented by optimizers.

In [32]:
import torch.optim as optim

# Stochastic gradient descent over every parameter of the model.
optimizer = optim.SGD(params=model.parameters(), lr=1e-3)

# Let the optimizer update all parameters from their current gradients.
optimizer.step()

## First training loop

* Create a model
* Choose a loss function
* Create a dataset
* Define optimizer
* Run a training loop
  * Calculate loss
  * Calculate local gradients
  * update model params
 
Example:
Data science salary dataset
* The target is the salary, a continuous quantity -> regression problem -> the last layer is a linear layer
* Use mean squared error (MSE) as the loss function, as in linear regression

In [51]:
from torch.utils.data import DataLoader, TensorDataset

# create the model: 4 input features -> 2 hidden units -> 1 regression output
model = nn.Sequential(
    nn.Linear(4, 2),
    nn.Linear(2, 1)
)

# create dataset
# NOTE(review): `features` is not defined anywhere in the visible notebook, and
# the only visible `target` is the one-hot demo label from the derivatives
# section. Both must be loaded from the salary dataset before this cell runs.
# Presumably `features` has 4 columns to match nn.Linear(4, 2) - TODO confirm.
dataset = TensorDataset(
            torch.tensor(features).float(),
            torch.tensor(target).float()
        )
# Batches of 4 samples, reshuffled on every pass over the data.
dataloader = DataLoader(dataset, batch_size=4, shuffle=True)

# create loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.001)

NameError: name 'features' is not defined

In [None]:
# Number of full passes over the dataset. This name was not defined anywhere
# in the notebook, so a value is set here; adjust as needed.
num_epochs = 10

# the training loop
for epoch in range(num_epochs):
    for data in dataloader:
        # set gradients to 0 so they do not accumulate across batches
        optimizer.zero_grad()
        # get feature and target
        feature, target = data
        # forward pass
        pred = model(feature)
        # compute loss and gradients
        # The target is reshaped to the prediction's shape first: if the targets
        # are stored as a 1-D vector (not known from this notebook), MSELoss
        # would broadcast them against the (batch, 1) predictions and compute
        # the wrong loss. When the shapes already match this is a no-op.
        loss = criterion(pred, target.reshape(pred.shape))
        loss.backward()
        # update model params
        optimizer.step()

Calculating the loss using numpy and pytorch
* using MSELoss (the squared L2 loss; an alternative is the L1 loss, i.e. mean absolute error)

In [None]:
# A single prediction and its true value.
y_hat = np.array(10)
y = np.array(1)

# Mean squared error with NumPy: mean of the squared difference.
mse_numpy = np.mean((y_hat - y) ** 2)

# The same loss with PyTorch's built-in loss function.
criterion = nn.MSELoss()

# MSELoss needs floating-point tensors, so both values are created as float32.
prediction_tensor = torch.tensor(y_hat, dtype=torch.float32)
target_tensor = torch.tensor(y, dtype=torch.float32)
mse_pytorch = criterion(prediction_tensor, target_tensor)
print(mse_pytorch)