# Training our Neural Net

First, prepare training data.

In [33]:
import torch
from torch import nn
import torch.nn.functional as F
from torchvision import datasets, transforms

# Preprocessing pipeline: turn each image into a tensor, then normalize its
# single channel with mean 0.5 and standard deviation 0.5.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,)),
])

# Fetch the MNIST training split (downloaded if not already present) and wrap
# it in a loader that yields shuffled mini-batches of 64 samples.
trainset = datasets.MNIST(
    root='~/.pytorch/MNIST_data/',
    train=True,
    download=True,
    transform=transform,
)
trainloader = torch.utils.data.DataLoader(dataset=trainset, batch_size=64, shuffle=True)
print(trainloader)

<torch.utils.data.dataloader.DataLoader object at 0x0000018D0A75B370>


## Backpropagation

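**TODO (verify):** not sure if this is correct. Sanity check: for `z = mean(x**2)` over 9 elements the analytic gradient is `2*x/9`, which the cell below prints next to `x.grad` for comparison.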

In [34]:
# Leaf tensor for which autograd will track gradients
x = torch.randn(3, 3, requires_grad=True)
print("x:", x)

# Element-wise square. y is an intermediate (non-leaf) tensor, so its gradient
# is only kept after backward() because retain_grad() is requested here.
y = x.pow(2)
y.retain_grad()
print("y:", y)

# grad_fn shows the function that generated this tensor
print("y.grad_fn:", y.grad_fn)

# Reduce to a scalar so backward() can be called without arguments
z = torch.mean(y)
print("z:", z)

# Backpropagate and compare autograd's result with the analytic gradient:
# z = sum(x**2) / 9, hence dz/dx = 2*x/9 and dz/dy = 1/9 for every element.
z.backward()
print("x.grad:", x.grad)
print("2*x/9:", 2*x/9)
print("y.grad:", y.grad)

x: tensor([[-0.9185, -0.6129,  0.0382],
        [-0.5686,  0.0330,  0.1796],
        [ 1.7340,  0.5142, -0.7649]], requires_grad=True)
y: tensor([[8.4366e-01, 3.7560e-01, 1.4577e-03],
        [3.2326e-01, 1.0900e-03, 3.2247e-02],
        [3.0069e+00, 2.6440e-01, 5.8501e-01]], grad_fn=<PowBackward0>)
y.grad_fn: <PowBackward0 object at 0x0000018D09E84940>
z: tensor(0.6037, grad_fn=<MeanBackward0>)
x.grad: tensor([[-0.2041, -0.1362,  0.0085],
        [-0.1263,  0.0073,  0.0399],
        [ 0.3853,  0.1143, -0.1700]])
2*x/9: tensor([[-0.2041, -0.1362,  0.0085],
        [-0.1263,  0.0073,  0.0399],
        [ 0.3853,  0.1143, -0.1700]], grad_fn=<DivBackward0>)
y.grad: tensor([[0.1111, 0.1111, 0.1111],
        [0.1111, 0.1111, 0.1111],
        [0.1111, 0.1111, 0.1111]])


## Loss and Autograd together

In [35]:
# Feed-forward classifier: 784 -> 128 -> 64 -> 10, ending in log-probabilities
model = nn.Sequential(
    nn.Linear(784, 128),   # 784 inputs = one flattened 28x28 MNIST image
    nn.ReLU(),
    nn.Linear(128, 64),
    nn.ReLU(),
    nn.Linear(64, 10),     # 10 output units, one per digit class
    nn.LogSoftmax(dim=1),  # extra layer compared to the earlier nn.Sequential calls
)

# Negative log-likelihood loss, applied to the LogSoftmax output of the model
criterion = nn.NLLLoss()

# Take one mini-batch and flatten every image into a single row
images, labels = next(iter(trainloader))
images = images.view(images.size(0), -1)

# Note: despite its name, `logits` holds log-probabilities (the model ends in LogSoftmax)
logits = model(images)
loss = criterion(logits, labels)

# No backward pass has run on this model yet, so no gradient has been stored
print('Before backward pass: \n', model[0].weight.grad)

loss.backward()

print('After backward pass: \n', model[0].weight.grad)

Before backward pass: 
 None
After backward pass: 
 tensor([[ 2.6610e-05,  2.6610e-05,  2.6610e-05,  ...,  2.6610e-05,
          2.6610e-05,  2.6610e-05],
        [ 4.8405e-04,  4.8405e-04,  4.8405e-04,  ...,  4.8405e-04,
          4.8405e-04,  4.8405e-04],
        [ 4.1136e-04,  4.1136e-04,  4.1136e-04,  ...,  4.1136e-04,
          4.1136e-04,  4.1136e-04],
        ...,
        [-1.5820e-03, -1.5820e-03, -1.5820e-03,  ..., -1.5820e-03,
         -1.5820e-03, -1.5820e-03],
        [ 3.8668e-03,  3.8668e-03,  3.8668e-03,  ...,  3.8668e-03,
          3.8668e-03,  3.8668e-03],
        [ 9.1510e-04,  9.1510e-04,  9.1510e-04,  ...,  9.1510e-04,
          9.1510e-04,  9.1510e-04]])


## Training the network!

In [36]:
from torch import optim

# An optimizer is built from the parameters it should update and a learning rate
optimizer = optim.SGD(params=model.parameters(), lr=0.01)

# model.parameters() returns a generator, so printing it shows only the generator object
print(model.parameters())

# Iterate over it to see the shape of every weight matrix and bias vector
for parameter in model.parameters():
    print(parameter.shape)

<generator object Module.parameters at 0x0000018D0A745510>
torch.Size([128, 784])
torch.Size([128])
torch.Size([64, 128])
torch.Size([64])
torch.Size([10, 64])
torch.Size([10])


In [37]:
# Show the first layer's weights before the update step
print('Initial weights - ', model[0].weight)

images, labels = next(iter(trainloader))
# Another way to flatten: collapse every dimension after the batch dimension.
# Unlike an in-place resize_ to a fixed (64, 784), this does not hard-code the
# batch size and cannot silently reinterpret the tensor's storage when the
# batch has a different number of samples.
images = images.flatten(start_dim=1)

# Clear the gradients, do this because gradients are accumulated
optimizer.zero_grad()

# Forward pass, then backward pass, then update weights
output = model(images)
loss = criterion(output, labels)
loss.backward()
print('Gradient -', model[0].weight.grad)

# Take an update step and view the new weights
optimizer.step()
print('Updated weights - ', model[0].weight)

Initial weights -  Parameter containing:
tensor([[-0.0164, -0.0335,  0.0185,  ..., -0.0261,  0.0184, -0.0218],
        [ 0.0349, -0.0225, -0.0349,  ...,  0.0076,  0.0175,  0.0006],
        [-0.0347,  0.0137, -0.0325,  ...,  0.0286,  0.0035, -0.0252],
        ...,
        [-0.0282,  0.0017, -0.0025,  ..., -0.0073,  0.0283,  0.0160],
        [ 0.0292,  0.0062,  0.0017,  ...,  0.0139,  0.0218,  0.0102],
        [ 0.0330, -0.0005,  0.0019,  ...,  0.0161,  0.0098, -0.0028]],
       requires_grad=True)
Gradient - tensor([[ 0.0012,  0.0012,  0.0012,  ...,  0.0012,  0.0012,  0.0012],
        [-0.0010, -0.0010, -0.0010,  ..., -0.0010, -0.0010, -0.0010],
        [ 0.0016,  0.0016,  0.0016,  ...,  0.0016,  0.0016,  0.0016],
        ...,
        [-0.0058, -0.0058, -0.0058,  ..., -0.0058, -0.0058, -0.0058],
        [-0.0003, -0.0003, -0.0003,  ..., -0.0003, -0.0003, -0.0003],
        [ 0.0002,  0.0002,  0.0002,  ...,  0.0002,  0.0002,  0.0002]])
Updated weights -  Parameter containing:
tensor([[-0.

### Training for real

Now we'll put this algorithm into a loop so we can go through all the images. Some nomenclature: one pass through the entire dataset is called an *epoch*. So here we're going to loop through `trainloader` to get our training batches. For each batch, we'll do a training pass where we calculate the loss, do a backward pass, and update the weights.

>**Final Project:** This is the training pass for our network. If implemented correctly, you should see the training loss drop with each epoch.

1. Training Pass, calculate loss
2. Backwards Pass
3. Update Weights

In [None]:
# Steps performed for every mini-batch:
#   1. Clear the gradients (they accumulate across backward passes)
#   2. Forward pass
#   3. Calculate the loss
#   4. Backward pass to compute the gradients
#   5. Update the model parameters

epochs = 5 # Number of training cycles

# losses[e][b] is the loss of batch b during epoch e. The plotting cell below
# reads this list, so it has to be defined here.
losses = []

for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    epoch_losses = []
    for images, labels in trainloader:
        images = images.view(images.size(0), -1) # Flatten our images to 784 (28x28)

        # Zero out the gradients
        optimizer.zero_grad()

        # Forward pass
        output = model(images)

        # Calculate loss
        loss = criterion(output, labels)

        # Backward pass
        loss.backward()

        # Update weights
        optimizer.step()

        # Convert to a Python float once and reuse it for both records
        batch_loss = loss.item()
        epoch_losses.append(batch_loss)
        running_loss += batch_loss

    # Only completed epochs are recorded, so every entry has len(trainloader) values
    losses.append(epoch_losses)

    print(f"Epoch {epoch+1}/{epochs}, Loss: {running_loss/len(trainloader):.4f}")

Epoch 1/5, Loss: 0.2557
Epoch 2/5, Loss: 0.2362


In [None]:
import matplotlib.pyplot as plt
import numpy as np
def _ordinal(n):
    """Return the English ordinal label for a positive integer, e.g. 1 -> '1st'."""
    if 10 <= n % 100 <= 20:
        suffix = 'th'
    else:
        suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(n % 10, 'th')
    return f"{n}{suffix}"


# One colour per epoch curve; reused cyclically if there are more epochs than colours
EPOCH_COLORS = ['yellow', 'b', 'r', 'g', 'orange']

print (f"Length of trainloader:{len(trainloader)}")
nplosses = np.array(losses)
plt.figure(figsize=[12, 6])

# Draw one curve per recorded epoch. The x axis is derived from each series
# instead of a fixed 938, and the number of curves follows len(losses) instead
# of assuming exactly five epochs were run.
for epoch_idx, epoch_losses in enumerate(losses):
    x = np.arange(len(epoch_losses))
    plt.plot(
        x,
        epoch_losses,
        color=EPOCH_COLORS[epoch_idx % len(EPOCH_COLORS)],
        label=_ordinal(epoch_idx + 1),
    )

plt.xlabel('batch idx')
plt.ylabel('loss')
plt.legend()
plt.show()