In [1]:
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from torchvision import datasets, transforms

import matplotlib.pyplot as plt

# Prepare the dataset

In [2]:
# Convert each PIL image (integer pixels in [0, 255]) into a float PyTorch tensor in [0, 1].
# No normalization is applied. To rescale the inputs to [-1, 1] instead, use
# transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))]).
transform = transforms.ToTensor()

# Download (if not already cached) and load the MNIST training split.
# A `target_transform` could also be passed here to transform the labels.
trainset = datasets.MNIST(root='~/.pytorch/MNIST_data/', download=True, train=True, transform=transform)

# Serve the training set in shuffled mini-batches of 64 samples
trainloader = torch.utils.data.DataLoader(trainset, batch_size=64, shuffle=True)

# Define the model

In [3]:
# Fully connected classifier: 784 input features -> 128 -> 64 -> 10 class scores (logits).
# The model is placed on the GPU when CUDA is available and on the CPU otherwise,
# so this cell no longer crashes on machines without a GPU.
model = nn.Sequential(nn.Linear(784, 128),
                      nn.ReLU(),
                      nn.Linear(128, 64),
                      nn.ReLU(),
                      nn.Linear(64, 10)).to('cuda' if torch.cuda.is_available() else 'cpu')
# Note that we do not define SOFTMAX here: the model outputs raw logits.
# Note: In TensorFlow / Keras, SOFTMAX is typically added as the last layer
# (unless the loss is configured to accept logits).

In [4]:
# Print the layer stack to confirm the 784 -> 128 -> 64 -> 10 architecture
print(model)

Sequential(
  (0): Linear(in_features=784, out_features=128, bias=True)
  (1): ReLU()
  (2): Linear(in_features=128, out_features=64, bias=True)
  (3): ReLU()
  (4): Linear(in_features=64, out_features=10, bias=True)
)


In [5]:
# Cross-entropy between the model's raw logits and the integer class labels,
# averaged over the samples of a batch (the default reduction, spelled out here).
criterion = nn.CrossEntropyLoss(reduction='mean')

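$w_{\text{new}} = w_{\text{old}} - \eta \cdot \nabla_{w}\,\text{criterion}(\text{mini-batch})$

Here $\eta$ is the learning rate. Textbook SGD computes the gradient on one sample at a time; in this notebook each update uses the criterion averaged over one mini-batch of 64 samples.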

In [6]:
# Step size (eta) of each gradient-descent update
learning_rate = 0.003

# Plain stochastic gradient descent over every trainable parameter of the model
optimizer = optim.SGD(params=model.parameters(), lr=learning_rate)

# Train the model

In [7]:
# Use the device the model's parameters actually live on, so the inputs always
# end up on the same device as the model instead of relying on a hard-coded string.
my_device = next(model.parameters()).device

epochs = 5

for epoch in range(epochs):

    # Sum of the per-batch losses, reset at the start of every epoch
    running_loss = 0

    for images, labels in trainloader:
        # Flatten - N*C*H*W ===> N*(C*H*W), i.e. batch_size*784
        images = images.view(images.shape[0], -1)

        # Clear the gradients accumulated by the previous step
        optimizer.zero_grad()

        # Forward computation: raw class scores (logits) for the batch
        output = model(images.to(my_device))

        loss = criterion(output, labels.to(my_device))

        # Backward pass: compute gradients of the loss w.r.t. the parameters
        loss.backward()

        # Update the parameters
        optimizer.step()

        running_loss += loss.item()

    # Report the mean per-batch loss once per epoch. The original `for ... else`
    # always ran this branch (the loop has no `break`), so a plain statement
    # after the loop is equivalent and clearer.
    print(f"Training loss = {running_loss/len(trainloader)}")

Training loss = 2.2574854995141913
Training loss = 1.979426091032496
Training loss = 1.3257901071231248
Training loss = 0.8532887792536444
Training loss = 0.6291384800537817


# Prediction / Inference

In [None]:
# Pull a single mini-batch from the training loader to try the model on.
# These are training samples, so this checks fit rather than generalization.
batch_iterator = iter(trainloader)
images, labels = next(batch_iterator)

In [22]:
# Index of the sample in the batch to run inference on
data_index = 0

# Flatten the selected image into a single row of 784 features,
# matching the input size of the model's first Linear layer.
input_vector = images[data_index].view(1, 784)

# Turn off gradient computation: inference does not need backpropagation
with torch.no_grad():
    scores = model(input_vector.to(my_device))

print(scores) # logits

tensor([[ 1.4960, -4.6603, -0.8917,  5.5743, -3.6700,  4.3824, -2.4348, -3.8741,
          3.2556, -0.9287]], device='cuda:0')


In [29]:
# Two equivalent ways to turn the logits into class probabilities.
# dim=1 normalizes across the class scores of each row; passing it explicitly
# avoids the deprecation warning raised when softmax has to guess the dimension.

# Module form
print(nn.Softmax(dim=1)(scores))

# Functional form (F is imported with the other imports at the top of the notebook)
print(F.softmax(scores, dim=1))

tensor([[1.1905e-02, 2.5239e-05, 1.0934e-03, 7.0295e-01, 6.7948e-05, 2.1344e-01,
         2.3368e-04, 5.5402e-05, 6.9173e-02, 1.0537e-03]], device='cuda:0')
tensor([[1.1905e-02, 2.5239e-05, 1.0934e-03, 7.0295e-01, 6.7948e-05, 2.1344e-01,
         2.3368e-04, 5.5402e-05, 6.9173e-02, 1.0537e-03]], device='cuda:0')


  after removing the cwd from sys.path.


In [31]:
# Predicted class = position of the largest probability (index into the flattened tensor)
print(torch.softmax(scores, dim=1).argmax())

tensor(3, device='cuda:0')


In [32]:
# Top-1 result: the highest probability together with its class index
print(torch.softmax(scores, dim=1).topk(k=1))

torch.return_types.topk(
values=tensor([[0.7029]], device='cuda:0'),
indices=tensor([[3]], device='cuda:0'))


In [24]:
# Ground-truth labels of every sample in the batch drawn from the training loader
labels

tensor([5, 8, 5, 5, 0, 9, 3, 8, 7, 5, 4, 3, 4, 5, 1, 2, 8, 2, 3, 9, 1, 8, 1, 7,
        5, 0, 0, 2, 0, 3, 8, 1, 3, 4, 8, 2, 7, 2, 1, 5, 0, 3, 0, 0, 6, 7, 3, 0,
        1, 1, 7, 3, 2, 6, 8, 1, 2, 3, 8, 1, 8, 7, 5, 4])

In [25]:
# Ground-truth label of the sample that was classified. Indexing with data_index
# (instead of a hard-coded 0) keeps this in sync with the prediction cell.
labels[data_index]

tensor(5)