In [1]:
import torch
import torch.nn as nn
from torchvision import transforms, datasets
from torch.utils.data import DataLoader
from torch import optim

In [2]:
# Report CUDA availability and pick the device used by every later cell.
cuda_available = torch.cuda.is_available()
print(f"Is CUDA supported by this system?{cuda_available}")
print(f"CUDA version: {torch.version.cuda}")

if cuda_available:
    # Device queries raise on machines without a usable GPU, so they are
    # only issued once availability has been confirmed.
    cuda_id = torch.cuda.current_device()
    print(f"ID of current CUDA device:{cuda_id}")
    print(f"Name of current CUDA device:{torch.cuda.get_device_name(cuda_id)}")
else:
    # Keep the name defined so later references do not hit a NameError.
    cuda_id = None

device = 'cuda' if cuda_available else 'cpu'

Is CUDA supported by this system?True
CUDA version: 10.2
ID of current CUDA device:0
Name of current CUDA device:GeForce RTX 2070


In [3]:
# MNIST train/test splits, downloaded into the current directory ('') if missing.
# ToTensor converts each image to a float tensor and rescales [0, 255] -> [0, 1].
trainset = datasets.MNIST('', download=True, train=True, transform=transforms.ToTensor())
testset = datasets.MNIST('', download=True, train=False, transform=transforms.ToTensor())

# Shuffle only the training data; accuracy does not depend on sample order,
# so the test loader iterates in a fixed, reproducible order.
train_loader = DataLoader(trainset, batch_size=64, shuffle=True)
test_loader = DataLoader(testset, batch_size=64, shuffle=False)

In [4]:
x = trainset[0] # first training sample; indexed below as x[0] (image) and x[1] (label)

In [5]:
# Split the sample into its image tensor and its integer class label.
image, label = x
# The leading 1 in the shape is the (grayscale) channel axis, not a batch axis.
print("shape of the first image:", image.shape)
print("label:", label)
# Flattening to (-1, 784) yields one row per image, i.e. it adds the batch axis.
image.reshape(-1, 784).shape

shape of the first image: torch.Size([1, 28, 28])
label: 5


torch.Size([1, 784])

In [6]:
# Model hyperparameters shared by every cell that builds a Network.
input_size = 28 * 28     # a 1x28x28 image flattened into a 784-long vector
hidden_size = [128, 64]  # widths of the first and second hidden layers
output_size = 10         # one output per class

class Network(nn.Module):
    """Fully connected classifier: two ReLU hidden layers and a log-softmax output.

    Args:
        input_size: number of features in each flattened input row.
        hidden_size: sequence whose first two entries are the hidden-layer widths.
        output_size: number of classes (width of the output layer).
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()

        first_width, second_width = hidden_size[0], hidden_size[1]

        # Each Linear layer holds an (out x in) weight matrix plus an `out`-long bias.
        self.layer1 = nn.Linear(input_size, first_width)
        self.layer2 = nn.Linear(first_width, second_width)
        self.layer3 = nn.Linear(second_width, output_size)

        self.relu = nn.ReLU()
        # dim=1 normalises across the class axis of a (batch, classes) tensor,
        # so inputs must carry a batch axis.
        self.LogSoftmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        """Return log-probabilities of shape (batch, output_size) for a (batch, input_size) input."""
        hidden = x
        for layer in (self.layer1, self.layer2):
            hidden = self.relu(layer(hidden))
        logits = self.layer3(hidden)
        return self.LogSoftmax(logits)
        

In [7]:
# Build the network and move its parameters to the selected device.
# nn.Module.to() moves the module in place and returns it, so chaining is equivalent.
model = Network(input_size, hidden_size, output_size).to(device)
model

Network(
  (layer1): Linear(in_features=784, out_features=128, bias=True)
  (layer2): Linear(in_features=128, out_features=64, bias=True)
  (layer3): Linear(in_features=64, out_features=10, bias=True)
  (relu): ReLU()
  (LogSoftmax): LogSoftmax(dim=1)
)

In [8]:
# Sanity check: run the first image through the untrained network.
# The module is called directly (not .forward()) so that any registered hooks run.
# reshape(-1, 784) turns the single image into a one-row batch, which is what
# LogSoftmax(dim=1) expects; a bare 784-vector would need dim=0 instead.
model(image.reshape(-1, 784).to(device))

tensor([[-2.4122, -2.4036, -2.2590, -2.4293, -2.1535, -2.2461, -2.2768, -2.4252,
         -2.2977, -2.1717]], device='cuda:0', grad_fn=<LogSoftmaxBackward>)

In [9]:
# Negative log-likelihood loss: consumes the log-probabilities the network emits.
lossFunction = nn.NLLLoss()
# Plain SGD with momentum over every trainable parameter of the model.
optimizer = optim.SGD(params=model.parameters(), momentum=0.9, lr=0.001)

In [29]:
# Training
n_epochs = 50
model.train()  # make sure the network is in training mode before optimising
for epoch in range(n_epochs):
    print("epoch:", epoch)
    epoch_loss = 0
    for images, labels in train_loader:
        images = images.to(device)
        labels = labels.to(device)
        images = images.reshape(-1, 784)  # one flattened image per row

        # forward pass: one row of class log-probabilities per image
        output = model(images)

        # labels hold, for each row, the index of the ground-truth class
        batch_loss = lossFunction(output, labels)

        optimizer.zero_grad()  # clear previous grads
        batch_loss.backward()  # compute new grads
        optimizer.step()       # update parameters with new grads

        epoch_loss += batch_loss.item()  # convert to a Python scalar

    # Save a checkpoint every 10 epochs. This sits at epoch level on purpose:
    # inside the batch loop it would rewrite the same file once per batch.
    if epoch % 10 == 0:
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
        }, f'checkpoint-{epoch}.pt')

    print(f"Epoch{epoch}, Average training loss: {epoch_loss / len(train_loader)}")
        

epoch: 0
Epoch0, Average training loss: 0.21934600315019012
epoch: 1
Epoch1, Average training loss: 0.20857365878183703
epoch: 2
Epoch2, Average training loss: 0.19797064492236705
epoch: 3
Epoch3, Average training loss: 0.18869457457826208
epoch: 4
Epoch4, Average training loss: 0.17991030223365787
epoch: 5
Epoch5, Average training loss: 0.17206940696890483
epoch: 6
Epoch6, Average training loss: 0.16489926258177517
epoch: 7
Epoch7, Average training loss: 0.15708390915238146
epoch: 8
Epoch8, Average training loss: 0.15064337494562685
epoch: 9
Epoch9, Average training loss: 0.14432046441301735
epoch: 10
Epoch10, Average training loss: 0.13830590589261893
epoch: 11
Epoch11, Average training loss: 0.13279174375476868
epoch: 12
Epoch12, Average training loss: 0.1275785527209928
epoch: 13
Epoch13, Average training loss: 0.12259984824623761
epoch: 14
Epoch14, Average training loss: 0.11785550653112373
epoch: 15
Epoch15, Average training loss: 0.11346915383725914
epoch: 16
Epoch16, Average tr

KeyboardInterrupt: 

In [27]:
# Measure classification accuracy of `model` on the test set.
# Evaluation runs in eval mode; the previous mode is restored afterwards so a
# later training cell is not silently left in eval mode.
was_training = model.training
model.eval()
with torch.no_grad():  # no gradients are needed for scoring
    correct = 0
    total = 0
    for images, labels in test_loader:
        labels = labels.to(device)
        images = images.reshape(-1, 784).to(device)
        out = model(images)
        # torch.max over dim 1 returns (max value, index of max) per row;
        # the index is the predicted class.
        _, predicted = torch.max(out, 1)
        total += labels.shape[0]
        correct += (predicted == labels).sum().item()
    print(f"Testing accuracy: {100 * correct / total}")
model.train(was_training)
    
        
        

Testing accuracy: 93.68


In [28]:
# Persist the entire module object (not just its state_dict) to disk.
# NOTE(review): this pickles the object, so loading the file later needs the
# Network class definition to be available.
torch.save(obj=model, f='ffn_mnist.pt')

In [33]:
# load model with state_dict
# The network is placed on its device first, so the optimizer created below
# is built over the parameters it will actually update.
loadedModel = Network(input_size, hidden_size, output_size).to(device)

# map_location lets a checkpoint written on a GPU be restored on a CPU-only machine.
# NOTE: torch.load unpickles the file, so only load checkpoints from a trusted source.
checkpoint = torch.load('checkpoint-10.pt', map_location=device)
loadedModel.load_state_dict(checkpoint['model_state_dict'])

# The restored optimizer state belongs to loadedModel's parameters. It gets its
# own name so the `optimizer` that trains `model` is not overwritten.
loadedOptimizer = optim.SGD(loadedModel.parameters(), lr=0.001, momentum=0.9)
loadedOptimizer.load_state_dict(checkpoint['optimizer_state_dict'])
epoch = checkpoint['epoch']

# eval() switches to inference mode and returns the module, which the cell displays.
loadedModel.eval()


Network(
  (layer1): Linear(in_features=784, out_features=128, bias=True)
  (layer2): Linear(in_features=128, out_features=64, bias=True)
  (layer3): Linear(in_features=64, out_features=10, bias=True)
  (relu): ReLU()
  (LogSoftmax): LogSoftmax(dim=1)
)

In [34]:
# Score the reloaded checkpoint on the test set; gradients are not needed here.
correct, total = 0, 0
with torch.no_grad():
    for images, labels in test_loader:
        images = images.reshape(-1, 784).to(device)
        labels = labels.to(device)
        out = loadedModel(images)
        # Row-wise maximum over the class axis: the position of the maximum
        # is the predicted class, the value itself is discarded.
        _, predicted = torch.max(out, dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
print(f"Testing accuracy: {100 * correct / total}")

Testing accuracy: 95.86
