In [33]:
import torch
import torchvision
from torchvision import transforms, datasets

# MNIST train / test splits. root="" stores the files relative to the current
# working directory; download=True fetches them if they are not there yet.
# ToTensor converts each image into a torch tensor.
train = datasets.MNIST(
    root="",
    train=True,
    download=True,
    transform=transforms.Compose([transforms.ToTensor()]),
)
test = datasets.MNIST(
    root="",
    train=False,
    download=True,
    transform=transforms.Compose([transforms.ToTensor()]),
)

# Mini-batch iterators: 10 samples per batch, order reshuffled on every pass.
trainset = torch.utils.data.DataLoader(dataset=train, batch_size=10, shuffle=True)
testset = torch.utils.data.DataLoader(dataset=test, batch_size=10, shuffle=True)

In [34]:
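# nn provides an object-oriented interface (layers are objects you initialise, and they hold their own parameters), while F exposes the same operations as plain functions (any parameters are passed in explicitly)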
import torch.nn as nn
import torch.nn.functional as F

In [35]:
# python: Net inherits methods from nn.Module, calling super will call the nn.Module (parent) init function.

class Net(nn.Module):
    '''
    Fully connected classifier for 28x28 digit images (10 classes).

    __init__:
        - fc1 .. fc4 are fully connected (nn.Linear) layers, chained so that the
          output size of each layer equals the input size of the next one.
        - nn.Linear(in_features, out_features):
            - fc1 takes a flattened image: 28 * 28 = 784 values.
            - fc1, fc2 and fc3 each output 64 values (three hidden layers of 64 neurons).
            - fc4 outputs 10 values, one for each of the ten digits.

    forward:
        - The input is first reshaped to (batch, 784), so already flattened tensors
          as well as image batches such as (batch, 1, 28, 28) are accepted.
        - F.relu (rectified linear unit, max(0, x)) is the activation function applied
          to the whole output of every hidden layer. It is an activation function,
          not an optimizer, and it is not a sigmoid.
        - F.log_softmax(x, dim=1) normalises the 10 outputs of fc4 across the class
          dimension and returns log-probabilities: every value is <= 0 and the
          exponentials of each row sum to 1. The class with the largest value is
          the predicted digit.

    Returns (from forward): tensor of shape (batch, 10) holding log-probabilities.
    '''
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(28*28, 64)   # input to hidden
        self.fc2 = nn.Linear(64, 64)      # hidden
        self.fc3 = nn.Linear(64, 64)      # hidden
        self.fc4 = nn.Linear(64, 10)      # hidden to output

    def forward(self, x):
        # Flatten to (batch, 784). This is a no-op for input that is already
        # (batch, 784) and lets image-shaped batches, e.g. (batch, 1, 28, 28),
        # be passed in directly. reshape (unlike view) also handles
        # non-contiguous tensors.
        x = x.reshape(-1, self.fc1.in_features)

        # Hidden layers: linear transform followed by the ReLU activation.
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))

        # Output layer: raw class scores, no activation here.
        x = self.fc4(x)

        # dim=1 is the class dimension of the (batch, 10) scores, so each row
        # becomes a log-probability distribution over the ten digits.
        return F.log_softmax(x, dim=1)
        
        
# Build one instance of the network and print its layer-by-layer summary.
net = Net()
print(net)

Net(
  (fc1): Linear(in_features=784, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=64, bias=True)
  (fc3): Linear(in_features=64, out_features=64, bias=True)
  (fc4): Linear(in_features=64, out_features=10, bias=True)
)


In [36]:
# Random stand-in for one 28x28 image, flattened into a single row of 784
# values. The -1 asks view() to infer that dimension from the element count,
# which gives a batch dimension of 1 here.
X = torch.rand((28, 28)).view(-1, 28 * 28)

In [37]:
# Forward pass: calling the module runs Net.forward on the flattened sample X.
output = net(X)

In [38]:
# Log-probabilities returned by log_softmax: one row per input sample,
# one column for each of the 10 digit classes.
output

tensor([[-2.1502, -2.3297, -2.3511, -2.3300, -2.3915, -2.2951, -2.1804, -2.2525,
         -2.3569, -2.4240]], grad_fn=<LogSoftmaxBackward>)

In [39]:
# NOTE: .max() returns the largest log-probability itself, not its position.
# The predicted digit is the index of that value, i.e. torch.argmax(output, dim=1).
output.max()

tensor(-2.1502, grad_fn=<MaxBackward1>)