## **Multi-Layer Perceptron (MLP) with Pytorch**

In [53]:
import torch
from torch import nn


## Step 1. Create Model Class 
- initialize the parameters of the model: input_dim, nb_hidden and the output_dim
- Define the fc layers and the non-linearity function
- define the forward : define how the output is computed

Here we are a define a simple neural network with 2 fc layer and one non-linearity function 

In [54]:
#create model class :specify the input, the dim_hidden,output_dim
#the non_linearity function doesn't affect the dimension of the data

class MLP(nn.Module):
    def __init__(self, input_dim,hidden_dim, output_dim):
        super(MLP, self).__init__()
        #linear_function
        self.fc1=nn.Linear(input_dim,hidden_dim)
        #non-linearity
        self.tanh = nn.Tanh()
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        
    def forward(self,x):
        #lin funct
        out = self.fc1(x) #output of the 1st layer
        #non-lin funct
        out = self.tanh(out) # output of non_lin funct
        #lin funct (readout)
        out = self.fc2(out) #logits
        return out
        

In [55]:
class MLP2(nn.Module):
    def __init__(self, input_dim,hidden_dim, output_dim):
        super(MLP2, self).__init__()
        #linear_function
        self.fc1=nn.Linear(input_dim,hidden_dim)
        self.fc2=nn.Linear(hidden_dim,hidden_dim)
        #non-linearity
        self.Relu = nn.ReLU() 
        self.fc3=nn.Linear(hidden_dim,hidden_dim)
        self.fc4 = nn.Linear(hidden_dim, output_dim)
        
    def forward(self,x):
        #lin funct
        out = self.fc1(x) #output of the 1st layer
        out = self.fc2(out)
        #non-lin funct
        out = self.Relu(out) # output of non_lin funct # we can use F.relu(out)
        out = self.fc3(out)
        #lin funct (readout)
        out = self.fc4(out) #logits
        return out

## Step 2. Dataset Loader
* Loading the dataset : specify the root where the dataset is, load training and testing datasets, transforms, dowload=true if you want to dowload from pytorch
* Data loader : Make the dataset iterable by define the batch_size, shuffle if you want to shuffle or not the dataset.

In [56]:
import torch
import torch.nn as nn
import torchvision.transforms as transforms
import torchvision.datasets as dsets
 

In [57]:
 ## Loading Data
    
train_dataset = dsets.MNIST(root='./data', 
                            train=True, 
                            transform=transforms.ToTensor(),
                            download=True)

test_dataset = dsets.MNIST(root='./data', 
                           train=False, 
                           transform=transforms.ToTensor())

## Data Loader

batch_size = 100
n_iters = 3000
num_epochs = n_iters / (len(train_dataset) / batch_size)
num_epochs = int(num_epochs)


train_loader = torch.utils.data.DataLoader(dataset=train_dataset, 
                                           batch_size=batch_size, 
                                           shuffle=True)

test_loader = torch.utils.data.DataLoader(dataset=test_dataset, 
                                          batch_size=batch_size, 
                                          shuffle=False)

## Using th model MLP()

In [58]:
input_dim = 28*28  #size of the image
hidden_dim = 100   # number of neurons
output_dim = 10  #Because we have 10 classes

model = MLP(input_dim, hidden_dim, output_dim)

In [59]:
model

MLP(
  (fc1): Linear(in_features=784, out_features=100, bias=True)
  (tanh): Tanh()
  (fc2): Linear(in_features=100, out_features=10, bias=True)
)

## Step 3. Loss function

Because we are going through a classification problem, cross entropy function is required to compute the loss between our softmax outputs and our binary labels.

In [60]:
criterion = nn.CrossEntropyLoss()

## Step 4. Optimizer
* Update the model's parameters at every iteration
* We are using an optimization algorithm called Stochastic Gradient Descent (SGD) 
*torch.optim to see the other loss function

In [61]:
learning_rate = 0.1 #Learning rate determines how fast the algorithm learns. 

optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate) 

In [62]:
print(model.parameters())

<generator object Module.parameters at 0x7f1428c3fcd0>


In [63]:
# get the length of the list of t
print(len(list(model.parameters())))  #return the number of layers
print(list(model.parameters())[0].size()) # fc1 Parameters 
print(list(model.parameters())[1].size())  # fc1 bais Parameters
print(list(model.parameters())[2].size())
print(list(model.parameters())[3].size())  #the size of the output

4
torch.Size([100, 784])
torch.Size([100])
torch.Size([10, 100])
torch.Size([10])


## Step 5. Train Model
* Process
- Convert inputs to tensors with gradient accumulation capabilities
- Clear gradient buffers
- Get output given inputs : the forward
- Get loss
- Do backward
- Update parameters using gradients 
- REPEAT the process

In [64]:
iter = 0
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):
        # Load images with gradient accumulation capabilities
        images = images.view(-1, 28*28).requires_grad_()

        # Clear gradients w.r.t. parameters
        optimizer.zero_grad()

        # Forward pass to get output/logits
        outputs = model(images)

        # Calculate Loss: softmax --> cross entropy loss
        loss = criterion(outputs, labels)

        # Getting gradients w.r.t. parameters
        loss.backward()

        # Updating parameters
        optimizer.step()

        iter += 1

        if iter % 500 == 0:
            # Calculate Accuracy         
            correct = 0
            total = 0
            # Iterate through test dataset
            for images, labels in test_loader:
                # Load images with gradient accumulation capabilities
                images = images.view(-1, 28*28).requires_grad_()

                # Forward pass only to get logits/output
                outputs = model(images)

                # Get predictions from the maximum value
                _, predicted = torch.max(outputs.data, 1)

                # Total number of labels
                total += labels.size(0)

                # Total correct predictions
                correct += (predicted == labels).sum()

            accuracy = 100 * correct / total

            # Print Loss
            print('Iteration: {}. Loss: {}. Accuracy: {}'.format(iter, loss.item(), accuracy))


Iteration: 500. Loss: 0.2778734862804413. Accuracy: 91
Iteration: 1000. Loss: 0.19817772507667542. Accuracy: 92
Iteration: 1500. Loss: 0.1722957193851471. Accuracy: 93
Iteration: 2000. Loss: 0.383125901222229. Accuracy: 94
Iteration: 2500. Loss: 0.1928856521844864. Accuracy: 94
Iteration: 3000. Loss: 0.22457529604434967. Accuracy: 95


In [87]:
#model's state_dict

In [103]:
path = '/home/aims/Downloads/Transfer_learning/checkpoint.pth'

In [104]:
torch.save(model.state_dict(), path)        #state_dict(): the dict

In [105]:
# import os 
# os.getcwd()

'/home/aims/Downloads/Transfer_learning'

In [106]:
torch.load(path)

OrderedDict([('fc1.weight',
              tensor([[-0.0046,  0.0006, -0.0235,  ..., -0.0025,  0.0009, -0.0252],
                      [ 0.0318,  0.0326,  0.0221,  ..., -0.0277, -0.0340, -0.0222],
                      [ 0.0122,  0.0229, -0.0033,  ..., -0.0071, -0.0286,  0.0008],
                      ...,
                      [ 0.0340, -0.0275,  0.0113,  ..., -0.0258, -0.0242,  0.0091],
                      [ 0.0152,  0.0054, -0.0102,  ..., -0.0207, -0.0142, -0.0322],
                      [-0.0226, -0.0002,  0.0327,  ...,  0.0048, -0.0144, -0.0341]])),
             ('fc1.bias',
              tensor([ 8.9256e-03,  1.5081e-01,  1.3231e-01,  1.6254e-01,  1.6533e-01,
                      -2.3301e-02,  1.1067e-01, -1.4146e-01,  7.6401e-02, -2.0366e-01,
                      -1.2693e-01,  8.7816e-02,  2.1667e-01,  1.1925e-01,  1.0255e-01,
                       1.2123e-01,  9.9078e-02, -7.2657e-02, -1.7623e-01,  1.6506e-01,
                       2.4597e-02, -8.8836e-02, -2.4959e-01, -8.

save the model to checkpoints

* resquires_grad: the possibility to compute the gradient to the parameters
* optimizer.zero_grad() : clear gradient
* outputs = model(images): compute the forward
* loss.backward() : compute backprogation of the loss respect to the parameters
* optimizer.step(): update the parameters


##  Using medel MPL2()

In [35]:
model_1 = MLP2(input_dim, hidden_dim, output_dim)
criterion = nn.CrossEntropyLoss()
learning_rate = 0.1 #Learning rate determines how fast the algorithm learns. 

optimizer = torch.optim.SGD(model_1.parameters(), lr=learning_rate) 
iter = 0
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):
        # Load images with gradient accumulation capabilities
        images = images.view(-1, 28*28).requires_grad_()

        # Clear gradients w.r.t. parameters
        optimizer.zero_grad()

        # Forward pass to get output/logits
        outputs = model_1(images)

        # Calculate Loss: softmax --> cross entropy loss
        loss = criterion(outputs, labels)

        # Getting gradients w.r.t. parameters
        loss.backward()

        # Updating parameters
        optimizer.step()

        iter += 1

        if iter % 500 == 0:
            # Calculate Accuracy         
            correct = 0
            total = 0
            # Iterate through test dataset
            for images, labels in test_loader:
                # Load images with gradient accumulation capabilities
                images = images.view(-1, 28*28).requires_grad_()

                # Forward pass only to get logits/output
                outputs = model_1(images)

                # Get predictions from the maximum value
                _, predicted = torch.max(outputs.data, 1)

                # Total number of labels
                total += labels.size(0)

                # Total correct predictions
                correct += (predicted == labels).sum()

            accuracy = 100 * correct / total

            # Print Loss
            print('Iteration: {}. Loss: {}. Accuracy: {}'.format(iter, loss.item(), accuracy))

Iteration: 500. Loss: 0.30891722440719604. Accuracy: 91
Iteration: 1000. Loss: 0.23510482907295227. Accuracy: 93
Iteration: 1500. Loss: 0.22717426717281342. Accuracy: 94
Iteration: 2000. Loss: 0.10250414907932281. Accuracy: 95
Iteration: 2500. Loss: 0.12175756692886353. Accuracy: 95
Iteration: 3000. Loss: 0.2126990705728531. Accuracy: 96


As we can see with more layers we got a high accuracy.

1. Arch
2. dataset and dataloader
3. Train and evaluation
4. parameter tuning
5. save model