## Import all the necesary libraries

In [1]:
import numpy as np
import torch
import sys
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.utils import data
from torchvision import transforms
from torchvision.datasets import MNIST

import matplotlib.pyplot as plt
import time

from sklearn.metrics import confusion_matrix, classification_report

cuda = torch.cuda.is_available()
cuda

False

## Download the MNIST Train and Test set & Compute basic data statistics
It is always important to know the shape of the data, as well as the min/max and mean/variance

In [2]:
train = MNIST('./data', train=True, download=True, transform=transforms.ToTensor())
test = MNIST('./data', train=False, download=True, transform=transforms.ToTensor())
train_data = train.train_data
train_data = train.transform(train_data.numpy())

## Dataloader

In [3]:
class MyDataset(data.Dataset):
    def __init__(self, X, Y):
        self.X = X
        self.Y = Y

    def __len__(self):
        return len(self.Y)

    def __getitem__(self,index):
        X = self.X[index].float().reshape(-1) #flatten the input
        Y = self.Y[index].long()
        return X,Y

Using the torch.utils.data DataLoader, we shuffle the data and set the batch size to be 256

In [4]:
num_workers = 8 if cuda else 0 
    
# Training
train_dataset = MyDataset(train.train_data, train.train_labels)

train_loader_args = dict(shuffle=True, batch_size=256, num_workers=num_workers, pin_memory=True) if cuda\
                    else dict(shuffle=True, batch_size=16)
train_loader = data.DataLoader(train_dataset, **train_loader_args)

# Testing
test_dataset = MyDataset(test.test_data, test.test_labels)

test_loader_args = dict(shuffle=False, batch_size=256, num_workers=num_workers, pin_memory=True) if cuda\
                    else dict(shuffle=False, batch_size=1)
test_loader = data.DataLoader(test_dataset, **test_loader_args)

In [5]:
# train_loader = dataloader.DataLoader(train, **train_loader_args) 
# test_loader = dataloader.DataLoader(test, **test_loader_args)

## Define our Neural Network Model 
We define our model using the torch.nn.Module class

In [6]:
# SIMPLE MODEL DEFINITION
class Simple_MLP(nn.Module):
    def __init__(self, size_list):
        super(Simple_MLP, self).__init__()
        layers = []
        self.size_list = size_list
        for i in range(len(size_list) - 2):
            layers.append(nn.Linear(size_list[i],size_list[i+1]))
            layers.append(nn.ReLU())
        layers.append(nn.Linear(size_list[-2], size_list[-1]))
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        return self.net(x)



## Create the model and define the Loss and Optimizer
We create an instance of our Simple_MLP model by passing it a list of layer sizes. The input layer contains 28*28 = 784 neurons. We define a single hidden layer of size 256, and our output will be the class probabilities for each of the 10 digits (0-9). 

Since this is a classification task, we will use Cross Entropy Loss. We define our criterion using the torch.nn.CrossEntropyLoss.

In order to train our network, we use the torch.optim.SGD optimizer.

In [7]:
model = Simple_MLP([784, 256, 10])
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())
device = torch.device("cuda" if cuda else "cpu")
model.to(device)
print(model)

Simple_MLP(
  (net): Sequential(
    (0): Linear(in_features=784, out_features=256, bias=True)
    (1): ReLU()
    (2): Linear(in_features=256, out_features=10, bias=True)
  )
)


## Create a function that will train the network for one epoch

In [8]:
def train_epoch(model, train_loader, criterion, optimizer, epoch, lr):
    model.train()

    running_loss = 0.0
    predictions = []
    ground_truth = []
    loss_counter = 1000
    loss_den = 0
    
    start_time = time.time()
    for batch_idx, (data, target) in enumerate(train_loader):   
            data = data.to(device)
            target = target.to(device)
        
            #previous model
            outputs = model(data)
            _, predicted = torch.max(outputs.data, 1)
            total_predictions = target.size(0)
            correct_predictions = (predicted == target).sum().item()
            acc = (correct_predictions/total_predictions)*100.0
            
            loss = criterion(outputs, target)
            
            #new model
            model_new = Simple_MLP([784, 256, 10])
            model_new.net[0].weight = nn.Parameter(torch.from_numpy(np.random.normal(model.net[0].weight.detach().cpu().numpy(), scale = lr)).float())
            model_new.net[2].weight = nn.Parameter(torch.from_numpy(np.random.normal(model.net[2].weight.detach().cpu().numpy(), scale = lr)).float())
            model_new.to(device)
            
            outputs = model_new(data)
            _, predicted = torch.max(outputs.data, 1)
            total_predictions = target.size(0)
            correct_predictions = (predicted == target).sum().item()
            acc_new = (correct_predictions/total_predictions)*100.0
            
            loss_new = criterion(outputs, target)
            
            if batch_idx == 0:
                print('Batch 0:', loss.item())
        
            if loss_new.item() < loss.item():
                loss_den += 1
                loss_counter = loss_new.item()
                running_loss += loss_new.item()
                model = model_new
                #calculuating confusion matrix
                predictions += list(predicted.detach().cpu().numpy())
                ground_truth += list(target.detach().cpu().numpy())
        
        
    
    end_time = time.time()
    
    running_loss /= loss_den
    print('Training Loss: ', running_loss, 'Time: ',end_time - start_time, 's')
    
    return running_loss, model

## Create a function that will evaluate our network's performance on the test set

In [9]:
def test_model(model, test_loader, criterion):
    with torch.no_grad():
        model.eval()

        running_loss = 0.0
        total_predictions = 0.0
        correct_predictions = 0.0
        
        predictions = []
        ground_truth = []

        for batch_idx, (data, target) in enumerate(test_loader):   
            data = data.to(device)
            target = target.to(device)

            outputs = model(data)

            _, predicted = torch.max(outputs.data, 1)
            total_predictions += target.size(0)
            correct_predictions += (predicted == target).sum().item()

            loss = criterion(outputs, target).detach()
            running_loss += loss.item()
            
            #calculuating confusion matrix
            predictions += list(predicted.detach().cpu().numpy())
            ground_truth += list(target.detach().cpu().numpy())
    

        running_loss /= len(test_loader)
        acc = (correct_predictions/total_predictions)*100.0
        print('Testing Loss: ', running_loss)
        print('Testing Accuracy: ', acc, '%')
        return running_loss, acc


## Train the model for N epochs
We call our training and testing functions in a loop, while keeping track of the losses and accuracy. 

In [10]:
n_epochs = 40
lr = 0.001
Train_loss = []
Test_loss = []
Test_acc = []

for i in range(n_epochs):
    train_loss, model = train_epoch(model, train_loader, criterion, optimizer, i, lr)
    test_loss, test_acc = test_model(model, test_loader, criterion)
    Train_loss.append(train_loss)
    Test_loss.append(test_loss)
    Test_acc.append(test_acc)
    print('='*20)

Batch 0: 32.05805587768555
Training Loss:  24.719215086548047 Time:  61.748836040496826 s
Testing Loss:  29.67355846779604
Testing Accuracy:  36.5 %
Batch 0: 37.76864242553711


KeyboardInterrupt: 