<a href="https://colab.research.google.com/github/ayulockin/debugNNwithWandB/blob/master/Cifar10_pytorch_wandb_Dropout_BN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Imports and Setups

In [1]:
!pip install wandb -q

[K     |████████████████████████████████| 1.4MB 3.5MB/s 
[K     |████████████████████████████████| 92kB 14.4MB/s 
[K     |████████████████████████████████| 102kB 13.8MB/s 
[K     |████████████████████████████████| 460kB 56.6MB/s 
[K     |████████████████████████████████| 102kB 15.1MB/s 
[K     |████████████████████████████████| 71kB 11.8MB/s 
[K     |████████████████████████████████| 71kB 9.1MB/s 
[?25h  Building wheel for watchdog (setup.py) ... [?25l[?25hdone
  Building wheel for gql (setup.py) ... [?25l[?25hdone
  Building wheel for shortuuid (setup.py) ... [?25l[?25hdone
  Building wheel for subprocess32 (setup.py) ... [?25l[?25hdone
  Building wheel for pathtools (setup.py) ... [?25l[?25hdone
  Building wheel for graphql-core (setup.py) ... [?25l[?25hdone


In [0]:
import wandb

In [3]:
!wandb login

[34m[1mwandb[0m: You can find your API key in your browser here: https://app.wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter: 69f60a7711ce6b8bbae91ac6d15e45d6b1f1430e
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[32mSuccessfully logged in to Weights & Biases![0m


In [0]:
import torch
from torch import nn
from torch import optim
from torch.nn import functional as F
import torchvision
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

import matplotlib.pyplot as plt
import numpy as np

#### For GPU

In [5]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


## CIFAR10 Dataset

In [6]:
transform = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                        download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=32,
                                          shuffle=True)

testset = torchvision.datasets.CIFAR10(root='./data', train=False,
                                       download=True, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=32,
                                         shuffle=False)

classes = ('plane', 'car', 'bird', 'cat',
           'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

  0%|          | 0/170498071 [00:00<?, ?it/s]

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


170500096it [00:01, 90227723.22it/s]                               


Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified


In [7]:
trainset

Dataset CIFAR10
    Number of datapoints: 50000
    Root location: ./data
    Split: Train
    StandardTransform
Transform: Compose(
               ToTensor()
               Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))
           )

## Base model



In [0]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(3, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 32, 3, 1)
        self.conv3 = nn.Conv2d(32, 64, 3, 1)
        self.conv4 = nn.Conv2d(64, 64, 3, 1)

        self.pool1 = torch.nn.MaxPool2d(2)
        self.pool2 = torch.nn.MaxPool2d(2)

        self.fc1 = nn.Linear(1600, 512)
        self.fc2 = nn.Linear(512, 128)
        self.fc3 = nn.Linear(128, 10)

    def forward(self, x):
        ## Conv 1st Block
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = self.pool1(x)
        x = F.relu(self.conv3(x))
        x = F.relu(self.conv4(x))
        x = self.pool2(x)
        
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        output = F.log_softmax(x, dim=1)
        return output

In [0]:
def train(model, device, train_loader, optimizer, epoch, steps_per_epoch):
  # Switch model to training mode. This is necessary for layers like dropout, batchnorm etc which behave differently in training and evaluation mode
  model.train()
  train_total = 0
  train_correct = 0

  # We loop over the data iterator, and feed the inputs to the network and adjust the weights.
  for batch_idx, (data, target) in enumerate(train_loader, start=0):
    if batch_idx > steps_per_epoch:
      break
    # Load the input features and labels from the training dataset
    data, target = data.to(device), target.to(device)
    
    # Reset the gradients to 0 for all learnable weight parameters
    optimizer.zero_grad()
    
    # Forward pass: Pass image data from training dataset, make predictions about class image belongs to (0-9 in this case)
    output = model(data)
    
    # Define our loss function, and compute the loss
    loss = F.nll_loss(output, target)

    scores, predictions = torch.max(output.data, 1)
    train_total += target.size(0)
    train_correct += int(sum(predictions == target))
            
    # Backward pass: compute the gradients of the loss w.r.t. the model's parameters
    loss.backward()
    
    # Update the neural network weights
    optimizer.step()

  acc = round((train_correct / train_total) * 100, 2)
  print('Epoch [{}], Loss: {}, Accuracy: {}, '.format(epoch, loss.item(), acc), end='')
  wandb.log({'Train Loss': loss.item(), 'Train Accuracy': acc})


In [0]:
def test(model, device, test_loader, classes):
  # Switch model to evaluation mode. This is necessary for layers like dropout, batchnorm etc which behave differently in training and evaluation mode
  model.eval()
  
  test_loss = 0
  test_total = 0
  test_correct = 0

  with torch.no_grad():
      for data, target in test_loader:
          # Load the input features and labels from the test dataset
          data, target = data.to(device), target.to(device)
          
          # Make predictions: Pass image data from test dataset, make predictions about class image belongs to (0-9 in this case)
          output = model(data)
          
          # Compute the loss sum up batch loss
          test_loss += F.nll_loss(output, target, reduction='sum').item()
          
          scores, predictions = torch.max(output.data, 1)
          test_total += target.size(0)
          test_correct += int(sum(predictions == target))
          
  acc = round((test_correct / test_total) * 100, 2)
  print(' Test_loss: {}, Test_accuracy: {}'.format(test_loss/test_total, acc))
  wandb.log({'Test Loss': test_loss/test_total, 'Test Accuracy': acc})


## Let's train it

In [11]:
net = Net().to(device)
print(net)

optimizer = optim.Adam(net.parameters())

Net(
  (conv1): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1))
  (conv2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1))
  (conv3): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
  (conv4): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
  (pool1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (pool2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=1600, out_features=512, bias=True)
  (fc2): Linear(in_features=512, out_features=128, bias=True)
  (fc3): Linear(in_features=128, out_features=10, bias=True)
)


In [12]:
wandb.init(project='dropoutbn')
wandb.watch(net, log='all')

for epoch in range(10):
  train(net, device, trainloader, optimizer, epoch, steps_per_epoch=50000//32)
  test(net, device, testloader, classes)

print('Finished Training')

Epoch [0], Loss: 0.762065589427948, Accuracy: 45.91,  Test_loss: 1.1432553052902221, Test_accuracy: 59.17
Epoch [1], Loss: 1.5598825216293335, Accuracy: 65.06,  Test_loss: 0.9075411169052124, Test_accuracy: 67.82
Epoch [2], Loss: 0.5700869560241699, Accuracy: 72.46,  Test_loss: 0.8052263621330261, Test_accuracy: 72.72
Epoch [3], Loss: 0.5809231400489807, Accuracy: 77.45,  Test_loss: 0.7728524807929993, Test_accuracy: 73.53
Epoch [4], Loss: 0.6602230072021484, Accuracy: 81.08,  Test_loss: 0.8378544267654419, Test_accuracy: 72.13
Epoch [5], Loss: 0.6705617904663086, Accuracy: 84.43,  Test_loss: 0.8033297297954559, Test_accuracy: 74.79
Epoch [6], Loss: 0.10596936196088791, Accuracy: 87.31,  Test_loss: 0.9438407429695129, Test_accuracy: 73.73
Epoch [7], Loss: 0.17943860590457916, Accuracy: 89.68,  Test_loss: 0.9510077392578125, Test_accuracy: 73.43
Epoch [8], Loss: 0.17485202848911285, Accuracy: 91.63,  Test_loss: 1.0446744371414185, Test_accuracy: 74.01
Epoch [9], Loss: 0.0326172858476638

## Dropout


In [0]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(3, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 32, 3, 1)
        self.conv3 = nn.Conv2d(32, 64, 3, 1)
        self.conv4 = nn.Conv2d(64, 64, 3, 1)

        self.pool1 = torch.nn.MaxPool2d(2)
        self.pool2 = torch.nn.MaxPool2d(2)

        self.drop1 = torch.nn.Dropout()
        self.drop2 = torch.nn.Dropout()

        self.fc1 = nn.Linear(1600, 512)
        self.fc2 = nn.Linear(512, 128)
        self.fc3 = nn.Linear(128, 10)

    def forward(self, x):
        ## Conv 1st Block
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = self.drop1(x)
        x = self.pool1(x)
        x = F.relu(self.conv3(x))
        x = F.relu(self.conv4(x))
        x = self.drop1(x)
        x = self.pool2(x)
        
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        output = F.log_softmax(x, dim=1)
        return output

In [0]:
del net

In [15]:
net = Net().to(device)
optimizer = optim.Adam(net.parameters())

wandb.init(project='dropoutbn')
wandb.watch(net, log='all')

for epoch in range(10):
  train(net, device, trainloader, optimizer, epoch, steps_per_epoch=50000//32)
  test(net, device, testloader, classes)

print('Finished Training')

Epoch [0], Loss: 1.3624159097671509, Accuracy: 46.28,  Test_loss: 1.4335350509643554, Test_accuracy: 55.48
Epoch [1], Loss: 1.0600898265838623, Accuracy: 61.22,  Test_loss: 1.2644049947738647, Test_accuracy: 59.83
Epoch [2], Loss: 0.7622907161712646, Accuracy: 66.91,  Test_loss: 1.1126571146011353, Test_accuracy: 63.32
Epoch [3], Loss: 0.8105753064155579, Accuracy: 70.28,  Test_loss: 1.042283620452881, Test_accuracy: 66.07
Epoch [4], Loss: 0.709924578666687, Accuracy: 73.27,  Test_loss: 1.0024064876556396, Test_accuracy: 66.34
Epoch [5], Loss: 0.432053804397583, Accuracy: 75.23,  Test_loss: 0.9317248403549194, Test_accuracy: 69.53
Epoch [6], Loss: 0.6623861789703369, Accuracy: 77.09,  Test_loss: 0.8552054161071777, Test_accuracy: 71.3
Epoch [7], Loss: 0.6431245803833008, Accuracy: 78.63,  Test_loss: 0.8708825876235962, Test_accuracy: 72.19
Epoch [8], Loss: 0.502291202545166, Accuracy: 79.91,  Test_loss: 0.849602875995636, Test_accuracy: 71.89
Epoch [9], Loss: 1.0329746007919312, Accura

Note:

1. The model didn't overfit. 
2. It took longer to converge. This can be because ensemble learning take time. Not every neuron was available while learning.

## Batch Normalization

In [0]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(3, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 32, 3, 1)
        self.conv3 = nn.Conv2d(32, 64, 3, 1)
        self.conv4 = nn.Conv2d(64, 64, 3, 1)

        self.pool1 = torch.nn.MaxPool2d(2)
        self.pool2 = torch.nn.MaxPool2d(2)

        self.bn1 = torch.nn.BatchNorm2d(32)
        self.bn2 = torch.nn.BatchNorm2d(64)

        self.fc1 = nn.Linear(1600, 512)
        self.fc2 = nn.Linear(512, 128)
        self.fc3 = nn.Linear(128, 10)

    def forward(self, x):
        ## Conv 1st Block
        x = self.conv1(x)
        x = self.bn1(x)
        x = F.relu(x)
        x = F.relu(self.conv2(x))
        x = self.pool1(x)

        x = self.conv3(x)
        x = self.bn2(x)
        x = F.relu(x)
        x = F.relu(self.conv4(x))
        x = self.pool2(x)
        
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        output = F.log_softmax(x, dim=1)
        return output

In [0]:
del net

In [21]:
net = Net().to(device)
optimizer = optim.Adam(net.parameters())

wandb.init(project='dropoutbn')
wandb.watch(net, log='all')

for epoch in range(10):
  train(net, device, trainloader, optimizer, epoch, steps_per_epoch=50000//32)
  test(net, device, testloader, classes)

print('Finished Training')

Epoch [0], Loss: 1.2230021953582764, Accuracy: 52.4,  Test_loss: 1.0370957496643067, Test_accuracy: 63.41
Epoch [1], Loss: 0.9987068176269531, Accuracy: 69.23,  Test_loss: 0.8546534811973572, Test_accuracy: 70.51
Epoch [2], Loss: 0.3769473433494568, Accuracy: 75.29,  Test_loss: 0.7601750534057617, Test_accuracy: 74.39
Epoch [3], Loss: 0.4631640613079071, Accuracy: 79.41,  Test_loss: 0.748601777267456, Test_accuracy: 75.26
Epoch [4], Loss: 0.6603724956512451, Accuracy: 82.82,  Test_loss: 0.8692990935325623, Test_accuracy: 72.67
Epoch [5], Loss: 0.08839832246303558, Accuracy: 85.81,  Test_loss: 0.75582248544693, Test_accuracy: 76.97
Epoch [6], Loss: 0.1988542228937149, Accuracy: 88.21,  Test_loss: 0.8076319061279297, Test_accuracy: 75.69
Epoch [7], Loss: 0.42038971185684204, Accuracy: 90.4,  Test_loss: 0.9096485228061676, Test_accuracy: 75.28
Epoch [8], Loss: 0.4379485845565796, Accuracy: 91.9,  Test_loss: 1.0178819646835326, Test_accuracy: 75.0
Epoch [9], Loss: 0.8714055418968201, Accur

Note:
1. The model converges quickly.
2. Because the model is still simple for the standard of BN, overfitting occured. 

Let's see Batch Normalization and Dropout in action together.

## Batch Normalization and Dropout

In [0]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(3, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 32, 3, 1)
        self.conv3 = nn.Conv2d(32, 64, 3, 1)
        self.conv4 = nn.Conv2d(64, 64, 3, 1)

        self.pool1 = torch.nn.MaxPool2d(2)
        self.pool2 = torch.nn.MaxPool2d(2)

        self.bn1 = torch.nn.BatchNorm2d(32)
        self.bn2 = torch.nn.BatchNorm2d(64)

        self.drop1 = torch.nn.Dropout()
        self.drop2 = torch.nn.Dropout()

        self.fc1 = nn.Linear(1600, 512)
        self.fc2 = nn.Linear(512, 128)
        self.fc3 = nn.Linear(128, 10)

    def forward(self, x):
        ## Conv 1st Block
        x = self.conv1(x)
        x = self.bn1(x)
        x = F.relu(x)
        x = F.relu(self.conv2(x))
        x = self.drop1(x)
        x = self.pool1(x)

        x = self.conv3(x)
        x = self.bn2(x)
        x = F.relu(x)
        x = F.relu(self.conv4(x))
        x = self.drop2(x)
        x = self.pool2(x)
        
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        output = F.log_softmax(x, dim=1)
        return output

In [0]:
del net

In [30]:
net = Net().to(device)
optimizer = optim.Adam(net.parameters())

wandb.init(project='dropoutbn')
wandb.watch(net, log='all')

for epoch in range(10):
  train(net, device, trainloader, optimizer, epoch, steps_per_epoch=50000//32)
  test(net, device, testloader, classes)

print('Finished Training')

Epoch [0], Loss: 1.483834981918335, Accuracy: 50.59,  Test_loss: 1.3474351978302002, Test_accuracy: 55.21
Epoch [1], Loss: 1.0609911680221558, Accuracy: 64.46,  Test_loss: 1.1758262783050537, Test_accuracy: 61.54
Epoch [2], Loss: 0.6946431398391724, Accuracy: 70.01,  Test_loss: 0.9932758785247803, Test_accuracy: 67.75
Epoch [3], Loss: 0.5579437017440796, Accuracy: 73.12,  Test_loss: 0.9274280029296875, Test_accuracy: 70.64
Epoch [4], Loss: 0.40007323026657104, Accuracy: 75.78,  Test_loss: 0.8967650428771973, Test_accuracy: 71.24
Epoch [5], Loss: 0.22875693440437317, Accuracy: 77.66,  Test_loss: 0.9264094530105591, Test_accuracy: 69.76
Epoch [6], Loss: 0.4890390634536743, Accuracy: 79.2,  Test_loss: 0.8634264698028564, Test_accuracy: 71.48
Epoch [7], Loss: 0.571390688419342, Accuracy: 80.38,  Test_loss: 0.8696962162017823, Test_accuracy: 70.65
Epoch [8], Loss: 0.2613355219364166, Accuracy: 81.8,  Test_loss: 0.8504735969543457, Test_accuracy: 70.83
Epoch [9], Loss: 0.28765809535980225, A

Note:

1. Overfitting was avoided. 
2. The model converged better as compared to Dropout layer. 