# Deep Learning Project
## CIFAR 10 dropout notebook

#### Gaussian dropout, with batch norm

### Imports

In [0]:
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt

### Download the dataset, transform it and divide into batches

In [0]:
# Load the CIFAR-10 training split through ``torchvision``.

# torchvision datasets yield PIL images. ``ToTensor`` converts them to float
# tensors in the range [0, 1]; ``Normalize`` then standardizes each RGB channel
# as (x - mean) / std, so the result is no longer confined to [0, 1].

# This transform is the GLOBAL CONTRAST NORMALIZATION; per the original note,
# its parameters have been calculated at the end of this notebook.
transform = transforms.Compose(
    [
        transforms.ToTensor(),  # Dataset images are in Pillow format, but we need them as tensors
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2427, 0.2392, 0.2559)),
    ]
)

trainSet = torchvision.datasets.CIFAR10(
    root='./data',  # where the dataset is/will be stored
    train=True,  # creates the dataset FROM the train set
    # Download when the archive is missing so the notebook survives
    # "Restart & Run All" on a fresh machine; an existing copy under ``root``
    # is reused instead of being fetched again.
    download=True,
    transform=transform
)


# This combines a dataset and a sampler, and provides an iterable over the given dataset
trainLoader = torch.utils.data.DataLoader(
    trainSet,  # The dataset
    batch_size=100,  # How many samples per batch to load
    shuffle=True,  # Data is reshuffled at every epoch
    num_workers=2  # How many processes to use for data loading
)

# Human-readable class names, indexed by the integer label of each sample.
classes = (
    'plane',
    'car',
    'bird',
    'cat',
    'deer',
    'dog',
    'frog',
    'horse',
    'ship',
    'truck'
)

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Extracting ./data/cifar-10-python.tar.gz to ./data


### Model with Gaussian dropout
**In this case, we had to implement the dropout manually**

In [0]:
# DEFINING A CONVOLUTIONAL NEURAL NETWORK

# Copy the neural network from the Neural Networks section before and modify it to
# take 3-channel images (instead of 1-channel images as it was defined).

class Net(nn.Module):
    """Convolutional classifier with multiplicative Gaussian dropout.

    Architecture: three 5x5 convolutions (96, 128, 256 channels), each followed
    by batch norm, ReLU and a 3x3/stride-2 max-pool, then three fully connected
    layers (2304 -> 2048 -> 1024 -> 10). For 3x32x32 inputs the pooled feature
    map is 256x3x3 = 2304 values, which is what ``forward`` flattens to.

    Before every layer the activations are multiplied by noise drawn from
    N(1, sigma^2) with sigma = sqrt((1 - p) / p), where ``p`` is 0.9 for the
    input, 0.75 for the convolutional feature maps and 0.5 for the fully
    connected layers. The noise is only injected in training mode.
    """

    def __init__(self):

        super(Net, self).__init__()

        # Batch norm for the two hidden fully connected layers
        self.bn1 = nn.BatchNorm1d(num_features=2048)
        self.bn2 = nn.BatchNorm1d(num_features=1024)

        # Batch norm for the three convolutional layers
        self.bnc1 = nn.BatchNorm2d(num_features=96)
        self.bnc2 = nn.BatchNorm2d(num_features=128)
        self.bnc3 = nn.BatchNorm2d(num_features=256)

        # 5x5 convolutions; padding=2 with stride=1 keeps the spatial size
        self.conv1 = nn.Conv2d(3, 96, 5, padding=2, stride=1)
        self.conv2 = nn.Conv2d(96, 128, 5, padding=2, stride=1)
        self.conv3 = nn.Conv2d(128, 256, 5, padding=2, stride=1)

        self.fc1 = nn.Linear(2304, 2048)
        self.fc2 = nn.Linear(2048, 1024)
        self.fc3 = nn.Linear(1024, 10)

        # 3x3 max-pooling with stride 2, shared by all convolutional stages
        self.pool = nn.MaxPool2d(3, 2)

        # Probabilities that parameterize the noise level of each group of layers
        self.p1 = 0.9
        self.p2 = 0.75
        self.p3 = 0.5

        # Standard deviation of the multiplicative noise: sqrt((1 - p) / p)
        self.sigma1 = np.sqrt((1 - self.p1) / self.p1)
        self.sigma2 = np.sqrt((1 - self.p2) / self.p2)
        self.sigma3 = np.sqrt((1 - self.p3) / self.p3)

    def gaussian_dropout(self, x, std, mean):
        """Multiply ``x`` element-wise by noise drawn from N(mean, std^2).

        The noise is sampled directly on the device (and with the dtype) of
        ``x``, so the layer works on both CPU and GPU without a host-to-device
        copy. In evaluation mode (``self.training`` is False) no noise is
        sampled; the input is scaled by ``mean``, the expectation of the noise,
        which leaves it unchanged for the ``mean=1`` used throughout ``forward``.

        Args:
            x: activation tensor of any shape.
            std: standard deviation of the noise.
            mean: mean of the noise.

        Returns:
            A tensor with the same shape as ``x``.
        """
        if not self.training:
            # Deterministic inference: replace the noise by its expectation.
            return x * mean
        epsilon = torch.randn_like(x) * std + mean
        return epsilon * x

    def forward(self, x):
        """Compute the class scores (logits, 10 per sample) for a batch of images."""

        x = self.gaussian_dropout(x, self.sigma1, 1)  # Comment this out to remove dropout
        x = self.pool(F.relu(self.bnc1(self.conv1(x))))

        x = self.gaussian_dropout(x, self.sigma2, 1)  # Comment this out to remove dropout
        x = self.pool(F.relu(self.bnc2(self.conv2(x))))

        x = self.gaussian_dropout(x, self.sigma2, 1)  # Comment this out to remove dropout
        x = self.pool(F.relu(self.bnc3(self.conv3(x))))

        # Flatten the pooled feature maps for the fully connected layers
        x = x.view(-1, 2304)
        x = self.gaussian_dropout(x, self.sigma3, 1)  # Comment this out to remove dropout
        x = F.relu(self.bn1(self.fc1(x)))

        x = self.gaussian_dropout(x, self.sigma3, 1)  # Comment this out to remove dropout
        x = F.relu(self.bn2(self.fc2(x)))

        x = self.gaussian_dropout(x, self.sigma3, 1)  # Comment this out to remove dropout
        x = self.fc3(x)

        return x

## Initialization
### Then we set all parameters for the training phase

For the linear and convolutional layers, we initialize the weights from a uniform distribution. Following "*Understanding the difficulty of training deep feedforward neural networks*" by Glorot and Bengio, we fill the weight tensors with values sampled from $\mathcal{U}(-a, a)$, where

$a = \mathrm{gain} \cdot \sqrt{\frac{6}{\mathrm{fan\_in} + \mathrm{fan\_out}}} $

This is also known as Glorot initialization.

In [0]:
# Detect once whether CUDA is usable; the cells below branch on this flag.
use_gpu = torch.cuda.is_available()
if use_gpu:
    print("Using GPU")
else:
    print("Not using GPU")

def init_weights(m):
    """Initialize one module in place; meant to be passed to ``Module.apply``.

    Modules whose type is exactly ``nn.Linear`` or ``nn.Conv2d`` get
    Xavier/Glorot-uniform weights scaled by the gain recommended for ReLU, and
    a constant bias of -0.05. Every other module is left untouched.
    """
    if type(m) in (nn.Linear, nn.Conv2d):
        # Recommended gain for the given non-linearity
        relu_gain = nn.init.calculate_gain('relu')
        nn.init.xavier_uniform_(m.weight, gain=relu_gain)
        # Alternative kept for reference:
        # nn.init.kaiming_uniform_(m.weight, mode='fan_in', nonlinearity='relu')

        # Constant bias
        m.bias.data.fill_(-0.05)

# Build the network and run ``init_weights`` over every sub-module.
# ``Module.apply`` visits all modules recursively and returns the module itself.
net = Net().apply(init_weights)

# Move the parameters to the GPU when one is available.
if use_gpu:
    net = net.cuda()

# Loss function and optimizer.
# Cross-entropy on the raw network outputs (the softmax is part of the loss),
# optimized by SGD with momentum and L2 weight decay.
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.002, momentum=0.8, weight_decay=0.001)

# Alternatives kept for reference (disabled):
#optimizer = optim.Adam(net.parameters(), lr=0.01, betas=(0.9, 0.999), eps=1e-08, weight_decay=0.01, amsgrad=False)
#scheduler = torch.optim.lr_scheduler.CyclicLR(optimizer, 0.0012, .00044, step_size_up=2000, step_size_down=None, mode='triangular',
#                                              gamma=1.0, scale_fn=None, scale_mode='cycle', cycle_momentum=True, base_momentum=0.8,
#                                              max_momentum=0.9, last_epoch=-1)

Using GPU


## Finally, we train our model

**First, we define the ZCA Whitening function and apply it during training**

In [0]:

# TensorBoard logging: the training loop records the per-batch loss through this writer.
from tensorboardX import SummaryWriter
# The writer is pointed at the ./runs directory.
writer = SummaryWriter("./runs")

def zca(inputs, epsilon=1e-15):
    """Apply a batch-wise ZCA-style whitening and return the batch in its original shape.

    The batch is flattened to ``(N, D)`` (``N`` = size of the first axis),
    every column is centered over the batch, and the result is multiplied from
    the left by ``U diag(1 / sqrt(S + epsilon)) U^T``, where ``U`` and ``S``
    come from the SVD of ``np.cov(X, rowvar=True)``.

    ``rowvar=True`` treats each row (one flattened sample) as a variable, so
    the covariance is ``N x N`` and the whitening matrix mixes the samples of
    the batch with each other instead of mixing features.
    NOTE(review): textbook ZCA whitening uses the ``D x D`` feature covariance
    estimated on the whole training set; confirm that this batch-wise form is
    what the experiment intends.

    Args:
        inputs: array-like or ``torch.Tensor`` whose first axis indexes the
            samples. Tensors are detached and moved to the CPU first, so CUDA
            or grad-tracking tensors are accepted as well.
        epsilon: regularizer added to the singular values before taking the
            inverse square root. The default keeps the original value of 1e-15.

    Returns:
        ``numpy.ndarray`` of dtype float64 with the same shape as ``inputs``.
    """
    if isinstance(inputs, torch.Tensor):
        # Explicit conversion instead of relying on NumPy's implicit coercion.
        inputs = inputs.detach().cpu().numpy()
    batch = np.asarray(inputs, dtype=np.float64)
    initial_shape = batch.shape

    # Flatten every sample to a row and center each column over the batch
    X = batch.reshape(initial_shape[0], -1)
    X = X - X.mean(axis=0)

    # Covariance between the rows of the batch: cov is (N, N)
    cov = np.cov(X, rowvar=True)

    # Singular value decomposition: U is (N, N), S is (N,)
    U, S, _ = np.linalg.svd(cov)

    # Build the ZCA matrix, (N, N), and transform the data, (N, D)
    zca_matrix = np.dot(U, np.dot(np.diag(1.0 / np.sqrt(S + epsilon)), U.T))
    whitened = np.dot(zca_matrix, X)

    return whitened.reshape(initial_shape)

In [0]:
# TRAIN THE NETWORK

# Loop over the data iterator, feed every mini-batch to the network and
# optimize. The loss of every mini-batch is logged to TensorBoard, and the
# average loss is printed every 100 mini-batches.

net.train()  # training mode

epochs = 200
indice = 0  # global mini-batch counter, used as the TensorBoard step

for epoch in range(epochs):  # loop over the dataset multiple times
    running_loss = 0.0

    for i, data in enumerate(trainLoader, 0):

        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data

        # ZCA whitening; the result is converted (back) to a float32 tensor
        inputs = zca(inputs)
        inputs = torch.tensor(inputs, dtype=torch.float)

        if use_gpu:
            inputs = inputs.cuda()
            labels = labels.cuda()

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Read the loss once as a plain Python float, so that neither the
        # logger nor the running sum holds on to the graph-attached tensor.
        loss_value = loss.item()
        writer.add_scalar('gaussian_BN', loss_value, indice)
        running_loss += loss_value

        # print statistics
        if i % 100 == 99:    # print every 100 mini-batches
            print('[%d, %5d] loss: %.3f' %(epoch + 1, i + 1, running_loss/100))
            running_loss = 0.0

        indice += 1
        #scheduler.step()
        #print('\nLearning rate at this epoch is: %0.9f' % scheduler.get_lr()[0])
print("Done")

[1,   100] loss: 2.617
[1,   200] loss: 2.297
[1,   300] loss: 2.289
[1,   400] loss: 2.262
[1,   500] loss: 2.221
[2,   100] loss: 2.190
[2,   200] loss: 2.161
[2,   300] loss: 2.135
[2,   400] loss: 2.106
[2,   500] loss: 2.084
[3,   100] loss: 2.070
[3,   200] loss: 2.056
[3,   300] loss: 2.053
[3,   400] loss: 2.030
[3,   500] loss: 2.009
[4,   100] loss: 2.004
[4,   200] loss: 1.984
[4,   300] loss: 1.968
[4,   400] loss: 1.961
[4,   500] loss: 1.967
[5,   100] loss: 1.951
[5,   200] loss: 1.925
[5,   300] loss: 1.937
[5,   400] loss: 1.908
[5,   500] loss: 1.902
[6,   100] loss: 1.889
[6,   200] loss: 1.867
[6,   300] loss: 1.865
[6,   400] loss: 1.844
[6,   500] loss: 1.833
[7,   100] loss: 1.816
[7,   200] loss: 1.812
[7,   300] loss: 1.782
[7,   400] loss: 1.801
[7,   500] loss: 1.748
[8,   100] loss: 1.743
[8,   200] loss: 1.720
[8,   300] loss: 1.724
[8,   400] loss: 1.717
[8,   500] loss: 1.708
[9,   100] loss: 1.676
[9,   200] loss: 1.692
[9,   300] loss: 1.661
[9,   400] 

In [0]:
# Save a training checkpoint: number of epochs, model and optimizer state, and
# the last mini-batch loss. ``torch.save`` writes the dictionary to disk
# (``torch.load`` is the call that reads such a file back).
torch.save({
        'epoch': epochs,
        'model_state_dict': net.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': loss
    }, "./gaussian_BN.pt")

### And finally we get to testing

# Testing phase

In [0]:
# Evaluate the trained network on the CIFAR-10 test split.
net.eval()  # evaluation mode

testSet = torchvision.datasets.CIFAR10(
    root='./data',  # where the dataset is/will be stored
    train=False,  # creates the dataset FROM the test set
    download=True,  # If True, downloads the dataset from the internet
    transform=transform
)


# This combines a dataset and a sampler, and provides an iterable over the given dataset
testLoader = torch.utils.data.DataLoader(
    testSet,  # The dataset
    batch_size=10,  # How many samples per batch to load
    shuffle=False,  # The accuracy does not depend on the order of the samples
    num_workers=2  # How many processes to use for data loading
)

correct = 0
total = 0

# No gradients are needed for evaluation; disabling autograd avoids building a
# graph for every test batch.
with torch.no_grad():
    for images, labels in testLoader:

        # NOTE(review): the training inputs are ZCA-whitened, but the
        # whitening is disabled here, so the test inputs are preprocessed
        # differently. Confirm that this is intended before relying on the
        # reported accuracy.
        #images = zca(images)
        #images = torch.tensor(images, dtype=torch.float)

        if use_gpu:
            images = images.cuda()
            labels = labels.cuda()

        predicted = net(images)

        # The predicted class is the index of the largest score in each row
        correct += (predicted.argmax(dim=1) == labels).sum().item()
        total += labels.size(0)

print("Total accuracy:", correct/total)

NameError: ignored