In [1]:
import torch
import torchvision
import torchvision.transforms as transforms
import numpy as np

In this tutorial we will train a ConvNet to classify images from the CIFAR-10 dataset.

The CIFAR-10 dataset is included in the torchvision package, which we installed in our first tutorial.

In this image classification tutorial we classify images into the 10 classes given below:

plane, car, bird, cat, deer, dog, frog, horse, ship, truck

By specifying <b>train = True</b> we load the training split of the dataset; with <b>train = False</b> we load the test split.

In [5]:
from torchvision import datasets
import torchvision.transforms as transforms
from torch.utils.data.sampler import SubsetRandomSampler

# Number of worker subprocesses used for data loading.
num_workers = 1
# Number of samples per mini-batch.
batch_size = 10

# Convert each image to a float tensor, then normalise every one of the three
# channels with mean 0.5 and std 0.5.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])

# Training and test splits of CIFAR-10, stored under ./data
# (downloaded on first use).
train_data = datasets.CIFAR10('data', train=True,
                              download=True, transform=transform)
test_data = datasets.CIFAR10('data', train=False,
                             download=True, transform=transform)

# Wrap the datasets in batch iterators.
# The training loader reshuffles the samples every epoch so that the optimizer
# does not see the same batches in the same order each time.
train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size,
                                           shuffle=True, num_workers=num_workers)

# Sample order is irrelevant for evaluation, so the test loader is not shuffled.
test_loader = torch.utils.data.DataLoader(test_data, batch_size=batch_size,
                                          shuffle=False, num_workers=num_workers)

Files already downloaded and verified
Files already downloaded and verified


The `transforms` module of the torchvision package provides a large number of image transformation / augmentation operations; some examples are shown in the sample code below -

```python 
mean = [0.485, 0.456, 0.406]
std = [0.229, 0.224, 0.225]

data_transforms_train = transforms.Compose([transforms.RandomResizedCrop(size=256, scale=(0.8, 1.0)),
    transforms.RandomHorizontalFlip(), # randomly flip
    transforms.RandomRotation(degrees=15),  # Random rotation
    transforms.CenterCrop(size=224), # Center image crop
    transforms.ColorJitter(),                                            
    transforms.ToTensor(),
    transforms.Normalize(mean,std)])

data_transforms_test = transforms.Compose([transforms.Resize(size=256),
    transforms.CenterCrop(size=224), 
    transforms.ToTensor(),
    transforms.Normalize(mean,std)])
```

Documentation Link - https://pytorch.org/docs/stable/torchvision/transforms.html

##### Please note: it is advisable to use different transformations for the train & test sets (as shown above)
- for simplicity, I have used a single transformation for both the train & test sets in the code.

#### Examine the datasets

#### Get a batch from the training data
The DataLoader object divides the dataset into batches. Here we examine the first batch of the training data set

In [8]:
# Take the first (images, labels) batch that the training loader yields.
images_batch, labels_batch = next(iter(train_loader))

#### Check out the shape of this batch of images
The first dimension gives the number of images. The next dimension represents the number of channels. The last two give the image size

In [10]:
# Show the shapes of the image batch and of the matching label batch.
(images_batch.shape, labels_batch.shape)

(torch.Size([10, 3, 32, 32]), torch.Size([10]))

# Defining the architecture

In [11]:
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    """Small convolutional network for 3 x 32 x 32 images (e.g. CIFAR-10).

    Three stages of 3x3 convolution (padding 1) + ReLU + 2x2 max pooling turn
    the input into a 64 x 4 x 4 feature map, which is flattened and passed
    through two fully connected layers with dropout in between.

    Args:
        num_classes: number of output scores (logits) per image. Default 10.
        dropout_p: dropout probability applied before each fully connected
            layer. Default 0.25.
    """

    def __init__(self, num_classes=10, dropout_p=0.25):
        super().__init__()
        # nn.Conv2d(in_channels, out_channels, kernel_size, stride=1, padding=0, bias=True)
        # convolutional layer (sees 32x32x3 image tensor)
        self.conv1 = nn.Conv2d(3, 16, 3, padding=1)
        # convolutional layer (sees 16x16x16 tensor after the first pooling)
        self.conv2 = nn.Conv2d(16, 32, 3, padding=1)
        # convolutional layer (sees 8x8x32 tensor after the second pooling)
        self.conv3 = nn.Conv2d(32, 64, 3, padding=1)
        # max pooling layer: halves height and width, keeps the channel count
        self.pool = nn.MaxPool2d(2, 2)
        # linear layer (64 * 4 * 4 -> 500)
        self.fc1 = nn.Linear(64 * 4 * 4, 500)
        # output layer (500 -> num_classes)
        self.fc2 = nn.Linear(500, num_classes)
        # dropout layer shared by both fully connected stages
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, x):
        """Return raw class scores of shape (batch, num_classes).

        Args:
            x: image batch of shape (batch, 3, 32, 32).

        Raises:
            RuntimeError: if the feature map produced by the convolutional
                stages does not hold 64 * 4 * 4 values per sample (i.e. the
                input is not 32 x 32).
        """
        # sequence of convolution + ReLU + max pooling stages
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = self.pool(F.relu(self.conv3(x)))
        # Flatten each sample's feature map. The batch size is passed
        # explicitly (instead of -1) so that an input with the wrong spatial
        # size raises an error rather than being silently reshaped into a
        # different number of samples.
        x = x.view(x.size(0), self.fc1.in_features)
        x = self.dropout(x)
        # hidden layer with ReLU activation
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        # output layer: raw scores, no activation (the loss applies softmax)
        return self.fc2(x)

# Instantiate the CNN defined above and print its layer summary.
model = Net()
print(model)

Net(
  (conv1): Conv2d(3, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv2): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv3): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=1024, out_features=500, bias=True)
  (fc2): Linear(in_features=500, out_features=10, bias=True)
  (dropout): Dropout(p=0.25, inplace=False)
)


Output volume can be calculated with below formula:

- Input: n X n X nc 
- Filter: f X f X nc
- Padding: p
- Stride: s
- Output: [((n+2p-f)/s)+1] X [((n+2p-f)/s)+1] X nc’   (height X width X no of output channels)

nc is the number of channels in the input and filter, while nc’ is the number of filters.

From the above structure you can see that the height/width is reduced while the number of channels is increased.

Example: calculating the output of the first convolution + pooling layer operation -

Input image shape - 32(n) X 32(n) X 3(nc)

#### 1. ConVNet filter operation - self.conv1 = nn.Conv2d(3, 16, 3, padding=1)

torch.nn.Conv2d(in_channels, out_channels, kernel_size, stride=1, padding=0, bias=True)

Filter shape - 3 (f) X 3 (f) X 3 (nc)
Padding : p = 1
Stride : s = 1 (default value)
Output channels - 16 (out_channels, i.e. the number of filters nc’)

putting it in the formula given above - 

[((n+2p-f)/s)+1] X [((n+2p-f)/s)+1] X nc’ 

[((32 + 2X1 - 3) / 1) + 1] X [((32 + 2X1 - 3) / 1) + 1] X 16

output shape -> 32 X 32 X 16

##### 2. output of conv1 is passed through max pooling layer.

self.pool = nn.MaxPool2d(2, 2) -> filter of 2 X 2.

this will shrink the height & width by half , however no of channels will remain same.

input to the pooling layer - 32 X 32 X 16

output of the pooling layer - 32/2 X 32/2 X 16 -> 16 X 16 X 16

##### Defining the learning rate , loss function and Optimizer

In [12]:
# Step size used by the optimizer.
learning_rate = 1e-3

# Cross-entropy loss between the model's raw class scores and the labels.
criterion = nn.CrossEntropyLoss()

# Adam optimizer over all of the model's parameters.
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

### Start the training

In [13]:
# Number of mini-batches per epoch.
total_step = len(train_loader)
num_epochs = 10
# Print the loss of the current mini-batch every `log_every` steps.
log_every = 2000

for epoch in range(num_epochs):
    # Put the model in training mode so dropout is active, even when this cell
    # is re-run after the model has been switched to eval mode.
    model.train()
    for i, (images, labels) in enumerate(train_loader):
        # Forward pass: class scores and loss for this mini-batch.
        outputs = model(images)
        loss = criterion(outputs, labels)

        # Backward pass: clear old gradients, backpropagate, update weights.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if (i + 1) % log_every == 0:
            print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
                  .format(epoch + 1, num_epochs, i + 1, total_step, loss.item()))

Epoch [1/10], Step [2000/5000], Loss: 1.0409
Epoch [1/10], Step [4000/5000], Loss: 1.0061
Epoch [2/10], Step [2000/5000], Loss: 0.8220
Epoch [2/10], Step [4000/5000], Loss: 0.8714
Epoch [3/10], Step [2000/5000], Loss: 0.5746
Epoch [3/10], Step [4000/5000], Loss: 0.4862
Epoch [4/10], Step [2000/5000], Loss: 0.6535
Epoch [4/10], Step [4000/5000], Loss: 0.4694
Epoch [5/10], Step [2000/5000], Loss: 0.4779
Epoch [5/10], Step [4000/5000], Loss: 0.2416
Epoch [6/10], Step [2000/5000], Loss: 0.6477
Epoch [6/10], Step [4000/5000], Loss: 0.3305
Epoch [7/10], Step [2000/5000], Loss: 1.1355
Epoch [7/10], Step [4000/5000], Loss: 0.4103
Epoch [8/10], Step [2000/5000], Loss: 0.5287
Epoch [8/10], Step [4000/5000], Loss: 0.4093
Epoch [9/10], Step [2000/5000], Loss: 0.5138
Epoch [9/10], Step [4000/5000], Loss: 0.3859
Epoch [10/10], Step [2000/5000], Loss: 0.4446
Epoch [10/10], Step [4000/5000], Loss: 0.2691


### Evaluating the model on Test Data

In [14]:
# Switch to evaluation mode (disables dropout).
model.eval()
# No gradients are needed for evaluation.
with torch.no_grad():
    correct = 0
    total = 0
    for images, labels in test_loader:
        outputs = model(images)
        # Index of the highest score per row is the predicted class.
        # argmax avoids assigning to `_` (which would shadow IPython's
        # last-output variable) and the legacy `.data` attribute.
        predicted = outputs.argmax(dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    print('Accuracy of the model on the 10000 test images: {}%'\
          .format(100 * correct / total))

Accuracy of the model on the 10000 test images: 72.84%


: 