# Image Classification CNN

Imports

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image

Device Configuration and Hyper-Parameters

In [2]:
# Seed torch's global RNG before any model or DataLoader is created so that
# weight initialisation and shuffling (which draw from that RNG by default)
# are repeatable across "Restart & Run All". GPU kernels may still introduce
# small run-to-run differences.
seed = 42
torch.manual_seed(seed)

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Hyper-parameters used by the data loaders and the training loop.
num_epochs = 4         # full passes over the training set
batch_size = 4         # images per mini-batch
learning_rate = 0.001  # step size handed to the optimizer

Defining the Image-to-Tensor Transform and Loading the CIFAR-10 Datasets

In [3]:
# Preprocessing applied to every image: convert it to a tensor, then
# normalise each of the three channels with mean 0.5 and std 0.5 so the
# values end up in the range [-1, 1].
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
])

# CIFAR10 train / test splits, downloaded into ./data when not already there.
train_dataset = torchvision.datasets.CIFAR10(
    root='./data',
    train=True,
    download=True,
    transform=transform,
)
test_dataset = torchvision.datasets.CIFAR10(
    root='./data',
    train=False,
    download=True,
    transform=transform,
)

# Mini-batch iterators: the training data is reshuffled every epoch,
# the test data keeps its original order.
train_loader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
)
test_loader = torch.utils.data.DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    shuffle=False,
)

# Human-readable class names, positioned by integer label.
classes = (
    'plane', 'car', 'bird', 'cat', 'deer',
    'dog', 'frog', 'horse', 'ship', 'truck',
)

Files already downloaded and verified
Files already downloaded and verified


Defining CNN

In [4]:
class ConvNet(nn.Module):
    """Small CNN: two conv + max-pool stages followed by three linear layers.

    Designed for 3-channel 32x32 inputs and produces 10 raw class scores
    (logits) per image. No softmax is applied here because the loss function
    used for training handles it.
    """

    def __init__(self):
        """Create the convolutional, pooling and fully connected layers."""
        super().__init__()
        # First convolution: 3 input (RGB) channels -> 6 feature maps, 5x5 kernel.
        self.conv1 = nn.Conv2d(3, 6, 5)
        # 2x2 max pooling with stride 2, reused after both convolutions.
        self.pool = nn.MaxPool2d(2, 2)
        # Second convolution: 6 input channels -> 16 feature maps, 5x5 kernel.
        self.conv2 = nn.Conv2d(6, 16, 5)
        # Classifier head. For a 32x32 input the second pooling stage yields
        # 16 feature maps of 5x5, i.e. 16 * 5 * 5 features per image.
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        # Final layer emits one score per class (10 classes).
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        """Run the forward pass.

        Args:
            x: image batch of shape (N, 3, 32, 32); a single unbatched
                (3, 32, 32) image is also accepted and treated as N = 1.

        Returns:
            Tensor of shape (N, 10) holding the unnormalised class scores.

        Raises:
            RuntimeError: if the spatial size of ``x`` does not produce
                16 * 5 * 5 features per image.
        """
        # Treat an unbatched image as a batch of one so the flatten below
        # always keeps dimension 0 as the batch dimension.
        if x.dim() == 3:
            x = x.unsqueeze(0)
        # conv -> ReLU -> pool, twice.
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        # Flatten everything except the batch dimension. Unlike
        # view(-1, 16 * 5 * 5), this never lets a feature-size mismatch be
        # absorbed into the batch dimension; a wrong input size fails at fc1.
        x = torch.flatten(x, 1)
        # Two hidden fully connected layers with ReLU.
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        # Output layer: raw logits, no activation.
        x = self.fc3(x)

        return x
        

Declaring CNN Instance and Training Loop

In [5]:
# Build the network and place its parameters on the selected device.
model = ConvNet().to(device)
# Cross-entropy loss on the raw logits, optimised with plain SGD.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
# Number of mini-batches that make up one epoch.
n_total_steps = len(train_loader)

for epoch in range(num_epochs):
    # Each iteration yields one mini-batch as an (images, labels) pair;
    # images arrive as [batch, 3, 32, 32].
    for i, (images, labels) in enumerate(train_loader):
        images, labels = images.to(device), labels.to(device)

        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        # Forward pass and loss.
        outputs = model(images)
        loss = criterion(outputs, labels)

        # Backward pass and parameter update.
        loss.backward()
        optimizer.step()

        # Report the current batch loss only every 2000 steps.
        if (i+1) % 2000 != 0:
            continue
        print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{n_total_steps}], Loss: {loss.item():.4f}')

print('Finished Training')
# Optional: persist the trained weights.
# torch.save(model.state_dict(), 'cnn-model.pth')
# print('Saved PyTorch Model State to cnn-model.pth')

Epoch [1/4], Step [2000/12500], Loss: 2.3169
Epoch [1/4], Step [4000/12500], Loss: 2.3105
Epoch [1/4], Step [6000/12500], Loss: 2.2887
Epoch [1/4], Step [8000/12500], Loss: 2.3095
Epoch [1/4], Step [10000/12500], Loss: 2.2967
Epoch [1/4], Step [12000/12500], Loss: 2.2790
Epoch [2/4], Step [2000/12500], Loss: 2.0645
Epoch [2/4], Step [4000/12500], Loss: 1.7463
Epoch [2/4], Step [6000/12500], Loss: 2.0743
Epoch [2/4], Step [8000/12500], Loss: 2.0387
Epoch [2/4], Step [10000/12500], Loss: 2.1045
Epoch [2/4], Step [12000/12500], Loss: 1.7326
Epoch [3/4], Step [2000/12500], Loss: 1.6991
Epoch [3/4], Step [4000/12500], Loss: 2.1486
Epoch [3/4], Step [6000/12500], Loss: 1.7564
Epoch [3/4], Step [8000/12500], Loss: 1.4739
Epoch [3/4], Step [10000/12500], Loss: 1.2541
Epoch [3/4], Step [12000/12500], Loss: 1.4238
Epoch [4/4], Step [2000/12500], Loss: 1.2920
Epoch [4/4], Step [4000/12500], Loss: 1.9099
Epoch [4/4], Step [6000/12500], Loss: 1.3822
Epoch [4/4], Step [8000/12500], Loss: 1.2982
Epoc

Evaluating CNN accuracy

In [6]:
# Switch to inference behaviour before measuring accuracy.
model.eval()

with torch.no_grad():
    # Overall and per-class counters; one slot per entry in `classes`.
    num_classes = len(classes)
    n_correct = 0
    n_samples = 0
    n_class_correct = [0] * num_classes
    n_class_samples = [0] * num_classes

    for images, labels in test_loader:
        # Move the batch to the same device as the model.
        images = images.to(device)
        labels = labels.to(device)

        # Predicted class = index of the largest logit for each image.
        outputs = model(images)
        _, predicted = torch.max(outputs, 1)

        # Overall accuracy counters.
        n_samples += labels.size(0)
        n_correct += (predicted == labels).sum().item()

        # Per-class counters. Iterate over the samples actually present in
        # this batch rather than range(batch_size): the last batch can be
        # smaller than batch_size, which would otherwise raise IndexError.
        for label, pred in zip(labels.tolist(), predicted.tolist()):
            if label == pred:
                n_class_correct[label] += 1
            n_class_samples[label] += 1

    acc = 100.0 * n_correct / n_samples
    print(f'Accuracy of the network: {acc} %')

    for i in range(num_classes):
        # A class with no test samples has no defined accuracy.
        if n_class_samples[i] == 0:
            print(f'Accuracy of {classes[i]}: n/a (no samples)')
            continue
        acc = 100.0 * n_class_correct[i] / n_class_samples[i]
        print(f'Accuracy of {classes[i]}: {acc} %')

Accuracy of the network: 46.32 %
Accuracy of plane: 50.2 %
Accuracy of car: 76.5 %
Accuracy of bird: 23.2 %
Accuracy of cat: 20.7 %
Accuracy of deer: 30.4 %
Accuracy of dog: 47.1 %
Accuracy of frog: 59.6 %
Accuracy of horse: 59.2 %
Accuracy of ship: 56.8 %
Accuracy of truck: 39.5 %


Testing the Model on a Custom Image

In [7]:
# OPEN IMAGE
my_image_path = 'data/car.jpeg'
# Close the file handle once the pixels are loaded, and force 3-channel RGB:
# the normalisation step expects exactly three channels, so a grayscale,
# RGBA or CMYK image would otherwise fail.
with Image.open(my_image_path) as raw_image:
    my_image = raw_image.convert('RGB')

# CENTRE-CROP TO A SQUARE AND RESIZE TO 32X32
# Integer bounds derived from a single side length keep the crop exactly
# square; float bounds can round to a box that is one pixel off.
side = min(my_image.width, my_image.height)
left = (my_image.width - side) // 2
top = (my_image.height - side) // 2
right = left + side
bottom = top + side
my_image = my_image.crop((left, top, right, bottom))
my_image = my_image.resize((32, 32))
# my_image.show()

# TRANSFORM TO TENSOR
tensor_image = transform(my_image)
print(tensor_image.shape)

# FORWARD PASS THROUGH MODEL
print("input shape:", tensor_image.shape)
model.eval()
with torch.no_grad():
    # Add the batch dimension and move the input to the model's device;
    # without the move this fails whenever the model lives on the GPU.
    my_output = model(tensor_image.unsqueeze(0).to(device))
print("output shape:", my_output.shape)
# FIND MAX PREDICTION
_, predicted = torch.max(my_output, 1)
print("Prediction shape:", predicted.shape)
# MAP PREDICTION TO CLASS LABELS
predicted_class = classes[predicted.item()]
print("Item Prediction:", predicted_class)

torch.Size([3, 32, 32])
input shape: torch.Size([3, 32, 32])
output shape: torch.Size([1, 10])
Prediction shape: torch.Size([1])
Item Prediction: car
