In [1]:
import os

import numpy as np
import pandas as pd

import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, TensorDataset, DataLoader, Subset
from torchvision.datasets import ImageFolder
import torchvision.models as models

import matplotlib.pyplot as plt
import random

import shutil
from pathlib import Path

# this is the same function from the lecture; included here to easily set the seed for random functions
def set_seed(seed):
    """
    Use this to set ALL the random seeds to a fixed value and take out any randomness from cuda kernels

    Args:
        seed (int): The value used to seed python's `random`, numpy and torch (CPU and every GPU)

    Returns:
        bool: Always True
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # the cudnn auto-tuner may pick a different convolution algorithm on every run, which makes
    # results non-reproducible - switch it off and ask cudnn for deterministic algorithms instead
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.enabled   = True

    return True

# choose the compute device: 'cuda' when at least one usable GPU is reported, otherwise 'cpu'
if torch.cuda.device_count() > 0 and torch.cuda.is_available():
    device = 'cuda'
    print("Cuda installed! Running on GPU!")
else:
    device = 'cpu'
    print("No GPU available!")

# hand unused cached GPU memory held by PyTorch's allocator back to the driver
torch.cuda.empty_cache()

Cuda installed! Running on GPU!


In [2]:
out_path = './out'

# make sure the output working directory exists (nothing happens if it is already there)
os.makedirs(out_path, exist_ok=True)

print(os.listdir(out_path))

[]


# Introduction

- In this notebook we will construct the Ensembler that was used to make the final submission for our group project.

- This ensembler is based on a weighted vote from the models that were deemed to provide good performance and variant architectures

- If we had been given more time, we would have constructed an ensemble classifier, with a prediction derived from *per class* weights. Since we were short on time, we applied a weighting based on overall accuracy. This nonetheless gave very good performance.

### Processing the Data

- In this section we process the input test data to get it ready for our model

- This consists of normalizing the test data based on our training data statistics. The normalized sample, $\hat x$ is given by:

$$ \hat x = \frac{x - \bar X}{\sigma_X}$$

$$
\begin{align}
&x: \text{a single sample image}\\
&\bar X: \text{mean for all the training data, per RGB channel}\\
&\sigma_X: \text{standard deviation for all the training data, per RGB channel}
\end{align}
$$


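- The ***CustomImageTensorDataset*** class, which was used throughout the project to wrap in-memory image arrays and apply a transform (such as the normalization above) to each sample, is defined here for completeness; in this notebook the test data is loaded through ***ImageFolderWithPaths*** instead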


- The ***ImageFolderWithPaths*** class is used to load the images together with their corresponding file paths. This allows the construction of the csv in the required submission format

In [3]:
# define a custom image dataset class - this will be reused whenever we wish to load data into a dataloader
class CustomImageTensorDataset(Dataset):
    """Dataset over in-memory image arrays and their labels, with an optional per-sample transform."""

    def __init__(self, data, targets, transform=None):
        """
        Args:
            data: Indexable collection of images; each item is passed to `torch.from_numpy`
                and permuted with (2, 0, 1), so it must be a 3-dimensional numpy array
                (presumably height x width x channel - confirm against the caller)
            targets: Indexable collection holding the label of every image
            transform (callable, optional): Optional transform applied to each sample
                after it has been converted to a scaled tensor
        """
        self.data = data
        self.targets = targets
        self.transform = transform

    def __len__(self):
        # the number of samples is the number of images
        return len(self.data)

    def __getitem__(self, idx):
        image = self.data[idx]
        label = self.targets[idx]
        # move the last axis to the front and divide by 255
        tensor = torch.from_numpy(image).permute(2, 0, 1) / 255.
        if self.transform:
            tensor = self.transform(tensor)
        return tensor, label
    
# this is needed for the test function below. It loads the test dataset correctly
class ImageFolderWithPaths(ImageFolder):
    """ImageFolder variant whose items also carry the path of the image file.

    Extends torchvision.datasets.ImageFolder.
    Inspiration for this class from https://gist.github.com/andrewjong/6b02ff237533b3b2c554701fb53d5c4d
    """

    def __getitem__(self, index):
        # __getitem__ is what the dataloader calls; start from the regular ImageFolder item
        base_item = super().__getitem__(index)
        # self.imgs[index] starts with the image file path
        file_path = self.imgs[index][0]
        # append the path so the item becomes (*base_item, file_path)
        return base_item + (file_path,)

train_path = './train/'
test_path = './test/'

# build the dataset structure (a map of files and labels - the image files are not read here)
train_data = ImageFolder(root=train_path, transform=None)

# sanity-check the directory layout.
# note: os.listdir counts every directory entry, so stray non-data entries (e.g. hidden files)
# push the folder counts above the expected values
test_images_dir = os.path.join(test_path, 'images')
print('number of training samples (expect 100000):', len(train_data))
print('number of training set classes (expect 200):', len(os.listdir(train_path)))
print('number of test classes (expect 1):', len(os.listdir(test_path)))
print('number of test images (expect 10000):', len(os.listdir(test_images_dir)))

number of training samples (expect 100000): 100000
number of training set classes (expect 200): 200
number of test classes (expect 1): 2
number of test images (expect 10000): 10001


#### Finding the Normalization Statistics

In [4]:
def get_stats(data_set):
    """
    Collect a data set into arrays and derive its per-channel normalization statistics.
    It was inspired from the discussion at https://discuss.pytorch.org/t/computing-the-mean-and-std-of-dataset/34949

    Note that the statistics are averages of *per-image* values: the returned standard deviation
    is the mean of the individual image standard deviations, not the standard deviation of all
    pixels pooled together.

    Args:
        data_set (torch.utils.data.Dataset): Sized iterable of (image, label) pairs from which we
            wish to derive the normalization statistics; every image must convert to an array
            whose last axis has at least 3 channels

    Returns:
        X_set (np.array): An array containing all the input images in a combined array
        y_set (np.array): An array containing all the corresponding labels in a combined array
        meanRGB/n (np.array): An array containing the per-image means averaged per RGB channel
        stdRGB/n (np.array): An array containing the per-image standard deviations averaged
            per RGB channel

    """
    n_samples = len(data_set)
    images, labels = [], []
    mean_sum = np.zeros(3)
    std_sum = np.zeros(3)

    # accumulate the per-image channel statistics (pixel values scaled by 1/255)
    for img, label in data_set:
        pixels = np.array(img)
        images.append(pixels)
        labels.append(label)
        scaled_channels = [pixels[:, :, channel]/255. for channel in range(3)]
        mean_sum += [values.mean() for values in scaled_channels]
        std_sum += [values.std() for values in scaled_channels]

    # combined image and label arrays, followed by the averaged statistics
    return np.array(images), np.array(labels), mean_sum/n_samples, std_sum/n_samples
    
# materialise the training set and derive the statistics used by Normalize for the test data
X_train, y_train, means, stdevs = get_stats(data_set=train_data)
print('means:', means, 'stdevs: ', stdevs)

means: [0.48024579 0.44807218 0.39754775] stdevs:  [0.2301945  0.22647534 0.2261424 ]


In [5]:
from torchvision.transforms import Compose, ToTensor, Normalize, Resize

# number of test images per batch
test_batch_size = 100

# (the training-side transform / dataset / loader are not needed to make predictions,
#  so only the test pipeline is built in this notebook)

# resize the test images so they are the right size for our model architectures,
# convert them to tensors and normalize with the training statistics
test_transform = Compose([Resize(224), ToTensor(), Normalize(mean=means, std=stdevs)])

# transform and load test data; the order is kept (no shuffling)
test_data = ImageFolderWithPaths(root=test_path, transform=test_transform)
test_loader = DataLoader(
    dataset=test_data,
    batch_size=test_batch_size,
    shuffle=False,
    num_workers=0,
)

In [6]:
# check the number of batches times the batch size gives the 10000 images we expect
assert len(test_loader) * test_batch_size == 10000

### Models Used in The Ensembler Prediction

- Below is a list of the models, showing how to load each one for the ensemble. There is also a demonstration of how to use the ***modified_test*** function to make individual predictions


- The ***modified_test*** function is used to make predictions on an input dataset. Where it differs from the ***test*** function used elsewhere in the project is that it also returns the log softmax values too. This is to enable it to be used to construct a weighted prediction based on all the models

In [7]:
# data_loader must be with paths
def modified_test(model, data_loader):
    """
    Explanation:
        This function makes predictions on the test data from a single model
    Args:
        model (nn.Module): A pytorch neural net model
        data_loader (DataLoader): A pytorch dataloader for the test data; every batch must be a
            (images, labels, file_paths) triple. The labels are not used.
    Returns:
        log_probs (np.array): An array containing the log-softmax output of the model for every
            sample, with the batches concatenated along the first axis
        y_preds (np.array): An array containing the class label predictions based on the softmax classifier
        file_names: (np.array): An array containing the filenames; this is used to construct the kaggle submission

    """
    model.eval()

    # send the inputs to wherever the model's weights live; only a model without any
    # parameters falls back to the notebook-level `device`
    first_param = next(model.parameters(), None)
    model_device = device if first_param is None else first_param.device

    log_probs, y_preds, file_names = [], [], []
    with torch.no_grad():
        for X, _, file_paths in data_loader:
            # as_tensor is a no-op for tensors and also accepts numpy batches
            X = torch.as_tensor(X).to(model_device)

            # forward pass through the model
            a2 = model(X)

            # we will return this too so as to be able to work out a proper weighted average
            log_prob = F.log_softmax(a2, dim=1)

            # make the predictions based off our model
            y_pred = log_prob.max(1)[1]

            # set up the return arrays
            log_probs.append(log_prob.cpu().numpy())
            y_preds.append(y_pred.cpu().numpy())
            # keep only the file name, whatever path separator the platform uses
            file_names.extend(os.path.basename(path) for path in file_paths)

    return np.concatenate(log_probs, 0), np.concatenate(y_preds, 0), np.array(file_names)

#### Resnet18

This is the first model we submitted to Kaggle. It got a score of **0.70671** on the public leaderboard. We did not use this model in the final construction, but it gives an example of how to make a prediction using the ***modified_test*** function.

In [8]:
# build the architecture only: every weight is overwritten by the checkpoint loaded below
# (all keys match), so downloading the ImageNet weights first would be wasted work
resnet18_ = models.resnet18(pretrained=False)

# adjust the architecture for our data set (200 output classes)
num_ftrs = resnet18_.fc.in_features
resnet18_.fc = nn.Linear(num_ftrs, 200)

# utilize GPU if possible
resnet18_ = resnet18_.to(device)

# load the saved weights; map_location lets a checkpoint saved on a GPU load on a CPU-only machine
resnet18_.load_state_dict(torch.load('RESNET_DROPOUT.pth', map_location=device))

<All keys matched successfully>

In [9]:
# run resnet18 over the test set: log-softmax scores, predicted labels and the image file names
resnet18_probs, resnet18_preds, resnet18_file_names = modified_test(model=resnet18_, data_loader=test_loader)

#### Wide Resnet 101-2

This is the second model architecture we submitted to Kaggle. This got a score of **0.81555** on the public leaderboard when we trained it on a subset of the data. It got a score of **0.84109** once trained on the whole training data set. It was included in the ensembler.

In [10]:
# build the architecture only: every weight is overwritten by the checkpoint loaded below
# (all keys match), so downloading the ImageNet weights first would be wasted work
wide_resnet101_2_ = models.wide_resnet101_2(pretrained=False)

# adjust the architecture for our data set (200 output classes)
num_ftrs = wide_resnet101_2_.fc.in_features
wide_resnet101_2_.fc = nn.Linear(num_ftrs, 200)

# utilize GPU if possible (moved once, after the new classification layer is in place)
wide_resnet101_2_ = wide_resnet101_2_.to(device)

# load the saved weights; map_location lets a checkpoint saved on a GPU load on a CPU-only machine
wide_resnet101_2_.load_state_dict(torch.load('RESNET_101_wide_fulldata.pth', map_location=device))

<All keys matched successfully>

#### Densenet-121

This was not a model we submitted to Kaggle individually. However in our training and validation tests we found this model had good performance, so we included it in the ensembler.

In [11]:
# build the architecture only: every weight is overwritten by the checkpoint loaded below
# (all keys match), so downloading the ImageNet weights first would be wasted work
densenet121_ = models.densenet121(pretrained=False)

# adjust the architecture for our data set (200 output classes)
num_ftrs = densenet121_.classifier.in_features
densenet121_.classifier = nn.Linear(num_ftrs, 200)

# utilize GPU if possible
densenet121_ = densenet121_.to(device)

# load the saved weights; map_location lets a checkpoint saved on a GPU load on a CPU-only machine
densenet121_.load_state_dict(torch.load('DenseNet_3.pth', map_location=device))

<All keys matched successfully>

#### Resnet 152

This was not a model we submitted to Kaggle individually. However again, in our training and validation tests we found this model had good performance, so we included it in the ensembler.

In [12]:
# build the architecture only: every weight is overwritten by the checkpoint loaded below
# (all keys match), so downloading the ImageNet weights first would be wasted work
resnet152_ = models.resnet152(pretrained=False)

# adjust the architecture for our data set (200 output classes)
num_ftrs = resnet152_.fc.in_features
resnet152_.fc = nn.Linear(num_ftrs, 200)

# utilize GPU if possible
resnet152_ = resnet152_.to(device)

# load the saved weights; map_location lets a checkpoint saved on a GPU load on a CPU-only machine
resnet152_.load_state_dict(torch.load('resnet152_1.pth', map_location=device))

<All keys matched successfully>

#### ResNeXt-50 (32x4d)

This was again a good model not submitted to Kaggle individually. However we included it in the ensembler as it had good validation performance.

In [13]:
# build the architecture only: every weight is overwritten by the checkpoint loaded below
# (all keys match), so downloading the ImageNet weights first would be wasted work
resnext50_ = models.resnext50_32x4d(pretrained=False)

# adjust the architecture for our data set (200 output classes)
num_ftrs = resnext50_.fc.in_features
resnext50_.fc = nn.Linear(num_ftrs, 200)

# utilize GPU if possible
resnext50_ = resnext50_.to(device)

# load the saved weights; map_location lets a checkpoint saved on a GPU load on a CPU-only machine
resnext50_.load_state_dict(torch.load('resnext50_32x4d_full_train (1).pth', map_location=device))

<All keys matched successfully>

## Ensemble Classifier

- For the ensemble classifier we use a weighted voting algorithm. Ideally we would have weights per model for *each* class. These weights would be trained based on some unseen data. However, as we used all the training data in training our models, we had to find another way to weight the models.


- The method we use is to weight each model based on its validation accuracy. This is definitely not ideal, but under the constraints we faced in terms of time, it was the most robust option. An alternative is to not have individual weights at all and simply count the predicted labels and pick the mode of these. We felt, however, that this would not take into account each model's performance on the test dataset, so we preferred a weighted approach.


- The ***voting_classifier*** function takes in a list of $m$ models, their corresponding weights, $w_i$, and the data loader for the test data on which we want to make predictions. The aggregate score used to make the prediction $\hat x$ is the weighted average of the models' log softmax outputs:

$$ \hat l = \frac{\sum_{i=1}^{m} w_i ~ L_i}{\sum_{i=1}^{m} w_i} $$

where

$$
\begin{align}
& \hat l: \text{the aggregate log softmax array}\\
& w_i: \text{weight for model} ~ i\\
& L_i: \text{array of log softmax outputs of model} ~ i ~ \text{for a batch of samples}
\end{align}
$$

Then to get the prediction, $\hat x$, we take, for each sample, the class with the maximum value in $\hat l$. The code below makes it clearer.

In [14]:
def voting_classifier(model_list, weights, data_loader):
    """
    Explanation:
        Predicts a label for every sample from the weighted average of the log-softmax
        outputs of several models
    Args:
        model_list (list): A list of pretrained models that should be ensembled
        weights (list): A list of weights to apply to each corresponding model; their sum
            must be positive
        data_loader (Dataloader): The test data Dataloader; every batch must be a
            (images, labels, file_paths) triple. The labels are not used.
    Returns:
        y_preds (np.array): An array containing the predicted class label of every sample
        file_names (np.array): An array containing the lower-cased file name of every sample
    Raises:
        ValueError: If no model is given, if the number of weights differs from the number
            of models, or if the weights do not sum to a positive value
    """
    if len(model_list) == 0:
        raise ValueError("model_list must contain at least one model")
    if len(model_list) != len(weights):
        raise ValueError("model_list and weights must have the same length")

    total_weight = sum(weights)
    if total_weight <= 0:
        raise ValueError("the weights must sum to a positive value")

    # each model receives its input on the device its own weights live on; only a model
    # without any parameters falls back to the notebook-level `device`
    model_devices = []
    for model in model_list:
        model.eval()
        first_param = next(model.parameters(), None)
        model_devices.append(device if first_param is None else first_param.device)

    y_preds, file_names = [], []

    with torch.no_grad():
        # loop over each batch, get the weighted vote from each model
        for X, _, file_paths in data_loader:
            # as_tensor is a no-op for tensors and also accepts numpy batches
            X = torch.as_tensor(X)

            # the accumulator takes its shape from the first model output, so the last
            # (possibly smaller) batch and any number of classes are handled
            aggregate_prediction = None

            # aggregate the votes for each model, using the log of the softmax
            for model, weight, model_device in zip(model_list, weights, model_devices):
                # forward pass, then get the log probability and weight it
                log_prob = F.log_softmax(model(X.to(model_device)), dim=1)*weight
                if aggregate_prediction is None:
                    aggregate_prediction = log_prob
                else:
                    aggregate_prediction += log_prob.to(aggregate_prediction.device)

            # calculate the weighted average, on which we will find the best class per sample
            aggregate_prediction /= total_weight

            # the prediction for each sample is the class with the highest aggregate score
            y_pred = aggregate_prediction.max(1)[1]

            y_preds.append(y_pred.cpu().numpy())
            # keep only the lower-cased file name, whatever path separator the platform uses
            file_names.extend(os.path.basename(path.lower()) for path in file_paths)

    return np.concatenate(y_preds, 0), np.array(file_names)

## Submitting on Kaggle

In [15]:
# the 4 models combined into the ensemble, each paired with its weight.
# these weights are based on the accuracy on the Kaggle leaderboard, or the validation accuracy
#   while training the models
ensemble_members = [
    (densenet121_, 0.74),
    (resnet152_, 0.809),
    (resnext50_, 0.807),
    (wide_resnet101_2_, 0.84109),
]

model_list = [member for member, _ in ensemble_members]
weights = [member_weight for _, member_weight in ensemble_members]

assert len(model_list) == len(weights)

In [16]:
# this function constructs and stores the csv which is submitted to kaggle
def to_kaggle(filenames, y_preds, out_file='voting.csv'):
    """
    Explanation:
        Builds the submission table, prints its first rows and writes it to a csv file
    Args:
        filenames (array-like): The image file names, one per prediction
        y_preds (array-like): The predicted class labels, in the same order as `filenames`
        out_file (str, optional): Path of the csv file to write. Defaults to 'voting.csv'
            in the current working directory
    Returns:
        None
    """
    # construct dataframe from the results
    submission = pd.DataFrame({'filename': filenames, 'label': y_preds})
    print(submission.head())
    # the index is left out so that the file only has the `filename` and `label` columns
    submission.to_csv(out_file, index=False)

# run the weighted ensemble over the test set to get the labels and matching file names
y_preds, filenames = voting_classifier(model_list=model_list, weights=weights, data_loader=test_loader)

# build the submission table and store it as a csv
to_kaggle(filenames=filenames, y_preds=y_preds)

         filename  label
0     test_0.jpeg    107
1     test_1.jpeg     40
2    test_10.jpeg     74
3   test_100.jpeg     90
4  test_1000.jpeg    138
