# Exercise multimodal recognition: RGB-D scene recognition

This exercise consists of three parts: two tutorials and the deliverable. The students must modify the code of the tutorial part, and write and discuss the results in the deliverable part that will be used to evaluate the exercise.

If you are not familiar with Jupyter notebooks, please check __[this tutorial](https://jupyter-notebook.readthedocs.io/en/latest/examples/Notebook/What%20is%20the%20Jupyter%20Notebook.html)__ first.

# Part 1 (tutorial): RGB baseline

In this tutorial, you will use a pretrained convolutional network and replace the classifier for the target dataset using PyTorch. The code is loosely based on the __[PyTorch transfer learning tutorial](http://pytorch.org/tutorials/beginner/transfer_learning_tutorial.html)__. Just execute the code sequentially, paying attention to the comments.

In [1]:
%matplotlib inline

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.autograd import Variable
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt
import time
import os
import copy

import itertools

plt.ion()   # interactive mode

<matplotlib.pyplot._IonContext at 0x7f6780102d00>

Load Data
---------

We will use the torchvision, torch.utils.data and RGBDutils packages for loading the
data. The dataset is structured hierarchically as split/modality/class (check the folder).

In [2]:
# Per-channel mean / std handed to Normalize for every split
norm_mean = [0.485, 0.456, 0.406]
norm_std = [0.229, 0.224, 0.225]

# Training split: random crop and horizontal flip act as data augmentation
data_transforms = {
    'train': transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(norm_mean, norm_std),
    ]),
}
# Validation and test splits: fixed resize + center crop, no augmentation
for split in ('val', 'test'):
    data_transforms[split] = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(norm_mean, norm_std),
    ])

# Modality sub-folder to read inside each split
modality = 'rgb'
# modality = 'hha' # Uncomment to switch to HHA modality

# Root folder of the dataset
# data_dir = '/home/mcv/datasets/sunrgbd_lite'
data_dir = 'sunrgbd_lite'

partitions = ['train', 'val', 'test']


def _open_split(part):
    """Build the ImageFolder for <data_dir>/<part>/<modality>."""
    return datasets.ImageFolder(os.path.join(data_dir, part, modality),
                                data_transforms[part])


def _make_loader(part):
    """Wrap one split in a shuffling DataLoader (batches of 4, 4 workers)."""
    return torch.utils.data.DataLoader(image_datasets[part], batch_size=4,
                                       shuffle=True, num_workers=4)


# One dataset and one loader per partition, keyed by partition name
image_datasets = dict(zip(partitions, map(_open_split, partitions)))
dataloaders = dict(zip(partitions, map(_make_loader, partitions)))
dataset_sizes = {part: len(image_datasets[part]) for part in partitions}

# Class names are taken from the training split
class_names = image_datasets['train'].classes

# Whether a CUDA device is available; read by the train/evaluate functions
use_gpu = torch.cuda.is_available()

FileNotFoundError: [Errno 2] No such file or directory: 'sunrgbd_lite/train/rgb'

In [None]:
image_datasets

**Visualize a few images**

Let's visualize a few images to get familiar with the dataset.

In [None]:
def imshow(inp, title=None):
    """Show a normalized, channels-first image tensor with matplotlib.

    The tensor is converted to a channels-last NumPy array, the per-channel
    normalization (mean/std constants below) is undone, and the values are
    clipped to [0, 1] before plotting.

    Args:
        inp: tensor whose axes are reordered with ``transpose((1, 2, 0))``.
        title: optional title passed to ``plt.title``; skipped when ``None``.
    """
    channel_mean = np.array([0.485, 0.456, 0.406])
    channel_std = np.array([0.229, 0.224, 0.225])

    # Move the channel axis last, as plt.imshow expects
    image = inp.numpy().transpose((1, 2, 0))
    # Invert the normalization and keep the result in the displayable range
    image = np.clip(channel_std * image + channel_mean, 0, 1)

    plt.imshow(image)
    if title is not None:
        plt.title(title)
    # Short pause so that the figure gets refreshed
    plt.pause(0.001)


# Pull the first batch out of the training loader
train_batches = iter(dataloaders['train'])
inputs, classes = next(train_batches)

# Keep at most four samples of the batch
inputs = inputs[0:4]
classes = classes[0:4]

# Tile the images into a single grid and label it with the class names
grid_titles = [class_names[x] for x in classes]
out = torchvision.utils.make_grid(inputs)

imshow(out, title=grid_titles)

Training the model
------------------

Now, let's write a general function to train a model. Details:

-  Uses the Adam algorithm for gradient descent.
-  Early stopping using the best validation accuracy (the weights of the best epoch are restored at the end).

In [None]:
def train_model(model, criterion, optimizer, scheduler, num_epochs=25):
    """Train ``model`` and return it loaded with its best validation weights.

    Each epoch runs one pass over ``dataloaders['train']`` (with parameter
    updates) followed by one pass over ``dataloaders['val']`` (no updates,
    no gradient tracking). The weights that reach the highest validation
    accuracy are kept and loaded back into the model before returning.

    Relies on the notebook-level globals ``dataloaders``, ``dataset_sizes``
    and ``use_gpu``.

    Args:
        model: network to optimize; it is modified in place.
        criterion: loss callable, invoked as ``criterion(outputs, labels)``.
        optimizer: optimizer holding the parameters to update.
        scheduler: optional learning-rate scheduler stepped once per epoch,
            or ``None`` to keep the learning rates fixed.
        num_epochs: number of epochs to run.

    Returns:
        The same ``model`` object with the best validation weights loaded.
    """
    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            print('Phase %s' % phase)
            is_training = phase == 'train'
            model.train(is_training)  # training mode vs. evaluate mode

            running_loss = 0.0
            running_corrects = 0

            # Iterate over data.
            for inputs, labels in dataloaders[phase]:
                if use_gpu:
                    inputs = inputs.cuda()
                    labels = labels.cuda()

                # zero the parameter gradients
                optimizer.zero_grad()

                # Track gradients only while training; the validation pass
                # then does not build an autograd graph.
                with torch.set_grad_enabled(is_training):
                    outputs = model(inputs)
                    loss = criterion(outputs, labels)

                    # backward + optimize only if in training phase
                    if is_training:
                        loss.backward()
                        optimizer.step()

                _, preds = torch.max(outputs, 1)

                # statistics: the batch loss is accumulated exactly once,
                # weighted by the batch size, as plain Python numbers
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels).item()

            # Step the scheduler after the optimizer updates of this epoch
            if is_training and scheduler is not None:
                scheduler.step()

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects / dataset_sizes[phase]

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))

            # deep copy the model whenever validation accuracy improves
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())

        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model

And now, a function to evaluate the model on a particular set.

In [None]:
def evaluate_model(model, partition, criterion):
    """Compute accuracy and mean loss of ``model`` on one data partition.

    The model is switched to evaluation mode (and left in it) and the whole
    pass runs without gradient tracking, since nothing is optimized here.

    Relies on the notebook-level globals ``dataloaders``, ``dataset_sizes``
    and ``use_gpu``.

    Args:
        model: network to evaluate.
        partition: key into ``dataloaders`` / ``dataset_sizes``
            (e.g. ``'train'``, ``'val'`` or ``'test'``).
        criterion: loss callable, invoked as ``criterion(outputs, labels)``.

    Returns:
        Tuple ``(accuracy, loss)`` of Python floats, both averaged over
        ``dataset_sizes[partition]`` samples.
    """
    since = time.time()

    model.eval()  # Set model to evaluate mode

    running_loss = 0.0
    running_corrects = 0

    # No autograd graph is needed for evaluation
    with torch.no_grad():
        for inputs, labels in dataloaders[partition]:
            if use_gpu:
                inputs = inputs.cuda()
                labels = labels.cuda()

            # forward
            outputs = model(inputs)
            _, preds = torch.max(outputs, 1)
            loss = criterion(outputs, labels)

            # statistics, accumulated as plain Python numbers; the batch
            # loss is weighted by the number of samples in the batch
            running_loss += loss.item() * inputs.size(0)
            running_corrects += torch.sum(preds == labels).item()

    test_loss = running_loss / dataset_sizes[partition]
    test_acc = running_corrects / dataset_sizes[partition]

    print()

    time_elapsed = time.time() - since
    print('Tested in {:.0f}m {:.0f}s Loss: {:.4f} Acc: {:.4f}'.format(
        time_elapsed // 60, time_elapsed % 60, test_loss, test_acc))

    return test_acc, test_loss

The network
----------------------

The architecture of the network is shown in the following figure:
<img src="figures/rgb_network.png" />

The following code creates the RGB network by (downloading and) instantiating an AlexNet trained on ImageNet.

In [None]:
# Instantiate the model: torchvision's AlexNet, requested with pretrained
# weights (the tutorial text describes them as trained on ImageNet and
# downloaded if needed).
model = models.alexnet(pretrained=True)

# You can visualize the network: printing the module lists its layers,
# including the `features` and `classifier` parts indexed in the setup cell.
print(model)

Set up the training/fine tuning parameters
----------------------

The following code creates the optimization criterion and sets per-layer learning rates to better control the fine tuning and training process. We use a very simple model in which all layers are frozen except the last fully connected one, i.e. the classifier, so it should be easy to improve the performance.

In [None]:
# Start with gradients switched off for every parameter of the pretrained net
for frozen in model.parameters():
    frozen.requires_grad = False

# Swap the last classifier layer for a fresh linear layer sized to our classes,
# keeping the first six classifier modules as they are
num_classes = len(class_names)
c = model.classifier
num_ftrs = c[6].in_features
new_head = nn.Linear(num_ftrs, num_classes)
model.classifier = nn.Sequential(*[c[idx] for idx in range(6)], new_head)

if use_gpu:
    model = model.cuda()

criterion = nn.CrossEntropyLoss()

learning_rate = 0.001

# (layer, learning rate) pairs, one optimizer parameter group per layer.
# Every layer except fc8 gets a learning rate of 0, matching the tutorial's
# description of training only the classifier; edit the rates to fine-tune more.
layer_rates = [
    (model.features[0], 0.00),      # conv1
    (model.features[3], 0.00),      # conv2
    (model.features[6], 0.00),      # conv3
    (model.features[8], 0.00),      # conv4
    (model.features[10], 0.00),     # conv5
    (model.classifier[1], 0.000),   # fc6
    (model.classifier[4], 0.000),   # fc7
    (model.classifier[6], 0.001),   # fc8
]
perlayer_optim = [{'params': layer.parameters(), 'lr': rate}
                  for layer, rate in layer_rates]

# Re-enable gradients for every layer that is handed to the optimizer
for layer, _ in layer_rates:
    for param in layer.parameters():
        param.requires_grad = True

optimizer = torch.optim.Adam(perlayer_optim, lr=learning_rate)

Train and evaluate the model
-----------------

It shouldn't take more than 2 mins to train with the GPU in the server.

In [None]:
# Fit the model (no learning-rate scheduler) and keep the returned network
model = train_model(model, criterion, optimizer, None, num_epochs=25)

# Accuracy on each partition; the loss returned alongside it is not used here
train_acc = evaluate_model(model, 'train', criterion)[0]
val_acc = evaluate_model(model, 'val', criterion)[0]
test_acc = evaluate_model(model, 'test', criterion)[0]

# Report the three accuracies as percentages
summary = 'Accuracy. Train: %1.2f%% val: %1.2f%% test: %1.2f%%'
print(summary % (train_acc * 100, val_acc * 100, test_acc * 100))