# Correction of the convolutional encoder-decoder practical exercise

by Loic Landrieu

In [3]:
#[1] import and installations
import torch
import numpy as np
import torchnet as tnt
import functools
import mock
import math
from mpl_toolkits import mplot3d
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import random
import torch.nn.functional as nnf
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm_notebook as tqdm
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix
from functools import partial

In [0]:
#[2] Authentication
# Authenticates the session, builds a Drive client and downloads the dataset
# archive to the local file 'landCover.hdf5'.
# NOTE(review): `auth`, `GoogleAuth`, `GoogleCredentials` and `GoogleDrive` are not
# imported in the visible import cell - presumably they come from google.colab,
# pydrive and oauth2client; confirm and import them before running this cell.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# fetch the file by its Drive id and store it next to the notebook
downloaded = drive.CreateFile({'id':'1qB9Gt9UCTokkSM4Fvy-srQUU9ir0Bke7'})
downloaded.GetContentFile('landCover.hdf5') 

In [0]:
#[3] building the train and test sets
# Reads the four arrays of the HDF5 archive fully into memory (the trailing [:]
# materialises each dataset as a numpy array).
# NOTE(review): `h5py` is not imported in the visible import cell - confirm.
data_file = h5py.File("landCover.hdf5",'r')
train_obs = data_file['train_observation'][:]
train_gt = data_file['train_gt'][:]
test_obs = data_file['test_observation'][:]
test_gt = data_file['test_gt'][:]
# number of tiles in each split (first axis of the observation arrays)
n_train = train_obs.shape[0]
n_test = test_obs.shape[0]
# class_names[k] is the display name of label k (used by ConfusionMatrix printing)
class_names = ["Urban", "Water", "Fields", "Road", "Vegetation", "Buildings"]


print("%d tiles for training, %d tiles for testing" % (n_train, n_test))

In [0]:
#[4] data loader

def augment(obs, gt, jitter = False, sigma = 0.01, clip = 0.02):
  """augmentation function

  By default this is the identity (obs and gt are returned untouched), which is
  the behaviour expected until the augmentation question of the exercise.

  INPUT:
  obs = numpy array, the observation of one tile
  gt = numpy array, the associated ground truth (never modified here)
  jitter = bool, when True adds clipped gaussian noise to the observation
  sigma = float, standard deviation of the gaussian noise
  clip = float, the noise is clipped to [-clip, clip]
  OUTPUT
  obs, gt = the (possibly jittered) observation and the unchanged ground truth
  """
  if not jitter:
    return obs, gt #does nothing, the default until augmentation is enabled
  #random gaussian noise, same recipe as
  #https://github.com/charlesq34/pointnet/blob/master/provider.py#L74
  noise = np.clip(sigma * np.random.randn(*obs.shape), -1*clip, clip).astype(np.float32)
  #random rotations by 0/90/180/270 degrees are deliberately NOT applied:
  #they were removed because they are detrimental on this dataset, see Q14
  return obs + noise, gt


def tile_loader(tile_index, train = True, cuda = 1):
  """
  fetch one tile from the in-memory arrays and return it as torch tensors
  INPUT:
  tile_index = int, position of the tile inside the selected split
  train = bool, True to read the training split (augment is then applied),
          False to read the test split
  cuda = bool/int, when true the observation tensor is moved to the GPU
  OUTPUT
  obs, tensor of the observation with the channel axis moved first
  gt, long tensor of the per-pixel labels (always left on the CPU)
  """
  if train:
    split_obs, split_gt = train_obs, train_gt
  else:
    split_obs, split_gt = test_obs, test_gt

  #stored tiles are channels-last, the network wants channels first
  obs = split_obs[tile_index,:,:,:].transpose(2,0,1)
  gt = split_gt[tile_index,:,:]

  if train: #augmentation - for training only
    obs, gt = augment(obs, gt)

  obs_tensor = torch.from_numpy(obs)
  gt_tensor = torch.from_numpy(gt)

  if cuda:
    obs_tensor = obs_tensor.cuda()

  return obs_tensor, gt_tensor.long()

#putting the dataset into the ListDataset wrapper
#each dataset element is a tile index; tile_loader (called with its default
#cuda = 1) turns that index into an (obs, gt) pair on access
test_set  = tnt.dataset.ListDataset(list(range(n_test)),functools.partial(tile_loader, train=False))
train_set = tnt.dataset.ListDataset(list(range(n_train)),functools.partial(tile_loader, train=True))


In [0]:
#[5] functions used for visualization
def view_rgb(tile, ax = None):
  """ show the rgb values of the tile in figure ax

  tile = channels-first array or CPU tensor, the first 3 channels are displayed
  ax = matplotlib axes to draw in, a new 10x10 figure is created when None
  """
  if ax is None:
    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(1, 1, 1, aspect='equal')
  #values are clipped to [0,1], the range imshow expects for float images
  rgb = np.clip(np.asarray(tile)[:3], 0, 1) #normalization
  ax.imshow(rgb.transpose(1, 2, 0)) #put channels back as dim 3
  ax.axis('off') #act on the axes we drew in, not on the "current" one
  
def view_infrared(tile, ax = None):
  """ show the infrared tile in figure ax

  tile = channels-first array or CPU tensor, channel of index 3 is displayed
  ax = matplotlib axes to draw in, a new 10x10 figure is created when None
  """
  if ax is None:
    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(1, 1, 1, aspect='equal')
  infrared = np.clip(np.asarray(tile)[3], 0, 1) #normalization
  ax.imshow(infrared,cmap='hot')
  ax.axis('off') #act on the axes we drew in, not on the "current" one

def view_labels(label, ax = None, mask = None):
  """ show the ground truth with a colorcode corresponding to labels

  label = 2D array or CPU tensor of integer labels (99 = not labelled)
  ax = matplotlib axes to draw in, a new 10x10 figure is created when None
  mask = optional boolean array/tensor, masked pixels are drawn as not labelled
  The caller's label array is never modified.
  """
  if ax is None:
    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(1, 1, 1, aspect='equal')
  label = np.array(label) #private copy: masking must not leak back to the caller
  if mask is not None:
    label[np.asarray(mask, dtype=bool)] = 99
  palette = (
    (99, [1  ,1  ,1  ]), #not labelled
    (0,  [1  ,0.8,0.8]), #building limit
    (1,  [0  ,0  ,1  ]), #water
    (2,  [0.9,0.9,0  ]), #fields
    (3,  [0.5,0.5,0.5]), #road
    (4,  [0  ,.8 ,0  ]), #vegetation
    (5,  [1  ,0  ,0  ]), #building
  )
  #pixels with a value outside the palette stay black
  colors = np.zeros(label.shape + (3,))
  for value, rgb in palette:
    colors[label==value] = rgb
  ax.imshow(colors)
  ax.axis('off') #act on the axes we drew in, not on the "current" one
  
def view_error(pred, gt, ax = None):
  """ show the error between pred and gt with colorcode:
  green when 'gt'='pred', red when 'gt'!='pred' and black
  when unannotated (gt = 99)

  pred, gt = arrays or CPU tensors of labels, gt may carry singleton dimensions
  ax = matplotlib axes to draw in, a new 10x10 figure is created when None
  """
  if ax is None:
    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(1, 1, 1, aspect='equal')
  pred = np.asarray(pred)
  gt = np.asarray(gt).squeeze()
  colors = np.zeros(gt.shape + (3,))
  colors[pred==gt] = [0, 1, 0] #correct prediction
  colors[pred!=gt] = [1, 0, 0] #error
  colors[gt==99] = [0, 0, 0]   #unannotated, must come last to override the above
  ax.imshow(colors)
  ax.axis('off') #act on the axes we drew in, not on the "current" one
  
def viewer(n_shown = 3, category = 'cig', train = True, model = None, use_mask = False):
  """ plot 'n_shown' random tiles train/test set with the following visuals:
  if 'c' in category : rgb color
  if 'i' in category : infrared
  if 'g' in category : ground truth
  if 'p' in category : prediction
  if 'e' in category : error
  Note that for 'p' or 'e' you need to add a trained model as input,
  a ValueError is raised otherwise.
  use_mask = bool, when True unannotated pixels (gt = 99) are blanked in the prediction

  The panels are always laid out in the order c, i, g, p, e, one row per tile.
  """
  needs_model = ('p' in category) or ('e' in category)
  if needs_model and model is None:
    raise ValueError("viewer: a trained model is required for categories 'p' and 'e'")

  n_category = len(category) #number of types of image to show
  fig = plt.figure(figsize=(n_category * 5, n_shown * 5)) #adapted dimension

  subplot_index = 1 #keep track of current subplot

  #chose random tiles
  tile_indices = np.random.choice(n_train, n_shown) if train \
  else np.random.choice(n_test, n_shown)

  for tile_index in tile_indices:

    tile, gt = tile_loader(tile_index, train = train, cuda=0)

    pred = None
    if needs_model:
      #a single forward pass serves both 'p' and 'e'; no gradient is needed to display
      with torch.no_grad():
        pred = model(tile[None,:,:,:].cuda()).cpu().argmax(1).squeeze()

    #(title, drawing function, positional arguments) for each requested panel
    panels = []
    if 'c' in category:
      panels.append(('RGB', view_rgb, (tile,)))
    if 'i' in category:
      panels.append(('Infrared', view_infrared, (tile,)))
    if 'g' in category:
      panels.append(('Ground Truth', view_labels, (gt,)))
    if 'p' in category:
      shown_pred = pred.clone() #keep the raw prediction intact for the error panel
      if use_mask:
        shown_pred[gt==99] = 99
      panels.append(('Prediction', view_labels, (shown_pred,)))
    if 'e' in category:
      panels.append(('Error', view_error, (pred, gt)))

    for title, draw, draw_args in panels:
      ax = fig.add_subplot(n_shown, n_category, subplot_index, aspect='equal')
      if subplot_index <= n_category : #titles on the first row only
        ax.set(title=title)
      draw(*draw_args, ax = ax)
      subplot_index += 1
  plt.show()

In [0]:
#[6] show 3 random training tiles: RGB, infrared and ground truth
viewer(n_shown = 3, category = 'cig', train = True)

In [0]:
#[7]
class ConfusionMatrix:
  """
  Running confusion matrix over n_class labels.
  CM[i, j] accumulates what sklearn's confusion_matrix(gt, pred) puts in cell (i, j).
  """
  def __init__(self, n_class, class_names):
    self.n_class = n_class
    self.class_names = class_names
    self.CM = np.zeros((n_class, n_class))

  def clear(self):
    """reset all the counts to zero"""
    self.CM = np.zeros((self.n_class, self.n_class))

  def add_batch(self, gt, pred):
    """accumulate the confusion matrix of one batch of flat label vectors"""
    batch_counts = confusion_matrix(gt, pred, labels = list(range(self.n_class)))
    self.CM += batch_counts

  def overall_accuracy(self):
    """percentage of correct classification"""
    n_correct = self.CM.trace()
    return 100*n_correct / self.CM.sum()

  def class_IoU(self, show = 1):
    """
    per-class intersection over union; prints them when show is true
    and returns the mean IoU in percent over the classes with a defined IoU
    """
    intersection = np.diagonal(self.CM)
    #union = row total + column total - diagonal (counted twice otherwise)
    union = -intersection + self.CM.sum(axis=1) + self.CM.sum(axis=0)
    ious = intersection / union #nan for a class with an empty row and column
    if show:
      print('  |  '.join('{} : {:3.2f}%'.format(name, 100*iou) for name, iou in zip(self.class_names,ious)))
    #do not count classes that are not present in the dataset in the mean IoU
    n_defined = np.count_nonzero(np.logical_not(np.isnan(ious)))
    return 100*np.nansum(ious) / n_defined

In [0]:
#[8] quick sanity check of ConfusionMatrix on two hand-written batches
m = ConfusionMatrix(6, class_names)
# add_batch takes the ground truth first, the prediction second
m.add_batch(np.array([0,1,1,5,2,0,0,4,0,5,3]), np.array([0,1,0,5,2,0,1,4,0,5,3]))
m.add_batch(np.array([0,1,5,1,2,1,0,2,3]), np.array([0,1,1,1,2,1,0,2,3]))
print(m.CM)
print("OA = %3.2f%%" % (m.overall_accuracy()))
m.class_IoU() # prints the per-class IoU (show defaults to 1)
m.clear()

$\textbf{Why is it necessary that d4 = d6 and d2 = d8?}$

The maxpool indices used to go from $x_1$ to $x_2$ (width $d_2$) must be compatible with the unpool layer used to go from $y_3$ to $y_2$ (width $d_8$), and same for the second maxpool/unpool pair.

Many people mention concatenation, but there is no constraint on width when concatenating tensors: you can concatenate two tensors of size $H_1\times W_1 \times D_1$ and $H_2\times W_2 \times D_2$ for arbitrary width $D_1$ and $D_2$, as long as $H_1=H_2$ and $W_1=W_2$. The feature map size is not controlled by the convolution width, but by the organisation of maxpools.


In [0]:
#[9]
class SegNet(nn.Module):
  """
  SegNet network for semantic segmentation

  Encoder-decoder with two max-pooling stages. The decoder unpools with the
  indices recorded by the encoder and concatenates the encoder maps of the
  same resolution before each pair of decoder convolutions.
  """

  def __init__(self, n_channels, encoder_conv_width, decoder_conv_width, n_class, cuda = 1):
    """
    initialization function
    n_channels, int, number of input channel
    encoder_conv_width, int list (6 values), size of the feature maps of convs for the encoder
    decoder_conv_width, int list (4 values), size of the feature maps of convs for the decoder
    n_class = int,  the number of classes
    cuda = bool/int, when true the model is moved to the GPU
    raises ValueError when the widths are not compatible with the unpooling layers
    """
    super(SegNet, self).__init__() #necessary for all classes extending the module class

    #unpooling reuses the max-pooling indices, which are computed per channel:
    #the unpooled map must have as many channels as the map that was pooled
    if encoder_conv_width[3] != encoder_conv_width[5] \
       or encoder_conv_width[1] != decoder_conv_width[1]:
      raise ValueError('SegNet requires encoder_conv_width[3] == encoder_conv_width[5] '
                       'and encoder_conv_width[1] == decoder_conv_width[1]')

    self.maxpool=nn.MaxPool2d(2,2,return_indices=True) #maxpooling layer
    self.unpool=nn.MaxUnpool2d(2,2) #unpooling layer
    #encoder
    self.c1 = self._conv_block(n_channels, encoder_conv_width[0])
    self.c2 = self._conv_block(encoder_conv_width[0], encoder_conv_width[1])
    self.c3 = self._conv_block(encoder_conv_width[1], encoder_conv_width[2])
    self.c4 = self._conv_block(encoder_conv_width[2], encoder_conv_width[3])
    self.c5 = self._conv_block(encoder_conv_width[3], encoder_conv_width[4])
    self.c6 = self._conv_block(encoder_conv_width[4], encoder_conv_width[5])
    #decoder, c7 and c9 take the concatenation of an unpooled map and an encoder map
    self.c7 = self._conv_block(encoder_conv_width[5] + encoder_conv_width[3], decoder_conv_width[0])
    self.c8 = self._conv_block(decoder_conv_width[0], decoder_conv_width[1])
    self.c9 = self._conv_block(encoder_conv_width[1] + decoder_conv_width[1], decoder_conv_width[2])
    self.c10 = self._conv_block(decoder_conv_width[2], decoder_conv_width[3])
    #final classifying layer
    self.classifier=nn.Conv2d(decoder_conv_width[3],n_class,3,padding=1, padding_mode='reflect')

    #weight initialization of every convolution (index 0 of each block)
    for block in (self.c1, self.c2, self.c3, self.c4, self.c5,
                  self.c6, self.c7, self.c8, self.c9, self.c10):
      self.init_weights(block[0])
    self.init_weights(self.classifier)

    if cuda: #put the model on the GPU memory
      self.cuda()

  @staticmethod
  def _conv_block(in_width, out_width):
    """3x3 convolution with reflect padding (spatial size preserved) + batch norm + ReLU"""
    #'reflect' is the name Conv2d accepts for reflection padding
    return nn.Sequential(
      nn.Conv2d(in_width, out_width, 3, padding=1, padding_mode='reflect'),
      nn.BatchNorm2d(out_width),
      nn.ReLU(True))

  def init_weights(self,layer): #gaussian init for the conv layers
    nn.init.kaiming_normal_(layer.weight, mode='fan_out', nonlinearity='relu')

  def forward(self,input):
    """
    the function called to run inference
    input = batch of channels-first images
    returns the per-pixel class scores, at the same spatial size as the input
    """
    #encoder
    #level 1
    x1 = self.c2(self.c1(input))
    x2, indices_a_b =self.maxpool(x1)
    #level 2
    x3=self.c4(self.c3(x2))
    x4, indices_b_c =self.maxpool(x3)
    #level 3
    x5 = self.c6(self.c5(x4))
    #decoder
    #level 2
    y4 = self.unpool(x5, indices_b_c)
    y3 = self.c8(self.c7(torch.cat((y4,x3),1)))
    #level 1
    y2 = self.unpool(y3, indices_a_b)
    y1 = self.c10(self.c9(torch.cat((y2,x1),1)))
    #output
    out = self.classifier(y1)
    return out

In [0]:
#[10]
#==================TEST===============================
#we consider the first tile from the training set
tile, gt = tile_loader(0)
segnet = SegNet(4,[16,16,32,32,64,32], [32,16,32,16],6)
print(segnet)
print('Total number of parameters: {}'.format(sum([p.numel() for p in segnet.parameters()])))
pred = segnet(tile[None,:,:,:]) #the None indicate a batch size of 1
#one score map per class, at the resolution of the input tile
assert(pred.shape == torch.Size([1,6,256,256]))


In [0]:
#[11]
def train(model, optimizer, args):
  """train for one epoch

  model = the network to optimize
  optimizer = torch optimizer built on the parameters of model
  args = configuration, args.batch_size and args.n_class are read here
  returns the confusion matrix of the epoch and the average loss
  """
  model.train() #switch the model in training mode

  #the loader function will take care of the batching
  loader = torch.utils.data.DataLoader(train_set, \
         batch_size=args.batch_size, shuffle=True, drop_last=True)
  #tqdm will provide some nice progress bars
  loader = tqdm(loader, ncols=500)

  #will keep track of the loss
  loss_meter = tnt.meter.AverageValueMeter()
  cm = ConfusionMatrix(args.n_class, class_names = class_names)

  for tiles, gt in loader:

    optimizer.zero_grad() #put gradient to zero

    pred = model(tiles) #compute the prediction

    #the ground truth stays on the CPU, so the loss is computed there; 99 = unannotated
    loss = nn.functional.cross_entropy(pred.cpu(),gt, ignore_index=99)

    loss.backward() #compute gradients

    #we clamp each gradient coordinate to [-1, 1] (value clipping, not norm clipping);
    #parameters without a gradient are skipped by the utility
    nn.utils.clip_grad_value_(model.parameters(), 1)

    optimizer.step() #one SGD step

    loss_meter.add(loss.item())
    labeled = np.where(gt.view(-1)!=99)[0] #select gt with a label
    #need to put the prediction back on the cpu and convert to numpy
    cm.add_batch(gt.view(-1)[labeled], pred.argmax(1).view(-1)[labeled].cpu().detach().numpy())

  return cm, loss_meter.value()[0]

def eval(model, args):
  """eval on test/validation set

  Runs the model over test_set one tile at a time, without gradients, and
  returns the confusion matrix together with the average loss.
  """
  model.eval() #switch in eval mode

  test_loader = tqdm(
    torch.utils.data.DataLoader(test_set, batch_size=1, shuffle=False, drop_last=False),
    ncols=500)

  running_loss = tnt.meter.AverageValueMeter()
  confusion = ConfusionMatrix(args.n_class, class_names = class_names)

  with torch.no_grad():
    for index, (tiles, gt) in enumerate(test_loader):

      scores = model(tiles) #compute the prediction

      #label 99 marks unannotated pixels and is left out of the loss
      tile_loss = nn.functional.cross_entropy(scores.cpu(),gt, ignore_index=99)
      running_loss.add(tile_loss.item())

      flat_gt = gt.view(-1)
      keep = np.where(flat_gt!=99)[0] #select gt with a label
      #need to put the prediction back on the cpu and convert to numpy
      flat_pred = scores.argmax(1).view(-1)[keep].cpu().detach().numpy()
      confusion.add_batch(flat_gt[keep], flat_pred)

  mean_loss = running_loss.value()[0]
  return confusion, mean_loss


def train_full(args):
  """The full training loop

  Builds a SegNet from args, trains it for args.n_epoch epochs with Adam and
  evaluates it on the test set on the last epoch and every args.n_epoch_test
  epochs (never on epoch 0 unless it is the last one). Returns the model.
  """
  TESTCOLOR = '\033[104m'
  TRAINCOLOR = '\033[100m'
  NORMALCOLOR = '\033[0m'

  #initialize the model
  model = SegNet(args.n_channel, args.conv_width, args.dconv_width, args.n_class)
  n_parameters = sum([p.numel() for p in model.parameters()])
  print('Total number of parameters: {}'.format(n_parameters))

  #define the optimizer
  #adam optimizer is always a good guess for classification
  optimizer = optim.Adam(model.parameters(), lr=args.lr)

  for i_epoch in range(args.n_epoch):
    #train one epoch
    cm_train, loss_train = train(model, optimizer, args)
    print(TRAINCOLOR)
    #class_IoU prints the per-class values itself, so it must run after the color switch
    train_oa = cm_train.overall_accuracy()
    train_miou = cm_train.class_IoU()
    print('Epoch %3d -> Train Overall Accuracy: %3.2f%% Train mIoU : %3.2f%% Train Loss: %1.4f' % (i_epoch, train_oa, train_miou, loss_train) + NORMALCOLOR)

    is_last_epoch = i_epoch == args.n_epoch - 1
    if is_last_epoch or (args.n_epoch_test != 0 and i_epoch % args.n_epoch_test == 0 and i_epoch > 0):
      #periodic testing
      cm_test, loss_test = eval(model, args)
      print(TESTCOLOR)
      test_oa = cm_test.overall_accuracy()
      test_miou = cm_test.class_IoU()
      print('Test Overall Accuracy: %3.2f%% Test mIoU : %3.2f%%  Test Loss: %1.4f' % (test_oa, test_miou, loss_test) + NORMALCOLOR)
      viewer(n_shown = 1, train = False, model = model, category = 'cigpe', use_mask = False)
  return model

In [0]:
#[12]
# NOTE: a Mock silently returns a new Mock for any attribute that was never
# set, so a typo in an option name is not reported here.
args = mock.Mock() #stores the parameters
args.n_epoch = 50
args.n_epoch_test = int(1) #periodicity of evaluation on test set
args.batch_size = 16
args.n_class = 6
args.n_channel = 4
args.conv_width = [16,16,32,32,64,32] #encoder widths, passed to SegNet as encoder_conv_width
args.dconv_width = [32,16,32,16] #decoder widths, passed to SegNet as decoder_conv_width
args.cuda = 1
args.lr = 5e-3

trained_model = train_full(args)


$\textbf{Comment on the evolution of the performance on the training set and test set.}$

Train accuracy is higher than test accuracy, test accuracy fluctuates a lot.

$\textbf{Propose an explanation as to why the pink class surrounding the buildings was added.}$

The pink class helps the network to retrieve believable shapes for buildings. If one were to remove this class (set it to 99, or 5, an easy change in the loader function), they would observe that the buildings in the same city block would be merged into one big blob. This is because in the ground truth, the area in between buildings is unannotated, meaning that the network is not penalized no matter what it predicts there.

By adding a thin outline around buildings, the network produces well-delimited buildings, and fills the space in between with the pink class 'urban'. Note that removing the urban class should not impact the actual prediction score, because it only affects unannotated pixels, which are not counted in the scores, but it does impact the visual quality of the results.

In [0]:
#[13] show 5 random test tiles with prediction and error, unannotated pixels masked
viewer(n_shown=5, category = 'cigpe', model = trained_model, train = False, use_mask = True)

In [0]:
#[14]
def view_embeddings(fmap, ax = None):
  """ show a feature map as a color image in figure ax

  fmap = tensor of shape 1 x channels x height x width
  ax = matplotlib axes to draw in, a new 10x10 figure is created when None
  """
  if ax is None:
    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(1, 1, 1, aspect='equal')
  fmap_dim = fmap.shape[1]
  height, width = fmap.shape[2], fmap.shape[3]
  #we use a pca to project the embeddings to a RGB space
  #it is fitted on the identity matrix, so the projection does not depend on fmap
  pca = PCA(n_components=3)
  pca.fit(np.eye(fmap_dim))
  #we need to adapt dimension and memory allocation to CPU: one row per pixel
  fmap_ = fmap.cpu().detach().numpy().reshape((fmap_dim, height * width)).transpose(1,0)
  color = pca.transform(fmap_)
  #we normalize for visibility (per pixel, across the 3 components) and clip to [0,1]
  color = np.clip((color - color.mean(1, keepdims = True) +0.5) / (2 * color.std(1, keepdims = True)), 0, 1)
  color = color.reshape((height, width, 3), order= 'C')
  ax.imshow(color)
  ax.axis('off') #act on the axes we drew in, not on the "current" one

def view_U(model, tile_index = None, train = False):
  """ display every intermediate feature map of the network for one tile

  model = a SegNet, its layers are called one by one to keep the intermediate maps
  tile_index = int, tile to display, drawn at random when None
  train = bool, take the tile from the train set instead of the test set
  The panels are arranged on a 3 x 7 grid, one row per resolution level.
  """
  if tile_index is None:
    tile_index = np.random.randint(n_train if train else n_test)
  tile, gt = tile_loader(tile_index, train = train, cuda=1)

  #replay the forward pass step by step
  batch = tile[None,:,:,:]
  #level 1
  x1 = model.c2(model.c1(batch))
  x2, pool_indices_1 = model.maxpool(x1)
  #level 2
  x3 = model.c4(model.c3(x2))
  x4, pool_indices_2 = model.maxpool(x3)
  #level 3
  x5 = model.c6(model.c5(x4))
  #back to level 2
  y4 = model.unpool(x5, pool_indices_2,x2.size())
  y3 = model.c8(model.c7(torch.cat((y4,x3),1)))
  #back to level 1
  y2 = model.unpool(y3, pool_indices_1,x1.size())
  y1 = model.c10(model.c9(torch.cat((y2,x1),1)))
  #output
  pred = model.classifier(y1).argmax(1)

  fig = plt.figure(figsize=(25, 10)) #adapted dimension

  def panel(position, title):
    """create the subplot at 'position' of the 3 x 7 grid and give it a title"""
    axis = fig.add_subplot(3, 7, position, aspect=1)
    axis.set(title=title)
    return axis

  ax = panel(1, 'Input : %d x %d x %d' %(tile.shape))
  view_rgb(tile.cpu(), ax)

  #(grid position, title format, feature map), in drawing order
  feature_panels = (
    (2,  'x1 : %d x %d x %d', x1),
    (9,  'x2 : %d x %d x %d', x2),
    (10, 'x3 : %d x %d x %d', x3),
    (17, 'x4 : %d x %d x %d', x4),
    (18, 'x5 : %d x %d x %d', x5),
    (11, 'y4 : %d x %d x %d', y4),
    (12, 'y3 : %d x %d x %d', y3),
    (5,  'y2 : %d x %d x %d', y2),
    (6,  'y1 : %d x %d x %d', y1),
  )
  for position, title_format, fmap in feature_panels:
    ax = panel(position, title_format %(fmap.shape[1:]))
    view_embeddings(fmap, ax)

  ax = panel(7, 'Output : 6 x %d x %d' %(tile.shape[1:]))
  view_labels(pred.cpu().detach().numpy().squeeze(), ax, mask=gt==99)
  ax = panel(14, 'Ground Truth : 6 x %d x %d' %(tile.shape[1:]))
  view_labels(gt, ax)
  ax = panel(21, 'Error')
  view_error(pred.cpu().detach().numpy().squeeze(), gt.numpy(), ax)  

In [0]:
#[15] visualize the intermediate feature maps of the trained model on test tile 2
view_U(trained_model,2)

$\textbf{Q11}:$ large batches produce more accurate gradients and fewer iterations per epoch, and should be used with a larger learning rate. Conversely, small batches should be used with a smaller learning rate.

This exercise works with a large spectrum of batch sizes as long as you adjust the learning rate in consequence.

$\textbf{Q12}:$ Dropout shouldn't help too much, as the batch norms already robustify the network. In general, unless you observe intense overfitting, don't use dropout.

$\textbf{Q13}:$ should produce a small increase in IoU at the cost of a small decrease in OA. This makes sense since the unweighted cross-entropy is a surrogate for the OA, in which all pixels count equally (so frequent classes dominate), whereas the mean IoU gives the same importance to every class.

$\textbf{Q14}:$ Jittering (Gaussian noise) should help a little bit to robustify the network, but marginally as the data are not so noisy to start with.

Random rotations, on the other hand, decrease performance by a lot, for a nontrivial reason. Since Spot6 is heliosynchronous, the shadows of buildings are always on the same side of the buildings. This allows the network to get cues on the height of buildings. Adding a random rotation makes this height regression much harder. This is a prime example of (geometric) information leaking accidentally.

So, should we do a random rotation? If we only want to use Spot6 in a small area, then we might as well exploit the capacity of the network to use the shadows, and remove the random rotation. If we want to apply the same network to different areas / satellites, then we should add the random rotation to prevent the network from learning area-specific information.

I saw many people saying that we should duplicate the data. This is not true and should be avoided at all cost. To add a random rotation (0,90,180 or 270 degrees) you just need to add this operation randomly, on-the-fly in the loader function. This will act exactly the same way that if you duplicated your training set by 4 and added the rotations there. But with the latter you now need 4 times as much disk space.

$\textbf{Q15}:$ networks that are too large tend to overfit. Training performance should increase monotonically with network size, while the test performance should start to decrease at some point.

The default configuration is pretty well chosen, but some of you had good results with slightly larger networks.

Many people complained about memory crashes and blamed Google Colab. The GPUs on Google Colab are Tesla V100s with 16GB of memory, which cost over $3000. So this is not the problem.

If you try very large networks, you should decrease the batch size so that it fits in memory. There are other tricks, like memory mongering, which can help. But for such an easy task, you don't need such large networks. 200,000 parameters should be the absolute maximum.

$\textbf{Q16}:$ Should marginally help. It allows the use of a very large initial learning rate (0.05), which is subsequently decreased.

$\textbf{Q17}:$ straightforward. Should not help too much, because more stages mostly increase the receptive field, which is already large enough with 2 stages.

$\textbf{Q18}:$ should help a lot to stabilize the test performance.

$\textbf{Q19}:$ straightforward from Q18. Pseudocode:
```
input = args # a configuration to evaluate
performance = 0 #accumulator for the chosen metric
p = random permutation of range(size_dataset)
size_fold = size_dataset / nfold
for i in range(nfold)
  test = full_dataset[p[i*size_fold: (i+1)*size_fold]] #the held-out fold
  train = the rest
  model = train_full(train, args) #train the configuration args on train
  performance += evaluation(model, test)
performance = performance / nfold #average of the performance on the whole dataset with no leakage
return performance
```



$\textbf{Q20}:$ could potentially help a little, but mostly for the visual quality of the prediction.

Loss annealing should be used because the output of the network will initially be very noisy, and a TV prior on random noise will slow down learning (it will push towards a uniform prediction). Once the network gets confident enough, the TV prior can intervene and help increase spatial regularity.



In [0]:
def compute_adjacency_graph(n = 256, cuda = 1):
  """
  compute the 4-adjacency graph for a square image of size n
  returns an int64 array with one row [i, j, i2, j2] per edge, linking pixel
  (i, j) to its lower neighbour (i+1, j) or its right neighbour (i, j+1).
  Edges are listed pixel by pixel in row-major order, lower edge first.
  The cuda argument is not used.
  """
  axis = np.arange(n, dtype='int64')
  rows, cols = np.meshgrid(axis, axis, indexing='ij')
  to_lower = np.stack((rows, cols, rows + 1, cols), axis=-1)
  to_right = np.stack((rows, cols, rows, cols + 1), axis=-1)
  #candidates[i, j, 0] is the lower edge of pixel (i, j), candidates[i, j, 1] the right one
  candidates = np.stack((to_lower, to_right), axis=2)
  #the last row has no lower neighbour, the last column no right neighbour
  inside = np.stack((rows < n-1, cols < n-1), axis=2)
  return candidates[inside]

In [0]:
def TV(x,edges):
  """
  total variation of x over the pixel graph 'edges'
  x = tensor indexed as batch x channel x row x column
  edges = array with one row [i, j, i2, j2] per pair of neighbouring pixels
  returns the sum of absolute differences across all edges, divided by the
  number of edges and by the batch size
  """
  source = x[:,:,edges[:,0],edges[:,1]]
  target = x[:,:,edges[:,2],edges[:,3]]
  total_jump = torch.abs(source - target).sum()
  return total_jump / edges.shape[0] / x.shape[0]

In [0]:
#[11]
def train_TV(model, optimizer, edges, weight_TV, args):
  """train for one epoch with a total variation penalty

  model = the network to optimize
  optimizer = torch optimizer built on the parameters of model
  edges = pixel adjacency graph, as returned by compute_adjacency_graph
  weight_TV = float, weight of the TV term in the total loss
  args = configuration, args.batch_size and args.n_class are read here
  returns the confusion matrix of the epoch and the averages of the
  total loss, the cross-entropy term and the (unweighted) TV term
  """
  model.train() #switch the model in training mode

  #the loader function will take care of the batching
  loader = torch.utils.data.DataLoader(train_set, \
         batch_size=args.batch_size, shuffle=True, drop_last=True)
  #tqdm will provide some nice progress bars
  loader = tqdm(loader, ncols=500)

  #will keep track of the loss
  loss_meter = tnt.meter.AverageValueMeter()
  loss_acc_meter = tnt.meter.AverageValueMeter()
  loss_TV_meter = tnt.meter.AverageValueMeter()
  cm = ConfusionMatrix(args.n_class, class_names = class_names)

  for tiles, gt in loader:

    optimizer.zero_grad() #put gradient to zero

    pred = model(tiles) #compute the prediction

    #the ground truth stays on the CPU, so both terms are computed there; 99 = unannotated
    loss_acc = nn.functional.cross_entropy(pred.cpu(),gt, ignore_index=99)
    loss_TV = TV(pred.cpu(), edges)

    loss = loss_acc + loss_TV * weight_TV

    loss.backward() #compute gradients

    #we clamp each gradient coordinate to [-1, 1] (value clipping, not norm clipping);
    #parameters without a gradient are skipped by the utility
    nn.utils.clip_grad_value_(model.parameters(), 1)

    optimizer.step() #one SGD step

    loss_acc_meter.add(loss_acc.item())
    loss_TV_meter.add(loss_TV.item())
    loss_meter.add(loss.item())
    labeled = np.where(gt.view(-1)!=99)[0] #select gt with a label
    #need to put the prediction back on the cpu and convert to numpy
    cm.add_batch(gt.view(-1)[labeled], pred.argmax(1).view(-1)[labeled].cpu().detach().numpy())

  return cm, loss_meter.value()[0], loss_acc_meter.value()[0], loss_TV_meter.value()[0]

def eval_TV(model, edges, weight_TV, args):
  """eval on test/validation set, with the total variation penalty

  model = the network to evaluate
  edges = pixel adjacency graph, as returned by compute_adjacency_graph
  weight_TV = float, weight of the TV term in the total loss
  args = configuration, args.n_class is read here
  returns the confusion matrix and the averages of the total loss,
  the cross-entropy term and the (unweighted) TV term
  """

  model.eval() #switch in eval mode

  #the loader function will take care of the batching
  loader = torch.utils.data.DataLoader(test_set, batch_size=1, shuffle=False, drop_last=False)
  #tqdm will provide some nice progress bars
  loader = tqdm(loader, ncols=500)

  #will keep track of the loss
  loss_meter = tnt.meter.AverageValueMeter()
  loss_acc_meter = tnt.meter.AverageValueMeter()
  loss_TV_meter = tnt.meter.AverageValueMeter()
  cm = ConfusionMatrix(args.n_class, class_names = class_names)

  #no gradient is needed for evaluation, do not build the autograd graph
  with torch.no_grad():
    for tiles, gt in loader:

      pred = model(tiles) #compute the prediction

      loss_acc = nn.functional.cross_entropy(pred.cpu(),gt, ignore_index=99)
      loss_TV = TV(pred.cpu(), edges)

      loss = loss_acc + loss_TV * weight_TV

      loss_acc_meter.add(loss_acc.item())
      loss_TV_meter.add(loss_TV.item())
      loss_meter.add(loss.item())
      labeled = np.where(gt.view(-1)!=99)[0] #select gt with a label
      #need to put the prediction back on the cpu and convert to numpy
      cm.add_batch(gt.view(-1)[labeled], pred.argmax(1).view(-1)[labeled].cpu().detach().numpy())

  return cm, loss_meter.value()[0], loss_acc_meter.value()[0], loss_TV_meter.value()[0]

def train_full_TV(args):
  """The full training loop with an annealed total variation penalty

  Same schedule as the plain training loop, except that the TV weight grows
  linearly with the epoch: args.weight_TV_base * i_epoch / args.n_epoch
  (zero on the first epoch). Returns the trained model.
  """
  TESTCOLOR = '\033[104m'
  TRAINCOLOR = '\033[100m'
  NORMALCOLOR = '\033[0m'

  #initialize the model
  model = SegNet(args.n_channel, args.conv_width, args.dconv_width, args.n_class)

  #pixel graph of a 256 x 256 tile, shared by every TV evaluation
  edges = compute_adjacency_graph(256)

  n_parameters = sum([p.numel() for p in model.parameters()])
  print('Total number of parameters: {}'.format(n_parameters))

  #define the optimizer
  #adam optimizer is always a good guess for classification
  optimizer = optim.Adam(model.parameters(), lr=args.lr)

  for i_epoch in range(args.n_epoch):
    #train one epoch
    weight_TV = args.weight_TV_base * i_epoch / args.n_epoch
    cm_train, loss_train, loss_acc_train, loss_TV_train = train_TV(model, optimizer, edges, weight_TV, args)
    print(TRAINCOLOR)
    #class_IoU prints the per-class values itself, so it must run after the color switch
    train_oa = cm_train.overall_accuracy()
    train_miou = cm_train.class_IoU()
    print('Epoch %3d -> Overall Accuracy: %3.2f%% mIoU : %3.2f%% Acc Loss: %1.4f TV Loss: %1.4f Loss: %1.4f' % \
          (i_epoch, train_oa, train_miou, loss_acc_train, loss_TV_train, loss_train) + NORMALCOLOR)

    is_last_epoch = i_epoch == args.n_epoch - 1
    if is_last_epoch or (args.n_epoch_test != 0 and i_epoch % args.n_epoch_test == 0 and i_epoch > 0):
      #periodic testing
      cm_test, loss_test, loss_acc_test, loss_TV_test = eval_TV(model, edges, weight_TV, args)
      print(TESTCOLOR)
      test_oa = cm_test.overall_accuracy()
      test_miou = cm_test.class_IoU()
      print('Epoch %3d -> Overall Accuracy: %3.2f%% mIoU : %3.2f%% Acc Loss: %1.4f TV Loss: %1.4f Loss: %1.4f' % \
          (i_epoch, test_oa, test_miou, loss_acc_test, loss_TV_test, loss_test) + NORMALCOLOR)
      viewer(n_shown = 1, train = False, model = model, category = 'cigpe', use_mask = False)
  return model

In [0]:
#[16] configuration and launch of the training with the TV penalty
# NOTE: a Mock silently returns a new Mock for any attribute that was never
# set, so a typo in an option name is not reported here.
args = mock.Mock() #stores the parameters
args.n_epoch = 50
args.n_epoch_test = int(5) #periodicity of evaluation on test set
args.batch_size = 16
args.n_class = 6
args.n_channel = 4
args.conv_width = [16,16,32,32,64,32] #encoder widths, passed to SegNet as encoder_conv_width
args.dconv_width = [32,16,32,16] #decoder widths, passed to SegNet as decoder_conv_width
args.cuda = 1
args.lr = 5e-3
args.weight_TV_base = 0.01 #TV weight that the linear annealing schedule grows towards

trained_model = train_full_TV(args)
