# Coding and Debugging Logs for Segmentation-Depth Project

This document records the whole story of how I carried out this project, in terms of both coding and core ideas.

## Debugging Logs
- 2019/03/12: Error: `Shape must be rank 0 but is rank 1 for 'BatchDataset' (op: 'BatchDataset') with input shapes: [], [1].` It is due to the trailing ',' in `self.batch_size = batch_size,`, which turns `batch_size` into a one-element tuple (rank 1) instead of a scalar (rank 0). I changed it to `self.batch_size = batch_size`;
- UserWarning: An unusually high number of `Iterator.get_next()` calls was detected. This often indicates that `Iterator.get_next()` is being called inside a training loop, which will cause gradual slowdown and eventual resource exhaustion. If this is the case, restructure your code to call `next_element = iterator.get_next()` once outside the loop, and use `next_element` as the input to some computation that is invoked inside the loop. warnings.warn(GET_NEXT_CALL_WARNING_MESSAGE)

## Custom Caffe layers: im2parity, im2dist and DistLoss
The following are the Caffe layers defined in the [Segmentation-Aware Convolutional Networks Using Local Attention Masks](https://github.com/aharley/segaware) project. Let us analyze them in detail.

### 1) im2col_layer in `~/seg-depth/segaware-master/caffe/src/caffe/layers/im2col_layer.cu`:
```cpp
#include <vector>

#include "caffe/layers/im2col_layer.hpp"
#include "caffe/util/im2col.hpp"

namespace caffe {

/**
 * @brief GPU forward pass of Im2colLayer.
 *
 * Reads bottom[0] and writes top[0], handling the num_ items one at a time.
 * When force_nd_im2col_ is unset and there are exactly two spatial axes, the
 * 2-D routine im2col_gpu is called with the layer parameters read through
 * cpu_data(); otherwise im2col_nd_gpu is called with the gpu_data() /
 * gpu_shape() pointers.
 *
 * @param bottom bottom[0] supplies the input data.
 * @param top    top[0] receives the result.
 */
template <typename Dtype>
void Im2colLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {
  const Dtype* const input = bottom[0]->gpu_data();
  Dtype* const output = top[0]->mutable_gpu_data();
  // Only consumed by the N-D routine.
  const int num_kernels = channels_ * top[0]->count(channel_axis_ + 1);
  // Neither operand changes inside the loop, so pick the code path once.
  const bool use_2d_routine = !force_nd_im2col_ && num_spatial_axes_ == 2;

  for (int item = 0; item < num_; ++item) {
    const Dtype* const item_input = input + item * bottom_dim_;
    Dtype* const item_output = output + item * top_dim_;
    if (!use_2d_routine) {
      im2col_nd_gpu(item_input, num_spatial_axes_, num_kernels,
          bottom[0]->gpu_shape() + channel_axis_,
          top[0]->gpu_shape() + channel_axis_,
          kernel_shape_.gpu_data(), pad_.gpu_data(), stride_.gpu_data(),
          dilation_.gpu_data(), item_output);
      continue;
    }
    im2col_gpu(item_input, channels_,
        bottom[0]->shape(channel_axis_ + 1),
        bottom[0]->shape(channel_axis_ + 2),
        kernel_shape_.cpu_data()[0], kernel_shape_.cpu_data()[1],
        pad_.cpu_data()[0], pad_.cpu_data()[1],
        stride_.cpu_data()[0], stride_.cpu_data()[1],
        dilation_.cpu_data()[0], dilation_.cpu_data()[1],
        item_output);
  }
}

/**
 * @brief GPU backward pass of Im2colLayer.
 *
 * Reads the diff of top[0] and writes the diff of bottom[0], one item at a
 * time, using col2im_gpu for the 2-D case (force_nd_im2col_ unset and two
 * spatial axes) and col2im_nd_gpu otherwise.
 *
 * @param top            top[0] supplies the incoming gradient.
 * @param propagate_down not consulted by this implementation.
 * @param bottom         bottom[0] receives the gradient.
 */
template <typename Dtype>
void Im2colLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
  const Dtype* const grad_out = top[0]->gpu_diff();
  Dtype* const grad_in = bottom[0]->mutable_gpu_diff();
  // Neither operand changes inside the loop, so pick the code path once.
  const bool use_2d_routine = !force_nd_im2col_ && num_spatial_axes_ == 2;

  for (int item = 0; item < num_; ++item) {
    const Dtype* const item_grad_out = grad_out + item * top_dim_;
    Dtype* const item_grad_in = grad_in + item * bottom_dim_;
    if (!use_2d_routine) {
      col2im_nd_gpu(item_grad_out, num_spatial_axes_, bottom_dim_,
          bottom[0]->gpu_shape() + channel_axis_,
          top[0]->gpu_shape() + channel_axis_,
          kernel_shape_.gpu_data(), pad_.gpu_data(), stride_.gpu_data(),
          dilation_.gpu_data(), item_grad_in);
      continue;
    }
    col2im_gpu(item_grad_out, channels_,
        bottom[0]->shape(channel_axis_ + 1),
        bottom[0]->shape(channel_axis_ + 2),
        kernel_shape_.cpu_data()[0], kernel_shape_.cpu_data()[1],
        pad_.cpu_data()[0], pad_.cpu_data()[1],
        stride_.cpu_data()[0], stride_.cpu_data()[1],
        dilation_.cpu_data()[0], dilation_.cpu_data()[1],
        item_grad_in);
  }
}


INSTANTIATE_LAYER_GPU_FUNCS(Im2colLayer);

}  // namespace caffe
```

### 2) im2parity_layer in `~/seg-depth/segaware-master/caffe/src/caffe/layers/im2parity_layer.cu`:
```cpp
#include <vector>

#include "caffe/layers/im2parity_layer.hpp"
#include "caffe/util/im2parity.hpp"

namespace caffe {

/**
 * @brief GPU forward pass of Im2parityLayer.
 *
 * Calls im2parity_gpu once per item (num_ in total), reading from bottom[0]
 * and writing to top[0]. Kernel, pad, stride and dilation values are read on
 * the host through cpu_data(); has_ignore_label_ and ignore_label_ are
 * forwarded unchanged to im2parity_gpu.
 *
 * Compared with the upstream listing, the unused local `num_kernels` and the
 * commented-out debug logging were dropped; the computation is unchanged.
 *
 * @param bottom bottom[0] supplies the input data.
 * @param top    top[0] receives the result.
 */
template <typename Dtype>
void Im2parityLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
                                        const vector<Blob<Dtype>*>& top) {
  const Dtype* bottom_data = bottom[0]->gpu_data();
  Dtype* top_data = top[0]->mutable_gpu_data();
  for (int n = 0; n < num_; ++n) {
    im2parity_gpu(bottom_data + n * bottom_dim_, channels_,
        bottom[0]->shape(channel_axis_ + 1),
        bottom[0]->shape(channel_axis_ + 2),
        kernel_shape_.cpu_data()[0], kernel_shape_.cpu_data()[1],
        pad_.cpu_data()[0], pad_.cpu_data()[1],
        stride_.cpu_data()[0], stride_.cpu_data()[1],
        dilation_.cpu_data()[0], dilation_.cpu_data()[1],
        has_ignore_label_, ignore_label_,
        top_data + n * top_dim_);
  }
}

// GPU backward pass of Im2parityLayer: there is no gradient computation here.
// The body consists solely of the NOT_IMPLEMENTED macro (defined outside this
// listing; presumably it aborts with an error -- confirm against the Caffe
// headers). None of the three arguments is read.
template <typename Dtype>
void Im2parityLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
  NOT_IMPLEMENTED;
}


INSTANTIATE_LAYER_GPU_FUNCS(Im2parityLayer);

}  // namespace caffe
```

### 3) im2dist_layer in `~/seg-depth/segaware-master/caffe/src/caffe/layers/im2dist_layer.cu`:

```cpp
#include <vector>
#include "caffe/layers/im2dist_layer.hpp"
#include "caffe/util/im2dist.hpp"

namespace caffe {

/**
 * @brief GPU forward pass of Im2distLayer.
 *
 * Calls im2dist_gpu once per item (num_ in total). Each call reads a slice of
 * bottom[0], writes a slice of top[0] and a slice of the member blob diff_
 * (stepped by diff_dim_); Backward_gpu later reads diff_ back. norm_,
 * remove_center_ and remove_bounds_ are forwarded unchanged to im2dist_gpu.
 *
 * Compared with the upstream listing, the unused local `num_kernels` and the
 * commented-out debug logging were dropped; the computation is unchanged.
 *
 * @param bottom bottom[0] supplies the input data.
 * @param top    top[0] receives the result.
 */
template <typename Dtype>
void Im2distLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
                                      const vector<Blob<Dtype>*>& top) {
  const Dtype* bottom_data = bottom[0]->gpu_data();
  Dtype* top_data = top[0]->mutable_gpu_data();
  Dtype* diff_data = diff_.mutable_gpu_data();
  for (int n = 0; n < num_; ++n) {
    im2dist_gpu(bottom_data + n * bottom_dim_, channels_,
        bottom[0]->shape(channel_axis_ + 1),
        bottom[0]->shape(channel_axis_ + 2),
        kernel_shape_.cpu_data()[0], kernel_shape_.cpu_data()[1],
        pad_.cpu_data()[0], pad_.cpu_data()[1],
        stride_.cpu_data()[0], stride_.cpu_data()[1],
        dilation_.cpu_data()[0], dilation_.cpu_data()[1],
        top_data + n * top_dim_,
        diff_data + n * diff_dim_, norm_,
        remove_center_, remove_bounds_);
  }
}

/**
 * @brief GPU backward pass of Im2distLayer.
 *
 * Calls dist2im_gpu once per item (num_ in total), combining the diff of
 * top[0] with the contents of the member blob diff_ and writing into the diff
 * of bottom[0].
 *
 * @param top            top[0] supplies the incoming gradient.
 * @param propagate_down not consulted by this implementation.
 * @param bottom         bottom[0] receives the gradient.
 */
template <typename Dtype>
void Im2distLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
  const Dtype* const grad_out = top[0]->gpu_diff();
  const Dtype* const stored_diff = diff_.gpu_data();
  Dtype* const grad_in = bottom[0]->mutable_gpu_diff();

  for (int item = 0; item < num_; ++item) {
    const Dtype* const item_grad_out = grad_out + item * top_dim_;
    const Dtype* const item_stored_diff = stored_diff + item * diff_dim_;
    Dtype* const item_grad_in = grad_in + item * bottom_dim_;
    dist2im_gpu(item_grad_out,
        item_stored_diff,
        channels_,
        bottom[0]->shape(channel_axis_ + 1),
        bottom[0]->shape(channel_axis_ + 2),
        kernel_shape_.cpu_data()[0], kernel_shape_.cpu_data()[1],
        pad_.cpu_data()[0], pad_.cpu_data()[1],
        stride_.cpu_data()[0], stride_.cpu_data()[1],
        dilation_.cpu_data()[0], dilation_.cpu_data()[1],
        item_grad_in, norm_,
        remove_center_, remove_bounds_);
  }
}

INSTANTIATE_LAYER_GPU_FUNCS(Im2distLayer);

}  // namespace caffe
```

### 4) dist_loss_gpu_kernel() in `~/seg-depth/segaware-master/caffe/src/caffe/layers/dist_loss_layer.cu`:
```cpp
#include <vector>
#include "caffe/layers/dist_loss_layer.hpp"
#include "caffe/util/math_functions.hpp"

namespace caffe {

/**
 * @brief CUDA kernel computing the per-entry DistLoss term for one image.
 *
 * Each loop index selects one (h_out, w_out) position; the kernel then walks
 * all channels_col entries at that position, which lie height_col * width_col
 * elements apart. For every entry:
 *   - if has_ignore_label is set and the parity equals ignore_label, the
 *     entry is skipped and diff_col is left untouched;
 *   - if the parity is non-zero, diff_col =  max(dist - alpha, 0);
 *   - otherwise,                 diff_col = -max(beta - dist, 0).
 *
 * Fix over the upstream listing: the ignore branch used `continue` before the
 * three running pointers were advanced, so after the first ignored entry every
 * remaining channel at that position re-read the same entry and was skipped
 * as well. Indexing by an explicit per-channel offset removes that hazard.
 *
 * @param n                number of positions to process.
 * @param dist_col         input distances.
 * @param parity_col       input parities (converted to int before use).
 * @param height_col       height of one channel plane.
 * @param width_col        width of one channel plane.
 * @param channels_col     number of channel planes.
 * @param has_ignore_label whether ignore_label is honoured.
 * @param ignore_label     parity value marking entries to skip.
 * @param alpha            threshold for entries with non-zero parity.
 * @param beta             threshold for entries with zero parity.
 * @param diff_col         output buffer, laid out like dist_col.
 */
template <typename Dtype>
__global__ void dist_loss_gpu_kernel(const int n, const Dtype* dist_col, const Dtype* parity_col,
    const int height_col, const int width_col, const int channels_col,
    const bool has_ignore_label, const Dtype ignore_label,
    const Dtype alpha, const Dtype beta,
    Dtype* diff_col) {
  const int plane_size = height_col * width_col;
  CUDA_KERNEL_LOOP(index, n) {
    const int w_out = index % width_col;
    const int h_out = (index / width_col) % height_col;
    const int position = h_out * width_col + w_out;
    for (int i = 0; i < channels_col; ++i) {
      // The offset is recomputed for every channel, so skipping an ignored
      // entry can no longer stall the traversal.
      const int offset = i * plane_size + position;
      const int parity = static_cast<int>(parity_col[offset]);
      if (has_ignore_label && parity == ignore_label) {
        continue;
      }
      const Dtype dist = dist_col[offset];
      if (parity) {
        diff_col[offset] = max(dist - alpha, Dtype(0));
      } else {
        diff_col[offset] = -max(beta - dist, Dtype(0));
      }
    }
  }
}

/**
 * @brief GPU forward pass of DistLossLayer.
 *
 * Zeroes the member blob diff_, fills it through dist_loss_gpu_kernel from the
 * distances in bottom[0] and the parities in bottom[1], and writes
 * asum(diff_) / count into top[0] (asum as computed by caffe_gpu_asum).
 *
 * Fixes over the upstream listing:
 *   - Upstream zeroed and processed only the first
 *     channels * height * width elements (the first image), while the loss was
 *     summed and normalized over the full `count`. Now all `count` elements
 *     are zeroed and the kernel is launched once per image. For num == 1 the
 *     result is unchanged.
 *   - The unused `dist_cpu` local, which called diff_.cpu_data() for no
 *     purpose, was removed.
 *
 * NOTE(review): assumes bottom[1] and diff_ hold at least bottom[0]->count()
 * elements -- confirm against the layer's Reshape.
 *
 * @param bottom bottom[0] = distances, bottom[1] = parities.
 * @param top    top[0] receives the scalar loss.
 */
template <typename Dtype>
void DistLossLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {
  const int num = bottom[0]->num();
  const int height_col = bottom[0]->height();
  const int width_col = bottom[0]->width();
  const int channels_col = bottom[0]->channels();
  const int count = bottom[0]->count();
  // start one kernel per position, then within go through the channels
  const int num_kernels = height_col * width_col;
  const int image_dim = channels_col * num_kernels;

  const Dtype* dist_col = bottom[0]->gpu_data();
  const Dtype* parity_col = bottom[1]->gpu_data();
  Dtype* diff_col = diff_.mutable_gpu_data();
  // Ignored entries are never written by the kernel, so clear the whole
  // buffer that caffe_gpu_asum reads below.
  caffe_gpu_set(count, Dtype(0), diff_col);
  for (int n = 0; n < num; ++n) {
    const int offset = n * image_dim;
    // NOLINT_NEXT_LINE(whitespace/operators)
    dist_loss_gpu_kernel<Dtype><<<CAFFE_GET_BLOCKS(num_kernels),
                                  CAFFE_CUDA_NUM_THREADS>>>(
        num_kernels, dist_col + offset, parity_col + offset,
        height_col, width_col, channels_col,
        has_ignore_label_, ignore_label_,
        alpha_, beta_,
        diff_col + offset);
    CUDA_POST_KERNEL_CHECK;
  }
  Dtype loss = 0;
  caffe_gpu_asum(count, diff_col, &loss);
  top[0]->mutable_cpu_data()[0] = loss / count;
}

/**
 * @brief GPU backward pass of DistLossLayer.
 *
 * Copies the contents of the member blob diff_ (filled by Forward_gpu) into
 * the diff of bottom[0] and scales the copy by 1 / count, where count is
 * bottom[0]->count().
 *
 * @param top            not read by this implementation.
 * @param propagate_down not consulted by this implementation.
 * @param bottom         bottom[0] receives the gradient.
 */
template <typename Dtype>
void DistLossLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
  const int num_elements = bottom[0]->count();
  Dtype* const grad_in = bottom[0]->mutable_gpu_diff();
  const Dtype* const stored_diff = diff_.gpu_data();

  // Step 1: gradient <- stored per-entry diff.
  caffe_copy(num_elements, stored_diff, grad_in);
  // Step 2: normalize by the number of entries.
  const Dtype scale = Dtype(1) / num_elements;
  caffe_gpu_scal(num_elements, scale, grad_in);
}

INSTANTIATE_LAYER_GPU_FUNCS(DistLossLayer);

}  // namespace caffe
```