In [1]:
import numpy as np
import matplotlib.pyplot as plt
import cv2 as cv
import os, sys
import glob
import argparse
import time
import csv

import numpy as np
import torch
import torch.backends.cudnn as cudnn
import torch.optim
import torch.utils.data
import custom_transforms
import models
from utils import tensor2array, save_checkpoint, save_path_formatter, log_output_tensorboard

from loss_functions import photometric_reconstruction_loss, explainability_loss, smooth_loss
from loss_functions import compute_depth_errors, compute_pose_errors
from inverse_warp import pose_vec2mat
from logger import TermLogger, AverageMeter
from tensorboardX import SummaryWriter

%matplotlib inline

In [2]:

# Command-line interface of the SfmLearner training script, rebuilt here so the
# notebook can reuse exactly the same option names and defaults.
# ArgumentDefaultsHelpFormatter appends each option's default to its help text.
parser = argparse.ArgumentParser(description='Structure from Motion Learner training on KITTI and CityScapes Dataset',
                                 formatter_class=argparse.ArgumentDefaultsHelpFormatter)

# --- dataset location and layout -------------------------------------------
# `data` is the only positional argument: it must be passed without a flag.
parser.add_argument('data', metavar='DIR',
                    help='path to dataset')
parser.add_argument('--dataset-format', default='sequential', metavar='STR',
                    help='dataset format, stacked: stacked frames (from original TensorFlow code) '
                    'sequential: sequential folders (easier to convert to with a non KITTI/Cityscape dataset')
parser.add_argument('--sequence-length', type=int, metavar='N', help='sequence length for training', default=3)

# --- warping / pose options ---------------------------------------------------
parser.add_argument('--rotation-mode', type=str, choices=['euler', 'quat'], default='euler',
                    help='rotation mode for PoseExpnet : euler (yaw,pitch,roll) or quaternion (last 3 coefficients)')
parser.add_argument('--padding-mode', type=str, choices=['zeros', 'border'], default='zeros',
                    help='padding mode for image warping : this is important for photometric differenciation when going outside target image.'
                         ' zeros will null gradients outside target image.'
                         ' border will only null gradients of the coordinate outside (x or y)')

# --- optional ground truth for validation -------------------------------------
parser.add_argument('--with-gt', action='store_true', help='use depth ground truth for validation. '
                    'You need to store it in npy 2D arrays see data/kitti_raw_loader.py for an example')
parser.add_argument('--with-pose', action='store_true', help='use pose ground truth for validation. '
                    'You need to store it in text files of 12 columns see data/kitti_raw_loader.py for an example '
                    'Note that for kitti, it is recommend to use odometry train set to test pose')

# --- training schedule and optimiser ------------------------------------------
parser.add_argument('-j', '--workers', default=4, type=int, metavar='N',
                    help='number of data loading workers')
parser.add_argument('--epochs', default=200, type=int, metavar='N',
                    help='number of total epochs to run')
parser.add_argument('--epoch-size', default=0, type=int, metavar='N',
                    help='manual epoch size (will match dataset size if not set)')
parser.add_argument('-b', '--batch-size', default=4, type=int,
                    metavar='N', help='mini-batch size')
parser.add_argument('--lr', '--learning-rate', default=2e-4, type=float,
                    metavar='LR', help='initial learning rate')
parser.add_argument('--momentum', default=0.9, type=float, metavar='M',
                    help='momentum for sgd, alpha parameter for adam')
parser.add_argument('--beta', default=0.999, type=float, metavar='M',
                    help='beta parameters for adam')
parser.add_argument('--weight-decay', '--wd', default=0, type=float,
                    metavar='W', help='weight decay')
parser.add_argument('--print-freq', default=10, type=int,
                    metavar='N', help='print frequency')
parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true',
                    help='evaluate model on validation set')

# --- checkpoints, seeding and logging -----------------------------------------
parser.add_argument('--pretrained-disp', dest='pretrained_disp', default=None, metavar='PATH',
                    help='path to pre-trained dispnet model')
# NOTE: the flag is spelled "exppose" but the value is stored as `pretrained_exp_pose`.
parser.add_argument('--pretrained-exppose', dest='pretrained_exp_pose', default=None, metavar='PATH',
                    help='path to pre-trained Exp Pose net model')
parser.add_argument('--seed', default=0, type=int, help='seed for random functions, and network initialization')
parser.add_argument('--log-summary', default='progress_log_summary.csv', metavar='PATH',
                    help='csv where to save per-epoch train and valid stats')
parser.add_argument('--log-full', default='progress_log_full.csv', metavar='PATH',
                    help='csv where to save per-gradient descent train stats')

# --- loss weights ---------------------------------------------------------------
parser.add_argument('-p', '--photo-loss-weight', type=float, help='weight for photometric loss', metavar='W', default=1)
parser.add_argument('-m', '--mask-loss-weight', type=float, help='weight for explainabilty mask loss', metavar='W', default=0)
parser.add_argument('-s', '--smooth-loss-weight', type=float, help='weight for disparity smoothness loss', metavar='W', default=0.1)

# --- image/tensorboard outputs --------------------------------------------------
parser.add_argument('--log-output', action='store_true', help='will log dispnet outputs and warped imgs at validation step')
parser.add_argument('-f', '--training-output-freq', type=int,
                    help='frequence for outputting dispnet outputs and warped imgs at training for all scales. '
                         'if 0, will not output',
                    metavar='N', default=0)

# --- network variant ------------------------------------------------------------
# Every other multi-word flag uses hyphens, so accept `--dispnet-type` as well;
# the original underscore spelling is kept as an alias for existing command lines.
# `dest` is given explicitly so the value is always read as `args.dispnet_type`.
# The trailing `;` stops the notebook from echoing the returned action object.
parser.add_argument('--dispnet-type', '--dispnet_type', dest='dispnet_type', default='single', metavar='STR',
                    help='dispnet type, single: current frame (from original code) '
                    'triple: use frame n, n+1, n-1 as input for dispnet (to capture parallax from motion)');



_StoreAction(option_strings=['--dispnet_type'], dest='dispnet_type', nargs=None, const=None, default='single', type=None, choices=None, help='dispnet type, single: current frame (from original code) triple: use frame n, n+1, n-1 as input for dispnet (to capture parallax from motion)', metavar='STR')

In [3]:
# The dataset root is machine-specific. It can be overridden through the
# SFMLEARNER_DATA_DIR environment variable instead of editing this cell; when the
# variable is unset or empty, the original hard-coded location is used.
data_root = os.environ.get('SFMLEARNER_DATA_DIR') or '/mnt/TempData/openDateset/KITTI/SfmLearner'

# Emulate the command line of the training script. `data` is a positional
# argument of the parser, so the dataset root is passed first, without a flag.
cli_args = [
    data_root,
    '-b4',                      # --batch-size 4
    '-m0.2',                    # --mask-loss-weight 0.2
    '-s0.1',                    # --smooth-loss-weight 0.1
    '--epoch-size', '3000',
    '--sequence-length', '3',
    '--log-output',
]
args = parser.parse_args(args=cli_args)

In [5]:
# Show which DispNet variant the parsed arguments select. The argument list
# parsed earlier in this notebook does not pass a dispnet-type flag, so this
# displays the parser default.
args.dispnet_type

'single'

In [6]:
from datasets.sequence_folders import SequenceFolder


In [7]:
# Input transforms for the two dataset splits.
# Both pipelines finish with the same Normalize instance (mean 0.5 and std 0.5
# on each of the three channels).
normalize = custom_transforms.Normalize(mean=[0.5] * 3, std=[0.5] * 3)

# Training: random flip and random scale/crop are applied before the
# array -> tensor conversion and the normalisation.
train_transform = custom_transforms.Compose(
    [
        custom_transforms.RandomHorizontalFlip(),
        custom_transforms.RandomScaleCrop(),
        custom_transforms.ArrayToTensor(),
        normalize,
    ]
)

# Validation: no random transforms, only conversion and normalisation.
valid_transform = custom_transforms.Compose(
    [
        custom_transforms.ArrayToTensor(),
        normalize,
    ]
)


In [8]:
# Build the train/validation datasets from the same root folder, then wrap each
# one in a DataLoader.
print("=> fetching scenes in '{}'".format(args.data))

# Options that are identical for both splits.
shared_dataset_opts = dict(seed=args.seed, sequence_length=args.sequence_length)
shared_loader_opts = dict(batch_size=args.batch_size, num_workers=args.workers, pin_memory=True)

# Both datasets are created before anything is reported, as in the original cell.
train_set = SequenceFolder(args.data, transform=train_transform, train=True, **shared_dataset_opts)
val_set = SequenceFolder(args.data, transform=valid_transform, train=False, **shared_dataset_opts)

print('{} samples found in {} train scenes'.format(len(train_set), len(train_set.scenes)))
print('{} samples found in {} valid scenes'.format(len(val_set), len(val_set.scenes)))

# Only the training loader shuffles; validation keeps the dataset order.
train_loader = torch.utils.data.DataLoader(train_set, shuffle=True, **shared_loader_opts)
val_loader = torch.utils.data.DataLoader(val_set, shuffle=False, **shared_loader_opts)


=> fetching scenes in '/mnt/TempData/openDateset/KITTI/SfmLearner'
Shifts:  [-1, 0, 1]
Shifts:  [-1, 0, 1]
42290 samples found in 58 train scenes
3398 samples found in 8 valid scenes


In [9]:
# Pull a single mini-batch from the training loader to inspect what it yields.
# `next(iter(...))` replaces the enumerate-and-break loop (whose index was never
# used) and lets an empty loader be reported explicitly instead of surfacing
# later as a NameError on `tgt_img`.
first_batch = next(iter(train_loader), None)
if first_batch is None:
    raise RuntimeError("train_loader yielded no batch; check the dataset path and split")

# Each batch unpacks into the target image, the reference images and the
# camera intrinsics with their inverse.
tgt_img, ref_imgs, intrinsics, intrinsics_inv = first_batch

print("tgt_img.shape: ", tgt_img.shape)
print("ref_imgs.shape: ")
# `ref_imgs` is iterated element by element: one tensor per reference frame.
for img in ref_imgs:
    print(img.shape)


tgt_img.shape:  torch.Size([4, 3, 128, 416])
ref_imgs.shape: 
torch.Size([4, 3, 128, 416])
torch.Size([4, 3, 128, 416])


In [18]:
# Stack the target frame and the first two reference frames along a new dim 1
# and display the resulting shape.
# NOTE(review): three tensors are stacked here, so the new dim 1 should have
# size 3; the saved output of this cell shows size 2 and was presumably produced
# by an earlier version of the cell — re-run to confirm.
torch.stack([tgt_img, ref_imgs[0], ref_imgs[1]], dim=1).shape

torch.Size([4, 2, 3, 128, 416])

In [19]:
# Stack the target frame with itself along a new dim 1 and display the shape;
# two inputs give size 2 along that new dimension.
torch.stack([tgt_img, tgt_img], dim=1).shape

torch.Size([4, 2, 3, 128, 416])

In [10]:
# Instantiate DispNetS with its default constructor arguments and print the
# module tree (layers and their hyper-parameters).
# NOTE(review): args.dispnet_type is not consulted here — confirm whether the
# 'triple' variant is meant to build a different network or input layer.
disp_net = models.DispNetS()
print(disp_net)

DispNetS(
  (conv1): Sequential(
    (0): Conv2d(3, 32, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3))
    (1): ReLU(inplace=True)
    (2): Conv2d(32, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3))
    (3): ReLU(inplace=True)
  )
  (conv2): Sequential(
    (0): Conv2d(32, 64, kernel_size=(5, 5), stride=(2, 2), padding=(2, 2))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (3): ReLU(inplace=True)
  )
  (conv3): Sequential(
    (0): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
  )
  (conv4): Sequential(
    (0): Conv2d(128, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
  )
  (conv5): Sequential(
    (0): Conv2d(256, 512, ker