In [1]:
#!/usr/bin/env python3

# Copyright (c) 2020 NVIDIA Corporation. All rights reserved.
# This work is licensed under the NVIDIA Source Code License - Non-commercial. Full
# text can be found in LICENSE.md

"""Train a UCN on image segmentation database."""

import torch
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.optim
import torch.utils.data
from torch.utils.data import DataLoader, Dataset

import argparse
import pprint
import numpy as np
import sys
import os
import os.path as osp
import cv2
import matplotlib.pyplot as plt

import _init_paths
import datasets
import networks
from fcn.config import cfg, cfg_from_file, get_output_dir
from fcn.train import train_segnet
from datasets.factory import get_dataset
import h5py
import json


tabletop_object_train
tabletop_object_test
tabletop_object_all
osd_object_test
ocid_object_test


In [2]:
class binDataset(Dataset):
    """RGB-D segmentation dataset backed by a single HDF5 file.

    The file is expected to contain the datasets ``frame0_data`` (color),
    ``frame0_depth``, ``frame0_mask`` and ``frame0_metadata`` (a JSON string
    holding ``camera.intrinsic_matrix``), all indexed by sample along the
    first axis.

    Every sample is augmented on the fly (color jitter, depth dropout,
    depth/point-cloud noise, random horizontal flip) using NumPy's global
    random state.
    """

    def __init__(self, file, noise_params=None):
        """
        @param file: path to the .h5 file
        @param noise_params: optional dict of augmentation parameters. When
            None, the module-level ``data_loading_params`` dict is looked up
            at sample time (the original behavior).
        """
        self._path = file
        self._file = None
        self.noise_params = noise_params

        # Only read the sample count here and close the file again. The
        # long-lived handle is opened lazily (see `file`) so that each
        # DataLoader worker process opens its own handle instead of
        # inheriting one from the parent process.
        with h5py.File(file, 'r') as h5:
            self.len = h5['frame0_data'].shape[0]
        self.name = '5k dataset'
        self.num_classes = 2

    @property
    def file(self):
        """Read-only h5py handle, opened on first access in the current process."""
        if self._file is None:
            self._file = h5py.File(self._path, 'r')
        return self._file

    def close(self):
        """Close the HDF5 handle if one is open; it is reopened on next access."""
        if self._file is not None:
            self._file.close()
            self._file = None

    def __getstate__(self):
        # h5py handles cannot be pickled; drop the handle so the dataset can
        # be sent to worker processes and reopen it lazily on the other side.
        state = self.__dict__.copy()
        state['_file'] = None
        return state

    def __len__(self):
        return self.len

    def add_noise_to_depth(self, depth_img, noise_params):
        """ Add noise to depth image.
            This is adapted from the DexNet 2.0 code.
            Their code: https://github.com/BerkeleyAutomation/gqcnn/blob/75040b552f6f7fb264c27d427b404756729b5e88/gqcnn/sgd_optimizer.py

            @param depth_img: a [H x W] set of depth z values
            @param noise_params: dict with keys 'gamma_shape' and 'gamma_scale'
            @return: a noisy copy of depth_img (the input is not modified)
        """
        depth_img = depth_img.copy()

        # Multiplicative noise: one Gamma random variable scales the whole image
        multiplicative_noise = np.random.gamma(noise_params['gamma_shape'], noise_params['gamma_scale'])
        depth_img = multiplicative_noise * depth_img

        return depth_img

    def add_noise_to_xyz(self, xyz_img, depth_img, noise_params):
        """ Add (approximate) Gaussian Process noise to ordered point cloud

            @param xyz_img: a [H x W x 3] ordered point cloud
            @param depth_img: a [H x W] depth image; only pixels with depth > 0 receive noise
            @param noise_params: dict with keys 'gp_rescale_factor_range' and 'gaussian_scale_range'
            @return: a noisy copy of xyz_img (the input is not modified)
        """
        xyz_img = xyz_img.copy()

        H, W, C = xyz_img.shape

        # Additive noise: Gaussian process, approximated by zero-mean anisotropic Gaussian random variable,
        #                 which is rescaled with bicubic interpolation.
        gp_rescale_factor = np.random.randint(noise_params['gp_rescale_factor_range'][0],
                                              noise_params['gp_rescale_factor_range'][1])
        gp_scale = np.random.uniform(noise_params['gaussian_scale_range'][0],
                                     noise_params['gaussian_scale_range'][1])

        # Keep at least one cell per axis so that images smaller than the
        # rescale factor do not produce an empty array for cv2.resize.
        small_H, small_W = np.maximum((np.array([H, W]) / gp_rescale_factor).astype(int), 1)
        additive_noise = np.random.normal(loc=0.0, scale=gp_scale, size=(small_H, small_W, C))
        additive_noise = cv2.resize(additive_noise, (W, H), interpolation=cv2.INTER_CUBIC)
        valid = depth_img > 0
        xyz_img[valid, :] += additive_noise[valid, :]

        return xyz_img

    def dropout_random_ellipses(self, depth_img, noise_params):
        """ Randomly drop a few ellipses in the image for robustness.
            This is adapted from the DexNet 2.0 code.
            Their code: https://github.com/BerkeleyAutomation/gqcnn/blob/75040b552f6f7fb264c27d427b404756729b5e88/gqcnn/sgd_optimizer.py

            @param depth_img: a [H x W] set of depth z values
            @param noise_params: dict with keys 'ellipse_dropout_mean',
                                 'ellipse_gamma_shape' and 'ellipse_gamma_scale'
            @return: a copy of depth_img with the sampled ellipses set to 0
        """
        depth_img = depth_img.copy()

        # Sample number of ellipses to dropout
        num_ellipses_to_dropout = np.random.poisson(noise_params['ellipse_dropout_mean'])

        # Candidate ellipse centers are the pixels with positive depth
        nonzero_pixel_indices = np.array(np.where(depth_img > 0)).T # Shape: [#nonzero_pixels x 2]

        # Nothing to drop, or no pixel to center an ellipse on. Without this
        # guard np.random.choice(0, size=k) raises ValueError for k > 0.
        if num_ellipses_to_dropout == 0 or nonzero_pixel_indices.shape[0] == 0:
            return depth_img

        # Sample ellipse centers
        dropout_centers_indices = np.random.choice(nonzero_pixel_indices.shape[0], size=num_ellipses_to_dropout)
        dropout_centers = nonzero_pixel_indices[dropout_centers_indices, :] # Shape: [num_ellipses_to_dropout x 2]

        # Sample ellipse radii and angles
        x_radii = np.random.gamma(noise_params['ellipse_gamma_shape'], noise_params['ellipse_gamma_scale'], size=num_ellipses_to_dropout)
        y_radii = np.random.gamma(noise_params['ellipse_gamma_shape'], noise_params['ellipse_gamma_scale'], size=num_ellipses_to_dropout)
        angles = np.random.randint(0, 360, size=num_ellipses_to_dropout)

        # Dropout ellipses
        for i in range(num_ellipses_to_dropout):
            row, col = dropout_centers[i, :]
            # Plain Python numbers: some OpenCV builds refuse NumPy integer
            # scalars inside the center/axes tuples.
            center = (int(col), int(row)) # OpenCV expects (x, y)
            axes = (int(np.round(x_radii[i])), int(np.round(y_radii[i])))

            # dropout the ellipse
            mask = np.zeros_like(depth_img)
            mask = cv2.ellipse(mask, center, axes, angle=float(angles[i]), startAngle=0, endAngle=360, color=1, thickness=-1)
            depth_img[mask == 1] = 0

        return depth_img

    def random_color_warp(self, image, d_h=None, d_s=None, d_l=None):
        """ Given an RGB image [H x W x 3], add random hue, saturation and luminosity to the image

            Code adapted from: https://github.com/yuxng/PoseCNN/blob/master/lib/utils/blob.py

            @param image: a [H x W x 3] image; it is rounded and cast to uint8 before conversion
            @param d_h, d_s, d_l: optional fixed offsets for hue, saturation and
                                  luminosity; each is sampled uniformly from
                                  [-0.1, 0.1) * 256 when None
            @return: the warped image as float32
        """
        # Set random hue, luminosity and saturation which ranges from -0.1 to 0.1
        # (sampling order h, l, s is kept so the random stream is unchanged)
        if d_h is None:
            d_h = (np.random.uniform() - 0.5) * 0.2 * 256
        if d_l is None:
            d_l = (np.random.uniform() - 0.5) * 0.2 * 256
        if d_s is None:
            d_s = (np.random.uniform() - 0.5) * 0.2 * 256

        # Convert the RGB to HLS
        hls = cv2.cvtColor(image.round().astype(np.uint8), cv2.COLOR_RGB2HLS)
        h, l, s = cv2.split(hls)

        # Add the values to the image H, L, S
        # NOTE(review): hue is clipped to [0, 255] rather than wrapped; confirm
        # this is intended given OpenCV's 8-bit hue range.
        new_h = np.round(np.clip(h + d_h, 0, 255)).astype(np.uint8)
        new_l = np.round(np.clip(l + d_l, 0, 255)).astype(np.uint8)
        new_s = np.round(np.clip(s + d_s, 0, 255)).astype(np.uint8)

        # Convert the HLS to RGB
        new_hls = cv2.merge((new_h, new_l, new_s)).astype(np.uint8)
        new_im = cv2.cvtColor(new_hls, cv2.COLOR_HLS2RGB)

        return new_im.astype(np.float32)

    # Computes point cloud from depth image and camera intrinsics
    def compute_xyz(self, depth_img, intrinsic):
        """ Back-project a depth image into an ordered point cloud.

            @param depth_img: a [H x W] set of depth z values
            @param intrinsic: 3x3 camera matrix (nested sequence or array);
                              only fx, fy and the principal point are read
            @return: a [H x W x 3] array of (x, y, z) per pixel
        """
        fx = intrinsic[0][0]
        fy = intrinsic[1][1]
        px = intrinsic[0][2]
        py = intrinsic[1][2]
        height, width = depth_img.shape

        # indices[..., 0] is the row (v) index, indices[..., 1] the column (u) index
        indices = np.indices((height, width), dtype=np.float32).transpose(1,2,0)
        z_e = depth_img
        x_e = (indices[..., 1] - px) * z_e / fx
        y_e = (indices[..., 0] - py) * z_e / fy
        xyz_img = np.stack([x_e, y_e, z_e], axis=-1) # Shape: [H x W x 3]
        return xyz_img

    def __getitem__(self, index):
        """ Load, augment and tensorize one sample.

            @param index: sample index into the HDF5 datasets
            @return: dict with
                'image_color': channel-first float image, scaled by 1/255 and
                               mean-subtracted with cfg.PIXEL_MEANS
                'depth':       channel-first xyz point cloud computed from depth
                'label':       label mask with a leading singleton dimension
        """
        cat = 'frame0_'
        noise_params = self.noise_params if self.noise_params is not None else data_loading_params
        h5 = self.file

        # Loads the color image
        rgb = h5[cat + 'data'][index][...,0:3]
        rgb = cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR)

        # Applies color jittering
        rgb = self.random_color_warp(rgb)

        # Loads the depth image and camera intrinsics
        depth = h5[cat + 'depth'][index]
        depth[depth < -1000] = 0
        intrinsic = json.loads(h5[cat + 'metadata'][index])['camera']['intrinsic_matrix']

        # Adds noise and cuts out random ellipses from depth
        depth = self.add_noise_to_depth(self.dropout_random_ellipses(depth, noise_params), noise_params)

        xyz = self.compute_xyz(depth, intrinsic).astype(np.float32)

        # Adds noise to the point cloud
        xyz = self.add_noise_to_xyz(xyz, depth, noise_params)

        # Loads the label masks
        label = h5[cat + 'mask'][index]
        # Shift ids by one modulo 2**16, then fold the largest id onto 0.
        # NOTE(review): presumably this maps a 65535/-1 background and the
        # highest-numbered id to the background label; confirm against the data.
        label = (label + 1) % 65536
        max_label = np.max(label)
        if max_label > 0:
            # Skipped for an all-zero mask, which would otherwise divide by zero.
            label %= max_label

        # Randomly flip the image/point cloud/label
        if np.random.uniform() < .5:
            rgb = np.fliplr(rgb).copy()
            depth = np.fliplr(depth).copy()
            xyz = np.fliplr(xyz).copy()
            label = np.fliplr(label).copy()

        # Turn everything into tensors and reshape as needed
        im_tensor = torch.from_numpy(rgb) / 255.0
        pixel_mean = torch.tensor(cfg.PIXEL_MEANS / 255.0).float()
        im_tensor -= pixel_mean
        image_blob = im_tensor.permute(2, 0, 1)
        sample = {'image_color': image_blob}

        depth_blob = torch.from_numpy(xyz).permute(2, 0, 1)
        sample['depth'] = depth_blob
        label_blob = torch.from_numpy(label)
        sample['label'] = label_blob.unsqueeze(0)

        return sample

# Augmentation / camera settings read by binDataset when it builds a sample.
data_loading_params = dict(
    # --- Camera / frustum ---
    img_width=256,
    img_height=256,
    near=0.01,
    far=100,
    fov=45,  # vertical field of view, in degrees

    use_data_augmentation=True,

    # --- Multiplicative (gamma) depth noise ---
    gamma_shape=1000.,
    gamma_scale=0.001,

    # --- Additive point-cloud noise ---
    gaussian_scale=0.005,  # standard deviation of 5mm
    gp_rescale_factor=4,
    gaussian_scale_range=[0., 0.003],
    gp_rescale_factor_range=[12, 20],

    # --- Random ellipse dropout ---
    ellipse_dropout_mean=10,
    ellipse_gamma_shape=5.0,
    ellipse_gamma_scale=1.0,

    # --- Random high-gradient dropout ---
    gradient_dropout_left_mean=15,
    gradient_dropout_alpha=2.,
    gradient_dropout_beta=5.,

    # --- Random pixel dropout ---
    pixel_dropout_alpha=1.,
    pixel_dropout_beta=10.,
)

## Sets parameters and performs the training loop

In [7]:
# Default arguments, followed by the overrides used for this run.
args = {'epochs':40000, 'startepoch':0, 'pretrained':None, 'cfg_file':None, 'solver':'sgd', 
'dataset_name':'shapenet_scene_train', 'dataset_background_name':'background_nvidia', 
'randomize':False, 'network_name':None, 'cad_name':None, 'pose_name':None}

args['network_name'] = 'seg_resnet34_8s_embedding'
args['cfg_file'] = '../experiments/cfgs/seg_resnet34_8s_embedding_cosine_rgbd_add_tabletop.yml'
args['solver'] = 'adam'
args['epochs'] = 24
BATCH_SIZE = 16


print('Called with args:')
print(args)

# Sets the config file
if args['cfg_file'] is not None:
    cfg_from_file(args['cfg_file'])

print('Using config:')
pprint.pprint(cfg)

# Sets random seeds (or not)
if not args['randomize']:
    # Fix the random seeds for reproducibility. NumPy drives the data
    # augmentation; torch drives DataLoader shuffling and the per-worker seeds.
    np.random.seed(cfg.RNG_SEED)
    torch.manual_seed(cfg.RNG_SEED)


def _seed_worker(worker_id):
    """Seed NumPy in a DataLoader worker from that worker's torch seed.

    Forked workers otherwise start from the same NumPy random state, which
    makes them draw identical augmentation noise.
    """
    np.random.seed(torch.initial_seed() % 2**32)


# Prepare dataset
cfg.MODE = 'TRAIN'
file_name = "/home/thomas/Desktop/aurmr_perception-master/outputs/train_shard_000000_prime.h5"
dataset = binDataset(file_name)
num_workers = 4
dataloader = torch.utils.data.DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, 
    num_workers=num_workers, worker_init_fn=_seed_worker)
print('Use dataset `{:s}` for training'.format(dataset.name))

output_dir = './output'
print('Output will be saved to `{:s}`'.format(output_dir))
os.makedirs(output_dir, exist_ok=True)

# prepare network
if args['pretrained']:
    # NOTE: torch.load unpickles the file; only load checkpoints you trust.
    network_data = torch.load(args['pretrained'])
    if isinstance(network_data, dict) and 'model' in network_data:
        network_data = network_data['model']
    print("=> using pre-trained network '{}'".format(args['network_name']))
else:
    network_data = None
    print("=> creating network '{}'".format(args['network_name']))

network = networks.__dict__[args['network_name']](dataset.num_classes, cfg.TRAIN.NUM_UNITS, network_data).cuda()
if torch.cuda.device_count() > 1:
    cfg.TRAIN.GPUNUM = torch.cuda.device_count()
    print("Let's use", torch.cuda.device_count(), "GPUs!")
network = torch.nn.DataParallel(network).cuda()
cudnn.benchmark = True

# prepare optimizer
assert args['solver'] in ['adam', 'sgd']
print('=> setting {} solver'.format(args['solver']))
param_groups = [{'params': network.module.bias_parameters(), 'weight_decay': cfg.TRAIN.WEIGHT_DECAY},
                {'params': network.module.weight_parameters(), 'weight_decay': cfg.TRAIN.WEIGHT_DECAY}]

# Only the SGD branch uses a learning-rate schedule.
scheduler = None
if args['solver'] == 'adam':
    optimizer = torch.optim.Adam(param_groups, cfg.TRAIN.LEARNING_RATE,
                                    betas=(cfg.TRAIN.MOMENTUM, cfg.TRAIN.BETA))
elif args['solver'] == 'sgd':
    optimizer = torch.optim.SGD(param_groups, cfg.TRAIN.LEARNING_RATE,
                                momentum=cfg.TRAIN.MOMENTUM)
    # Milestones are shifted by the start epoch because the scheduler starts
    # counting from zero. `args` is a dict, so it must be indexed by key.
    scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer,
        milestones=[m - args['startepoch'] for m in cfg.TRAIN.MILESTONES],
        gamma=cfg.TRAIN.GAMMA)
cfg.epochs = args['epochs']

# main loop
for epoch in range(args['startepoch'], args['epochs']):
    # train for one epoch
    train_segnet(dataloader, network, optimizer, epoch)

    # Advance the schedule after the epoch's optimizer steps, as PyTorch
    # expects; stepping first skips the first value of the schedule.
    if scheduler is not None:
        scheduler.step()

    # save checkpoint
    if (epoch+1) % cfg.TRAIN.SNAPSHOT_EPOCHS == 0 or epoch == args['epochs'] - 1:
        state = network.module.state_dict()
        # The file name is fixed, so each snapshot overwrites the previous one.
        filename = ('trained_checkpoint_0.pth')
        torch.save(state, os.path.join(output_dir, filename))
        print(filename)


Called with args:
{'epochs': 24, 'startepoch': 0, 'pretrained': None, 'cfg_file': '../experiments/cfgs/seg_resnet34_8s_embedding_cosine_rgbd_add_tabletop.yml', 'solver': 'adam', 'dataset_name': 'shapenet_scene_train', 'dataset_background_name': 'background_nvidia', 'randomize': False, 'network_name': 'seg_resnet34_8s_embedding', 'cad_name': None, 'pose_name': None}
Using config:
{'ANCHOR_RATIOS': [0.5, 1, 2],
 'ANCHOR_SCALES': [8, 16, 32],
 'BACKGROUND': '',
 'CAD': '',
 'DATA_PATH': '',
 'EPS': 1e-14,
 'EXP_DIR': 'tabletop_object',
 'FEATURE_STRIDE': 16,
 'FLIP_X': False,
 'FLOW_HEIGHT': 512,
 'FLOW_WIDTH': 640,
 'GPU_ID': 0,
 'INPUT': 'RGBD',
 'INTRINSICS': [],
 'MODE': 'TRAIN',
 'NETWORK': 'VGG16',
 'PIXEL_MEANS': array([[[102.9801, 115.9465, 122.7717]]]),
 'POSE': '',
 'RIG': '',
 'RNG_SEED': 3,
 'ROOT_DIR': '/home/thomas/Desktop/UnseenObjectClustering',
 'TEST': {'ALIGN_Z_AXIS': False,
          'BBOX_REG': True,
          'BUILD_CODEBOOK': False,
          'CHECK_SIZE': False,
  



[0/24][0/313], loss 1.7054, loss intra: 0.0697, loss_inter 1.6357, lr 0.000010, time 1.42
[0/24][1/313], loss 1.5634, loss intra: 0.0721, loss_inter 1.4913, lr 0.000010, time 0.52
[0/24][2/313], loss 1.5267, loss intra: 0.0712, loss_inter 1.4555, lr 0.000010, time 0.53
[0/24][3/313], loss 1.5166, loss intra: 0.0794, loss_inter 1.4372, lr 0.000010, time 0.54
[0/24][4/313], loss 1.4093, loss intra: 0.0800, loss_inter 1.3293, lr 0.000010, time 0.53
[0/24][5/313], loss 1.6902, loss intra: 0.1000, loss_inter 1.5902, lr 0.000010, time 0.52
[0/24][6/313], loss 0.8567, loss intra: 0.0884, loss_inter 0.7683, lr 0.000010, time 0.53
[0/24][7/313], loss 0.6959, loss intra: 0.0854, loss_inter 0.6105, lr 0.000010, time 0.54
[0/24][8/313], loss 0.6087, loss intra: 0.0871, loss_inter 0.5216, lr 0.000010, time 0.54
[0/24][9/313], loss 1.2958, loss intra: 0.1187, loss_inter 1.1771, lr 0.000010, time 0.54
[0/24][10/313], loss 1.0485, loss intra: 0.1166, loss_inter 0.9319, lr 0.000010, time 0.54
[0/24][11