add 3d instance bbox

bertjiazheng · Oct 12, 2019 · 06206a7 · 06206a7
1 parent 984279c
commit 06206a7
Show file tree

Hide file tree

Showing 7 changed files with 252 additions and 4 deletions.
diff --git a/README.md b/README.md
@@ -80,6 +80,18 @@ Please refer to the [Supplementary Material](https://drive.google.com/file/d/17F
 <img src="assets/pers_layout/scene_00000_490854_2.png" width="250">
 </p>
 
+### Visualize 3D Bounding Box
+
+```bash
+python visualize_bbox.py --path /path/to/dataset --scene scene_id
+```
+
+<p align="center">
+<img src="assets/bbox/scene_00000_485142_0.png" width="250">
+<img src="assets/bbox/scene_00000_485142_1.png" width="250">
+<img src="assets/bbox/scene_00000_490854_2.png" width="250">
+</p>
+
 ## Citation
 
 Please cite `Structured3D` in your publications if it helps your research:

diff --git a/assets/bbox/scene_00000_485142_0.png b/assets/bbox/scene_00000_485142_0.png
diff --git a/assets/bbox/scene_00000_485142_1.png b/assets/bbox/scene_00000_485142_1.png
diff --git a/assets/bbox/scene_00000_490854_2.png b/assets/bbox/scene_00000_490854_2.png
diff --git a/data_organization.md b/data_organization.md
@@ -19,17 +19,19 @@ scene_<sceneID>
 │               └── <positionID>
 │                   ├── rgb_rawlight.png
 │                   ├── semantic.png
+│                   ├── instance.png
 │                   ├── albedo.png
 │                   ├── depth.png
 │                   ├── normal.png
 │                   ├── layout.json
 │                   └── camera_pose.txt
+├── bbox_3d.json
 └── annotation_3d.json
 ```
 
 # Annotation Format
 
-For each scene, we provide the primitive and relationship based structure annotation:
+We provide the primitive and relationship based structure annotation for each scene, and an oriented bounding box for each object instance.
 
 **Structure annotation (`annotation_3d.json`)**: see all the room types [here](metadata/room_types.txt).
 ```
@@ -82,10 +84,24 @@ For each scene, we provide the primitive and relationship based structure annota
 }
 ```
 
-For each image, we provide semantic, albedo, depth, normal, layout annotation and camera position. Please note that we have different layout and camera annotation format for panoramic and perspective images.
+**Bounding box (`bbox_3d.json`)**: the oriented bounding box annotation in world coordinates, same as [SUN RGB-D](http://rgbd.cs.princeton.edu).
+```
+[
+  {
+    "ID"        : int,              // instance id
+    "basis"     : Matrix[float],    // basis of the bounding box, one row is one basis
+    "coeffs"    : List[float],      // radii in each dimension
+    "centroid"  : List[float],      // 3D centroid of the bounding box
+  }
+]
+```
+
+For each image, we provide semantic, instance, albedo, depth, normal, layout annotation and camera position. Please note that we have different layout and camera annotation format for panoramic and perspective images.
 
 **Semantic annotation (`semantic.png`)**: unsigned 8-bit integers within a PNG. We use [NYUv2](https://cs.nyu.edu/~silberman/datasets/nyu_depth_v2) 40-label set, see all the label ids [here](metadata/labelids.txt).
 
+**Instance annotation for perspective (`instance.png`)**: unsigned 16-bit integers within a PNG. The maximum value (65535) denotes *background*.
+
 **Albedo data (`albedo.png`)**: unsigned 8-bit integers within a PNG.
 
 **Depth data (`depth.png`)**: unsigned 16-bit integers within a PNG. The units are millimeters, a value of 1000 is a meter. A zero value denotes *no reading*.
@@ -102,7 +118,6 @@ x_1 y_floor_1
 ```
 
 **Layout annotation for perspective (`layout.json`)**: We also include the junctions formed by line segments intersecting with each other or with the image boundary. We consider the visible and invisible part caused by the room structure instead of furniture.
-
 ```
 {
   "junctions":[
@@ -131,4 +146,4 @@ x_1 y_floor_1
 ```
 vx vy vz tx ty tz ux uy uz xfov yfov 1
 ```
-where `(vx, vy, vz)` is the eye viewpoint of the camera, `(tx, ty, tz)` is the view direction, `(ux, uy, uz)` is the up direction, and `xfov` and `yfov` are the half-angles of the horizontal and vertical fields of view of the camera in radians (the angle from the central ray to the leftmost/bottommost ray in the field of view), same as [Matterport3D](https://github.com/niessner/Matterport).
+where `(vx, vy, vz)` is the eye viewpoint of the camera, `(tx, ty, tz)` is the view direction, `(ux, uy, uz)` is the up direction, and `xfov` and `yfov` are the half-angles of the horizontal and vertical fields of view of the camera in radians (the angle from the central ray to the leftmost/bottommost ray in the field of view), same as [Matterport3D](https://github.com/niessner/Matterport).
diff --git a/misc/utils.py b/misc/utils.py
@@ -0,0 +1,138 @@
+"""
+Adapted from https://github.com/thusiyuan/cooperative_scene_parsing/blob/master/utils/sunrgbd_utils.py
+"""
+import numpy as np
+
+
+def normalize(vector):
+    return vector / np.linalg.norm(vector)
+
+
+def parse_camera_info(camera_info, height, width):
+    """ extract intrinsic and extrinsic matrix
+    """
+    lookat = normalize(camera_info[3:6])
+    up = normalize(camera_info[6:9])
+
+    W = lookat
+    U = np.cross(W, up)
+    V = -np.cross(W, U)
+
+    rot = np.vstack((U, V, W))
+    trans = camera_info[:3]
+
+    xfov = camera_info[9]
+    yfov = camera_info[10]
+
+    K = np.diag([1, 1, 1])
+
+    K[0, 2] = width / 2
+    K[1, 2] = height / 2
+
+    K[0, 0] = K[0, 2] / np.tan(xfov)
+    K[1, 1] = K[1, 2] / np.tan(yfov)
+
+    return rot, trans, K
+
+
+def flip_towards_viewer(normals, points):
+    """Negate, in place, those of the first two rows of ``normals`` that have
+    a positive dot product with ``points``, and return ``normals``.
+    """
+    # NOTE(review): presumably points is a single 3-vector (the caller in this
+    # file passes a box centroid) - confirm before passing several points
+    points = points / np.linalg.norm(points)
+    # only the first two rows of normals are tested; later rows are untouched
+    proj = points.dot(normals[:2, :].T)
+    flip = np.where(proj > 0)
+    # writes into the caller's array rather than a copy
+    normals[flip, :] = -normals[flip, :]
+    return normals
+
+
+def get_corners_of_bb3d(basis, coeffs, centroid):
+    corners = np.zeros((8, 3))
+    # order the basis
+    index = np.argsort(np.abs(basis[:, 0]))[::-1]
+    # the case that two same value appear the same time
+    if index[2] != 2:
+        index[1:] = index[1:][::-1]
+    basis = basis[index, :]
+    coeffs = coeffs[index]
+    # Now, we know the basis vectors are orders X, Y, Z. Next, flip the basis vectors towards the viewer
+    basis = flip_towards_viewer(basis, centroid)
+    coeffs = np.abs(coeffs)
+    corners[0, :] = -basis[0, :] * coeffs[0] + basis[1, :] * coeffs[1] + basis[2, :] * coeffs[2]
+    corners[1, :] = basis[0, :] * coeffs[0] + basis[1, :] * coeffs[1] + basis[2, :] * coeffs[2]
+    corners[2, :] = basis[0, :] * coeffs[0] + -basis[1, :] * coeffs[1] + basis[2, :] * coeffs[2]
+    corners[3, :] = -basis[0, :] * coeffs[0] + -basis[1, :] * coeffs[1] + basis[2, :] * coeffs[2]
+
+    corners[4, :] = -basis[0, :] * coeffs[0] + basis[1, :] * coeffs[1] + -basis[2, :] * coeffs[2]
+    corners[5, :] = basis[0, :] * coeffs[0] + basis[1, :] * coeffs[1] + -basis[2, :] * coeffs[2]
+    corners[6, :] = basis[0, :] * coeffs[0] + -basis[1, :] * coeffs[1] + -basis[2, :] * coeffs[2]
+    corners[7, :] = -basis[0, :] * coeffs[0] + -basis[1, :] * coeffs[1] + -basis[2, :] * coeffs[2]
+    corners = corners + np.tile(centroid, (8, 1))
+    return corners
+
+
+def get_corners_of_bb3d_no_index(basis, coeffs, centroid):
+    corners = np.zeros((8, 3))
+    coeffs = np.abs(coeffs)
+    corners[0, :] = -basis[0, :] * coeffs[0] + basis[1, :] * coeffs[1] + basis[2, :] * coeffs[2]
+    corners[1, :] = basis[0, :] * coeffs[0] + basis[1, :] * coeffs[1] + basis[2, :] * coeffs[2]
+    corners[2, :] = basis[0, :] * coeffs[0] + -basis[1, :] * coeffs[1] + basis[2, :] * coeffs[2]
+    corners[3, :] = -basis[0, :] * coeffs[0] + -basis[1, :] * coeffs[1] + basis[2, :] * coeffs[2]
+
+    corners[4, :] = -basis[0, :] * coeffs[0] + basis[1, :] * coeffs[1] + -basis[2, :] * coeffs[2]
+    corners[5, :] = basis[0, :] * coeffs[0] + basis[1, :] * coeffs[1] + -basis[2, :] * coeffs[2]
+    corners[6, :] = basis[0, :] * coeffs[0] + -basis[1, :] * coeffs[1] + -basis[2, :] * coeffs[2]
+    corners[7, :] = -basis[0, :] * coeffs[0] + -basis[1, :] * coeffs[1] + -basis[2, :] * coeffs[2]
+
+    corners = corners + np.tile(centroid, (8, 1))
+    return corners
+
+
+def project_3d_points_to_2d(points3d, R_ex, K):
+    """
+    Project 3d points from camera-centered coordinate to 2D image plane
+    Parameters
+    ----------
+    points3d: numpy array
+        3d location of point
+    R_ex: numpy array
+        extrinsic camera parameter
+    K: numpy array
+        intrinsic camera parameter
+    Returns
+    -------
+    points2d: numpy array
+        2d location of the point
+    """
+    points3d = R_ex.dot(points3d.T).T
+    x3 = points3d[:, 0]
+    y3 = -points3d[:, 1]
+    z3 = np.abs(points3d[:, 2])
+    xx = x3 * K[0, 0] / z3 + K[0, 2]
+    yy = y3 * K[1, 1] / z3 + K[1, 2]
+    points2d = np.vstack((xx, yy))
+    return points2d
+
+
+def project_struct_bdb_to_2d(basis, coeffs, center, R_ex, K):
+    """
+    Project 3d bounding box to 2d bounding box
+    Parameters
+    ----------
+    basis, coeffs, center, R_ex, K
+        : K is the intrinsic camera parameter matrix
+        : Rtilt is the extrinsic camera parameter matrix in right hand coordinates
+    Returns
+    -------
+    bdb2d: dict
+        Keys: {'x1', 'x2', 'y1', 'y2'}
+        The (x1, y1) position is at the top left corner,
+        the (x2, y2) position is at the bottom right corner
+    """
+    corners3d = get_corners_of_bb3d(basis, coeffs, center)
+    corners = project_3d_points_to_2d(corners3d, R_ex, K)
+    bdb2d = dict()
+    bdb2d['x1'] = int(max(np.min(corners[0, :]), 1))  # x1
+    bdb2d['y1'] = int(max(np.min(corners[1, :]), 1))  # y1
+    bdb2d['x2'] = int(min(np.max(corners[0, :]), 2*K[0, 2]))  # x2
+    bdb2d['y2'] = int(min(np.max(corners[1, :]), 2*K[1, 2]))  # y2
+    # if not check_bdb(bdb2d, 2*K[0, 2], 2*K[1, 2]):
+    #     bdb2d = None
+    return bdb2d
diff --git a/visualize_bbox.py b/visualize_bbox.py
@@ -0,0 +1,83 @@
+import os
+import json
+import argparse
+
+import numpy as np
+import skimage.io as io
+import matplotlib.pyplot as plt
+
+from misc.utils import get_corners_of_bb3d_no_index, project_3d_points_to_2d, parse_camera_info
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description="Structured3D 3D Bounding Box Visualization")
+    parser.add_argument("--path", required=True,
+                        help="dataset path", metavar="DIR")
+    parser.add_argument("--scene", required=True,
+                        help="scene id", type=int)
+    return parser.parse_args()
+
+
+def visualize_bbox(annos, args):
+    id2index = dict()
+    for index, object in enumerate(annos):
+        id2index[object.get('ID')] = index
+
+    scene_path = os.path.join(args.path, "scene_%05d" % (args.scene, ), "2D_rendering")
+
+    for room_id in np.sort(os.listdir(scene_path)):
+        room_path = os.path.join(scene_path, room_id, "perspective", "full")
+
+        for position_id in np.sort(os.listdir(room_path)):
+            position_path = os.path.join(room_path, position_id)
+
+            image = io.imread(os.path.join(position_path, 'rgb_rawlight.png'))
+            height, width, _ = image.shape
+
+            instance = io.imread(os.path.join(position_path, 'instance.png'))
+
+            camera_info = np.loadtxt(os.path.join(position_path, 'camera_pose.txt'))
+
+            rot, trans, K = parse_camera_info(camera_info, height, width)
+
+            plt.figure()
+            plt.imshow(image)
+
+            for index in np.unique(instance)[:-1]:
+                # for each instance in current image
+                bbox = annos[id2index[index]]
+
+                basis = np.array(bbox['basis'])
+                coeffs = np.array(bbox['coeffs'])
+                centroid = np.array(bbox['centroid'])
+
+                corners = get_corners_of_bb3d_no_index(basis, coeffs, centroid)
+                corners = corners - trans
+
+                gt2dcorners = project_3d_points_to_2d(corners, rot, K)
+
+                num_corner = gt2dcorners.shape[1] // 2
+                plt.plot(np.hstack((gt2dcorners[0, :num_corner], gt2dcorners[0, 0])),
+                         np.hstack((gt2dcorners[1, :num_corner], gt2dcorners[1, 0])), 'r')
+                plt.plot(np.hstack((gt2dcorners[0, num_corner:], gt2dcorners[0, num_corner])),
+                         np.hstack((gt2dcorners[1, num_corner:], gt2dcorners[1, num_corner])), 'b')
+                for i in range(num_corner):
+                    plt.plot(gt2dcorners[0, [i, i + num_corner]], gt2dcorners[1, [i, i + num_corner]], 'y')
+
+            plt.axis('off')
+            plt.axis([0, width, height, 0])
+            plt.show()
+
+
+def main():
+    args = parse_args()
+
+    with open(os.path.join(args.path, "scene_%05d" % (args.scene, ), 'bbox_3d.json')) as file:
+        annos = json.load(file)
+
+    visualize_bbox(annos, args)
+
+
+if __name__ == "__main__":
+    main()