# Dataset Loading

In [None]:
# Mount drive
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
# Prepare dataset
!cp -r /content/drive/MyDrive/dataset /content/
# !unzip dataset/models.zip -d ./
!unzip dataset/testing_data_pose.zip -d ./
!unzip dataset/training_data.zip -d ./



[1;30;43m流式输出内容被截断，只能显示最后 5000 行内容。[0m
  inflating: ./training_data_filtered/training_data/v2.2/1-6-0_meta.pkl  
  inflating: ./training_data_filtered/training_data/v2.2/2-168-12_label_kinect.png  
  inflating: ./training_data_filtered/training_data/v2.2/2-29-2_color_kinect.png  
  inflating: ./training_data_filtered/training_data/v2.2/1-84-0_meta.pkl  
  inflating: ./training_data_filtered/training_data/v2.2/2-213-38_meta.pkl  
  inflating: ./training_data_filtered/training_data/v2.2/2-209-30_color_kinect.png  
  inflating: ./training_data_filtered/training_data/v2.2/2-45-9_meta.pkl  
  inflating: ./training_data_filtered/training_data/v2.2/2-35-17_color_kinect.png  
  inflating: ./training_data_filtered/training_data/v2.2/1-47-1_label_kinect.png  
  inflating: ./training_data_filtered/training_data/v2.2/1-54-6_depth_kinect.png  
  inflating: ./training_data_filtered/training_data/v2.2/2-28-34_depth_kinect.png  
  inflating: ./training_data_filtered/training_data/v2.2/1-53-1_meta.pk

In [None]:
# Train data loading helpers
import os

training_data_root = "./training_data_filtered/training_data"
training_data_dir = "./training_data_filtered/training_data/v2.2"
split_dir = "./training_data_filtered/training_data/splits/v2"

def get_train_split_files(split_name):
    with open(os.path.join(split_dir, f"{split_name}.txt"), 'r') as f:
        prefixIds = [line.strip() for line in f if line.strip()]
        prefix = [os.path.join(training_data_dir, line) for line in prefixIds]
        # prefix = [os.path.join(training_data_dir, line.strip()) for line in f if line.strip()]
        rgb = [p + "_color_kinect.png" for p in prefix]
        depth = [p + "_depth_kinect.png" for p in prefix]
        label = [p + "_label_kinect.png" for p in prefix]
        meta = [p + "_meta.pkl" for p in prefix]
    return rgb, depth, label, meta, prefixIds

val_rgb_files, val_depth_files, val_label_files, val_meta_files, val_prefix_ids = get_train_split_files('val')
train_rgb_files, train_depth_files, train_label_files, train_meta_files, train_prefix_ids = get_train_split_files('train')

In [None]:
# File reading helpers
import numpy as np
from PIL import Image
import pickle

def load_pickle(filename):
  with open(filename, 'rb') as f:
    return pickle.load(f)

def get_meta(meta_path):
  return load_pickle(meta_path)

def get_depth(depth_path):
  return (np.array(Image.open(depth_path))/1000)

def get_label(label_path):
  return np.array(Image.open(label_path))

def get_point_cloud(depth, intrinsic):
  z = depth
  v, u = np.indices(z.shape)
  uv1 = np.stack([u + 0.5, v + 0.5, np.ones_like(z)], axis=-1)
  points_viewer = uv1 @ np.linalg.inv(intrinsic).T * z[..., None]  # [H, W, 3]
  return points_viewer

def get_object_point_cloud(image_label, object_id, depth_map, intrinsic):
  # print(np.where(image_label==object_id))
  # print(image_label[327][654])
  image_label[np.where(image_label==object_id)] = 255
  # print(image_label[327][654])
  # print(np.where(image_label==255))
  image_label[np.where(image_label!=255)] = 0
  image_label[np.where(image_label==255)] = 1
  object_depth = depth_map * image_label
  pcd_target = get_point_cloud(object_depth, intrinsic)
  # (H, W, dim) = pcd_target.shape
  # filter out target object point cloud
  # print(pcd_target.shape)
  # print(pcd_target)
  # print((pcd_target[:,0]!=0)|(pcd_target[:,1]!=0)|(pcd_target[:,2]!=0))
  pcd_target = pcd_target.reshape(-1, pcd_target.shape[-1]) # reshape to (H*W, 3)
  pcd_target = pcd_target[(pcd_target[:,0]!=0)|(pcd_target[:,1]!=0)|(pcd_target[:,2]!=0)]
  return pcd_target
# test_pcd_target = get_object_point_cloud(image_label, object_id, depth_map, intrinsic)


In [None]:
# Rephase training objects point clouds and poses into {scene}_{object_id}.npz
import numpy as np
import time
from tqdm import tqdm

roundDown = 0

def save_object_info_array(train_rgb_files, train_depth_files, train_label_files, train_meta_files, train_prefix_ids, save_dir="/content/training_data_filtered/training_data/train_npy"):
  stt = time.time()
  infoPaths = []
  # roundUp = 400
  roundUp = len(train_rgb_files)
  for i in tqdm(range(roundDown, roundUp)):
    meta_dict = get_meta(train_meta_files[i])
    object_dict = {}

    # Fetch scene id
    prefix_id = train_prefix_ids[i]
    if prefix_id.startswith('1'):
      object_dict["scene"] = 0
    else:
      object_dict["scene"] = 1

    label_map = get_label(train_label_files[i])
    depth_map = get_depth(train_depth_files[i])
    intrinsic = meta_dict['intrinsic']
    object_ids = meta_dict['object_ids']
    for j in range(len(object_ids)):
      # Fetch object id and pose
      object_id = object_ids[j]
      object_dict["id"] = object_id
      object_dict["pose"] = meta_dict["poses_world"][object_id]
      # Fetch point cloud and colors (TODO)
      object_pcd = get_object_point_cloud(label_map.copy(), object_id, depth_map, intrinsic)
      object_dict["points"] = object_pcd
      save_path = os.path.join(save_dir, '{}-{:02d}.npy'.format(prefix_id, object_id))
      np.save(save_path, object_dict)
      infoPaths.append(save_path)
  end = time.time()
  print("Object info array saving finished! Taking {}".format(end-stt))
  return infoPaths


# Training of PointNetDense

In [None]:
# A network proposal tried: combining translation and rotation outputs (saved)
# But its performance is worse than PointNetDense. So excluded.
from __future__ import print_function
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.utils.data
from torch.autograd import Variable
import numpy as np
import torch.nn.functional as F

def rot_6d_9d_cvr(x, y):
    # Gram-schmidt process to get the rotation matrix
    x = F.normalize(x, dim=-1)
    y = y - x * (x * y).sum(-1, keepdims=True)
    y = F.normalize(y, dim=-1)
    z = torch.cross(x, y, -1)
    return torch.stack([x, y, z], dim=-1)

class PointNetCombined(nn.Module):
    def __init__(self, global_feature_size=81, n_points=256): # global feature = object id (size=79) + scene id (size=2)
        super(PointNetCombined, self).__init__()
        self.conv1 = torch.nn.Conv1d(3, 64, 1) # local feature extraction layers
        self.conv2 = torch.nn.Conv1d(64, 128, 1)
        self.conv3 = torch.nn.Conv1d(128, 256, 1)
        self.conv4 = torch.nn.Conv1d(256, 512, 1)
        self.conv5 = torch.nn.Conv1d(512, 1024, 1)
        # self.conv6 = torch.nn.Conv1d(1024, 2048, 1)
        self.bnConv1 = nn.BatchNorm1d(64)
        self.bnConv2 = nn.BatchNorm1d(128)
        self.bnConv3 = nn.BatchNorm1d(256)
        self.bnConv4 = nn.BatchNorm1d(512)
        self.bnConv5 = nn.BatchNorm1d(1024)
        # self.bnConv6 = nn.BatchNorm1d(2048)

        self.branch1 = torch.nn.Linear(global_feature_size, 256) # global feature extraction layers
        self.bnBranch1 = nn.BatchNorm1d(256)
        self.branch2 = torch.nn.Linear(256, 1024) # global feature extraction layers
        self.bnBranch2 = nn.BatchNorm1d(1024)
        # self.branch3 = torch.nn.Linear(512, 1024) # global feature extraction layers
        # self.bnBranch3 = nn.BatchNorm1d(1024)
        # self.branch4 = torch.nn.Linear(1024, 2048) # global feature extraction layers
        # self.bnBranch4 = nn.BatchNorm1d(2048)


        # self.dense1_r = torch.nn.Linear(4096, 2048) # inference layers
        # self.dense1_t = torch.nn.Linear(4096, 2048)
        # self.bnDense1_r = nn.BatchNorm1d(2048)
        # self.bnDense1_t = nn.BatchNorm1d(2048)
        self.dense2 = torch.nn.Linear(1024+1024, 1024) # inference layers
        self.bnDense2 = nn.BatchNorm1d(1024)
        self.dense3 = torch.nn.Linear(1024, 512)
        self.bnDense3 = nn.BatchNorm1d(512)
        self.dense4 = torch.nn.Linear(512, 256)
        self.bnDense4 = nn.BatchNorm1d(256)
        self.dense5 = torch.nn.Linear(256, 128)
        self.bnDense5 = nn.BatchNorm1d(128)
        self.dense6 = torch.nn.Linear(128, 9) # output layer for rotation

        for net in [
            self.conv1,
            self.conv2,
            self.conv3,
            self.conv4,
            self.conv5,
            # self.conv6,
            self.branch1,
            self.branch2,
            # self.branch3,
            # self.branch4,
            # self.dense1_r,
            # self.dense1_t,
            self.dense2,
            self.dense3,
            self.dense4,
            self.dense5,
            self.dense6
        ]:
            torch.nn.init.xavier_uniform_(net.weight)

    def forward(self, x, label, scene):
        points = x[:, :3]  # batch_size, 3, n_points
        # colors = x[:, 3:]

        # normalize to [-1,1] centered at (0,0,0)
        mins = points.min(dim=2, keepdim=True).values
        maxs = points.max(dim=2, keepdim=True).values
        center = (mins + maxs) / 2
        half_extents = (maxs - mins) / 2
        longest = half_extents.max(dim=1, keepdim=True).values.clamp(
            min=1e-3
        )
        points = (points - center) / longest

        # pcd feature extractions
        x = points
        # print("loca feature")
        # print(x)
        # x = torch.cat([points, colors], dim=1)
        x = F.relu(self.bnConv1(self.conv1(x)))
        x = F.relu(self.bnConv2(self.conv2(x)))
        x = F.relu(self.bnConv3(self.conv3(x)))
        x = F.relu(self.bnConv4(self.conv4(x)))
        x = F.relu(self.bnConv5(self.conv5(x)))
        # x = F.relu(self.bnConv6(self.conv6(x)))
        x = torch.max(x, 2, keepdim=True)[0]
        x = x.view(-1, 1024)

        # global feature extractions
        globalFeature = torch.cat((label, scene), dim=1)
        globalFeature = self.bnBranch1(self.branch1(globalFeature))
        globalFeature = self.bnBranch2(self.branch2(globalFeature))
        # globalFeature = self.bnBranch3(self.branch3(globalFeature))
        # globalFeature = self.bnBranch4(self.branch4(globalFeature))
        # print("global feature")
        # print(globalFeature)

        # concatenate local and global features
        x = torch.cat((x, globalFeature), dim=1)

        # infer rotation and translation jointly
        # x = F.relu(self.bnDense1_r(self.dense1_r(x)))
        x = F.relu(self.bnDense2(self.dense2(x)))
        x = F.relu(self.bnDense3(self.dense3(x)))
        x = F.relu(self.bnDense4(self.dense4(x)))
        x = F.relu(self.bnDense5(self.dense5(x)))
        x = self.dense6(x)
        x_rot, x_tran = rot_6d_9d_cvr(x[..., 0:3], x[..., 3:6]), x[..., 6:9] # fetch outputs
        x_tran = x_tran * longest.view(-1, 1) + center.view_as(x_tran) # scale back and un-center (batch_size, 3), to get translation vector
        return x_tran, x_rot


In [None]:
# Load object meta-data
import pandas as pd
object_models_file = "/content/training_data_filtered/training_data/objects_v1.csv"
object_models_info = pd.read_csv(object_models_file)
# print(object_models_info.loc[35])

In [None]:
"""Metric and visualization."""

def compute_rre(R_est: np.ndarray, R_gt: np.ndarray):
    """Compute the relative rotation error (geodesic distance of rotation)."""
    # assert R_est.shape == (3, 3), 'R_est: expected shape (3, 3), received shape {}.'.format(R_est.shape)
    # assert R_gt.shape == (3, 3), 'R_gt: expected shape (3, 3), received shape {}.'.format(R_gt.shape)
    # relative rotation error (RRE)
    # Rotational degree loss (not objective of optimization)
    rre = np.arccos(np.clip(0.5 * (np.trace(R_est.T @ R_gt) - 1), -1.0, 1.0))
    return rre


def compute_rte(t_est: np.ndarray, t_gt: np.ndarray):
    # assert t_est.shape == (3,), 't_est: expected shape (3,), received shape {}.'.format(t_est.shape)
    # assert t_gt.shape == (3,), 't_gt: expected shape (3,), received shape {}.'.format(t_gt.shape)
    # relative translation error (RTE)
    rte = np.linalg.norm(t_est - t_gt) # Resembling MSE loss
    return rte




In [None]:
# PointNetDense network. This is the network used now. (saved)
from torch.autograd import Variable
import torch

def rot_6d_9d_cvr(x, y):
    # Gram-schmidt process to get the rotation matrix
    x = F.normalize(x, dim=-1)
    y = y - x * (x * y).sum(-1, keepdims=True)
    y = F.normalize(y, dim=-1)
    z = torch.cross(x, y, -1)
    return torch.stack([x, y, z], dim=-1)

class STNkd(nn.Module):
    def __init__(self, k_input=32, k_output=32*32, n_points=256, spread_points=False):
        self.n_points = n_points
        self.spread_points = spread_points
        self.k_input = k_input
        self.k_output = k_output

        super(STNkd, self).__init__()
        self.conv1 = torch.nn.Conv1d(k_input, 64, 1) # feature extractors
        self.conv2 = torch.nn.Conv1d(64, 256, 1)
        self.conv3 = torch.nn.Conv1d(256, 512, 1)
        self.bnConv1 = nn.BatchNorm1d(64)
        self.bnConv2 = nn.BatchNorm1d(256)
        self.bnConv3 = nn.BatchNorm1d(512)
        self.relu = nn.ReLU()

        self.fc2 = nn.Linear(512, 256) # full connection layers
        self.fc3 = nn.Linear(256, 128)
        if self.spread_points:
          self.fc4 = nn.Linear(128, k_output*n_points)
        else:
          self.fc4 = nn.Linear(128, k_output)
        self.bnDense2 = nn.BatchNorm1d(256)
        self.bnDense3 = nn.BatchNorm1d(128)

        # weight initializations
        for net in [
            self.conv1,
            self.conv2,
            self.conv3,
            # self.conv4,
            # self.fc1,
            self.fc2,
            self.fc3,
            self.fc4
        ]:
            torch.nn.init.xavier_uniform_(net.weight)

    def forward(self, x):
        batchsize = x.size()[0]
        x = F.relu(self.bnConv1(self.conv1(x)))
        x = F.relu(self.bnConv2(self.conv2(x)))
        x = F.relu(self.bnConv3(self.conv3(x)))
        # x = F.relu(self.bnConv4(self.conv4(x)))
        x = torch.max(x, 2, keepdim=True)[0]
        x = x.view(-1, 512)

        # x = F.relu(self.bnDense1(self.fc1(x)))
        x = F.relu(self.bnDense2(self.fc2(x)))
        x = F.relu(self.bnDense3(self.fc3(x)))
        x = self.fc4(x)

        if self.spread_points:
          x = x.view(-1, self.k_output, self.n_points)
        return x

class PointNet(nn.Module):
    def __init__(self, global_feature_size=81, n_points=256): # global feature = object id (size=79) + scene id (size=2)
        super(PointNet, self).__init__()
        self.n_points = n_points
        self.global_feature_size = global_feature_size
        self.stn3d_input_feat = STNkd(k_input=3, k_output=3*3, n_points=self.n_points, spread_points=True)
        # self.conv0 = torch.nn.Conv1d(3, 3*3, 1) # local feature extraction layers
        self.conv1 = torch.nn.Conv1d(3*3, 64, 1) # local feature extraction layers
        self.conv2 = torch.nn.Conv1d(64, 128, 1)
        self.conv3 = torch.nn.Conv1d(128, 256, 1)
        self.conv4 = torch.nn.Conv1d(256, 512, 1)
        self.conv5 = torch.nn.Conv1d(512, 1024, 1)
        self.bnConv1 = nn.BatchNorm1d(64)
        self.bnConv2 = nn.BatchNorm1d(128)
        self.bnConv3 = nn.BatchNorm1d(256)
        self.bnConv4 = nn.BatchNorm1d(512)
        self.bnConv5 = nn.BatchNorm1d(1024)

        self.branch1 = torch.nn.Linear(global_feature_size, 256) # global feature extraction layers
        self.bnBranch1 = nn.BatchNorm1d(256)
        self.branch2 = torch.nn.Linear(256, 1024) # global feature extraction layers
        self.bnBranch2 = nn.BatchNorm1d(1024)

        self.dense2_r = torch.nn.Linear(2048, 1024) # inference layers
        self.dense2_t = torch.nn.Linear(2048, 1024) # rotation and translation inference are separated
        self.bnDense2_r = nn.BatchNorm1d(1024)
        self.bnDense2_t = nn.BatchNorm1d(1024)
        self.dense3_r = torch.nn.Linear(1024, 512)
        self.dense3_t = torch.nn.Linear(1024, 512)
        self.bnDense3_r = nn.BatchNorm1d(512)
        self.bnDense3_t = nn.BatchNorm1d(512)
        self.dense4_r = torch.nn.Linear(512, 256)
        self.dense4_t = torch.nn.Linear(512, 256)
        self.bnDense4_r = nn.BatchNorm1d(256)
        self.bnDense4_t = nn.BatchNorm1d(256)
        self.dense5_r = torch.nn.Linear(256, 128)
        self.dense5_t = torch.nn.Linear(256, 128)
        self.bnDense5_r = nn.BatchNorm1d(128)
        self.bnDense5_t = nn.BatchNorm1d(128)
        self.dense6_r = torch.nn.Linear(128, 6) # output layer for rotation
        self.dense6_t = torch.nn.Linear(128, 3) # output layer for translation

        # weight initializations
        for net in [
            # self.conv0,
            self.conv1,
            self.conv2,
            self.conv3,
            self.conv4,
            self.conv5,
            self.branch1,
            self.branch2,
            self.dense2_r,
            self.dense2_t,
            self.dense3_r,
            self.dense3_t,
            self.dense4_r,
            self.dense4_t,
            self.dense5_r,
            self.dense5_t,
            self.dense6_r,
            self.dense6_t
        ]:
            torch.nn.init.xavier_uniform_(net.weight)

    def forward(self, x, label, scene):
        points = x[:, :3]  # batch_size, 3, n_points
        # colors = x[:, 3:]

        # normalize to [-1,1] centered at (0,0,0)
        mins = points.min(dim=2, keepdim=True).values
        maxs = points.max(dim=2, keepdim=True).values
        center = (mins + maxs) / 2
        half_extents = (maxs - mins) / 2
        longest = half_extents.max(dim=1, keepdim=True).values.clamp(
            min=1e-3
        )
        points = (points - center) / longest

        # pcd feature extractions
        x = points

        # initial transform with spatial transformer
        x = self.stn3d_input_feat(x)
        x = F.relu(self.bnConv1(self.conv1(x)))
        x = F.relu(self.bnConv2(self.conv2(x)))
        x = F.relu(self.bnConv3(self.conv3(x)))
        x = F.relu(self.bnConv4(self.conv4(x)))
        x = F.relu(self.bnConv5(self.conv5(x)))
        # x = F.relu(self.bnConv6(self.conv6(x)))
        x = torch.max(x, 2, keepdim=True)[0]
        x = x.view(-1, 1024)

        # global feature extractions
        globalFeature = torch.cat((label, scene), dim=1)
        globalFeature = self.bnBranch1(self.branch1(globalFeature))
        globalFeature = self.bnBranch2(self.branch2(globalFeature))

        # concatenate local and global features, and transform using a transformer
        x = torch.cat((x, globalFeature), dim=1)
        # print("shape after cat {}".format(x.shape))
        x_rot = x
        x_tran = x_rot.clone()

        # infer rotation
        x_rot = F.relu(self.bnDense2_r(self.dense2_r(x_rot)))
        x_rot = F.relu(self.bnDense3_r(self.dense3_r(x_rot)))
        x_rot = F.relu(self.bnDense4_r(self.dense4_r(x_rot)))
        x_rot = F.relu(self.bnDense5_r(self.dense5_r(x_rot)))
        x_rot = self.dense6_r(x_rot)
        x_rot = rot_6d_9d_cvr(x_rot[..., 0:3], x_rot[..., 3:6]) # fetch output rotation matrix

        # infer translation
        x_tran = F.relu(self.bnDense2_t(self.dense2_t(x_tran)))
        x_tran = F.relu(self.bnDense3_t(self.dense3_t(x_tran)))
        x_tran = F.relu(self.bnDense4_t(self.dense4_t(x_tran)))
        x_tran = F.relu(self.bnDense5_t(self.dense5_t(x_tran)))
        x_tran = self.dense6_t(x_tran)
        x_tran = x_tran * longest.view(-1, 1) + center.view_as(x_tran) # scale back and un-center (batch_size, 3), to get translation vector
        return x_tran, x_rot


In [None]:
# Data loader definition

class MyDataset(Dataset):
    def __init__(self, files, root, n_points=256, is_test=False) -> None:
        super().__init__()
        self.files = files
        self.root = root
        self.n_points = n_points
        self.is_test = is_test
        np.random.shuffle(self.files)

    def __len__(self):
        return len(self.files)

    def __getitem__(self, index):
        result = {}
        npz = np.load(os.path.join(self.root, self.files[index]), allow_pickle=True)
        npz = npz.ravel()[0]
        # print(npz)
        # print(type(npz))
        object_id = npz["id"]
        scene = npz["scene"]
        points = npz["points"]
        # colors = npz["colors"]

        if len(points) == 0:
            points = np.zeros((self.n_points, 3))
            # colors = np.zeros((self.n_points, 3))
        if len(points) < self.n_points:
            idx = np.random.choice(len(points), self.n_points, replace=True)
            points = points[idx]
            # colors = colors[idx]
        else:
            idx = np.random.choice(len(points), self.n_points, replace=False)
            points = points[idx]
            # colors = colors[idx]
        result["object_id"] = torch.tensor(object_id, dtype=torch.long)
        result["scene"] = torch.tensor(scene, dtype=torch.long)
        result["points"] = torch.tensor(points.T, dtype=torch.float32)
        # result["colors"] = torch.tensor(colors.T, dtype=torch.float32)
        if not self.is_test:
          pose = npz["pose"]
          result["pose"] = torch.tensor(pose, dtype=torch.float32)
        else:
          result["prefix_id"] = npz["prefix_id"]
        return result

In [None]:
!mkdir model_save_4
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.utils.data
import numpy as np
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
from pathlib import Path
from tqdm import tqdm
import os
import matplotlib.pyplot as plt
import math
import shutil

import gc
gc.collect() #

def rotation_matrix_gen(alpha=0.0, beta=0.0, gamma=0.0):
    """Eular angle to rotation matrix conversions"""
    alpha = torch.tensor(alpha)
    beta = torch.tensor(beta)
    gamma = torch.tensor(gamma)
    rotation_matrix =  \
            torch.tensor([beta.cos()*gamma.cos(), alpha.sin()*beta.sin()*gamma.cos()-alpha.cos()*gamma.sin(), alpha.cos()*beta.sin()*gamma.cos()+alpha.sin()*gamma.sin(),
                          beta.cos()*gamma.sin(), alpha.sin()*beta.sin()*gamma.sin()+alpha.cos()*gamma.cos(), alpha.cos()*beta.sin()*gamma.sin()-alpha.sin()*gamma.cos(),
                          -beta.sin(), alpha.sin()*beta.cos(), alpha.cos()*beta.cos()]).to(device)
    rotation_matrix = torch.reshape(rotation_matrix, (3, 3))
    return rotation_matrix  

def get_sym_poses(pose, object_id, inf_num = 20):
    """Get symmetric poses with the geometric symmetry information"""
    symmetry_pose = torch.Tensor(pose.shape[0],3,3,21).to(device)
    for i in range(object_id.shape[0]):
        ob_id = object_id[i].item()
        num = 0
        symmetry_pose[i,:,:,num] = pose[i]
        if object_models_info.loc[ob_id]['geometric_symmetry'] == "no":
            num = num + 1
        else:
            symm_tokens = object_models_info.loc[ob_id]['geometric_symmetry'].split("|")
            # parse symmetric tokens and fill out symmetric poses first
            for symm_property in symm_tokens:
                axis, order = symm_property[0], symm_property[1:]
                order_num = 0
                # compute symmetry order along a given axis
                if(order == "inf"):
                    order_num = inf_num
                else:
                    order_num = int(order)
                # compute rotation matrices after rotating along a given axis
                if(axis == "x"):
                    for j in range(1, order_num):
                        symmetry_pose[i,:,:,num]=(rotation_matrix_gen(alpha=
                            float(j)*float(2*math.pi/order_num))@ pose[i])
                elif(axis == "y"):
                    for j in range(1, order_num):
                        symmetry_pose[i,:,:,num]=(rotation_matrix_gen(beta=
                            float(j)*float(2*math.pi/order_num))@ pose[i])
                elif(axis == "z"):
                    for j in range(1, order_num):
                        symmetry_pose[i,:,:,num] =(rotation_matrix_gen(gamma=
                            float(j)*float(2*math.pi/order_num))@ pose[i])
                num = num + 1
        while num < 21:
            # fill out remaining non-symmetric poses
            symmetry_pose[i,:,:,num]=pose[i]
            num = num + 1
    return symmetry_pose

def getRotationGeoDist(r, rgt):
    """
    Get geodesic distance between two rotational matrices
    output -> (num_of_rotations, num_of_gts)
    """
    r = r.squeeze(-1)
    GeoDist = torch.zeros((r.shape[0], rgt.shape[-1]))
    for i in range(rgt.shape[-1]):
      # print(rgt[..., i].shape)
      # print(torch.permute(r, (0,2,1)).shape)
      # print((rgt[..., i] @ torch.permute(r, (0,2,1))).shape)
      GeoDistGt = torch.arccos(1/2 * (torch.vmap(torch.trace)(rgt[..., i] @ torch.permute(r, (0,2,1))) - 1))
      GeoDist[:, i] = GeoDistGt
    return GeoDist

def get_sym_loss_rot(pred_rot, gt_rot):
    """
    Compute symmetric loss between symmetric gts and the predictions:
    Using the Min-of-N loss.
    """
    gt_rot = get_sym_poses(gt_rot, object_id, inf_num=20) # find all symmetric poses for all gts of the rotations
    # loss_all_symmetry = getRotationGeoDist(pred_rot.unsqueeze(dim=-1), gt_rot) # Get each rotation's geodesic distances to all gts respectively
    # loss_rot = (loss_all_symmetry).min(dim=1)[0].mean() # Get minimal geodesic distances for all rotations, and sum along all data
    loss_all_symmetry = (pred_rot.unsqueeze(dim=-1) - gt_rot).abs().sum(dim=(1,2))
    loss_rot = (loss_all_symmetry).min(dim=1)[0].mean()
    return loss_rot

def compute_rre(R_est: np.ndarray, R_gt: np.ndarray):
    # assert R_est.shape == (3, 3), 'R_est: expected shape (3, 3), received shape {}.'.format(R_est.shape)
    # assert R_gt.shape == (3, 3), 'R_gt: expected shape (3, 3), received shape {}.'.format(R_gt.shape)
    # relative rotation error (RRE)
    rre = 0
    batch_size = R_gt.shape[0]
    for i in range(batch_size):
      R_est_i, R_gt_i = R_est[i, ...], R_gt[i, ...]
      rre += np.arccos(np.clip(0.5 * (np.trace(R_est_i.T @ R_gt_i) - 1), -1.0, 1.0))
    return rre/batch_size


def compute_rte(t_est: np.ndarray, t_gt: np.ndarray):
    # assert t_est.shape == (3,), 't_est: expected shape (3,), received shape {}.'.format(t_est.shape)
    # assert t_gt.shape == (3,), 't_gt: expected shape (3,), received shape {}.'.format(t_gt.shape)
    # relative translation error (RTE)
    rte = 0
    batch_size = t_gt.shape[0]
    for i in range(batch_size):
      t_est_i, t_gt_i = t_est[i, ...], t_gt[i, ...]
      rte += np.linalg.norm(t_est_i - t_gt_i) # Resembling MSE loss
    return rte/batch_size


batch_size = 512
num_workers = 4
epochs = 35
init_lr = 1e-3
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_files_path = "/content/training_data_filtered/training_data/train_npy"
val_files_path = "/content/training_data_filtered/training_data/val_npy"
train_data = MyDataset(os.listdir(train_files_path), train_files_path)  # generated by make_data.py
valid_data = MyDataset(os.listdir(val_files_path), val_files_path)

loader_train = DataLoader(train_data, batch_size=batch_size, num_workers=num_workers)
loader_val = DataLoader(valid_data, batch_size=batch_size, num_workers=num_workers)

train_epochs_loss_rot = []
train_epochs_loss_trans = []
val_epochs_loss_rot = []
val_epochs_loss_trans = []
val_total_loss = []

model_save_dir = "model_save_4"
model = PointNet().to(device)
optim = torch.optim.Adam(model.parameters(), lr=init_lr)
cty = False
stt_epoch = 0
if cty:
  stt_epoch = 1
  model_path = os.path.join(model_save_dir, "model_0.pth")
  model = torch.load(model_path)


# start training PointNetDense
for epoch in range(stt_epoch, epochs):
    with tqdm(total=len(loader_train)) as t:
        gc.collect()
        t.set_description("Epoch %i"%epoch)
        
        # training
        model.train()
        train_loss_trans = []
        train_loss_rot = []
        train_loss_rre = []
        train_loss_rte = []
        for iteration, data in tqdm(enumerate(loader_train)):
            object_id = data["object_id"].to(device)
            points = data["points"].to(device)
            pose = data["pose"].to(device)
            scene = data["scene"].to(device)

            pred_trans, pred_rot = model(
                torch.cat([points], dim=1), F.one_hot(object_id, 79).float(), F.one_hot(scene, 2).float()
            )

            gt_trans = pose[:, :3, 3]
            gt_rot = pose[:, :3, :3]

            loss_trans = F.l1_loss(pred_trans, gt_trans) # shape-agnostic loss for translations
            loss_rot = get_sym_loss_rot(pred_rot, gt_rot) # shape-agnostic loss with symmetry for rotations
            train_loss_trans.append(loss_trans.item())
            train_loss_rot.append(loss_rot.item())
            loss = loss_trans + loss_rot

            optim.zero_grad()
            loss.backward()
            optim.step()
            t.set_postfix(loss_trans=np.mean(train_loss_trans), loss_rot=np.mean(train_loss_rot))
            t.update(1)

        train_epochs_loss_rot.append(np.mean(train_loss_rot))
        train_epochs_loss_trans.append(np.mean(train_loss_trans))

        # validations
        model.eval()
        val_loss_rot = []
        val_loss_trans = []
        for data in loader_val:
            object_id = data["object_id"].to(device)
            points = data["points"].to(device)
            # colors = data["colors"].to(device)
            pose = data["pose"].to(device)
            scene = data["scene"].to(device)

            pred_trans, pred_rot = model( # Feed local points and global object and scene ids as input features
                torch.cat([points], dim=1), F.one_hot(object_id, 79).float(), F.one_hot(scene, 2).float()
                # torch.cat([points, colors], dim=1), F.one_hot(object_id, 79).float()
            )
            gt_trans = pose[:, :3, 3]
            gt_rot = pose[:, :3, :3]

            loss_trans = F.l1_loss(pred_trans, gt_trans)
            loss_rot = get_sym_loss_rot(pred_rot, gt_rot)
            val_loss_trans.append(loss_trans.item())
            val_loss_rot.append(loss_rot.item())
            loss = loss_trans + loss_rot

        val_epochs_loss_rot.append(np.mean(val_loss_rot))
        val_epochs_loss_trans.append(np.mean(val_loss_trans))
        if len(val_epochs_loss_rot)>=1 and np.mean(val_loss_rot) <= val_epochs_loss_rot[-1]:
            torch.save(model, os.path.join(model_save_dir, f"model_{epoch}.pth"))
            shutil.copytree(model_save_dir, os.path.join("/content/drive/MyDrive/", model_save_dir), dirs_exist_ok=True)

    print("val_loss_rot="+str(np.mean(val_loss_rot))+" val_loss_trans="+str(np.mean(val_loss_trans)))
    print("val_loss_total="+str(np.mean([val_loss_rot[i]+val_loss_trans[i] for i in range(len(val_loss_trans))])))
    if epoch == epochs - 1:
        torch.save(model, os.path.join(model_save_dir, f"model_{epoch}.pth"))
        shutil.copytree(model_save_dir, os.path.join("/content/drive/MyDrive/", model_save_dir), dirs_exist_ok=True)

# draw loss
plt.figure(figsize=(12,4))
plt.subplot(121)
plt.plot(train_epochs_loss_rot[0:],'-o',label="train_loss_rot")
plt.plot(val_epochs_loss_rot[0:],'-o',label="valid_loss_rot")
plt.title("epochs_loss_rot")
plt.legend()
plt.subplot(122)
plt.plot(train_epochs_loss_trans[0:],'-o',label="train_loss_trans")
plt.plot(val_epochs_loss_trans[0:],'-o',label="valid_loss_trans")
plt.title("epochs_loss_trans")
plt.legend()
plt.savefig('loss.png')


mkdir: cannot create directory ‘model_save_4’: File exists


Epoch 0:   0%|          | 0/59 [00:00<?, ?it/s]
Epoch 0:   2%|▏         | 1/59 [00:02<02:10,  2.25s/it, loss_rot=5.05, loss_trans=0.374]
Epoch 0:   3%|▎         | 2/59 [00:03<01:45,  1.85s/it, loss_rot=4.91, loss_trans=0.365]
Epoch 0:   5%|▌         | 3/59 [00:05<01:35,  1.71s/it, loss_rot=4.63, loss_trans=0.361]
Epoch 0:   7%|▋         | 4/59 [00:06<01:31,  1.66s/it, loss_rot=4.39, loss_trans=0.355]
Epoch 0:   8%|▊         | 5/59 [00:08<01:30,  1.68s/it, loss_rot=4.19, loss_trans=0.35]
Epoch 0:  10%|█         | 6/59 [00:10<01:26,  1.63s/it, loss_rot=4, loss_trans=0.348]
Epoch 0:  12%|█▏        | 7/59 [00:11<01:21,  1.57s/it, loss_rot=3.82, loss_trans=0.344]
Epoch 0:  14%|█▎        | 8/59 [00:13<01:19,  1.55s/it, loss_rot=3.68, loss_trans=0.342]
Epoch 0:  15%|█▌        | 9/59 [00:14<01:17,  1.55s/it, loss_rot=3.55, loss_trans=0.339]
Epoch 0:  17%|█▋        | 10/59 [00:16<01:16,  1.55s/it, loss_rot=3.46, loss_trans=0.337]
Epoch 0:  19%|█▊        | 11/59 [00:17<01:15,  1.57s/it, loss_rot

KeyboardInterrupt: ignored

In [None]:
# garbage collection to avoid CUDA memory explosion =_=
import gc
gc.collect()

0

In [None]:
# os.listdir("/content/training_data_filtered/training_data/splits/v2")
print("val_loss_rot="+str(np.mean(val_loss_rot))+" val_loss_trans="+str(np.mean(val_loss_trans)))
print("val_loss_total="+str(np.mean([val_loss_rot[i]+val_loss_trans[i] for i in range(len(val_loss_trans))])))
# shutil.copytree(model_save_dir, "/content/drive/MyDrive/", dirs_exist_ok=True)

val_loss_rot=3.076801836490631 val_loss_trans=0.34637565165758133
val_loss_total=3.4231774881482124


'/content/drive/MyDrive/'

# Inference

In [None]:
# Test data loading helpers
import os

test_split_dir = "./testing_data_pose_filtered/testing_data"
testing_data_root = "./testing_data_pose_filtered/testing_data"
testing_data_dir = "./testing_data_pose_filtered/testing_data/v2.2"

def get_test_split_files(split_name):
    with open(os.path.join(test_split_dir, f"{split_name}.txt"), 'r') as f:
        # for line in f:
        #   print(line)
        #   if line.strip():
        #     print(testing_data_dir)
        #     print(line.strip())
        prefixIds = [line.strip() for line in f if line.strip()]
        prefix = [os.path.join(testing_data_dir, line) for line in prefixIds]
        # print(prefixIds)
        # print(prefix)
        rgb = [p + "_color_kinect.png" for p in prefix]
        depth = [p + "_depth_kinect.png" for p in prefix]
        label = [p + "_label_kinect.png" for p in prefix]
        meta = [p + "_meta.pkl" for p in prefix]
    return rgb, depth, label, meta, prefixIds


test_rgb_files, test_depth_files, test_label_files, test_meta_files, test_prefix_ids = get_test_split_files('test')

In [None]:
# Rephase training objects point clouds and poses into {scene}_{object_id}.npz
!mkdir /content/testing_data_pose_filtered/testing_data/test_npy
import numpy as np
import time
from tqdm import tqdm

roundDown = 0

def save_object_info_array(train_rgb_files, train_depth_files, train_label_files, train_meta_files, train_prefix_ids, save_dir="/content/training_data_filtered/training_data/train_npy"):
  stt = time.time()
  infoPaths = []
  # roundUp = 400
  roundUp = len(train_rgb_files)
  for i in tqdm(range(roundDown, roundUp)):
    meta_dict = get_meta(train_meta_files[i])
    object_dict = {}

    # Fetch scene id
    prefix_id = train_prefix_ids[i]
    object_dict["prefix_id"] = prefix_id
    if prefix_id.startswith('1'):
      object_dict["scene"] = 0
    else:
      object_dict["scene"] = 1

    label_map = get_label(train_label_files[i])
    depth_map = get_depth(train_depth_files[i])
    intrinsic = meta_dict['intrinsic']
    object_ids = meta_dict['object_ids']
    for j in range(len(object_ids)):
      # Fetch object id and pose
      object_id = object_ids[j]
      object_dict["id"] = object_id
      # object_dict["pose"] = meta_dict["poses_world"][object_id]
      # Fetch point cloud and colors (TODO)
      object_pcd = get_object_point_cloud(label_map.copy(), object_id, depth_map, intrinsic)
      object_dict["points"] = object_pcd
      save_path = os.path.join(save_dir, '{}-{:02d}.npy'.format(prefix_id, object_id))
      np.save(save_path, object_dict)
      infoPaths.append(save_path)
  end = time.time()
  print("Object info array saving finished! Taking {}".format(end-stt))
  return infoPaths

object_file_paths_test = save_object_info_array(test_rgb_files, test_depth_files, test_label_files, test_meta_files, test_prefix_ids, save_dir="/content/testing_data_pose_filtered/testing_data/test_npy")



100%|██████████| 200/200 [01:53<00:00,  1.77it/s]

Object info array saving finished! Taking 113.18145489692688





In [77]:
# inference for result submissions
import numpy as np
import shutil
import json

test_batch_size = 512
test_num_workers = 4

sel_epoch = 20 # choose the 20th epoch's model. Since latter ones may overfit
model_path = "/content/model_save_3/model_20.pth"
model = torch.load(model_path)
model.eval()
poses = {}
test_files_path = "/content/testing_data_pose_filtered/testing_data/test_npy"
test_data = MyDataset(os.listdir(test_files_path), test_files_path, is_test=True)
loader_test = DataLoader(test_data, batch_size=test_batch_size, num_workers=test_num_workers)

def get_pose_matrix(rot, tr):
  T = np.identity(4)
  T[:3, :3] = rot
  T[:3, 3] = tr
  return T

def load_json(path):
  f = open(path)
  data = json.load(f)
  return data

def dump_json(sample, path):
  with open(path, 'w') as fp:
    json.dump(sample, fp)
  return 0

for data in tqdm(loader_test):
    # print(data)
    object_id = data["object_id"].to(device)
    points = data["points"].to(device)
    # colors = data["colors"].to(device)
    scene = data["scene"].to(device)
    prefix_id = data["prefix_id"]

    pred_trans, pred_rot = model(
        torch.cat([points], dim=1), F.one_hot(object_id, 79).float(), F.one_hot(scene, 2).float()
    )

    for j in range(pred_trans.shape[0]):
      tr, rot = pred_trans[j, ...], pred_rot[j, ...]
      prefix = prefix_id[j]
      id = object_id[j]
      T = get_pose_matrix(rot.detach().cpu().numpy(), tr.detach().cpu().numpy())
      if prefix not in poses.keys():
        poses[prefix] = {"poses_world": [None]*79}
      poses[prefix]["poses_world"][id] = T.tolist()

dump_json(poses, "result_test.json")
shutil.copy("result_test.json", "/content/drive/MyDrive/model_save_3/")


100%|██████████| 3/3 [00:00<00:00,  3.52it/s]


'/content/drive/MyDrive/model_save_3/result_test.json'