In [23]:
import os, sys
sys.path.append('/home/ubuntu/votenet')
import numpy as np
import torch

# Code Sketch

The Architecture consists of 3 main parts:
* Backbone network PointNet++ (Multi Scale (Two scales per set abstraction layer)
* Voting Module
* Proposal Module

## BackBone

In [None]:
class Pointnet2Backbone_MSG(nn.Module):
    """
    ABAHNASY: backbone with MSG
       Backbone network for point cloud feature learning.
       Based on Pointnet++ multi-scale grouping network. 
        
       Parameters
       ----------
       input_feature_dim: int
            Number of input channels in the feature descriptor for each point.
            e.g. 3 for RGB.
    """
    def __init__(self, input_feature_dim=0):
        super().__init__()

        self.sa1 = PointnetSAModuleMSGVotes(
                npoint=4096,
                radii=[0.1, 0.5],
                nsamples=[64, 64],
                mlps=[[input_feature_dim, 16, 16, 32], [input_feature_dim, 32, 32, 64]],
                use_xyz=True
            )

        self.sa2 = PointnetSAModuleMSGVotes(
                npoint=1024,
                radii=[0.5, 1.0],
                nsamples=[32, 32],
                mlps=[[96, 64, 64, 128], [96, 64, 96, 128]],
                use_xyz=True
            )

        self.sa3 = PointnetSAModuleMSGVotes(
                npoint=512,
                radii=[1.0, 2.0],
                nsamples=[16, 16],
                mlps=[[256, 128, 196, 256], [256, 128, 196, 256]], 
                use_xyz=True
            )

        self.sa4 = PointnetSAModuleMSGVotes(
                npoint=256,
                radii=[2.0, 4.0],
                nsamples=[16, 16],
                mlps=[[512, 256, 256, 512], [512, 256, 384, 512]],
                use_xyz=True
            )

        self.fp1 = PointnetFPModule(mlp=[1024+512, 512, 512])
        self.fp2 = PointnetFPModule(mlp=[512+256, 512, 512])

In [None]:
# Set Abstraction Block Code
class PointnetSAModuleMSGVotes(nn.Module):
    ''' Modified based on _PointnetSAModuleBase and PointnetSAModuleMSG
    with extra support for returning point indices for getting their GT votes '''

    def __init__(
            self,
            *,
            mlps: List[List[int]],
            npoint: int,
            radii: List[float],
            nsamples: List[int],
            bn: bool = True,
            use_xyz: bool = True,
            sample_uniformly: bool = False
    ):
        super().__init__()

        assert(len(mlps) == len(nsamples) == len(radii))

        self.npoint = npoint
        self.groupers = nn.ModuleList()
        self.mlps = nn.ModuleList()
        for i in range(len(radii)):
            radius = radii[i]
            nsample = nsamples[i]
            self.groupers.append(
                pointnet2_utils.QueryAndGroup(radius, nsample, use_xyz=use_xyz, sample_uniformly=sample_uniformly)
                if npoint is not None else pointnet2_utils.GroupAll(use_xyz)
            )
            mlp_spec = mlps[i]
            if use_xyz:
                mlp_spec[0] += 3

            self.mlps.append(pt_utils.SharedMLP(mlp_spec, bn=bn))


## Voting Module

In [None]:
class VotingModule(nn.Module):
    def __init__(self, vote_factor, seed_feature_dim):
        """ Votes generation from seed point features.

        Args:
            vote_facotr: int
                number of votes generated from each seed point
            seed_feature_dim: int
                number of channels of seed point features
            vote_feature_dim: int
                number of channels of vote features
        """
        super().__init__()
        self.vote_factor = vote_factor
        self.in_dim = seed_feature_dim
        self.out_dim = self.in_dim 
        self.conv1 = torch.nn.Conv1d(self.in_dim, self.in_dim, 1)
        self.conv2 = torch.nn.Conv1d(self.in_dim, self.in_dim, 1)
        self.conv3 = torch.nn.Conv1d(self.in_dim, (3+self.out_dim) * self.vote_factor, 1)
        self.bn1 = torch.nn.BatchNorm1d(self.in_dim)
        self.bn2 = torch.nn.BatchNorm1d(self.in_dim)

## Proposal Module

In [None]:
class ProposalModule(nn.Module):
    def __init__(self, num_class, num_heading_bin, num_size_cluster, mean_size_arr, num_proposal, sampling, seed_feat_dim=512):
        super().__init__() 

        self.num_class = num_class
        self.num_heading_bin = num_heading_bin
        self.num_size_cluster = num_size_cluster
        self.mean_size_arr = mean_size_arr
        self.num_proposal = num_proposal
        self.sampling = sampling
        self.seed_feat_dim = seed_feat_dim

        # Vote clustering
        self.vote_aggregation = PointnetSAModuleVotes( 
                npoint=self.num_proposal,
                radius=0.7,
                nsample=16,
                mlp=[self.seed_feat_dim, 128, 128, 128],
                use_xyz=True,
                normalize_xyz=True
            )
    
        # Object proposal/detection
        # Objectness scores (2), center residual (3),
        # heading class+residual (num_heading_bin*2), size class+residual(num_size_cluster*4)
        self.conv1 = torch.nn.Conv1d(128,128,1)
        self.conv2 = torch.nn.Conv1d(128,128,1)
        self.conv3 = torch.nn.Conv1d(128,2+3+num_heading_bin*2+num_size_cluster*4+self.num_class,1)
        self.bn1 = torch.nn.BatchNorm1d(128)
        self.bn2 = torch.nn.BatchNorm1d(128)

## VoteNet (All modules together)

In [None]:
class VoteNet(nn.Module):
    r"""
        A deep neural network for 3D object detection with end-to-end optimizable hough voting.

        Parameters
        ----------
        num_class: int
            Number of semantics classes to predict over -- size of softmax classifier
        num_heading_bin: int
        num_size_cluster: int
        input_feature_dim: (default: 0)
            Input dim in the feature descriptor for each point.  If the point cloud is Nx9, this
            value should be 6 as in an Nx9 point cloud, 3 of the channels are xyz, and 6 are feature descriptors
        num_proposal: int (default: 128)
            Number of proposals/detections generated from the network. Each proposal is a 3D OBB with a semantic class.
        vote_factor: (default: 1)
            Number of votes generated from each seed point.
    """

    def __init__(self, num_class, num_heading_bin, num_size_cluster, mean_size_arr,
        input_feature_dim=0, num_proposal=128, vote_factor=1, sampling='vote_fps'):
        super().__init__()

        self.num_class = num_class
        self.num_heading_bin = num_heading_bin
        self.num_size_cluster = num_size_cluster
        self.mean_size_arr = mean_size_arr
        assert(mean_size_arr.shape[0] == self.num_size_cluster)
        self.input_feature_dim = input_feature_dim
        self.num_proposal = num_proposal
        self.vote_factor = vote_factor
        self.sampling=sampling

        # Backbone point feature learning
        # self.backbone_net = Pointnet2Backbone(input_feature_dim=self.input_feature_dim)
        self.backbone_net = Pointnet2Backbone_MSG(input_feature_dim=self.input_feature_dim)

        # Hough voting
        self.vgen = VotingModule(self.vote_factor, 512)

        # Vote aggregation and detection
        self.pnet = ProposalModule(num_class, num_heading_bin, num_size_cluster,
            mean_size_arr, num_proposal, sampling)

# Calculate Number of parameters

In [8]:
from models.votenet import VoteNet

In [11]:
num_class = 5 # classes to detect
num_heading_bin = 12 # angle discretization
num_size_cluster = 5 
# bbox size priors
mean_size_arr = np.array([[4.550878,2.069160,1.755000], 
                                              [4.550878,2.069160,1.755000], 
                                              [0.858276,0.819377,1.750000], 
                                              [0.097881,0.590338,0.650000], 
                                              [2.114592,0.859100,2.070000]])
# number of bboxes to predict
num_target = 64
# height from ground feature
num_input_channel = 1
vote_factor = 1
cluster_sampling = 'vote_fps'

In [None]:
model = VoteNet(num_class=num_class,
               num_heading_bin=num_heading_bin,
               num_size_cluster=num_size_cluster,
               mean_size_arr=mean_size_arr,
               num_proposal=num_target,
               input_feature_dim=num_input_channel,
               vote_factor=vote_factor,
               sampling=cluster_sampling)

In [71]:
model

VoteNet(
  (backbone_net): Pointnet2Backbone_MSG(
    (sa1): PointnetSAModuleMSGVotes(
      (groupers): ModuleList(
        (0): QueryAndGroup()
        (1): QueryAndGroup()
      )
      (mlps): ModuleList(
        (0): SharedMLP(
          (layer0): Conv2d(
            (conv): Conv2d(4, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (bn): BatchNorm2d(
              (bn): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            )
            (activation): ReLU(inplace=True)
          )
          (layer1): Conv2d(
            (conv): Conv2d(16, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (bn): BatchNorm2d(
              (bn): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            )
            (activation): ReLU(inplace=True)
          )
          (layer2): Conv2d(
            (conv): Conv2d(16, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (bn): BatchNorm2d(
   

In [70]:
num = 0
for mod in list(model.modules()):
    p_list = list(mod.parameters())
    for p in p_list:
        num += np.prod(np.array((p.size())))
    
num

20547307