In [1]:
import sys
sys.path.append('nglod/sdf-net')

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.fx import symbolic_trace

In [2]:
def setparam(args, param, paramstr):
    argsparam = getattr(args, paramstr, None)
    if param is not None or argsparam is None:
        return param
    else:
        return argsparam

In [3]:
class FeatureVolume(nn.Module):
    def __init__(self, fdim, fsize):
        super().__init__()
        self.fsize = fsize
        self.fdim = fdim
        self.fm = nn.Parameter(torch.randn(1, fdim, fsize+1, fsize+1, fsize+1) * 0.01)
        self.sparse = None

    def forward(self, x):
        N = x.shape[0]
        sample_coords = x.view(1, N, 1, 1, 3) # [N, 1, 1, 3]    
        sample = F.grid_sample(self.fm, sample_coords, 
                               align_corners=True, padding_mode='border')[0,:,:,0,0].transpose(0,1)
        return sample


In [4]:
class BaseSDF(nn.Module):
    def __init__(self,
        args             = None,
        pos_enc  : bool  = None,
        ff_dim   : int   = None,
        ff_width : float = None
    ):
        super().__init__()
        self.args = args
        self.pos_enc = setparam(args, pos_enc, 'pos_enc')
        self.ff_dim = setparam(args, ff_dim, 'ff_dim')
        self.ff_width = setparam(args, ff_width, 'ff_width')
        
        self.input_dim = 3
        self.out_dim = 1

        if self.ff_dim > 0:
            mat = torch.randn([self.ff_dim, 3]) * self.ff_width
            self.gauss_matrix = nn.Parameter(mat)
            self.gauss_matrix.requires_grad_(False)
            self.input_dim += (self.ff_dim * 2) - 3
        elif self.pos_enc:
            self.input_dim = self.input_dim * 13


In [29]:
BATCH_SIZE = 100_000
class OctreeSDF(BaseSDF):
    def __init__(self, args, init=None):
        super().__init__(args)
        
        self.num_lods = args.num_lods
        self.lod = None

        self.fdim = self.args.feature_dim
        self.fsize = self.args.feature_size
        self.hidden_dim = self.args.hidden_dim
        self.pos_invariant = self.args.pos_invariant

        self.features = nn.ModuleList([])
        for i in range(self.args.num_lods):
            self.features.append(FeatureVolume(self.fdim, (2**(i+self.args.base_lod))))
        self.interpolate = self.args.interpolate

        self.louts = nn.ModuleList([])

        self.sdf_input_dim = self.fdim
        if not self.pos_invariant:
            self.sdf_input_dim += self.input_dim

        self.num_decoder = 1 if args.joint_decoder else self.args.num_lods 

        for i in range(self.num_decoder):
            self.louts.append(
                nn.Sequential(
                    nn.Linear(self.sdf_input_dim, self.hidden_dim, bias=True),
                    nn.ReLU(),
                    nn.Linear(self.hidden_dim, 1, bias=True),
                )
            )
        self.proj = nn.Linear(27, 1)
    def forward(self, x):
        # Query
        return torch.sum(x)
        summed = torch.zeros(BATCH_SIZE, 24, dtype=torch.float32, device='cuda')

        for i in range(3):
            # Query features
            sample = self.features[i](x)

            # Sum queried features
            summed = summed + sample

        # Concatenate xyz
        ex_sample = torch.cat([x, summed], dim=-1)
        return self.proj(ex_sample)
        # return self.louts[2](ex_sample)


def load_nglod(weights_path, obj_path):
    from nglod import build_parser
    parser = build_parser()
    args = parser.parse_args([
        '--net', 'OctreeSDF', 
        '--num-lods', '3',
        '--feature-dim', '24',
        # '--num-lods', '4',
        # '--feature-dim', '32',
        ])
    model = OctreeSDF(args)
    model.load_state_dict(torch.load(weights_path, map_location='cpu'), strict=False)
    model.cuda()
    model = {'model': model, 'obj_path': obj_path}
    return model


In [30]:
model = load_nglod('./_results_old/models/0.pth', 'test_task_meshes/0.obj')

In [31]:
model['model'] = model['model'].cuda()
model['model'].eval();

In [32]:
torch.set_grad_enabled(False)
model['compiled'] = torch.compile(model['model'], backend='inductor')
# model['fx'] = symbolic_trace(model['model'])
# model['trace'] = torch.jit.trace(model['model'], torch.rand(BATCH_SIZE, 3, device='cuda'))
# model['script'] = torch.jit.trace(model['model'], torch.rand(BATCH_SIZE, 3, device='cuda'))
torch.set_grad_enabled(True)

<torch.autograd.grad_mode.set_grad_enabled at 0x2aabe631dcf0>

In [12]:
def fn(x):
    if x.shape[0] > 1:
        return x.sum()
    return x

In [13]:
fn_c = torch.compile(fn)

In [15]:
fn_c(torch.randn(100, 100))

tensor(-129.3892)

In [10]:
m = torch.compile(nn.Linear(10, 1))

In [11]:
m(torch.randn(5, 10))

tensor([[-0.4982],
        [ 0.3896],
        [ 0.6954],
        [-0.0692],
        [ 0.7475]])

In [33]:
model['compiled'].cuda();

In [43]:
!echo $LD_LIBRARY_PATH

/cm/local/apps/cuda/libs/current/lib64:/data01/software/cuda11.2/lib64:/cm/shared/apps/sge/current/lib/linux-x64:/cm/local/apps/gcc/7.2.0/lib:/cm/local/apps/gcc/7.2.0/lib64:/titan/bohdan/miniconda3/envs/piper/lib


In [36]:
model['compiled'](torch.randn(100, 100, device='cuda'))

/usr/bin/ld: cannot find -lcuda
collect2: error: ld returned 1 exit status
/usr/bin/ld: cannot find -lcuda
collect2: error: ld returned 1 exit status


BackendCompilerFailed: debug_wrapper raised CalledProcessError: Command '['/cm/local/apps/gcc/7.2.0/bin/gcc', '/local/440235.1.gpu_long_2080ti/tmpx2zr_cx3/main.c', '-O3', '-I/titan/bohdan/miniconda3/envs/piper/lib/python3.10/site-packages/triton/third_party/cuda/include', '-I/titan/bohdan/miniconda3/envs/piper/include/python3.10', '-I/local/440235.1.gpu_long_2080ti/tmpx2zr_cx3', '-shared', '-fPIC', '-lcuda', '-o', '/local/440235.1.gpu_long_2080ti/tmpx2zr_cx3/triton_.cpython-310-x86_64-linux-gnu.so']' returned non-zero exit status 1.

Set torch._dynamo.config.verbose=True for more information


You can suppress this exception and fall back to eager by setting:
    torch._dynamo.config.suppress_errors = True


In [20]:
def measure_time(model):
    torch.set_grad_enabled(False)
    starter, ender = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True)
    total_time_ms = 0
    times = []
    N_REPS = 10_000
    for _ in range(N_REPS):
        points = torch.rand(BATCH_SIZE, 3, device='cuda') * 2 - 1
        starter.record()
        model(points)
        ender.record()
        torch.cuda.synchronize()
        times.append(starter.elapsed_time(ender))
        total_time_ms += times[-1]
    mean_ns = np.mean(times) * 1000
    std_ns = np.std(times) * 1000
    print(f"mean = {mean_ns:.2f} us")
    print(f"std = {std_ns:.2f} us")
    torch.set_grad_enabled(True)
# print("Simple execution:")
# measure_time(model['model'])

print("Compiled with torch.compile:")
measure_time(model['compiled'])

# print("Symbolic trace:")
# measure_time(model['fx'])

# print("torch.jit.trace:")
# measure_time(model['trace'])

# print("torch.jit.script:")
# measure_time(model['script'])

Compiled with torch.compile:


/usr/bin/ld: cannot find -lcuda
collect2: error: ld returned 1 exit status
/usr/bin/ld: cannot find -lcuda
collect2: error: ld returned 1 exit status


BackendCompilerFailed: debug_wrapper raised CalledProcessError: Command '['/cm/local/apps/gcc/7.2.0/bin/gcc', '/local/440235.1.gpu_long_2080ti/tmp9w71lw15/main.c', '-O3', '-I/titan/bohdan/miniconda3/envs/piper/lib/python3.10/site-packages/triton/third_party/cuda/include', '-I/titan/bohdan/miniconda3/envs/piper/include/python3.10', '-I/local/440235.1.gpu_long_2080ti/tmp9w71lw15', '-shared', '-fPIC', '-lcuda', '-o', '/local/440235.1.gpu_long_2080ti/tmp9w71lw15/triton_.cpython-310-x86_64-linux-gnu.so']' returned non-zero exit status 1.

Set torch._dynamo.config.verbose=True for more information


You can suppress this exception and fall back to eager by setting:
    torch._dynamo.config.suppress_errors = True


In [10]:
!nvidia-smi

Wed Jan 10 22:23:33 2024       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  GeForce RTX 208...  On   | 00000000:3D:00.0 Off |                  N/A |
| 35%   46C    P2    87W / 260W |    500MiB / 11019MiB |     28%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  GeForce RTX 208...  On   | 00000000:3E:00.0 Off |                  N/A |
| 27%   29C    P8    25W / 260W |      3MiB / 11019MiB |      0%      Default |
|       

In [44]:
from measure_f1 import load_siren

In [46]:
model = load_siren('siren_weights/0.pt')

In [49]:
model['model'].cuda()

Siren(
  (net): Sequential(
    (0): SineLayer(
      (linear): Linear(in_features=3, out_features=256, bias=True)
    )
    (1): SineLayer(
      (linear): Linear(in_features=256, out_features=256, bias=True)
    )
    (2): SineLayer(
      (linear): Linear(in_features=256, out_features=256, bias=True)
    )
    (3): SineLayer(
      (linear): Linear(in_features=256, out_features=256, bias=True)
    )
    (4): Linear(in_features=256, out_features=1, bias=True)
  )
)

In [66]:
BATCH_SIZE = 3_000
def measure_time(model):
    torch.set_grad_enabled(False)
    starter, ender = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True)
    total_time_ms = 0
    times = []
    N_REPS = 10_000
    for _ in range(N_REPS):
        points = torch.rand(BATCH_SIZE, 3, device='cuda') * 2 - 1
        starter.record()
        model({'coords': points})
        ender.record()
        torch.cuda.synchronize()
        times.append(starter.elapsed_time(ender))
        total_time_ms += times[-1]
    mean_ns = np.mean(times) * 1000
    std_ns = np.std(times) * 1000
    print(f"mean = {mean_ns:.2f} us")
    print(f"std = {std_ns:.2f} us")
    torch.set_grad_enabled(True)


In [67]:
print("Simple execution:")
measure_time(model['model'])

Simple execution:
mean = 703.42 us
std = 64.64 us


In [61]:
next(model['model'].parameters()).device

device(type='cuda', index=0)

In [51]:
model['model'](torch.rand(10, 3))

IndexError: too many indices for tensor of dimension 2