diff --git a/src/blender/addon/engine.py b/src/blender/addon/engine.py index a45249fe1..cacdb3edc 100644 --- a/src/blender/addon/engine.py +++ b/src/blender/addon/engine.py @@ -258,6 +258,7 @@ def list_render_passes(srl): if crl.pass_debug_bvh_traversed_instances: yield ("Debug BVH Traversed Instances", "X", 'VALUE') if crl.pass_debug_bvh_intersections: yield ("Debug BVH Intersections", "X", 'VALUE') if crl.pass_debug_ray_bounces: yield ("Debug Ray Bounces", "X", 'VALUE') + if crl.pass_debug_sample_count: yield ("Debug Sample Count", "X", 'VALUE') if crl.use_pass_volume_direct: yield ("VolumeDir", "RGB", 'COLOR') if crl.use_pass_volume_indirect: yield ("VolumeInd", "RGB", 'COLOR') diff --git a/src/blender/addon/properties.py b/src/blender/addon/properties.py index 80a12aadb..64a0d5f11 100644 --- a/src/blender/addon/properties.py +++ b/src/blender/addon/properties.py @@ -112,6 +112,7 @@ enum_sampling_pattern = ( ('SOBOL', "Sobol", "Use Sobol random sampling pattern"), ('CORRELATED_MUTI_JITTER', "Correlated Multi-Jitter", "Use Correlated Multi-Jitter random sampling pattern"), + ('PROGRESSIVE_MUTI_JITTER', "Progressive Multi-Jitter", "Use Progressive Multi-Jitter random sampling pattern"), ) enum_integrator = ( @@ -336,6 +337,26 @@ class CyclesRenderSettings(bpy.types.PropertyGroup): default=0.01, ) + adaptive_threshold: FloatProperty( + name="Adaptive Sampling Threshold", + description="Zero for automatic setting based on AA samples", + min=0.0, max=1.0, + default=0.0, + ) + + adaptive_min_samples: IntProperty( + name="Adaptive Min Samples", + description="Minimum AA samples for adaptive sampling. Zero for automatic setting based on AA samples", + min=0, max=4096, + default=0, + ) + + use_adaptive_sampling: BoolProperty( + name="Use adaptive sampling", + description="Automatically determine the number of samples per pixel based on a variance estimation", + default=False, + ) + min_light_bounces: IntProperty( name="Min Light Bounces", description="Minimum number of light bounces. Setting this higher reduces noise in the first bounces, " @@ -1279,6 +1300,12 @@ class CyclesRenderLayerSettings(bpy.types.PropertyGroup): default=False, update=update_render_passes, ) + pass_debug_sample_count: BoolProperty( + name="Debug Sample Count", + description="Number of samples/camera rays per pixel", + default=False, + update=update_render_passes, + ) use_pass_volume_direct: BoolProperty( name="Volume Direct", description="Deliver direct volumetric scattering pass", diff --git a/src/blender/addon/ui.py b/src/blender/addon/ui.py index ffc067fd6..61ac9b03c 100644 --- a/src/blender/addon/ui.py +++ b/src/blender/addon/ui.py @@ -188,6 +188,8 @@ def draw(self, context): col.prop(cscene, "aa_samples", text="Render") col.prop(cscene, "preview_aa_samples", text="Viewport") + col.prop(cscene, "use_adaptive_sampling", text="Adaptive Sampling") + class CYCLES_RENDER_PT_sampling_sub_samples(CyclesButtonsPanel, Panel): bl_label = "Sub Samples" @@ -239,7 +241,13 @@ def draw(self, context): row.prop(cscene, "seed") row.prop(cscene, "use_animated_seed", text="", icon='TIME') - layout.prop(cscene, "sampling_pattern", text="Pattern") + col = layout.column(align=True) + col.active = not(cscene.use_adaptive_sampling) + col.prop(cscene, "sampling_pattern", text="Pattern") + col = layout.column(align=True) + col.active = cscene.use_adaptive_sampling + col.prop(cscene, "adaptive_min_samples", text="Adaptive Min Samples") + col.prop(cscene, "adaptive_threshold", text="Adaptive Threshold") layout.prop(cscene, "use_square_samples") @@ -803,6 +811,8 @@ def draw(self, context): col.prop(cycles_view_layer, "denoising_store_passes", text="Denoising Data") col = flow.column() col.prop(cycles_view_layer, "pass_debug_render_time", text="Render Time") + col = flow.column() + col.prop(cycles_view_layer, "pass_debug_sample_count", text="Sample Count") layout.separator() diff --git a/src/blender/blender_session.cpp b/src/blender/blender_session.cpp index 78fb49db6..609f0ff46 100644 --- a/src/blender/blender_session.cpp +++ b/src/blender/blender_session.cpp @@ -474,7 +474,8 @@ void BlenderSession::render(BL::Depsgraph &b_depsgraph_) b_rlay_name = b_view_layer.name(); /* add passes */ - vector passes = sync->sync_render_passes(b_rlay, b_view_layer); + vector passes = sync->sync_render_passes( + b_rlay, b_view_layer, session_params.adaptive_sampling); buffer_params.passes = passes; PointerRNA crl = RNA_pointer_get(&b_view_layer.ptr, "cycles"); diff --git a/src/blender/blender_sync.cpp b/src/blender/blender_sync.cpp index 332ee3575..7757120e1 100644 --- a/src/blender/blender_sync.cpp +++ b/src/blender/blender_sync.cpp @@ -291,6 +291,16 @@ void BlenderSync::sync_integrator() integrator->sample_all_lights_indirect = get_boolean(cscene, "sample_all_lights_indirect"); integrator->light_sampling_threshold = get_float(cscene, "light_sampling_threshold"); + if (RNA_boolean_get(&cscene, "use_adaptive_sampling")) { + integrator->sampling_pattern = SAMPLING_PATTERN_PMJ; + integrator->adaptive_min_samples = get_int(cscene, "adaptive_min_samples"); + integrator->adaptive_threshold = get_float(cscene, "adaptive_threshold"); + } + else { + integrator->adaptive_min_samples = INT_MAX; + integrator->adaptive_threshold = 0.0f; + } + int diffuse_samples = get_int(cscene, "diffuse_samples"); int glossy_samples = get_int(cscene, "glossy_samples"); int transmission_samples = get_int(cscene, "transmission_samples"); @@ -307,6 +317,8 @@ void BlenderSync::sync_integrator() integrator->mesh_light_samples = mesh_light_samples * mesh_light_samples; integrator->subsurface_samples = subsurface_samples * subsurface_samples; integrator->volume_samples = volume_samples * volume_samples; + integrator->adaptive_min_samples = min( + integrator->adaptive_min_samples * integrator->adaptive_min_samples, INT_MAX); } else { integrator->diffuse_samples = diffuse_samples; @@ -482,6 +494,8 @@ PassType BlenderSync::get_pass_type(BL::RenderPass &b_pass) MAP_PASS("Debug Ray Bounces", PASS_RAY_BOUNCES); #endif MAP_PASS("Debug Render Time", PASS_RENDER_TIME); + MAP_PASS("AdaptiveAuxBuffer", PASS_ADAPTIVE_AUX_BUFFER); + MAP_PASS("Debug Sample Count", PASS_SAMPLE_COUNT); if (string_startswith(name, cryptomatte_prefix)) { return PASS_CRYPTOMATTE; } @@ -517,7 +531,9 @@ int BlenderSync::get_denoising_pass(BL::RenderPass &b_pass) return -1; } -vector BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay, BL::ViewLayer &b_view_layer) +vector BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay, + BL::ViewLayer &b_view_layer, + bool adaptive_sampling) { vector passes; @@ -589,6 +605,10 @@ vector BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay, BL::ViewLa b_engine.add_pass("Debug Render Time", 1, "X", b_view_layer.name().c_str()); Pass::add(PASS_RENDER_TIME, passes, "Debug Render Time"); } + if (get_boolean(crp, "pass_debug_sample_count")) { + b_engine.add_pass("Debug Sample Count", 1, "X", b_view_layer.name().c_str()); + Pass::add(PASS_SAMPLE_COUNT, passes); + } if (get_boolean(crp, "use_pass_volume_direct")) { b_engine.add_pass("VolumeDir", 3, "RGB", b_view_layer.name().c_str()); Pass::add(PASS_VOLUME_DIRECT, passes, "VolumeDir"); @@ -650,6 +670,13 @@ vector BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay, BL::ViewLa } RNA_END; + if (adaptive_sampling) { + Pass::add(PASS_ADAPTIVE_AUX_BUFFER, passes); + if (!get_boolean(crp, "pass_debug_sample_count")) { + Pass::add(PASS_SAMPLE_COUNT, passes); + } + } + return passes; } @@ -883,6 +910,8 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine &b_engine, params.use_profiling = params.device.has_profiling && !b_engine.is_preview() && background && BlenderSession::print_render_stats; + params.adaptive_sampling = RNA_boolean_get(&cscene, "use_adaptive_sampling"); + return params; } diff --git a/src/blender/blender_sync.h b/src/blender/blender_sync.h index a80f484fb..d00baef31 100644 --- a/src/blender/blender_sync.h +++ b/src/blender/blender_sync.h @@ -70,7 +70,9 @@ class BlenderSync { int height, void **python_thread_state); void sync_view_layer(BL::SpaceView3D &b_v3d, BL::ViewLayer &b_view_layer); - vector sync_render_passes(BL::RenderLayer &b_render_layer, BL::ViewLayer &b_view_layer); + vector sync_render_passes(BL::RenderLayer &b_render_layer, + BL::ViewLayer &b_view_layer, + bool adaptive_sampling); void sync_integrator(); void sync_camera(BL::RenderSettings &b_render, BL::Object &b_override, diff --git a/src/device/device_cpu.cpp b/src/device/device_cpu.cpp index c2843a61e..ab7e7b350 100644 --- a/src/device/device_cpu.cpp +++ b/src/device/device_cpu.cpp @@ -34,6 +34,7 @@ #include "kernel/kernel_types.h" #include "kernel/split/kernel_split_data.h" #include "kernel/kernel_globals.h" +#include "kernel/kernel_adaptive_sampling.h" #include "kernel/filter/filter.h" @@ -317,6 +318,10 @@ class CPUDevice : public Device { REGISTER_SPLIT_KERNEL(next_iteration_setup); REGISTER_SPLIT_KERNEL(indirect_subsurface); REGISTER_SPLIT_KERNEL(buffer_update); + REGISTER_SPLIT_KERNEL(adaptive_stopping); + REGISTER_SPLIT_KERNEL(adaptive_filter_x); + REGISTER_SPLIT_KERNEL(adaptive_filter_y); + REGISTER_SPLIT_KERNEL(adaptive_adjust_samples); #undef REGISTER_SPLIT_KERNEL #undef KERNEL_FUNCTIONS } @@ -851,10 +856,33 @@ class CPUDevice : public Device { path_trace_kernel()(kg, render_buffer, sample, x, y, tile.offset, tile.stride); } } - tile.sample = sample + 1; task.update_progress(&tile, tile.w * tile.h); + + if (kernel_data.film.pass_adaptive_aux_buffer && (sample & 0x3) == 3 && + sample >= kernel_data.integrator.adaptive_min_samples - 1) { + WorkTile wtile; + wtile.x = tile.x; + wtile.y = tile.y; + wtile.w = tile.w; + wtile.h = tile.h; + wtile.offset = tile.offset; + wtile.stride = tile.stride; + wtile.buffer = (float *)tile.buffer; + + bool any = false; + for (int y = tile.y; y < tile.y + tile.h; ++y) { + any |= kernel_do_adaptive_filter_x(kg, y, &wtile); + } + for (int x = tile.x; x < tile.x + tile.w; ++x) { + any |= kernel_do_adaptive_filter_y(kg, x, &wtile); + } + if (!any) { + tile.sample = end_sample; + break; + } + } } if (use_coverage) { coverage.finalize(); @@ -931,6 +959,28 @@ class CPUDevice : public Device { } else { path_trace(task, tile, kg); + if (task.integrator_adaptive && kernel_data.film.pass_adaptive_aux_buffer) { + float *render_buffer = (float *)tile.buffer; + for (int y = tile.y; y < tile.y + tile.h; y++) { + for (int x = tile.x; x < tile.x + tile.w; x++) { + int index = tile.offset + x + y * tile.stride; + ccl_global float *buffer = render_buffer + index * kernel_data.film.pass_stride; + if (buffer[kernel_data.film.pass_sample_count] < 0.0f) { + buffer[kernel_data.film.pass_sample_count] = + -buffer[kernel_data.film.pass_sample_count]; + float sample_multiplier = tile.sample / + max((float)tile.start_sample + 1.0f, + buffer[kernel_data.film.pass_sample_count]); + if (sample_multiplier != 1.0f) { + kernel_adaptive_post_adjust(kg, buffer, sample_multiplier); + } + } + else { + kernel_adaptive_post_adjust(kg, buffer, tile.sample / (tile.sample - 1.0f)); + } + } + } + } } } else if (tile.task == RenderTile::DENOISE) { diff --git a/src/device/device_cuda.cpp b/src/device/device_cuda.cpp index dfd80d678..260d94030 100644 --- a/src/device/device_cuda.cpp +++ b/src/device/device_cuda.cpp @@ -1788,6 +1788,23 @@ class CUDADevice : public Device { cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1)); + /* Kernels for adaptive sampling. */ + CUfunction cuAdaptiveStopping, cuAdaptiveFilterX, cuAdaptiveFilterY, cuAdaptiveScaleSamples; + if (task.integrator_adaptive) { + cuda_assert( + cuModuleGetFunction(&cuAdaptiveStopping, cuModule, "kernel_cuda_adaptive_stopping")); + cuda_assert(cuFuncSetCacheConfig(cuAdaptiveStopping, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert( + cuModuleGetFunction(&cuAdaptiveFilterX, cuModule, "kernel_cuda_adaptive_filter_x")); + cuda_assert(cuFuncSetCacheConfig(cuAdaptiveFilterX, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert( + cuModuleGetFunction(&cuAdaptiveFilterY, cuModule, "kernel_cuda_adaptive_filter_y")); + cuda_assert(cuFuncSetCacheConfig(cuAdaptiveFilterY, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuModuleGetFunction( + &cuAdaptiveScaleSamples, cuModule, "kernel_cuda_adaptive_scale_samples")); + cuda_assert(cuFuncSetCacheConfig(cuAdaptiveScaleSamples, CU_FUNC_CACHE_PREFER_L1)); + } + /* Allocate work tile. */ work_tiles.alloc(1); @@ -1812,6 +1829,16 @@ class CUDADevice : public Device { uint step_samples = divide_up(min_blocks * num_threads_per_block, wtile->w * wtile->h); + if (task.integrator_adaptive) { + /* Force to either 1, 2 or multiple of 4 samples per kernel invocation. */ + if (step_samples == 3) { + step_samples = 2; + } + else if (step_samples > 4) { + step_samples &= 0xfffffffc; + } + } + /* Render all samples. */ int start_sample = rtile.start_sample; int end_sample = rtile.start_sample + rtile.num_samples; @@ -1832,6 +1859,26 @@ class CUDADevice : public Device { cuda_assert(cuLaunchKernel( cuPathTrace, num_blocks, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0)); + uint filter_sample = sample + wtile->num_samples - 1; + /* Run the adaptive sampling kernels when we're at a multiple of 4 samples. + * These are a series of tiny kernels because there is no grid synchronisation + * from within a kernel, so multiple kernel launches it is. */ + if (task.integrator_adaptive && (filter_sample & 0x3) == 3) { + total_work_size = wtile->h * wtile->w; + void *args2[] = {&d_work_tiles, &filter_sample, &total_work_size}; + num_blocks = divide_up(total_work_size, num_threads_per_block); + cuda_assert(cuLaunchKernel( + cuAdaptiveStopping, num_blocks, 1, 1, num_threads_per_block, 1, 1, 0, 0, args2, 0)); + total_work_size = wtile->h; + num_blocks = divide_up(total_work_size, num_threads_per_block); + cuda_assert(cuLaunchKernel( + cuAdaptiveFilterX, num_blocks, 1, 1, num_threads_per_block, 1, 1, 0, 0, args2, 0)); + total_work_size = wtile->w; + num_blocks = divide_up(total_work_size, num_threads_per_block); + cuda_assert(cuLaunchKernel( + cuAdaptiveFilterY, num_blocks, 1, 1, num_threads_per_block, 1, 1, 0, 0, args2, 0)); + } + cuda_assert(cuCtxSynchronize()); /* Update progress. */ @@ -1843,6 +1890,17 @@ class CUDADevice : public Device { break; } } + + if (task.integrator_adaptive) { + CUdeviceptr d_work_tiles = cuda_device_ptr(work_tiles.device_pointer); + uint total_work_size = wtile->h * wtile->w; + void *args[] = {&d_work_tiles, &rtile.start_sample, &rtile.sample, &total_work_size}; + uint num_blocks = divide_up(total_work_size, num_threads_per_block); + cuda_assert(cuLaunchKernel( + cuAdaptiveScaleSamples, num_blocks, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0)); + cuda_assert(cuCtxSynchronize()); + task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples); + } } void film_convert(DeviceTask &task, diff --git a/src/device/device_split_kernel.cpp b/src/device/device_split_kernel.cpp index 42e597a34..197293b7a 100644 --- a/src/device/device_split_kernel.cpp +++ b/src/device/device_split_kernel.cpp @@ -55,6 +55,10 @@ DeviceSplitKernel::DeviceSplitKernel(Device *device) kernel_next_iteration_setup = NULL; kernel_indirect_subsurface = NULL; kernel_buffer_update = NULL; + kernel_adaptive_stopping = NULL; + kernel_adaptive_filter_x = NULL; + kernel_adaptive_filter_y = NULL; + kernel_adaptive_adjust_samples = NULL; } DeviceSplitKernel::~DeviceSplitKernel() @@ -83,6 +87,10 @@ DeviceSplitKernel::~DeviceSplitKernel() delete kernel_next_iteration_setup; delete kernel_indirect_subsurface; delete kernel_buffer_update; + delete kernel_adaptive_stopping; + delete kernel_adaptive_filter_x; + delete kernel_adaptive_filter_y; + delete kernel_adaptive_adjust_samples; } bool DeviceSplitKernel::load_kernels(const DeviceRequestedFeatures &requested_features) @@ -114,6 +122,10 @@ bool DeviceSplitKernel::load_kernels(const DeviceRequestedFeatures &requested_fe LOAD_KERNEL(next_iteration_setup); LOAD_KERNEL(indirect_subsurface); LOAD_KERNEL(buffer_update); + LOAD_KERNEL(adaptive_stopping); + LOAD_KERNEL(adaptive_filter_x); + LOAD_KERNEL(adaptive_filter_y); + LOAD_KERNEL(adaptive_adjust_samples); #undef LOAD_KERNEL @@ -208,6 +220,19 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task, RenderTile subtile = tile; subtile.start_sample = tile.sample; + + if (task->integrator_adaptive) { + int step_samples = subtile.start_sample % 4; + /* Round so that we end up on multiples of four for adaptive sampling. */ + if (step_samples == 3) { + step_samples = 2; + } + else if (step_samples > 4) { + step_samples &= 0xfffffffc; + } + samples_per_second = max(1, step_samples - (subtile.start_sample % 4)); + } + subtile.num_samples = min(samples_per_second, tile.start_sample + tile.num_samples - tile.sample); @@ -302,6 +327,22 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task, } } + if (task->integrator_adaptive && ((tile.sample + subtile.num_samples - 1) & 3) == 3) { + size_t buffer_size[2]; + buffer_size[0] = round_up(tile.w, local_size[0]); + buffer_size[1] = round_up(tile.h, local_size[1]); + kernel_adaptive_stopping->enqueue( + KernelDimensions(buffer_size, local_size), kgbuffer, kernel_data); + buffer_size[0] = round_up(tile.h, local_size[0]); + buffer_size[1] = round_up(1, local_size[1]); + kernel_adaptive_filter_x->enqueue( + KernelDimensions(buffer_size, local_size), kgbuffer, kernel_data); + buffer_size[0] = round_up(tile.w, local_size[0]); + buffer_size[1] = round_up(1, local_size[1]); + kernel_adaptive_filter_y->enqueue( + KernelDimensions(buffer_size, local_size), kgbuffer, kernel_data); + } + double time_per_sample = ((time_dt() - start_time) / subtile.num_samples); if (avg_time_per_sample == 0.0) { @@ -324,6 +365,28 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task, } } + if (task->integrator_adaptive) { + /* Reset the start samples. */ + RenderTile subtile = tile; + subtile.start_sample = tile.start_sample; + subtile.num_samples = tile.sample - tile.start_sample; + enqueue_split_kernel_data_init(KernelDimensions(global_size, local_size), + subtile, + num_global_elements, + kgbuffer, + kernel_data, + split_data, + ray_state, + queue_index, + use_queues_flag, + work_pool_wgs); + size_t buffer_size[2]; + buffer_size[0] = round_up(tile.w, local_size[0]); + buffer_size[1] = round_up(tile.h, local_size[1]); + kernel_adaptive_adjust_samples->enqueue( + KernelDimensions(buffer_size, local_size), kgbuffer, kernel_data); + } + return true; } diff --git a/src/device/device_split_kernel.h b/src/device/device_split_kernel.h index 6ff326bf2..9d6b9efdd 100644 --- a/src/device/device_split_kernel.h +++ b/src/device/device_split_kernel.h @@ -75,6 +75,10 @@ class DeviceSplitKernel { SplitKernelFunction *kernel_next_iteration_setup; SplitKernelFunction *kernel_indirect_subsurface; SplitKernelFunction *kernel_buffer_update; + SplitKernelFunction *kernel_adaptive_stopping; + SplitKernelFunction *kernel_adaptive_filter_x; + SplitKernelFunction *kernel_adaptive_filter_y; + SplitKernelFunction *kernel_adaptive_adjust_samples; /* Global memory variables [porting]; These memory is used for * co-operation between different kernels; Data written by one diff --git a/src/device/device_task.h b/src/device/device_task.h index f45de5564..305177b70 100644 --- a/src/device/device_task.h +++ b/src/device/device_task.h @@ -110,6 +110,7 @@ class DeviceTask : public Task { bool need_finish_queue; bool integrator_branched; + bool integrator_adaptive; int2 requested_tile_size; protected: diff --git a/src/device/opencl/opencl.h b/src/device/opencl/opencl.h index 61b1e3e3b..b761726b1 100644 --- a/src/device/opencl/opencl.h +++ b/src/device/opencl/opencl.h @@ -445,6 +445,7 @@ class OpenCLDevice : public Device { device_ptr rgba_byte, device_ptr rgba_half); void shader(DeviceTask &task); + void update_adaptive(DeviceTask &task, RenderTile &tile, int sample); void denoise(RenderTile &tile, DenoisingTask &denoising); diff --git a/src/device/opencl/opencl_split.cpp b/src/device/opencl/opencl_split.cpp index 76f9ce7a1..f56c2027a 100644 --- a/src/device/opencl/opencl_split.cpp +++ b/src/device/opencl/opencl_split.cpp @@ -56,7 +56,11 @@ static const string SPLIT_BUNDLE_KERNELS = "enqueue_inactive " "next_iteration_setup " "indirect_subsurface " - "buffer_update"; + "buffer_update " + "adaptive_stopping " + "adaptive_filter_x " + "adaptive_filter_y " + "adaptive_adjust_samples"; const string OpenCLDevice::get_opencl_program_name(const string &kernel_name) { @@ -283,6 +287,10 @@ void OpenCLDevice::OpenCLSplitPrograms::load_kernels( ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(next_iteration_setup); ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(indirect_subsurface); ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(buffer_update); + ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(adaptive_stopping); + ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(adaptive_filter_x); + ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(adaptive_filter_y); + ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(adaptive_adjust_samples); programs.push_back(&program_split); # undef ADD_SPLIT_KERNEL_PROGRAM diff --git a/src/kernel/CMakeLists.txt b/src/kernel/CMakeLists.txt index 67fc78ebd..c7ebdcaf9 100644 --- a/src/kernel/CMakeLists.txt +++ b/src/kernel/CMakeLists.txt @@ -36,6 +36,10 @@ set(SRC_CUDA_KERNELS ) set(SRC_OPENCL_KERNELS + kernels/opencl/kernel_adaptive_stopping.cl + kernels/opencl/kernel_adaptive_filter_x.cl + kernels/opencl/kernel_adaptive_filter_y.cl + kernels/opencl/kernel_adaptive_adjust_samples.cl kernels/opencl/kernel_bake.cl kernels/opencl/kernel_base.cl kernels/opencl/kernel_displace.cl @@ -94,6 +98,7 @@ set(SRC_BVH_HEADERS set(SRC_HEADERS kernel_accumulate.h + kernel_adaptive_sampling.h kernel_bake.h kernel_camera.h kernel_color.h @@ -323,6 +328,10 @@ set(SRC_UTIL_HEADERS ) set(SRC_SPLIT_HEADERS + split/kernel_adaptive_adjust_samples.h + split/kernel_adaptive_filter_x.h + split/kernel_adaptive_filter_y.h + split/kernel_adaptive_stopping.h split/kernel_branched.h split/kernel_buffer_update.h split/kernel_data_init.h diff --git a/src/kernel/kernel_adaptive_sampling.h b/src/kernel/kernel_adaptive_sampling.h new file mode 100644 index 000000000..1c1fcc063 --- /dev/null +++ b/src/kernel/kernel_adaptive_sampling.h @@ -0,0 +1,239 @@ +/* + * Copyright 2019 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __KERNEL_ADAPTIVE_SAMPLING_H__ +#define __KERNEL_ADAPTIVE_SAMPLING_H__ + +CCL_NAMESPACE_BEGIN + +/* Determines whether to continue sampling a given pixel or if it has sufficiently converged. */ + +ccl_device void kernel_do_adaptive_stopping(KernelGlobals *kg, + ccl_global float *buffer, + int sample) +{ + /* TODO Stefan: Is this better in linear, sRGB or something else? */ + float4 I = *((ccl_global float4 *)buffer); + float4 A = *(ccl_global float4 *)(buffer + kernel_data.film.pass_adaptive_aux_buffer); + /* The per pixel error as seen in section 2.1 of + * "A hierarchical automatic stopping condition for Monte Carlo global illumination" + * A small epsilon is added to the divisor to prevent division by zero. */ + float error = (fabsf(I.x - A.x) + fabsf(I.y - A.y) + fabsf(I.z - A.z)) / + (sample * 0.0001f + sqrtf(I.x + I.y + I.z)); + if (error < kernel_data.integrator.adaptive_threshold * (float)sample) { + /* Set the fourth component to non-zero value to indicate that this pixel has converged. */ + buffer[kernel_data.film.pass_adaptive_aux_buffer + 3] += 1.0f; + } +} + +/* Adjust the values of an adaptively sampled pixel. */ + +ccl_device void kernel_adaptive_post_adjust(KernelGlobals *kg, + ccl_global float *buffer, + float sample_multiplier) +{ + *(ccl_global float4 *)(buffer) *= sample_multiplier; + + /* Scale the aux pass too, this is necessary for progressive rendering to work properly. */ + kernel_assert(kernel_data.film.pass_adaptive_aux_buffer); + *(ccl_global float4 *)(buffer + kernel_data.film.pass_adaptive_aux_buffer) *= sample_multiplier; + +#ifdef __PASSES__ + int flag = kernel_data.film.pass_flag; + + if (flag & PASSMASK(SHADOW)) + *(ccl_global float3 *)(buffer + kernel_data.film.pass_shadow) *= sample_multiplier; + + if (flag & PASSMASK(MIST)) + *(ccl_global float *)(buffer + kernel_data.film.pass_mist) *= sample_multiplier; + + if (flag & PASSMASK(NORMAL)) + *(ccl_global float3 *)(buffer + kernel_data.film.pass_normal) *= sample_multiplier; + + if (flag & PASSMASK(UV)) + *(ccl_global float3 *)(buffer + kernel_data.film.pass_uv) *= sample_multiplier; + + if (flag & PASSMASK(MOTION)) { + *(ccl_global float4 *)(buffer + kernel_data.film.pass_motion) *= sample_multiplier; + *(ccl_global float *)(buffer + kernel_data.film.pass_motion_weight) *= sample_multiplier; + } + + if (kernel_data.film.use_light_pass) { + int light_flag = kernel_data.film.light_pass_flag; + + if (light_flag & PASSMASK(DIFFUSE_INDIRECT)) + *(ccl_global float3 *)(buffer + kernel_data.film.pass_diffuse_indirect) *= sample_multiplier; + if (light_flag & PASSMASK(GLOSSY_INDIRECT)) + *(ccl_global float3 *)(buffer + kernel_data.film.pass_glossy_indirect) *= sample_multiplier; + if (light_flag & PASSMASK(TRANSMISSION_INDIRECT)) + *(ccl_global float3 *)(buffer + + kernel_data.film.pass_transmission_indirect) *= sample_multiplier; + if (light_flag & PASSMASK(SUBSURFACE_INDIRECT)) + *(ccl_global float3 *)(buffer + + kernel_data.film.pass_subsurface_indirect) *= sample_multiplier; + if (light_flag & PASSMASK(VOLUME_INDIRECT)) + *(ccl_global float3 *)(buffer + kernel_data.film.pass_volume_indirect) *= sample_multiplier; + if (light_flag & PASSMASK(DIFFUSE_DIRECT)) + *(ccl_global float3 *)(buffer + kernel_data.film.pass_diffuse_direct) *= sample_multiplier; + if (light_flag & PASSMASK(GLOSSY_DIRECT)) + *(ccl_global float3 *)(buffer + kernel_data.film.pass_glossy_direct) *= sample_multiplier; + if (light_flag & PASSMASK(TRANSMISSION_DIRECT)) + *(ccl_global float3 *)(buffer + + kernel_data.film.pass_transmission_direct) *= sample_multiplier; + if (light_flag & PASSMASK(SUBSURFACE_DIRECT)) + *(ccl_global float3 *)(buffer + + kernel_data.film.pass_subsurface_direct) *= sample_multiplier; + if (light_flag & PASSMASK(VOLUME_DIRECT)) + *(ccl_global float3 *)(buffer + kernel_data.film.pass_volume_direct) *= sample_multiplier; + + if (light_flag & PASSMASK(EMISSION)) + *(ccl_global float3 *)(buffer + kernel_data.film.pass_emission) *= sample_multiplier; + if (light_flag & PASSMASK(BACKGROUND)) + *(ccl_global float3 *)(buffer + kernel_data.film.pass_background) *= sample_multiplier; + if (light_flag & PASSMASK(AO)) + *(ccl_global float3 *)(buffer + kernel_data.film.pass_ao) *= sample_multiplier; + + if (light_flag & PASSMASK(DIFFUSE_COLOR)) + *(ccl_global float3 *)(buffer + kernel_data.film.pass_diffuse_color) *= sample_multiplier; + if (light_flag & PASSMASK(GLOSSY_COLOR)) + *(ccl_global float3 *)(buffer + kernel_data.film.pass_glossy_color) *= sample_multiplier; + if (light_flag & PASSMASK(TRANSMISSION_COLOR)) + *(ccl_global float3 *)(buffer + + kernel_data.film.pass_transmission_color) *= sample_multiplier; + if (light_flag & PASSMASK(SUBSURFACE_COLOR)) + *(ccl_global float3 *)(buffer + kernel_data.film.pass_subsurface_color) *= sample_multiplier; + } +#endif + +#ifdef __DENOISING_FEATURES__ + +# define scale_float3_variance(buffer, offset, scale) \ + *(buffer + offset) *= scale; \ + *(buffer + offset + 1) *= scale; \ + *(buffer + offset + 2) *= scale; \ + *(buffer + offset + 3) *= scale * scale; \ + *(buffer + offset + 4) *= scale * scale; \ + *(buffer + offset + 5) *= scale * scale; + +# define scale_shadow_variance(buffer, offset, scale) \ + *(buffer + offset) *= scale; \ + *(buffer + offset + 1) *= scale; \ + *(buffer + offset + 2) *= scale * scale; + + if (kernel_data.film.pass_denoising_data) { + scale_shadow_variance( + buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_SHADOW_A, sample_multiplier); + scale_shadow_variance( + buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_SHADOW_B, sample_multiplier); + if (kernel_data.film.pass_denoising_clean) { + scale_float3_variance( + buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR, sample_multiplier); + *(buffer + kernel_data.film.pass_denoising_clean) *= sample_multiplier; + *(buffer + kernel_data.film.pass_denoising_clean + 1) *= sample_multiplier; + *(buffer + kernel_data.film.pass_denoising_clean + 2) *= sample_multiplier; + } + else { + scale_float3_variance( + buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR, sample_multiplier); + } + scale_float3_variance( + buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_NORMAL, sample_multiplier); + scale_float3_variance( + buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_ALBEDO, sample_multiplier); + *(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_DEPTH) *= sample_multiplier; + *(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_DEPTH + + 1) *= sample_multiplier * sample_multiplier; + } +#endif /* __DENOISING_FEATURES__ */ + + if (kernel_data.film.cryptomatte_passes) { + int num_slots = 0; + num_slots += (kernel_data.film.cryptomatte_passes & CRYPT_OBJECT) ? 1 : 0; + num_slots += (kernel_data.film.cryptomatte_passes & CRYPT_MATERIAL) ? 1 : 0; + num_slots += (kernel_data.film.cryptomatte_passes & CRYPT_ASSET) ? 1 : 0; + num_slots = num_slots * 2 * kernel_data.film.cryptomatte_depth; + ccl_global float2 *id_buffer = (ccl_global float2 *)(buffer + + kernel_data.film.pass_cryptomatte); + for (int slot = 0; slot < num_slots; slot++) { + id_buffer[slot].y *= sample_multiplier; + } + } +} + +/* This is a simple box filter in two passes. + * When a pixel demands more adaptive samples, let its neighboring pixels draw more samples too. */ + +ccl_device bool kernel_do_adaptive_filter_x(KernelGlobals *kg, int y, ccl_global WorkTile *tile) +{ + bool any = false; + bool prev = false; + for (int x = tile->x; x < tile->x + tile->w; ++x) { + int index = tile->offset + x + y * tile->stride; + ccl_global float *buffer = tile->buffer + index * kernel_data.film.pass_stride; + ccl_global float4 *aux = (ccl_global float4 *)(buffer + + kernel_data.film.pass_adaptive_aux_buffer); + if (aux->w == 0.0f) { + any = true; + if (x > tile->x && !prev) { + index = index - 1; + buffer = tile->buffer + index * kernel_data.film.pass_stride; + aux = (ccl_global float4 *)(buffer + kernel_data.film.pass_adaptive_aux_buffer); + aux->w = 0.0f; + } + prev = true; + } + else { + if (prev) { + aux->w = 0.0f; + } + prev = false; + } + } + return any; +} + +ccl_device bool kernel_do_adaptive_filter_y(KernelGlobals *kg, int x, ccl_global WorkTile *tile) +{ + bool prev = false; + bool any = false; + for (int y = tile->y; y < tile->y + tile->h; ++y) { + int index = tile->offset + x + y * tile->stride; + ccl_global float *buffer = tile->buffer + index * kernel_data.film.pass_stride; + ccl_global float4 *aux = (ccl_global float4 *)(buffer + + kernel_data.film.pass_adaptive_aux_buffer); + if (aux->w == 0.0f) { + any = true; + if (y > tile->y && !prev) { + index = index - tile->stride; + buffer = tile->buffer + index * kernel_data.film.pass_stride; + aux = (ccl_global float4 *)(buffer + kernel_data.film.pass_adaptive_aux_buffer); + aux->w = 0.0f; + } + prev = true; + } + else { + if (prev) { + aux->w = 0.0f; + } + prev = false; + } + } + return any; +} + +CCL_NAMESPACE_END + +#endif /* __KERNEL_ADAPTIVE_SAMPLING_H__ */ diff --git a/src/kernel/kernel_passes.h b/src/kernel/kernel_passes.h index 828add9dc..9196710a7 100644 --- a/src/kernel/kernel_passes.h +++ b/src/kernel/kernel_passes.h @@ -29,7 +29,9 @@ ccl_device_inline void kernel_write_denoising_shadow(KernelGlobals *kg, if (kernel_data.film.pass_denoising_data == 0) return; - buffer += (sample & 1) ? DENOISING_PASS_SHADOW_B : DENOISING_PASS_SHADOW_A; + buffer += sample_is_even(kernel_data.integrator.sampling_pattern, sample) ? + DENOISING_PASS_SHADOW_B : + DENOISING_PASS_SHADOW_A; path_total = ensure_finite(path_total); path_total_shaded = ensure_finite(path_total_shaded); @@ -378,6 +380,38 @@ ccl_device_inline void kernel_write_result(KernelGlobals *kg, #ifdef __KERNEL_DEBUG__ kernel_write_debug_passes(kg, buffer, L); #endif + + /* Adaptive Sampling. Fill the additional buffer with the odd samples and calculate our stopping criteria. + This is the heuristic from "A hierarchical automatic stopping condition for Monte Carlo global illumination" + except that here it is applied per pixel and not in hierarchical tiles. */ + if (kernel_data.film.pass_adaptive_aux_buffer && + kernel_data.integrator.adaptive_threshold > 0.0f) { + if (sample_is_even(kernel_data.integrator.sampling_pattern, sample)) { + kernel_write_pass_float4(buffer + kernel_data.film.pass_adaptive_aux_buffer, + make_float4(L_sum.x * 2.0f, L_sum.y * 2.0f, L_sum.z * 2.0f, 0.0f)); + } +#ifdef __KERNEL_CPU__ + if (sample >= kernel_data.integrator.adaptive_min_samples - 1 && (sample & 0x3) == 3) { + kernel_do_adaptive_stopping(kg, buffer, sample); + } +#endif + } + + /* Write the sample count as negative numbers initially to mark the samples as in progress. + * Once the tile has finished rendering, the sign gets flipped and all the pixel values + * are scaled as if they were taken at a uniform sample count. */ + if (kernel_data.film.pass_sample_count) { + /* Make sure it's a negative number. In progressive refine mode, this bit gets flipped between passes. */ +#ifdef __ATOMIC_PASS_WRITE__ + atomic_fetch_and_or_uint32((ccl_global uint *)(buffer + kernel_data.film.pass_sample_count), + 0x80000000); +#else + if (buffer[kernel_data.film.pass_sample_count] > 0) { + buffer[kernel_data.film.pass_sample_count] *= -1.0f; + } +#endif + kernel_write_pass_float(buffer + kernel_data.film.pass_sample_count, -1.0f); + } } CCL_NAMESPACE_END diff --git a/src/kernel/kernel_path.h b/src/kernel/kernel_path.h index 1a0b67275..bdd2703a8 100644 --- a/src/kernel/kernel_path.h +++ b/src/kernel/kernel_path.h @@ -31,6 +31,7 @@ #include "kernel/kernel_accumulate.h" #include "kernel/kernel_shader.h" #include "kernel/kernel_light.h" +#include "kernel/kernel_adaptive_sampling.h" #include "kernel/kernel_passes.h" #if defined(__VOLUME__) || defined(__SUBSURFACE__) @@ -656,6 +657,14 @@ ccl_device void kernel_path_trace( buffer += index * pass_stride; + if (kernel_data.film.pass_adaptive_aux_buffer) { + ccl_global float4 *aux = (ccl_global float4 *)(buffer + + kernel_data.film.pass_adaptive_aux_buffer); + if (aux->w > 0.0f) { + return; + } + } + /* Initialize random numbers and sample ray. */ uint rng_hash; Ray ray; diff --git a/src/kernel/kernel_path_branched.h b/src/kernel/kernel_path_branched.h index f75e4ab4c..0d5781fe3 100644 --- a/src/kernel/kernel_path_branched.h +++ b/src/kernel/kernel_path_branched.h @@ -523,6 +523,14 @@ ccl_device void kernel_branched_path_trace( buffer += index * pass_stride; + if (kernel_data.film.pass_adaptive_aux_buffer) { + ccl_global float4 *aux = (ccl_global float4 *)(buffer + + kernel_data.film.pass_adaptive_aux_buffer); + if (aux->w > 0.0f) { + return; + } + } + /* initialize random numbers and ray */ uint rng_hash; Ray ray; diff --git a/src/kernel/kernel_random.h b/src/kernel/kernel_random.h index 80738213d..b2523e3c6 100644 --- a/src/kernel/kernel_random.h +++ b/src/kernel/kernel_random.h @@ -43,20 +43,34 @@ ccl_device uint sobol_dimension(KernelGlobals *kg, int index, int dimension) uint i = index + SOBOL_SKIP; for (int j = 0, x; (x = find_first_set(i)); i >>= x) { j += x; - result ^= kernel_tex_fetch(__sobol_directions, 32 * dimension + j - 1); + result ^= kernel_tex_fetch(__sample_pattern_lut, 32 * dimension + j - 1); } return result; } #endif /* __SOBOL__ */ +#define NUM_PJ_SAMPLES 64 * 64 +#define NUM_PJ_PATTERNS 48 + ccl_device_forceinline float path_rng_1D( KernelGlobals *kg, uint rng_hash, int sample, int num_samples, int dimension) { #ifdef __DEBUG_CORRELATION__ return (float)drand48(); #endif - + if (kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_PMJ) { + /* Fallback to random */ + if (sample > NUM_PJ_SAMPLES) { + int p = rng_hash + dimension; + return cmj_randfloat(sample, p); + } + uint tmp_rng = cmj_hash_simple(dimension, rng_hash); + int index = ((dimension % NUM_PJ_PATTERNS) * NUM_PJ_SAMPLES + sample) * 2; + return __uint_as_float(kernel_tex_fetch(__sample_pattern_lut, index) ^ + (tmp_rng & 0x007fffff)) - + 1.0f; + } #ifdef __CMJ__ # ifdef __SOBOL__ if (kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) @@ -99,7 +113,22 @@ ccl_device_forceinline void path_rng_2D(KernelGlobals *kg, *fy = (float)drand48(); return; #endif - + if (kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_PMJ) { + if (sample > NUM_PJ_SAMPLES) { + int p = rng_hash + dimension; + *fx = cmj_randfloat(sample, p); + *fy = cmj_randfloat(sample, p + 1); + } + uint tmp_rng = cmj_hash_simple(dimension, rng_hash); + int index = ((dimension % NUM_PJ_PATTERNS) * NUM_PJ_SAMPLES + sample) * 2; + *fx = __uint_as_float(kernel_tex_fetch(__sample_pattern_lut, index) ^ (tmp_rng & 0x007fffff)) - + 1.0f; + tmp_rng = cmj_hash_simple(dimension + 1, rng_hash); + *fy = __uint_as_float(kernel_tex_fetch(__sample_pattern_lut, index + 1) ^ + (tmp_rng & 0x007fffff)) - + 1.0f; + return; + } #ifdef __CMJ__ # ifdef __SOBOL__ if (kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) @@ -284,4 +313,23 @@ ccl_device float lcg_step_float_addrspace(ccl_addr_space uint *rng) return (float)*rng * (1.0f / (float)0xFFFFFFFF); } +ccl_device_inline bool sample_is_even(int pattern, int sample) +{ + if (pattern == SAMPLING_PATTERN_PMJ) { + /* See Section 10.2.1, "Progressive Multi-Jittered Sample Sequences", Christensen et al. + * We can use this to get divide sample sequence into two classes for easier variance estimation. + * There must be a more elegant way of writing this? */ + return (bool)(sample & 2) ^ (bool)(sample & 8) ^ (bool)(sample & 0x20) ^ + (bool)(sample & 0x80) ^ (bool)(sample & 0x200) ^ (bool)(sample & 0x800) ^ + (bool)(sample & 0x2000) ^ (bool)(sample & 0x8000) ^ (bool)(sample & 0x20000) ^ + (bool)(sample & 0x80000) ^ (bool)(sample & 0x200000) ^ (bool)(sample & 0x800000) ^ + (bool)(sample & 0x2000000) ^ (bool)(sample & 0x8000000) ^ (bool)(sample & 0x20000000) ^ + (bool)(sample & 0x80000000); + } + else { + /* TODO: Are there reliable ways of dividing CMJ and Sobol into two classes? */ + return sample & 0x1; + } +} + CCL_NAMESPACE_END diff --git a/src/kernel/kernel_textures.h b/src/kernel/kernel_textures.h index 9eaa6b551..1cae34348 100644 --- a/src/kernel/kernel_textures.h +++ b/src/kernel/kernel_textures.h @@ -77,7 +77,7 @@ KERNEL_TEX(KernelShader, __shaders) KERNEL_TEX(float, __lookup_table) /* sobol */ -KERNEL_TEX(uint, __sobol_directions) +KERNEL_TEX(uint, __sample_pattern_lut) /* image textures */ KERNEL_TEX(TextureInfo, __texture_info) diff --git a/src/kernel/kernel_types.h b/src/kernel/kernel_types.h index c35e34576..ec19031e8 100644 --- a/src/kernel/kernel_types.h +++ b/src/kernel/kernel_types.h @@ -269,6 +269,7 @@ enum PathTraceDimension { enum SamplingPattern { SAMPLING_PATTERN_SOBOL = 0, SAMPLING_PATTERN_CMJ = 1, + SAMPLING_PATTERN_PMJ = 2, SAMPLING_NUM_PATTERNS, }; @@ -375,6 +376,8 @@ typedef enum PassType { PASS_CRYPTOMATTE, PASS_AOV_COLOR, PASS_AOV_VALUE, + PASS_ADAPTIVE_AUX_BUFFER, + PASS_SAMPLE_COUNT, PASS_CATEGORY_MAIN_END = 31, PASS_MIST = 32, @@ -1238,6 +1241,9 @@ typedef struct KernelFilm { int cryptomatte_passes; int cryptomatte_depth; int pass_cryptomatte; + + int pass_adaptive_aux_buffer; + int pass_sample_count; int pass_mist; float mist_start; @@ -1258,7 +1264,7 @@ typedef struct KernelFilm { float4 xyz_to_r; float4 xyz_to_g; float4 xyz_to_b; - float4 rgb_to_y; + float4 rgb_to_y; //50 #ifdef __KERNEL_DEBUG__ int pass_bvh_traversed_nodes; @@ -1273,6 +1279,8 @@ typedef struct KernelFilm { int display_divide_pass_stride; int use_display_exposure; int use_display_pass_alpha; + + int pad1, pad2; } KernelFilm; static_assert_align(KernelFilm, 16); @@ -1354,6 +1362,8 @@ typedef struct KernelIntegrator { /* sampler */ int sampling_pattern; int aa_samples; + int adaptive_min_samples; + float adaptive_threshold; /* volume render */ int use_volumes; @@ -1365,7 +1375,7 @@ typedef struct KernelIntegrator { int max_closures; - int pad1; + int pad1, pad2, pad3; } KernelIntegrator; static_assert_align(KernelIntegrator, 16); diff --git a/src/kernel/kernel_work_stealing.h b/src/kernel/kernel_work_stealing.h index 799561a74..c642d227e 100644 --- a/src/kernel/kernel_work_stealing.h +++ b/src/kernel/kernel_work_stealing.h @@ -23,17 +23,41 @@ CCL_NAMESPACE_BEGIN * Utility functions for work stealing */ +/* Map global work index to tile, pixel X/Y and sample. */ +ccl_device_inline void get_work_pixel(ccl_global const WorkTile *tile, + uint global_work_index, + ccl_private uint *x, + ccl_private uint *y, + ccl_private uint *sample) +{ +#ifdef __KERNEL_CUDA__ + /* Keeping threads for the same pixel together improves performance on CUDA. */ + uint sample_offset = global_work_index % tile->num_samples; + uint pixel_offset = global_work_index / tile->num_samples; +#else /* __KERNEL_CUDA__ */ + uint tile_pixels = tile->w * tile->h; + uint sample_offset = global_work_index / tile_pixels; + uint pixel_offset = global_work_index - sample_offset * tile_pixels; +#endif /* __KERNEL_CUDA__ */ + uint y_offset = pixel_offset / tile->w; + uint x_offset = pixel_offset - y_offset * tile->w; + + *x = tile->x + x_offset; + *y = tile->y + y_offset; + *sample = tile->start_sample + sample_offset; +} + #ifdef __KERNEL_OPENCL__ # pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable #endif #ifdef __SPLIT_KERNEL__ /* Returns true if there is work */ -ccl_device bool get_next_work(KernelGlobals *kg, - ccl_global uint *work_pools, - uint total_work_size, - uint ray_index, - ccl_private uint *global_work_index) +ccl_device bool get_next_work_item(KernelGlobals *kg, + ccl_global uint *work_pools, + uint total_work_size, + uint ray_index, + ccl_private uint *global_work_index) { /* With a small amount of work there may be more threads than work due to * rounding up of global size, stop such threads immediately. */ @@ -56,31 +80,37 @@ ccl_device bool get_next_work(KernelGlobals *kg, /* Test if all work for this pool is done. */ return (*global_work_index < total_work_size); } -#endif -/* Map global work index to tile, pixel X/Y and sample. */ -ccl_device_inline void get_work_pixel(ccl_global const WorkTile *tile, - uint global_work_index, - ccl_private uint *x, - ccl_private uint *y, - ccl_private uint *sample) +ccl_device bool get_next_work(KernelGlobals *kg, + ccl_global uint *work_pools, + uint total_work_size, + uint ray_index, + ccl_private uint *global_work_index) { -#ifdef __KERNEL_CUDA__ - /* Keeping threads for the same pixel together improves performance on CUDA. */ - uint sample_offset = global_work_index % tile->num_samples; - uint pixel_offset = global_work_index / tile->num_samples; -#else /* __KERNEL_CUDA__ */ - uint tile_pixels = tile->w * tile->h; - uint sample_offset = global_work_index / tile_pixels; - uint pixel_offset = global_work_index - sample_offset * tile_pixels; -#endif /* __KERNEL_CUDA__ */ - uint y_offset = pixel_offset / tile->w; - uint x_offset = pixel_offset - y_offset * tile->w; - - *x = tile->x + x_offset; - *y = tile->y + y_offset; - *sample = tile->start_sample + sample_offset; + bool got_work = false; + if (kernel_data.film.pass_adaptive_aux_buffer) { + do { + got_work = get_next_work_item(kg, work_pools, total_work_size, ray_index, global_work_index); + if (got_work) { + ccl_global WorkTile *tile = &kernel_split_params.tile; + uint x, y, sample; + get_work_pixel(tile, *global_work_index, &x, &y, &sample); + uint buffer_offset = (tile->offset + x + y * tile->stride) * kernel_data.film.pass_stride; + ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset; + ccl_global float4 *aux = (ccl_global float4 *)(buffer + + kernel_data.film.pass_adaptive_aux_buffer); + if (aux->w == 0.0f) { + break; + } + } + } while (got_work); + } + else { + got_work = get_next_work_item(kg, work_pools, total_work_size, ray_index, global_work_index); + } + return got_work; } +#endif CCL_NAMESPACE_END diff --git a/src/kernel/kernels/cpu/kernel_cpu.h b/src/kernel/kernels/cpu/kernel_cpu.h index f5d981fb7..683f4b88d 100644 --- a/src/kernel/kernels/cpu/kernel_cpu.h +++ b/src/kernel/kernels/cpu/kernel_cpu.h @@ -89,5 +89,9 @@ DECLARE_SPLIT_KERNEL_FUNCTION(enqueue_inactive) DECLARE_SPLIT_KERNEL_FUNCTION(next_iteration_setup) DECLARE_SPLIT_KERNEL_FUNCTION(indirect_subsurface) DECLARE_SPLIT_KERNEL_FUNCTION(buffer_update) +DECLARE_SPLIT_KERNEL_FUNCTION(adaptive_stopping) +DECLARE_SPLIT_KERNEL_FUNCTION(adaptive_filter_x) +DECLARE_SPLIT_KERNEL_FUNCTION(adaptive_filter_y) +DECLARE_SPLIT_KERNEL_FUNCTION(adaptive_adjust_samples) #undef KERNEL_ARCH diff --git a/src/kernel/kernels/cpu/kernel_cpu_impl.h b/src/kernel/kernels/cpu/kernel_cpu_impl.h index 9ca3f46b5..96b2bf111 100644 --- a/src/kernel/kernels/cpu/kernel_cpu_impl.h +++ b/src/kernel/kernels/cpu/kernel_cpu_impl.h @@ -58,6 +58,10 @@ # include "kernel/split/kernel_next_iteration_setup.h" # include "kernel/split/kernel_indirect_subsurface.h" # include "kernel/split/kernel_buffer_update.h" +# include "kernel/split/kernel_adaptive_stopping.h" +# include "kernel/split/kernel_adaptive_filter_x.h" +# include "kernel/split/kernel_adaptive_filter_y.h" +# include "kernel/split/kernel_adaptive_adjust_samples.h" # endif /* __SPLIT_KERNEL__ */ #else # define STUB_ASSERT(arch, name) \ @@ -204,6 +208,10 @@ DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(enqueue_inactive, uint) DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(next_iteration_setup, uint) DEFINE_SPLIT_KERNEL_FUNCTION(indirect_subsurface) DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(buffer_update, uint) +DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_stopping) +DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_filter_x) +DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_filter_y) +DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_adjust_samples) #endif /* __SPLIT_KERNEL__ */ #undef KERNEL_STUB diff --git a/src/kernel/kernels/cuda/kernel.cu b/src/kernel/kernels/cuda/kernel.cu index af311027f..4f3a70a51 100644 --- a/src/kernel/kernels/cuda/kernel.cu +++ b/src/kernel/kernels/cuda/kernel.cu @@ -33,6 +33,7 @@ #include "kernel/kernel_path_branched.h" #include "kernel/kernel_bake.h" #include "kernel/kernel_work_stealing.h" +#include "kernel/kernel_adaptive_sampling.h" /* kernels */ extern "C" __global__ void @@ -81,6 +82,75 @@ kernel_cuda_branched_path_trace(WorkTile *tile, uint total_work_size) } #endif +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_adaptive_stopping(WorkTile *tile, int sample, uint total_work_size) +{ + int work_index = ccl_global_id(0); + bool thread_is_active = work_index < total_work_size; + KernelGlobals kg; + if(thread_is_active && kernel_data.film.pass_adaptive_aux_buffer) { + uint x = tile->x + work_index % tile->w; + uint y = tile->y + work_index / tile->w; + int index = tile->offset + x + y * tile->stride; + ccl_global float *buffer = tile->buffer + index * kernel_data.film.pass_stride; + kernel_do_adaptive_stopping(&kg, buffer, sample); + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_adaptive_filter_x(WorkTile *tile, int sample, uint) +{ + KernelGlobals kg; + if(kernel_data.film.pass_adaptive_aux_buffer && sample > kernel_data.integrator.adaptive_min_samples) { + if(ccl_global_id(0) < tile->h) { + int y = tile->y + ccl_global_id(0); + kernel_do_adaptive_filter_x(&kg, y, tile); + } + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_adaptive_filter_y(WorkTile *tile, int sample, uint) +{ + KernelGlobals kg; + if(kernel_data.film.pass_adaptive_aux_buffer && sample >= kernel_data.integrator.adaptive_min_samples - 1) { + if(ccl_global_id(0) < tile->w) { + int x = tile->x + ccl_global_id(0); + kernel_do_adaptive_filter_y(&kg, x, tile); + } + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_adaptive_scale_samples(WorkTile *tile, int start_sample, int sample, uint total_work_size) +{ + if(kernel_data.film.pass_adaptive_aux_buffer) { + int work_index = ccl_global_id(0); + bool thread_is_active = work_index < total_work_size; + KernelGlobals kg; + if(thread_is_active) { + uint x = tile->x + work_index % tile->w; + uint y = tile->y + work_index / tile->w; + int index = tile->offset + x + y * tile->stride; + ccl_global float *buffer = tile->buffer + index * kernel_data.film.pass_stride; + if(buffer[kernel_data.film.pass_sample_count] < 0.0f) { + buffer[kernel_data.film.pass_sample_count] = -buffer[kernel_data.film.pass_sample_count]; + float sample_multiplier = sample / max((float)start_sample + 1.0f, buffer[kernel_data.film.pass_sample_count]); + if(sample_multiplier != 1.0f) { + kernel_adaptive_post_adjust(&kg, buffer, sample_multiplier); + } + } + else { + kernel_adaptive_post_adjust(&kg, buffer, sample / (sample - 1.0f)); + } + } + } +} + extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) kernel_cuda_convert_to_byte(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride) diff --git a/src/kernel/kernels/cuda/kernel_split.cu b/src/kernel/kernels/cuda/kernel_split.cu index 43b3d0aa0..95ad7599c 100644 --- a/src/kernel/kernels/cuda/kernel_split.cu +++ b/src/kernel/kernels/cuda/kernel_split.cu @@ -43,6 +43,10 @@ #include "kernel/split/kernel_next_iteration_setup.h" #include "kernel/split/kernel_indirect_subsurface.h" #include "kernel/split/kernel_buffer_update.h" +#include "kernel/split/kernel_adaptive_stopping.h" +#include "kernel/split/kernel_adaptive_filter_x.h" +#include "kernel/split/kernel_adaptive_filter_y.h" +#include "kernel/split/kernel_adaptive_adjust_samples.h" #include "kernel/kernel_film.h" @@ -121,6 +125,10 @@ DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(enqueue_inactive, uint) DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(next_iteration_setup, uint) DEFINE_SPLIT_KERNEL_FUNCTION(indirect_subsurface) DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(buffer_update, uint) +DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_stopping) +DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_filter_x) +DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_filter_y) +DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_adjust_samples) extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) diff --git a/src/kernel/kernels/opencl/kernel_adaptive_adjust_samples.cl b/src/kernel/kernels/opencl/kernel_adaptive_adjust_samples.cl new file mode 100644 index 000000000..ebdb99d47 --- /dev/null +++ b/src/kernel/kernels/opencl/kernel_adaptive_adjust_samples.cl @@ -0,0 +1,23 @@ +/* + * Copyright 2019 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernel/kernel_compat_opencl.h" +#include "kernel/split/kernel_split_common.h" +#include "kernel/split/kernel_adaptive_adjust_samples.h" + +#define KERNEL_NAME adaptive_adjust_samples +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME diff --git a/src/kernel/kernels/opencl/kernel_adaptive_filter_x.cl b/src/kernel/kernels/opencl/kernel_adaptive_filter_x.cl new file mode 100644 index 000000000..76d82d418 --- /dev/null +++ b/src/kernel/kernels/opencl/kernel_adaptive_filter_x.cl @@ -0,0 +1,23 @@ +/* + * Copyright 2019 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernel/kernel_compat_opencl.h" +#include "kernel/split/kernel_split_common.h" +#include "kernel/split/kernel_adaptive_filter_x.h" + +#define KERNEL_NAME adaptive_filter_x +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME diff --git a/src/kernel/kernels/opencl/kernel_adaptive_filter_y.cl b/src/kernel/kernels/opencl/kernel_adaptive_filter_y.cl new file mode 100644 index 000000000..1e6d15ba0 --- /dev/null +++ b/src/kernel/kernels/opencl/kernel_adaptive_filter_y.cl @@ -0,0 +1,23 @@ +/* + * Copyright 2019 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernel/kernel_compat_opencl.h" +#include "kernel/split/kernel_split_common.h" +#include "kernel/split/kernel_adaptive_filter_y.h" + +#define KERNEL_NAME adaptive_filter_y +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME diff --git a/src/kernel/kernels/opencl/kernel_adaptive_stopping.cl b/src/kernel/kernels/opencl/kernel_adaptive_stopping.cl new file mode 100644 index 000000000..51de00596 --- /dev/null +++ b/src/kernel/kernels/opencl/kernel_adaptive_stopping.cl @@ -0,0 +1,23 @@ +/* + * Copyright 2019 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernel/kernel_compat_opencl.h" +#include "kernel/split/kernel_split_common.h" +#include "kernel/split/kernel_adaptive_stopping.h" + +#define KERNEL_NAME adaptive_stopping +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME diff --git a/src/kernel/kernels/opencl/kernel_split_bundle.cl b/src/kernel/kernels/opencl/kernel_split_bundle.cl index 6041f13b5..c3b7b0946 100644 --- a/src/kernel/kernels/opencl/kernel_split_bundle.cl +++ b/src/kernel/kernels/opencl/kernel_split_bundle.cl @@ -28,3 +28,7 @@ #include "kernel/kernels/opencl/kernel_next_iteration_setup.cl" #include "kernel/kernels/opencl/kernel_indirect_subsurface.cl" #include "kernel/kernels/opencl/kernel_buffer_update.cl" +#include "kernel/kernels/opencl/kernel_adaptive_stopping.cl" +#include "kernel/kernels/opencl/kernel_adaptive_filter_x.cl" +#include "kernel/kernels/opencl/kernel_adaptive_filter_y.cl" +#include "kernel/kernels/opencl/kernel_adaptive_adjust_samples.cl" diff --git a/src/kernel/split/kernel_adaptive_adjust_samples.h b/src/kernel/split/kernel_adaptive_adjust_samples.h new file mode 100644 index 000000000..60ebf4159 --- /dev/null +++ b/src/kernel/split/kernel_adaptive_adjust_samples.h @@ -0,0 +1,44 @@ +/* + * Copyright 2019 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +ccl_device void kernel_adaptive_adjust_samples(KernelGlobals *kg) +{ + int pixel_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + if (pixel_index < kernel_split_params.tile.w * kernel_split_params.tile.h) { + int x = kernel_split_params.tile.x + pixel_index % kernel_split_params.tile.w; + int y = kernel_split_params.tile.y + pixel_index / kernel_split_params.tile.w; + int buffer_offset = (kernel_split_params.tile.offset + x + + y * kernel_split_params.tile.stride) * + kernel_data.film.pass_stride; + ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset; + int sample = kernel_split_params.tile.start_sample + kernel_split_params.tile.num_samples; + if (buffer[kernel_data.film.pass_sample_count] < 0.0f) { + buffer[kernel_data.film.pass_sample_count] = -buffer[kernel_data.film.pass_sample_count]; + float sample_multiplier = sample / max((float)kernel_split_params.tile.start_sample + 1.0f, + buffer[kernel_data.film.pass_sample_count]); + if (sample_multiplier != 1.0f) { + kernel_adaptive_post_adjust(kg, buffer, sample_multiplier); + } + } + else { + kernel_adaptive_post_adjust(kg, buffer, sample / (sample - 1.0f)); + } + } +} + +CCL_NAMESPACE_END diff --git a/src/kernel/split/kernel_adaptive_filter_x.h b/src/kernel/split/kernel_adaptive_filter_x.h new file mode 100644 index 000000000..93f41f7ce --- /dev/null +++ b/src/kernel/split/kernel_adaptive_filter_x.h @@ -0,0 +1,30 @@ +/* + * Copyright 2019 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +ccl_device void kernel_adaptive_filter_x(KernelGlobals *kg) +{ + int pixel_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + if (pixel_index < kernel_split_params.tile.h && + kernel_split_params.tile.start_sample + kernel_split_params.tile.num_samples >= + kernel_data.integrator.adaptive_min_samples) { + int y = kernel_split_params.tile.y + pixel_index; + kernel_do_adaptive_filter_x(kg, y, &kernel_split_params.tile); + } +} + +CCL_NAMESPACE_END diff --git a/src/kernel/split/kernel_adaptive_filter_y.h b/src/kernel/split/kernel_adaptive_filter_y.h new file mode 100644 index 000000000..eca53d079 --- /dev/null +++ b/src/kernel/split/kernel_adaptive_filter_y.h @@ -0,0 +1,29 @@ +/* + * Copyright 2019 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +ccl_device void kernel_adaptive_filter_y(KernelGlobals *kg) +{ + int pixel_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + if (pixel_index < kernel_split_params.tile.w && + kernel_split_params.tile.start_sample + kernel_split_params.tile.num_samples >= + kernel_data.integrator.adaptive_min_samples) { + int x = kernel_split_params.tile.x + pixel_index; + kernel_do_adaptive_filter_y(kg, x, &kernel_split_params.tile); + } +} +CCL_NAMESPACE_END diff --git a/src/kernel/split/kernel_adaptive_stopping.h b/src/kernel/split/kernel_adaptive_stopping.h new file mode 100644 index 000000000..c8eb1ebd7 --- /dev/null +++ b/src/kernel/split/kernel_adaptive_stopping.h @@ -0,0 +1,37 @@ +/* + * Copyright 2019 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +ccl_device void kernel_adaptive_stopping(KernelGlobals *kg) +{ + int pixel_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + if (pixel_index < kernel_split_params.tile.w * kernel_split_params.tile.h && + kernel_split_params.tile.start_sample + kernel_split_params.tile.num_samples >= + kernel_data.integrator.adaptive_min_samples) { + int x = kernel_split_params.tile.x + pixel_index % kernel_split_params.tile.w; + int y = kernel_split_params.tile.y + pixel_index / kernel_split_params.tile.w; + int buffer_offset = (kernel_split_params.tile.offset + x + + y * kernel_split_params.tile.stride) * + kernel_data.film.pass_stride; + ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset; + kernel_do_adaptive_stopping(kg, + buffer, + kernel_split_params.tile.start_sample + + kernel_split_params.tile.num_samples - 1); + } +} +CCL_NAMESPACE_END diff --git a/src/render/CMakeLists.txt b/src/render/CMakeLists.txt index 53196b013..ede157e50 100644 --- a/src/render/CMakeLists.txt +++ b/src/render/CMakeLists.txt @@ -22,6 +22,7 @@ set(SRC graph.cpp image.cpp integrator.cpp + jitter.cpp light.cpp merge.cpp mesh.cpp @@ -58,6 +59,7 @@ set(SRC_HEADERS image.h integrator.h light.h + jitter.h merge.h mesh.h nodes.h diff --git a/src/render/buffers.cpp b/src/render/buffers.cpp index fe8606e19..46e8a9f08 100644 --- a/src/render/buffers.cpp +++ b/src/render/buffers.cpp @@ -240,6 +240,22 @@ bool RenderBuffers::get_pass_rect( return false; } + float *sample_count = NULL; + if (type == PassType::PASS_COMBINED) { + int sample_offset = 0; + for (size_t j = 0; j < params.passes.size(); j++) { + Pass &pass = params.passes[j]; + if (pass.type != PASS_SAMPLE_COUNT) { + sample_offset += pass.components; + continue; + } + else { + sample_count = buffer.data() + sample_offset; + break; + } + } + } + int pass_offset = 0; for (size_t j = 0; j < params.passes.size(); j++) { @@ -400,6 +416,11 @@ bool RenderBuffers::get_pass_rect( } else { for (int i = 0; i < size; i++, in += pass_stride, pixels += 4) { + if (sample_count && sample_count[i * pass_stride] < 0.0f) { + scale = (pass.filter) ? -1.0f / (sample_count[i * pass_stride]) : 1.0f; + scale_exposure = (pass.exposure) ? scale * exposure : scale; + } + float4 f = make_float4(in[0], in[1], in[2], in[3]); pixels[0] = f.x * scale_exposure; diff --git a/src/render/film.cpp b/src/render/film.cpp index 3cd7936ae..c6f87a2dc 100644 --- a/src/render/film.cpp +++ b/src/render/film.cpp @@ -168,6 +168,12 @@ void Pass::add(PassType type, vector &passes, const char *name) break; case PASS_AOV_VALUE: pass.components = 1; + case PASS_ADAPTIVE_AUX_BUFFER: + pass.components = 4; + break; + case PASS_SAMPLE_COUNT: + pass.components = 1; + pass.exposure = false; break; default: assert(false); @@ -291,6 +297,7 @@ NODE_DEFINE(Film) SOCKET_BOOLEAN(denoising_clean_pass, "Generate Denoising Clean Pass", false); SOCKET_BOOLEAN(denoising_prefiltered_pass, "Generate Denoising Prefiltered Pass", false); SOCKET_INT(denoising_flags, "Denoising Flags", 0); + SOCKET_BOOLEAN(use_adaptive_sampling, "Use Adaptive Sampling", false); return type; } @@ -479,6 +486,11 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene) kfilm->pass_aov_value = kfilm->pass_stride; have_aov_value = true; } + case PASS_ADAPTIVE_AUX_BUFFER: + kfilm->pass_adaptive_aux_buffer = kfilm->pass_stride; + break; + case PASS_SAMPLE_COUNT: + kfilm->pass_sample_count = kfilm->pass_stride; break; default: assert(false); diff --git a/src/render/film.h b/src/render/film.h index 95e54cb54..aae8fb404 100644 --- a/src/render/film.h +++ b/src/render/film.h @@ -81,6 +81,8 @@ class Film : public Node { CryptomatteType cryptomatte_passes; int cryptomatte_depth; + bool use_adaptive_sampling; + bool need_update; Film(); diff --git a/src/render/integrator.cpp b/src/render/integrator.cpp index 530c32106..e69a2eba0 100644 --- a/src/render/integrator.cpp +++ b/src/render/integrator.cpp @@ -18,12 +18,14 @@ #include "render/background.h" #include "render/integrator.h" #include "render/film.h" +#include "render/jitter.h" #include "render/light.h" #include "render/scene.h" #include "render/shader.h" #include "render/sobol.h" #include "util/util_foreach.h" +#include "util/util_logging.h" #include "util/util_hash.h" CCL_NAMESPACE_BEGIN @@ -66,6 +68,9 @@ NODE_DEFINE(Integrator) SOCKET_INT(volume_samples, "Volume Samples", 1); SOCKET_INT(start_sample, "Start Sample", 0); + SOCKET_FLOAT(adaptive_threshold, "Adaptive Threshold", 0.0f); + SOCKET_INT(adaptive_min_samples, "Adaptive Min Samples", 0); + SOCKET_BOOLEAN(sample_all_lights_direct, "Sample All Lights Direct", true); SOCKET_BOOLEAN(sample_all_lights_indirect, "Sample All Lights Indirect", true); SOCKET_FLOAT(light_sampling_threshold, "Light Sampling Threshold", 0.05f); @@ -78,6 +83,7 @@ NODE_DEFINE(Integrator) static NodeEnum sampling_pattern_enum; sampling_pattern_enum.insert("sobol", SAMPLING_PATTERN_SOBOL); sampling_pattern_enum.insert("cmj", SAMPLING_PATTERN_CMJ); + sampling_pattern_enum.insert("pmj", SAMPLING_PATTERN_PMJ); SOCKET_ENUM(sampling_pattern, "Sampling Pattern", sampling_pattern_enum, SAMPLING_PATTERN_SOBOL); return type; @@ -174,6 +180,22 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene kintegrator->sampling_pattern = sampling_pattern; kintegrator->aa_samples = aa_samples; + if (aa_samples > 0 && adaptive_min_samples == 0) { + kintegrator->adaptive_min_samples = max(4, (int)sqrtf(aa_samples)); + VLOG(1) << "Cycles adaptive sampling: automatic min samples = " + << kintegrator->adaptive_min_samples; + } + else { + kintegrator->adaptive_min_samples = max(4, adaptive_min_samples); + } + if (aa_samples > 0 && adaptive_threshold == 0.0f) { + kintegrator->adaptive_threshold = max(0.001f, 1.0f / (float)aa_samples); + VLOG(1) << "Cycles adaptive sampling: automatic threshold = " + << kintegrator->adaptive_threshold; + } + else { + kintegrator->adaptive_threshold = adaptive_threshold; + } if (light_sampling_threshold > 0.0f) { kintegrator->light_inv_rr_threshold = 1.0f / light_sampling_threshold; @@ -203,18 +225,34 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene int dimensions = PRNG_BASE_NUM + max_samples * PRNG_BOUNCE_NUM; dimensions = min(dimensions, SOBOL_MAX_DIMENSIONS); - uint *directions = dscene->sobol_directions.alloc(SOBOL_BITS * dimensions); + if (sampling_pattern == SAMPLING_PATTERN_SOBOL) { + uint *directions = dscene->sample_pattern_lut.alloc(SOBOL_BITS * dimensions); - sobol_generate_direction_vectors((uint(*)[SOBOL_BITS])directions, dimensions); + sobol_generate_direction_vectors((uint(*)[SOBOL_BITS])directions, dimensions); - dscene->sobol_directions.copy_to_device(); + dscene->sample_pattern_lut.copy_to_device(); + } + else { + constexpr int sequence_size = 64 * 64; + constexpr int num_sequences = 48; + float2 *directions = (float2 *)dscene->sample_pattern_lut.alloc(sequence_size * num_sequences * + 2); + TaskPool pool; + for (int j = 0; j < num_sequences; ++j) { + float2 *sequence = directions + j * sequence_size; + pool.push( + function_bind(&progressive_multi_jitter_02_generate_2D, sequence, sequence_size, j)); + } + pool.wait_work(); + dscene->sample_pattern_lut.copy_to_device(); + } need_update = false; } void Integrator::device_free(Device *, DeviceScene *dscene) { - dscene->sobol_directions.free(); + dscene->sample_pattern_lut.free(); } bool Integrator::modified(const Integrator &integrator) diff --git a/src/render/integrator.h b/src/render/integrator.h index 32d84c270..9930e907a 100644 --- a/src/render/integrator.h +++ b/src/render/integrator.h @@ -75,6 +75,9 @@ class Integrator : public Node { bool sample_all_lights_indirect; float light_sampling_threshold; + int adaptive_min_samples; + float adaptive_threshold; + enum Method { BRANCHED_PATH = 0, PATH = 1, diff --git a/src/render/jitter.cpp b/src/render/jitter.cpp new file mode 100644 index 000000000..579d406c8 --- /dev/null +++ b/src/render/jitter.cpp @@ -0,0 +1,287 @@ +/* + * Copyright 2019 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* This file is based on "Progressive Multi-Jittered Sample Sequences" + * by Per Christensen, Andrew Kensler and Charlie Kilpatrick. + * http://graphics.pixar.com/library/ProgressiveMultiJitteredSampling/paper.pdf + * + * Performance can be improved in the future by implementing the new + * algorithm from Matt Pharr in http://jcgt.org/published/0008/01/04/ + * "Efficient Generation of Points that Satisfy Two-Dimensional Elementary Intervals" + */ + +#include "render/jitter.h" + +#include +#include + +CCL_NAMESPACE_BEGIN + +static uint cmj_hash(uint i, uint p) +{ + i ^= p; + i ^= i >> 17; + i ^= i >> 10; + i *= 0xb36534e5; + i ^= i >> 12; + i ^= i >> 21; + i *= 0x93fc4795; + i ^= 0xdf6e307f; + i ^= i >> 17; + i *= 1 | p >> 18; + + return i; +} + +static float cmj_randfloat(uint i, uint p) +{ + return cmj_hash(i, p) * (1.0f / 4294967808.0f); +} + +class PMJ_Generator { + public: + static void generate_2D(float2 points[], int size, int rng_seed_in) + { + PMJ_Generator g(rng_seed_in); + points[0].x = g.rnd(); + points[0].y = g.rnd(); + int N = 1; + while (N < size) { + g.extend_sequence_even(points, N); + g.extend_sequence_odd(points, 2 * N); + N = 4 * N; + } + } + + protected: + PMJ_Generator(int rnd_seed_in) : num_samples(1), rnd_index(2), rnd_seed(rnd_seed_in) + { + } + + float rnd() + { + return cmj_randfloat(++rnd_index, rnd_seed); + } + + virtual void mark_occupied_strata(float2 points[], int N) + { + int NN = 2 * N; + for (int s = 0; s < NN; ++s) { + occupied1Dx[s] = occupied1Dy[s] = false; + } + for (int s = 0; s < N; ++s) { + int xstratum = (int)(NN * points[s].x); + int ystratum = (int)(NN * points[s].y); + occupied1Dx[xstratum] = true; + occupied1Dy[ystratum] = true; + } + } + + virtual void generate_sample_point( + float2 points[], float i, float j, float xhalf, float yhalf, int n, int N) + { + int NN = 2 * N; + float2 pt; + int xstratum, ystratum; + do { + pt.x = (i + 0.5f * (xhalf + rnd())) / n; + xstratum = (int)(NN * pt.x); + } while (occupied1Dx[xstratum]); + do { + pt.y = (j + 0.5f * (yhalf + rnd())) / n; + ystratum = (int)(NN * pt.y); + } while (occupied1Dy[ystratum]); + occupied1Dx[xstratum] = true; + occupied1Dy[ystratum] = true; + points[num_samples] = pt; + ++num_samples; + } + + void extend_sequence_even(float2 points[], int N) + { + int n = (int)sqrtf(N); + occupied1Dx.resize(2 * N); + occupied1Dy.resize(2 * N); + mark_occupied_strata(points, N); + for (int s = 0; s < N; ++s) { + float2 oldpt = points[s]; + float i = floorf(n * oldpt.x); + float j = floorf(n * oldpt.y); + float xhalf = floorf(2.0f * (n * oldpt.x - i)); + float yhalf = floorf(2.0f * (n * oldpt.y - j)); + xhalf = 1.0f - xhalf; + yhalf = 1.0f - yhalf; + generate_sample_point(points, i, j, xhalf, yhalf, n, N); + } + } + + void extend_sequence_odd(float2 points[], int N) + { + int n = (int)sqrtf(N / 2); + occupied1Dx.resize(2 * N); + occupied1Dy.resize(2 * N); + mark_occupied_strata(points, N); + std::vector xhalves(N / 2); + std::vector yhalves(N / 2); + for (int s = 0; s < N / 2; ++s) { + float2 oldpt = points[s]; + float i = floorf(n * oldpt.x); + float j = floorf(n * oldpt.y); + float xhalf = floorf(2.0f * (n * oldpt.x - i)); + float yhalf = floorf(2.0f * (n * oldpt.y - j)); + if (rnd() > 0.5f) { + xhalf = 1.0f - xhalf; + } + else { + yhalf = 1.0f - yhalf; + } + xhalves[s] = xhalf; + yhalves[s] = yhalf; + generate_sample_point(points, i, j, xhalf, yhalf, n, N); + } + for (int s = 0; s < N / 2; ++s) { + float2 oldpt = points[s]; + float i = floorf(n * oldpt.x); + float j = floorf(n * oldpt.y); + float xhalf = 1.0f - xhalves[s]; + float yhalf = 1.0f - yhalves[s]; + generate_sample_point(points, i, j, xhalf, yhalf, n, N); + } + } + + std::vector occupied1Dx, occupied1Dy; + int num_samples; + int rnd_index, rnd_seed; +}; + +class PMJ02_Generator : public PMJ_Generator { + protected: + void generate_sample_point( + float2 points[], float i, float j, float xhalf, float yhalf, int n, int N) override + { + int NN = 2 * N; + float2 pt; + do { + pt.x = (i + 0.5f * (xhalf + rnd())) / n; + pt.y = (j + 0.5f * (yhalf + rnd())) / n; + } while (is_occupied(pt, NN)); + mark_occupied_strata1(pt, NN); + points[num_samples] = pt; + ++num_samples; + } + + void mark_occupied_strata(float2 points[], int N) override + { + int NN = 2 * N; + int num_shapes = (int)log2f(NN) + 1; + occupiedStrata.resize(num_shapes); + for (int shape = 0; shape < num_shapes; ++shape) { + occupiedStrata[shape].resize(NN); + for (int n = 0; n < NN; ++n) { + occupiedStrata[shape][n] = false; + } + } + for (int s = 0; s < N; ++s) { + mark_occupied_strata1(points[s], NN); + } + } + + void mark_occupied_strata1(float2 pt, int NN) + { + int shape = 0; + int xdivs = NN; + int ydivs = 1; + do { + int xstratum = (int)(xdivs * pt.x); + int ystratum = (int)(ydivs * pt.y); + size_t index = ystratum * xdivs + xstratum; + assert(index < NN); + occupiedStrata[shape][index] = true; + shape = shape + 1; + xdivs = xdivs / 2; + ydivs = ydivs * 2; + } while (xdivs > 0); + } + + bool is_occupied(float2 pt, int NN) + { + int shape = 0; + int xdivs = NN; + int ydivs = 1; + do { + int xstratum = (int)(xdivs * pt.x); + int ystratum = (int)(ydivs * pt.y); + size_t index = ystratum * xdivs + xstratum; + assert(index < NN); + if (occupiedStrata[shape][index]) { + return true; + } + shape = shape + 1; + xdivs = xdivs / 2; + ydivs = ydivs * 2; + } while (xdivs > 0); + return false; + } + + private: + std::vector> occupiedStrata; +}; + +static void shuffle(float2 points[], int size, int rng_seed) +{ + /* Offset samples by 1.0 for faster scrambling in kernel_random.h */ + for (int i = 0; i < size; ++i) { + points[i].x += 1.0f; + points[i].y += 1.0f; + } + + if (rng_seed == 0) { + return; + } + + constexpr int odd[8] = {0, 1, 4, 5, 10, 11, 14, 15}; + constexpr int even[8] = {2, 3, 6, 7, 8, 9, 12, 13}; + + int rng_index = 0; + for (int yy = 0; yy < size / 16; ++yy) { + for (int xx = 0; xx < 8; ++xx) { + int other = (int)(cmj_randfloat(++rng_index, rng_seed) * (8.0f - xx) + xx); + float2 tmp = points[odd[other] + yy * 16]; + points[odd[other] + yy * 16] = points[odd[xx] + yy * 16]; + points[odd[xx] + yy * 16] = tmp; + } + for (int xx = 0; xx < 8; ++xx) { + int other = (int)(cmj_randfloat(++rng_index, rng_seed) * (8.0f - xx) + xx); + float2 tmp = points[even[other] + yy * 16]; + points[even[other] + yy * 16] = points[even[xx] + yy * 16]; + points[even[xx] + yy * 16] = tmp; + } + } +} + +void progressive_multi_jitter_generate_2D(float2 points[], int size, int rng_seed) +{ + PMJ_Generator::generate_2D(points, size, rng_seed); + shuffle(points, size, rng_seed); +} + +void progressive_multi_jitter_02_generate_2D(float2 points[], int size, int rng_seed) +{ + PMJ02_Generator::generate_2D(points, size, rng_seed); + shuffle(points, size, rng_seed); +} + +CCL_NAMESPACE_END diff --git a/src/render/jitter.h b/src/render/jitter.h new file mode 100644 index 000000000..ed34c7a4f --- /dev/null +++ b/src/render/jitter.h @@ -0,0 +1,29 @@ +/* + * Copyright 2019 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __JITTER_H__ +#define __JITTER_H__ + +#include "util/util_types.h" + +CCL_NAMESPACE_BEGIN + +void progressive_multi_jitter_generate_2D(float2 points[], int size, int rng_seed); +void progressive_multi_jitter_02_generate_2D(float2 points[], int size, int rng_seed); + +CCL_NAMESPACE_END + +#endif /* __JITTER_H__ */ diff --git a/src/render/scene.cpp b/src/render/scene.cpp index 1e75fa0f9..6cea3c7c7 100644 --- a/src/render/scene.cpp +++ b/src/render/scene.cpp @@ -77,7 +77,7 @@ DeviceScene::DeviceScene(Device *device) svm_nodes(device, "__svm_nodes", MEM_TEXTURE), shaders(device, "__shaders", MEM_TEXTURE), lookup_table(device, "__lookup_table", MEM_TEXTURE), - sobol_directions(device, "__sobol_directions", MEM_TEXTURE), + sample_pattern_lut(device, "__sample_pattern_lut", MEM_TEXTURE), ies_lights(device, "__ies", MEM_TEXTURE) { memset((void *)&data, 0, sizeof(data)); diff --git a/src/render/scene.h b/src/render/scene.h index f99510d2d..ae4cb2ecd 100644 --- a/src/render/scene.h +++ b/src/render/scene.h @@ -119,7 +119,7 @@ class DeviceScene { device_vector lookup_table; /* integrator */ - device_vector sobol_directions; + device_vector sample_pattern_lut; /* ies lights */ device_vector ies_lights; diff --git a/src/render/session.cpp b/src/render/session.cpp index 7a894c1e9..b7205c643 100644 --- a/src/render/session.cpp +++ b/src/render/session.cpp @@ -902,7 +902,7 @@ bool Session::update_scene() Integrator *integrator = scene->integrator; BakeManager *bake_manager = scene->bake_manager; - if (integrator->sampling_pattern == SAMPLING_PATTERN_CMJ || bake_manager->get_baking()) { + if (integrator->sampling_pattern != SAMPLING_PATTERN_SOBOL || bake_manager->get_baking()) { int aa_samples = tile_manager.num_samples; if (aa_samples != integrator->aa_samples) { @@ -1024,6 +1024,7 @@ void Session::render() task.update_progress_sample = function_bind(&Progress::add_samples, &this->progress, _1, _2); task.need_finish_queue = params.progressive_refine; task.integrator_branched = scene->integrator->method == Integrator::BRANCHED_PATH; + task.integrator_adaptive = scene->integrator->sampling_pattern == SAMPLING_PATTERN_PMJ; task.requested_tile_size = params.tile_size; task.passes_size = tile_manager.params.get_passes_size(); diff --git a/src/render/session.h b/src/render/session.h index 9fffc13dd..d1e8648d0 100644 --- a/src/render/session.h +++ b/src/render/session.h @@ -55,6 +55,7 @@ class SessionParams { int start_resolution; int pixel_size; int threads; + bool adaptive_sampling; bool use_profiling; @@ -86,6 +87,7 @@ class SessionParams { start_resolution = INT_MAX; pixel_size = 1; threads = 0; + adaptive_sampling = false; use_profiling = false; @@ -112,6 +114,7 @@ class SessionParams { && progressive == params.progressive && experimental == params.experimental && tile_size == params.tile_size && start_resolution == params.start_resolution && pixel_size == params.pixel_size && threads == params.threads && + adaptive_sampling == params.adaptive_sampling && use_profiling == params.use_profiling && display_buffer_linear == params.display_buffer_linear && cancel_timeout == params.cancel_timeout && reset_timeout == params.reset_timeout && diff --git a/src/util/util_atomic.h b/src/util/util_atomic.h index a8ea1dc92..13d177d2b 100644 --- a/src/util/util_atomic.h +++ b/src/util/util_atomic.h @@ -77,6 +77,7 @@ ccl_device_inline float atomic_compare_and_swap_float(volatile ccl_global float # define atomic_fetch_and_add_uint32(p, x) atomic_add((p), (x)) # define atomic_fetch_and_inc_uint32(p) atomic_inc((p)) # define atomic_fetch_and_dec_uint32(p) atomic_dec((p)) +# define atomic_fetch_and_or_uint32(p, x) atomic_or((p), (x)) # define CCL_LOCAL_MEM_FENCE CLK_LOCAL_MEM_FENCE # define ccl_barrier(flags) barrier(flags) @@ -91,6 +92,7 @@ ccl_device_inline float atomic_compare_and_swap_float(volatile ccl_global float # define atomic_fetch_and_sub_uint32(p, x) atomicSub((unsigned int *)(p), (unsigned int)(x)) # define atomic_fetch_and_inc_uint32(p) atomic_fetch_and_add_uint32((p), 1) # define atomic_fetch_and_dec_uint32(p) atomic_fetch_and_sub_uint32((p), 1) +# define atomic_fetch_and_or_uint32(p, x) atomicOr((unsigned int *)(p), (unsigned int)(x)) ccl_device_inline float atomic_compare_and_swap_float(volatile float *dest, const float old_val,