Adaptive sampling patch https://developer.blender.org/D4686

# Conflicts:
#   src/blender/addon/engine.py
#   src/blender/blender_sync.cpp
#   src/kernel/kernel_types.h
#   src/render/film.cpp
boberfly · Dec 16, 2019 · 8d378f9 · 8d378f9
1 parent 9fb3359
commit 8d378f9
Show file tree

Hide file tree

Showing 48 changed files with 1,415 additions and 47 deletions.
diff --git a/src/blender/addon/engine.py b/src/blender/addon/engine.py
@@ -258,6 +258,7 @@ def list_render_passes(srl):
     if crl.pass_debug_bvh_traversed_instances: yield ("Debug BVH Traversed Instances", "X",   'VALUE')
     if crl.pass_debug_bvh_intersections:       yield ("Debug BVH Intersections",       "X",   'VALUE')
     if crl.pass_debug_ray_bounces:             yield ("Debug Ray Bounces",             "X",   'VALUE')
+    if crl.pass_debug_sample_count:            yield ("Debug Sample Count",            "X",   'VALUE')
     if crl.use_pass_volume_direct:             yield ("VolumeDir",                     "RGB", 'COLOR')
     if crl.use_pass_volume_indirect:           yield ("VolumeInd",                     "RGB", 'COLOR')
 

diff --git a/src/blender/addon/properties.py b/src/blender/addon/properties.py
@@ -112,6 +112,7 @@
 enum_sampling_pattern = (
     ('SOBOL', "Sobol", "Use Sobol random sampling pattern"),
     ('CORRELATED_MUTI_JITTER', "Correlated Multi-Jitter", "Use Correlated Multi-Jitter random sampling pattern"),
+    ('PROGRESSIVE_MUTI_JITTER', "Progressive Multi-Jitter", "Use Progressive Multi-Jitter random sampling pattern"),
 )
 
 enum_integrator = (
@@ -336,6 +337,26 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
         default=0.01,
     )
 
+    adaptive_threshold: FloatProperty(
+        name="Adaptive Sampling Threshold",
+        description="Zero for automatic setting based on AA samples",
+        min=0.0, max=1.0,
+        default=0.0,
+    )
+
+    adaptive_min_samples: IntProperty(
+        name="Adaptive Min Samples",
+        description="Minimum AA samples for adaptive sampling. Zero for automatic setting based on AA samples",
+        min=0, max=4096,
+        default=0,
+    )
+
+    use_adaptive_sampling: BoolProperty(
+        name="Use adaptive sampling",
+        description="Automatically determine the number of samples per pixel based on a variance estimation",
+        default=False,
+    )
+
     min_light_bounces: IntProperty(
             name="Min Light Bounces",
             description="Minimum number of light bounces. Setting this higher reduces noise in the first bounces, "
@@ -1279,6 +1300,12 @@ class CyclesRenderLayerSettings(bpy.types.PropertyGroup):
         default=False,
         update=update_render_passes,
     )
+    pass_debug_sample_count: BoolProperty(
+        name="Debug Sample Count",
+        description="Number of samples/camera rays per pixel",
+        default=False,
+        update=update_render_passes,
+    )
     use_pass_volume_direct: BoolProperty(
         name="Volume Direct",
         description="Deliver direct volumetric scattering pass",

diff --git a/src/blender/addon/ui.py b/src/blender/addon/ui.py
@@ -188,6 +188,8 @@ def draw(self, context):
             col.prop(cscene, "aa_samples", text="Render")
             col.prop(cscene, "preview_aa_samples", text="Viewport")
 
+        col.prop(cscene, "use_adaptive_sampling", text="Adaptive Sampling")
+
 
 class CYCLES_RENDER_PT_sampling_sub_samples(CyclesButtonsPanel, Panel):
     bl_label = "Sub Samples"
@@ -239,7 +241,13 @@ def draw(self, context):
         row.prop(cscene, "seed")
         row.prop(cscene, "use_animated_seed", text="", icon='TIME')
 
-        layout.prop(cscene, "sampling_pattern", text="Pattern")
+        col = layout.column(align=True)
+        col.active = not(cscene.use_adaptive_sampling)
+        col.prop(cscene, "sampling_pattern", text="Pattern")
+        col = layout.column(align=True)
+        col.active = cscene.use_adaptive_sampling
+        col.prop(cscene, "adaptive_min_samples", text="Adaptive Min Samples")
+        col.prop(cscene, "adaptive_threshold", text="Adaptive Threshold")
 
         layout.prop(cscene, "use_square_samples")
 
@@ -803,6 +811,8 @@ def draw(self, context):
         col.prop(cycles_view_layer, "denoising_store_passes", text="Denoising Data")
         col = flow.column()
         col.prop(cycles_view_layer, "pass_debug_render_time", text="Render Time")
+        col = flow.column()
+        col.prop(cycles_view_layer, "pass_debug_sample_count", text="Sample Count")
 
         layout.separator()
 

diff --git a/src/blender/blender_session.cpp b/src/blender/blender_session.cpp
@@ -474,7 +474,8 @@ void BlenderSession::render(BL::Depsgraph &b_depsgraph_)
   b_rlay_name = b_view_layer.name();
 
   /* add passes */
-  vector<Pass> passes = sync->sync_render_passes(b_rlay, b_view_layer);
+  vector<Pass> passes = sync->sync_render_passes(
+      b_rlay, b_view_layer, session_params.adaptive_sampling);
   buffer_params.passes = passes;
 
   PointerRNA crl = RNA_pointer_get(&b_view_layer.ptr, "cycles");

diff --git a/src/blender/blender_sync.cpp b/src/blender/blender_sync.cpp
@@ -291,6 +291,16 @@ void BlenderSync::sync_integrator()
   integrator->sample_all_lights_indirect = get_boolean(cscene, "sample_all_lights_indirect");
   integrator->light_sampling_threshold = get_float(cscene, "light_sampling_threshold");
 
+  if (RNA_boolean_get(&cscene, "use_adaptive_sampling")) {
+    integrator->sampling_pattern = SAMPLING_PATTERN_PMJ;
+    integrator->adaptive_min_samples = get_int(cscene, "adaptive_min_samples");
+    integrator->adaptive_threshold = get_float(cscene, "adaptive_threshold");
+  }
+  else {
+    integrator->adaptive_min_samples = INT_MAX;
+    integrator->adaptive_threshold = 0.0f;
+  }
+
   int diffuse_samples = get_int(cscene, "diffuse_samples");
   int glossy_samples = get_int(cscene, "glossy_samples");
   int transmission_samples = get_int(cscene, "transmission_samples");
@@ -307,6 +317,8 @@ void BlenderSync::sync_integrator()
     integrator->mesh_light_samples = mesh_light_samples * mesh_light_samples;
     integrator->subsurface_samples = subsurface_samples * subsurface_samples;
     integrator->volume_samples = volume_samples * volume_samples;
+    integrator->adaptive_min_samples = min(
+        integrator->adaptive_min_samples * integrator->adaptive_min_samples, INT_MAX);
   }
   else {
     integrator->diffuse_samples = diffuse_samples;
@@ -482,6 +494,8 @@ PassType BlenderSync::get_pass_type(BL::RenderPass &b_pass)
   MAP_PASS("Debug Ray Bounces", PASS_RAY_BOUNCES);
 #endif
   MAP_PASS("Debug Render Time", PASS_RENDER_TIME);
+  MAP_PASS("AdaptiveAuxBuffer", PASS_ADAPTIVE_AUX_BUFFER);
+  MAP_PASS("Debug Sample Count", PASS_SAMPLE_COUNT);
   if (string_startswith(name, cryptomatte_prefix)) {
     return PASS_CRYPTOMATTE;
   }
@@ -517,7 +531,9 @@ int BlenderSync::get_denoising_pass(BL::RenderPass &b_pass)
   return -1;
 }
 
-vector<Pass> BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay, BL::ViewLayer &b_view_layer)
+vector<Pass> BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay,
+                                             BL::ViewLayer &b_view_layer,
+                                             bool adaptive_sampling)
 {
   vector<Pass> passes;
 
@@ -589,6 +605,10 @@ vector<Pass> BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay, BL::ViewLa
     b_engine.add_pass("Debug Render Time", 1, "X", b_view_layer.name().c_str());
     Pass::add(PASS_RENDER_TIME, passes, "Debug Render Time");
   }
+  if (get_boolean(crp, "pass_debug_sample_count")) {
+    b_engine.add_pass("Debug Sample Count", 1, "X", b_view_layer.name().c_str());
+    Pass::add(PASS_SAMPLE_COUNT, passes);
+  }
   if (get_boolean(crp, "use_pass_volume_direct")) {
     b_engine.add_pass("VolumeDir", 3, "RGB", b_view_layer.name().c_str());
     Pass::add(PASS_VOLUME_DIRECT, passes, "VolumeDir");
@@ -650,6 +670,13 @@ vector<Pass> BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay, BL::ViewLa
   }
   RNA_END;
 
+  if (adaptive_sampling) {
+    Pass::add(PASS_ADAPTIVE_AUX_BUFFER, passes);
+    if (!get_boolean(crp, "pass_debug_sample_count")) {
+      Pass::add(PASS_SAMPLE_COUNT, passes);
+    }
+  }
+
   return passes;
 }
 
@@ -883,6 +910,8 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine &b_engine,
   params.use_profiling = params.device.has_profiling && !b_engine.is_preview() && background &&
                          BlenderSession::print_render_stats;
 
+  params.adaptive_sampling = RNA_boolean_get(&cscene, "use_adaptive_sampling");
+
   return params;
 }
 

diff --git a/src/blender/blender_sync.h b/src/blender/blender_sync.h
@@ -70,7 +70,9 @@ class BlenderSync {
                  int height,
                  void **python_thread_state);
   void sync_view_layer(BL::SpaceView3D &b_v3d, BL::ViewLayer &b_view_layer);
-  vector<Pass> sync_render_passes(BL::RenderLayer &b_render_layer, BL::ViewLayer &b_view_layer);
+  vector<Pass> sync_render_passes(BL::RenderLayer &b_render_layer,
+                                  BL::ViewLayer &b_view_layer,
+                                  bool adaptive_sampling);
   void sync_integrator();
   void sync_camera(BL::RenderSettings &b_render,
                    BL::Object &b_override,

diff --git a/src/device/device_cpu.cpp b/src/device/device_cpu.cpp
@@ -34,6 +34,7 @@
 #include "kernel/kernel_types.h"
 #include "kernel/split/kernel_split_data.h"
 #include "kernel/kernel_globals.h"
+#include "kernel/kernel_adaptive_sampling.h"
 
 #include "kernel/filter/filter.h"
 
@@ -317,6 +318,10 @@ class CPUDevice : public Device {
     REGISTER_SPLIT_KERNEL(next_iteration_setup);
     REGISTER_SPLIT_KERNEL(indirect_subsurface);
     REGISTER_SPLIT_KERNEL(buffer_update);
+    REGISTER_SPLIT_KERNEL(adaptive_stopping);
+    REGISTER_SPLIT_KERNEL(adaptive_filter_x);
+    REGISTER_SPLIT_KERNEL(adaptive_filter_y);
+    REGISTER_SPLIT_KERNEL(adaptive_adjust_samples);
 #undef REGISTER_SPLIT_KERNEL
 #undef KERNEL_FUNCTIONS
   }
@@ -851,10 +856,33 @@ class CPUDevice : public Device {
           path_trace_kernel()(kg, render_buffer, sample, x, y, tile.offset, tile.stride);
         }
       }
-
       tile.sample = sample + 1;
 
       task.update_progress(&tile, tile.w * tile.h);
+
+      if (kernel_data.film.pass_adaptive_aux_buffer && (sample & 0x3) == 3 &&
+          sample >= kernel_data.integrator.adaptive_min_samples - 1) {
+        WorkTile wtile;
+        wtile.x = tile.x;
+        wtile.y = tile.y;
+        wtile.w = tile.w;
+        wtile.h = tile.h;
+        wtile.offset = tile.offset;
+        wtile.stride = tile.stride;
+        wtile.buffer = (float *)tile.buffer;
+
+        bool any = false;
+        for (int y = tile.y; y < tile.y + tile.h; ++y) {
+          any |= kernel_do_adaptive_filter_x(kg, y, &wtile);
+        }
+        for (int x = tile.x; x < tile.x + tile.w; ++x) {
+          any |= kernel_do_adaptive_filter_y(kg, x, &wtile);
+        }
+        if (!any) {
+          tile.sample = end_sample;
+          break;
+        }
+      }
     }
     if (use_coverage) {
       coverage.finalize();
@@ -931,6 +959,28 @@ class CPUDevice : public Device {
         }
         else {
           path_trace(task, tile, kg);
+          if (task.integrator_adaptive && kernel_data.film.pass_adaptive_aux_buffer) {
+            float *render_buffer = (float *)tile.buffer;
+            for (int y = tile.y; y < tile.y + tile.h; y++) {
+              for (int x = tile.x; x < tile.x + tile.w; x++) {
+                int index = tile.offset + x + y * tile.stride;
+                ccl_global float *buffer = render_buffer + index * kernel_data.film.pass_stride;
+                if (buffer[kernel_data.film.pass_sample_count] < 0.0f) {
+                  buffer[kernel_data.film.pass_sample_count] =
+                      -buffer[kernel_data.film.pass_sample_count];
+                  float sample_multiplier = tile.sample /
+                                            max((float)tile.start_sample + 1.0f,
+                                                buffer[kernel_data.film.pass_sample_count]);
+                  if (sample_multiplier != 1.0f) {
+                    kernel_adaptive_post_adjust(kg, buffer, sample_multiplier);
+                  }
+                }
+                else {
+                  kernel_adaptive_post_adjust(kg, buffer, tile.sample / (tile.sample - 1.0f));
+                }
+              }
+            }
+          }
         }
       }
       else if (tile.task == RenderTile::DENOISE) {

diff --git a/src/device/device_cuda.cpp b/src/device/device_cuda.cpp
@@ -1788,6 +1788,23 @@ class CUDADevice : public Device {
 
     cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1));
 
+    /* Kernels for adaptive sampling. */
+    CUfunction cuAdaptiveStopping, cuAdaptiveFilterX, cuAdaptiveFilterY, cuAdaptiveScaleSamples;
+    if (task.integrator_adaptive) {
+      cuda_assert(
+          cuModuleGetFunction(&cuAdaptiveStopping, cuModule, "kernel_cuda_adaptive_stopping"));
+      cuda_assert(cuFuncSetCacheConfig(cuAdaptiveStopping, CU_FUNC_CACHE_PREFER_L1));
+      cuda_assert(
+          cuModuleGetFunction(&cuAdaptiveFilterX, cuModule, "kernel_cuda_adaptive_filter_x"));
+      cuda_assert(cuFuncSetCacheConfig(cuAdaptiveFilterX, CU_FUNC_CACHE_PREFER_L1));
+      cuda_assert(
+          cuModuleGetFunction(&cuAdaptiveFilterY, cuModule, "kernel_cuda_adaptive_filter_y"));
+      cuda_assert(cuFuncSetCacheConfig(cuAdaptiveFilterY, CU_FUNC_CACHE_PREFER_L1));
+      cuda_assert(cuModuleGetFunction(
+          &cuAdaptiveScaleSamples, cuModule, "kernel_cuda_adaptive_scale_samples"));
+      cuda_assert(cuFuncSetCacheConfig(cuAdaptiveScaleSamples, CU_FUNC_CACHE_PREFER_L1));
+    }
+
     /* Allocate work tile. */
     work_tiles.alloc(1);
 
@@ -1812,6 +1829,16 @@ class CUDADevice : public Device {
 
     uint step_samples = divide_up(min_blocks * num_threads_per_block, wtile->w * wtile->h);
 
+    if (task.integrator_adaptive) {
+      /* Force to either 1, 2 or multiple of 4 samples per kernel invocation. */
+      if (step_samples == 3) {
+        step_samples = 2;
+      }
+      else if (step_samples > 4) {
+        step_samples &= 0xfffffffc;
+      }
+    }
+
     /* Render all samples. */
     int start_sample = rtile.start_sample;
     int end_sample = rtile.start_sample + rtile.num_samples;
@@ -1832,6 +1859,26 @@ class CUDADevice : public Device {
       cuda_assert(cuLaunchKernel(
           cuPathTrace, num_blocks, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0));
 
+      uint filter_sample = sample + wtile->num_samples - 1;
+      /* Run the adaptive sampling kernels when we're at a multiple of 4 samples.
+       * These are a series of tiny kernels because there is no grid synchronisation
+       * from within a kernel, so multiple kernel launches it is. */
+      if (task.integrator_adaptive && (filter_sample & 0x3) == 3) {
+        total_work_size = wtile->h * wtile->w;
+        void *args2[] = {&d_work_tiles, &filter_sample, &total_work_size};
+        num_blocks = divide_up(total_work_size, num_threads_per_block);
+        cuda_assert(cuLaunchKernel(
+            cuAdaptiveStopping, num_blocks, 1, 1, num_threads_per_block, 1, 1, 0, 0, args2, 0));
+        total_work_size = wtile->h;
+        num_blocks = divide_up(total_work_size, num_threads_per_block);
+        cuda_assert(cuLaunchKernel(
+            cuAdaptiveFilterX, num_blocks, 1, 1, num_threads_per_block, 1, 1, 0, 0, args2, 0));
+        total_work_size = wtile->w;
+        num_blocks = divide_up(total_work_size, num_threads_per_block);
+        cuda_assert(cuLaunchKernel(
+            cuAdaptiveFilterY, num_blocks, 1, 1, num_threads_per_block, 1, 1, 0, 0, args2, 0));
+      }
+
       cuda_assert(cuCtxSynchronize());
 
       /* Update progress. */
@@ -1843,6 +1890,17 @@ class CUDADevice : public Device {
           break;
       }
     }
+
+    if (task.integrator_adaptive) {
+      CUdeviceptr d_work_tiles = cuda_device_ptr(work_tiles.device_pointer);
+      uint total_work_size = wtile->h * wtile->w;
+      void *args[] = {&d_work_tiles, &rtile.start_sample, &rtile.sample, &total_work_size};
+      uint num_blocks = divide_up(total_work_size, num_threads_per_block);
+      cuda_assert(cuLaunchKernel(
+          cuAdaptiveScaleSamples, num_blocks, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0));
+      cuda_assert(cuCtxSynchronize());
+      task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples);
+    }
   }
 
   void film_convert(DeviceTask &task,