
Implemented side parameter (resolves #1) #7

Open
wants to merge 9 commits into base: master
66 changes: 43 additions & 23 deletions README.md
@@ -2,7 +2,6 @@

This repository is an implementation of the searchsorted function to work for pytorch CUDA Tensors. Initially derived from the great [C extension tutorial](https://github.com/chrischoy/pytorch-custom-cuda-tutorial), but totally changed since then because building C extensions is no longer supported as of pytorch 1.0.


> Warning: only works with pytorch > v1.3 and CUDA >= v10.1

## Description
@@ -24,39 +23,60 @@

the output is of size `(nrows, ncols_v)`. If all input tensors are on GPU, a
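The row-wise behaviour described above can be sketched with plain numpy (an illustration of mine, not code from this repository): each row of `a` is searched independently with the matching row of `v`, so the output has one insertion index per value.

```python
import numpy as np

nrows, ncols_a, ncols_v = 3, 5, 2
a = np.sort(np.random.randn(nrows, ncols_a), axis=1)  # rows sorted independently
v = np.random.randn(nrows, ncols_v)

# One independent searchsorted per row: output has shape (nrows, ncols_v)
out = np.stack([np.searchsorted(a[i], v[i]) for i in range(nrows)])
assert out.shape == (nrows, ncols_v)
assert ((0 <= out) & (out <= ncols_a)).all()
```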


## Installation
Just `python setup.py install`, in the root folder of this repo. This will compile
and install the torchsearchsorted module.
Be careful that sometimes `nvcc` needs versions of `gcc` and `g++` that are older than those found by default on the system. If so, just create symbolic links to the right versions in your cuda/bin folder (where `nvcc` is).

After setting up an environment with pytorch >= 1.3, run either of these commands from the root folder of the repo:
```bash
pip install -v .
```
Comment on lines +28 to +30

Owner

Your modification removes the very important information that nvcc sometimes needs versions of gcc and g++ that are older than the default ones on the system; without it, your installation procedure will not work for those users.

Contributor Author

All right, sorry for removing that disclaimer. I tested on my machine and on a few servers we have here and did not notice any issue; that's why it felt redundant to me. Also, g++ is usually just a link to gcc, and I haven't seen cases where the two are different.

Anyway, let's just restore that warning so that no one runs into problems ;)


Be careful: pytorch needs to be installed on your system. The code was tested on pytorch v1.3.
```bash
python setup.py install -v
```

The verbose flag `-v` is not mandatory, but it will print whether the installer was able to find `nvcc` and install the CUDA version of `torchsearchsorted`.
If you're having problems with the installation, make sure `nvcc` and `gcc` are installed and available in your path, e.g.:
```bash
export PATH="/usr/local/cuda/bin:${PATH}"
export CPATH="/usr/local/cuda/include:${CPATH}"

which gcc
which nvcc

pip install -v .
```

Comment on lines +36 to +40

Owner

This is not enough to have gcc and g++ in the path. Actually, what is required is for nvcc to find gcc and g++ versions it can work with. For me, I had to do the following:

```bash
sudo apt-get install g++-8 gcc-8
sudo ln -s /usr/bin/gcc-8 /usr/local/cuda-10.1/bin/gcc
sudo ln -s /usr/bin/g++-8 /usr/local/cuda-10.1/bin/g++
```

Contributor Author

Oh, that's strange: for me, nvcc V10.1.105 and gcc 7.4.0 work together without issues.

I think the trick with PATH and CPATH (which is what torch_scatter suggests) is safer than creating system-wide links, for the simple reason that if some other program relies on specific versions, it won't be affected by changing the PATH for the current shell.

We can document both approaches in the installation instructions, I think.

Owner

Yes, gcc 7 should work. But I had 9, and it doesn't.

## Usage

```python
import torch
from torchsearchsorted import searchsorted

a = torch.sort(torch.randn(5000, 300, device='cuda'), dim=1)[0]
v = torch.randn(5000, 100, device='cuda')
out = searchsorted(a, v)
```
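Since this PR adds the `side` parameter, its effect is easiest to see against `numpy.searchsorted`, whose semantics the extension mirrors. A small illustration of mine, using numpy rather than the compiled module:

```python
import numpy as np

row = np.array([1.0, 2.0, 2.0, 3.0])
# side='left': insertion point before any existing equal entries
left = np.searchsorted(row, 2.0, side='left')    # -> 1
# side='right': insertion point after any existing equal entries
right = np.searchsorted(row, 2.0, side='right')  # -> 3
assert (left, right) == (1, 3)
```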


## Testing and benchmarking

Install test dependencies and run the unit tests:
```bash
pip install '.[test]'
pytest -v
```

Run [benchmark.py](examples/benchmark.py) for a speed comparison:
```bash
python examples/benchmark.py
```
```text
Benchmark searchsorted:
- a [5000 x 300]
- v [5000 x 100]
- reporting fastest time of 20 runs
- each run executes searchsorted 100 times

Numpy:	3.4524286500000017
CPU:	10.617608329001087
CUDA:	0.00124932999824523
```

Comment on lines +79 to +80

Owner

As you can see, your CPU version gets significantly slower than numpy's. Although this is not a very big issue for training deep models, which is typically done on GPU, it is a problem for inference, which is often done on CPU.
66 changes: 66 additions & 0 deletions examples/benchmark.py
@@ -0,0 +1,66 @@
import timeit

import torch
import numpy as np
from torchsearchsorted import searchsorted, numpy_searchsorted

B = 5_000
A = 300
V = 100

repeats = 20
number = 100

print(
    f'Benchmark searchsorted:',
    f'- a [{B} x {A}]',
    f'- v [{B} x {V}]',
    f'- reporting fastest time of {repeats} runs',
    f'- each run executes searchsorted {number} times',
    sep='\n',
    end='\n\n'
)


def get_arrays():
    a = np.sort(np.random.randn(B, A), axis=1)
    v = np.random.randn(B, V)
    out = np.empty_like(v, dtype=np.int64)  # np.int64: the np.long alias is deprecated
    return a, v, out


def get_tensors(device):
    a = torch.sort(torch.randn(B, A, device=device), dim=1)[0]
    v = torch.randn(B, V, device=device)
    out = torch.empty(B, V, device=device, dtype=torch.long)
    return a, v, out


numpy = timeit.repeat(
    stmt="numpy_searchsorted(a, v, out, side='left')",
    setup="a, v, out = get_arrays()",
    globals=globals(),
    repeat=repeats,
    number=number
)
print('Numpy: ', min(numpy), sep='\t')

cpu = timeit.repeat(
    stmt="searchsorted(a, v, out, side='left')",
    setup="a, v, out = get_tensors(device='cpu')",
    globals=globals(),
    repeat=repeats,
    number=number
)
print('CPU: ', min(cpu), sep='\t')

if torch.cuda.is_available():
    gpu = timeit.repeat(
        stmt="searchsorted(a, v, out, side='left')",
        setup="a, v, out = get_tensors(device='cuda')",
        globals=globals(),
        repeat=repeats,
        number=number
    )
    print('CUDA: ', min(gpu), sep='\t')
Comment on lines +1 to +66

Owner

very nice
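The min-over-repeats pattern used in the benchmark script above is the usual way to filter out scheduler noise; a tiny self-contained illustration (with a hypothetical `work` function of mine, not code from this PR):

```python
import timeit

def work():
    return sum(i * i for i in range(1000))

# Fastest of 5 runs, each calling work() 100 times
best = min(timeit.repeat(stmt='work()', globals=globals(), repeat=5, number=100))
assert best > 0.0
```

One caveat for the CUDA timing: kernel launches are asynchronous, so without a `torch.cuda.synchronize()` inside the timed statement the GPU numbers may mostly measure launch overhead rather than kernel runtime.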

2 changes: 1 addition & 1 deletion examples/test.py
@@ -33,7 +33,7 @@
# v = torch.tensor([[1.]])

t0 = time.time()
test_NP = torch.tensor(numpy_searchsorted(a, v, side))
test_NP = torch.tensor(numpy_searchsorted(a, v, side=side))
print('NUMPY: searchsorted in %0.3fms' % (1000*(time.time()-t0)))
t0 = time.time()
test_CPU = searchsorted(a, v, test_CPU, side)
6 changes: 5 additions & 1 deletion setup.py
@@ -8,13 +8,17 @@
['src/cpu/searchsorted_cpu_wrapper.cpp']),
]

# If nvcc is available, add the CUDA extension, messages are
# printed when using `pip install -v .` or `python setup.py -v install`
if CUDA_HOME:
print('torchsearchsorted will be installed with CUDA support')
Owner

nice
modules.append(
CUDAExtension('torchsearchsorted.cuda',
['src/cuda/searchsorted_cuda_wrapper.cpp',
'src/cuda/searchsorted_cuda_kernel.cu'])
)
else:
print('torchsearchsorted will be installed for CPU only')
Owner

nice

tests_require = [
'pytest',
180 changes: 75 additions & 105 deletions src/cpu/searchsorted_cpu_wrapper.cpp
@@ -1,121 +1,91 @@
#include "searchsorted_cpu_wrapper.h"
#include <stdio.h>

Removed (old implementation):

int eval(float val, float *a, int64_t row, int64_t col, int64_t ncol, bool side_left)
{
    /* Evaluates whether a[row, col] < val <= a[row, col+1] */

    if (col == ncol - 1)
    {
        // special case: we are on the right border
        if (a[row * ncol + col] <= val) {
            return 1;
        } else {
            return -1;
        }
    }
    bool is_lower;
    bool is_next_higher;

    if (side_left) {
        // a[row, col] < v <= a[row, col+1]
        is_lower = (a[row * ncol + col] < val);
        is_next_higher = (a[row * ncol + col + 1] >= val);
    } else {
        // a[row, col] <= v < a[row, col+1]
        is_lower = (a[row * ncol + col] <= val);
        is_next_higher = (a[row * ncol + col + 1] > val);
    }
    if (is_lower && is_next_higher) {
        // we found the right spot
        return 0;
    } else if (is_lower) {
        // answer is on the right side
        return 1;
    } else {
        // answer is on the left side
        return -1;
    }
}

int64_t binary_search(float *a, int64_t row, float val, int64_t ncol, bool side_left)
{
    /* Look for the value `val` within row `row` of matrix `a`, which
    has `ncol` columns.

    The `a` matrix is assumed sorted in increasing order, row-wise.

    returns:
    * -1 if `val` is smaller than the smallest value found within that row of `a`
    * `ncol` - 1 if `val` is larger than the largest element of that row of `a`
    * Otherwise, return the column index `col` such that:
      - a[row, col] < val <= a[row, col+1] (if side_left), or
      - a[row, col] <= val < a[row, col+1] (if not side_left).
    */

    // start with left at 0 and right at number of columns of a
    int64_t right = ncol;
    int64_t left = 0;

    while (right >= left) {
        // take the midpoint of current left and right cursors
        int64_t mid = left + (right - left) / 2;

        // check the relative position of val: are we good here?
        int rel_pos = eval(val, a, row, mid, ncol, side_left);
        if (rel_pos == 0) {
            // we found the point
            return mid;
        } else if (rel_pos > 0) {
            if (mid == ncol - 1) { return ncol - 1; }
            // the answer is on the right side
            left = mid;
        } else {
            if (mid == 0) { return -1; }
            right = mid;
        }
    }
    return -1;
}

void searchsorted_cpu_wrapper(
    at::Tensor a,
    at::Tensor v,
    at::Tensor res,
    bool side_left)
{
    // Get the dimensions
    auto nrow_a = a.size(/*dim=*/0);
    auto ncol_a = a.size(/*dim=*/1);
    auto nrow_v = v.size(/*dim=*/0);
    auto ncol_v = v.size(/*dim=*/1);

    auto nrow_res = fmax(nrow_a, nrow_v);

    float *a_data = a.data<float>();
    float *v_data = v.data<float>();

    for (int64_t row = 0; row < nrow_res; row++)
    {
        for (int64_t col = 0; col < ncol_v; col++)
        {
            // get the value to look for
            int64_t row_in_v = (nrow_v == 1) ? 0 : row;
            int64_t row_in_a = (nrow_a == 1) ? 0 : row;

            int64_t idx_in_v = row_in_v * ncol_v + col;
            int64_t idx_in_res = row * ncol_v + col;

            // apply binary search
            res.data<int64_t>()[idx_in_res] = (binary_search(a_data, row_in_a, v_data[idx_in_v], ncol_a, side_left) + 1);
        }
    }
}

Added (new implementation):

int64_t bisect_left(float *array, float value, int64_t left, int64_t right) {
    /**
     * Locate the insertion point of a value in a sorted array that would
     * maintain the array sorted, i.e. the index i such that:
     * array[i - 1] < value <= array[i]
     * Only the index range [left, right) is considered.
     *
     * If the value is already present in the array, the returned index
     * inserts the value to the left of any existing entry.
     * If value is less than or equal to every element, the returned index is equal to left.
     * If value is greater than every element, the returned index is equal to right.
     */
    int64_t mid;
    while (left < right) {
        mid = (left + right) / 2;
        if (value > array[mid]) {
            left = mid + 1;
        } else {
            right = mid;
        }
    }
    return left;
}

int64_t bisect_right(float *array, float value, int64_t left, int64_t right) {
    /**
     * Locate the insertion point of a value in a sorted array that would
     * maintain the array sorted, i.e. the index i such that:
     * array[i - 1] <= value < array[i]
     * Only the index range [left, right) is considered.
     *
     * If the value is already present in the array, the returned index
     * inserts the value to the right of any existing entry.
     * If value is less than every element, the returned index is equal to left.
     * If value is greater than or equal to every element, the returned index is equal to right.
     */
    int64_t mid;
    while (left < right) {
        mid = (left + right) / 2;
        if (value >= array[mid]) {
            left = mid + 1;
        } else {
            right = mid;
        }
    }
    return left;
}

void searchsorted_cpu_wrapper(
    at::Tensor a,
    at::Tensor v,
    at::Tensor res,
    bool side_left) {
    float *a_data = a.data_ptr<float>();
    float *v_data = v.data_ptr<float>();
    int64_t *res_data = res.data_ptr<int64_t>();

    int64_t (*bisect)(float*, float, int64_t, int64_t);
    if (side_left) {
        bisect = &bisect_left;
    } else {
        bisect = &bisect_right;
    }

    for (int64_t i = 0; i < v.size(0); i++) {
        // Search values in the range [left, right), i.e. an entire row of a
        int64_t left = i * a.stride(0);
        int64_t right = i * a.stride(0) + a.size(1);

        for (int64_t j = 0; j < v.size(1); j++) {
            // idx_v is the location of the value in the flattened tensor v
            // idx_res is where the result will go in the flattened tensor res
            int64_t idx_v = i * v.stride(0) + j * v.stride(1);
            int64_t idx_res = i * res.stride(0) + j * res.stride(1);
            // idx is the insertion index in the flattened tensor a
            int64_t idx = (*bisect)(a_data, v_data[idx_v], left, right);
            res_data[idx_res] = idx - i * a.stride(0);
        }
    }
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("searchsorted_cpu_wrapper", &searchsorted_cpu_wrapper, "searchsorted (CPU)");
}
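To sanity-check the left/right distinction, here is a direct Python transcription of the two new bisection routines (my own sketch for illustration, not code from this PR); it matches Python's stdlib `bisect` module on the same inputs:

```python
import bisect

def bisect_left_py(array, value, left, right):
    # Smallest index i in [left, right] with array[i] >= value
    while left < right:
        mid = (left + right) // 2
        if value > array[mid]:
            left = mid + 1
        else:
            right = mid
    return left

def bisect_right_py(array, value, left, right):
    # Smallest index i in [left, right] with array[i] > value
    while left < right:
        mid = (left + right) // 2
        if value >= array[mid]:
            left = mid + 1
        else:
            right = mid
    return left

row = [1.0, 2.0, 2.0, 3.0]
assert bisect_left_py(row, 2.0, 0, len(row)) == bisect.bisect_left(row, 2.0) == 1
assert bisect_right_py(row, 2.0, 0, len(row)) == bisect.bisect_right(row, 2.0) == 3
```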