
Commit

working on CuArray, but really need an optimised searchsortedlast; need a KernelAbstractions.jl sorter to work on different accelerators
Andrei Leonard Nicusan committed Nov 9, 2023
1 parent b88ea28 commit 91e2db9
Showing 6 changed files with 108 additions and 10 deletions.
2 changes: 1 addition & 1 deletion Project.toml
@@ -1,7 +1,7 @@
name = "MPISort"
uuid = "404a976e-21b5-45c7-95b2-5d74c9953ee0"
authors = ["Andrei-Leonard Nicusan"]
version = "0.2.0"
version = "0.2.1"

[deps]
DocStringExtensions = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae"
17 changes: 15 additions & 2 deletions README.md
@@ -21,8 +21,8 @@ MPI-related), optimised for minimum inter-rank communication and memory footprint
- **Does not require that distributed data fits into the memory of a single node**. No IO either.
- Works for any comparison-based data, with additional optimisations for numeric elements.
- Optimised for minimum MPI communication; can use Julia threads on each shared-memory node.
- The node-local arrays may have different sizes; sorting will try to balance number of elements held by each MPI rank.
- Works with any `AbstractVector`, including accelerators such as GPUs (TODO: test this further). Julia type-inference and optimisations do wonders.
- The node-local arrays may have different sizes; sorting will try to balance the number of elements held by each MPI rank.
- Works with any `AbstractVector`, including accelerators such as GPUs (see Note).
- Implements the standard Julia `sort!` API, and naturally works for custom data, comparisons, orderings, etc.
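
To make that `sort!`-style interface concrete, here is a minimal illustrative sketch (keyword forwarding such as `by` is assumed to mirror `Base.sort!`):

```julia
using MPI
using MPISort

MPI.Init()
comm = MPI.COMM_WORLD

# Each rank holds its own chunk; mpisort! sorts the data globally across all ranks
local_chunk = rand(1:100, 50 + MPI.Comm_rank(comm))
sorted_chunk = mpisort!(local_chunk; alg=SIHSort(comm))

# Custom keys - `by` is assumed to be forwarded as in Base.sort!
records = [(id=rand(1:1000), rank=MPI.Comm_rank(comm)) for _ in 1:50]
sorted_records = mpisort!(records; alg=SIHSort(comm), by=r -> r.id)
```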


@@ -118,6 +118,19 @@ memory footprint only depends on the number of nodes involved, hence it should be scalable to
thousands of MPI ranks. Anyone got a spare 200,000 nodes to benchmark this?


### Note on sorting multi-node GPU arrays

`SIHSort` is generic over the input array type, so it can work with GPU arrays - e.g. a Julia
`CUDA.CuArray` - and benefit from MPI-configured, optimised inter-GPU interconnects.

However, to be fully performant, it needs:
- A single-node `sort` implementation for the given array type - at the moment, only `CUDA.CuArray` has one; there is great potential in a `KernelAbstractions.jl` sorter - we really need one!
- A fully GPU-based `searchsortedlast` implementation; **we do not have one yet**, so we fall back on a binary search that copies each probed element to the CPU (!), which is clearly not great. Optimised `KernelAbstractions.jl` kernels would help here too - see the sketch at the end of this note.

It works today, but it is not yet ideal: sorting 1,000,000 `Int32` values split across 2 MPI
ranks takes ~0.015s on my Intel i9 CPU and ~0.034s on my NVIDIA Quadro RTX 4000 with Max-Q.
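
For the curious, here is a rough sketch of what a GPU-side `searchsortedlast` could look like with `KernelAbstractions.jl` - one work-item per query, each running its own binary search. This is illustrative only: the kernel and helper names are made up here, and the default ordering (no `by`, no `lt`) is assumed.

```julia
using KernelAbstractions

# One work-item per query: each performs an independent binary search over the sorted array `v`
@kernel function searchsortedlast_kernel!(out, @Const(v), @Const(queries))
    i = @index(Global)
    lo, hi = 0, length(v) + 1
    @inbounds while hi - lo > 1
        mid = (lo + hi) >>> 1
        v[mid] <= queries[i] ? (lo = mid) : (hi = mid)
    end
    @inbounds out[i] = lo          # index of the last element <= queries[i], or 0 if none
end

# Launch on whatever backend `v` lives on (CUDA, ROCm, oneAPI, CPU threads...)
function gpu_searchsortedlast(v, queries)
    backend = get_backend(v)
    out = similar(queries, Int)
    searchsortedlast_kernel!(backend)(out, v, queries; ndrange=length(queries))
    KernelAbstractions.synchronize(backend)
    return out
end
```

Keeping both the sorted data and the splitters on the device like this would remove the per-element host copies that the current fallback performs.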


### References

This algorithm builds on prior art:
17 changes: 11 additions & 6 deletions src/MPISort.jl
@@ -8,7 +8,7 @@ using MPI
using Parameters
using DocStringExtensions

using GPUArraysCore: AbstractGPUArray
using GPUArraysCore: AbstractGPUArray, @allowscalar


# Public exports
@@ -159,7 +159,7 @@ function mpisort!(
num_samples_global = num_samples * nranks

# Figure out key type, then allocate vector of samples
keytype = typeof(by(v[1]))
keytype = typeof(by(@allowscalar(v[1])))

# Allocate vector of samples across all processes; initially only set the
# first `num_samples` elements
@@ -168,12 +168,17 @@
# Extract initial samples - on GPUs do vectorised indexing, on CPUs scalar indexing
isamples = IntLinSpace(1, num_elements, num_samples)
if v isa AbstractGPUArray
indices = Vector{Int}(undef, num_elements)
indices = Vector{Int}(undef, num_samples)
@inbounds for i in 1:num_samples
indices[i] = isamples[i]
end

samples[:] = by.(v[indices])
# Compute `by` on the GPU, then transfer only the samples back to the CPU
samples_gpu = by.(v[indices])
samples_cpu = Vector(samples_gpu)

# And copy samples into global vector
samples[1:num_samples] .= samples_cpu
else
@inbounds for i in 1:num_samples
samples[i] = by(v[isamples[i]])
@@ -202,7 +207,7 @@
histogram[end] = num_elements

@inbounds Threads.@threads for i in 1:num_samples_global
histogram[i] = searchsortedlast(v, samples[i]; by=by, kws...)
histogram[i] = @allowscalar searchsortedlast(v, samples[i]; by=by, kws...)
end

# Sum all histograms on root to find samples' _global_ positions.
@@ -268,7 +273,7 @@ end
end

@inbounds Threads.@threads for i in 1:num_splitters
histogram[i] = searchsortedlast(v, splitters[i]; by=by, kws...)
histogram[i] = @allowscalar searchsortedlast(v, splitters[i]; by=by, kws...)
end

# The histogram dictates how many elements will be sent to each process
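
For context on the `@allowscalar` calls above: `GPUArraysCore.@allowscalar` (imported at the top of this file) explicitly permits scalar, element-by-element indexing of a GPU array, with each access paying a device-to-host copy - exactly the `searchsortedlast` bottleneck described in the README note. A minimal illustration, assuming CUDA.jl is available:

```julia
using CUDA
using GPUArraysCore

x = CuArray(1:10)

# Plain `x[1]` raises a scalar-indexing error (or a warning in interactive sessions);
# @allowscalar opts in explicitly, at the cost of one device-to-host copy per access.
first_element = GPUArraysCore.@allowscalar x[1]
```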
34 changes: 34 additions & 0 deletions test/basic_gpu.jl
@@ -0,0 +1,34 @@
# File : basic_gpu.jl
# License: MIT
# Author : Andrei Leonard Nicusan <a.l.nicusan@gmail.com>
# Date : 13.10.2022


using MPI
using MPISort
using Random

using CUDA


# Initialise MPI, get communicator for all ranks, rank index, number of ranks
MPI.Init()

comm = MPI.COMM_WORLD
rank = MPI.Comm_rank(comm)
nranks = MPI.Comm_size(comm)

# Generate local GPU array on each MPI rank - even with different number of elements
rng = Xoshiro(rank)
num_elements = 50 + rank * 2
local_array = CuArray(rand(rng, 1:500, num_elements))

# Sort arrays across all MPI ranks
alg = SIHSort(comm)
sorted_local_array = mpisort!(local_array; alg=alg)

# Print each local array sequentially
for i in 0:nranks - 1
rank == i && @show rank sorted_local_array alg.stats
MPI.Barrier(comm)
end
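
Since the script calls `MPI.Init()`, it has to be started through an MPI launcher with multiple ranks. A hedged launch sketch, assuming MPI.jl ≥ 0.20 (where `mpiexec()` returns a runnable launcher command) and CUDA.jl installed in the active project:

```julia
# Hypothetical launch snippet, run from the package root - not part of the test itself
using MPI

run(`$(MPI.mpiexec()) -n 2 $(Base.julia_cmd()) --project=. test/basic_gpu.jl`)
```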
2 changes: 1 addition & 1 deletion test/largescale.jl
@@ -22,7 +22,7 @@ function largescale(num_elements=500_000)
# Generate local array on each MPI rank - even with different number of elements
rng = Xoshiro(rank)
num_elements = 500_000 + rank * (num_elements ÷ 20)
local_array = rand(rng, 1:10 * num_elements, num_elements)
local_array = rand(rng, Int32(1):Int32(10 * num_elements), num_elements)

# Sort arrays across all MPI ranks
alg = SIHSort(comm)
46 changes: 46 additions & 0 deletions test/largescale_gpu.jl
@@ -0,0 +1,46 @@
# File : largescale_gpu.jl
# License: MIT
# Author : Andrei Leonard Nicusan <a.l.nicusan@gmail.com>
# Date : 13.10.2022


using MPI
using MPISort
using Random

using CUDA


# Initialise MPI, get communicator for all ranks, rank index, number of ranks
MPI.Init()

comm = MPI.COMM_WORLD
rank = MPI.Comm_rank(comm)
nranks = MPI.Comm_size(comm)


function largescale(num_elements=500_000)

# Generate local array on each MPI rank - even with different number of elements
rng = Xoshiro(rank)
num_elements = 500_000 + rank * (num_elements ÷ 20)
local_array = CuArray(rand(rng, Int32(1):Int32(10 * num_elements), num_elements))

# Sort arrays across all MPI ranks
alg = SIHSort(comm)
@time sorted_local_array = mpisort!(local_array; alg=alg)

# Print each local array sequentially
for i in 0:nranks - 1
rank == i && @show rank alg.stats
MPI.Barrier(comm)
end
end


# Run once to compile everything, then again to benchmark
rank == 0 && println("First run, compiling...")
largescale()

rank == 0 && println("Single benchmark run")
largescale()
