From 64ab31ec3e945c515e08dc528318003ebb634360 Mon Sep 17 00:00:00 2001
From: Eric Lunderberg
Date: Fri, 15 Sep 2023 04:20:57 -0500
Subject: [PATCH] [UnitTest][Metal] Parametrize allreduce GPU tests (#15749)

* [UnitTest][Metal] Parametrize allreduce GPU tests

As a first step to addressing the Metal codegen errors that required
the reversion in https://github.com/apache/tvm/pull/15725, this commit
parametrizes the unit tests for `allreduce`.

While these tests are parametrized with
`@tvm.testing.parametrize_targets("cuda", "metal")`, the automatic
`tvm.testing.requires_metal` marker inserted for the metal
parametrization will cause them to be skipped if the metal runtime is
unavailable, which includes the current CI.

* Updated the filename and the device used when testing on metal
---
 ...st_allreduce_cuda.py => test_allreduce.py} | 109 +++++++++---------
 1 file changed, 57 insertions(+), 52 deletions(-)
 rename tests/python/unittest/{test_allreduce_cuda.py => test_allreduce.py} (50%)

diff --git a/tests/python/unittest/test_allreduce_cuda.py b/tests/python/unittest/test_allreduce.py
similarity index 50%
rename from tests/python/unittest/test_allreduce_cuda.py
rename to tests/python/unittest/test_allreduce.py
index e9a8ef81cf1b..708384daf027 100644
--- a/tests/python/unittest/test_allreduce_cuda.py
+++ b/tests/python/unittest/test_allreduce.py
@@ -46,61 +46,66 @@ def reduce_max(a: T.handle, b: T.handle, d1: T.int32, d2: T.int32, d3: T.int32)
             B[vi, vj, vk] = T.max(B[vi, vj, vk], A[vi, vj, vk, vl])
 
 
-@tvm.testing.requires_gpu
-@tvm.testing.requires_cuda
-def test_allreduce_cuda():
-    def check_sum(d1: int, d2: int, d3: int):
-        _, _, _d1, _d2, _d3 = reduce.params
-        mod = reduce.specialize({_d1: d1, _d2: d2, _d3: d3})
-        sch = tvm.tir.Schedule(mod)
-        blk = sch.get_block("reduce")
-        i, j, k, l = sch.get_loops(blk)
-        sch.bind(i, "blockIdx.x")
-        sch.bind(j, "threadIdx.z")
-        sch.bind(k, "threadIdx.y")
-        sch.bind(l, "threadIdx.x")
-        f = tvm.build(sch.mod["main"], target="cuda")
-
-        # prepare input and output array
-        a_np = np.random.rand(1, d1, d2, d3).astype("float32")
-        b_np = a_np.sum(axis=-1).astype("float32")
-        a = tvm.nd.array(a_np, tvm.cuda(0))
-        b = tvm.nd.array(np.zeros_like(b_np), tvm.cuda(0))
-
-        # launch kernel
-        f(a, b)
-        tvm.testing.assert_allclose(b.numpy(), b_np, rtol=1e-6, atol=1e-6)
-
-    def check_max(d1: int, d2: int, d3: int):
-        _, _, _d1, _d2, _d3 = reduce_max.params
-        mod = reduce_max.specialize({_d1: d1, _d2: d2, _d3: d3})
-        sch = tvm.tir.Schedule(mod)
-        blk = sch.get_block("reduce")
-        i, j, k, l = sch.get_loops(blk)
-        sch.bind(i, "blockIdx.x")
-        sch.bind(j, "threadIdx.z")
-        sch.bind(k, "threadIdx.y")
-        sch.bind(l, "threadIdx.x")
-        f = tvm.build(sch.mod["main"], target="cuda")
-
-        # prepare input and output array
-        a_np = -np.random.rand(1, d1, d2, d3).astype("float32")
-        b_np = a_np.max(axis=-1).astype("float32")
-        a = tvm.nd.array(a_np, tvm.cuda(0))
-        b = tvm.nd.array(np.zeros_like(b_np), tvm.cuda(0))
-
-        # launch kernel
-        f(a, b)
-        tvm.testing.assert_allclose(b.numpy(), b_np, rtol=1e-6, atol=1e-6)
-
+def generate_param_sets():
     for d1 in range(1, 5):
         for d2 in range(1, 5):
             for d3 in [2, 4, 8, 12, 16, 32, 48, 64, 100, 128, 201, 256, 512, 1024]:
-                if d1 * d2 * d3 > 1024:
-                    continue
-                check_sum(d1, d2, d3)
-                check_max(d1, d2, d3)
+                if d1 * d2 * d3 < 1024:
+                    yield (d1, d2, d3)
+
+
+dims = tvm.testing.parameter(*generate_param_sets())
+
+
+@tvm.testing.parametrize_targets("cuda", "metal")
+def test_allreduce_sum(dims, target, dev):
+    d1, d2, d3 = dims
+    _, _, _d1, _d2, _d3 = reduce.params
+    mod = reduce.specialize({_d1: d1, _d2: d2, _d3: d3})
+    sch = tvm.tir.Schedule(mod)
+    blk = sch.get_block("reduce")
+    i, j, k, l = sch.get_loops(blk)
+    sch.bind(i, "blockIdx.x")
+    sch.bind(j, "threadIdx.z")
+    sch.bind(k, "threadIdx.y")
+    sch.bind(l, "threadIdx.x")
+    f = tvm.build(sch.mod["main"], target=target)
+
+    # prepare input and output array
+    a_np = np.random.rand(1, d1, d2, d3).astype("float32")
+    b_np = a_np.sum(axis=-1).astype("float32")
+    a = tvm.nd.array(a_np, dev)
+    b = tvm.nd.array(np.zeros_like(b_np), dev)
+
+    # launch kernel
+    f(a, b)
+    tvm.testing.assert_allclose(b.numpy(), b_np, rtol=1e-6, atol=1e-6)
+
+
+@tvm.testing.parametrize_targets("cuda", "metal")
+def test_allreduce_max(dims, target, dev):
+    d1, d2, d3 = dims
+    _, _, _d1, _d2, _d3 = reduce_max.params
+    mod = reduce_max.specialize({_d1: d1, _d2: d2, _d3: d3})
+    sch = tvm.tir.Schedule(mod)
+    blk = sch.get_block("reduce")
+    i, j, k, l = sch.get_loops(blk)
+    sch.bind(i, "blockIdx.x")
+    sch.bind(j, "threadIdx.z")
+    sch.bind(k, "threadIdx.y")
+    sch.bind(l, "threadIdx.x")
+    f = tvm.build(sch.mod["main"], target=target)
+
+    # prepare input and output array
+    a_np = -np.random.rand(1, d1, d2, d3).astype("float32")
+    b_np = a_np.max(axis=-1).astype("float32")
+    a = tvm.nd.array(a_np, dev)
+    b = tvm.nd.array(np.zeros_like(b_np), dev)
+
+    # launch kernel
+    f(a, b)
+    tvm.testing.assert_allclose(b.numpy(), b_np, rtol=1e-6, atol=1e-6)
 
 
 if __name__ == "__main__":
-    test_allreduce_cuda()
+    tvm.testing.main()
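
For reference, a minimal sketch of the mechanism this patch relies on:
`tvm.testing.parametrize_targets` expands a test function into one
variant per target, injecting the target string and a matching device,
and each variant automatically receives the corresponding `requires_*`
marker, so the metal variant is skipped (rather than failed) wherever
the Metal runtime is unavailable. The test name and body below are
hypothetical, for illustration only; only the decorator and the
`tvm.testing` helpers come from the patch above.

import numpy as np

import tvm
import tvm.testing


# Expands into test_roundtrip[cuda] and test_roundtrip[metal]; the metal
# variant is auto-marked with `tvm.testing.requires_metal`, so pytest
# skips it when no Metal runtime is present (as in the current CI).
@tvm.testing.parametrize_targets("cuda", "metal")
def test_roundtrip(target, dev):
    # `dev` is the device matching `target`; this is why the patch can
    # replace the old test's hard-coded tvm.cuda(0) with `dev`.
    a_np = np.arange(16, dtype="float32")
    a = tvm.nd.array(a_np, dev)
    tvm.testing.assert_allclose(a.numpy(), a_np)


if __name__ == "__main__":
    tvm.testing.main()

Writing the test body against the injected `dev` and `target`, as the
rewritten `test_allreduce_sum` and `test_allreduce_max` above do, lets a
single test body serve every parametrized backend.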