From 8c915e91d3dbe2470504a1236ba2b0faa9c76812 Mon Sep 17 00:00:00 2001
From: Edwin Solis <edwinsolisf12@gmail.com>
Date: Tue, 16 Sep 2025 21:28:27 -0700
Subject: [PATCH] Updated benchmarks and docs

---
 benchmarks/src/README.md                      | 14 ++-
 benchmarks/src/graphs.py                      | 39 ++++++--
 benchmarks/src/pytest_benchmark/common.py     | 20 ++--
 .../src/pytest_benchmark/test_blackscholes.py | 27 ++++++
 .../src/pytest_benchmark/test_elementwise.py  |  5 +-
 benchmarks/src/pytest_benchmark/test_fft.py   |  7 +-
 benchmarks/src/pytest_benchmark/test_gemm.py  |  7 +-
 .../src/pytest_benchmark/test_kmeans.py       | 91 ++++++++++++++++++-
 .../src/pytest_benchmark/test_linalg.py       | 44 ++++++---
 .../src/pytest_benchmark/test_mandelbrot.py   | 48 +++++++++-
 .../pytest_benchmark/test_montecarlo_pi.py    |  8 +-
 benchmarks/src/pytest_benchmark/test_nbody.py |  6 +-
 benchmarks/src/pytest_benchmark/test_nn.py    | 85 ++++++++++++++++-
 .../src/pytest_benchmark/test_random.py       | 10 +-
 benchmarks/src/requirements.txt               |  1 +
 docs/installation.rst                         | 23 ++++-
 docs/release_notes.md                         |  4 +-
 17 files changed, 386 insertions(+), 53 deletions(-)

diff --git a/benchmarks/src/README.md b/benchmarks/src/README.md
index ae84c4e..a1c7beb 100644
--- a/benchmarks/src/README.md
+++ b/benchmarks/src/README.md
@@ -3,10 +3,18 @@ Benchmarks
 
 ## Setting up environment
 
+Create a python environment and install pytest and the compute libraries:
 ```sh
     python -m pip install -r requirements.txt
 ```
 
+If running `dpnp` on NVIDIA or AMD devices, you must install the oneAPI toolkit along with the corresponding oneAPI plugin:
+
+```sh
+    # install oneapi toolkit and plugins
+    source /opt/intel/oneapi/setvars.sh
+```
+
 ## Benchmark parameters
 
 The benchmark packages, rounds, array sizes, and numeric type may be specified on the constants at the top of [pytest_benchmark/common.py](pytest_benchmark/common.py).
@@ -20,16 +28,18 @@ These are the steps to run the benchmarks, and produce the graphs
 
 Run the benchmarks and store the results in `results.json`
 ```sh
-    pytest .\pytest_benchmark --benchmark-json=results.json
+    pytest ./pytest_benchmark --benchmark-json=results.json
 ```
 
 To create graphs and store the timing results after creating the `results.json`, run:
 ```sh
+    mkdir img
     python graphs.py
 ```
 
 To modify the tests being shown, modify the `TESTS` list at the top of the `graphs.py` file.
-To modify the labels shown, modify `PKG_LABELS`
+To modify the legend of the package labels shown, modify `PKG_LABELS`.
+To modify the name of the tests shown, modify `TESTS_GRAPH_NAME`.
 To modify the hardware display, modify `HARDWARE` 
 
 ## Notes
diff --git a/benchmarks/src/graphs.py b/benchmarks/src/graphs.py
index 1b2e4e4..ab32b5c 100644
--- a/benchmarks/src/graphs.py
+++ b/benchmarks/src/graphs.py
@@ -7,7 +7,7 @@
 BENCHMARKS_JSON = "results.json"
 
 # Hardware details shown in title
-HARDWARE = "AMD Ryzen 9 9900X 12-Core Processor 63032 MB (fp64 fp16)\noneAPI 2025.1.3 Intel(R) OpenCL Graphics: Intel(R) Arc(TM) B580 Graphics, 11873 MB (fp64 fp16)"
+HARDWARE = "Intel Xeon Gold 5315Y (8 Processors) @ 3.201GHz 63032 MB\noneAPI 2025.2.1 NVIDIA RTX A4000, 16222 MB, CUDA 12.8 Compute 8.6"
 
 # Show speedup in graph
 SHOW_NUMBERS = True
@@ -16,12 +16,13 @@
 ROUND_NUMBERS = 1
 
 # package list in graph order; arrayfire packages are added later
-PKG_NAMES = ["numpy", "dpnp", "cupy"]
+PKG_NAMES = ["numpy", "dpnp", "cupy", "cupynumeric"]
 
 # color used in graphs
 PKG_COLOR = {
     "numpy": "tab:blue",
     "cupy": "tab:green",
+    "cupynumeric": "green",
     "dpnp": "tab:red",
     "afcpu": "tab:orange",
     "afopencl": "tab:orange",
@@ -32,8 +33,9 @@
 # labels displayed in the graph
 PKG_LABELS = {
     "numpy": "numpy[cpu]",
-    "dpnp": "dpnp[level_zero:gpu]",
+    "dpnp": "dpnp[cuda:gpu]",
     "cupy": "cupy",
+    "cupynumeric": "cupynumeric",
     "afcpu": "afcpu",
     "afcuda": "afcuda",
     "afopencl": "afopencl[opencl:gpu]",
@@ -44,16 +46,16 @@
 
 # Tests to be shown in graphs
 TESTS = [
-    "qr",
+    "group_elementwise",
     "neural_network",
-    "gemm",
+    "black_scholes",
     "mandelbrot",
     "nbody",
     "pi",
-    "black_scholes",
-    "fft",
     "normal",
-    "group_elementwise",
+    "gemm",
+    "fft",
+    "qr",
     # Other tests
     # 'svd
     # 'cholesky',
@@ -63,6 +65,25 @@
     # 'inv'
 ]
 
+# Reverse list so it appears in order on graph
+TESTS.reverse()
+
+TESTS_GRAPH_NAME = {
+    "group_elementwise": "Group_elementwise (JIT)",
+    "neural_network": "Neural Network (JIT)",
+    "black_scholes": "Black Scholes (JIT)",
+    "mandelbrot": "Mandelbrot (JIT)",
+    "nbody": "Nbody (JIT)",
+    "pi": "Montecarlo Pi (JIT)",
+    "normal": "Normal Distribution",
+    "gemm": "General Matrix Multiplication",
+    "fft": "2D FFT",
+    "qr": "QR Decomposition",
+}
+
+# Tests without a display name fall back to their raw identifier
+for name in TESTS:
+    TESTS_GRAPH_NAME.setdefault(name, name)
 
 def get_benchmark_data():
     results = {}
@@ -189,7 +210,7 @@ def generate_group_graph(test_list=None, show_numbers=False, filename="compariso
 
     xlabels = []
     for test in tests:
-        xlabels.append(test + "\n" + descriptions[test])
+        xlabels.append(TESTS_GRAPH_NAME[test] + "\n" + descriptions[test])
 
     ax.set_xlabel("Speedup")
     ax.set_xscale("log")
diff --git a/benchmarks/src/pytest_benchmark/common.py b/benchmarks/src/pytest_benchmark/common.py
index 9e82a47..f6dc2ef 100644
--- a/benchmarks/src/pytest_benchmark/common.py
+++ b/benchmarks/src/pytest_benchmark/common.py
@@ -29,6 +29,7 @@
 import math
 
 import cupy
+import cupynumeric
 import dpctl
 import dpnp
 import numpy as np
@@ -38,19 +39,19 @@
 
 # modify parameters for most benchmarks
 ROUNDS = 30
-NSIZE = 2**13
+NSIZE = 2**11
 NNSIZE = NSIZE**2
 DTYPE = "float32"
 
 # comment a line to remove that package from testing
 PKGDICT = {
-    "dpnp": dpnp,
     "numpy": np,
     "cupy": cupy,
     # "afcpu": af,
     "afopencl": af,
-    "afcuda": af,
     "afoneapi": af,
+    "dpnp": dpnp,
+    "cupynumeric": cupynumeric,
 }
 
 PKGS = []
@@ -66,11 +67,13 @@ def initialize_package(PKG_ID):
     pkg = PKGDICT[PKG_ID]
 
     try:
+        # Free all unused memory
+        gc.collect()
         af.device_gc()
         mempool = cupy.get_default_memory_pool()
         mempool.free_all_blocks()
-    except:
-        pass
+    except Exception as e:
+        print(e)
 
     if PKG_ID == "afcpu":
         af.set_backend(af.BackendType.cpu)
@@ -98,8 +101,7 @@ def initialize_package(PKG_ID):
         print(cupy.cuda.Device())
         mempool = cupy.get_default_memory_pool()
         mempool.free_all_blocks()
+    elif PKG_ID == "cupynumeric":
+        pass
     else:
-        raise NotImplementedError()
-
-    # Free all unused memory
-    gc.collect()
+        raise NotImplementedError()
\ No newline at end of file
diff --git a/benchmarks/src/pytest_benchmark/test_blackscholes.py b/benchmarks/src/pytest_benchmark/test_blackscholes.py
index fae2fc5..7daabc7 100644
--- a/benchmarks/src/pytest_benchmark/test_blackscholes.py
+++ b/benchmarks/src/pytest_benchmark/test_blackscholes.py
@@ -92,6 +92,29 @@ def cnd(x):
 
     return (C, P)
 
+def black_scholes_cupynumeric(S, X, R, V, T):
+    """Evaluate the call (C) and put (P) expressions with cupynumeric; returns (C, P).
+
+    S = underlying stock price, X = strike price, R = risk-free rate of interest,
+    V = volatility, T = time to maturity.
+    """
+    def cnd(x):
+        temp = x > 0  # mask: selects the first term where x > 0, the second elsewhere
+        erf = lambda arr: cupynumeric.exp(-arr * arr)  # NOTE(review): this is exp(-x*x), not a true erf - confirm intended
+        return temp * (0.5 + erf(x / sqrt2) / 2) + (1 - temp) * (0.5 - erf((-x) / sqrt2) / 2)
+
+    d1 = cupynumeric.log(S / X)
+    d1 = d1 + (R + (V * V) * 0.5) * T
+    d1 = d1 / (V * cupynumeric.sqrt(T))
+
+    d2 = d1 - (V * cupynumeric.sqrt(T))
+    cnd_d1 = cnd(d1)
+    cnd_d2 = cnd(d2)
+
+    C = S * cnd_d1 - (X * cupynumeric.exp((-R) * T) * cnd_d2)
+    P = X * cupynumeric.exp((-R) * T) * (1 - cnd_d2) - (S * (1 - cnd_d1))
+
+    return (C, P)
 
 def black_scholes_arrayfire(S, X, R, V, T):
     def cnd(x):
@@ -137,6 +160,9 @@ def generate_arrays(pkgid, count):
     elif "numpy" == pkg:
         for i in range(count):
             arr_list.append(np.random.rand(NSIZE, NSIZE).astype(DTYPE))
+    elif "cupynumeric" == pkg:
+        for i in range(count):
+            arr_list.append(cupynumeric.random.rand(NSIZE, NSIZE).astype(DTYPE))
 
     return arr_list
 
@@ -146,4 +172,5 @@ def generate_arrays(pkgid, count):
     "numpy": black_scholes_numpy,
     "cupy": black_scholes_cupy,
     "arrayfire": black_scholes_arrayfire,
+    "cupynumeric": black_scholes_cupynumeric
 }
diff --git a/benchmarks/src/pytest_benchmark/test_elementwise.py b/benchmarks/src/pytest_benchmark/test_elementwise.py
index 0933ce6..771baee 100644
--- a/benchmarks/src/pytest_benchmark/test_elementwise.py
+++ b/benchmarks/src/pytest_benchmark/test_elementwise.py
@@ -52,7 +52,7 @@ def func_cupy(arr):
             cupy.cuda.runtime.deviceSynchronize()
             return x
 
-        GROUP_FUNCS = {"numpy": func, "cupy": func_cupy, "arrayfire": func_af, "dpnp": func}
+        GROUP_FUNCS = {"numpy": func, "cupy": func_cupy, "arrayfire": func_af, "dpnp": func, "cupynumeric": func}
 
         benchmark.extra_info["description"] = f"{NSIZE}x{NSIZE} Matrix"
         result = benchmark.pedantic(
@@ -312,5 +312,8 @@ def generate_arrays(pkgid, count):
     elif "numpy" == pkg:
         for i in range(count):
             arr_list.append(np.random.rand(NSIZE, NSIZE).astype(DTYPE))
+    elif "cupynumeric" == pkg:
+        for i in range(count):
+            arr_list.append(cupynumeric.random.rand(NSIZE, NSIZE).astype(DTYPE))
 
     return arr_list
diff --git a/benchmarks/src/pytest_benchmark/test_fft.py b/benchmarks/src/pytest_benchmark/test_fft.py
index 096573d..ac234f2 100644
--- a/benchmarks/src/pytest_benchmark/test_fft.py
+++ b/benchmarks/src/pytest_benchmark/test_fft.py
@@ -50,6 +50,9 @@ def generate_arrays(pkgid, count):
     elif "numpy" == pkg:
         for i in range(count):
             arr_list.append(np.random.rand(NSIZE, NSIZE).astype(DTYPE))
+    elif "cupynumeric" == pkg:
+        for i in range(count):
+            arr_list.append(cupynumeric.random.rand(NSIZE, NSIZE).astype(DTYPE))
 
     return arr_list
 
@@ -86,5 +89,7 @@ def fft_cupy(arr):
     cupy.cuda.runtime.deviceSynchronize()
     return res
 
+def fft_cupynumeric(arr):  # benchmark target: cupynumeric FFT of arr; NOTE(review): no explicit sync as in fft_cupy - confirm
+    return cupynumeric.fft.fft(arr)  # NOTE(review): graphs.py labels this test "2D FFT" - confirm fft (not fft2) is intended
 
-FUNCS = {"dpnp": fft_dpnp, "numpy": fft_np, "cupy": fft_cupy, "arrayfire": fft_af}
+FUNCS = {"dpnp": fft_dpnp, "numpy": fft_np, "cupy": fft_cupy, "arrayfire": fft_af, "cupynumeric": fft_cupynumeric}
diff --git a/benchmarks/src/pytest_benchmark/test_gemm.py b/benchmarks/src/pytest_benchmark/test_gemm.py
index e62d5a1..f66bbd4 100644
--- a/benchmarks/src/pytest_benchmark/test_gemm.py
+++ b/benchmarks/src/pytest_benchmark/test_gemm.py
@@ -81,6 +81,9 @@ def generate_arrays(pkgid, count):
         np.random.rand(1)
         for i in range(count):
             arr_list.append(np.random.rand(NSIZE, NSIZE).astype(DTYPE))
+    elif "cupynumeric" == pkg:
+        for i in range(count):
+            arr_list.append(cupynumeric.random.rand(NSIZE, NSIZE).astype(DTYPE))
 
     return arr_list
 
@@ -117,5 +120,7 @@ def gemm_cupy(A, B, C):
     cupy.cuda.runtime.deviceSynchronize()
     return C
 
+def gemm_cupynumeric(A, B, C):  # returns alpha * matmul(A, B) + beta * C; alpha and beta are defined outside this function
+    return alpha * cupynumeric.matmul(A, B) + beta * C  # NOTE(review): no explicit sync, unlike gemm_cupy - confirm
 
-FUNCS = {"numpy": gemm_np, "cupy": gemm_cupy, "arrayfire": gemm_af, "dpnp": gemm_dpnp}
+FUNCS = {"numpy": gemm_np, "cupy": gemm_cupy, "arrayfire": gemm_af, "dpnp": gemm_dpnp, "cupynumeric": gemm_cupynumeric}
diff --git a/benchmarks/src/pytest_benchmark/test_kmeans.py b/benchmarks/src/pytest_benchmark/test_kmeans.py
index 53c5183..291c248 100644
--- a/benchmarks/src/pytest_benchmark/test_kmeans.py
+++ b/benchmarks/src/pytest_benchmark/test_kmeans.py
@@ -12,7 +12,8 @@ class TestKmeans:
     def test_kmeans(self, benchmark, pkgid):
         initialize_package(pkgid)
         pkg = PKGDICT[pkgid]
-        kmean_class = {"dpnp": kmeans_dpnp, "numpy": kmeans_numpy, "cupy": kmeans_cupy, "arrayfire": kmeans_af}
+        kmean_class = {"dpnp": kmeans_dpnp, "numpy": kmeans_numpy, "cupy": kmeans_cupy, "arrayfire": kmeans_af,
+        "cupynumeric": kmeans_cupynumeric}
         obj = kmean_class[pkg.__name__]()
 
         benchmark.extra_info["description"] = f"{NSAMPLES}x{NFEATURES} over {K} centers"
@@ -189,6 +190,94 @@ def kmeans(self):
         return centroids, cluster_assignments
 
 
+
+class kmeans_cupynumeric:  # K-Means benchmark data and steps implemented with cupynumeric
+    def __init__(self):
+        self.data = cupynumeric.random.random((NSAMPLES, NFEATURES))  # random sample matrix, (NSAMPLES, NFEATURES)
+        self.centroid_indices = cupynumeric.random.choice(self.data.shape[0], K, replace=False)  # K distinct row indices
+
+    def initialize_centroids(self):
+        """
+        Return the starting centroids for the benchmark run.
+
+        The rows are taken from self.data at the indices drawn once in
+        __init__ (self.centroid_indices); no arguments are needed.
+
+        Returns:
+            The selected rows of self.data, one per entry of
+            self.centroid_indices.
+        """
+
+        return self.data[self.centroid_indices, :]
+
+    def assign_to_clusters(self, centroids):
+        """
+        Assign every row of self.data to its nearest centroid.
+
+        Args:
+            centroids: Current centroids; broadcast against self.data, so
+                presumably shaped (K, NFEATURES) - verify against callers.
+
+        Returns:
+            Index of the closest centroid (Euclidean distance) per data row.
+        """
+        distances = cupynumeric.sqrt(((self.data[:, cupynumeric.newaxis, :] - centroids[cupynumeric.newaxis, :, :]) ** 2).sum(axis=2))
+        cluster_assignments = cupynumeric.argmin(distances, axis=1)
+        return cluster_assignments
+
+    def update_centroids(self, cluster_assignments):
+        """
+        Recompute each centroid as the mean of the rows assigned to it.
+
+        Args:
+            cluster_assignments: Per-row cluster index, compared against
+                each value in range(K) to select that cluster's rows.
+
+        Returns:
+            A (K, NFEATURES) array of new centroids; a cluster with no
+            assigned rows keeps the all-zero row it was initialized with.
+        """
+        new_centroids = cupynumeric.zeros((K, self.data.shape[1]))
+        for i in range(K):
+            points_in_cluster = self.data[cluster_assignments == i]
+            if len(points_in_cluster) > 0:
+                new_centroids[i] = cupynumeric.mean(points_in_cluster, axis=0)
+        return new_centroids
+
+    def kmeans(self):
+        """
+        Run the K-Means loop on self.data and return the final state.
+
+        Takes no arguments: the cluster count, iteration cap and convergence
+        threshold come from the K, ITERATIONS and TOLERANCE names defined
+        outside this class.
+
+        The loop stops early once the norm of the centroid change drops
+        below TOLERANCE.
+
+        Returns:
+            tuple: (centroids, cluster_assignments) from the last completed
+            iteration; cluster_assignments is None if ITERATIONS is 0.
+        """
+        centroids = self.initialize_centroids()
+        cluster_assignments = None
+
+        for i in range(ITERATIONS):
+            old_centroids = cupynumeric.copy(centroids)
+
+            # E-step: Assign points to clusters
+            cluster_assignments = self.assign_to_clusters(centroids)
+
+            # M-step: Update centroids
+            centroids = self.update_centroids(cluster_assignments)
+
+            # Check for convergence
+            if cupynumeric.linalg.norm(centroids - old_centroids) < TOLERANCE:
+                break
+
+        return centroids, cluster_assignments
+
+
 class kmeans_af:
     def __init__(self):
         self.data = af.Array(np.random.random((NSAMPLES, NFEATURES)).flatten().tolist(), shape=(NSAMPLES, NFEATURES))
diff --git a/benchmarks/src/pytest_benchmark/test_linalg.py b/benchmarks/src/pytest_benchmark/test_linalg.py
index 2b75601..870738b 100644
--- a/benchmarks/src/pytest_benchmark/test_linalg.py
+++ b/benchmarks/src/pytest_benchmark/test_linalg.py
@@ -63,7 +63,12 @@ def generate_arrays(pkgid, count, posdef=False):
             if posdef:
                 x = x @ x.T + x.T @ x + eps
             arr_list.append(x)
-
+    elif "cupynumeric" == pkg:
+        for i in range(count):
+            x = cupynumeric.random.rand(NSIZE, NSIZE).astype(DTYPE)
+            if posdef:
+                x = x @ x.T + x.T @ x + eps
+            arr_list.append(x)
     return arr_list
 
 
@@ -88,6 +93,8 @@ def svd_cupy(arr):
     cupy.cuda.runtime.deviceSynchronize()
     return x
 
+def svd_cupynumeric(arr):  # benchmark target: SVD of arr via cupynumeric
+    return cupynumeric.linalg.svd(arr)  # NOTE(review): no explicit sync, unlike svd_cupy - confirm
 
 def qr_np(arr):
     return np.linalg.qr(arr)
@@ -110,6 +117,8 @@ def qr_cupy(arr):
     cupy.cuda.runtime.deviceSynchronize()
     return x
 
+def qr_cupynumeric(arr):  # benchmark target: QR decomposition of arr via cupynumeric
+    return cupynumeric.linalg.qr(arr)  # NOTE(review): no explicit sync, unlike qr_cupy - confirm
 
 def cholesky_np(arr):
     return np.linalg.cholesky(arr)
@@ -131,11 +140,8 @@ def cholesky_cupy(arr):
     cupy.cuda.runtime.deviceSynchronize()
     return x
 
-
-def qr_cupy(arr):
-    x = cupy.linalg.qr(arr)
-    cupy.cuda.runtime.deviceSynchronize()
-    return x
+def cholesky_cupynumeric(arr):  # benchmark target: Cholesky factorization of arr via cupynumeric
+    return cupynumeric.linalg.cholesky(arr)  # NOTE(review): no explicit sync, unlike cholesky_cupy - confirm
 
 
 def inv_np(arr):
@@ -147,7 +153,7 @@ def inv_dpnp(arr):
 
 
 def inv_af(arr):
-    x, info = af.inverse(arr)
+    x = af.inverse(arr)
     af.eval(x)
     af.sync()
     return x
@@ -158,6 +164,8 @@ def inv_cupy(arr):
     cupy.cuda.runtime.deviceSynchronize()
     return x
 
+def inv_cupynumeric(arr):  # benchmark target: matrix inverse of arr via cupynumeric
+    return cupynumeric.linalg.inv(arr)  # NOTE(review): no explicit sync, unlike inv_cupy - confirm
 
 def det_np(arr):
     return np.linalg.det(arr)
@@ -178,6 +186,8 @@ def det_cupy(arr):
     cupy.cuda.runtime.deviceSynchronize()
     return x
 
+def det_cupynumeric(arr):  # benchmark target: determinant of arr via cupynumeric
+    return cupynumeric.linalg.det(arr)  # NOTE(review): no explicit sync, unlike det_cupy - confirm
 
 def norm_np(arr):
     return np.linalg.norm(arr)
@@ -198,6 +208,8 @@ def norm_cupy(arr):
     cupy.cuda.runtime.deviceSynchronize()
     return x
 
+def norm_cupynumeric(arr):  # benchmark target: norm of arr via cupynumeric
+    return cupynumeric.linalg.norm(arr)  # NOTE(review): no explicit sync, unlike norm_cupy - confirm
 
 @pytest.mark.parametrize("pkgid", IDS, ids=IDS)
 class TestLinalg:
@@ -208,7 +220,8 @@ def test_cholesky(self, benchmark, pkgid):
         benchmark.extra_info["description"] = f"{NSIZE}x{NSIZE} Matrix"
         pkg = PKGDICT[pkgid]
 
-        CHOLESKY_FUNCS = {"numpy": cholesky_np, "cupy": cholesky_cupy, "arrayfire": cholesky_af, "dpnp": cholesky_dpnp}
+        CHOLESKY_FUNCS = {"numpy": cholesky_np, "cupy": cholesky_cupy, "arrayfire": cholesky_af, "dpnp": cholesky_dpnp, 
+            "cupynumeric": cholesky_cupynumeric }
         result = benchmark.pedantic(
             target=CHOLESKY_FUNCS[pkg.__name__], setup=setup, rounds=ROUNDS, iterations=ITERATIONS
         )
@@ -220,7 +233,8 @@ def test_svd(self, benchmark, pkgid):
         benchmark.extra_info["description"] = f"{NSIZE}x{NSIZE} Matrix"
         pkg = PKGDICT[pkgid]
 
-        SVD_FUNCS = {"numpy": svd_np, "cupy": svd_cupy, "arrayfire": svd_af, "dpnp": svd_dpnp}
+        SVD_FUNCS = {"numpy": svd_np, "cupy": svd_cupy, "arrayfire": svd_af, "dpnp": svd_dpnp, 
+            "cupynumeric": svd_cupynumeric }
         result = benchmark.pedantic(target=SVD_FUNCS[pkg.__name__], setup=setup, rounds=ROUNDS, iterations=ITERATIONS)
 
     def test_qr(self, benchmark, pkgid):
@@ -230,7 +244,8 @@ def test_qr(self, benchmark, pkgid):
         benchmark.extra_info["description"] = f"{NSIZE}x{NSIZE} Matrix"
         pkg = PKGDICT[pkgid]
 
-        QR_FUNCS = {"numpy": qr_np, "cupy": qr_cupy, "arrayfire": qr_af, "dpnp": qr_dpnp}
+        QR_FUNCS = {"numpy": qr_np, "cupy": qr_cupy, "arrayfire": qr_af, "dpnp": qr_dpnp, 
+            "cupynumeric": qr_cupynumeric }
         result = benchmark.pedantic(target=QR_FUNCS[pkg.__name__], setup=setup, rounds=ROUNDS, iterations=ITERATIONS)
 
     def test_inv(self, benchmark, pkgid):
@@ -240,7 +255,8 @@ def test_inv(self, benchmark, pkgid):
         benchmark.extra_info["description"] = f"{NSIZE}x{NSIZE} Matrix"
         pkg = PKGDICT[pkgid]
 
-        INV_FUNCS = {"numpy": inv_np, "cupy": inv_cupy, "arrayfire": inv_af, "dpnp": inv_dpnp}
+        INV_FUNCS = {"numpy": inv_np, "cupy": inv_cupy, "arrayfire": inv_af, "dpnp": inv_dpnp, 
+            "cupynumeric": inv_cupynumeric }
         result = benchmark.pedantic(target=INV_FUNCS[pkg.__name__], setup=setup, rounds=ROUNDS, iterations=ITERATIONS)
 
     def test_det(self, benchmark, pkgid):
@@ -250,7 +266,8 @@ def test_det(self, benchmark, pkgid):
         benchmark.extra_info["description"] = f"{NSIZE}x{NSIZE} Matrix"
         pkg = PKGDICT[pkgid]
 
-        DET_FUNCS = {"numpy": det_np, "cupy": det_cupy, "arrayfire": det_af, "dpnp": det_dpnp}
+        DET_FUNCS = {"numpy": det_np, "cupy": det_cupy, "arrayfire": det_af, "dpnp": det_dpnp, 
+            "cupynumeric": det_cupynumeric }
         result = benchmark.pedantic(target=DET_FUNCS[pkg.__name__], setup=setup, rounds=ROUNDS, iterations=ITERATIONS)
 
     def test_norm(self, benchmark, pkgid):
@@ -260,5 +277,6 @@ def test_norm(self, benchmark, pkgid):
         benchmark.extra_info["description"] = f"{NSIZE}x{NSIZE} Matrix"
         pkg = PKGDICT[pkgid]
 
-        NORM_FUNCS = {"numpy": norm_np, "cupy": norm_cupy, "arrayfire": norm_af, "dpnp": norm_dpnp}
+        NORM_FUNCS = {"numpy": norm_np, "cupy": norm_cupy, "arrayfire": norm_af, "dpnp": norm_dpnp, 
+            "cupynumeric": norm_cupynumeric }
         result = benchmark.pedantic(target=NORM_FUNCS[pkg.__name__], setup=setup, rounds=ROUNDS, iterations=ITERATIONS)
diff --git a/benchmarks/src/pytest_benchmark/test_mandelbrot.py b/benchmarks/src/pytest_benchmark/test_mandelbrot.py
index a9cd73b..1ceba09 100644
--- a/benchmarks/src/pytest_benchmark/test_mandelbrot.py
+++ b/benchmarks/src/pytest_benchmark/test_mandelbrot.py
@@ -12,8 +12,8 @@
 xmax = 2
 ymin = -2
 ymax = 2
-xn = NSIZE
-yn = NSIZE
+xn = int(NSIZE / 2)
+yn = int(NSIZE / 2)
 itermax = 20
 horizon = 2.0
 
@@ -24,7 +24,7 @@ def test_mandelbrot(self, benchmark, pkgid):
         initialize_package(pkgid)
         pkg = PKGDICT[pkgid]
 
-        benchmark.extra_info["description"] = f"{NSIZE}x{NSIZE} grid iterated {itermax}x"
+        benchmark.extra_info["description"] = f"{xn}x{yn} grid iterated {itermax}x"
         result = benchmark.pedantic(target=FUNCS[pkg.__name__], rounds=ROUNDS, iterations=1)
 
 
@@ -132,12 +132,49 @@ def mandelbrot_cupy():
         Xi, Yi = Xi[I], Yi[I]
         C = C[I]
 
+        if i % 2 == 1:
+            mempool = cupy.get_default_memory_pool()
+            mempool.free_all_blocks()
+
     Z_ = Z_.T
     N_ = N_.T
 
     cupy.cuda.runtime.deviceSynchronize()
     return Z_, N_
 
+def mandelbrot_cupynumeric():
+    # Escape-time Mandelbrot over an xn x yn grid; returns (Z_.T, N_.T). Adapted from
+    # https://thesamovar.wordpress.com/2009/03/22/fast-fractals-with-python-and-numpy/
+    Xi, Yi = np.mgrid[0:xn, 0:yn]  # index grids built with numpy, not cupynumeric
+    X = cupynumeric.linspace(xmin, xmax, xn, dtype=cupynumeric.float64)[Xi]
+    Y = cupynumeric.linspace(ymin, ymax, yn, dtype=cupynumeric.float64)[Yi]
+    C = X + Y * 1j
+
+    N_ = cupynumeric.zeros(C.shape, dtype=cupynumeric.int64)  # 1-based iteration at which a point exceeded horizon (0 = never)
+    Z_ = cupynumeric.zeros(C.shape, dtype=cupynumeric.complex128)  # Z value recorded at that iteration
+    Xi.shape = Yi.shape = C.shape = xn * yn  # flatten all three to 1-D in place
+
+    Z = cupynumeric.zeros(C.shape, cupynumeric.complex128)
+    for i in range(itermax):
+        if not len(Z):
+            break
+
+        # Z <- Z * Z + C for the points still tracked (third argument is the output array)
+        cupynumeric.multiply(Z, Z, Z)
+        cupynumeric.add(Z, C, Z)
+
+        # Record the points whose magnitude now exceeds horizon
+        I = abs(Z) > horizon  # noqa: E741 math variable
+        N_[Xi[I], Yi[I]] = i + 1
+        Z_[Xi[I], Yi[I]] = Z[I]
+
+        # Keep going with those who have not diverged yet
+        cupynumeric.logical_not(I, I)  # np.negative(I, I) not working any longer
+        Z = Z[I]
+        Xi, Yi = Xi[I], Yi[I]  # NOTE(review): numpy arrays indexed by a cupynumeric mask - confirm no host round-trip per iteration
+        C = C[I]
+
+    return Z_.T, N_.T
 
 def mandelbrot_af():
     Xi = af.flat(af.range((xn, yn), axis=0, dtype=af.int64))
@@ -173,6 +210,9 @@ def mandelbrot_af():
         Yi = Yi[I]
         C = C[I]
 
+        if i % 2 == 1:
+            af.device_gc()
+
     Z_ = Z_.T
     N_ = N_.T
     af.eval(Z_)
@@ -181,4 +221,4 @@ def mandelbrot_af():
     return Z_, N_
 
 
-FUNCS = {"dpnp": mandelbrot_dpnp, "numpy": mandelbrot_np, "cupy": mandelbrot_cupy, "arrayfire": mandelbrot_af}
+FUNCS = {"dpnp": mandelbrot_dpnp, "numpy": mandelbrot_np, "cupy": mandelbrot_cupy, "arrayfire": mandelbrot_af, "cupynumeric" : mandelbrot_cupynumeric}
diff --git a/benchmarks/src/pytest_benchmark/test_montecarlo_pi.py b/benchmarks/src/pytest_benchmark/test_montecarlo_pi.py
index 752b701..a296cf9 100644
--- a/benchmarks/src/pytest_benchmark/test_montecarlo_pi.py
+++ b/benchmarks/src/pytest_benchmark/test_montecarlo_pi.py
@@ -47,5 +47,11 @@ def calc_pi_dpnp(samples):
     y = dpnp.random.rand(samples).astype(dpnp.float32)
     return 4.0 * dpnp.sum(in_circle(x, y)) / samples
 
+def calc_pi_cupynumeric(samples):  # returns 4 * sum(in_circle(x, y)) / samples for `samples` random (x, y) pairs
+    x = cupynumeric.random.rand(samples).astype(cupynumeric.float32)
+    y = cupynumeric.random.rand(samples).astype(cupynumeric.float32)
+    return 4.0 * cupynumeric.sum(in_circle(x, y)) / samples  # in_circle is defined outside this function
 
-FUNCS = {"dpnp": calc_pi_dpnp, "numpy": calc_pi_numpy, "cupy": calc_pi_cupy, "arrayfire": calc_pi_af}
+
+FUNCS = {"dpnp": calc_pi_dpnp, "numpy": calc_pi_numpy, "cupy": calc_pi_cupy, "arrayfire": calc_pi_af,
+ "cupynumeric": calc_pi_cupynumeric }
diff --git a/benchmarks/src/pytest_benchmark/test_nbody.py b/benchmarks/src/pytest_benchmark/test_nbody.py
index 8d35aa9..1255faf 100644
--- a/benchmarks/src/pytest_benchmark/test_nbody.py
+++ b/benchmarks/src/pytest_benchmark/test_nbody.py
@@ -10,6 +10,7 @@
 @pytest.mark.parametrize("pkgid", IDS, ids=IDS)
 class TestNbody:
     def test_nbody(self, benchmark, pkgid):
+        initialize_package(pkgid)
         pkg = PKGDICT[pkgid]
         setup = lambda: (generate_arrays(pkgid), {})
 
@@ -107,5 +108,8 @@ def generate_arrays(pkgid):
         arr_list.append(M * np.ones((NSIZE, 1), dtype=DTYPE))
         for i in range(count):
             arr_list.append(np.random.rand(NSIZE, 3).astype(DTYPE))
-
+    elif "cupynumeric" == pkgname:
+        arr_list.append(M * cupynumeric.ones((NSIZE, 1), dtype=DTYPE))
+        for i in range(count):
+            arr_list.append(cupynumeric.random.rand(NSIZE, 3).astype(DTYPE))
     return (pkg, arr_list[0], arr_list[1], arr_list[2])
diff --git a/benchmarks/src/pytest_benchmark/test_nn.py b/benchmarks/src/pytest_benchmark/test_nn.py
index 7940cb0..e61f704 100644
--- a/benchmarks/src/pytest_benchmark/test_nn.py
+++ b/benchmarks/src/pytest_benchmark/test_nn.py
@@ -19,6 +19,7 @@ def test_neural_network(self, benchmark, pkgid):
             "numpy": NeuralNetwork_numpy,
             "cupy": NeuralNetwork_cupy,
             "arrayfire": NeuralNetwork_af,
+            "cupynumeric": NeuralNetwork_cupynumeric,
         }
 
         obj = nn[pkg.__name__]()
@@ -281,6 +282,87 @@ def train(self):
     def predict(self, X):
         return cupy.argmax(self.forward(X), axis=1)
 
+class NeuralNetwork_cupynumeric:  # one-hidden-layer network benchmark implemented with cupynumeric
+    def __init__(self):
+        self.input_size = INPUT_SIZE
+        self.hidden_size = HIDDEN_SIZE
+        self.output_size = OUTPUT_SIZE
+        self.learning_rate = LEARNING_RATE
+
+        # Weights: normal samples scaled by sqrt(2 / fan_in); biases start at zero
+        # He initialization (for ReLU) is often a good choice
+        self.W1 = cupynumeric.random.randn(self.input_size, self.hidden_size) * cupynumeric.sqrt(2.0 / self.input_size)
+        self.b1 = cupynumeric.zeros((1, self.hidden_size))
+        self.W2 = cupynumeric.random.randn(self.hidden_size, self.output_size) * cupynumeric.sqrt(2.0 / self.hidden_size)
+        self.b2 = cupynumeric.zeros((1, self.output_size))
+
+        self.X_train = cupynumeric.random.rand(SAMPLES, INPUT_SIZE)
+        self.y_train = cupynumeric.zeros((SAMPLES * OUTPUT_SIZE))  # flat buffer; one random class slot per sample is set to 1 below
+        self.y_train[
+            cupynumeric.arange(SAMPLES) * OUTPUT_SIZE + cupynumeric.floor(cupynumeric.random.rand(SAMPLES) * OUTPUT_SIZE).astype(int)
+        ] = 1
+        self.y_train = self.y_train.reshape((SAMPLES, OUTPUT_SIZE))  # reshape to one-hot rows, (SAMPLES, OUTPUT_SIZE)
+
+    def relu(self, x):
+        return cupynumeric.maximum(0, x)
+
+    def relu_derivative(self, x):
+        return (x > 0).astype(float)  # 1.0 where x > 0, else 0.0
+
+    def softmax(self, x):
+        exp_scores = cupynumeric.exp(x - cupynumeric.max(x, axis=1, keepdims=True))  # Subtract max for numerical stability
+        return exp_scores / cupynumeric.sum(exp_scores, axis=1, keepdims=True)
+
+    def forward(self, X):
+        # Hidden layer (z1/a1 are kept on self because backward reads them)
+        self.z1 = cupynumeric.dot(X, self.W1) + self.b1
+        self.a1 = self.relu(self.z1)
+
+        # Output layer
+        self.z2 = cupynumeric.dot(self.a1, self.W2) + self.b2
+        self.a2 = self.softmax(self.z2)
+        return self.a2
+
+    def backward(self, X, y, output):
+        # Calculate gradients for the output layer
+        error_output = output - y  # Derivative of cross-entropy loss w.r.t. softmax input
+        dW2 = cupynumeric.dot(self.a1.T, error_output)
+        db2 = cupynumeric.sum(error_output, axis=0, keepdims=True)
+
+        # Calculate gradients for the hidden layer
+        error_hidden = cupynumeric.dot(error_output, self.W2.T) * self.relu_derivative(self.z1)
+        dW1 = cupynumeric.dot(X.T, error_hidden)
+        db1 = cupynumeric.sum(error_hidden, axis=0, keepdims=True)
+
+        # Update weights and biases in place (plain gradient step scaled by learning_rate)
+        self.W2 -= self.learning_rate * dW2
+        self.b2 -= self.learning_rate * db2
+        self.W1 -= self.learning_rate * dW1
+        self.b1 -= self.learning_rate * db1
+
+    def train(self):
+        X_train = self.X_train
+        y_train = self.y_train
+
+        num_samples = X_train.shape[0]
+
+        for epoch in range(ITERATIONS):
+            # No shuffling is performed: the names below alias the data in its stored order
+            X_shuffled = X_train
+            y_shuffled = y_train
+
+            for i in range(0, num_samples, BATCH_SIZE):
+                X_batch = X_shuffled[i : i + BATCH_SIZE, :]
+                y_batch = y_shuffled[i : i + BATCH_SIZE, :]
+
+                # Forward pass
+                output = self.forward(X_batch)
+
+                # Backward pass and update weights
+                self.backward(X_batch, y_batch, output)
+
+    def predict(self, X):
+        return cupynumeric.argmax(self.forward(X), axis=1)  # index of the largest output per row
 
 class NeuralNetwork_af:
     def __init__(self):
@@ -299,8 +381,7 @@ def __init__(self):
         self.X_train = af.randu((SAMPLES, INPUT_SIZE))
         self.y_train = af.constant(0, (SAMPLES, OUTPUT_SIZE))
 
-        self.y_train = af.constant(0, (SAMPLES, OUTPUT_SIZE))
-        self.y_train[af.iota(SAMPLES), af.floor(af.randu(SAMPLES) * OUTPUT_SIZE)] = 1
+        self.y_train[af.iota(SAMPLES) * OUTPUT_SIZE + af.floor(af.randu(SAMPLES) * OUTPUT_SIZE)] = 1
 
         af.eval(self.X_train)
         af.eval(self.y_train)
diff --git a/benchmarks/src/pytest_benchmark/test_random.py b/benchmarks/src/pytest_benchmark/test_random.py
index 19caebb..9f32a3e 100644
--- a/benchmarks/src/pytest_benchmark/test_random.py
+++ b/benchmarks/src/pytest_benchmark/test_random.py
@@ -15,6 +15,8 @@ def randn_cupy():
     arr = cupy.random.normal(size=(NNSIZE))
     cupy.cuda.runtime.deviceSynchronize()
 
+def randn_cupynumeric():  # benchmark target: draw NNSIZE normal samples; the array is discarded
+    arr = cupynumeric.random.normal(size=(NNSIZE))  # NOTE(review): no explicit sync, unlike randn_cupy - confirm
 
 def randn_af():
     arr = af.randn((NNSIZE))
@@ -34,6 +36,8 @@ def randu_cupy():
     arr = cupy.random.uniform(size=(NNSIZE))
     cupy.cuda.runtime.deviceSynchronize()
 
+def randu_cupynumeric():  # benchmark target: draw NNSIZE uniform samples; the array is discarded
+    arr = cupynumeric.random.uniform(size=(NNSIZE))  # NOTE(review): no explicit sync, unlike randu_cupy - confirm
 
 def randu_af():
     arr = af.randu((NNSIZE))
@@ -47,7 +51,8 @@ def test_normal(self, benchmark, pkgid):
         initialize_package(pkgid)
 
         pkg = PKGDICT[pkgid]
-        FUNCS = {"dpnp": randn_dpnp, "numpy": randn_np, "cupy": randn_cupy, "arrayfire": randn_af}
+        FUNCS = {"dpnp": randn_dpnp, "numpy": randn_np, "cupy": randn_cupy, "arrayfire": randn_af,
+        "cupynumeric": randn_cupynumeric}
 
         benchmark.extra_info["description"] = f"{NNSIZE:.2e} Samples"
         result = benchmark.pedantic(target=FUNCS[pkg.__name__], rounds=ROUNDS, iterations=ITERATIONS)
@@ -56,6 +61,7 @@ def test_uniform(self, benchmark, pkgid):
         initialize_package(pkgid)
 
         pkg = PKGDICT[pkgid]
-        FUNCS = {"dpnp": randu_dpnp, "numpy": randu_np, "cupy": randu_cupy, "arrayfire": randu_af}
+        FUNCS = {"dpnp": randu_dpnp, "numpy": randu_np, "cupy": randu_cupy, "arrayfire": randu_af,
+        "cupynumeric": randu_cupynumeric}
 
         result = benchmark.pedantic(target=FUNCS[pkg.__name__], rounds=ROUNDS, iterations=ITERATIONS)
diff --git a/benchmarks/src/requirements.txt b/benchmarks/src/requirements.txt
index 30b908b..466f995 100644
--- a/benchmarks/src/requirements.txt
+++ b/benchmarks/src/requirements.txt
@@ -2,6 +2,7 @@
 dpnp
 numpy
 cupy-cuda12x
+nvidia-cupynumeric
 pytest
 pytest-benchmark
 matplotlib
diff --git a/docs/installation.rst b/docs/installation.rst
index 017d733..b147e15 100644
--- a/docs/installation.rst
+++ b/docs/installation.rst
@@ -5,17 +5,32 @@ To use ArrayFire-Python you require three things:
 
 .. list-table::
 
-    * - :literal:`arrayfire-python` package
-      - You may install it through pip :literal:`pip install arrayfire-python` or from `building arrayfire-python wheel <../README.md>`_
+    * - :literal:`arrayfire-py` package
+      - It is the intended User Interface that provides a numpy-like layer to execute math and array operations with ArrayFire.
     
     * - :literal:`arrayfire-binary-python-wrapper`
-      - You may download and install through pip :literal:`pip install arrayfire_binary_python_wrapper-0.8.0+af3.10.0 -f https://arrayfire.com/python/binaries` which will come with ArrayFire C Libraries as well. You may also `build from source <https://github.com/arrayfire/arrayfire-binary-python-wrapper>`_ without the C Libraries.
+      - It is a thin wrapper that provides Python direct access to the ArrayFire functions in the C library. This package must have access to ArrayFire binaries, either through a system-wide install, or through a pre-bundled wheel that includes binaries.
 
     * - :literal:`ArrayFire C/C++ Libraries`
-      - If you build the binary wrapper from source or wish to program with ArrayFire in C/C++, navigate to https://arrayfire.com/download and download the appropriate installer for the target architecture and operating system. Although ArrayFire can be `built from source <https://github.com/arrayfire/arrayfire-python/tree/master?tab=readme-ov-file#arrayfire-python-bindings>`_, the installers conveniently package necessary dependencies.
+      - They are the binaries obtained from compiling the ArrayFire C/C++ Project or more simply by downloading installers in the ArrayFire download page. Binaries can also be obtained as part of a pre-packaged arrayfire-binary-python-wrapper wheel.
 
 Below we detail more on the ArrayFire C Libraries installation procedure. Install the latest device drivers before using ArrayFire. Drivers and runtimes should be downloaded and installed from each device vendor's website.
 
+Install Instructions for ArrayFire Python
+###############################################
+
+arrayfire-py
+*************
+You may install it through pip :literal:`pip install arrayfire-python` or from `building arrayfire-py wheel <../README.md>`_. By installing through pip, you will automatically download the latest arrayfire-binary-python-wrapper containing the ArrayFire C Libraries.
+
+arrayfire-binary-python-wrapper
+********************************
+
+If you wish to use a specific version of ArrayFire, you must download a specific version of the arrayfire-binary-python-wrapper.
+You may download one and install through pip which will come with ArrayFire C Libraries: :literal:`pip install arrayfire_binary_python_wrapper-0.8.0+af3.10.0 -f https://arrayfire.com/python/binaries`.
+You may also `build from source <https://github.com/arrayfire/arrayfire-binary-python-wrapper>`_ without the C Libraries and then install the ArrayFire Binaries via the ArrayFire Installers.
+
+
 Install Instructions for ArrayFire C Libraries
 ###############################################
 
diff --git a/docs/release_notes.md b/docs/release_notes.md
index cc17462..9657051 100644
--- a/docs/release_notes.md
+++ b/docs/release_notes.md
@@ -1,8 +1,8 @@
 Release Notes {#releasenotes}
 ==============
 
-v0.1.0
-======
+arrayfire-py v0.1.0
+====================
 Welcome to the ArrayFire Python Bindings! These are the currently supported features:
 
 - Support for all backends (cpu, opencl, oneapi, cuda)