From 8c915e91d3dbe2470504a1236ba2b0faa9c76812 Mon Sep 17 00:00:00 2001 From: Edwin Solis Date: Tue, 16 Sep 2025 21:28:27 -0700 Subject: [PATCH] Updated benchmarks and docs --- benchmarks/src/README.md | 14 ++- benchmarks/src/graphs.py | 39 ++++++-- benchmarks/src/pytest_benchmark/common.py | 20 ++-- .../src/pytest_benchmark/test_blackscholes.py | 27 ++++++ .../src/pytest_benchmark/test_elementwise.py | 5 +- benchmarks/src/pytest_benchmark/test_fft.py | 7 +- benchmarks/src/pytest_benchmark/test_gemm.py | 7 +- .../src/pytest_benchmark/test_kmeans.py | 91 ++++++++++++++++++- .../src/pytest_benchmark/test_linalg.py | 44 ++++++--- .../src/pytest_benchmark/test_mandelbrot.py | 48 +++++++++- .../pytest_benchmark/test_montecarlo_pi.py | 8 +- benchmarks/src/pytest_benchmark/test_nbody.py | 6 +- benchmarks/src/pytest_benchmark/test_nn.py | 85 ++++++++++++++++- .../src/pytest_benchmark/test_random.py | 10 +- benchmarks/src/requirements.txt | 1 + docs/installation.rst | 23 ++++- docs/release_notes.md | 4 +- 17 files changed, 386 insertions(+), 53 deletions(-) diff --git a/benchmarks/src/README.md b/benchmarks/src/README.md index ae84c4e..a1c7beb 100644 --- a/benchmarks/src/README.md +++ b/benchmarks/src/README.md @@ -3,10 +3,18 @@ Benchmarks ## Setting up environment +Create a python environment and install pytest and the compute libraries: ```sh python -m pip install -r requirements.txt ``` +If running `dpnp` with Nvidia or AMD devices, you must install the oneapi toolkit along with the corresponding oneapi plugin: + +```sh + # install oneapi toolkit and plugins + source /opt/intel/oneapi/setvars.sh +``` + ## Benchmark parameters The benchmark packages, rounds, array sizes, and numeric type may be specified on the constants at the top of [pytest_benchmark/common.py](pytest_benchmark/common.py). 
@@ -20,16 +28,18 @@ These are the steps to run the benchmarks, and produce the graphs Run the benchmarks and store the results in `results.json` ```sh - pytest .\pytest_benchmark --benchmark-json=results.json + pytest ./pytest_benchmark --benchmark-json=results.json ``` To create graphs and store the timing results after creating the `results.json`, run: ```sh + mkdir img python graphs.py ``` To modify the tests being shown, modify the `TESTS` list at the top of the `graphs.py` file. -To modify the labels shown, modify `PKG_LABELS` +To modify the legend of the package labels shown, modify `PKG_LABELS` +To modify the name of the tests shown, modify `TESTS_GRAPH_NAME` To modify the hardware display, modify `HARDWARE` ## Notes diff --git a/benchmarks/src/graphs.py b/benchmarks/src/graphs.py index 1b2e4e4..ab32b5c 100644 --- a/benchmarks/src/graphs.py +++ b/benchmarks/src/graphs.py @@ -7,7 +7,7 @@ BENCHMARKS_JSON = "results.json" # Hardware details shown in title -HARDWARE = "AMD Ryzen 9 9900X 12-Core Processor 63032 MB (fp64 fp16)\noneAPI 2025.1.3 Intel(R) OpenCL Graphics: Intel(R) Arc(TM) B580 Graphics, 11873 MB (fp64 fp16)" +HARDWARE = "Intel Xeon Gold 5315Y (8 Processors) @ 3.201GHz 63032 MB\noneAPI 2025.2.1 NVIDIA RTX A4000, 16222 MB, CUDA 12.8 Compute 8.6" # Show speedup in graph SHOW_NUMBERS = True @@ -16,12 +16,13 @@ ROUND_NUMBERS = 1 # package list in graph order; arrayfire packages are added later -PKG_NAMES = ["numpy", "dpnp", "cupy"] +PKG_NAMES = ["numpy", "dpnp", "cupy", "cupynumeric"] # color used in graphs PKG_COLOR = { "numpy": "tab:blue", "cupy": "tab:green", + "cupynumeric": "green", "dpnp": "tab:red", "afcpu": "tab:orange", "afopencl": "tab:orange", @@ -32,8 +33,9 @@ # labels displayed in the graph PKG_LABELS = { "numpy": "numpy[cpu]", - "dpnp": "dpnp[level_zero:gpu]", + "dpnp": "dpnp[cuda:gpu]", "cupy": "cupy", + "cupynumeric": "cupynumeric", "afcpu": "afcpu", "afcuda": "afcuda", "afopencl": "afopencl[opencl:gpu]", @@ -44,16 +46,16 @@ # Tests to be 
shown in graphs TESTS = [ - "qr", + "group_elementwise", "neural_network", - "gemm", + "black_scholes", "mandelbrot", "nbody", "pi", - "black_scholes", - "fft", "normal", - "group_elementwise", + "gemm", + "fft", + "qr", # Other tests # 'svd # 'cholesky', @@ -63,6 +65,25 @@ # 'inv' ] +# Reverse list so it appears in order on graph +TESTS.reverse() + +TESTS_GRAPH_NAME = { + "group_elementwise": "Group_elementwise (JIT)", + "neural_network": "Neural Network (JIT)", + "black_scholes": "Black Scholes (JIT)", + "mandelbrot": "Mandelbrot (JIT)", + "nbody": "Nbody (JIT)", + "pi": "Montecarlo Pi (JIT)", + "normal": "Normal Distribution", + "gemm": "General Matrix Multiplication", + "fft": "2D FFT", + "qr": "QR Decomposition", +} + +for name in TESTS: + if name not in TESTS_GRAPH_NAME: + TESTS_GRAPH_NAME[name] = name def get_benchmark_data(): results = {} @@ -189,7 +210,7 @@ def generate_group_graph(test_list=None, show_numbers=False, filename="compariso xlabels = [] for test in tests: - xlabels.append(test + "\n" + descriptions[test]) + xlabels.append(TESTS_GRAPH_NAME[test] + "\n" + descriptions[test]) ax.set_xlabel("Speedup") ax.set_xscale("log") diff --git a/benchmarks/src/pytest_benchmark/common.py b/benchmarks/src/pytest_benchmark/common.py index 9e82a47..f6dc2ef 100644 --- a/benchmarks/src/pytest_benchmark/common.py +++ b/benchmarks/src/pytest_benchmark/common.py @@ -29,6 +29,7 @@ import math import cupy +import cupynumeric import dpctl import dpnp import numpy as np @@ -38,19 +39,19 @@ # modify parameters for most benchmarks ROUNDS = 30 -NSIZE = 2**13 +NSIZE = 2**11 NNSIZE = NSIZE**2 DTYPE = "float32" # comment a line to remove that package from testing PKGDICT = { - "dpnp": dpnp, "numpy": np, "cupy": cupy, # "afcpu": af, "afopencl": af, - "afcuda": af, "afoneapi": af, + "dpnp": dpnp, + "cupynumeric": cupynumeric, } PKGS = [] @@ -66,11 +67,13 @@ def initialize_package(PKG_ID): pkg = PKGDICT[PKG_ID] try: + # Free all unused memory + gc.collect() af.device_gc() mempool 
= cupy.get_default_memory_pool() mempool.free_all_blocks() - except: - pass + except Exception as e: + print(e) if PKG_ID == "afcpu": af.set_backend(af.BackendType.cpu) @@ -98,8 +101,7 @@ def initialize_package(PKG_ID): print(cupy.cuda.Device()) mempool = cupy.get_default_memory_pool() mempool.free_all_blocks() + elif PKG_ID == "cupynumeric": + pass else: - raise NotImplementedError() - - # Free all unused memory - gc.collect() + raise NotImplementedError() \ No newline at end of file diff --git a/benchmarks/src/pytest_benchmark/test_blackscholes.py b/benchmarks/src/pytest_benchmark/test_blackscholes.py index fae2fc5..7daabc7 100644 --- a/benchmarks/src/pytest_benchmark/test_blackscholes.py +++ b/benchmarks/src/pytest_benchmark/test_blackscholes.py @@ -92,6 +92,29 @@ def cnd(x): return (C, P) +def black_scholes_cupynumeric(S, X, R, V, T): + # S = Underlying stock price + # X = Strike Price + # R = Risk free rate of interest + # V = Volatility + # T = Time to maturity + def cnd(x): + temp = x > 0 + erf = lambda arr: cupynumeric.exp(-arr * arr) + return temp * (0.5 + erf(x / sqrt2) / 2) + (1 - temp) * (0.5 - erf((-x) / sqrt2) / 2) + + d1 = cupynumeric.log(S / X) + d1 = d1 + (R + (V * V) * 0.5) * T + d1 = d1 / (V * cupynumeric.sqrt(T)) + + d2 = d1 - (V * cupynumeric.sqrt(T)) + cnd_d1 = cnd(d1) + cnd_d2 = cnd(d2) + + C = S * cnd_d1 - (X * cupynumeric.exp((-R) * T) * cnd_d2) + P = X * cupynumeric.exp((-R) * T) * (1 - cnd_d2) - (S * (1 - cnd_d1)) + + return (C, P) def black_scholes_arrayfire(S, X, R, V, T): def cnd(x): @@ -137,6 +160,9 @@ def generate_arrays(pkgid, count): elif "numpy" == pkg: for i in range(count): arr_list.append(np.random.rand(NSIZE, NSIZE).astype(DTYPE)) + elif "cupynumeric" == pkg: + for i in range(count): + arr_list.append(cupynumeric.random.rand(NSIZE, NSIZE).astype(DTYPE)) return arr_list @@ -146,4 +172,5 @@ def generate_arrays(pkgid, count): "numpy": black_scholes_numpy, "cupy": black_scholes_cupy, "arrayfire": black_scholes_arrayfire, + 
"cupynumeric": black_scholes_cupynumeric } diff --git a/benchmarks/src/pytest_benchmark/test_elementwise.py b/benchmarks/src/pytest_benchmark/test_elementwise.py index 0933ce6..771baee 100644 --- a/benchmarks/src/pytest_benchmark/test_elementwise.py +++ b/benchmarks/src/pytest_benchmark/test_elementwise.py @@ -52,7 +52,7 @@ def func_cupy(arr): cupy.cuda.runtime.deviceSynchronize() return x - GROUP_FUNCS = {"numpy": func, "cupy": func_cupy, "arrayfire": func_af, "dpnp": func} + GROUP_FUNCS = {"numpy": func, "cupy": func_cupy, "arrayfire": func_af, "dpnp": func, "cupynumeric": func} benchmark.extra_info["description"] = f"{NSIZE}x{NSIZE} Matrix" result = benchmark.pedantic( @@ -312,5 +312,8 @@ def generate_arrays(pkgid, count): elif "numpy" == pkg: for i in range(count): arr_list.append(np.random.rand(NSIZE, NSIZE).astype(DTYPE)) + elif "cupynumeric" == pkg: + for i in range(count): + arr_list.append(cupynumeric.random.rand(NSIZE, NSIZE).astype(DTYPE)) return arr_list diff --git a/benchmarks/src/pytest_benchmark/test_fft.py b/benchmarks/src/pytest_benchmark/test_fft.py index 096573d..ac234f2 100644 --- a/benchmarks/src/pytest_benchmark/test_fft.py +++ b/benchmarks/src/pytest_benchmark/test_fft.py @@ -50,6 +50,9 @@ def generate_arrays(pkgid, count): elif "numpy" == pkg: for i in range(count): arr_list.append(np.random.rand(NSIZE, NSIZE).astype(DTYPE)) + elif "cupynumeric" == pkg: + for i in range(count): + arr_list.append(cupynumeric.random.rand(NSIZE, NSIZE).astype(DTYPE)) return arr_list @@ -86,5 +89,7 @@ def fft_cupy(arr): cupy.cuda.runtime.deviceSynchronize() return res +def fft_cupynumeric(arr): + return cupynumeric.fft.fft(arr) -FUNCS = {"dpnp": fft_dpnp, "numpy": fft_np, "cupy": fft_cupy, "arrayfire": fft_af} +FUNCS = {"dpnp": fft_dpnp, "numpy": fft_np, "cupy": fft_cupy, "arrayfire": fft_af, "cupynumeric": fft_cupynumeric} diff --git a/benchmarks/src/pytest_benchmark/test_gemm.py b/benchmarks/src/pytest_benchmark/test_gemm.py index e62d5a1..f66bbd4 100644 --- 
a/benchmarks/src/pytest_benchmark/test_gemm.py +++ b/benchmarks/src/pytest_benchmark/test_gemm.py @@ -81,6 +81,9 @@ def generate_arrays(pkgid, count): np.random.rand(1) for i in range(count): arr_list.append(np.random.rand(NSIZE, NSIZE).astype(DTYPE)) + elif "cupynumeric" == pkg: + for i in range(count): + arr_list.append(cupynumeric.random.rand(NSIZE, NSIZE).astype(DTYPE)) return arr_list @@ -117,5 +120,7 @@ def gemm_cupy(A, B, C): cupy.cuda.runtime.deviceSynchronize() return C +def gemm_cupynumeric(A, B, C): + return alpha * cupynumeric.matmul(A, B) + beta * C -FUNCS = {"numpy": gemm_np, "cupy": gemm_cupy, "arrayfire": gemm_af, "dpnp": gemm_dpnp} +FUNCS = {"numpy": gemm_np, "cupy": gemm_cupy, "arrayfire": gemm_af, "dpnp": gemm_dpnp, "cupynumeric": gemm_cupynumeric} diff --git a/benchmarks/src/pytest_benchmark/test_kmeans.py b/benchmarks/src/pytest_benchmark/test_kmeans.py index 53c5183..291c248 100644 --- a/benchmarks/src/pytest_benchmark/test_kmeans.py +++ b/benchmarks/src/pytest_benchmark/test_kmeans.py @@ -12,7 +12,8 @@ class TestKmeans: def test_kmeans(self, benchmark, pkgid): initialize_package(pkgid) pkg = PKGDICT[pkgid] - kmean_class = {"dpnp": kmeans_dpnp, "numpy": kmeans_numpy, "cupy": kmeans_cupy, "arrayfire": kmeans_af} + kmean_class = {"dpnp": kmeans_dpnp, "numpy": kmeans_numpy, "cupy": kmeans_cupy, "arrayfire": kmeans_af, + "cupynumeric": kmeans_cupynumeric} obj = kmean_class[pkg.__name__]() benchmark.extra_info["description"] = f"{NSAMPLES}x{NFEATURES} over {K} centers" @@ -189,6 +190,94 @@ def kmeans(self): return centroids, cluster_assignments + +class kmeans_cupynumeric: + def __init__(self): + self.data = cupynumeric.random.random((NSAMPLES, NFEATURES)) + self.centroid_indices = cupynumeric.random.choice(self.data.shape[0], K, replace=False) + + def initialize_centroids(self): + """ + Randomly initializes k centroids from the data points. + + Args: + data (np.ndarray): The input data points (n_samples, n_features). 
+ k (int): The number of clusters. + + Returns: + np.ndarray: Initial centroids (k, n_features). + """ + + return self.data[self.centroid_indices, :] + + def assign_to_clusters(self, centroids): + """ + Assigns each data point to the closest centroid. + + Args: + data (np.ndarray): The input data points (n_samples, n_features). + centroids (np.ndarray): The current centroids (k, n_features). + + Returns: + np.ndarray: An array of cluster assignments for each data point (n_samples,). + """ + distances = cupynumeric.sqrt(((self.data[:, cupynumeric.newaxis, :] - centroids[cupynumeric.newaxis, :, :]) ** 2).sum(axis=2)) + cluster_assignments = cupynumeric.argmin(distances, axis=1) + return cluster_assignments + + def update_centroids(self, cluster_assignments): + """ + Recalculates the centroids based on the mean of the assigned data points. + + Args: + data (np.ndarray): The input data points (n_samples, n_features). + cluster_assignments (np.ndarray): An array of cluster assignments. + k (int): The number of clusters. + + Returns: + np.ndarray: Updated centroids (k, n_features). + """ + new_centroids = cupynumeric.zeros((K, self.data.shape[1])) + for i in range(K): + points_in_cluster = self.data[cluster_assignments == i] + if len(points_in_cluster) > 0: + new_centroids[i] = cupynumeric.mean(points_in_cluster, axis=0) + return new_centroids + + def kmeans(self): + """ + Performs the K-Means clustering algorithm. + + Args: + data (np.ndarray): The input data points (n_samples, n_features). + k (int): The number of clusters. + max_iterations (int): Maximum number of iterations to run the algorithm. + tolerance (float): The tolerance for convergence (change in centroids). + + Returns: + tuple: A tuple containing: + - np.ndarray: Final centroids (k, n_features). + - np.ndarray: Final cluster assignments for each data point (n_samples,). 
+ """ + centroids = self.initialize_centroids() + cluster_assignments = None + + for i in range(ITERATIONS): + old_centroids = cupynumeric.copy(centroids) + + # E-step: Assign points to clusters + cluster_assignments = self.assign_to_clusters(centroids) + + # M-step: Update centroids + centroids = self.update_centroids(cluster_assignments) + + # Check for convergence + if cupynumeric.linalg.norm(centroids - old_centroids) < TOLERANCE: + break + + return centroids, cluster_assignments + + class kmeans_af: def __init__(self): self.data = af.Array(np.random.random((NSAMPLES, NFEATURES)).flatten().tolist(), shape=(NSAMPLES, NFEATURES)) diff --git a/benchmarks/src/pytest_benchmark/test_linalg.py b/benchmarks/src/pytest_benchmark/test_linalg.py index 2b75601..870738b 100644 --- a/benchmarks/src/pytest_benchmark/test_linalg.py +++ b/benchmarks/src/pytest_benchmark/test_linalg.py @@ -63,7 +63,12 @@ def generate_arrays(pkgid, count, posdef=False): if posdef: x = x @ x.T + x.T @ x + eps arr_list.append(x) - + elif "cupynumeric" == pkg: + for i in range(count): + x = cupynumeric.random.rand(NSIZE, NSIZE).astype(DTYPE) + if posdef: + x = x @ x.T + x.T @ x + eps + arr_list.append(x) return arr_list @@ -88,6 +93,8 @@ def svd_cupy(arr): cupy.cuda.runtime.deviceSynchronize() return x +def svd_cupynumeric(arr): + return cupynumeric.linalg.svd(arr) def qr_np(arr): return np.linalg.qr(arr) @@ -110,6 +117,8 @@ def qr_cupy(arr): cupy.cuda.runtime.deviceSynchronize() return x +def qr_cupynumeric(arr): + return cupynumeric.linalg.qr(arr) def cholesky_np(arr): return np.linalg.cholesky(arr) @@ -131,11 +140,8 @@ def cholesky_cupy(arr): cupy.cuda.runtime.deviceSynchronize() return x - -def qr_cupy(arr): - x = cupy.linalg.qr(arr) - cupy.cuda.runtime.deviceSynchronize() - return x +def cholesky_cupynumeric(arr): + return cupynumeric.linalg.cholesky(arr) def inv_np(arr): @@ -147,7 +153,7 @@ def inv_dpnp(arr): def inv_af(arr): - x, info = af.inverse(arr) + x = af.inverse(arr) af.eval(x) 
af.sync() return x @@ -158,6 +164,8 @@ def inv_cupy(arr): cupy.cuda.runtime.deviceSynchronize() return x +def inv_cupynumeric(arr): + return cupynumeric.linalg.inv(arr) def det_np(arr): return np.linalg.det(arr) @@ -178,6 +186,8 @@ def det_cupy(arr): cupy.cuda.runtime.deviceSynchronize() return x +def det_cupynumeric(arr): + return cupynumeric.linalg.det(arr) def norm_np(arr): return np.linalg.norm(arr) @@ -198,6 +208,8 @@ def norm_cupy(arr): cupy.cuda.runtime.deviceSynchronize() return x +def norm_cupynumeric(arr): + return cupynumeric.linalg.norm(arr) @pytest.mark.parametrize("pkgid", IDS, ids=IDS) class TestLinalg: @@ -208,7 +220,8 @@ def test_cholesky(self, benchmark, pkgid): benchmark.extra_info["description"] = f"{NSIZE}x{NSIZE} Matrix" pkg = PKGDICT[pkgid] - CHOLESKY_FUNCS = {"numpy": cholesky_np, "cupy": cholesky_cupy, "arrayfire": cholesky_af, "dpnp": cholesky_dpnp} + CHOLESKY_FUNCS = {"numpy": cholesky_np, "cupy": cholesky_cupy, "arrayfire": cholesky_af, "dpnp": cholesky_dpnp, + "cupynumeric": cholesky_cupynumeric } result = benchmark.pedantic( target=CHOLESKY_FUNCS[pkg.__name__], setup=setup, rounds=ROUNDS, iterations=ITERATIONS ) @@ -220,7 +233,8 @@ def test_svd(self, benchmark, pkgid): benchmark.extra_info["description"] = f"{NSIZE}x{NSIZE} Matrix" pkg = PKGDICT[pkgid] - SVD_FUNCS = {"numpy": svd_np, "cupy": svd_cupy, "arrayfire": svd_af, "dpnp": svd_dpnp} + SVD_FUNCS = {"numpy": svd_np, "cupy": svd_cupy, "arrayfire": svd_af, "dpnp": svd_dpnp, + "cupynumeric": svd_cupynumeric } result = benchmark.pedantic(target=SVD_FUNCS[pkg.__name__], setup=setup, rounds=ROUNDS, iterations=ITERATIONS) def test_qr(self, benchmark, pkgid): @@ -230,7 +244,8 @@ def test_qr(self, benchmark, pkgid): benchmark.extra_info["description"] = f"{NSIZE}x{NSIZE} Matrix" pkg = PKGDICT[pkgid] - QR_FUNCS = {"numpy": qr_np, "cupy": qr_cupy, "arrayfire": qr_af, "dpnp": qr_dpnp} + QR_FUNCS = {"numpy": qr_np, "cupy": qr_cupy, "arrayfire": qr_af, "dpnp": qr_dpnp, + "cupynumeric": 
qr_cupynumeric } result = benchmark.pedantic(target=QR_FUNCS[pkg.__name__], setup=setup, rounds=ROUNDS, iterations=ITERATIONS) def test_inv(self, benchmark, pkgid): @@ -240,7 +255,8 @@ def test_inv(self, benchmark, pkgid): benchmark.extra_info["description"] = f"{NSIZE}x{NSIZE} Matrix" pkg = PKGDICT[pkgid] - INV_FUNCS = {"numpy": inv_np, "cupy": inv_cupy, "arrayfire": inv_af, "dpnp": inv_dpnp} + INV_FUNCS = {"numpy": inv_np, "cupy": inv_cupy, "arrayfire": inv_af, "dpnp": inv_dpnp, + "cupynumeric": inv_cupynumeric } result = benchmark.pedantic(target=INV_FUNCS[pkg.__name__], setup=setup, rounds=ROUNDS, iterations=ITERATIONS) def test_det(self, benchmark, pkgid): @@ -250,7 +266,8 @@ def test_det(self, benchmark, pkgid): benchmark.extra_info["description"] = f"{NSIZE}x{NSIZE} Matrix" pkg = PKGDICT[pkgid] - DET_FUNCS = {"numpy": det_np, "cupy": det_cupy, "arrayfire": det_af, "dpnp": det_dpnp} + DET_FUNCS = {"numpy": det_np, "cupy": det_cupy, "arrayfire": det_af, "dpnp": det_dpnp, + "cupynumeric": det_cupynumeric } result = benchmark.pedantic(target=DET_FUNCS[pkg.__name__], setup=setup, rounds=ROUNDS, iterations=ITERATIONS) def test_norm(self, benchmark, pkgid): @@ -260,5 +277,6 @@ def test_norm(self, benchmark, pkgid): benchmark.extra_info["description"] = f"{NSIZE}x{NSIZE} Matrix" pkg = PKGDICT[pkgid] - NORM_FUNCS = {"numpy": norm_np, "cupy": norm_cupy, "arrayfire": norm_af, "dpnp": norm_dpnp} + NORM_FUNCS = {"numpy": norm_np, "cupy": norm_cupy, "arrayfire": norm_af, "dpnp": norm_dpnp, + "cupynumeric": norm_cupynumeric } result = benchmark.pedantic(target=NORM_FUNCS[pkg.__name__], setup=setup, rounds=ROUNDS, iterations=ITERATIONS) diff --git a/benchmarks/src/pytest_benchmark/test_mandelbrot.py b/benchmarks/src/pytest_benchmark/test_mandelbrot.py index a9cd73b..1ceba09 100644 --- a/benchmarks/src/pytest_benchmark/test_mandelbrot.py +++ b/benchmarks/src/pytest_benchmark/test_mandelbrot.py @@ -12,8 +12,8 @@ xmax = 2 ymin = -2 ymax = 2 -xn = NSIZE -yn = NSIZE +xn = 
int(NSIZE / 2) +yn = int(NSIZE / 2) itermax = 20 horizon = 2.0 @@ -24,7 +24,7 @@ def test_mandelbrot(self, benchmark, pkgid): initialize_package(pkgid) pkg = PKGDICT[pkgid] - benchmark.extra_info["description"] = f"{NSIZE}x{NSIZE} grid iterated {itermax}x" + benchmark.extra_info["description"] = f"{xn}x{yn} grid iterated {itermax}x" result = benchmark.pedantic(target=FUNCS[pkg.__name__], rounds=ROUNDS, iterations=1) @@ -132,12 +132,49 @@ def mandelbrot_cupy(): Xi, Yi = Xi[I], Yi[I] C = C[I] + if i % 2 == 1: + mempool = cupy.get_default_memory_pool() + mempool.free_all_blocks() + Z_ = Z_.T N_ = N_.T cupy.cuda.runtime.deviceSynchronize() return Z_, N_ +def mandelbrot_cupynumeric(): + # Adapted from + # https://thesamovar.wordpress.com/2009/03/22/fast-fractals-with-python-and-numpy/ + Xi, Yi = np.mgrid[0:xn, 0:yn] + X = cupynumeric.linspace(xmin, xmax, xn, dtype=cupynumeric.float64)[Xi] + Y = cupynumeric.linspace(ymin, ymax, yn, dtype=cupynumeric.float64)[Yi] + C = X + Y * 1j + + N_ = cupynumeric.zeros(C.shape, dtype=cupynumeric.int64) + Z_ = cupynumeric.zeros(C.shape, dtype=cupynumeric.complex128) + Xi.shape = Yi.shape = C.shape = xn * yn + + Z = cupynumeric.zeros(C.shape, cupynumeric.complex128) + for i in range(itermax): + if not len(Z): + break + + # Compute for relevant points only + cupynumeric.multiply(Z, Z, Z) + cupynumeric.add(Z, C, Z) + + # Failed convergence + I = abs(Z) > horizon # noqa: E741 math variable + N_[Xi[I], Yi[I]] = i + 1 + Z_[Xi[I], Yi[I]] = Z[I] + + # Keep going with those who have not diverged yet + cupynumeric.logical_not(I, I) # np.negative(I, I) not working any longer + Z = Z[I] + Xi, Yi = Xi[I], Yi[I] + C = C[I] + + return Z_.T, N_.T def mandelbrot_af(): Xi = af.flat(af.range((xn, yn), axis=0, dtype=af.int64)) @@ -173,6 +210,9 @@ def mandelbrot_af(): Yi = Yi[I] C = C[I] + if i % 2 == 1: + af.device_gc() + Z_ = Z_.T N_ = N_.T af.eval(Z_) @@ -181,4 +221,4 @@ def mandelbrot_af(): return Z_, N_ -FUNCS = {"dpnp": mandelbrot_dpnp, "numpy": 
mandelbrot_np, "cupy": mandelbrot_cupy, "arrayfire": mandelbrot_af} +FUNCS = {"dpnp": mandelbrot_dpnp, "numpy": mandelbrot_np, "cupy": mandelbrot_cupy, "arrayfire": mandelbrot_af, "cupynumeric" : mandelbrot_cupynumeric} diff --git a/benchmarks/src/pytest_benchmark/test_montecarlo_pi.py b/benchmarks/src/pytest_benchmark/test_montecarlo_pi.py index 752b701..a296cf9 100644 --- a/benchmarks/src/pytest_benchmark/test_montecarlo_pi.py +++ b/benchmarks/src/pytest_benchmark/test_montecarlo_pi.py @@ -47,5 +47,11 @@ def calc_pi_dpnp(samples): y = dpnp.random.rand(samples).astype(dpnp.float32) return 4.0 * dpnp.sum(in_circle(x, y)) / samples +def calc_pi_cupynumeric(samples): + x = cupynumeric.random.rand(samples).astype(cupynumeric.float32) + y = cupynumeric.random.rand(samples).astype(cupynumeric.float32) + return 4.0 * cupynumeric.sum(in_circle(x, y)) / samples -FUNCS = {"dpnp": calc_pi_dpnp, "numpy": calc_pi_numpy, "cupy": calc_pi_cupy, "arrayfire": calc_pi_af} + +FUNCS = {"dpnp": calc_pi_dpnp, "numpy": calc_pi_numpy, "cupy": calc_pi_cupy, "arrayfire": calc_pi_af, + "cupynumeric": calc_pi_cupynumeric } diff --git a/benchmarks/src/pytest_benchmark/test_nbody.py b/benchmarks/src/pytest_benchmark/test_nbody.py index 8d35aa9..1255faf 100644 --- a/benchmarks/src/pytest_benchmark/test_nbody.py +++ b/benchmarks/src/pytest_benchmark/test_nbody.py @@ -10,6 +10,7 @@ @pytest.mark.parametrize("pkgid", IDS, ids=IDS) class TestNbody: def test_nbody(self, benchmark, pkgid): + initialize_package(pkgid) pkg = PKGDICT[pkgid] setup = lambda: (generate_arrays(pkgid), {}) @@ -107,5 +108,8 @@ def generate_arrays(pkgid): arr_list.append(M * np.ones((NSIZE, 1), dtype=DTYPE)) for i in range(count): arr_list.append(np.random.rand(NSIZE, 3).astype(DTYPE)) - + elif "cupynumeric" == pkgname: + arr_list.append(M * cupynumeric.ones((NSIZE, 1), dtype=DTYPE)) + for i in range(count): + arr_list.append(cupynumeric.random.rand(NSIZE, 3).astype(DTYPE)) return (pkg, arr_list[0], arr_list[1], arr_list[2]) 
diff --git a/benchmarks/src/pytest_benchmark/test_nn.py b/benchmarks/src/pytest_benchmark/test_nn.py index 7940cb0..e61f704 100644 --- a/benchmarks/src/pytest_benchmark/test_nn.py +++ b/benchmarks/src/pytest_benchmark/test_nn.py @@ -19,6 +19,7 @@ def test_neural_network(self, benchmark, pkgid): "numpy": NeuralNetwork_numpy, "cupy": NeuralNetwork_cupy, "arrayfire": NeuralNetwork_af, + "cupynumeric": NeuralNetwork_cupynumeric, } obj = nn[pkg.__name__]() @@ -281,6 +282,87 @@ def train(self): def predict(self, X): return cupy.argmax(self.forward(X), axis=1) +class NeuralNetwork_cupynumeric: + def __init__(self): + self.input_size = INPUT_SIZE + self.hidden_size = HIDDEN_SIZE + self.output_size = OUTPUT_SIZE + self.learning_rate = LEARNING_RATE + + # Initialize weights and biases + # He initialization (for ReLU) is often a good choice + self.W1 = cupynumeric.random.randn(self.input_size, self.hidden_size) * cupynumeric.sqrt(2.0 / self.input_size) + self.b1 = cupynumeric.zeros((1, self.hidden_size)) + self.W2 = cupynumeric.random.randn(self.hidden_size, self.output_size) * cupynumeric.sqrt(2.0 / self.hidden_size) + self.b2 = cupynumeric.zeros((1, self.output_size)) + + self.X_train = cupynumeric.random.rand(SAMPLES, INPUT_SIZE) + self.y_train = cupynumeric.zeros((SAMPLES * OUTPUT_SIZE)) + self.y_train[ + cupynumeric.arange(SAMPLES) * OUTPUT_SIZE + cupynumeric.floor(cupynumeric.random.rand(SAMPLES) * OUTPUT_SIZE).astype(int) + ] = 1 + self.y_train = self.y_train.reshape((SAMPLES, OUTPUT_SIZE)) + + def relu(self, x): + return cupynumeric.maximum(0, x) + + def relu_derivative(self, x): + return (x > 0).astype(float) + + def softmax(self, x): + exp_scores = cupynumeric.exp(x - cupynumeric.max(x, axis=1, keepdims=True)) # Subtract max for numerical stability + return exp_scores / cupynumeric.sum(exp_scores, axis=1, keepdims=True) + + def forward(self, X): + # Hidden layer + self.z1 = cupynumeric.dot(X, self.W1) + self.b1 + self.a1 = self.relu(self.z1) + + # Output layer + 
self.z2 = cupynumeric.dot(self.a1, self.W2) + self.b2 + self.a2 = self.softmax(self.z2) + return self.a2 + + def backward(self, X, y, output): + # Calculate gradients for the output layer + error_output = output - y # Derivative of cross-entropy loss w.r.t. softmax input + dW2 = cupynumeric.dot(self.a1.T, error_output) + db2 = cupynumeric.sum(error_output, axis=0, keepdims=True) + + # Calculate gradients for the hidden layer + error_hidden = cupynumeric.dot(error_output, self.W2.T) * self.relu_derivative(self.z1) + dW1 = cupynumeric.dot(X.T, error_hidden) + db1 = cupynumeric.sum(error_hidden, axis=0, keepdims=True) + + # Update weights and biases + self.W2 -= self.learning_rate * dW2 + self.b2 -= self.learning_rate * db2 + self.W1 -= self.learning_rate * dW1 + self.b1 -= self.learning_rate * db1 + + def train(self): + X_train = self.X_train + y_train = self.y_train + + num_samples = X_train.shape[0] + + for epoch in range(ITERATIONS): + # Shuffle data for each epoch + X_shuffled = X_train + y_shuffled = y_train + + for i in range(0, num_samples, BATCH_SIZE): + X_batch = X_shuffled[i : i + BATCH_SIZE, :] + y_batch = y_shuffled[i : i + BATCH_SIZE, :] + + # Forward pass + output = self.forward(X_batch) + + # Backward pass and update weights + self.backward(X_batch, y_batch, output) + + def predict(self, X): + return cupynumeric.argmax(self.forward(X), axis=1) class NeuralNetwork_af: def __init__(self): @@ -299,8 +381,7 @@ def __init__(self): self.X_train = af.randu((SAMPLES, INPUT_SIZE)) self.y_train = af.constant(0, (SAMPLES, OUTPUT_SIZE)) - self.y_train = af.constant(0, (SAMPLES, OUTPUT_SIZE)) - self.y_train[af.iota(SAMPLES), af.floor(af.randu(SAMPLES) * OUTPUT_SIZE)] = 1 + self.y_train[af.iota(SAMPLES) * OUTPUT_SIZE + af.floor(af.randu(SAMPLES) * OUTPUT_SIZE)] = 1 af.eval(self.X_train) af.eval(self.y_train) diff --git a/benchmarks/src/pytest_benchmark/test_random.py b/benchmarks/src/pytest_benchmark/test_random.py index 19caebb..9f32a3e 100644 --- 
a/benchmarks/src/pytest_benchmark/test_random.py +++ b/benchmarks/src/pytest_benchmark/test_random.py @@ -15,6 +15,8 @@ def randn_cupy(): arr = cupy.random.normal(size=(NNSIZE)) cupy.cuda.runtime.deviceSynchronize() +def randn_cupynumeric(): + arr = cupynumeric.random.normal(size=(NNSIZE)) def randn_af(): arr = af.randn((NNSIZE)) @@ -34,6 +36,8 @@ def randu_cupy(): arr = cupy.random.uniform(size=(NNSIZE)) cupy.cuda.runtime.deviceSynchronize() +def randu_cupynumeric(): + arr = cupynumeric.random.uniform(size=(NNSIZE)) def randu_af(): arr = af.randu((NNSIZE)) @@ -47,7 +51,8 @@ def test_normal(self, benchmark, pkgid): initialize_package(pkgid) pkg = PKGDICT[pkgid] - FUNCS = {"dpnp": randn_dpnp, "numpy": randn_np, "cupy": randn_cupy, "arrayfire": randn_af} + FUNCS = {"dpnp": randn_dpnp, "numpy": randn_np, "cupy": randn_cupy, "arrayfire": randn_af, + "cupynumeric": randn_cupynumeric} benchmark.extra_info["description"] = f"{NNSIZE:.2e} Samples" result = benchmark.pedantic(target=FUNCS[pkg.__name__], rounds=ROUNDS, iterations=ITERATIONS) @@ -56,6 +61,7 @@ def test_uniform(self, benchmark, pkgid): initialize_package(pkgid) pkg = PKGDICT[pkgid] - FUNCS = {"dpnp": randu_dpnp, "numpy": randu_np, "cupy": randu_cupy, "arrayfire": randu_af} + FUNCS = {"dpnp": randu_dpnp, "numpy": randu_np, "cupy": randu_cupy, "arrayfire": randu_af, + "cupynumeric": randu_cupynumeric} result = benchmark.pedantic(target=FUNCS[pkg.__name__], rounds=ROUNDS, iterations=ITERATIONS) diff --git a/benchmarks/src/requirements.txt b/benchmarks/src/requirements.txt index 30b908b..466f995 100644 --- a/benchmarks/src/requirements.txt +++ b/benchmarks/src/requirements.txt @@ -2,6 +2,7 @@ dpnp numpy cupy-cuda12x +nvidia-cupynumeric pytest pytest-benchmark matplotlib diff --git a/docs/installation.rst b/docs/installation.rst index 017d733..b147e15 100644 --- a/docs/installation.rst +++ b/docs/installation.rst @@ -5,17 +5,32 @@ To use ArrayFire-Python you require three things: .. 
list-table:: - * - :literal:`arrayfire-python` package - - You may install it through pip :literal:`pip install arrayfire-python` or from `building arrayfire-python wheel <../README.md>`_ + * - :literal:`arrayfire-py` package + - It is the intended User Interface that provides a numpy-like layer to execute math and array operations with ArrayFire. * - :literal:`arrayfire-binary-python-wrapper` - - You may download and install through pip :literal:`pip install arrayfire_binary_python_wrapper-0.8.0+af3.10.0 -f https://arrayfire.com/python/binaries` which will come with ArrayFire C Libraries as well. You may also `build from source `_ without the C Libraries. + - It is a thin wrapper that provides Python direct access to the ArrayFire functions in the C library. This package must have access to ArrayFire binaries, either through a system-wide install, or through a pre-bundled wheel that includes binaries. * - :literal:`ArrayFire C/C++ Libraries` - - If you build the binary wrapper from source or wish to program with ArrayFire in C/C++, navigate to https://arrayfire.com/download and download the appropriate installer for the target architecture and operating system. Although ArrayFire can be `built from source `_, the installers conveniently package necessary dependencies. + - They are the binaries obtained from compiling the ArrayFire C/C++ Project or more simply by downloading installers in the ArrayFire download page. Binaries can also be obtained as part of a pre-packaged arrayfire-binary-python-wrapper wheel. Below we detail more on the ArrayFire C Libraries installation procedure. Install the latest device drivers before using ArrayFire. Drivers and runtimes should be downloaded and installed from each device vendor's website. 
+Install Instructions for ArrayFire Python +############################################### + +arrayfire-py +************* +You may install it through pip :literal:`pip install arrayfire-python` or from `building arrayfire-py wheel <../README.md>`_. By installing through pip, you will automatically download the latest arrayfire-binary-python-wrapper containing the ArrayFire C Libraries. + +arrayfire-binary-python-wrapper +******************************** + +If you wish to use some specific version of ArrayFire, you must download a specific version of the arrayfire-binary-python-wrapper. +You may download one and install through pip which will come with ArrayFire C Libraries: :literal:`pip install arrayfire_binary_python_wrapper-0.8.0+af3.10.0 -f https://arrayfire.com/python/binaries`. +You may also `build from source `_ without the C Libraries and then install the ArrayFire Binaries via the ArrayFire Installers. + + Install Instructions for ArrayFire C Libraries ############################################### diff --git a/docs/release_notes.md b/docs/release_notes.md index cc17462..9657051 100644 --- a/docs/release_notes.md +++ b/docs/release_notes.md @@ -1,8 +1,8 @@ Release Notes {#releasenotes} ============== -v0.1.0 -====== +arrayfire-py v0.1.0 +==================== Welcome to the ArrayFire Python Bindings! These are the currently supported features: - Support for all backends (cpu, opencl, oneapi, cuda)