diff --git a/brainpy/connect/base.py b/brainpy/connect/base.py index 7bfaae917..fbde670c7 100644 --- a/brainpy/connect/base.py +++ b/brainpy/connect/base.py @@ -1,9 +1,9 @@ # -*- coding: utf-8 -*- import abc -from typing import Union, List, Tuple +from typing import Union, List, Tuple, Any -import numpy as np +import numpy as onp from brainpy import tools, math as bm from brainpy.errors import ConnectorError @@ -42,8 +42,8 @@ PRE2SYN, POST2SYN, PRE_SLICE, POST_SLICE] -MAT_DTYPE = np.bool_ -IDX_DTYPE = np.uint32 +MAT_DTYPE = onp.bool_ +IDX_DTYPE = onp.uint32 def set_default_dtype(mat_dtype=None, idx_dtype=None): @@ -92,7 +92,39 @@ class Connector(abc.ABC): class TwoEndConnector(Connector): - """Synaptic connector to build synapse connections between two neuron groups.""" + """Synaptic connector to build connections between two neuron groups. + + If users want to customize their `Connector`, there are two ways: + + 1. Implementing ``build_conn(self)`` function, which returns one of + the connection data ``csr`` (CSR sparse data, a tuple of ), + ``ij`` (COO sparse data, a tuple of ), and ``mat`` + (a binary connection matrix). For instance, + + .. code-block:: python + + import brainpy as bp + class MyConnector(bp.conn.TwoEndConnector): + def build_conn(self): + return dict(csr=, mat=, ij=) + + 2. Implementing functions ``build_mat()``, ``build_csr()``, and + ``build_coo()``. Users can provide all three functions, or one of them. + + .. code-block:: python + + import brainpy as bp + class MyConnector(bp.conn.TwoEndConnector): + def build_mat(self, pre_size, post_size): + return conn_matrix + + def build_csr(self, pre_size, post_size): + return post_ids, inptr + + def build_coo(self, pre_size, post_size): + return pre_ids, post_ids + + """ def __init__(self, ): self.pre_size = None @@ -100,6 +132,9 @@ def __init__(self, ): self.pre_num = None self.post_num = None + def __repr__(self): + return self.__class__.__name__ + def __call__(self, pre_size, post_size): """Create the concrete connections between two end objects. @@ -140,15 +175,16 @@ def _reset_conn(self, pre_size, post_size): """ self.__call__(pre_size, post_size) - def check(self, structures: Union[Tuple, List, str]): - # check "pre_num" and "post_num" - try: - assert self.pre_num is not None and self.post_num is not None - except AssertionError: - raise ConnectorError(f'self.pre_num or self.post_num is not defined. ' - f'Please use self.__call__(pre_size, post_size) ' - f'before requiring properties.') + @property + def is_version2_style(self): + if ((hasattr(self.build_coo, 'not_customized') and self.build_coo.not_customized) and + (hasattr(self.build_csr, 'not_customized') and self.build_csr.not_customized) and + (hasattr(self.build_mat, 'not_customized') and self.build_mat.not_customized)): + return False + else: + return True + def check(self, structures: Union[Tuple, List, str]): # check synaptic structures if isinstance(structures, str): structures = [structures] @@ -159,22 +195,22 @@ def check(self, structures: Union[Tuple, List, str]): raise ConnectorError(f'Unknown synapse structure "{n}". 
' f'Only {SUPPORTED_SYN_STRUCTURE} is supported.') + + def _return_by_mat(self, structures, mat, all_data: dict): - assert isinstance(mat, np.ndarray) and np.ndim(mat) == 2 + assert mat.ndim == 2 if (CONN_MAT in structures) and (CONN_MAT not in all_data): all_data[CONN_MAT] = bm.asarray(mat, dtype=MAT_DTYPE) require_other_structs = len([s for s in structures if s != CONN_MAT]) > 0 if require_other_structs: - pre_ids, post_ids = np.where(mat > 0) - pre_ids = np.ascontiguousarray(pre_ids, dtype=IDX_DTYPE) - post_ids = np.ascontiguousarray(post_ids, dtype=IDX_DTYPE) + pre_ids, post_ids = onp.where(mat > 0) + pre_ids = onp.ascontiguousarray(pre_ids, dtype=IDX_DTYPE) + post_ids = onp.ascontiguousarray(post_ids, dtype=IDX_DTYPE) self._return_by_ij(structures, ij=(pre_ids, post_ids), all_data=all_data) def _return_by_csr(self, structures, csr: tuple, all_data: dict): indices, indptr = csr - assert isinstance(indices, np.ndarray) - assert isinstance(indptr, np.ndarray) assert self.pre_num == indptr.size - 1 if (CONN_MAT in structures) and (CONN_MAT not in all_data): @@ -182,7 +218,7 @@ def _return_by_csr(self, structures, csr: tuple, all_data: dict): all_data[CONN_MAT] = bm.asarray(conn_mat, dtype=MAT_DTYPE) if (PRE_IDS in structures) and (PRE_IDS not in all_data): - pre_ids = np.repeat(np.arange(self.pre_num), np.diff(indptr)) + pre_ids = onp.repeat(onp.arange(self.pre_num), onp.diff(indptr)) all_data[PRE_IDS] = bm.asarray(pre_ids, dtype=IDX_DTYPE) if (POST_IDS in structures) and (POST_IDS not in all_data): @@ -198,20 +234,18 @@ def _return_by_csr(self, structures, csr: tuple, all_data: dict): bm.asarray(indptrc, dtype=IDX_DTYPE)) if (PRE2SYN in structures) and (PRE2SYN not in all_data): - syn_seq = np.arange(indices.size, dtype=IDX_DTYPE) + syn_seq = onp.arange(indices.size, dtype=IDX_DTYPE) all_data[PRE2SYN] = (bm.asarray(syn_seq, dtype=IDX_DTYPE), bm.asarray(indptr, dtype=IDX_DTYPE)) if (POST2SYN in structures) and (POST2SYN not in all_data): - syn_seq = np.arange(indices.size, dtype=IDX_DTYPE) + syn_seq = onp.arange(indices.size, dtype=IDX_DTYPE) _, indptrc, syn_seqc = csr2csc((indices, indptr), self.post_num, syn_seq) all_data[POST2SYN] = (bm.asarray(syn_seqc, dtype=IDX_DTYPE), bm.asarray(indptrc, dtype=IDX_DTYPE)) def _return_by_ij(self, structures, ij: tuple, all_data: dict): pre_ids, post_ids = ij - assert isinstance(pre_ids, np.ndarray) - assert isinstance(post_ids, np.ndarray) if (CONN_MAT in structures) and (CONN_MAT not in all_data): all_data[CONN_MAT] = bm.asarray(ij2mat(ij, self.pre_num, self.post_num), dtype=MAT_DTYPE) @@ -232,9 +266,9 @@ def make_returns(self, structures, conn_data, csr=None, mat=None, ij=None): """Make the desired synaptic structures and return them. """ if isinstance(conn_data, dict): - csr = conn_data['csr'] - mat = conn_data['mat'] - ij = conn_data['ij'] + csr = conn_data.get('csr', None) + mat = conn_data.get('mat', None) + ij = conn_data.get('ij', None) elif isinstance(conn_data, tuple): if conn_data[0] == 'csr': csr = conn_data[1] @@ -244,6 +278,8 @@ def make_returns(self, structures, conn_data, csr=None, mat=None, ij=None): ij = conn_data[1] else: raise ConnectorError(f'Must provide one of "csr", "mat" or "ij". 
Got "{conn_data[0]}" instead.') + else: + raise ConnectorError # checking all_data = dict() @@ -254,22 +290,20 @@ def make_returns(self, structures, conn_data, csr=None, mat=None, ij=None): # "csr" structure if csr is not None: - assert isinstance(csr[0], np.ndarray) - assert isinstance(csr[1], np.ndarray) if (PRE2POST in structures) and (PRE2POST not in all_data): all_data[PRE2POST] = (bm.asarray(csr[0], dtype=IDX_DTYPE), bm.asarray(csr[1], dtype=IDX_DTYPE)) self._return_by_csr(structures, csr=csr, all_data=all_data) + # "mat" structure if mat is not None: - assert isinstance(mat, np.ndarray) and np.ndim(mat) == 2 + assert isinstance(mat, onp.ndarray) and onp.ndim(mat) == 2 if (CONN_MAT in structures) and (CONN_MAT not in all_data): all_data[CONN_MAT] = bm.asarray(mat, dtype=MAT_DTYPE) self._return_by_mat(structures, mat=mat, all_data=all_data) + # "ij" structure if ij is not None: - assert isinstance(ij[0], np.ndarray) - assert isinstance(ij[1], np.ndarray) if (PRE_IDS in structures) and (PRE_IDS not in structures): all_data[PRE_IDS] = bm.asarray(ij[0], dtype=IDX_DTYPE) if (POST_IDS in structures) and (POST_IDS not in structures): @@ -294,13 +328,73 @@ def build_conn(self): """ raise NotImplementedError - def require(self, *structures): + def require(self, *sizes_or_structures): + sizes_or_structures = list(sizes_or_structures) + pre_size = sizes_or_structures.pop(0) if len(sizes_or_structures) >= 1 else None + post_size = sizes_or_structures.pop(0) if len(sizes_or_structures) >= 1 else None + structures = sizes_or_structures + if isinstance(post_size, str): + structures.insert(0, post_size) + post_size = None + if isinstance(pre_size, str): + structures.insert(0, pre_size) + pre_size = None + + version2_style = (pre_size is not None) and (post_size is not None) + if not version2_style: + try: + assert self.pre_num is not None and self.post_num is not None + except AssertionError: + raise ConnectorError(f'self.pre_num or self.post_num is not defined. 
' + f'Please use self.__call__(pre_size, post_size) ' + f'before requiring connection data.') + if pre_size is None: + pre_size = self.pre_size + if post_size is None: + post_size = self.post_size + self.check(structures) - conn_data = self.build_conn() + if self.is_version2_style: + if (pre_size is None) or (post_size is None): + raise ConnectorError('Please provide both "pre_size" and "post_size".') + if len(structures) == 1: + if PRE2POST in structures: + return self.build_csr(pre_size, post_size) + elif CONN_MAT in structures: + return self.build_mat(pre_size, post_size) + elif PRE_IDS in structures: + return self.build_coo(pre_size, post_size)[0] + elif POST_IDS in structures: + return self.build_coo(pre_size, post_size)[1] + elif len(structures) == 2: + if PRE_IDS in structures and POST_IDS in structures: + return self.build_coo(pre_size, post_size) + + conn_data = dict(csr=None, ij=None, mat=None) + if not hasattr(self.build_csr, 'not_customized'): + conn_data['csr'] = self.build_csr(pre_size, post_size) + elif not hasattr(self.build_coo, 'not_customized'): + conn_data['ij'] = self.build_coo(pre_size, post_size) + elif not hasattr(self.build_mat, 'not_customized'): + conn_data['mat'] = self.build_mat(pre_size, post_size) + else: + conn_data = self.build_conn() return self.make_returns(structures, conn_data) - def requires(self, *structures): - return self.require(*structures) + def requires(self, *sizes_or_structures): + return self.require(*sizes_or_structures) + + @tools.not_customized + def build_mat(self, pre_size, post_size): + pass + + @tools.not_customized + def build_csr(self, pre_size, post_size): + pass + + @tools.not_customized + def build_coo(self, pre_size, post_size): + pass class OneEndConnector(TwoEndConnector): @@ -336,16 +430,18 @@ def __call__(self, pre_size, post_size=None): def _reset_conn(self, pre_size, post_size=None): self.__init__() - self.__call__(pre_size, post_size) def csr2csc(csr, post_num, data=None): """Convert csr to csc.""" indices, indptr = csr + np = onp if isinstance(indices, onp.ndarray) else bm + kind = 'quicksort' if isinstance(indices, onp.ndarray) else 'stable' + pre_ids = np.repeat(np.arange(indptr.size - 1), np.diff(indptr)) - sort_ids = np.argsort(indices, kind='mergesort') # to maintain the original order of the elements with the same value + sort_ids = np.argsort(indices, kind=kind) # to maintain the original order of the elements with the same value pre_ids_new = np.asarray(pre_ids[sort_ids], dtype=IDX_DTYPE) unique_post_ids, count = np.unique(indices, return_counts=True) @@ -365,8 +461,8 @@ def csr2csc(csr, post_num, data=None): def mat2csr(dense): """convert a dense matrix to (indices, indptr).""" - if isinstance(dense, bm.ndarray): - dense = np.asarray(dense) + np = onp if isinstance(dense, onp.ndarray) else bm + pre_ids, post_ids = np.where(dense > 0) pre_num = dense.shape[0] @@ -382,6 +478,8 @@ def mat2csr(dense): def csr2mat(csr, num_pre, num_post): """convert (indices, indptr) to a dense matrix.""" indices, indptr = csr + np = onp if isinstance(indices, onp.ndarray) else bm + d = np.zeros((num_pre, num_post), dtype=MAT_DTYPE) # num_pre, num_post pre_ids = np.repeat(np.arange(indptr.size - 1), np.diff(indptr)) d[pre_ids, indices] = True @@ -391,6 +489,8 @@ def csr2mat(csr, num_pre, num_post): def ij2mat(ij, num_pre, num_post): """convert (indices, indptr) to a dense matrix.""" pre_ids, post_ids = ij + np = onp if isinstance(pre_ids, onp.ndarray) else bm + d = np.zeros((num_pre, num_post), dtype=MAT_DTYPE) # num_pre, num_post 
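The reworked ``require`` above accepts either structure names alone (after the connector has been bound to sizes via ``__call__``) or the pre/post sizes followed by the structure names. A minimal usage sketch, mirroring the ``FixedProb`` test changes later in this diff:

import brainpy as bp

conn = bp.connect.FixedProb(prob=0.1, seed=123)

# style 1: bind the sizes first, then request structures
conn(pre_size=(10, 20), post_size=(10, 20))
pre_ids, post_ids = conn.require('pre_ids', 'post_ids')

# style 2: pass sizes to require() directly; they are used for this call
# in place of the previously bound sizes
mat = conn.require(100, 1000, bp.connect.CONN_MAT)  # shape (100, 1000)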
d[pre_ids, post_ids] = True return d @@ -398,8 +498,11 @@ def ij2mat(ij, num_pre, num_post): def ij2csr(pre_ids, post_ids, num_pre): """convert pre_ids, post_ids to (indices, indptr).""" + np = onp if isinstance(pre_ids, onp.ndarray) else bm + kind = 'quicksort' if isinstance(pre_ids, onp.ndarray) else 'stable' + # sorting - sort_ids = np.argsort(pre_ids, kind='mergesort') + sort_ids = np.argsort(pre_ids, kind=kind) post_ids = post_ids[sort_ids] indices = post_ids diff --git a/brainpy/connect/random_conn.py b/brainpy/connect/random_conn.py index 9e93c0295..25841f3cb 100644 --- a/brainpy/connect/random_conn.py +++ b/brainpy/connect/random_conn.py @@ -1,7 +1,8 @@ # -*- coding: utf-8 -*- - +import jax import numpy as np +from brainpy import math as bm from brainpy.errors import ConnectorError from brainpy.tools.others import numba_seed, numba_jit, SUPPORT_NUMBA, format_seed from .base import * @@ -42,29 +43,36 @@ def __init__(self, prob, pre_ratio=1., include_self=True, seed=None): self.pre_ratio = pre_ratio self.include_self = include_self self.seed = format_seed(seed) - self.rng = np.random.RandomState(seed=self.seed) + self.rng = bm.random.RandomState(seed=self.seed) - rng = np.random if SUPPORT_NUMBA else self.rng + def __repr__(self): + return (f'{self.__class__.__name__}(prob={self.prob}, pre_ratio={self.pre_ratio}, ' + f'include_self={self.include_self}, seed={self.seed})') + + def build_conn(self): + if SUPPORT_NUMBA: + numba_seed(self.seed) + rng = np.random + else: + rng = np.random.RandomState(self.seed) + + include_self = self.include_self + pre_ratio = self.pre_ratio + prob = self.prob - def _connect(pre_i, num_post): + @numba_jit + def f_connect(pre_i, num_post): if rng.random() < pre_ratio: p = rng.random(num_post) <= prob if (not include_self) and pre_i < num_post: p[pre_i] = False return np.where(p)[0] - self._connect = numba_jit(_connect) - - def build_conn(self): - # seed - self.seed = self.rng.randint(1, int(1e7)) - if SUPPORT_NUMBA: numba_seed(self.seed) - # make connections ind = [] count = np.zeros(self.pre_num, dtype=IDX_DTYPE) for i in range(self.pre_num): - posts = self._connect(pre_i=i, num_post=self.post_num) + posts = f_connect(pre_i=i, num_post=self.post_num) if posts is not None: ind.append(posts) count[i] = len(posts) @@ -73,6 +81,56 @@ def build_conn(self): return 'csr', (ind, indptr) + def build_mat(self, pre_size, post_size): + pre_num = np.prod(pre_size) + post_num = np.prod(post_size) + pre_state = self.rng.rand(pre_num, 1) < self.pre_ratio + mat = (self.rng.rand(pre_num, post_num) < self.prob) * pre_state + if not self.include_self: + bm.fill_diagonal(mat, False) + return mat.astype(MAT_DTYPE) + + def build_coo(self, pre_size, post_size): + pre_num = np.prod(pre_size) + post_num = np.prod(post_size) + post_num_to_select = int(post_num * self.prob) + post_ids = bm.arange(post_num) + if self.pre_ratio < 1.: + pre_num_to_select = int(pre_num * self.pre_ratio) + pre_ids = self.rng.choice(pre_num, size=pre_num_to_select, replace=False) + else: + pre_ids = bm.arange(pre_num) + + @jax.vmap + def f(i, key): + posts = bm.delete(post_ids, i) if not self.include_self else post_ids + return self.rng.permutation(posts, key=key)[:post_num_to_select] + + selected_pre_ids = bm.repeat(pre_ids, post_num_to_select) + selected_post_ids = f(pre_ids, self.rng.split_keys(pre_ids.size)).flatten() + return selected_pre_ids.astype(IDX_DTYPE), selected_post_ids.astype(IDX_DTYPE) + + def build_csr(self, pre_size, post_size): + pre_num = np.prod(pre_size) + post_num = 
np.prod(post_size) + post_num_to_select = int(post_num * self.prob) + post_ids = bm.arange(post_num) + if self.pre_ratio < 1.: + pre_num_to_select = int(pre_num * self.pre_ratio) + pre_ids = self.rng.choice(pre_num, size=pre_num_to_select, replace=False) + else: + pre_num_to_select = pre_num + pre_ids = bm.arange(pre_num) + + @jax.vmap + def f(i, key): + posts = bm.delete(post_ids, i) if not self.include_self else post_ids + return self.rng.permutation(posts, key=key)[:post_num_to_select] + + selected_post_ids = f(pre_ids, self.rng.split_keys(pre_ids.size)).flatten() + selected_pre_inptr = bm.cumsum(bm.concatenate([bm.zeros(1), bm.ones(pre_num_to_select) * post_num_to_select])) + return selected_post_ids.astype(IDX_DTYPE), selected_pre_inptr.astype(IDX_DTYPE) + class FixedNum(TwoEndConnector): """Connect with fixed number for each pre- or post-synaptic neuron. @@ -111,6 +169,11 @@ def _fixed_num_prob(num_need, num_total, i=0): self._connect = numba_jit(_fixed_num_prob) + def __repr__(self): + return (f'{self.__class__.__name__}(num={self.num}, ' + f'include_self={self.include_self}, ' + f'seed={self.seed})') + class FixedPreNum(FixedNum): """Connect the pre-synaptic neurons with fixed number for each post-synaptic neuron. @@ -124,6 +187,11 @@ class FixedPreNum(FixedNum): Whether create (i, i) conn ? """ + def __repr__(self): + return (f'{self.__class__.__name__}(num={self.num}, ' + f'include_self={self.include_self}, ' + f'seed={self.seed})') + def build_conn(self): # check if isinstance(self.num, int): @@ -163,6 +231,11 @@ class FixedPostNum(FixedNum): Seed the random generator. """ + def __repr__(self): + return (f'{self.__class__.__name__}(num={self.num}, ' + f'include_self={self.include_self}, ' + f'seed={self.seed})') + def build_conn(self): # check if isinstance(self.num, int): @@ -245,6 +318,13 @@ def __init__( self.seed = format_seed(seed) self.rng = np.random.RandomState(self.seed) + def __repr__(self): + return (f'{self.__class__.__name__}(sigma={self.sigma}, ' + f'normalize={self.normalize}, ' + f'periodic_boundary={self.periodic_boundary}, ' + f'include_self={self.include_self}, ' + f'seed={self.seed})') + def build_conn(self): # value range to encode if self.encoding_values is None: @@ -373,6 +453,13 @@ def _smallworld_rewire(i, all_j): self._connect = numba_jit(_smallworld_rewire) + def __repr__(self): + return (f'{self.__class__.__name__}(prob={self.prob}, ' + f'directed={self.directed}, ' + f'num_neighbor={self.num_neighbor}, ' + f'include_self={self.include_self}, ' + f'seed={self.seed})') + def build_conn(self): assert self.pre_size == self.post_size @@ -487,6 +574,11 @@ def _random_subset(seq, m): self._connect = numba_jit(_random_subset) + def __repr__(self): + return (f'{self.__class__.__name__}(m={self.m}, ' + f'directed={self.directed}, ' + f'seed={self.seed})') + def build_conn(self): assert self.pre_num == self.post_num @@ -573,6 +665,10 @@ def _random_subset(seq, m): self._connect = numba_jit(_random_subset) + def __repr__(self): + return (f'{self.__class__.__name__}(m1={self.m1}, m2={self.m2}, ' + f'p={self.p}, directed={self.directed}, seed={self.seed})') + def build_conn(self): assert self.pre_num == self.post_num # seed @@ -683,6 +779,9 @@ def _random_subset(seq, m): self._connect = numba_jit(_random_subset) + def __repr__(self): + return (f'{self.__class__.__name__}(m={self.m}, p={self.p}, directed={self.directed}, seed={self.seed})') + def build_conn(self): assert self.pre_num == self.post_num # seed diff --git a/brainpy/connect/regular_conn.py 
b/brainpy/connect/regular_conn.py index 98c2f40a5..0378d2d23 100644 --- a/brainpy/connect/regular_conn.py +++ b/brainpy/connect/regular_conn.py @@ -24,6 +24,7 @@ class One2One(TwoEndConnector): """Connect two neuron groups one by one. This means The two neuron groups should have the same size. """ + def __init__(self): super(One2One, self).__init__() @@ -56,6 +57,9 @@ def __init__(self, include_self=True): self.include_self = include_self super(All2All, self).__init__() + def __repr__(self): + return (f'{self.__class__.__name__}(include_self={self.include_self})') + def build_conn(self): mat = np.ones((self.pre_num, self.post_num), dtype=MAT_DTYPE) if not self.include_self: @@ -103,6 +107,9 @@ def __init__(self, include_self=False): super(GridFour, self).__init__() self.include_self = include_self + def __repr__(self): + return (f'{self.__class__.__name__}(include_self={self.include_self})') + def build_conn(self): # only the 1- or 2-D structure is supported if len(self.pre_size) == 1: @@ -170,6 +177,9 @@ def __init__(self, N=1, include_self=False): self.N = N self.include_self = include_self + def __repr__(self): + return (f'{self.__class__.__name__}(N={self.N}, include_self={self.include_self})') + def build_conn(self): if len(self.pre_size) == 1: height, width = self.pre_size[0], 1 diff --git a/brainpy/connect/tests/test_random_conn.py b/brainpy/connect/tests/test_random_conn.py index 11996df99..ac76468a3 100644 --- a/brainpy/connect/tests/test_random_conn.py +++ b/brainpy/connect/tests/test_random_conn.py @@ -2,23 +2,27 @@ import pytest +import unittest + import brainpy as bp -def test_random_prob(): - conn1 = bp.connect.FixedProb(prob=0.1, seed=123) - conn1(pre_size=(10, 20), post_size=(10, 20)) - pre_ids, post_ids, pre2post = conn1.require('pre_ids', 'post_ids', 'pre2post') +class TestFixedProb(unittest.TestCase): + def test_size_consistent(self): + conn1 = bp.connect.FixedProb(prob=0.1, seed=123) + conn1(pre_size=(10, 20), post_size=(10, 20)) + pre_ids, post_ids, pre2post = conn1.require('pre_ids', 'post_ids', 'pre2post') + self.assertTrue(len(pre_ids) == len(post_ids)) + self.assertTrue(len(pre_ids) == len(pre2post[0])) - conn2 = bp.connect.FixedProb(prob=0.1, seed=123) - conn2(pre_size=(10, 20), post_size=(10, 20)) - mat = conn2.require(bp.connect.CONN_MAT) - pre_ids2, post_ids2 = bp.math.where(mat) + def test_require_method(self): + conn2 = bp.connect.FixedProb(prob=0.1, seed=123) + conn2(pre_size=(10, 20), post_size=(10, 20)) + mat = conn2.require(100, 1000, bp.connect.CONN_MAT) + self.assertTrue(mat.shape == (100, 1000)) - print() - assert bp.math.array_equal(pre_ids, pre_ids2) - assert bp.math.array_equal(post_ids, post_ids2) - print('weight_mat', mat) + mat = conn2.require(bp.connect.CONN_MAT) + self.assertTrue(mat.shape == (200, 200)) def test_random_fix_pre1(): diff --git a/brainpy/math/tests/test_numpy_indexing.py b/brainpy/math/tests/test_numpy_indexing.py index 9ee2e3893..039f2d516 100644 --- a/brainpy/math/tests/test_numpy_indexing.py +++ b/brainpy/math/tests/test_numpy_indexing.py @@ -15,6 +15,7 @@ # limitations under the License. 
+import pytest import enum import itertools import typing @@ -403,7 +404,7 @@ def check_grads(f, args, order, atol=None, rtol=None, eps=None): MODES = ["clip", "drop", "promise_in_bounds"] - +@pytest.mark.skipif(True, reason="No longer need to test.") class IndexingTest(jtu.JaxTestCase): """Tests for Numpy indexing translation rules.""" @@ -1013,6 +1014,7 @@ def _update_tol(op): return tol +@pytest.mark.skipif(True, reason="No longer need to test.") @jtu.with_config(jax_numpy_dtype_promotion='standard') class IndexedUpdateTest(jtu.JaxTestCase): diff --git a/brainpy/math/tests/test_numpy_ops.py b/brainpy/math/tests/test_numpy_ops.py index 678bcd555..4522bae19 100644 --- a/brainpy/math/tests/test_numpy_ops.py +++ b/brainpy/math/tests/test_numpy_ops.py @@ -13,6 +13,7 @@ # limitations under the License. +import pytest import collections import functools from functools import partial @@ -545,7 +546,7 @@ def wrapper(*args, **kw): return wrapper - +@pytest.mark.skipif(True, reason="No longer need to test.") @jtu.with_config(jax_numpy_dtype_promotion='standard') class LaxBackedNumpyTests(jtu.JaxTestCase): """Tests for LAX-backed Numpy implementation.""" @@ -5991,6 +5992,7 @@ def grad_test_spec(op, nargs, order, rng_factory, dtypes, name=None, tol=None): GradSpecialValuesTestSpec(bm.sinc, [0.], 1), ] +@pytest.mark.skipif(True, reason="No longer need to test.") @jtu.with_config(jax_numpy_dtype_promotion='standard') class NumpyGradTests(jtu.JaxTestCase): @parameterized.named_parameters(itertools.chain.from_iterable( @@ -6095,6 +6097,7 @@ def _dtypes_for_ufunc(name: str) -> Iterator[Tuple[str, ...]]: else: yield arg_dtypes +@pytest.mark.skipif(True, reason="No longer need to test.") @jtu.with_config(jax_numpy_dtype_promotion='standard') class NumpyUfuncTests(jtu.JaxTestCase): @parameterized.named_parameters( diff --git a/extensions/CMakeLists.txt b/extensions/CMakeLists.txt index 85a048270..1216f9a65 100644 --- a/extensions/CMakeLists.txt +++ b/extensions/CMakeLists.txt @@ -22,17 +22,17 @@ include_directories(${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) pybind11_add_module( gpu_ops ${CMAKE_CURRENT_LIST_DIR}/lib/gpu_ops.cc - ${CMAKE_CURRENT_LIST_DIR}/lib/event_sum_gpu.cu - ${CMAKE_CURRENT_LIST_DIR}/lib/atomic_prod_gpu.cu - ${CMAKE_CURRENT_LIST_DIR}/lib/atomic_sum_gpu.cu) + ${CMAKE_CURRENT_LIST_DIR}/lib/gpu_event_sum.cu + ${CMAKE_CURRENT_LIST_DIR}/lib/gpu_atomic_prod.cu + ${CMAKE_CURRENT_LIST_DIR}/lib/gpu_atomic_sum.cu) install(TARGETS gpu_ops DESTINATION brainpylib) pybind11_add_module( cpu_ops ${CMAKE_CURRENT_LIST_DIR}/lib/cpu_ops.cc - ${CMAKE_CURRENT_LIST_DIR}/lib/event_prod_cpu.cc - ${CMAKE_CURRENT_LIST_DIR}/lib/event_sum_cpu.cc - ${CMAKE_CURRENT_LIST_DIR}/lib/atomic_prod_cpu.cc - ${CMAKE_CURRENT_LIST_DIR}/lib/atomic_sum_cpu.cc + ${CMAKE_CURRENT_LIST_DIR}/lib/cpu_event_prod.cc + ${CMAKE_CURRENT_LIST_DIR}/lib/cpu_event_sum.cc + ${CMAKE_CURRENT_LIST_DIR}/lib/cpu_atomic_prod.cc + ${CMAKE_CURRENT_LIST_DIR}/lib/cpu_atomic_sum.cc ) install(TARGETS cpu_ops DESTINATION brainpylib) diff --git a/extensions/brainpylib/__init__.py b/extensions/brainpylib/__init__.py index 2328558f4..b50536a70 100644 --- a/extensions/brainpylib/__init__.py +++ b/extensions/brainpylib/__init__.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -__version__ = "0.0.6" +__version__ = "0.0.7" # IMPORTANT, must import first from . 
import register_custom_calls diff --git a/extensions/brainpylib/atomic_prod.py b/extensions/brainpylib/atomic_prod.py index f6cf27172..3b6973c00 100644 --- a/extensions/brainpylib/atomic_prod.py +++ b/extensions/brainpylib/atomic_prod.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- __all__ = [ - 'atomic_prod', + 'coo_atomic_prod', ] from functools import partial @@ -21,10 +21,10 @@ x_shape = xla_client.Shape.array_shape x_ops = xla_client.ops -_atomic_prod_prim = core.Primitive("atomic_prod") +coo_atomic_prod_p1 = core.Primitive("coo_atomic_prod_p1") -def atomic_prod(values, post_ids, post_num, pre_ids=None): +def coo_atomic_prod(values, post_ids, post_num, pre_ids=None): # connections if jnp.size(values) != 1: assert pre_ids is not None, 'Must provide "pre_ids" when "values" is not a scalar.' @@ -50,18 +50,17 @@ def atomic_prod(values, post_ids, post_num, pre_ids=None): raise ValueError(f'The size of "values" must be 1 (a scalar) or longer than pre_size (a vector), ' f'while we got {values.size} != 1 <= {pre_ids.max()}') values = values.flatten() - out = jnp.zeros(post_num, dtype=values.dtype) # bind operator - return _atomic_prod_prim.bind(values, pre_ids, post_ids, post_num=post_num) + return coo_atomic_prod_p1.bind(values, pre_ids, post_ids, post_num=post_num) def _atomic_prod_abstract(values, pre_ids, post_ids, *, post_num): return ShapedArray(shape=(post_num, ), dtype=values.dtype) -_atomic_prod_prim.def_abstract_eval(_atomic_prod_abstract) -_atomic_prod_prim.def_impl(partial(xla.apply_primitive, _atomic_prod_prim)) +coo_atomic_prod_p1.def_abstract_eval(_atomic_prod_abstract) +coo_atomic_prod_p1.def_impl(partial(xla.apply_primitive, coo_atomic_prod_p1)) def _atomic_prod_translation(c, values, pre_ids, post_ids, *, post_num, platform="cpu"): @@ -83,7 +82,7 @@ def _atomic_prod_translation(c, values, pre_ids, post_ids, *, post_num, platform # We dispatch a different call depending on the dtype values_dim = values_info.dimensions() - v_type = b'_atomic_prod_homo' if (values_dim[0] == 1) else b'_atomic_prod_heter' + v_type = b'_coo_atomic_prod_homo' if (values_dim[0] == 1) else b'_coo_atomic_prod_heter' f_type = b'_f32' if values_dtype == np.float32 else b'_f64' i_type = b'_i32' if Itype == np.uint32 else b'_i64' @@ -91,7 +90,8 @@ def _atomic_prod_translation(c, values, pre_ids, post_ids, *, post_num, platform if platform == "cpu": if values_dim[0] != 1: return x_ops.CustomCallWithLayout( - c, platform.encode() + v_type + f_type + i_type, + c, + platform.encode() + v_type + f_type + i_type, operands=(values, pre_ids, post_ids, @@ -106,7 +106,8 @@ def _atomic_prod_translation(c, values, pre_ids, post_ids, *, post_num, platform ) else: return x_ops.CustomCallWithLayout( - c, platform.encode() + v_type + f_type + i_type, + c, + platform.encode() + v_type + f_type + i_type, operands=(values, post_ids, x_ops.ConstantLiteral(c, conn_size), @@ -120,7 +121,7 @@ def _atomic_prod_translation(c, values, pre_ids, post_ids, *, post_num, platform elif platform == 'gpu': if gpu_ops is None: raise ValueError('Cannot find compiled gpu wheels.') - opaque = gpu_ops.build_atomic_prod_descriptor(conn_size, post_num) + opaque = gpu_ops.build_coo_atomic_prod_descriptor(conn_size, post_num) if values_dim[0] != 1: return x_ops.CustomCallWithLayout( c, platform.encode() + v_type + f_type + i_type, @@ -148,5 +149,5 @@ def _atomic_prod_translation(c, values, pre_ids, post_ids, *, post_num, platform raise ValueError("Unsupported platform; this must be either 'cpu' or 'gpu'") 
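The renamed ``coo_atomic_prod`` keeps the original calling convention ``(values, post_ids, post_num, pre_ids=None)`` and returns an array of length ``post_num``. A short call sketch taken from the updated tests below (a scalar ``values`` is the homogeneous case; a per-presynaptic-index vector plus ``pre_ids`` is the heterogeneous case):

import jax.numpy as jnp
from brainpylib import coo_atomic_prod

size = 200
post_ids = jnp.arange(size, dtype=jnp.uint32)

# homogeneous: one scalar value for every connection
out = coo_atomic_prod(2., post_ids, size)  # shape (size,)

# heterogeneous: one value per presynaptic index, so pre_ids must be given
pre_ids = jnp.arange(size, dtype=jnp.uint32)
values = jnp.ones(size, dtype=jnp.float32)
out = coo_atomic_prod(values, post_ids, size, pre_ids)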
-xla.backend_specific_translations["cpu"][_atomic_prod_prim] = partial(_atomic_prod_translation, platform="cpu") -xla.backend_specific_translations["gpu"][_atomic_prod_prim] = partial(_atomic_prod_translation, platform="gpu") +xla.backend_specific_translations["cpu"][coo_atomic_prod_p1] = partial(_atomic_prod_translation, platform="cpu") +xla.backend_specific_translations["gpu"][coo_atomic_prod_p1] = partial(_atomic_prod_translation, platform="gpu") diff --git a/extensions/brainpylib/atomic_sum.py b/extensions/brainpylib/atomic_sum.py index 6f6246d12..457831249 100644 --- a/extensions/brainpylib/atomic_sum.py +++ b/extensions/brainpylib/atomic_sum.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- __all__ = [ - 'atomic_sum', + 'coo_atomic_sum', ] from functools import partial @@ -21,10 +21,10 @@ x_shape = xla_client.Shape.array_shape x_ops = xla_client.ops -_atomic_sum_prim = core.Primitive("atomic_sum") +coo_atomic_sum_p1 = core.Primitive("coo_atomic_sum_p1") -def atomic_sum(values, post_ids, post_num, pre_ids=None): +def coo_atomic_sum(values, post_ids, post_num, pre_ids=None): # connections if jnp.size(values) != 1: assert pre_ids is not None, 'Must provide "pre_ids" when "values" is not a scalar.' @@ -52,15 +52,15 @@ def atomic_sum(values, post_ids, post_num, pre_ids=None): values = values.flatten() # bind operator - return _atomic_sum_prim.bind(values, pre_ids, post_ids, post_num=post_num) + return coo_atomic_sum_p1.bind(values, pre_ids, post_ids, post_num=post_num) def _atomic_sum_abstract(values, pre_ids, post_ids, *, post_num): return ShapedArray(dtype=values.dtype, shape=(post_num,)) -_atomic_sum_prim.def_abstract_eval(_atomic_sum_abstract) -_atomic_sum_prim.def_impl(partial(xla.apply_primitive, _atomic_sum_prim)) +coo_atomic_sum_p1.def_abstract_eval(_atomic_sum_abstract) +coo_atomic_sum_p1.def_impl(partial(xla.apply_primitive, coo_atomic_sum_p1)) def _atomic_sum_translation(c, values, pre_ids, post_ids, *, post_num, platform="cpu"): @@ -80,7 +80,7 @@ def _atomic_sum_translation(c, values, pre_ids, post_ids, *, post_num, platform= # We dispatch a different call depending on the dtype values_dim = values_info.dimensions() - v_type = b'_atomic_sum_homo' if (values_dim[0] == 1) else b'_atomic_sum_heter' + v_type = b'_coo_atomic_sum_homo' if (values_dim[0] == 1) else b'_coo_atomic_sum_heter' f_type = b'_f32' if values_dtype == np.float32 else b'_f64' i_type = b'_i32' if Itype == np.uint32 else b'_i64' @@ -118,7 +118,7 @@ def _atomic_sum_translation(c, values, pre_ids, post_ids, *, post_num, platform= if gpu_ops is None: raise ValueError('Cannot find compiled gpu wheels.') - opaque = gpu_ops.build_atomic_sum_descriptor(conn_size, post_num) + opaque = gpu_ops.build_coo_atomic_sum_descriptor(conn_size, post_num) if values_dim[0] != 1: return x_ops.CustomCallWithLayout( c, platform.encode() + v_type + f_type + i_type, @@ -144,5 +144,5 @@ def _atomic_sum_translation(c, values, pre_ids, post_ids, *, post_num, platform= raise ValueError("Unsupported platform; this must be either 'cpu' or 'gpu'") -xla.backend_specific_translations["cpu"][_atomic_sum_prim] = partial(_atomic_sum_translation, platform="cpu") -xla.backend_specific_translations["gpu"][_atomic_sum_prim] = partial(_atomic_sum_translation, platform="gpu") +xla.backend_specific_translations["cpu"][coo_atomic_sum_p1] = partial(_atomic_sum_translation, platform="cpu") +xla.backend_specific_translations["gpu"][coo_atomic_sum_p1] = partial(_atomic_sum_translation, platform="gpu") diff --git a/extensions/brainpylib/event_prod.py 
b/extensions/brainpylib/event_prod.py index 7219d7a4a..ad549a5bd 100644 --- a/extensions/brainpylib/event_prod.py +++ b/extensions/brainpylib/event_prod.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- __all__ = [ - 'event_prod', + 'csr_event_prod', ] from functools import partial @@ -21,10 +21,10 @@ x_shape = xla_client.Shape.array_shape x_ops = xla_client.ops -_event_prod_prim = core.Primitive("event_prod") +csr_event_prod_p1 = core.Primitive("csr_event_prod") -def event_prod(events, pre2post, post_num, values): +def csr_event_prod(events, pre2post, post_num, values): # events if events.dtype != jnp.bool_: raise ValueError(f'"events" must be a vector of bool, while we got {events.dtype}') @@ -49,15 +49,15 @@ def event_prod(events, pre2post, post_num, values): f'while we got {values.size} != 1 != {indices.size}') values = values.flatten() # bind operator - return _event_prod_prim.bind(events, indices, indptr, values, post_num=post_num) + return csr_event_prod_p1.bind(events, indices, indptr, values, post_num=post_num) def _event_prod_abstract(events, indices, indptr, values, *, post_num): return ShapedArray(dtype=values.dtype, shape=(post_num,)) -_event_prod_prim.def_abstract_eval(_event_prod_abstract) -_event_prod_prim.def_impl(partial(xla.apply_primitive, _event_prod_prim)) +csr_event_prod_p1.def_abstract_eval(_event_prod_abstract) +csr_event_prod_p1.def_impl(partial(xla.apply_primitive, csr_event_prod_p1)) def _event_prod_translation(c, events, indices, indptr, values, *, post_num, platform="cpu"): @@ -83,7 +83,7 @@ def _event_prod_translation(c, events, indices, indptr, values, *, post_num, pla # And then the following is what changes between the GPU and CPU if platform == "cpu": - v_type = b'_event_prod_homo' if values_dim[0] == 1 else b'_event_prod_heter' + v_type = b'_csr_event_prod_homo' if values_dim[0] == 1 else b'_csr_event_prod_heter' return x_ops.CustomCallWithLayout( c, platform.encode() + v_type + f_type + i_type, operands=(x_ops.ConstantLiteral(c, pre_size), @@ -103,8 +103,8 @@ def _event_prod_translation(c, events, indices, indptr, values, *, post_num, pla elif platform == 'gpu': if gpu_ops is None: raise ValueError('Cannot find compiled gpu wheels.') - v_type = b'_event_prod_homo' if values_dim[0] == 1 else b'_event_prod_heter' - opaque = gpu_ops.build_event_prod_descriptor(pre_size, post_num) + v_type = b'_csr_event_prod_homo' if values_dim[0] == 1 else b'_csr_event_prod_heter' + opaque = gpu_ops.build_csr_event_prod_descriptor(pre_size, post_num) return x_ops.CustomCallWithLayout( c, platform.encode() + v_type + f_type + i_type, operands=(events, @@ -123,7 +123,7 @@ def _event_prod_translation(c, events, indices, indptr, values, *, post_num, pla raise ValueError("Unsupported platform, we only support 'cpu' or 'gpu'") -xla.backend_specific_translations["cpu"][_event_prod_prim] = partial(_event_prod_translation, platform="cpu") -xla.backend_specific_translations["gpu"][_event_prod_prim] = partial(_event_prod_translation, platform="gpu") +xla.backend_specific_translations["cpu"][csr_event_prod_p1] = partial(_event_prod_translation, platform="cpu") +xla.backend_specific_translations["gpu"][csr_event_prod_p1] = partial(_event_prod_translation, platform="gpu") diff --git a/extensions/brainpylib/event_sum.py b/extensions/brainpylib/event_sum.py index 433bf8373..e7bebb656 100644 --- a/extensions/brainpylib/event_sum.py +++ b/extensions/brainpylib/event_sum.py @@ -1,8 +1,8 @@ # -*- coding: utf-8 -*- __all__ = [ - 'event_sum', - 'event_sum2', + 'csr_event_sum', + 'coo_event_sum', ] 
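The split into ``csr_event_sum`` and ``coo_event_sum`` matches the two connection layouts: the CSR variant consumes the ``(indices, indptr)`` pair from ``require('pre2post')``, while the COO variant consumes flat ``pre_ids``/``post_ids`` vectors. A sketch following the updated test files:

import brainpy as bp
import brainpy.math as bm
from brainpylib import csr_event_sum, coo_event_sum

size = 200
conn = bp.conn.FixedProb(prob=0.5, seed=123)
conn(pre_size=size, post_size=size)
events = bm.random.random(size).value < 0.5
value = 3.0233

# CSR layout: (indices, indptr), as returned by require('pre2post')
post_ids, indptr = conn.require('pre2post')
out_csr = csr_event_sum(events, (post_ids.value, indptr.value), size, value)

# COO layout: flat pre/post index vectors
pre_ids, post_ids = conn.require('pre_ids', 'post_ids')
out_coo = coo_event_sum(events, pre_ids.value, post_ids.value, size, value)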
from functools import partial @@ -26,13 +26,13 @@ x_shape = xla_client.Shape.array_shape x_ops = xla_client.ops -_event_sum_prim = core.Primitive("event_sum") +csr_event_sum_p1 = core.Primitive("csr_event_sum_p1") -def event_sum(events: jnp.ndarray, - pre2post: Tuple[jnp.ndarray, jnp.ndarray], - post_num: int, - values: Union[float, jnp.ndarray]): +def csr_event_sum(events: jnp.ndarray, + pre2post: Tuple[jnp.ndarray, jnp.ndarray], + post_num: int, + values: Union[float, jnp.ndarray]): # events if events.dtype != jnp.bool_: raise ValueError(f'"events" must be a vector of bool, while we got {events.dtype}') @@ -49,14 +49,16 @@ def event_sum(events: jnp.ndarray, raise ValueError(f'The dtype of pre2post must be integer, while we got {indices.dtype}') # output value - dtype = values.dtype if isinstance(values, jnp.ndarray) else dtypes.canonicalize_dtype(type(values)) + if not isinstance(values, jnp.ndarray): + values = jnp.asarray([values]) + dtype = values.dtype if dtype not in [jnp.float32, jnp.float64]: raise ValueError(f'The dtype of "values" must be float32 or float64, while we got {dtype}.') if np.size(values) not in [1, indices.size]: raise ValueError(f'The size of "values" must be 1 (a scalar) or len(pre2post[0]) (a vector), ' f'while we got {np.size(values)} != 1 != {indices.size}') # bind operator - return _event_sum_prim.bind(events, indices, indptr, values, post_num=post_num) + return csr_event_sum_p1.bind(events, indices, indptr, values, post_num=post_num) def _event_sum_abstract(events, indices, indptr, values, *, post_num): @@ -79,14 +81,14 @@ def _event_sum_translation(c, events, indices, indptr, values, *, post_num, plat values_dim = values_shape.dimensions() # We dispatch a different call depending on the dtype - f_type = b'_f32' if Ftype in np.float32 else b'_f64' + f_type = b'_f32' if Ftype == np.float32 else b'_f64' i_type = b'_i32' if Itype in [np.uint32, np.int32] else b'_i64' if platform == "cpu": - v_type = b'_event_sum_homo' if len(values_dim) == 0 else b'_event_sum_heter' + v_type = b'cpu_csr_event_sum_homo' if values_dim[0] == 1 else b'cpu_csr_event_sum_heter' return x_ops.CustomCallWithLayout( c, - platform.encode() + v_type + f_type + i_type, + v_type + f_type + i_type, operands=(x_ops.ConstantLiteral(c, pre_size), x_ops.ConstantLiteral(c, post_num), events, @@ -107,11 +109,11 @@ def _event_sum_translation(c, events, indices, indptr, values, *, post_num, plat if gpu_ops is None: raise GPUOperatorNotFound('event_sum') - v_type = b'_event_sum_homo' if values_dim[0] == 1 else b'_event_sum_heter' - opaque = gpu_ops.build_event_sum_descriptor(pre_size, post_num) + v_type = b'gpu_csr_event_sum_homo' if values_dim[0] == 1 else b'gpu_csr_event_sum_heter' + opaque = gpu_ops.build_csr_event_sum_descriptor(pre_size, post_num) return x_ops.CustomCallWithLayout( c, - platform.encode() + v_type + f_type + i_type, + v_type + f_type + i_type, operands=(events, indices, indptr, @@ -140,26 +142,26 @@ def _event_sum_batch(args, axes, *, post_num): def f(_, x): pars = tuple([(x[f'ax{i}'] if i in batch_axes else non_batch_args[f'ax{i}']) for i in range(len(axes))]) - return 0, _event_sum_prim.bind(*pars, post_num=post_num) + return 0, csr_event_sum_p1.bind(*pars, post_num=post_num) _, outs = scan(f, 0, batch_args) return outs, 0 -_event_sum_prim.def_abstract_eval(_event_sum_abstract) -_event_sum_prim.def_impl(partial(xla.apply_primitive, _event_sum_prim)) -batching.primitive_batchers[_event_sum_prim] = _event_sum_batch -xla.backend_specific_translations["cpu"][_event_sum_prim] = 
partial(_event_sum_translation, platform="cpu") -xla.backend_specific_translations["gpu"][_event_sum_prim] = partial(_event_sum_translation, platform="gpu") +csr_event_sum_p1.def_abstract_eval(_event_sum_abstract) +csr_event_sum_p1.def_impl(partial(xla.apply_primitive, csr_event_sum_p1)) +batching.primitive_batchers[csr_event_sum_p1] = _event_sum_batch +xla.backend_specific_translations["cpu"][csr_event_sum_p1] = partial(_event_sum_translation, platform="cpu") +xla.backend_specific_translations["gpu"][csr_event_sum_p1] = partial(_event_sum_translation, platform="gpu") # --------------------------- # event sum kernel 2 # --------------------------- -_event_sum2_prim = core.Primitive("event_sum2") +coo_event_sum_p1 = core.Primitive("coo_event_sum_p1") -def event_sum2(events, pre_ids, post_ids, post_num, values): +def coo_event_sum(events, pre_ids, post_ids, post_num, values): # events if events.dtype != jnp.bool_: raise ValueError(f'"events" must be a vector of bool, while we got {events.dtype}') @@ -176,7 +178,8 @@ def event_sum2(events, pre_ids, post_ids, post_num, values): f'while we got {pre_ids.dtype}') # output value - values = jnp.asarray([values]) + if not isinstance(values, jnp.ndarray): + values = jnp.asarray([values]) if values.dtype not in [jnp.float32, jnp.float64]: raise ValueError(f'The dtype of "values" must be float32 or float64, while we got {values.dtype}.') if values.size not in [1, pre_ids.size]: @@ -185,17 +188,13 @@ def event_sum2(events, pre_ids, post_ids, post_num, values): values = values.flatten() # bind operator - return _event_sum2_prim.bind(events, pre_ids, post_ids, values, post_num=post_num) + return coo_event_sum_p1.bind(events, pre_ids, post_ids, values, post_num=post_num) def _event_sum2_abstract(events, pre_ids, post_ids, value, *, post_num): return ShapedArray(dtype=value.dtype, shape=(post_num,)) -_event_sum2_prim.def_abstract_eval(_event_sum2_abstract) -_event_sum2_prim.def_impl(partial(xla.apply_primitive, _event_sum2_prim)) - - def _event_sum2_translation(c, events, pre_ids, post_ids, values, *, post_num, platform="cpu"): # The conn/post shape conn_size = np.array(c.get_shape(pre_ids).dimensions()[0], dtype=np.uint32) @@ -219,10 +218,10 @@ def _event_sum2_translation(c, events, pre_ids, post_ids, values, *, post_num, p # And then the following is what changes between the GPU and CPU if platform == "cpu": - v_type = b'_event_sum2_homo' if values_dim[0] == 1 else b'_event_sum2_heter' + v_type = b'cpu_coo_event_sum_homo' if values_dim[0] == 1 else b'cpu_coo_event_sum_heter' return x_ops.CustomCallWithLayout( c, - platform.encode() + v_type + f_type + i_type, + v_type + f_type + i_type, operands=(x_ops.ConstantLiteral(c, conn_size), x_ops.ConstantLiteral(c, post_num), events, @@ -240,11 +239,11 @@ def _event_sum2_translation(c, events, pre_ids, post_ids, values, *, post_num, p elif platform == 'gpu': if gpu_ops is None: raise ValueError('Cannot find compiled gpu wheels.') - v_type = b'_event_sum2_homo' if values_dim[0] == 1 else b'_event_sum2_heter' - opaque = gpu_ops.build_event_sum2_descriptor(conn_size, post_num) + v_type = b'gpu_coo_event_sum_homo' if values_dim[0] == 1 else b'gpu_coo_event_sum_heter' + opaque = gpu_ops.build_csr_event_sum_descriptor(conn_size, post_num) return x_ops.CustomCallWithLayout( c, - platform.encode() + v_type + f_type + i_type, + v_type + f_type + i_type, operands=(events, pre_ids, post_ids, @@ -259,5 +258,7 @@ def _event_sum2_translation(c, events, pre_ids, post_ids, values, *, post_num, p raise ValueError("Unsupported 
platform; this must be either 'cpu' or 'gpu'") -xla.backend_specific_translations["cpu"][_event_sum2_prim] = partial(_event_sum2_translation, platform="cpu") -xla.backend_specific_translations["gpu"][_event_sum2_prim] = partial(_event_sum2_translation, platform="gpu") +coo_event_sum_p1.def_abstract_eval(_event_sum2_abstract) +coo_event_sum_p1.def_impl(partial(xla.apply_primitive, coo_event_sum_p1)) +xla.backend_specific_translations["cpu"][coo_event_sum_p1] = partial(_event_sum2_translation, platform="cpu") +xla.backend_specific_translations["gpu"][coo_event_sum_p1] = partial(_event_sum2_translation, platform="gpu") diff --git a/extensions/brainpylib/tests/test_atomic_prod.py b/extensions/brainpylib/tests/test_atomic_prod_cpu.py similarity index 80% rename from extensions/brainpylib/tests/test_atomic_prod.py rename to extensions/brainpylib/tests/test_atomic_prod_cpu.py index 14c8ecb96..5d83b300e 100644 --- a/extensions/brainpylib/tests/test_atomic_prod.py +++ b/extensions/brainpylib/tests/test_atomic_prod_cpu.py @@ -4,7 +4,7 @@ import unittest import jax.numpy as jnp -from brainpylib import atomic_prod +from brainpylib import coo_atomic_prod import brainpy as bp @@ -19,7 +19,7 @@ def test_heter_values1(self): pre_ids = jnp.arange(size, dtype=jnp.uint32) sps = bp.math.asarray(bp.math.random.randint(0, 2, size), dtype=bp.math.dftype()) - a = atomic_prod(sps.value, post_ids, size, pre_ids) + a = coo_atomic_prod(sps.value, post_ids, size, pre_ids) print(a) self.assertTrue(jnp.array_equal(a, sps.value)) @@ -27,7 +27,7 @@ def test_homo_value1(self): size = 200 value = 2. post_ids = jnp.arange(size, dtype=jnp.uint32) - a = atomic_prod(value, post_ids, size) + a = coo_atomic_prod(value, post_ids, size) print(a) self.assertTrue(jnp.all(a == value)) @@ -37,7 +37,7 @@ def test_homo_fixedpro(self): conn = bp.conn.FixedProb(prob=1, seed=123) conn(pre_size=size, post_size=size) post_ids = conn.require('post_ids') - a = atomic_prod(value, post_ids.value, size) + a = coo_atomic_prod(value, post_ids.value, size) print(a) def test_heter_fixedpro(self): @@ -46,5 +46,5 @@ def test_heter_fixedpro(self): conn = bp.conn.FixedProb(prob=1, seed=123) conn(pre_size=size, post_size=size) pre_ids, post_ids = conn.require('pre_ids', 'post_ids') - a = atomic_prod(value, post_ids.value, size, pre_ids.value) + a = coo_atomic_prod(value, post_ids.value, size, pre_ids.value) print(a) diff --git a/extensions/brainpylib/tests/test_atomic_prod_gpu.py b/extensions/brainpylib/tests/test_atomic_prod_gpu.py new file mode 100644 index 000000000..4e7296e70 --- /dev/null +++ b/extensions/brainpylib/tests/test_atomic_prod_gpu.py @@ -0,0 +1,50 @@ +# -*- coding: utf-8 -*- + + +import unittest + +import jax.numpy as jnp +from brainpylib import coo_atomic_prod + +import brainpy as bp + +bp.math.set_platform('gpu') + + +class TestAtomicProd(unittest.TestCase): + def test_heter_values1(self): + bp.math.random.seed(12345) + size = 200 + post_ids = jnp.arange(size, dtype=jnp.uint32) + pre_ids = jnp.arange(size, dtype=jnp.uint32) + sps = bp.math.asarray(bp.math.random.randint(0, 2, size), + dtype=bp.math.dftype()) + a = coo_atomic_prod(sps.value, post_ids, size, pre_ids) + print(a) + self.assertTrue(jnp.allclose(a, sps.value)) + + def test_homo_value1(self): + size = 200 + value = 2. + post_ids = jnp.arange(size, dtype=jnp.uint32) + a = coo_atomic_prod(value, post_ids, size) + print(a) + self.assertTrue(jnp.all(a == value)) + + def test_homo_fixedpro(self): + size = 10 + value = 2. 
+ conn = bp.conn.FixedProb(prob=1, seed=123) + conn(pre_size=size, post_size=size) + post_ids = conn.require('post_ids') + a = coo_atomic_prod(value, post_ids.value, size) + print(a) + + def test_heter_fixedpro(self): + size = 10 + value = jnp.ones(size) * 2. + conn = bp.conn.FixedProb(prob=1, seed=123) + conn(pre_size=size, post_size=size) + pre_ids, post_ids = conn.require('pre_ids', 'post_ids') + a = coo_atomic_prod(value, post_ids.value, size, pre_ids.value) + print(a) diff --git a/extensions/brainpylib/tests/test_atomic_sum.py b/extensions/brainpylib/tests/test_atomic_sum_cpu.py similarity index 77% rename from extensions/brainpylib/tests/test_atomic_sum.py rename to extensions/brainpylib/tests/test_atomic_sum_cpu.py index 761492ce0..9e1b7db11 100644 --- a/extensions/brainpylib/tests/test_atomic_sum.py +++ b/extensions/brainpylib/tests/test_atomic_sum_cpu.py @@ -4,7 +4,7 @@ import unittest import jax.numpy as jnp -from brainpylib import atomic_sum +from brainpylib import coo_atomic_sum import brainpy as bp @@ -19,15 +19,15 @@ def test_heter_values1(self): pre_ids = jnp.arange(size, dtype=jnp.uint32) sps = bp.math.asarray(bp.math.random.randint(0, 2, size), dtype=bp.math.dftype()) - a = atomic_sum(sps.value, post_ids, size, pre_ids) + a = coo_atomic_sum(sps.value, post_ids, size, pre_ids) print(a) - self.assertTrue(jnp.array_equal(a, sps.value)) + self.assertTrue(jnp.allclose(a, sps.value)) def test_homo_value1(self): size = 200 value = 2. post_ids = jnp.arange(size, dtype=jnp.uint32) - a = atomic_sum(value, post_ids, size) + a = coo_atomic_sum(value, post_ids, size) print(a) self.assertTrue(jnp.all(a == value)) @@ -37,7 +37,7 @@ def test_homo_fixedpro(self): conn = bp.conn.FixedProb(prob=1, seed=123) conn(pre_size=size, post_size=size) post_ids = conn.require('post_ids') - a = atomic_sum(value, post_ids.value, size) + a = coo_atomic_sum(value, post_ids.value, size) print(a) def test_heter_fixedpro(self): @@ -46,5 +46,5 @@ def test_heter_fixedpro(self): conn = bp.conn.FixedProb(prob=1, seed=123) conn(pre_size=size, post_size=size) pre_ids, post_ids = conn.require('pre_ids', 'post_ids') - a = atomic_sum(value, post_ids.value, size, pre_ids.value) + a = coo_atomic_sum(value, post_ids.value, size, pre_ids.value) print(a) diff --git a/extensions/brainpylib/tests/test_atomic_sum_gpu.py b/extensions/brainpylib/tests/test_atomic_sum_gpu.py new file mode 100644 index 000000000..9e1b7db11 --- /dev/null +++ b/extensions/brainpylib/tests/test_atomic_sum_gpu.py @@ -0,0 +1,50 @@ +# -*- coding: utf-8 -*- + + +import unittest + +import jax.numpy as jnp +from brainpylib import coo_atomic_sum + +import brainpy as bp + +bp.math.set_platform('cpu') + + +class TestAtomicSum(unittest.TestCase): + def test_heter_values1(self): + bp.math.random.seed(12345) + size = 200 + post_ids = jnp.arange(size, dtype=jnp.uint32) + pre_ids = jnp.arange(size, dtype=jnp.uint32) + sps = bp.math.asarray(bp.math.random.randint(0, 2, size), + dtype=bp.math.dftype()) + a = coo_atomic_sum(sps.value, post_ids, size, pre_ids) + print(a) + self.assertTrue(jnp.allclose(a, sps.value)) + + def test_homo_value1(self): + size = 200 + value = 2. + post_ids = jnp.arange(size, dtype=jnp.uint32) + a = coo_atomic_sum(value, post_ids, size) + print(a) + self.assertTrue(jnp.all(a == value)) + + def test_homo_fixedpro(self): + size = 10 + value = 2. 
+ conn = bp.conn.FixedProb(prob=1, seed=123) + conn(pre_size=size, post_size=size) + post_ids = conn.require('post_ids') + a = coo_atomic_sum(value, post_ids.value, size) + print(a) + + def test_heter_fixedpro(self): + size = 10 + value = jnp.ones(size) * 2. + conn = bp.conn.FixedProb(prob=1, seed=123) + conn(pre_size=size, post_size=size) + pre_ids, post_ids = conn.require('pre_ids', 'post_ids') + a = coo_atomic_sum(value, post_ids.value, size, pre_ids.value) + print(a) diff --git a/extensions/brainpylib/tests/test_event_sum2.py b/extensions/brainpylib/tests/test_coo_event_sum_gpu.py similarity index 90% rename from extensions/brainpylib/tests/test_event_sum2.py rename to extensions/brainpylib/tests/test_coo_event_sum_gpu.py index a0d81d1eb..716ddd27f 100644 --- a/extensions/brainpylib/tests/test_event_sum2.py +++ b/extensions/brainpylib/tests/test_coo_event_sum_gpu.py @@ -7,7 +7,7 @@ import numpy as np import pytest import unittest -from brainpylib import event_sum +from brainpylib import coo_event_sum import brainpy as bp import brainpy.math as bm @@ -22,11 +22,11 @@ def test_homo_values(self): conn = bp.conn.FixedProb(prob=0.5, seed=123) # conn = bp.conn.All2All() conn(pre_size=size, post_size=size) - post_ids, indptr = conn.require('pre2post') + pre_ids, post_ids = conn.require('pre_ids', 'post_ids') sps = bm.random.random(size).value < 0.5 # print(sps) value = 3.0233 - a = event_sum(sps, (post_ids.value, indptr.value), size, value) + a = coo_event_sum(sps, pre_ids.value, post_ids.value, size, value) print(a) def test_heter_value(self): @@ -35,12 +35,12 @@ def test_heter_value(self): conn = bp.conn.FixedProb(prob=0.5, seed=3) # conn = bp.conn.One2One() conn(pre_size=size, post_size=size) - post_ids, indptr = conn.require('pre2post') + pre_ids, post_ids = conn.require('pre_ids', 'post_ids') # sps = bm.random.randint(0, 2, size).value < 1 sps = bm.random.random(size).value < 0.5 values = bm.random.rand(post_ids.size) # values = bm.ones(post_ids.size) - a = event_sum(sps, (post_ids.value, indptr.value), size, values.value) + a = coo_event_sum(sps, pre_ids.value, post_ids.value , size, values.value) print(a) # diff --git a/extensions/brainpylib/tests/test_event_prod.py b/extensions/brainpylib/tests/test_csr_event_prod_cpu.py similarity index 80% rename from extensions/brainpylib/tests/test_event_prod.py rename to extensions/brainpylib/tests/test_csr_event_prod_cpu.py index 0e886f114..4caeef2ca 100644 --- a/extensions/brainpylib/tests/test_event_prod.py +++ b/extensions/brainpylib/tests/test_csr_event_prod_cpu.py @@ -2,12 +2,12 @@ import unittest -from brainpylib import event_prod +from brainpylib import csr_event_prod import brainpy as bp import brainpy.math as bm -# bm.set_platform('gpu') +bm.set_platform('cpu') class TestEventProd(unittest.TestCase): @@ -21,7 +21,7 @@ def test_homo_values(self): sps = bm.random.random(size).value < 0.5 # print(sps) value = 1.0233 - a = event_prod(sps, (post_ids.value, indptr.value), size, value) + a = csr_event_prod(sps, (post_ids.value, indptr.value), size, value) print(a) def test_heter_value(self): @@ -35,6 +35,6 @@ def test_heter_value(self): sps = bm.random.random(size).value < 0.5 values = bm.random.rand(post_ids.size) # values = bm.ones(post_ids.size) - a = event_prod(sps, (post_ids.value, indptr.value), size, values.value) + a = csr_event_prod(sps, (post_ids.value, indptr.value), size, values.value) print(a) diff --git a/extensions/brainpylib/tests/test_event_sum.py b/extensions/brainpylib/tests/test_csr_event_sum_cpu.py similarity index 82% 
rename from extensions/brainpylib/tests/test_event_sum.py rename to extensions/brainpylib/tests/test_csr_event_sum_cpu.py index af6aabfdb..c6e120718 100644 --- a/extensions/brainpylib/tests/test_event_sum.py +++ b/extensions/brainpylib/tests/test_csr_event_sum_cpu.py @@ -8,12 +8,12 @@ import pytest import unittest from jax import vmap -from brainpylib import event_sum +from brainpylib import csr_event_sum import brainpy as bp import brainpy.math as bm -# bm.set_platform('gpu') +bm.set_platform('cpu') class TestEventSum(unittest.TestCase): @@ -27,7 +27,7 @@ def test_homo_values(self): sps = bm.random.random(size).value < 0.5 # print(sps) value = 3.0233 - a = event_sum(sps, (post_ids.value, indptr.value), size, value) + a = csr_event_sum(sps, (post_ids.value, indptr.value), size, value) print(a) def test_homo_values_batching(self): @@ -39,16 +39,16 @@ def test_homo_values_batching(self): post_ids, indptr = conn.require('pre2post') sps = bm.random.random((10, size)).value < 0.5 value = 3.0233 - f = vmap(bm.pre2post_event_sum, in_axes=(0, None, None, None)) + f = vmap(csr_event_sum, in_axes=(0, None, None, None)) a1 = f(sps, (post_ids.value, indptr.value), size, value) print(a1) - f = vmap(lambda events: bm.pre2post_event_sum(events, (post_ids.value, indptr.value), size, value)) + f = vmap(lambda events: csr_event_sum(events, (post_ids.value, indptr.value), size, value)) a2 = f(sps) print(a2) - self.assertTrue(jnp.array_equal(a1, a2)) + self.assertTrue(jnp.allclose(a1, a2)) def test_heter_value(self): bp.math.random.seed(3) @@ -61,7 +61,7 @@ def test_heter_value(self): sps = bm.random.random(size).value < 0.5 values = bm.random.rand(post_ids.size) # values = bm.ones(post_ids.size) - a = event_sum(sps, (post_ids.value, indptr.value), size, values.value) + a = csr_event_sum(sps, (post_ids.value, indptr.value), size, values.value) print(a) def test_heter_values_batching(self): @@ -72,14 +72,16 @@ def test_heter_values_batching(self): conn(pre_size=size, post_size=size) post_ids, indptr = conn.require('pre2post') sps = bm.random.random((10, size)).value < 0.5 - values = bm.random.rand(post_ids.size) - f = vmap(bm.pre2post_event_sum, in_axes=(0, None, None, None)) + values = bm.random.rand(post_ids.size).value + f = vmap(csr_event_sum, in_axes=(0, None, None, None)) a1 = f(sps, (post_ids.value, indptr.value), size, values) - f = vmap(lambda events: bm.pre2post_event_sum(events, (post_ids.value, indptr.value), size, values)) + f = vmap(lambda events: csr_event_sum(events, (post_ids.value, indptr.value), size, values)) a2 = f(sps) - self.assertTrue(jnp.array_equal(a1, a2)) + print(a1, a2) + + self.assertTrue(jnp.allclose(a1, a2)) # def test1(): diff --git a/extensions/brainpylib/tests/test_csr_event_sum_gpu.py b/extensions/brainpylib/tests/test_csr_event_sum_gpu.py new file mode 100644 index 000000000..44a882476 --- /dev/null +++ b/extensions/brainpylib/tests/test_csr_event_sum_gpu.py @@ -0,0 +1,148 @@ +# -*- coding: utf-8 -*- + +import timeit +import time +import jax +import jax.numpy as jnp +import numpy as np +import pytest +import unittest +from jax import vmap +from brainpylib import csr_event_sum +import brainpy as bp +import brainpy.math as bm + + +bm.set_platform('gpu') + + +class TestEventSum(unittest.TestCase): + def test_homo_values(self): + bp.math.random.seed(1345) + size = 200 + conn = bp.conn.FixedProb(prob=0.5, seed=123) + # conn = bp.conn.All2All() + conn(pre_size=size, post_size=size) + post_ids, indptr = conn.require('pre2post') + sps = bm.random.random(size).value < 0.5 + # 
print(sps) + value = 3.0233 + a = csr_event_sum(sps, (post_ids.value, indptr.value), size, value) + print(a) + + def test_homo_values_batching(self): + bp.math.random.seed(1345) + size = 200 + conn = bp.conn.FixedProb(prob=0.5, seed=123) + + conn(pre_size=size, post_size=size) + post_ids, indptr = conn.require('pre2post') + sps = bm.random.random((10, size)).value < 0.5 + value = 3.0233 + f = vmap(csr_event_sum, in_axes=(0, None, None, None)) + a1 = f(sps, (post_ids.value, indptr.value), size, value) + + print(a1) + + f = vmap(lambda events: csr_event_sum(events, (post_ids.value, indptr.value), size, value)) + a2 = f(sps) + + print(a2) + self.assertTrue(jnp.allclose(a1, a2)) + + def test_heter_value(self): + bp.math.random.seed(3) + size = 200 + conn = bp.conn.FixedProb(prob=0.5, seed=3) + # conn = bp.conn.One2One() + conn(pre_size=size, post_size=size) + post_ids, indptr = conn.require('pre2post') + # sps = bm.random.randint(0, 2, size).value < 1 + sps = bm.random.random(size).value < 0.5 + values = bm.random.rand(post_ids.size) + # values = bm.ones(post_ids.size) + a = csr_event_sum(sps, (post_ids.value, indptr.value), size, values.value) + print(a) + + def test_heter_values_batching(self): + bp.math.random.seed(1345) + size = 200 + conn = bp.conn.FixedProb(prob=0.5, seed=123) + + conn(pre_size=size, post_size=size) + post_ids, indptr = conn.require('pre2post') + sps = bm.random.random((10, size)).value < 0.5 + values = bm.random.rand(post_ids.size).value + f = vmap(csr_event_sum, in_axes=(0, None, None, None)) + a1 = f(sps, (post_ids.value, indptr.value), size, values) + + f = vmap(lambda events: csr_event_sum(events, (post_ids.value, indptr.value), size, values)) + a2 = f(sps) + + print(a1, a2) + + self.assertTrue(jnp.allclose(a1, a2)) + + +# def test1(): +# bm.random.seed(123) +# size = 3000 +# conn = bp.conn.FixedProb(prob=0.5, seed=123) +# conn(pre_size=size, post_size=size) +# # pre2post = conn.require('pre2post') +# pre_ids, post_ids = conn.require('pre_ids', 'post_ids') +# print("pre_ids size:", pre_ids.size) +# # indices = jnp.arange(size, dtype=jnp.uint32) +# # idnptr = jnp.arange(size + 1, dtype=jnp.uint32) +# sps = bm.random.randint(0, 2, size).value < 1 +# value = 2. 
+# +# +# # f = jax.jit(event_sum) +# +# +# # @partial(jax.jit, static_argnums=2) +# # def f(sps, pre2post, size, value): +# # return event_sum(sps, pre2post, size, value) +# +# +# @jax.jit +# def ours(events): +# out = jnp.zeros(size) +# out = _event_sum_prim.bind(events, post_ids.value, pre_ids.value, jnp.zeros(1), out) +# # print(type(out), out) +# # print(type(value), value) +# return out +# +# +# # @jax.jity +# # def yours(events): +# # out = jnp.zeros(size) +# # out = out.at[post_ids.value].add(events[pre_ids.value]) +# # return out +# +# +# a = ours(sps) +# b = np.asarray(a) +# # print(b) +# print(b.size) +# +# # a = yours(sps) +# # b = np.asarray(a) +# # # print(b) +# # print(b.size) +# +# sps = bm.random.randint(0, 2, size).value < 1 +# +# t0 = time.time() +# ours(sps) +# print(time.time() - t0) +# +# +# # t0 = time.time() +# # yours(sps) +# # print(time.time() - t0) +# +# # %timeit f(sps, (indices, indptr), size, value) +# # print(timeit.timeit('ours(sps, value)', globals=globals())) +# # print(timeit.timeit('yours(sps, value)', globals=globals())) diff --git a/extensions/lib/atomic_prod_cpu.h b/extensions/lib/atomic_prod_cpu.h deleted file mode 100644 index fbd392f31..000000000 --- a/extensions/lib/atomic_prod_cpu.h +++ /dev/null @@ -1,20 +0,0 @@ -#ifndef _BRAINPYLIB_ATOMIC_prod_H_ -#define _BRAINPYLIB_ATOMIC_prod_H_ - -#include -#include -#include - -namespace brainpy_lib { - void cpu_atomic_prod_heter_f32_i32(void *out, const void **in); - void cpu_atomic_prod_heter_f32_i64(void *out, const void **in); - void cpu_atomic_prod_heter_f64_i32(void *out, const void **in); - void cpu_atomic_prod_heter_f64_i64(void *out, const void **in); - - void cpu_atomic_prod_homo_f32_i32(void *out, const void **in); - void cpu_atomic_prod_homo_f32_i64(void *out, const void **in); - void cpu_atomic_prod_homo_f64_i32(void *out, const void **in); - void cpu_atomic_prod_homo_f64_i64(void *out, const void **in); -} - -#endif \ No newline at end of file diff --git a/extensions/lib/atomic_prod_gpu.cu b/extensions/lib/atomic_prod_gpu.cu deleted file mode 100644 index 97e01c661..000000000 --- a/extensions/lib/atomic_prod_gpu.cu +++ /dev/null @@ -1,153 +0,0 @@ -// This file contains the GPU implementation of our op. It's a pretty typical CUDA kernel -// and I make no promises about the quality of the code or the choices made therein, but -// it should get the point across. 
- -#include "atomic_prod_gpu.h" - -namespace brainpy_lib { - - namespace { - -// "atomic_prod" operator // - template - __global__ void gpu_atomic_prod_homo_kernel(const std::uint32_t size, - const F &value, - const I *post_ids, - F *result) { - for (std::uint32_t i = blockIdx.x * blockDim.x + threadIdx.x; - i < size; i += blockDim.x * gridDim.x) { - atomicAdd(&result[post_ids[i]], value); - } - } - - template - inline void gpu_atomic_prod_homo(cudaStream_t stream, - void **buffers, - const char *opaque, - std::size_t opaque_len) { - // size - const AtomicProdDescriptor &d = *UnpackDescriptor(opaque, opaque_len); - const std::uint32_t conn_size = d.conn_size; - const std::uint32_t post_size = d.post_size; - - // input and output data - const F *values = reinterpret_cast(buffers[0]); // scalar as a vector - const I *post_ids = reinterpret_cast(buffers[1]); - F *result = reinterpret_cast(buffers[2]); - - // call kernel - const int block_dim = 512; - const int grid_dim = std::min(1024, (conn_size + block_dim - 1) / block_dim); - cudaMemset(result, 1, sizeof(F) * post_size); - gpu_atomic_prod_homo_kernel<<>>(conn_size, values[0], post_ids, - result); - ThrowIfError(cudaGetLastError()); - } - - template - __global__ void gpu_atomic_prod_heter_kernel(const std::uint32_t size, - const F *values, - const I *post_ids, - const I *pre_ids, - F *result) { - for (std::uint32_t i = blockIdx.x * blockDim.x + threadIdx.x; - i < size; i += blockDim.x * gridDim.x) { - atomicAdd(&result[post_ids[i]], values[pre_ids[i]]); - } - } - - template - inline void gpu_atomic_prod_heter(cudaStream_t stream, - void **buffers, - const char *opaque, - std::size_t opaque_len) { - // size - const AtomicProdDescriptor &d = *UnpackDescriptor(opaque, opaque_len); - const std::uint32_t conn_size = d.conn_size; - const std::uint32_t post_size = d.post_size; - - // input and output data - const F *values = reinterpret_cast(buffers[0]); // scalar as a vector - const I *post_ids = reinterpret_cast(buffers[1]); - const I *pre_ids = reinterpret_cast(buffers[2]); - F *result = reinterpret_cast(buffers[3]); - - // call kernel - const int block_dim = 512; - const int grid_dim = std::min(1024, (conn_size + block_dim - 1) / block_dim); - cudaMemset(result, 1, sizeof(F) * post_size); - gpu_atomic_prod_heter_kernel<<>>(conn_size, values, post_ids, pre_ids, - result); - ThrowIfError(cudaGetLastError()); - } - - - } // namespace - - -// Descriptor - pybind11::bytes build_atomic_prod_descriptor(std::uint32_t conn_size, - std::uint32_t post_size) { - return PackDescriptor(AtomicProdDescriptor{conn_size, post_size}); - } - -// homogenous atomic sum - void gpu_atomic_prod_homo_f32_i32(cudaStream_t stream, - void **buffers, - const char *opaque, - std::size_t opaque_len) { - gpu_atomic_prod_homo(stream, buffers, opaque, opaque_len); - } - - void gpu_atomic_prod_homo_f32_i64(cudaStream_t stream, - void **buffers, - const char *opaque, - std::size_t opaque_len) { - gpu_atomic_prod_homo(stream, buffers, opaque, opaque_len); - } - - void gpu_atomic_prod_homo_f64_i32(cudaStream_t stream, - void **buffers, - const char *opaque, - std::size_t opaque_len) { - gpu_atomic_prod_homo(stream, buffers, opaque, opaque_len); - } - - void gpu_atomic_prod_homo_f64_i64(cudaStream_t stream, - void **buffers, - const char *opaque, - std::size_t opaque_len) { - gpu_atomic_prod_homo(stream, buffers, opaque, opaque_len); - } - -// heterogeneous atomic sum - void gpu_atomic_prod_heter_f32_i32(cudaStream_t stream, - void **buffers, - const char *opaque, - std::size_t 
opaque_len) { - gpu_atomic_prod_heter(stream, buffers, opaque, opaque_len); - } - - void gpu_atomic_prod_heter_f32_i64(cudaStream_t stream, - void **buffers, - const char *opaque, - std::size_t opaque_len) { - gpu_atomic_prod_heter(stream, buffers, opaque, opaque_len); - } - - void gpu_atomic_prod_heter_f64_i32(cudaStream_t stream, - void **buffers, - const char *opaque, - std::size_t opaque_len) { - gpu_atomic_prod_heter(stream, buffers, opaque, opaque_len); - } - - void gpu_atomic_prod_heter_f64_i64(cudaStream_t stream, - void **buffers, - const char *opaque, - std::size_t opaque_len) { - gpu_atomic_prod_heter(stream, buffers, opaque, opaque_len); - } - - -} // namespace brainpylib diff --git a/extensions/lib/atomic_prod_gpu.h b/extensions/lib/atomic_prod_gpu.h deleted file mode 100644 index 1a329f8ca..000000000 --- a/extensions/lib/atomic_prod_gpu.h +++ /dev/null @@ -1,31 +0,0 @@ -#ifndef _BRAINPY_ATOMIC_prod_KERNELS_H_ -#define _BRAINPY_ATOMIC_prod_KERNELS_H_ - -#include -#include -#include "pybind11_kernel_helpers.h" -#include "kernel_helpers_gpu.h" - -namespace brainpy_lib { - struct AtomicProdDescriptor { - std::uint32_t conn_size; - std::uint32_t post_size; - }; - - // homogeneous - void gpu_atomic_prod_homo_f32_i32(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len); - void gpu_atomic_prod_homo_f32_i64(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len); - void gpu_atomic_prod_homo_f64_i32(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len); - void gpu_atomic_prod_homo_f64_i64(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len); - // heterogeneous - void gpu_atomic_prod_heter_f32_i32(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len); - void gpu_atomic_prod_heter_f32_i64(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len); - void gpu_atomic_prod_heter_f64_i32(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len); - void gpu_atomic_prod_heter_f64_i64(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len); - - // descriptors - pybind11::bytes build_atomic_prod_descriptor(std::uint32_t conn_size, std::uint32_t post_size); - -} // namespace brainpy_lib - -#endif \ No newline at end of file diff --git a/extensions/lib/atomic_sum_cpu.h b/extensions/lib/atomic_sum_cpu.h deleted file mode 100644 index fb4b8b7a6..000000000 --- a/extensions/lib/atomic_sum_cpu.h +++ /dev/null @@ -1,20 +0,0 @@ -#ifndef _BRAINPYLIB_ATOMIC_SUM_H_ -#define _BRAINPYLIB_ATOMIC_SUM_H_ - -#include -#include -#include - -namespace brainpy_lib { - void cpu_atomic_sum_heter_f32_i32(void *out, const void **in); - void cpu_atomic_sum_heter_f32_i64(void *out, const void **in); - void cpu_atomic_sum_heter_f64_i32(void *out, const void **in); - void cpu_atomic_sum_heter_f64_i64(void *out, const void **in); - - void cpu_atomic_sum_homo_f32_i32(void *out, const void **in); - void cpu_atomic_sum_homo_f32_i64(void *out, const void **in); - void cpu_atomic_sum_homo_f64_i32(void *out, const void **in); - void cpu_atomic_sum_homo_f64_i64(void *out, const void **in); -} - -#endif \ No newline at end of file diff --git a/extensions/lib/atomic_sum_gpu.h b/extensions/lib/atomic_sum_gpu.h deleted file mode 100644 index 401a1c7d8..000000000 --- a/extensions/lib/atomic_sum_gpu.h +++ /dev/null @@ -1,31 +0,0 @@ -#ifndef _BRAINPY_ATOMIC_SUM_KERNELS_H_ -#define _BRAINPY_ATOMIC_SUM_KERNELS_H_ - -#include -#include 
-#include "pybind11_kernel_helpers.h" -#include "kernel_helpers_gpu.h" - -namespace brainpy_lib { - struct AtomicSumDescriptor { - std::uint32_t conn_size; - std::uint32_t post_size; - }; - - // homogeneous - void gpu_atomic_sum_homo_f32_i32(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len); - void gpu_atomic_sum_homo_f32_i64(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len); - void gpu_atomic_sum_homo_f64_i32(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len); - void gpu_atomic_sum_homo_f64_i64(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len); - // heterogeneous - void gpu_atomic_sum_heter_f32_i32(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len); - void gpu_atomic_sum_heter_f32_i64(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len); - void gpu_atomic_sum_heter_f64_i32(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len); - void gpu_atomic_sum_heter_f64_i64(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len); - - // descriptors - pybind11::bytes build_atomic_sum_descriptor(std::uint32_t conn_size, std::uint32_t post_size); - -} // namespace brainpy_lib - -#endif \ No newline at end of file diff --git a/extensions/lib/atomic_prod_cpu.cc b/extensions/lib/cpu_atomic_prod.cc similarity index 54% rename from extensions/lib/atomic_prod_cpu.cc rename to extensions/lib/cpu_atomic_prod.cc index dbf1dc1a3..84a1ce5b3 100644 --- a/extensions/lib/atomic_prod_cpu.cc +++ b/extensions/lib/cpu_atomic_prod.cc @@ -1,9 +1,9 @@ -#include "atomic_prod_cpu.h" +#include "cpu_atomic_prod.h" namespace brainpy_lib { namespace{ template - void cpu_atomic_prod_heter(void *out, const void **in) { + void cpu_coo_atomic_prod_heter(void *out, const void **in) { // The inputs const F *values = reinterpret_cast(in[0]); const I *pre_ids = reinterpret_cast(in[1]); @@ -23,7 +23,7 @@ namespace{ } template - void cpu_atomic_prod_homo(void *out, const void **in) { + void cpu_coo_atomic_prod_homo(void *out, const void **in) { // The inputs const F *values = reinterpret_cast(in[0]); // scalar as a vector const F value = values[0]; @@ -43,14 +43,22 @@ namespace{ } } -void cpu_atomic_prod_heter_f32_i32(void *out, const void **in){cpu_atomic_prod_heter(out, in);} -void cpu_atomic_prod_heter_f32_i64(void *out, const void **in){cpu_atomic_prod_heter(out, in);} -void cpu_atomic_prod_heter_f64_i32(void *out, const void **in){cpu_atomic_prod_heter(out, in);} -void cpu_atomic_prod_heter_f64_i64(void *out, const void **in){cpu_atomic_prod_heter(out, in);} +void cpu_coo_atomic_prod_heter_f32_i32(void *out, const void **in){ + cpu_coo_atomic_prod_heter(out, in);} +void cpu_coo_atomic_prod_heter_f32_i64(void *out, const void **in){ + cpu_coo_atomic_prod_heter(out, in);} +void cpu_coo_atomic_prod_heter_f64_i32(void *out, const void **in){ + cpu_coo_atomic_prod_heter(out, in);} +void cpu_coo_atomic_prod_heter_f64_i64(void *out, const void **in){ + cpu_coo_atomic_prod_heter(out, in);} -void cpu_atomic_prod_homo_f32_i32(void *out, const void **in){cpu_atomic_prod_homo(out, in);} -void cpu_atomic_prod_homo_f32_i64(void *out, const void **in){cpu_atomic_prod_homo(out, in);} -void cpu_atomic_prod_homo_f64_i32(void *out, const void **in){cpu_atomic_prod_homo(out, in);} -void cpu_atomic_prod_homo_f64_i64(void *out, const void **in){cpu_atomic_prod_homo(out, in);} +void cpu_coo_atomic_prod_homo_f32_i32(void *out, 
const void **in){ + cpu_coo_atomic_prod_homo(out, in);} +void cpu_coo_atomic_prod_homo_f32_i64(void *out, const void **in){ + cpu_coo_atomic_prod_homo(out, in);} +void cpu_coo_atomic_prod_homo_f64_i32(void *out, const void **in){ + cpu_coo_atomic_prod_homo(out, in);} +void cpu_coo_atomic_prod_homo_f64_i64(void *out, const void **in){ + cpu_coo_atomic_prod_homo(out, in);} } diff --git a/extensions/lib/cpu_atomic_prod.h b/extensions/lib/cpu_atomic_prod.h new file mode 100644 index 000000000..dd5d54b68 --- /dev/null +++ b/extensions/lib/cpu_atomic_prod.h @@ -0,0 +1,20 @@ +#ifndef _BRAINPYLIB_ATOMIC_prod_H_ +#define _BRAINPYLIB_ATOMIC_prod_H_ + +#include +#include +#include + +namespace brainpy_lib { + void cpu_coo_atomic_prod_heter_f32_i32(void *out, const void **in); + void cpu_coo_atomic_prod_heter_f32_i64(void *out, const void **in); + void cpu_coo_atomic_prod_heter_f64_i32(void *out, const void **in); + void cpu_coo_atomic_prod_heter_f64_i64(void *out, const void **in); + + void cpu_coo_atomic_prod_homo_f32_i32(void *out, const void **in); + void cpu_coo_atomic_prod_homo_f32_i64(void *out, const void **in); + void cpu_coo_atomic_prod_homo_f64_i32(void *out, const void **in); + void cpu_coo_atomic_prod_homo_f64_i64(void *out, const void **in); +} + +#endif \ No newline at end of file diff --git a/extensions/lib/atomic_sum_cpu.cc b/extensions/lib/cpu_atomic_sum.cc similarity index 54% rename from extensions/lib/atomic_sum_cpu.cc rename to extensions/lib/cpu_atomic_sum.cc index 7f255b1dc..e7a9f719e 100644 --- a/extensions/lib/atomic_sum_cpu.cc +++ b/extensions/lib/cpu_atomic_sum.cc @@ -1,9 +1,9 @@ -#include "atomic_sum_cpu.h" +#include "cpu_atomic_sum.h" namespace brainpy_lib { namespace{ template - void cpu_atomic_sum_heter(void *out, const void **in) { + void cpu_coo_atomic_sum_heter(void *out, const void **in) { // The inputs const F *values = reinterpret_cast(in[0]); const I *pre_ids = reinterpret_cast(in[1]); @@ -22,7 +22,7 @@ namespace{ } template - void cpu_atomic_sum_homo(void *out, const void **in) { + void cpu_coo_atomic_sum_homo(void *out, const void **in) { // The inputs const F *values = reinterpret_cast(in[0]); // scalar as a vector const F value = values[0]; @@ -41,14 +41,14 @@ namespace{ } } -void cpu_atomic_sum_heter_f32_i32(void *out, const void **in){cpu_atomic_sum_heter(out, in);} -void cpu_atomic_sum_heter_f32_i64(void *out, const void **in){cpu_atomic_sum_heter(out, in);} -void cpu_atomic_sum_heter_f64_i32(void *out, const void **in){cpu_atomic_sum_heter(out, in);} -void cpu_atomic_sum_heter_f64_i64(void *out, const void **in){cpu_atomic_sum_heter(out, in);} +void cpu_coo_atomic_sum_heter_f32_i32(void *out, const void **in){cpu_coo_atomic_sum_heter(out, in);} +void cpu_coo_atomic_sum_heter_f32_i64(void *out, const void **in){cpu_coo_atomic_sum_heter(out, in);} +void cpu_coo_atomic_sum_heter_f64_i32(void *out, const void **in){cpu_coo_atomic_sum_heter(out, in);} +void cpu_coo_atomic_sum_heter_f64_i64(void *out, const void **in){cpu_coo_atomic_sum_heter(out, in);} -void cpu_atomic_sum_homo_f32_i32(void *out, const void **in){cpu_atomic_sum_homo(out, in);} -void cpu_atomic_sum_homo_f32_i64(void *out, const void **in){cpu_atomic_sum_homo(out, in);} -void cpu_atomic_sum_homo_f64_i32(void *out, const void **in){cpu_atomic_sum_homo(out, in);} -void cpu_atomic_sum_homo_f64_i64(void *out, const void **in){cpu_atomic_sum_homo(out, in);} +void cpu_coo_atomic_sum_homo_f32_i32(void *out, const void **in){cpu_coo_atomic_sum_homo(out, in);} +void cpu_coo_atomic_sum_homo_f32_i64(void 
*out, const void **in){cpu_coo_atomic_sum_homo(out, in);} +void cpu_coo_atomic_sum_homo_f64_i32(void *out, const void **in){cpu_coo_atomic_sum_homo(out, in);} +void cpu_coo_atomic_sum_homo_f64_i64(void *out, const void **in){cpu_coo_atomic_sum_homo(out, in);} } diff --git a/extensions/lib/cpu_atomic_sum.h b/extensions/lib/cpu_atomic_sum.h new file mode 100644 index 000000000..2a9eb9714 --- /dev/null +++ b/extensions/lib/cpu_atomic_sum.h @@ -0,0 +1,20 @@ +#ifndef _BRAINPYLIB_ATOMIC_SUM_H_ +#define _BRAINPYLIB_ATOMIC_SUM_H_ + +#include +#include +#include + +namespace brainpy_lib { + void cpu_coo_atomic_sum_heter_f32_i32(void *out, const void **in); + void cpu_coo_atomic_sum_heter_f32_i64(void *out, const void **in); + void cpu_coo_atomic_sum_heter_f64_i32(void *out, const void **in); + void cpu_coo_atomic_sum_heter_f64_i64(void *out, const void **in); + + void cpu_coo_atomic_sum_homo_f32_i32(void *out, const void **in); + void cpu_coo_atomic_sum_homo_f32_i64(void *out, const void **in); + void cpu_coo_atomic_sum_homo_f64_i32(void *out, const void **in); + void cpu_coo_atomic_sum_homo_f64_i64(void *out, const void **in); +} + +#endif \ No newline at end of file diff --git a/extensions/lib/event_prod_cpu.cc b/extensions/lib/cpu_event_prod.cc similarity index 61% rename from extensions/lib/event_prod_cpu.cc rename to extensions/lib/cpu_event_prod.cc index bbdb9f0ca..e72a4ce16 100644 --- a/extensions/lib/event_prod_cpu.cc +++ b/extensions/lib/cpu_event_prod.cc @@ -1,9 +1,9 @@ -#include "event_prod_cpu.h" +#include "cpu_event_prod.h" namespace brainpy_lib { namespace{ template - void cpu_event_prod_homo(void *out, const void **in) { + void cpu_csr_event_prod_homo(void *out, const void **in) { // Parse the inputs const std::uint32_t pre_size = *reinterpret_cast(in[0]); const std::uint32_t post_size = *reinterpret_cast(in[1]); @@ -30,7 +30,7 @@ namespace{ } template - void cpu_event_prod_heter(void *out, const void **in) { + void cpu_csr_event_prod_heter(void *out, const void **in) { // Parse the inputs const std::uint32_t pre_size = *reinterpret_cast(in[0]); const std::uint32_t post_size = *reinterpret_cast(in[1]); @@ -55,13 +55,13 @@ namespace{ } } -void cpu_event_prod_homo_f32_i32(void *out, const void **in){cpu_event_prod_homo(out, in);} -void cpu_event_prod_homo_f32_i64(void *out, const void **in){cpu_event_prod_homo(out, in);} -void cpu_event_prod_homo_f64_i32(void *out, const void **in){cpu_event_prod_homo(out, in);} -void cpu_event_prod_homo_f64_i64(void *out, const void **in){cpu_event_prod_homo(out, in);} +void cpu_csr_event_prod_homo_f32_i32(void *out, const void **in){cpu_csr_event_prod_homo(out, in);} +void cpu_csr_event_prod_homo_f32_i64(void *out, const void **in){cpu_csr_event_prod_homo(out, in);} +void cpu_csr_event_prod_homo_f64_i32(void *out, const void **in){cpu_csr_event_prod_homo(out, in);} +void cpu_csr_event_prod_homo_f64_i64(void *out, const void **in){cpu_csr_event_prod_homo(out, in);} -void cpu_event_prod_heter_f32_i32(void *out, const void **in){cpu_event_prod_heter(out, in);} -void cpu_event_prod_heter_f32_i64(void *out, const void **in){cpu_event_prod_heter(out, in);} -void cpu_event_prod_heter_f64_i32(void *out, const void **in){cpu_event_prod_heter(out, in);} -void cpu_event_prod_heter_f64_i64(void *out, const void **in){cpu_event_prod_heter(out, in);} +void cpu_csr_event_prod_heter_f32_i32(void *out, const void **in){cpu_csr_event_prod_heter(out, in);} +void cpu_csr_event_prod_heter_f32_i64(void *out, const void **in){cpu_csr_event_prod_heter(out, in);} +void 
cpu_csr_event_prod_heter_f64_i32(void *out, const void **in){cpu_csr_event_prod_heter(out, in);} +void cpu_csr_event_prod_heter_f64_i64(void *out, const void **in){cpu_csr_event_prod_heter(out, in);} } diff --git a/extensions/lib/cpu_event_prod.h b/extensions/lib/cpu_event_prod.h new file mode 100644 index 000000000..6af73cbc3 --- /dev/null +++ b/extensions/lib/cpu_event_prod.h @@ -0,0 +1,21 @@ +#ifndef _BRAINPY_EVENT_prod_H_ +#define _BRAINPY_EVENT_prod_H_ + +#include +#include +#include + +namespace brainpy_lib { + // "values" is homogeneous + void cpu_csr_event_prod_homo_f32_i32(void *out, const void **in); + void cpu_csr_event_prod_homo_f32_i64(void *out, const void **in); + void cpu_csr_event_prod_homo_f64_i32(void *out, const void **in); + void cpu_csr_event_prod_homo_f64_i64(void *out, const void **in); + // "values" is heterogeneous + void cpu_csr_event_prod_heter_f32_i32(void *out, const void **in); + void cpu_csr_event_prod_heter_f32_i64(void *out, const void **in); + void cpu_csr_event_prod_heter_f64_i32(void *out, const void **in); + void cpu_csr_event_prod_heter_f64_i64(void *out, const void **in); +} + +#endif \ No newline at end of file diff --git a/extensions/lib/event_sum_cpu.cc b/extensions/lib/cpu_event_sum.cc similarity index 71% rename from extensions/lib/event_sum_cpu.cc rename to extensions/lib/cpu_event_sum.cc index 18bf8c738..016ad60a1 100644 --- a/extensions/lib/event_sum_cpu.cc +++ b/extensions/lib/cpu_event_sum.cc @@ -1,15 +1,15 @@ -#include "event_sum_cpu.h" +#include "cpu_event_sum.h" namespace brainpy_lib { namespace{ template - void cpu_event_sum_homo(void *out, const void **in) { + void cpu_csr_event_sum_homo(void *out, const void **in) { const std::uint32_t pre_size = *reinterpret_cast(in[0]); const std::uint32_t post_size = *reinterpret_cast(in[1]); const bool *events = reinterpret_cast(in[2]); const I *indices = reinterpret_cast(in[3]); const I *indptr = reinterpret_cast(in[4]); - const F weight = *reinterpret_cast(in[5]); + const F weight = reinterpret_cast(in[5])[0]; F *result = reinterpret_cast(out); // algorithm @@ -25,7 +25,7 @@ namespace{ // TODO:: batch version of "event_sum_homo" CPU operator template - void cpu_event_sum_batch_homo(void *out, const void **in) { + void cpu_csr_event_sum_batch_homo(void *out, const void **in) { const std::uint32_t pre_size = *reinterpret_cast(in[0]); const std::uint32_t post_size = *reinterpret_cast(in[1]); const bool *events = reinterpret_cast(in[2]); @@ -46,7 +46,7 @@ namespace{ } template - void cpu_event_sum_heter(void *out, const void **in) { + void cpu_csr_event_sum_heter(void *out, const void **in) { const std::uint32_t pre_size = *reinterpret_cast(in[0]); const std::uint32_t post_size = *reinterpret_cast(in[1]); const bool *events = reinterpret_cast(in[2]); @@ -69,7 +69,7 @@ namespace{ // TODO:: batch version of "event_sum_heter" CPU operator template - void cpu_event_sum_batch_heter(void *out, const void **in) { + void cpu_csr_event_sum_batch_heter(void *out, const void **in) { const std::uint32_t pre_size = *reinterpret_cast(in[0]); const std::uint32_t post_size = *reinterpret_cast(in[1]); const bool *events = reinterpret_cast(in[2]); @@ -92,13 +92,13 @@ namespace{ } -void cpu_event_sum_homo_f32_i32(void *out, const void **in){cpu_event_sum_homo(out, in);} -void cpu_event_sum_homo_f32_i64(void *out, const void **in){cpu_event_sum_homo(out, in);} -void cpu_event_sum_homo_f64_i32(void *out, const void **in){cpu_event_sum_homo(out, in);} -void cpu_event_sum_homo_f64_i64(void *out, const void 
**in){cpu_event_sum_homo(out, in);} +void cpu_csr_event_sum_homo_f32_i32(void *out, const void **in){cpu_csr_event_sum_homo(out, in);} +void cpu_csr_event_sum_homo_f32_i64(void *out, const void **in){cpu_csr_event_sum_homo(out, in);} +void cpu_csr_event_sum_homo_f64_i32(void *out, const void **in){cpu_csr_event_sum_homo(out, in);} +void cpu_csr_event_sum_homo_f64_i64(void *out, const void **in){cpu_csr_event_sum_homo(out, in);} -void cpu_event_sum_heter_f32_i32(void *out, const void **in){cpu_event_sum_heter(out, in);} -void cpu_event_sum_heter_f32_i64(void *out, const void **in){cpu_event_sum_heter(out, in);} -void cpu_event_sum_heter_f64_i32(void *out, const void **in){cpu_event_sum_heter(out, in);} -void cpu_event_sum_heter_f64_i64(void *out, const void **in){cpu_event_sum_heter(out, in);} +void cpu_csr_event_sum_heter_f32_i32(void *out, const void **in){cpu_csr_event_sum_heter(out, in);} +void cpu_csr_event_sum_heter_f32_i64(void *out, const void **in){cpu_csr_event_sum_heter(out, in);} +void cpu_csr_event_sum_heter_f64_i32(void *out, const void **in){cpu_csr_event_sum_heter(out, in);} +void cpu_csr_event_sum_heter_f64_i64(void *out, const void **in){cpu_csr_event_sum_heter(out, in);} } diff --git a/extensions/lib/cpu_event_sum.h b/extensions/lib/cpu_event_sum.h new file mode 100644 index 000000000..feed5bc77 --- /dev/null +++ b/extensions/lib/cpu_event_sum.h @@ -0,0 +1,21 @@ +#ifndef _BRAINPY_EVENT_SUM_H_ +#define _BRAINPY_EVENT_SUM_H_ + +#include +#include +#include + +namespace brainpy_lib { + // "values" is homogeneous + void cpu_csr_event_sum_homo_f32_i32(void *out, const void **in); + void cpu_csr_event_sum_homo_f32_i64(void *out, const void **in); + void cpu_csr_event_sum_homo_f64_i32(void *out, const void **in); + void cpu_csr_event_sum_homo_f64_i64(void *out, const void **in); + // "values" is heterogeneous + void cpu_csr_event_sum_heter_f32_i32(void *out, const void **in); + void cpu_csr_event_sum_heter_f32_i64(void *out, const void **in); + void cpu_csr_event_sum_heter_f64_i32(void *out, const void **in); + void cpu_csr_event_sum_heter_f64_i64(void *out, const void **in); +} + +#endif \ No newline at end of file diff --git a/extensions/lib/cpu_ops.cc b/extensions/lib/cpu_ops.cc index 68b55566d..85d056c8f 100644 --- a/extensions/lib/cpu_ops.cc +++ b/extensions/lib/cpu_ops.cc @@ -2,10 +2,10 @@ // It is exposed as a standard pybind11 module defining "capsule" objects containing our // method. For simplicity, we export a separate capsule for each supported dtype. 
-#include "event_sum_cpu.h" -#include "event_prod_cpu.h" -#include "atomic_sum_cpu.h" -#include "atomic_prod_cpu.h" +#include "cpu_event_sum.h" +#include "cpu_event_prod.h" +#include "cpu_atomic_sum.h" +#include "cpu_atomic_prod.h" #include "pybind11_kernel_helpers.h" using namespace brainpy_lib; @@ -15,48 +15,48 @@ namespace { pybind11::dict dict; // event_sum for homogeneous value - dict["cpu_event_sum_homo_f32_i32"] = EncapsulateFunction(cpu_event_sum_homo_f32_i32); - dict["cpu_event_sum_homo_f32_i64"] = EncapsulateFunction(cpu_event_sum_homo_f32_i64); - dict["cpu_event_sum_homo_f64_i32"] = EncapsulateFunction(cpu_event_sum_homo_f64_i32); - dict["cpu_event_sum_homo_f64_i64"] = EncapsulateFunction(cpu_event_sum_homo_f64_i64); + dict["cpu_csr_event_sum_homo_f32_i32"] = EncapsulateFunction(cpu_csr_event_sum_homo_f32_i32); + dict["cpu_csr_event_sum_homo_f32_i64"] = EncapsulateFunction(cpu_csr_event_sum_homo_f32_i64); + dict["cpu_csr_event_sum_homo_f64_i32"] = EncapsulateFunction(cpu_csr_event_sum_homo_f64_i32); + dict["cpu_csr_event_sum_homo_f64_i64"] = EncapsulateFunction(cpu_csr_event_sum_homo_f64_i64); // event_sum for heterogeneous values - dict["cpu_event_sum_heter_f32_i32"] = EncapsulateFunction(cpu_event_sum_heter_f32_i32); - dict["cpu_event_sum_heter_f32_i64"] = EncapsulateFunction(cpu_event_sum_heter_f32_i64); - dict["cpu_event_sum_heter_f64_i32"] = EncapsulateFunction(cpu_event_sum_heter_f64_i32); - dict["cpu_event_sum_heter_f64_i64"] = EncapsulateFunction(cpu_event_sum_heter_f64_i64); + dict["cpu_csr_event_sum_heter_f32_i32"] = EncapsulateFunction(cpu_csr_event_sum_heter_f32_i32); + dict["cpu_csr_event_sum_heter_f32_i64"] = EncapsulateFunction(cpu_csr_event_sum_heter_f32_i64); + dict["cpu_csr_event_sum_heter_f64_i32"] = EncapsulateFunction(cpu_csr_event_sum_heter_f64_i32); + dict["cpu_csr_event_sum_heter_f64_i64"] = EncapsulateFunction(cpu_csr_event_sum_heter_f64_i64); // event_prod for homogeneous value - dict["cpu_event_prod_homo_f32_i32"] = EncapsulateFunction(cpu_event_prod_homo_f32_i32); - dict["cpu_event_prod_homo_f32_i64"] = EncapsulateFunction(cpu_event_prod_homo_f32_i64); - dict["cpu_event_prod_homo_f64_i32"] = EncapsulateFunction(cpu_event_prod_homo_f64_i32); - dict["cpu_event_prod_homo_f64_i64"] = EncapsulateFunction(cpu_event_prod_homo_f64_i64); + dict["cpu_csr_event_prod_homo_f32_i32"] = EncapsulateFunction(cpu_csr_event_prod_homo_f32_i32); + dict["cpu_csr_event_prod_homo_f32_i64"] = EncapsulateFunction(cpu_csr_event_prod_homo_f32_i64); + dict["cpu_csr_event_prod_homo_f64_i32"] = EncapsulateFunction(cpu_csr_event_prod_homo_f64_i32); + dict["cpu_csr_event_prod_homo_f64_i64"] = EncapsulateFunction(cpu_csr_event_prod_homo_f64_i64); // event_prod for heterogeneous values - dict["cpu_event_prod_heter_f32_i32"] = EncapsulateFunction(cpu_event_prod_heter_f32_i32); - dict["cpu_event_prod_heter_f32_i64"] = EncapsulateFunction(cpu_event_prod_heter_f32_i64); - dict["cpu_event_prod_heter_f64_i32"] = EncapsulateFunction(cpu_event_prod_heter_f64_i32); - dict["cpu_event_prod_heter_f64_i64"] = EncapsulateFunction(cpu_event_prod_heter_f64_i64); + dict["cpu_csr_event_prod_heter_f32_i32"] = EncapsulateFunction(cpu_csr_event_prod_heter_f32_i32); + dict["cpu_csr_event_prod_heter_f32_i64"] = EncapsulateFunction(cpu_csr_event_prod_heter_f32_i64); + dict["cpu_csr_event_prod_heter_f64_i32"] = EncapsulateFunction(cpu_csr_event_prod_heter_f64_i32); + dict["cpu_csr_event_prod_heter_f64_i64"] = EncapsulateFunction(cpu_csr_event_prod_heter_f64_i64); // atomic_sum for heterogeneous values - 
dict["cpu_atomic_sum_heter_f32_i32"] = EncapsulateFunction(cpu_atomic_sum_heter_f32_i32); - dict["cpu_atomic_sum_heter_f32_i64"] = EncapsulateFunction(cpu_atomic_sum_heter_f32_i64); - dict["cpu_atomic_sum_heter_f64_i32"] = EncapsulateFunction(cpu_atomic_sum_heter_f64_i32); - dict["cpu_atomic_sum_heter_f64_i64"] = EncapsulateFunction(cpu_atomic_sum_heter_f64_i64); + dict["cpu_coo_atomic_sum_heter_f32_i32"] = EncapsulateFunction(cpu_coo_atomic_sum_heter_f32_i32); + dict["cpu_coo_atomic_sum_heter_f32_i64"] = EncapsulateFunction(cpu_coo_atomic_sum_heter_f32_i64); + dict["cpu_coo_atomic_sum_heter_f64_i32"] = EncapsulateFunction(cpu_coo_atomic_sum_heter_f64_i32); + dict["cpu_coo_atomic_sum_heter_f64_i64"] = EncapsulateFunction(cpu_coo_atomic_sum_heter_f64_i64); // atomic_sum for homogeneous value - dict["cpu_atomic_sum_homo_f32_i32"] = EncapsulateFunction(cpu_atomic_sum_homo_f32_i32); - dict["cpu_atomic_sum_homo_f32_i64"] = EncapsulateFunction(cpu_atomic_sum_homo_f32_i64); - dict["cpu_atomic_sum_homo_f64_i32"] = EncapsulateFunction(cpu_atomic_sum_homo_f64_i32); - dict["cpu_atomic_sum_homo_f64_i64"] = EncapsulateFunction(cpu_atomic_sum_homo_f64_i64); + dict["cpu_coo_atomic_sum_homo_f32_i32"] = EncapsulateFunction(cpu_coo_atomic_sum_homo_f32_i32); + dict["cpu_coo_atomic_sum_homo_f32_i64"] = EncapsulateFunction(cpu_coo_atomic_sum_homo_f32_i64); + dict["cpu_coo_atomic_sum_homo_f64_i32"] = EncapsulateFunction(cpu_coo_atomic_sum_homo_f64_i32); + dict["cpu_coo_atomic_sum_homo_f64_i64"] = EncapsulateFunction(cpu_coo_atomic_sum_homo_f64_i64); // atomic_prod for heterogeneous values - dict["cpu_atomic_prod_heter_f32_i32"] = EncapsulateFunction(cpu_atomic_prod_heter_f32_i32); - dict["cpu_atomic_prod_heter_f32_i64"] = EncapsulateFunction(cpu_atomic_prod_heter_f32_i64); - dict["cpu_atomic_prod_heter_f64_i32"] = EncapsulateFunction(cpu_atomic_prod_heter_f64_i32); - dict["cpu_atomic_prod_heter_f64_i64"] = EncapsulateFunction(cpu_atomic_prod_heter_f64_i64); + dict["cpu_coo_atomic_prod_heter_f32_i32"] = EncapsulateFunction(cpu_coo_atomic_prod_heter_f32_i32); + dict["cpu_coo_atomic_prod_heter_f32_i64"] = EncapsulateFunction(cpu_coo_atomic_prod_heter_f32_i64); + dict["cpu_coo_atomic_prod_heter_f64_i32"] = EncapsulateFunction(cpu_coo_atomic_prod_heter_f64_i32); + dict["cpu_coo_atomic_prod_heter_f64_i64"] = EncapsulateFunction(cpu_coo_atomic_prod_heter_f64_i64); // atomic_prod for homogeneous value - dict["cpu_atomic_prod_homo_f32_i32"] = EncapsulateFunction(cpu_atomic_prod_homo_f32_i32); - dict["cpu_atomic_prod_homo_f32_i64"] = EncapsulateFunction(cpu_atomic_prod_homo_f32_i64); - dict["cpu_atomic_prod_homo_f64_i32"] = EncapsulateFunction(cpu_atomic_prod_homo_f64_i32); - dict["cpu_atomic_prod_homo_f64_i64"] = EncapsulateFunction(cpu_atomic_prod_homo_f64_i64); + dict["cpu_coo_atomic_prod_homo_f32_i32"] = EncapsulateFunction(cpu_coo_atomic_prod_homo_f32_i32); + dict["cpu_coo_atomic_prod_homo_f32_i64"] = EncapsulateFunction(cpu_coo_atomic_prod_homo_f32_i64); + dict["cpu_coo_atomic_prod_homo_f64_i32"] = EncapsulateFunction(cpu_coo_atomic_prod_homo_f64_i32); + dict["cpu_coo_atomic_prod_homo_f64_i64"] = EncapsulateFunction(cpu_coo_atomic_prod_homo_f64_i64); return dict; } diff --git a/extensions/lib/event_prod_cpu.h b/extensions/lib/event_prod_cpu.h deleted file mode 100644 index 565b8b2e8..000000000 --- a/extensions/lib/event_prod_cpu.h +++ /dev/null @@ -1,21 +0,0 @@ -#ifndef _BRAINPY_EVENT_prod_H_ -#define _BRAINPY_EVENT_prod_H_ - -#include -#include -#include - -namespace brainpy_lib { - // "values" is homogeneous - 
void cpu_event_prod_homo_f32_i32(void *out, const void **in); - void cpu_event_prod_homo_f32_i64(void *out, const void **in); - void cpu_event_prod_homo_f64_i32(void *out, const void **in); - void cpu_event_prod_homo_f64_i64(void *out, const void **in); - // "values" is heterogeneous - void cpu_event_prod_heter_f32_i32(void *out, const void **in); - void cpu_event_prod_heter_f32_i64(void *out, const void **in); - void cpu_event_prod_heter_f64_i32(void *out, const void **in); - void cpu_event_prod_heter_f64_i64(void *out, const void **in); -} - -#endif \ No newline at end of file diff --git a/extensions/lib/event_sum_cpu.h b/extensions/lib/event_sum_cpu.h deleted file mode 100644 index c9b47bc70..000000000 --- a/extensions/lib/event_sum_cpu.h +++ /dev/null @@ -1,21 +0,0 @@ -#ifndef _BRAINPY_EVENT_SUM_H_ -#define _BRAINPY_EVENT_SUM_H_ - -#include -#include -#include - -namespace brainpy_lib { - // "values" is homogeneous - void cpu_event_sum_homo_f32_i32(void *out, const void **in); - void cpu_event_sum_homo_f32_i64(void *out, const void **in); - void cpu_event_sum_homo_f64_i32(void *out, const void **in); - void cpu_event_sum_homo_f64_i64(void *out, const void **in); - // "values" is heterogeneous - void cpu_event_sum_heter_f32_i32(void *out, const void **in); - void cpu_event_sum_heter_f32_i64(void *out, const void **in); - void cpu_event_sum_heter_f64_i32(void *out, const void **in); - void cpu_event_sum_heter_f64_i64(void *out, const void **in); -} - -#endif \ No newline at end of file diff --git a/extensions/lib/event_sum_gpu.cu b/extensions/lib/event_sum_gpu.cu deleted file mode 100644 index 79973c9ec..000000000 --- a/extensions/lib/event_sum_gpu.cu +++ /dev/null @@ -1,836 +0,0 @@ -// This file contains the GPU implementation of our op. It's a pretty typical CUDA kernel -// and I make no promises about the quality of the code or the choices made therein, but -// it should get the point across. 
- -#include "event_sum_gpu.h" - -namespace brainpy_lib { - - namespace { - - -// "event_sum_homo" operator // -// This function launches "num_of_pre_neuron" threads to -// update the "result" (in global memory) - template - __global__ void event_sum_homo_kernel(const std::uint32_t size, - const bool *events, - const I *indices, - const I *indptr, - const F &value, - F *result) { - for (std::uint32_t i = blockIdx.x * blockDim.x + threadIdx.x; - i < size; i += blockDim.x * gridDim.x) { - if (events[i]) { - for (I j = indptr[i]; j < indptr[i + 1]; ++j) { - atomicAdd(&result[indices[j]], value); - } - } - } - } - - template - inline void gpu_event_sum_homo(cudaStream_t stream, - void **buffers, - const char *opaque, - std::size_t opaque_len) { - // size - const EventSumDescriptor &d = *UnpackDescriptor(opaque, opaque_len); - const std::uint32_t pre_size = d.pre_size; - const std::uint32_t post_size = d.post_size; - - // input and output data - const bool *events = reinterpret_cast(buffers[0]); - const I *indices = reinterpret_cast(buffers[1]); - const I *indptr = reinterpret_cast(buffers[2]); - const F *value = reinterpret_cast(buffers[3]); - F *result = reinterpret_cast(buffers[4]); - - // call kernel - const int block_dim = 512; - const int grid_dim = std::min(1024, (pre_size + block_dim - 1) / block_dim); - cudaMemset(result, 0, sizeof(F) * post_size); - event_sum_homo_kernel<<>>(pre_size, events, indices, indptr, value[0], - result); - ThrowIfError(cudaGetLastError()); - } - - template - __global__ void event_sum_heter_kernel(const std::uint32_t size, - const bool *events, - const I *indices, - const I *indptr, - const F *values, - F *result) { - for (std::uint32_t i = blockIdx.x * blockDim.x + threadIdx.x; - i < size; i += blockDim.x * gridDim.x) { - if (events[i]) { - for (I j = indptr[i]; j < indptr[i + 1]; ++j) { - atomicAdd(&result[indices[j]], values[j]); - } - } - } - } - - template - inline void gpu_event_sum_heter(cudaStream_t stream, - void **buffers, - const char *opaque, - std::size_t opaque_len) { - // size - const EventSumDescriptor &d = *UnpackDescriptor(opaque, opaque_len); - const std::uint32_t pre_size = d.pre_size; - const std::uint32_t post_size = d.post_size; - - // input and output data - const bool *events = reinterpret_cast(buffers[0]); - const I *indices = reinterpret_cast(buffers[1]); - const I *indptr = reinterpret_cast(buffers[2]); - const F *values = reinterpret_cast(buffers[3]); - F *result = reinterpret_cast(buffers[4]); - - // call kernel - const int block_dim = 512; - const int grid_dim = std::min(1024, (pre_size + block_dim - 1) / block_dim); - cudaMemset(result, 0, sizeof(F) * post_size); - event_sum_heter_kernel<<>>(pre_size, events, indices, indptr, values, - result); - ThrowIfError(cudaGetLastError()); - } - - -// "event_sum2" operator // - - template - __global__ void event_sum2_homo_kernel(const std::uint32_t size, - const bool *events, - const I *pre_ids, - const I *post_ids, - const F &value, - F *result) { - for (std::uint32_t i = blockIdx.x * blockDim.x + threadIdx.x; - i < size; i += blockDim.x * gridDim.x) { - if (events[pre_ids[i]]) { - atomicAdd(&result[post_ids[i]], value); - } - } - } - - template - inline void gpu_event_sum2_homo(cudaStream_t stream, - void **buffers, - const char *opaque, - std::size_t opaque_len) { - // size - const EventSum2Descriptor &d = *UnpackDescriptor(opaque, opaque_len); - const std::uint32_t conn_size = d.conn_size; - const std::uint32_t post_size = d.post_size; - - // input and output data - const bool *events 
= reinterpret_cast(buffers[0]); - const I *pre_ids = reinterpret_cast(buffers[1]); - const I *post_ids = reinterpret_cast(buffers[2]); - const F *value = reinterpret_cast(buffers[3]); - F *result = reinterpret_cast(buffers[4]); - - // call kernel - const int block_dim = 512; - const int grid_dim = std::min(1024, (conn_size + block_dim - 1) / block_dim); - cudaMemset(result, 0, sizeof(F) * post_size); - event_sum2_homo_kernel<<>>(conn_size, events, pre_ids, post_ids, - value[0], result); - ThrowIfError(cudaGetLastError()); - } - - template - __global__ void event_sum2_heter_kernel(const std::uint32_t size, - const bool *events, - const I *pre_ids, - const I *post_ids, - const F *values, - F *result) { - for (std::uint32_t i = blockIdx.x * blockDim.x + threadIdx.x; - i < size; i += blockDim.x * gridDim.x) { - if (events[pre_ids[i]]) { - atomicAdd(&result[post_ids[i]], values[i]); - } - } - } - - template - inline void gpu_event_sum2_heter(cudaStream_t stream, - void **buffers, - const char *opaque, - std::size_t opaque_len) { - // size - const EventSum2Descriptor &d = *UnpackDescriptor(opaque, opaque_len); - const std::uint32_t conn_size = d.conn_size; - const std::uint32_t post_size = d.post_size; - - // iput and output data - const bool *events = reinterpret_cast(buffers[0]); - const I *pre_ids = reinterpret_cast(buffers[1]); - const I *post_ids = reinterpret_cast(buffers[2]); - const F *values = reinterpret_cast(buffers[3]); - F *result = reinterpret_cast(buffers[4]); - - // call kernel - const int block_dim = 512; - const int grid_dim = std::min(1024, (conn_size + block_dim - 1) / block_dim); - cudaMemset(result, 0, sizeof(F) * post_size); - event_sum2_heter_kernel<<>>(conn_size, events, pre_ids, post_ids, - values, result); - ThrowIfError(cudaGetLastError()); - } - - - - - // The third method to make "event_sum" // - // This method is inspired by GeNN codes. 
- - __global__ void collect_spike_info(const bool *events, - const std::uint32_t pre_size, - unsigned int *event_ids, - unsigned int *event_num) { - const unsigned int id = blockDim.x * blockIdx.x + threadIdx.x; -// __shared__ unsigned int shSpk[blockDim.x]; - __shared__ unsigned int shSpk[64]; - __shared__ unsigned int shPosSpk; - __shared__ unsigned int shSpkCount; - if (threadIdx.x == 0) { - shSpkCount = 0; - } - __syncthreads(); - - if (id < pre_size) { - if (events[id]) { - const unsigned int spkIdx = atomicAdd(&shSpkCount, 1); - shSpk[spkIdx] = id; - } - __syncthreads(); - - if (threadIdx.x == 0) { - if (shSpkCount > 0) { - shPosSpk = atomicAdd(&event_num[0], shSpkCount); - } - } - __syncthreads(); - - if (threadIdx.x < shSpkCount) { - const unsigned int n = shSpk[threadIdx.x]; - event_ids[shPosSpk + threadIdx.x] = n; - } - } - } - - template - __global__ void event_sum3_homo_kernel(const std::uint32_t max_post_num, - const I *indices, - const I *indptr, - const F *values, - const unsigned int *event_ids, - const unsigned int *event_num, - F *result) { - const unsigned int id = blockDim.x * blockIdx.x + threadIdx.x; -// __shared__ unsigned int shSpk[blockDim.x]; -// __shared__ I shPreStartID[blockDim.x]; -// __shared__ I shRowLength[blockDim.x]; - __shared__ I shPreStartID[32]; - __shared__ I shRowLength[32]; - __shared__ unsigned int event_count; - __shared__ F value; - - if (threadIdx.x == 0) { - value = values[0]; - event_count = event_num[0]; - } - __syncthreads(); - - if (id < max_post_num) { - const unsigned int num_iter = (event_count + blockDim.x - 1) / blockDim.x; - for (unsigned int r = 0; r < num_iter; r++) { - const unsigned int num_event = (r == num_iter - 1) ? ((event_count - 1) % blockDim.x) + 1 - : blockDim.x; - __syncthreads(); - if (threadIdx.x < num_event) { - const unsigned int pre_i = event_ids[(r * 32) + threadIdx.x]; -// shSpk[threadIdx.x] = pre_i; -// shRowLength[threadIdx.x] = indptr[pre_i + 1] - indptr[pre_i]; - shPreStartID[threadIdx.x] = indptr[pre_i]; - shRowLength[threadIdx.x] = indptr[pre_i + 1] - shPreStartID[threadIdx.x]; - } - __syncthreads(); - // loop through all incoming spikes - for (unsigned int j = 0; j < num_event; j++) { - // only work on existing neurons - const I post_num = shRowLength[j]; - if (id < post_num) { -// const I post_i = indices[indptr[shSpk[j]] + id]; - const I post_i = indices[shPreStartID[j] + id]; - atomicAdd(&result[post_i], value); - } - } - } - } - } - - - template - inline void gpu_event_sum3_homo(cudaStream_t stream, - void **buffers, - const char *opaque, - std::size_t opaque_len) { - // size information - const EventSum3Descriptor &d = *UnpackDescriptor(opaque, opaque_len); - const std::uint32_t pre_size = d.pre_size; - const std::uint32_t post_size = d.post_size; - const std::uint32_t max_post_conn = d.max_post_conn; - - // input and output data // - const bool *events = reinterpret_cast(buffers[0]); - const I *indices = reinterpret_cast(buffers[1]); - const I *indptr = reinterpret_cast(buffers[2]); - const F *values = reinterpret_cast(buffers[3]); - F *result = reinterpret_cast(buffers[4]); - - // get spike information // - unsigned int *event_ids; - cudaMalloc(&event_ids, pre_size * sizeof(unsigned int)); - // I *spikes[pre_size]; - // cudaMemset(spikes, 0, sizeof(I)*pre_size); - unsigned int *event_num; - cudaMalloc(&event_num, 1 * sizeof(unsigned int)); - int block_dim = 64; - int grid_dim = (pre_size + block_dim - 1) / block_dim; - collect_spike_info<<>>(events, - pre_size, - event_ids, - event_num); - - // 
event sum kernel // - cudaMemset(result, 0, sizeof(F) * post_size); - block_dim = 32; - grid_dim = (max_post_conn + block_dim - 1) / block_dim; - event_sum3_homo_kernel<<>>(max_post_conn, - indices, - indptr, - values, - event_ids, - event_num, - result); - - // free memory - cudaFree(event_ids); - cudaFree(event_num); - - // check error - ThrowIfError(cudaGetLastError()); - } - - template - __global__ void event_sum3_heter_kernel(const std::uint32_t max_post_num, - const I *indices, - const I *indptr, - const F *values, - const unsigned int *event_ids, - const unsigned int *event_num, - F *result) { - const unsigned int id = blockDim.x * blockIdx.x + threadIdx.x; -// __shared__ unsigned int shSpk[blockDim.x]; -// __shared__ I shPreStartID[blockDim.x]; -// __shared__ I shRowLength[blockDim.x]; - __shared__ I shPreStartID[32]; - __shared__ I shRowLength[32]; - __shared__ unsigned int event_count; - - if (threadIdx.x == 0) { - event_count = event_num[0]; - } - __syncthreads(); - - if (id < max_post_num) { - const unsigned int num_iter = (event_count + blockDim.x - 1) / blockDim.x; - for (unsigned int r = 0; r < num_iter; r++) { - const unsigned int num_event = (r == num_iter - 1) ? ((event_count - 1) % blockDim.x) + 1 - : blockDim.x; - __syncthreads(); - if (threadIdx.x < num_event) { - const unsigned int spk = event_ids[(r * 32) + threadIdx.x]; -// shSpk[threadIdx.x] = spk; -// shRowLength[threadIdx.x] = indptr[spk + 1] - indptr[spk]; - shPreStartID[threadIdx.x] = indptr[spk]; - shRowLength[threadIdx.x] = indptr[spk + 1] - shPreStartID[threadIdx.x]; - } - __syncthreads(); - // loop through all incoming spikes - for (unsigned int j = 0; j < num_event; j++) { - // only work on existing neurons - const I post_num = shRowLength[j]; - if (id < post_num) { -// const I syn_i = indptr[shSpk[j]] + id; - const I syn_i = shPreStartID[j] + id; - const I post_i = indices[syn_i]; - atomicAdd(&result[post_i], values[syn_i]); - } - } - } - } - } - - template - inline void gpu_event_sum3_heter(cudaStream_t stream, - void **buffers, - const char *opaque, - std::size_t opaque_len) { - // size information - const EventSum3Descriptor &d = *UnpackDescriptor(opaque, opaque_len); - const std::uint32_t pre_size = d.pre_size; - const std::uint32_t post_size = d.post_size; - const std::uint32_t max_post_conn = d.max_post_conn; - - // input and output data // - const bool *events = reinterpret_cast(buffers[0]); - const I *indices = reinterpret_cast(buffers[1]); - const I *indptr = reinterpret_cast(buffers[2]); - const F *values = reinterpret_cast(buffers[3]); - F *result = reinterpret_cast(buffers[4]); - - // get spike information // - unsigned int *event_ids; - cudaMalloc(&event_ids, pre_size * sizeof(unsigned int)); - // I *spikes[pre_size]; - // cudaMemset(spikes, 0, sizeof(I)*pre_size); - unsigned int *event_num; - cudaMalloc(&event_num, 1 * sizeof(unsigned int)); - int block_dim = 64; - int grid_dim = (pre_size + block_dim - 1) / block_dim; - collect_spike_info<<>>(events, - pre_size, - event_ids, - event_num); - - // event sum kernel // - cudaMemset(result, 0, sizeof(F) * post_size); - block_dim = 32; - grid_dim = (max_post_conn + block_dim - 1) / block_dim; - event_sum3_heter_kernel<<>>(max_post_conn, - indices, - indptr, - values, - event_ids, - event_num, - result); - - // free memory - cudaFree(event_ids); - cudaFree(event_num); - - // check error - ThrowIfError(cudaGetLastError()); - } - - - template - __global__ void event_sum4_homo_kernel(const std::uint32_t max_post_conn, - const std::uint32_t pre_size, 
- const bool *events, - const I *indices, - const I *indptr, - const F *values, - F *result) { - __shared__ bool shared_events[32]; - __shared__ I shRowLength[32]; - __shared__ I shPreStartID[32]; - __shared__ F value; - - if (threadIdx.x == 0) { - value = values[0]; - } - __syncthreads(); - - const I id = blockIdx.x * 32 + threadIdx.x; - if (id < max_post_conn) { - const unsigned int num_iter = (pre_size + 32 - 1) / 32; - for (unsigned int r = 0; r < num_iter; r++) { - const unsigned int num_event = (r == num_iter - 1) ? ((pre_size - 1) % 32) + 1 : 32; - // assume "max_post_conn" >= num_event - if (threadIdx.x < num_event) { - const unsigned int pre_i = (r * 32) + threadIdx.x; - shared_events[threadIdx.x] = events[pre_i]; - if (shared_events[threadIdx.x]) { - shPreStartID[threadIdx.x] = indptr[pre_i]; - shRowLength[threadIdx.x] = indptr[pre_i + 1] - shPreStartID[threadIdx.x]; - } - } - __syncthreads(); - for (unsigned int j = 0; j < num_event; j++) { - if (shared_events[j]) { - if (id < shRowLength[j]) { - const I syn_i = shPreStartID[j] + id; - const I post_i = indices[syn_i]; - atomicAdd(&result[post_i], value); - } - } - } - } - } - } - - template - inline void gpu_event_sum4_homo(cudaStream_t stream, - void **buffers, - const char *opaque, - std::size_t opaque_len) { - // size - const EventSum3Descriptor &d = *UnpackDescriptor(opaque, opaque_len); - const std::uint32_t pre_size = d.pre_size; - const std::uint32_t post_size = d.post_size; - const std::uint32_t max_post_conn = d.max_post_conn; - - // input and output data - const bool *events = reinterpret_cast(buffers[0]); - const I *indices = reinterpret_cast(buffers[1]); - const I *indptr = reinterpret_cast(buffers[2]); - const F *values = reinterpret_cast(buffers[3]); // 1D vector with the size of 1 - F *result = reinterpret_cast(buffers[4]); - - // call kernel - const int block_dim = 32; - const int grid_dim = (max_post_conn + block_dim - 1) / block_dim; - cudaMemset(result, 0, sizeof(F) * post_size); - event_sum4_homo_kernel<<>>(max_post_conn, - pre_size, - events, - indices, - indptr, - values, - result); - ThrowIfError(cudaGetLastError()); - } - - template - __global__ void event_sum4_heter_kernel(const std::uint32_t max_post_conn, - const std::uint32_t pre_size, - const bool *events, - const I *indices, - const I *indptr, - const F *values, - F *result) { - __shared__ bool shared_events[32]; - __shared__ I shRowLength[32]; - __shared__ I shPreStartID[32]; - - const I id = blockIdx.x * 32 + threadIdx.x; - if (id < max_post_conn) { - const unsigned int num_iter = (pre_size + 32 - 1) / 32; - for (unsigned int r = 0; r < num_iter; r++) { - const unsigned int num_event = (r == num_iter - 1) ? 
((pre_size - 1) % 32) + 1 : 32; - // assume "max_post_conn" >= num_event - // TODO: fix the bug - if (threadIdx.x < num_event) { - const unsigned int pre_i = (r * 32) + threadIdx.x; - shared_events[threadIdx.x] = events[pre_i]; - if (shared_events[threadIdx.x]) { - shPreStartID[threadIdx.x] = indptr[pre_i]; - shRowLength[threadIdx.x] = indptr[pre_i + 1] - shPreStartID[threadIdx.x]; - } - } - __syncthreads(); - for (unsigned int j = 0; j < num_event; j++) { - if (shared_events[j]) { - if (id < shRowLength[j]) { - const I syn_i = shPreStartID[j] + id; - const I post_i = indices[syn_i]; - atomicAdd(&result[post_i], values[syn_i]); - } - } - } - } - } - } - - - template - inline void gpu_event_sum4_heter(cudaStream_t stream, - void **buffers, - const char *opaque, - std::size_t opaque_len) { - // size - const EventSum3Descriptor &d = *UnpackDescriptor(opaque, opaque_len); - const std::uint32_t pre_size = d.pre_size; - const std::uint32_t post_size = d.post_size; - const std::uint32_t max_post_conn = d.max_post_conn; - - // input and output data - const bool *events = reinterpret_cast(buffers[0]); - const I *indices = reinterpret_cast(buffers[1]); - const I *indptr = reinterpret_cast(buffers[2]); - const F *values = reinterpret_cast(buffers[3]); // 1D vector with the size of 1 - F *result = reinterpret_cast(buffers[4]); - - // call kernel - const int block_dim = 32; - const int grid_dim = (max_post_conn + block_dim - 1) / block_dim; - cudaMemset(result, 0, sizeof(F) * post_size); - event_sum4_heter_kernel<<>>(max_post_conn, - pre_size, - events, - indices, - indptr, - values, - result); - ThrowIfError(cudaGetLastError()); - } - - - } // namespace - - - // Descriptors - pybind11::bytes build_event_sum_descriptor(std::uint32_t pre_size, - std::uint32_t post_size) { - return PackDescriptor(EventSumDescriptor{pre_size, post_size}); - } - - pybind11::bytes build_event_sum2_descriptor(std::uint32_t conn_size, - std::uint32_t post_size) { - return PackDescriptor(EventSum2Descriptor{conn_size, post_size}); - } - - pybind11::bytes build_event_sum3_descriptor(std::uint32_t pre_size, - std::uint32_t post_size, - std::uint32_t max_post_conn) { - return PackDescriptor(EventSum3Descriptor{pre_size, post_size, max_post_conn}); - } - - - // homogenous event sum 1 - void gpu_event_sum_homo_f32_i32(cudaStream_t stream, - void **buffers, - const char *opaque, - std::size_t opaque_len) { - gpu_event_sum_homo(stream, buffers, opaque, opaque_len); - } - - void gpu_event_sum_homo_f32_i64(cudaStream_t stream, - void **buffers, - const char *opaque, - std::size_t opaque_len) { - gpu_event_sum_homo(stream, buffers, opaque, opaque_len); - } - - void gpu_event_sum_homo_f64_i32(cudaStream_t stream, - void **buffers, - const char *opaque, - std::size_t opaque_len) { - gpu_event_sum_homo(stream, buffers, opaque, opaque_len); - } - - void gpu_event_sum_homo_f64_i64(cudaStream_t stream, - void **buffers, - const char *opaque, - std::size_t opaque_len) { - gpu_event_sum_homo(stream, buffers, opaque, opaque_len); - } - - // heterogeneous event sum 1 - void gpu_event_sum_heter_f32_i32(cudaStream_t stream, - void **buffers, - const char *opaque, - std::size_t opaque_len) { - gpu_event_sum_heter(stream, buffers, opaque, opaque_len); - } - - void gpu_event_sum_heter_f32_i64(cudaStream_t stream, - void **buffers, - const char *opaque, - std::size_t opaque_len) { - gpu_event_sum_heter(stream, buffers, opaque, opaque_len); - } - - void gpu_event_sum_heter_f64_i32(cudaStream_t stream, - void **buffers, - const char *opaque, - 
std::size_t opaque_len) { - gpu_event_sum_heter(stream, buffers, opaque, opaque_len); - } - - void gpu_event_sum_heter_f64_i64(cudaStream_t stream, - void **buffers, - const char *opaque, - std::size_t opaque_len) { - gpu_event_sum_heter(stream, buffers, opaque, opaque_len); - } - - - // homogenous event sum 2 - void gpu_event_sum2_homo_f32_i32(cudaStream_t stream, - void **buffers, - const char *opaque, - std::size_t opaque_len) { - gpu_event_sum2_homo(stream, buffers, opaque, opaque_len); - } - - void gpu_event_sum2_homo_f32_i64(cudaStream_t stream, - void **buffers, - const char *opaque, - std::size_t opaque_len) { - gpu_event_sum2_homo(stream, buffers, opaque, opaque_len); - } - - void gpu_event_sum2_homo_f64_i32(cudaStream_t stream, - void **buffers, - const char *opaque, - std::size_t opaque_len) { - gpu_event_sum2_homo(stream, buffers, opaque, opaque_len); - } - - void gpu_event_sum2_homo_f64_i64(cudaStream_t stream, - void **buffers, - const char *opaque, - std::size_t opaque_len) { - gpu_event_sum2_homo(stream, buffers, opaque, opaque_len); - } - - // heterogeneous event sum 2 - void gpu_event_sum2_heter_f32_i32(cudaStream_t stream, - void **buffers, - const char *opaque, - std::size_t opaque_len) { - gpu_event_sum2_heter(stream, buffers, opaque, opaque_len); - } - - void gpu_event_sum2_heter_f32_i64(cudaStream_t stream, - void **buffers, - const char *opaque, - std::size_t opaque_len) { - gpu_event_sum2_heter(stream, buffers, opaque, opaque_len); - } - - void gpu_event_sum2_heter_f64_i32(cudaStream_t stream, - void **buffers, - const char *opaque, - std::size_t opaque_len) { - gpu_event_sum2_heter(stream, buffers, opaque, opaque_len); - } - - void gpu_event_sum2_heter_f64_i64(cudaStream_t stream, - void **buffers, - const char *opaque, - std::size_t opaque_len) { - gpu_event_sum2_heter(stream, buffers, opaque, opaque_len); - } - - - // homogenous event sum 3 - void gpu_event_sum3_homo_f32_i32(cudaStream_t stream, - void **buffers, - const char *opaque, - std::size_t opaque_len) { - gpu_event_sum3_homo(stream, buffers, opaque, opaque_len); - } - - void gpu_event_sum3_homo_f32_i64(cudaStream_t stream, - void **buffers, - const char *opaque, - std::size_t opaque_len) { - gpu_event_sum3_homo(stream, buffers, opaque, opaque_len); - } - - void gpu_event_sum3_homo_f64_i32(cudaStream_t stream, - void **buffers, - const char *opaque, - std::size_t opaque_len) { - gpu_event_sum3_homo(stream, buffers, opaque, opaque_len); - } - - void gpu_event_sum3_homo_f64_i64(cudaStream_t stream, - void **buffers, - const char *opaque, - std::size_t opaque_len) { - gpu_event_sum3_homo(stream, buffers, opaque, opaque_len); - } - - // heterogeneous event sum 3 - void gpu_event_sum3_heter_f32_i32(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len) { - gpu_event_sum3_heter(stream, buffers, opaque, opaque_len); - } - - void gpu_event_sum3_heter_f32_i64(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len) { - gpu_event_sum3_heter(stream, buffers, opaque, opaque_len); - } - - void gpu_event_sum3_heter_f64_i32(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len) { - gpu_event_sum3_heter(stream, buffers, opaque, opaque_len); - } - - void gpu_event_sum3_heter_f64_i64(cudaStream_t stream, - void **buffers, - const char *opaque, - std::size_t opaque_len) { - gpu_event_sum3_heter(stream, buffers, opaque, opaque_len); - } - - - // homogenous event sum 3 - void gpu_event_sum4_homo_f32_i32(cudaStream_t stream, - void **buffers, - const 
char *opaque, - std::size_t opaque_len) { - gpu_event_sum4_homo(stream, buffers, opaque, opaque_len); - } - - void gpu_event_sum4_homo_f32_i64(cudaStream_t stream, - void **buffers, - const char *opaque, - std::size_t opaque_len) { - gpu_event_sum4_homo(stream, buffers, opaque, opaque_len); - } - - void gpu_event_sum4_homo_f64_i32(cudaStream_t stream, - void **buffers, - const char *opaque, - std::size_t opaque_len) { - gpu_event_sum4_homo(stream, buffers, opaque, opaque_len); - } - - void gpu_event_sum4_homo_f64_i64(cudaStream_t stream, - void **buffers, - const char *opaque, - std::size_t opaque_len) { - gpu_event_sum4_homo(stream, buffers, opaque, opaque_len); - } - - // heterogeneous event sum 3 - void gpu_event_sum4_heter_f32_i32(cudaStream_t stream, - void **buffers, - const char *opaque, - std::size_t opaque_len) { - gpu_event_sum4_heter(stream, buffers, opaque, opaque_len); - } - - void gpu_event_sum4_heter_f32_i64(cudaStream_t stream, - void **buffers, - const char *opaque, - std::size_t opaque_len) { - gpu_event_sum4_heter(stream, buffers, opaque, opaque_len); - } - - void gpu_event_sum4_heter_f64_i32(cudaStream_t stream, - void **buffers, - const char *opaque, - std::size_t opaque_len) { - gpu_event_sum4_heter(stream, buffers, opaque, opaque_len); - } - - void gpu_event_sum4_heter_f64_i64(cudaStream_t stream, - void **buffers, - const char *opaque, - std::size_t opaque_len) { - gpu_event_sum4_heter(stream, buffers, opaque, opaque_len); - } - - -} // namespace brainpylib diff --git a/extensions/lib/event_sum_gpu.h b/extensions/lib/event_sum_gpu.h deleted file mode 100644 index 3837f6e95..000000000 --- a/extensions/lib/event_sum_gpu.h +++ /dev/null @@ -1,112 +0,0 @@ -#ifndef _BRAINPY_EVENT_SUM_KERNELS_H_ -#define _BRAINPY_EVENT_SUM_KERNELS_H_ - -#include -#include -#include "pybind11_kernel_helpers.h" -#include "kernel_helpers_gpu.h" - -namespace brainpy_lib { - - struct EventSumDescriptor { - std::uint32_t pre_size; - std::uint32_t post_size; - }; - - pybind11::bytes build_event_sum_descriptor(std::uint32_t pre_size, std::uint32_t post_size); - - // homogeneous - void gpu_event_sum_homo_f32_i32(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len); - - void gpu_event_sum_homo_f32_i64(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len); - - void gpu_event_sum_homo_f64_i32(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len); - - void gpu_event_sum_homo_f64_i64(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len); - - // heterogeneous - void gpu_event_sum_heter_f32_i32(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len); - - void gpu_event_sum_heter_f32_i64(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len); - - void gpu_event_sum_heter_f64_i32(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len); - - void gpu_event_sum_heter_f64_i64(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len); - - struct EventSum2Descriptor { - std::uint32_t conn_size; - std::uint32_t post_size; - }; - - pybind11::bytes build_event_sum2_descriptor(std::uint32_t conn_size, std::uint32_t post_size); - - - // homogeneous - void gpu_event_sum2_homo_f32_i32(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len); - - void gpu_event_sum2_homo_f32_i64(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len); - - void 
gpu_event_sum2_homo_f64_i32(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len); - - void gpu_event_sum2_homo_f64_i64(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len); - - // heterogeneous - void gpu_event_sum2_heter_f32_i32(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len); - - void gpu_event_sum2_heter_f32_i64(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len); - - void gpu_event_sum2_heter_f64_i32(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len); - - void gpu_event_sum2_heter_f64_i64(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len); - - // event_sum3 descriptor - struct EventSum3Descriptor { - std::uint32_t pre_size; - std::uint32_t post_size; - std::uint32_t max_post_conn; - }; - - pybind11::bytes build_event_sum3_descriptor(std::uint32_t pre_size, std::uint32_t post_size, - std::uint32_t max_post_conn); - - // event_sum3 homogeneous - void gpu_event_sum3_homo_f32_i32(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len); - - void gpu_event_sum3_homo_f32_i64(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len); - - void gpu_event_sum3_homo_f64_i32(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len); - - void gpu_event_sum3_homo_f64_i64(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len); - - // event_sum3 heterogeneous - void gpu_event_sum3_heter_f32_i32(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len); - - void gpu_event_sum3_heter_f32_i64(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len); - - void gpu_event_sum3_heter_f64_i32(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len); - - void gpu_event_sum3_heter_f64_i64(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len); - - // event_sum4 homogeneous - void gpu_event_sum4_homo_f32_i32(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len); - - void gpu_event_sum4_homo_f32_i64(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len); - - void gpu_event_sum4_homo_f64_i32(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len); - - void gpu_event_sum4_homo_f64_i64(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len); - - // event_sum4 heterogeneous - void gpu_event_sum4_heter_f32_i32(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len); - - void gpu_event_sum4_heter_f32_i64(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len); - - void gpu_event_sum4_heter_f64_i32(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len); - - void gpu_event_sum4_heter_f64_i64(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len); - - - -} // namespace brainpy_lib - -#endif \ No newline at end of file diff --git a/extensions/lib/gpu_atomic_prod.cu b/extensions/lib/gpu_atomic_prod.cu new file mode 100644 index 000000000..d15eb6b29 --- /dev/null +++ b/extensions/lib/gpu_atomic_prod.cu @@ -0,0 +1,168 @@ +// This file contains the GPU implementation of our op. It's a pretty typical CUDA kernel +// and I make no promises about the quality of the code or the choices made therein, but +// it should get the point across. 
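A note on the kernels in this new file: CUDA has no native atomic multiply, so an atomic product is normally emulated with an atomicCAS loop, in the same spirit as the double-precision atomicAdd fallback that follows. The sketch below is illustrative only and is not part of this patch (the kernels in this file accumulate with atomicAdd); the helper name atomicMul is hypothetical.

// Hypothetical helper (not in this patch): an atomic multiply for float,
// built on atomicCAS because CUDA provides no atomicMul intrinsic.
__device__ float atomicMul(float* address, float val)
{
    int* address_as_int = (int*) address;
    int old = *address_as_int, assumed;
    do {
        assumed = old;
        // reinterpret the bits as int for the compare-and-swap,
        // multiply in float space, then try to install the new bits
        old = atomicCAS(address_as_int, assumed,
                        __float_as_int(val * __int_as_float(assumed)));
    } while (assumed != old);
    return __int_as_float(old);
}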
+
+#include "gpu_atomic_prod.h"
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600
+#else
+__device__ double atomicAdd(double* address, double val)
+{
+    unsigned long long int* address_as_ull = (unsigned long long int*)address;
+    unsigned long long int old = *address_as_ull, assumed;
+    do {
+        assumed = old;
+        old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
+        // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
+    } while (assumed != old);
+    return __longlong_as_double(old);
+}
+#endif
+
+namespace brainpy_lib {
+
+    namespace {
+
+// "atomic_prod" operator //
+        template <typename F, typename I>
+        __global__ void gpu_coo_atomic_prod_homo_kernel(const std::uint32_t size,
+                                                        const F &value,
+                                                        const I *post_ids,
+                                                        F *result) {
+            for (std::uint32_t i = blockIdx.x * blockDim.x + threadIdx.x;
+                 i < size; i += blockDim.x * gridDim.x) {
+                atomicAdd(&result[post_ids[i]], value);
+            }
+        }
+
+        template <typename F, typename I>
+        inline void gpu_coo_atomic_prod_homo(cudaStream_t stream,
+                                             void **buffers,
+                                             const char *opaque,
+                                             std::size_t opaque_len) {
+            // size
+            const COOAtomicProdDescriptor &d = *UnpackDescriptor<COOAtomicProdDescriptor>(opaque, opaque_len);
+            const std::uint32_t conn_size = d.conn_size;
+            const std::uint32_t post_size = d.post_size;
+
+            // input and output data
+            const F *values = reinterpret_cast<const F *>(buffers[0]);  // scalar as a vector
+            const I *post_ids = reinterpret_cast<const I *>(buffers[1]);
+            F *result = reinterpret_cast<F *>(buffers[2]);
+
+            // call kernel
+            const int block_dim = 512;
+            const int grid_dim = std::min(1024, (conn_size + block_dim - 1) / block_dim);
+            cudaMemset(result, 1, sizeof(F) * post_size);
+            gpu_coo_atomic_prod_homo_kernel<F, I><<<grid_dim, block_dim, 0, stream>>>(
+                conn_size, values[0], post_ids, result);
+            ThrowIfError(cudaGetLastError());
+        }
+
+        template <typename F, typename I>
+        __global__ void gpu_coo_atomic_prod_heter_kernel(const std::uint32_t size,
+                                                         const F *values,
+                                                         const I *post_ids,
+                                                         const I *pre_ids,
+                                                         F *result) {
+            for (std::uint32_t i = blockIdx.x * blockDim.x + threadIdx.x;
+                 i < size; i += blockDim.x * gridDim.x) {
+                atomicAdd(&result[post_ids[i]], values[pre_ids[i]]);
+            }
+        }
+
+        template <typename F, typename I>
+        inline void gpu_coo_atomic_prod_heter(cudaStream_t stream,
+                                              void **buffers,
+                                              const char *opaque,
+                                              std::size_t opaque_len) {
+            // size
+            const COOAtomicProdDescriptor &d = *UnpackDescriptor<COOAtomicProdDescriptor>(opaque, opaque_len);
+            const std::uint32_t conn_size = d.conn_size;
+            const std::uint32_t post_size = d.post_size;
+
+            // input and output data
+            const F *values = reinterpret_cast<const F *>(buffers[0]);  // one weight per pre-synaptic id
+            const I *post_ids = reinterpret_cast<const I *>(buffers[1]);
+            const I *pre_ids = reinterpret_cast<const I *>(buffers[2]);
+            F *result = reinterpret_cast<F *>(buffers[3]);
+
+            // call kernel
+            const int block_dim = 512;
+            const int grid_dim = std::min(1024, (conn_size + block_dim - 1) / block_dim);
+            cudaMemset(result, 1, sizeof(F) * post_size);
+            gpu_coo_atomic_prod_heter_kernel<F, I><<<grid_dim, block_dim, 0, stream>>>(
+                conn_size, values, post_ids, pre_ids, result);
+            ThrowIfError(cudaGetLastError());
+        }
+
+
+    } // namespace
+
+
+// Descriptor
+    pybind11::bytes build_coo_atomic_prod_descriptor(std::uint32_t conn_size,
+                                                     std::uint32_t post_size) {
+        return PackDescriptor(COOAtomicProdDescriptor{conn_size, post_size});
+    }
+
+// homogeneous atomic prod
+    void gpu_coo_atomic_prod_homo_f32_i32(cudaStream_t stream,
+                                          void **buffers,
+                                          const char *opaque,
+                                          std::size_t opaque_len) {
+        gpu_coo_atomic_prod_homo<float, std::int32_t>(stream, buffers, opaque, opaque_len);
+    }
+
+    void gpu_coo_atomic_prod_homo_f32_i64(cudaStream_t stream,
+                                          void **buffers,
+                                          const char *opaque,
+                                          std::size_t opaque_len) {
+        gpu_coo_atomic_prod_homo<float, std::int64_t>(stream, buffers, opaque, opaque_len);
+    }
+
+    void gpu_coo_atomic_prod_homo_f64_i32(cudaStream_t stream,
+                                          void **buffers,
+                                          const char *opaque,
+                                          std::size_t opaque_len) {
+        gpu_coo_atomic_prod_homo<double, std::int32_t>(stream, buffers, opaque, opaque_len);
+    }
+
+    void gpu_coo_atomic_prod_homo_f64_i64(cudaStream_t stream,
+                                          void **buffers,
+                                          const char *opaque,
+                                          std::size_t opaque_len) {
+        gpu_coo_atomic_prod_homo<double, std::int64_t>(stream, buffers, opaque, opaque_len);
+    }
+
+// heterogeneous atomic prod
+    void gpu_coo_atomic_prod_heter_f32_i32(cudaStream_t stream,
+                                           void **buffers,
+                                           const char *opaque,
+                                           std::size_t opaque_len) {
+        gpu_coo_atomic_prod_heter<float, std::int32_t>(stream, buffers, opaque, opaque_len);
+    }
+
+    void gpu_coo_atomic_prod_heter_f32_i64(cudaStream_t stream,
+                                           void **buffers,
+                                           const char *opaque,
+                                           std::size_t opaque_len) {
+        gpu_coo_atomic_prod_heter<float, std::int64_t>(stream, buffers, opaque, opaque_len);
+    }
+
+    void gpu_coo_atomic_prod_heter_f64_i32(cudaStream_t stream,
+                                           void **buffers,
+                                           const char *opaque,
+                                           std::size_t opaque_len) {
+        gpu_coo_atomic_prod_heter<double, std::int32_t>(stream, buffers, opaque, opaque_len);
+    }
+
+    void gpu_coo_atomic_prod_heter_f64_i64(cudaStream_t stream,
+                                           void **buffers,
+                                           const char *opaque,
+                                           std::size_t opaque_len) {
+        gpu_coo_atomic_prod_heter<double, std::int64_t>(stream, buffers, opaque, opaque_len);
+    }
+
+
+} // namespace brainpylib
diff --git a/extensions/lib/gpu_atomic_prod.h b/extensions/lib/gpu_atomic_prod.h
new file mode 100644
index 000000000..8552538c1
--- /dev/null
+++ b/extensions/lib/gpu_atomic_prod.h
@@ -0,0 +1,40 @@
+#ifndef _BRAINPY_ATOMIC_PROD_KERNELS_H_
+#define _BRAINPY_ATOMIC_PROD_KERNELS_H_
+
+#include
+#include
+#include "pybind11_kernel_helpers.h"
+#include "kernel_helpers_gpu.h"
+
+namespace brainpy_lib {
+    struct COOAtomicProdDescriptor {
+        std::uint32_t conn_size;
+        std::uint32_t post_size;
+    };
+
+    // homogeneous
+    void
+    gpu_coo_atomic_prod_homo_f32_i32(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len);
+    void
+    gpu_coo_atomic_prod_homo_f32_i64(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len);
+    void
+    gpu_coo_atomic_prod_homo_f64_i32(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len);
+    void
+    gpu_coo_atomic_prod_homo_f64_i64(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len);
+
+    // heterogeneous
+    void
+    gpu_coo_atomic_prod_heter_f32_i32(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len);
+    void
+    gpu_coo_atomic_prod_heter_f32_i64(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len);
+    void
+    gpu_coo_atomic_prod_heter_f64_i32(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len);
+    void
+    gpu_coo_atomic_prod_heter_f64_i64(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len);
+
+    // descriptors
+    pybind11::bytes build_coo_atomic_prod_descriptor(std::uint32_t conn_size, std::uint32_t post_size);
+
+} // namespace brainpy_lib
+
+#endif
\ No newline at end of file
diff --git a/extensions/lib/atomic_sum_gpu.cu b/extensions/lib/gpu_atomic_sum.cu
similarity index 63%
rename from extensions/lib/atomic_sum_gpu.cu
rename to extensions/lib/gpu_atomic_sum.cu
index 99ff643ab..3b4be8135 100644
--- a/extensions/lib/atomic_sum_gpu.cu
+++ b/extensions/lib/gpu_atomic_sum.cu
@@ -2,7 +2,22 @@
 // and I make no promises about the quality of the code or the choices made therein, but
 // it should get the point across.
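The kernels in these files all launch with a fixed block size of 512 and iterate with a grid-stride loop, and the COO atomic_sum/atomic_prod wrappers additionally cap the grid at 1024 blocks so the same launch works for any connection count. A minimal, self-contained sketch of that pattern (hypothetical names; assumes only the CUDA runtime and standard headers, not part of this patch):

#include <algorithm>
#include <cstdint>
#include <cuda_runtime.h>

// Grid-stride loop: each thread handles indices i, i + stride, i + 2*stride, ...
template <typename F>
__global__ void example_scale_kernel(const std::uint32_t size, const F factor, F *data) {
    for (std::uint32_t i = blockIdx.x * blockDim.x + threadIdx.x;
         i < size; i += blockDim.x * gridDim.x) {
        data[i] *= factor;
    }
}

template <typename F>
inline void launch_example_scale(cudaStream_t stream, std::uint32_t size, const F factor, F *data) {
    const int block_dim = 512;
    // cap the grid; the stride loop covers whatever the grid does not
    const int grid_dim = std::min(1024, (int) ((size + block_dim - 1) / block_dim));
    example_scale_kernel<F><<<grid_dim, block_dim, 0, stream>>>(size, factor, data);
}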
-#include "atomic_sum_gpu.h" +#include "gpu_atomic_sum.h" + +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 +#else +__device__ double atomicAdd(double* address, double val) +{ + unsigned long long int* address_as_ull = (unsigned long long int*)address; + unsigned long long int old = *address_as_ull, assumed; + do { + assumed = old; + old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __longlong_as_double(old); +} +#endif namespace brainpy_lib { @@ -10,7 +25,7 @@ namespace brainpy_lib { // "atomic_sum" operator // template - __global__ void gpu_atomic_sum_homo_kernel(const std::uint32_t size, + __global__ void gpu_coo_atomic_sum_homo_kernel(const std::uint32_t size, const F &value, const I *post_ids, F *result) { @@ -21,12 +36,12 @@ namespace brainpy_lib { } template - inline void gpu_atomic_sum_homo(cudaStream_t stream, + inline void gpu_coo_atomic_sum_homo(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len) { // size - const AtomicSumDescriptor &d = *UnpackDescriptor(opaque, opaque_len); + const COOAtomicSumDescriptor &d = *UnpackDescriptor(opaque, opaque_len); const std::uint32_t conn_size = d.conn_size; const std::uint32_t post_size = d.post_size; @@ -39,13 +54,13 @@ namespace brainpy_lib { const int block_dim = 512; const int grid_dim = std::min(1024, (conn_size + block_dim - 1) / block_dim); cudaMemset(result, 0, sizeof(F) * post_size); - gpu_atomic_sum_homo_kernel<<>>(conn_size, values[0], post_ids, - result); + gpu_coo_atomic_sum_homo_kernel<<>>( + conn_size, values[0], post_ids, result); ThrowIfError(cudaGetLastError()); } template - __global__ void gpu_atomic_sum_heter_kernel(const std::uint32_t size, + __global__ void gpu_coo_atomic_sum_heter_kernel(const std::uint32_t size, const F *values, const I *post_ids, const I *pre_ids, @@ -57,12 +72,12 @@ namespace brainpy_lib { } template - inline void gpu_atomic_sum_heter(cudaStream_t stream, + inline void gpu_coo_atomic_sum_heter(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len) { // size - const AtomicSumDescriptor &d = *UnpackDescriptor(opaque, opaque_len); + const COOAtomicSumDescriptor &d = *UnpackDescriptor(opaque, opaque_len); const std::uint32_t conn_size = d.conn_size; const std::uint32_t post_size = d.post_size; @@ -76,8 +91,8 @@ namespace brainpy_lib { const int block_dim = 512; const int grid_dim = std::min(1024, (conn_size + block_dim - 1) / block_dim); cudaMemset(result, 0, sizeof(F) * post_size); - gpu_atomic_sum_heter_kernel<<>>(conn_size, values, post_ids, pre_ids, - result); + gpu_coo_atomic_sum_heter_kernel<<>>( + conn_size, values, post_ids, pre_ids, result); ThrowIfError(cudaGetLastError()); } @@ -86,67 +101,67 @@ namespace brainpy_lib { // Descriptor - pybind11::bytes build_atomic_sum_descriptor(std::uint32_t conn_size, + pybind11::bytes build_coo_atomic_sum_descriptor(std::uint32_t conn_size, std::uint32_t post_size) { - return PackDescriptor(AtomicSumDescriptor{conn_size, post_size}); + return PackDescriptor(COOAtomicSumDescriptor{conn_size, post_size}); } // homogenous atomic sum - void gpu_atomic_sum_homo_f32_i32(cudaStream_t stream, + void gpu_coo_atomic_sum_homo_f32_i32(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len) { - gpu_atomic_sum_homo(stream, buffers, opaque, opaque_len); + gpu_coo_atomic_sum_homo(stream, buffers, opaque, opaque_len); } - 
void gpu_atomic_sum_homo_f32_i64(cudaStream_t stream, + void gpu_coo_atomic_sum_homo_f32_i64(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len) { - gpu_atomic_sum_homo(stream, buffers, opaque, opaque_len); + gpu_coo_atomic_sum_homo(stream, buffers, opaque, opaque_len); } - void gpu_atomic_sum_homo_f64_i32(cudaStream_t stream, + void gpu_coo_atomic_sum_homo_f64_i32(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len) { - gpu_atomic_sum_homo(stream, buffers, opaque, opaque_len); + gpu_coo_atomic_sum_homo(stream, buffers, opaque, opaque_len); } - void gpu_atomic_sum_homo_f64_i64(cudaStream_t stream, + void gpu_coo_atomic_sum_homo_f64_i64(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len) { - gpu_atomic_sum_homo(stream, buffers, opaque, opaque_len); + gpu_coo_atomic_sum_homo(stream, buffers, opaque, opaque_len); } // heterogeneous atomic sum - void gpu_atomic_sum_heter_f32_i32(cudaStream_t stream, + void gpu_coo_atomic_sum_heter_f32_i32(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len) { - gpu_atomic_sum_heter(stream, buffers, opaque, opaque_len); + gpu_coo_atomic_sum_heter(stream, buffers, opaque, opaque_len); } - void gpu_atomic_sum_heter_f32_i64(cudaStream_t stream, + void gpu_coo_atomic_sum_heter_f32_i64(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len) { - gpu_atomic_sum_heter(stream, buffers, opaque, opaque_len); + gpu_coo_atomic_sum_heter(stream, buffers, opaque, opaque_len); } - void gpu_atomic_sum_heter_f64_i32(cudaStream_t stream, + void gpu_coo_atomic_sum_heter_f64_i32(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len) { - gpu_atomic_sum_heter(stream, buffers, opaque, opaque_len); + gpu_coo_atomic_sum_heter(stream, buffers, opaque, opaque_len); } - void gpu_atomic_sum_heter_f64_i64(cudaStream_t stream, + void gpu_coo_atomic_sum_heter_f64_i64(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len) { - gpu_atomic_sum_heter(stream, buffers, opaque, opaque_len); + gpu_coo_atomic_sum_heter(stream, buffers, opaque, opaque_len); } diff --git a/extensions/lib/gpu_atomic_sum.h b/extensions/lib/gpu_atomic_sum.h new file mode 100644 index 000000000..a48808679 --- /dev/null +++ b/extensions/lib/gpu_atomic_sum.h @@ -0,0 +1,31 @@ +#ifndef _BRAINPY_ATOMIC_SUM_KERNELS_H_ +#define _BRAINPY_ATOMIC_SUM_KERNELS_H_ + +#include +#include +#include "pybind11_kernel_helpers.h" +#include "kernel_helpers_gpu.h" + +namespace brainpy_lib { + struct COOAtomicSumDescriptor { + std::uint32_t conn_size; + std::uint32_t post_size; + }; + + // homogeneous + void gpu_coo_atomic_sum_homo_f32_i32(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len); + void gpu_coo_atomic_sum_homo_f32_i64(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len); + void gpu_coo_atomic_sum_homo_f64_i32(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len); + void gpu_coo_atomic_sum_homo_f64_i64(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len); + // heterogeneous + void gpu_coo_atomic_sum_heter_f32_i32(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len); + void gpu_coo_atomic_sum_heter_f32_i64(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len); + void gpu_coo_atomic_sum_heter_f64_i32(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len); + void 
gpu_coo_atomic_sum_heter_f64_i64(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len); + + // descriptors + pybind11::bytes build_coo_atomic_sum_descriptor(std::uint32_t conn_size, std::uint32_t post_size); + +} // namespace brainpy_lib + +#endif \ No newline at end of file diff --git a/extensions/lib/gpu_event_sum.cu b/extensions/lib/gpu_event_sum.cu new file mode 100644 index 000000000..3e5fd43aa --- /dev/null +++ b/extensions/lib/gpu_event_sum.cu @@ -0,0 +1,383 @@ +// This file contains the GPU implementation of our op. It's a pretty typical CUDA kernel +// and I make no promises about the quality of the code or the choices made therein, but +// it should get the point across. + +#include "gpu_event_sum.h" + +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 +#else +__device__ double atomicAdd(double* address, double val) +{ + unsigned long long int* address_as_ull = (unsigned long long int*)address; + unsigned long long int old = *address_as_ull, assumed; + do { + assumed = old; + old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __longlong_as_double(old); +} +#endif + + +namespace brainpy_lib { + + namespace { + + // "event_sum_homo" operator // + // This function launches "num_of_pre_neuron" threads to + // update the "result" (in global memory) + template + __global__ void _csr_event_sum_homo_kernel( + const std::uint32_t size, + const bool *events, + const I *indices, + const I *indptr, + const F &value, + F *result + ) { + for (std::uint32_t i = blockIdx.x * blockDim.x + threadIdx.x; + i < size; i += blockDim.x * gridDim.x) { + if (events[i]) { + for (I j = indptr[i]; j < indptr[i + 1]; ++j) { + atomicAdd(&result[indices[j]], value); + } + } + } + } + + template + inline void gpu_csr_event_sum_homo(cudaStream_t stream, + void **buffers, + const char *opaque, + std::size_t opaque_len) { + // size + const CSREventSumDescriptor &d = *UnpackDescriptor(opaque, opaque_len); + const std::uint32_t pre_size = d.pre_size; + const std::uint32_t post_size = d.post_size; + + // input and output data + const bool *events = reinterpret_cast(buffers[0]); + const I *indices = reinterpret_cast(buffers[1]); + const I *indptr = reinterpret_cast(buffers[2]); + const F *weights = reinterpret_cast(buffers[3]); + F *result = reinterpret_cast(buffers[4]); + + // call kernel + const int block_dim = 512; + const int grid_dim = (pre_size + block_dim - 1) / block_dim; + cudaMemset(result, 0, sizeof(F) * post_size); + _csr_event_sum_homo_kernel<<>>( + pre_size, events, indices, indptr, weights[0], result); + ThrowIfError(cudaGetLastError()); + } + + template + __global__ void _csr_event_sum_heter_kernel( + const std::uint32_t size, + const bool *events, + const I *indices, + const I *indptr, + const F *values, + F *result + ) { + for (std::uint32_t i = blockIdx.x * blockDim.x + threadIdx.x; + i < size; i += blockDim.x * gridDim.x) { + if (events[i]) { + for (I j = indptr[i]; j < indptr[i + 1]; ++j) { + atomicAdd(&result[indices[j]], values[j]); + } + } + } + } + + template + inline void gpu_csr_event_sum_heter(cudaStream_t stream, + void **buffers, + const char *opaque, + std::size_t opaque_len) { + // size + const CSREventSumDescriptor &d = *UnpackDescriptor(opaque, opaque_len); + const std::uint32_t pre_size = d.pre_size; + const std::uint32_t post_size = d.post_size; + + // input and output data + const bool 
*events = reinterpret_cast(buffers[0]); + const I *indices = reinterpret_cast(buffers[1]); + const I *indptr = reinterpret_cast(buffers[2]); + const F *values = reinterpret_cast(buffers[3]); + F *result = reinterpret_cast(buffers[4]); + + // call kernel + const int block_dim = 512; + const int grid_dim = (pre_size + block_dim - 1) / block_dim; + cudaMemset(result, 0, sizeof(F) * post_size); + _csr_event_sum_heter_kernel<<>>( + pre_size, events, indices, indptr, values, result); + ThrowIfError(cudaGetLastError()); + } + + +// "event_sum2" operator // + template + __global__ void _coo_event_sum_homo_kernel( + const std::uint32_t size, + const bool *events, + const I *pre_ids, + const I *post_ids, + const F &value, + F *result + ) { + for (std::uint32_t i = blockIdx.x * blockDim.x + threadIdx.x; + i < size; i += blockDim.x * gridDim.x) { + if (events[pre_ids[i]]) { + atomicAdd(&result[post_ids[i]], value); + } + } + } + + template + inline void gpu_coo_event_sum_homo(cudaStream_t stream, + void **buffers, + const char *opaque, + std::size_t opaque_len) { + // size + const COOEventSumDescriptor &d = *UnpackDescriptor(opaque, opaque_len); + const std::uint32_t conn_size = d.conn_size; + const std::uint32_t post_size = d.post_size; + + // input and output data + const bool *events = reinterpret_cast(buffers[0]); + const I *pre_ids = reinterpret_cast(buffers[1]); + const I *post_ids = reinterpret_cast(buffers[2]); + const F *weights = reinterpret_cast(buffers[3]); + F *result = reinterpret_cast(buffers[4]); + + // call kernel + const int block_dim = 512; + const int grid_dim = (conn_size + block_dim - 1) / block_dim; + cudaMemset(result, 0, sizeof(F) * post_size); + _coo_event_sum_homo_kernel<<>>( + conn_size, events, pre_ids, post_ids, weights[0], result); + ThrowIfError(cudaGetLastError()); + } + + template + __global__ void _coo_event_sum_heter_kernel( + const std::uint32_t size, + const bool *events, + const I *pre_ids, + const I *post_ids, + const F *values, + F *result + ) { + for (std::uint32_t i = blockIdx.x * blockDim.x + threadIdx.x; + i < size; i += blockDim.x * gridDim.x) { + if (events[pre_ids[i]]) { + atomicAdd(&result[post_ids[i]], values[i]); + } + } + } + + template + inline void gpu_coo_event_sum_heter(cudaStream_t stream, + void **buffers, + const char *opaque, + std::size_t opaque_len) { + // size + const COOEventSumDescriptor &d = *UnpackDescriptor(opaque, opaque_len); + const std::uint32_t conn_size = d.conn_size; + const std::uint32_t post_size = d.post_size; + + // iput and output data + const bool *events = reinterpret_cast(buffers[0]); + const I *pre_ids = reinterpret_cast(buffers[1]); + const I *post_ids = reinterpret_cast(buffers[2]); + const F *values = reinterpret_cast(buffers[3]); + F *result = reinterpret_cast(buffers[4]); + + // call kernel + const int block_dim = 512; + const int grid_dim = (conn_size + block_dim - 1) / block_dim; + cudaMemset(result, 0, sizeof(F) * post_size); + _coo_event_sum_heter_kernel < F, I ><<>>( + conn_size, events, pre_ids, post_ids, values, result); + ThrowIfError(cudaGetLastError()); + } + + + + + // The third method to make "event_sum" // + // This method is inspired by GeNN codes. 
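+        // collect_spike_info: compact the indices of spiking pre-synaptic neurons
+        // into the global "event_ids" array so that later kernels only touch spikes.
+        //   1. Each thread whose events[id] is true appends "id" to a per-block
+        //      shared-memory buffer (shSpk) via a block-local counter (shSpkCount).
+        //   2. Thread 0 reserves a contiguous slot range in the output with one
+        //      global atomicAdd on event_num[0] (the running event count).
+        //   3. The first shSpkCount threads copy the buffered ids into event_ids.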
+ + __global__ void collect_spike_info(const bool *events, + const std::uint32_t pre_size, + unsigned int *event_ids, + unsigned int *event_num) { + const unsigned int id = blockDim.x * blockIdx.x + threadIdx.x; + __shared__ unsigned int shSpk[64]; + __shared__ unsigned int shPosSpk; + __shared__ unsigned int shSpkCount; + if (threadIdx.x == 0) { + shSpkCount = 0; + } + __syncthreads(); + + if (id < pre_size) { + if (events[id]) { + const unsigned int spkIdx = atomicAdd(&shSpkCount, 1); + shSpk[spkIdx] = id; + } + __syncthreads(); + + if (threadIdx.x == 0) { + if (shSpkCount > 0) { + shPosSpk = atomicAdd(&event_num[0], shSpkCount); + } + } + __syncthreads(); + + if (threadIdx.x < shSpkCount) { + const unsigned int n = shSpk[threadIdx.x]; + event_ids[shPosSpk + threadIdx.x] = n; + } + } + } + + + } // namespace + + + // homogenous event sum 1 + pybind11::bytes build_csr_event_sum_descriptor(std::uint32_t pre_size, + std::uint32_t post_size) { + return PackDescriptor(CSREventSumDescriptor{pre_size, post_size}); + } + + + void gpu_csr_event_sum_homo_f32_i32(cudaStream_t stream, + void **buffers, + const char *opaque, + std::size_t opaque_len) { + gpu_csr_event_sum_homo(stream, buffers, opaque, opaque_len); + } + + void gpu_csr_event_sum_homo_f32_i64(cudaStream_t stream, + void **buffers, + const char *opaque, + std::size_t opaque_len) { + gpu_csr_event_sum_homo(stream, buffers, opaque, opaque_len); + } + + void gpu_csr_event_sum_homo_f64_i32(cudaStream_t stream, + void **buffers, + const char *opaque, + std::size_t opaque_len) { + gpu_csr_event_sum_homo(stream, buffers, opaque, opaque_len); + } + + void gpu_csr_event_sum_homo_f64_i64(cudaStream_t stream, + void **buffers, + const char *opaque, + std::size_t opaque_len) { + gpu_csr_event_sum_homo(stream, buffers, opaque, opaque_len); + } + + + // heterogeneous event sum 1 + void gpu_csr_event_sum_heter_f32_i32(cudaStream_t stream, + void **buffers, + const char *opaque, + std::size_t opaque_len) { + gpu_csr_event_sum_heter(stream, buffers, opaque, opaque_len); + } + + void gpu_csr_event_sum_heter_f32_i64(cudaStream_t stream, + void **buffers, + const char *opaque, + std::size_t opaque_len) { + gpu_csr_event_sum_heter(stream, buffers, opaque, opaque_len); + } + + void gpu_csr_event_sum_heter_f64_i32(cudaStream_t stream, + void **buffers, + const char *opaque, + std::size_t opaque_len) { + gpu_csr_event_sum_heter(stream, buffers, opaque, opaque_len); + } + + void gpu_csr_event_sum_heter_f64_i64(cudaStream_t stream, + void **buffers, + const char *opaque, + std::size_t opaque_len) { + gpu_csr_event_sum_heter(stream, buffers, opaque, opaque_len); + } + + + + // homogenous event sum 2 + pybind11::bytes build_coo_event_sum_descriptor(std::uint32_t conn_size, + std::uint32_t post_size) { + return PackDescriptor(COOEventSumDescriptor{conn_size, post_size}); + } + + void gpu_coo_event_sum_homo_f32_i32(cudaStream_t stream, + void **buffers, + const char *opaque, + std::size_t opaque_len) { + gpu_coo_event_sum_homo(stream, buffers, opaque, opaque_len); + } + + void gpu_coo_event_sum_homo_f32_i64(cudaStream_t stream, + void **buffers, + const char *opaque, + std::size_t opaque_len) { + gpu_coo_event_sum_homo(stream, buffers, opaque, opaque_len); + } + + void gpu_coo_event_sum_homo_f64_i32(cudaStream_t stream, + void **buffers, + const char *opaque, + std::size_t opaque_len) { + gpu_coo_event_sum_homo(stream, buffers, opaque, opaque_len); + } + + void gpu_coo_event_sum_homo_f64_i64(cudaStream_t stream, + void **buffers, + const char *opaque, + std::size_t 
opaque_len) { + gpu_coo_event_sum_homo(stream, buffers, opaque, opaque_len); + } + + // heterogeneous event sum 2 + void gpu_coo_event_sum_heter_f32_i32(cudaStream_t stream, + void **buffers, + const char *opaque, + std::size_t opaque_len) { + gpu_coo_event_sum_heter(stream, buffers, opaque, opaque_len); + } + + void gpu_coo_event_sum_heter_f32_i64(cudaStream_t stream, + void **buffers, + const char *opaque, + std::size_t opaque_len) { + gpu_coo_event_sum_heter(stream, buffers, opaque, opaque_len); + } + + void gpu_coo_event_sum_heter_f64_i32(cudaStream_t stream, + void **buffers, + const char *opaque, + std::size_t opaque_len) { + gpu_coo_event_sum_heter(stream, buffers, opaque, opaque_len); + } + + void gpu_coo_event_sum_heter_f64_i64(cudaStream_t stream, + void **buffers, + const char *opaque, + std::size_t opaque_len) { + gpu_coo_event_sum_heter(stream, buffers, opaque, opaque_len); + } + + +} // namespace brainpylib diff --git a/extensions/lib/gpu_event_sum.h b/extensions/lib/gpu_event_sum.h new file mode 100644 index 000000000..aee51bc18 --- /dev/null +++ b/extensions/lib/gpu_event_sum.h @@ -0,0 +1,67 @@ +#ifndef _BRAINPY_EVENT_SUM_KERNELS_H_ +#define _BRAINPY_EVENT_SUM_KERNELS_H_ + +#include +#include +#include "pybind11_kernel_helpers.h" +#include "kernel_helpers_gpu.h" + +namespace brainpy_lib { + struct CSREventSumDescriptor { + std::uint32_t pre_size; + std::uint32_t post_size; + }; + + pybind11::bytes build_csr_event_sum_descriptor(std::uint32_t pre_size, std::uint32_t post_size); + + // homogeneous + void + gpu_csr_event_sum_homo_f32_i32(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len); + void + gpu_csr_event_sum_homo_f32_i64(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len); + void + gpu_csr_event_sum_homo_f64_i32(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len); + void + gpu_csr_event_sum_homo_f64_i64(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len); + + // heterogeneous + + void + gpu_csr_event_sum_heter_f32_i32(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len); + void + gpu_csr_event_sum_heter_f32_i64(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len); + void + gpu_csr_event_sum_heter_f64_i32(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len); + void + gpu_csr_event_sum_heter_f64_i64(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len); + + struct COOEventSumDescriptor { + std::uint32_t conn_size; + std::uint32_t post_size; + }; + + pybind11::bytes build_coo_event_sum_descriptor(std::uint32_t conn_size, std::uint32_t post_size); + + // homogeneous + void + gpu_coo_event_sum_homo_f32_i32(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len); + void + gpu_coo_event_sum_homo_f32_i64(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len); + void + gpu_coo_event_sum_homo_f64_i32(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len); + void + gpu_coo_event_sum_homo_f64_i64(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len); + + // heterogeneous + void + gpu_coo_event_sum_heter_f32_i32(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len); + void + gpu_coo_event_sum_heter_f32_i64(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len); + void + 
gpu_coo_event_sum_heter_f64_i32(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len); + void + gpu_coo_event_sum_heter_f64_i64(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len); + +} // namespace brainpy_lib + +#endif \ No newline at end of file diff --git a/extensions/lib/gpu_ops.cc b/extensions/lib/gpu_ops.cc index 6894816c9..ad132d166 100644 --- a/extensions/lib/gpu_ops.cc +++ b/extensions/lib/gpu_ops.cc @@ -5,9 +5,9 @@ // custom call can be found in kernels.cc.cu. #include "pybind11_kernel_helpers.h" -#include "event_sum_gpu.h" -#include "atomic_sum_gpu.h" -#include "atomic_prod_gpu.h" +#include "gpu_event_sum.h" +#include "gpu_atomic_sum.h" +#include "gpu_atomic_prod.h" using namespace brainpy_lib; @@ -15,49 +15,49 @@ namespace { pybind11::dict Registrations() { pybind11::dict dict; - // homogeneous event_sum - dict["gpu_event_sum_homo_f32_i32"] = EncapsulateFunction(gpu_event_sum_homo_f32_i32); - dict["gpu_event_sum_homo_f32_i64"] = EncapsulateFunction(gpu_event_sum_homo_f32_i64); - dict["gpu_event_sum_homo_f64_i32"] = EncapsulateFunction(gpu_event_sum_homo_f64_i32); - dict["gpu_event_sum_homo_f64_i64"] = EncapsulateFunction(gpu_event_sum_homo_f64_i64); - // heterogeneous event_sum - dict["gpu_event_sum_heter_f32_i32"] = EncapsulateFunction(gpu_event_sum_heter_f32_i32); - dict["gpu_event_sum_heter_f32_i64"] = EncapsulateFunction(gpu_event_sum_heter_f32_i64); - dict["gpu_event_sum_heter_f64_i32"] = EncapsulateFunction(gpu_event_sum_heter_f64_i32); - dict["gpu_event_sum_heter_f64_i64"] = EncapsulateFunction(gpu_event_sum_heter_f64_i64); + // homogeneous csr event_sum + dict["gpu_csr_event_sum_homo_f32_i32"] = EncapsulateFunction(gpu_csr_event_sum_homo_f32_i32); + dict["gpu_csr_event_sum_homo_f32_i64"] = EncapsulateFunction(gpu_csr_event_sum_homo_f32_i64); + dict["gpu_csr_event_sum_homo_f64_i32"] = EncapsulateFunction(gpu_csr_event_sum_homo_f64_i32); + dict["gpu_csr_event_sum_homo_f64_i64"] = EncapsulateFunction(gpu_csr_event_sum_homo_f64_i64); + // heterogeneous csr event_sum + dict["gpu_csr_event_sum_heter_f32_i32"] = EncapsulateFunction(gpu_csr_event_sum_heter_f32_i32); + dict["gpu_csr_event_sum_heter_f32_i64"] = EncapsulateFunction(gpu_csr_event_sum_heter_f32_i64); + dict["gpu_csr_event_sum_heter_f64_i32"] = EncapsulateFunction(gpu_csr_event_sum_heter_f64_i32); + dict["gpu_csr_event_sum_heter_f64_i64"] = EncapsulateFunction(gpu_csr_event_sum_heter_f64_i64); - // homogeneous event_sum2 - dict["gpu_event_sum2_homo_f32_i32"] = EncapsulateFunction(gpu_event_sum2_homo_f32_i32); - dict["gpu_event_sum2_homo_f32_i64"] = EncapsulateFunction(gpu_event_sum2_homo_f32_i64); - dict["gpu_event_sum2_homo_f64_i32"] = EncapsulateFunction(gpu_event_sum2_homo_f64_i32); - dict["gpu_event_sum2_homo_f64_i64"] = EncapsulateFunction(gpu_event_sum2_homo_f64_i64); - // heterogeneous event_sum2 - dict["gpu_event_sum2_heter_f32_i32"] = EncapsulateFunction(gpu_event_sum2_heter_f32_i32); - dict["gpu_event_sum2_heter_f32_i64"] = EncapsulateFunction(gpu_event_sum2_heter_f32_i64); - dict["gpu_event_sum2_heter_f64_i32"] = EncapsulateFunction(gpu_event_sum2_heter_f64_i32); - dict["gpu_event_sum2_heter_f64_i64"] = EncapsulateFunction(gpu_event_sum2_heter_f64_i64); + // homogeneous coo event_sum + dict["gpu_coo_event_sum_homo_f32_i32"] = EncapsulateFunction(gpu_coo_event_sum_homo_f32_i32); + dict["gpu_coo_event_sum_homo_f32_i64"] = EncapsulateFunction(gpu_coo_event_sum_homo_f32_i64); + dict["gpu_coo_event_sum_homo_f64_i32"] = 
EncapsulateFunction(gpu_coo_event_sum_homo_f64_i32); + dict["gpu_coo_event_sum_homo_f64_i64"] = EncapsulateFunction(gpu_coo_event_sum_homo_f64_i64); + // heterogeneous coo event_sum + dict["gpu_coo_event_sum_heter_f32_i32"] = EncapsulateFunction(gpu_coo_event_sum_heter_f32_i32); + dict["gpu_coo_event_sum_heter_f32_i64"] = EncapsulateFunction(gpu_coo_event_sum_heter_f32_i64); + dict["gpu_coo_event_sum_heter_f64_i32"] = EncapsulateFunction(gpu_coo_event_sum_heter_f64_i32); + dict["gpu_coo_event_sum_heter_f64_i64"] = EncapsulateFunction(gpu_coo_event_sum_heter_f64_i64); // homogeneous atomic_sum - dict["gpu_atomic_sum_homo_f32_i32"] = EncapsulateFunction(gpu_atomic_sum_homo_f32_i32); - dict["gpu_atomic_sum_homo_f32_i64"] = EncapsulateFunction(gpu_atomic_sum_homo_f32_i64); - dict["gpu_atomic_sum_homo_f64_i32"] = EncapsulateFunction(gpu_atomic_sum_homo_f64_i32); - dict["gpu_atomic_sum_homo_f64_i64"] = EncapsulateFunction(gpu_atomic_sum_homo_f64_i64); + dict["gpu_coo_atomic_sum_homo_f32_i32"] = EncapsulateFunction(gpu_coo_atomic_sum_homo_f32_i32); + dict["gpu_coo_atomic_sum_homo_f32_i64"] = EncapsulateFunction(gpu_coo_atomic_sum_homo_f32_i64); + dict["gpu_coo_atomic_sum_homo_f64_i32"] = EncapsulateFunction(gpu_coo_atomic_sum_homo_f64_i32); + dict["gpu_coo_atomic_sum_homo_f64_i64"] = EncapsulateFunction(gpu_coo_atomic_sum_homo_f64_i64); // heterogeneous atomic_sum - dict["gpu_atomic_sum_heter_f32_i32"] = EncapsulateFunction(gpu_atomic_sum_heter_f32_i32); - dict["gpu_atomic_sum_heter_f32_i64"] = EncapsulateFunction(gpu_atomic_sum_heter_f32_i64); - dict["gpu_atomic_sum_heter_f64_i32"] = EncapsulateFunction(gpu_atomic_sum_heter_f64_i32); - dict["gpu_atomic_sum_heter_f64_i64"] = EncapsulateFunction(gpu_atomic_sum_heter_f64_i64); + dict["gpu_coo_atomic_sum_heter_f32_i32"] = EncapsulateFunction(gpu_coo_atomic_sum_heter_f32_i32); + dict["gpu_coo_atomic_sum_heter_f32_i64"] = EncapsulateFunction(gpu_coo_atomic_sum_heter_f32_i64); + dict["gpu_coo_atomic_sum_heter_f64_i32"] = EncapsulateFunction(gpu_coo_atomic_sum_heter_f64_i32); + dict["gpu_coo_atomic_sum_heter_f64_i64"] = EncapsulateFunction(gpu_coo_atomic_sum_heter_f64_i64); // homogeneous atomic_prod - dict["gpu_atomic_prod_homo_f32_i32"] = EncapsulateFunction(gpu_atomic_prod_homo_f32_i32); - dict["gpu_atomic_prod_homo_f32_i64"] = EncapsulateFunction(gpu_atomic_prod_homo_f32_i64); - dict["gpu_atomic_prod_homo_f64_i32"] = EncapsulateFunction(gpu_atomic_prod_homo_f64_i32); - dict["gpu_atomic_prod_homo_f64_i64"] = EncapsulateFunction(gpu_atomic_prod_homo_f64_i64); + dict["gpu_coo_atomic_prod_homo_f32_i32"] = EncapsulateFunction(gpu_coo_atomic_prod_homo_f32_i32); + dict["gpu_coo_atomic_prod_homo_f32_i64"] = EncapsulateFunction(gpu_coo_atomic_prod_homo_f32_i64); + dict["gpu_coo_atomic_prod_homo_f64_i32"] = EncapsulateFunction(gpu_coo_atomic_prod_homo_f64_i32); + dict["gpu_coo_atomic_prod_homo_f64_i64"] = EncapsulateFunction(gpu_coo_atomic_prod_homo_f64_i64); // heterogeneous atomic_prod - dict["gpu_atomic_prod_heter_f32_i32"] = EncapsulateFunction(gpu_atomic_prod_heter_f32_i32); - dict["gpu_atomic_prod_heter_f32_i64"] = EncapsulateFunction(gpu_atomic_prod_heter_f32_i64); - dict["gpu_atomic_prod_heter_f64_i32"] = EncapsulateFunction(gpu_atomic_prod_heter_f64_i32); - dict["gpu_atomic_prod_heter_f64_i64"] = EncapsulateFunction(gpu_atomic_prod_heter_f64_i64); + dict["gpu_coo_atomic_prod_heter_f32_i32"] = EncapsulateFunction(gpu_coo_atomic_prod_heter_f32_i32); + dict["gpu_coo_atomic_prod_heter_f32_i64"] = EncapsulateFunction(gpu_coo_atomic_prod_heter_f32_i64); + 
dict["gpu_coo_atomic_prod_heter_f64_i32"] = EncapsulateFunction(gpu_coo_atomic_prod_heter_f64_i32); + dict["gpu_coo_atomic_prod_heter_f64_i64"] = EncapsulateFunction(gpu_coo_atomic_prod_heter_f64_i64); return dict; } @@ -65,9 +65,9 @@ namespace { PYBIND11_MODULE(gpu_ops, m ) { m.def("registrations", &Registrations); - m.def("build_event_sum_descriptor", &build_event_sum_descriptor); - m.def("build_event_sum2_descriptor", &build_event_sum2_descriptor); - m.def("build_atomic_sum_descriptor", &build_atomic_sum_descriptor); - m.def("build_atomic_prod_descriptor", &build_atomic_prod_descriptor); + m.def("build_csr_event_sum_descriptor", &build_csr_event_sum_descriptor); + m.def("build_coo_event_sum_descriptor", &build_coo_event_sum_descriptor); + m.def("build_coo_atomic_sum_descriptor", &build_coo_atomic_sum_descriptor); + m.def("build_coo_atomic_prod_descriptor", &build_coo_atomic_prod_descriptor); } } // namespace diff --git a/extensions/run.sh b/extensions/run.sh index 5196b9ccb..8b5f5364a 100644 --- a/extensions/run.sh +++ b/extensions/run.sh @@ -1,4 +1,4 @@ rm -rf build pip uninstall brainpylib -y python setup_cuda.py bdist_wheel -pip install dist/brainpylib-0.0.3+cuda115-cp39-cp39-linux_x86_64.whl \ No newline at end of file +pip install dist/brainpylib* \ No newline at end of file diff --git a/extensions/setup_cuda.py b/extensions/setup_cuda.py index 30a2a46f2..74d356434 100644 --- a/extensions/setup_cuda.py +++ b/extensions/setup_cuda.py @@ -6,7 +6,10 @@ import subprocess import sys -import pybind11 +try: + import pybind11 +except ModuleNotFoundError: + raise ModuleNotFoundError('Please install pybind11 before installing brainpylib!') from setuptools import find_packages, setup, Extension from setuptools.command.build_ext import build_ext @@ -55,23 +58,17 @@ def build_extensions(self): print(" ".join(cmake_args)) os.makedirs(self.build_temp, exist_ok=True) - subprocess.check_call( - ["cmake", '-DCMAKE_CUDA_FLAGS="-arch=sm_61"', HERE] + cmake_args, cwd=self.build_temp - ) + subprocess.check_call(["cmake", '-DCMAKE_CUDA_FLAGS="-arch=sm_80"', HERE] + cmake_args, + cwd=self.build_temp) # Build all the extensions super().build_extensions() # Finally run install - subprocess.check_call(["cmake", "--build", ".", "--target", "install"], - cwd=self.build_temp) + subprocess.check_call(["cmake", "--build", ".", "--target", "install"], cwd=self.build_temp) def build_extension(self, ext): - # target_name = ext.name.split(".")[-1] - #subprocess.check_call( - # ["cmake", "."], cwd=self.build_temp) - subprocess.check_call(["cmake", "--build", ".", "--target", "gpu_ops"], - cwd=self.build_temp) + subprocess.check_call(["cmake", "--build", ".", "--target", "gpu_ops"], cwd=self.build_temp) # version control with open(os.path.join(HERE, 'brainpylib', '__init__.py'), 'r') as f: