perf/blas: reduce overhead of calling blas
REFERENCES: #13
hobofan committed Dec 3, 2015
1 parent 44ea377 commit 8b7a7ae
Showing 6 changed files with 322 additions and 47 deletions.
8 changes: 8 additions & 0 deletions Cargo.toml
@@ -34,3 +34,11 @@ rand = "0.3"
default = []
dev = []
lint = ["clippy"]

[profile.bench]
opt-level = 3
debug = false
rpath = false
lto = false
debug-assertions = false
codegen-units = 1
14 changes: 14 additions & 0 deletions README.md
@@ -97,6 +97,20 @@ fn main() {
}
```

## Benchmarks

The following benchmarks compare Collenchyma's Native backend against calling rust-blas directly, highlighting the per-call overhead Collenchyma adds on top of the underlying BLAS implementation.

Operation | Collenchyma (Native backend) | rust-blas
-------------------------------------------- | ---------------------------- | ----------
1000x Dot product of two vectors of size 100 | 48,870 ns (+/- 499) | 15,226 ns (+/- 244)
100x Dot product of two vectors of size 1000 | 9,997 ns (+/- 215) | 6,920 ns (+/- 179)
10x Dot product of two vectors of size 10000 | 10,958 ns (+/- 377) | 10,333 ns (+/- 460)
5x Dot product of two vectors of size 20000 | 10,784 ns (+/- 2,338) | 10,533 ns (+/- 1,981)

The overhead of Collenchyma becomes negligible for operations on vectors larger than roughly 10,000-20,000 elements.
Reducing this overhead is a high priority, and [you can help!](https://github.com/autumnai/collenchyma/issues/13)
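To make the numbers concrete, the sketch below condenses the two call paths being timed, adapted from the shape of `benches/rblas_overhead.rs` in this commit. The function signatures mirror the benchmark helpers; the `extern crate` setup and import paths are assumptions, and error handling is elided.

```rust
extern crate collenchyma;
extern crate rblas;

use collenchyma::backend::Backend;
use collenchyma::frameworks::Native;
use collenchyma::shared_memory::SharedMemory;
use collenchyma::libraries::blas::IBlas;
use rblas::Dot;

// rust-blas path: a single call that dispatches straight to the BLAS routine.
fn dot_rblas(slice_a: &[f32], slice_b: &[f32]) -> f32 {
    Dot::dot(slice_a, slice_b)
}

// Collenchyma path: before reaching the same BLAS routine, the backend checks
// device placement and memory synchronization for every argument, which is
// where the per-call overhead in the table above comes from.
fn dot_collenchyma(backend: &Backend<Native>,
                   shared_a: &mut SharedMemory<f32>,
                   shared_b: &mut SharedMemory<f32>,
                   shared_res: &mut SharedMemory<f32>) {
    let _ = backend.dot(shared_a, shared_b, shared_res);
}
```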

## Contributing

Want to contribute? Awesome! We have
105 changes: 104 additions & 1 deletion benches/rblas_overhead.rs
@@ -62,6 +62,109 @@ fn bench_1000_dot_100_collenchyma_profile(b: &mut Bencher, backend: &Backend<Nat
});
}

#[bench]
fn bench_100_dot_1000_rblas(b: &mut Bencher) {
let mut rng = thread_rng();
let slice_a = rng.gen_iter::<f32>().take(1000).collect::<Vec<f32>>();
let slice_b = rng.gen_iter::<f32>().take(1000).collect::<Vec<f32>>();

b.iter(|| {
for _ in 0..100 {
let res = Dot::dot(&slice_a, &slice_b);
test::black_box(res);
}
});
}

#[bench]
fn bench_100_dot_1000_collenchyma(b: &mut Bencher) {
let mut rng = thread_rng();
let slice_a = rng.gen_iter::<f32>().take(1000).collect::<Vec<f32>>();
let slice_b = rng.gen_iter::<f32>().take(1000).collect::<Vec<f32>>();

let backend = backend();
let shared_a = &mut SharedMemory::<f32>::new(backend.device(), 1000);
let shared_b = &mut SharedMemory::<f32>::new(backend.device(), 1000);
let shared_res = &mut SharedMemory::<f32>::new(backend.device(), 1);
shared_a.get_mut(backend.device()).unwrap().as_mut_native().unwrap().as_mut_slice().clone_from_slice(&slice_a);
shared_b.get_mut(backend.device()).unwrap().as_mut_native().unwrap().as_mut_slice().clone_from_slice(&slice_b);
let _ = backend.dot(shared_a, shared_b, shared_res);
bench_100_dot_1000_collenchyma_profile(b, &backend, shared_a, shared_b, shared_res);
}

#[inline(never)]
fn bench_100_dot_1000_collenchyma_profile(b: &mut Bencher, backend: &Backend<Native>, shared_a: &mut SharedMemory<f32>, shared_b: &mut SharedMemory<f32>, shared_res: &mut SharedMemory<f32>) {
b.iter(|| {
for _ in 0..100 {
let _ = backend.dot(shared_a, shared_b, shared_res);
}
});
}

#[bench]
fn bench_50_dot_2000_collenchyma(b: &mut Bencher) {
let mut rng = thread_rng();
let slice_a = rng.gen_iter::<f32>().take(2000).collect::<Vec<f32>>();
let slice_b = rng.gen_iter::<f32>().take(2000).collect::<Vec<f32>>();

let backend = backend();
let shared_a = &mut SharedMemory::<f32>::new(backend.device(), 2000);
let shared_b = &mut SharedMemory::<f32>::new(backend.device(), 2000);
let shared_res = &mut SharedMemory::<f32>::new(backend.device(), 1);
shared_a.get_mut(backend.device()).unwrap().as_mut_native().unwrap().as_mut_slice().clone_from_slice(&slice_a);
shared_b.get_mut(backend.device()).unwrap().as_mut_native().unwrap().as_mut_slice().clone_from_slice(&slice_b);
let _ = backend.dot(shared_a, shared_b, shared_res);
bench_50_dot_2000_collenchyma_profile(b, &backend, shared_a, shared_b, shared_res);
}

#[inline(never)]
fn bench_50_dot_2000_collenchyma_profile(b: &mut Bencher, backend: &Backend<Native>, shared_a: &mut SharedMemory<f32>, shared_b: &mut SharedMemory<f32>, shared_res: &mut SharedMemory<f32>) {
b.iter(|| {
for _ in 0..50 {
let _ = backend.dot(shared_a, shared_b, shared_res);
}
});
}

#[bench]
fn bench_10_dot_10000_rblas(b: &mut Bencher) {
let mut rng = thread_rng();
let slice_a = rng.gen_iter::<f32>().take(10000).collect::<Vec<f32>>();
let slice_b = rng.gen_iter::<f32>().take(10000).collect::<Vec<f32>>();

b.iter(|| {
for _ in 0..10 {
let res = Dot::dot(&slice_a, &slice_b);
test::black_box(res);
}
});
}

#[bench]
fn bench_10_dot_10000_collenchyma(b: &mut Bencher) {
let mut rng = thread_rng();
let slice_a = rng.gen_iter::<f32>().take(10000).collect::<Vec<f32>>();
let slice_b = rng.gen_iter::<f32>().take(10000).collect::<Vec<f32>>();

let backend = backend();
let shared_a = &mut SharedMemory::<f32>::new(backend.device(), 10000);
let shared_b = &mut SharedMemory::<f32>::new(backend.device(), 10000);
let shared_res = &mut SharedMemory::<f32>::new(backend.device(), 1);
shared_a.get_mut(backend.device()).unwrap().as_mut_native().unwrap().as_mut_slice().clone_from_slice(&slice_a);
shared_b.get_mut(backend.device()).unwrap().as_mut_native().unwrap().as_mut_slice().clone_from_slice(&slice_b);
let _ = backend.dot(shared_a, shared_b, shared_res);
bench_10_dot_10000_collenchyma_profile(b, &backend, shared_a, shared_b, shared_res);
}

#[inline(never)]
fn bench_10_dot_10000_collenchyma_profile(b: &mut Bencher, backend: &Backend<Native>, shared_a: &mut SharedMemory<f32>, shared_b: &mut SharedMemory<f32>, shared_res: &mut SharedMemory<f32>) {
b.iter(|| {
for _ in 0..10 {
let _ = backend.dot(shared_a, shared_b, shared_res);
}
});
}

#[bench]
fn bench_5_dot_20000_rblas(b: &mut Bencher) {
let mut rng = thread_rng();
@@ -85,7 +85,7 @@ fn bench_5_dot_20000_collenchyma(b: &mut Bencher) {
let backend = backend();
let shared_a = &mut SharedMemory::<f32>::new(backend.device(), 20000);
let shared_b = &mut SharedMemory::<f32>::new(backend.device(), 20000);
-    let shared_res = &mut SharedMemory::<f32>::new(backend.device(), 20000);
+    let shared_res = &mut SharedMemory::<f32>::new(backend.device(), 1);
shared_a.get_mut(backend.device()).unwrap().as_mut_native().unwrap().as_mut_slice().clone_from_slice(&slice_a);
shared_b.get_mut(backend.device()).unwrap().as_mut_native().unwrap().as_mut_slice().clone_from_slice(&slice_b);
let _ = backend.dot(shared_a, shared_b, shared_res);
129 changes: 129 additions & 0 deletions src/backend.rs
@@ -44,6 +44,10 @@ use framework::IFramework;
use frameworks::{Native, OpenCL, Cuda};
use device::{IDevice, DeviceType};
use libraries::blas::IBlas;
use libraries::blas as bl;
use shared_memory::SharedMemory;
use libraries::blas::{IOperationAsum, IOperationAxpy, IOperationCopy, IOperationDot,
IOperationNrm2, IOperationScale, IOperationSwap};

#[derive(Debug, Clone)]
/// Defines the main and highest struct of Collenchyma.
@@ -129,9 +133,126 @@ impl IBlas<f32> for Backend<OpenCL> {
}
}

macro_rules! iblas_asum_for {
($t:ident, $b:ty) => (
fn asum(&self, x: &mut SharedMemory<$t>, result: &mut SharedMemory<$t>) -> Result<(), ::error::Error> {
match x.add_device(self.device()) { _ => try!(x.sync(self.device())) }
match result.add_device(self.device()) { _ => () }
Ok(try!(
<$b as IOperationAsum<$t>>::compute(&self,
try!(x.get(self.device()).ok_or(bl::Error::MissingArgument("Unable to resolve memory for `x`"))),
try!(result.get_mut(self.device()).ok_or(bl::Error::MissingArgument("Unable to resolve memory for `result`"))),
)
))
}
);
}

macro_rules! iblas_axpy_for {
($t:ident, $b:ty) => (
fn axpy(&self, a: &mut SharedMemory<$t>, x: &mut SharedMemory<$t>, y: &mut SharedMemory<$t>) -> Result<(), ::error::Error> {
match a.add_device(self.device()) { _ => try!(a.sync(self.device())) }
match x.add_device(self.device()) { _ => try!(x.sync(self.device())) }
match y.add_device(self.device()) { _ => try!(y.sync(self.device())) }
Ok(try!(
<$b as IOperationAxpy<$t>>::compute(&self,
try!(a.get(self.device()).ok_or(bl::Error::MissingArgument("Unable to resolve memory for `a`"))),
try!(x.get(self.device()).ok_or(bl::Error::MissingArgument("Unable to resolve memory for `x`"))),
try!(y.get_mut(self.device()).ok_or(bl::Error::MissingArgument("Unable to resolve memory for `y`"))),
)
))
}
);
}

macro_rules! iblas_copy_for {
($t:ident, $b:ty) => (
fn copy(&self, x: &mut SharedMemory<$t>, y: &mut SharedMemory<$t>) -> Result<(), ::error::Error> {
match x.add_device(self.device()) { _ => try!(x.sync(self.device())) }
match y.add_device(self.device()) { _ => () }
Ok(try!(
<$b as IOperationCopy<$t>>::compute(&self,
try!(x.get(self.device()).ok_or(bl::Error::MissingArgument("Unable to resolve memory for `x`"))),
try!(y.get_mut(self.device()).ok_or(bl::Error::MissingArgument("Unable to resolve memory for `y`"))),
)
))
}
);
}

macro_rules! iblas_dot_for {
($t:ident, $b:ty) => (
fn dot(&self, x: &mut SharedMemory<$t>, y: &mut SharedMemory<$t>, result: &mut SharedMemory<$t>) -> Result<(), Error> {
match x.add_device(self.device()) { _ => try!(x.sync(self.device())) }
match y.add_device(self.device()) { _ => try!(y.sync(self.device())) }
match result.add_device(self.device()) { _ => () }
Ok(try!(
<$b as IOperationDot<$t>>::compute(&self,
try!(x.get(self.device()).ok_or(bl::Error::MissingArgument("Unable to resolve memory for `x`"))),
try!(y.get(self.device()).ok_or(bl::Error::MissingArgument("Unable to resolve memory for `y`"))),
try!(result.get_mut(self.device()).ok_or(bl::Error::MissingArgument("Unable to resolve memory for `result`")))
)
))
}
);
}

macro_rules! iblas_nrm2_for {
($t:ident, $b:ty) => (
fn nrm2(&self, x: &mut SharedMemory<$t>, result: &mut SharedMemory<$t>) -> Result<(), ::error::Error> {
match x.add_device(self.device()) { _ => try!(x.sync(self.device())) }
match result.add_device(self.device()) { _ => () }
Ok(try!(
<$b as IOperationNrm2<$t>>::compute(&self,
try!(x.get(self.device()).ok_or(bl::Error::MissingArgument("Unable to resolve memory for `x`"))),
try!(result.get_mut(self.device()).ok_or(bl::Error::MissingArgument("Unable to resolve memory for `result`"))),
)
))
}
);
}

macro_rules! iblas_scale_for {
($t:ident, $b:ty) => (
fn scale(&self, a: &mut SharedMemory<$t>, x: &mut SharedMemory<$t>) -> Result<(), ::error::Error> {
match a.add_device(self.device()) { _ => try!(a.sync(self.device())) }
match x.add_device(self.device()) { _ => try!(x.sync(self.device())) }
Ok(try!(
<$b as IOperationScale<$t>>::compute(&self,
try!(a.get(self.device()).ok_or(bl::Error::MissingArgument("Unable to resolve memory for `a`"))),
try!(x.get_mut(self.device()).ok_or(bl::Error::MissingArgument("Unable to resolve memory for `x`"))),
)
))
}
);
}

macro_rules! iblas_swap_for {
($t:ident, $b:ty) => (
fn swap(&self, x: &mut SharedMemory<$t>, y: &mut SharedMemory<$t>) -> Result<(), ::error::Error> {
match x.add_device(self.device()) { _ => try!(x.sync(self.device())) }
match y.add_device(self.device()) { _ => try!(y.sync(self.device())) }
Ok(try!(
<$b as IOperationSwap<$t>>::compute(&self,
try!(x.get_mut(self.device()).ok_or(bl::Error::MissingArgument("Unable to resolve memory for `x`"))),
try!(y.get_mut(self.device()).ok_or(bl::Error::MissingArgument("Unable to resolve memory for `y`"))),
)
))
}
);
}

impl IBlas<f32> for Backend<Native> {
type B = ::frameworks::native::Binary;

iblas_asum_for!(f32, Backend<Native>);
iblas_axpy_for!(f32, Backend<Native>);
iblas_copy_for!(f32, Backend<Native>);
iblas_dot_for!(f32, Backend<Native>);
iblas_nrm2_for!(f32, Backend<Native>);
iblas_scale_for!(f32, Backend<Native>);
iblas_swap_for!(f32, Backend<Native>);

fn binary(&self) -> &Self::B {
self.binary()
}
@@ -144,6 +265,14 @@ impl IBlas<f32> for Backend<Native> {
impl IBlas<f64> for Backend<Native> {
type B = ::frameworks::native::Binary;

iblas_asum_for!(f64, Backend<Native>);
iblas_axpy_for!(f64, Backend<Native>);
iblas_copy_for!(f64, Backend<Native>);
iblas_dot_for!(f64, Backend<Native>);
iblas_nrm2_for!(f64, Backend<Native>);
iblas_scale_for!(f64, Backend<Native>);
iblas_swap_for!(f64, Backend<Native>);

fn binary(&self) -> &Self::B {
self.binary()
}
