In [4]:
import numpy as np
size = 8192 * 8192
array = np.random.random(size).astype(np.float32)
%timeit -n 1 -r 1 result = np.sort(array)

8.12 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [7]:
import cupy as cp
array_gpu = cp.asarray(array)
%timeit -n 7 -r 1 result_gpu = cp.sort(array_gpu)

134 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 7 loops each)


In [10]:
import cupy

# number of elements in each vector
size = 1024

# two random float32 input vectors, followed by a zero-filled output vector
# that shares their shape and dtype
a_gpu, b_gpu = (cupy.random.rand(size, dtype=cupy.float32) for _ in range(2))
c_gpu = cupy.zeros_like(a_gpu)

# CUDA vector_add: C[i] = A[i] + B[i] for every i in [0, size).
# Each thread handles one element, addressed by its global index, so the kernel
# gives the right answer for any number of blocks and any vector length.
vector_add_cuda_code = r'''
extern "C"
__global__ void vector_add(const float * A, const float * B, float * C, const int size)
{
    // Global element index: offset of this block plus the thread's position in it.
    int item = (blockIdx.x * blockDim.x) + threadIdx.x;

    // The grid is rounded up to whole blocks, so threads past the end of the
    // vectors must not read or write anything.
    if ( item < size )
    {
        C[item] = A[item] + B[item];
    }
}
'''
vector_add_gpu = cupy.RawKernel(vector_add_cuda_code, "vector_add")

import math

threads_per_block = 1024
grid_size = (int(math.ceil(size / threads_per_block)), 1, 1)
block_size = (threads_per_block, 1, 1)

%timeit -n 7 -r 1 vector_add_gpu(grid_size, block_size, (a_gpu, b_gpu, c_gpu, size))

163 µs ± 0 ns per loop (mean ± std. dev. of 1 run, 7 loops each)


In [12]:
import numpy

def vector_add(A, B, C, size):
    """Add the first ``size`` elements of ``A`` and ``B`` element by element.

    The sums are written into ``C`` in place, one index at a time, and the
    same ``C`` object is returned. Elements of ``C`` at index ``size`` and
    beyond are left untouched.
    """
    for idx in range(size):
        total = A[idx] + B[idx]
        C[idx] = total

    return C

a_cpu = cupy.asnumpy(a_gpu)
b_cpu = cupy.asnumpy(b_gpu)
c_cpu = numpy.zeros(size, dtype=numpy.float32)

%timeit -n 7 -r 1 vector_add(a_cpu, b_cpu, c_cpu, size)

239 µs ± 0 ns per loop (mean ± std. dev. of 1 run, 7 loops each)


In [None]:
# CUDA source for the optimize_pose kernel.
# NOTE(review): the body is still an element-wise add of A and B into C, the
# same operation as vector_add; presumably a placeholder, confirm the intended
# computation.
optimize_pose = r'''
extern "C"
__global__ void optimize_pose(const float * A, const float * B, float * C, const int dimension)
{
    // Global element index: offset of this block plus the thread's position in it.
    int item = (blockIdx.x * blockDim.x) + threadIdx.x;

    // Threads beyond the requested number of elements must not touch memory.
    if ( item < dimension )
    {
        C[item] = A[item] + B[item];
    }
}
'''