In [1]:
import numpy as np
import cupy as cp # import CuPy library

In [2]:
# cp.RawModule can either compile raw CUDA source or load a prebuilt CUDA module;
# it is the convenient choice when several kernels living in the same source
# need to be retrieved.

# CUDA source for both kernels. They are wrapped in extern "C" so that their
# names are not C++-mangled and can be looked up by plain name.
# In both kernels N is the number of elements in V, and each thread handles at
# most one output element (threads whose index is out of range do nothing).
loaded_from_source = r'''
  extern "C" {
  // S1[i] = V[i] + V[i+1]            for 0 <= i <= N-2
  // S1 must provide room for N-1 floats.
  __global__ void S1Vector(const float* V, float* S1, int N){
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if(tid < N - 1){
      S1[tid] = V[tid] + V[tid + 1];
    }
  }
  // S2[i-1] = (V[i-1] + V[i+1]) / 2   for 1 <= i <= N-2
  // The output index is shifted by one so that S2 starts at 0;
  // S2 must provide room for N-2 floats.
  __global__ void S2Vector(const float* V, float* S2, int N){
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if(tid > 0 && tid < N - 1){
      // 2.0f keeps the arithmetic in single precision; a bare 2.0 is a double
      // literal and would promote the whole expression to double and back.
      S2[tid - 1] = (V[tid + 1] + V[tid - 1]) / 2.0f;
    }
  }
}'''

# Wrap the CUDA source in a RawModule, then fetch a callable handle for each
# kernel by its name via get_function().
module = cp.RawModule(code=loaded_from_source)
S1Vector, S2Vector = (
    module.get_function(kernel_name) for kernel_name in ('S1Vector', 'S2Vector')
)

length = 11  # exclusive stop value handed to cp.arange -- NOT the element count
V = cp.arange(1.0, length, 1.1, dtype=cp.float32)  # input vector: 1.0, 2.1, 3.2, ...

# Take the element count from V itself instead of hard-coding it, so the output
# sizes and the kernel argument can never drift apart from the actual input.
N = V.size
S1 = cp.zeros(max(N - 1, 0), dtype=cp.float32)  # S1[i]   = V[i] + V[i+1]
S2 = cp.zeros(max(N - 2, 0), dtype=cp.float32)  # S2[i-1] = (V[i-1] + V[i+1]) / 2

# Launch configuration: enough blocks to give every index 0..N-1 its own thread.
# Surplus threads are harmless because both kernels bounds-check their index.
threads_per_block = 5
blocks_per_grid = max(1, (N + threads_per_block - 1) // threads_per_block)

# The kernels declare `int N` (32-bit). CuPy performs no argument type checking
# and passes a plain Python int as a 64-bit `long long`, so convert explicitly.
n_arg = np.int32(N)

S1Vector((blocks_per_grid,), (threads_per_block,), (V, S1, n_arg))  # (grid, block, args)
S2Vector((blocks_per_grid,), (threads_per_block,), (V, S2, n_arg))

# Sanity check: the kernels must agree with the equivalent array-slice expressions.
assert bool(cp.allclose(S1, V[:-1] + V[1:])), "S1Vector result mismatch"
assert bool(cp.allclose(S2, (V[2:] + V[:-2]) / 2)), "S2Vector result mismatch"

print(V)
print(S1)  # Vector S1
print(S2)  # Vector S2

[ 1.         2.1        3.2        4.3        5.4        6.5
  7.6000004  8.7        9.8       10.900001 ]
[ 3.1       5.3       7.5       9.700001 11.9      14.1      16.3
 18.5      20.7     ]
[2.1      3.2      4.3      5.4      6.5      7.6      8.700001 9.8     ]
