# Not

In [None]:
import cupy as cp

In [None]:
import numpy as np

In [None]:
# Host-side test data: a 2000x2000 array of random integers drawn from [0, 255).
array_cpu = np.random.randint(low=0, high=255, size=(2000, 2000))
size_in_mb = array_cpu.nbytes / 1e6  # nbytes counts bytes; dividing by 1e6 yields megabytes
print(array_cpu, end="\n\n")
print(f"bytes: {size_in_mb}")

[[188 253 163 ... 196 210 229]
 [253 100  14 ... 107 139  17]
 [135 191  79 ...  14  85  40]
 ...
 [176 207  58 ... 128 145  21]
 [120 103  29 ...  80 171  64]
 [200  42 246 ...  98  28 139]]

bytes: 32.0


In [None]:
array_gpu = cp.asarray(array_cpu)
print(array_gpu)
print()
print(f"bytes: {array_gpu.nbytes / 1e6}")

[[188 253 163 ... 196 210 229]
 [253 100  14 ... 107 139  17]
 [135 191  79 ...  14  85  40]
 ...
 [176 207  58 ... 128 145  21]
 [120 103  29 ...  80 171  64]
 [200  42 246 ...  98  28 139]]

bytes: 32.0


In [None]:
%%timeit 
cp.asarray(array_cpu)

100 loops, best of 5: 5.69 ms per loop


In [None]:
type(array_gpu)

cupy._core.core.ndarray

In [None]:
from scipy import fft

In [None]:
%%timeit 
fft.fftn(array_cpu)

10 loops, best of 5: 79.1 ms per loop


In [None]:
from cupyx.scipy import fft as fft_gpu

In [None]:
%%timeit
fft_gpu.fftn(array_gpu)

  cache = get_plan_cache()


The slowest run took 1604.12 times longer than the fastest. This could mean that an intermediate result is being cached.
1 loop, best of 5: 514 µs per loop


In [None]:
fft_cpu = fft.fftn(array_cpu)
fft_sent_back = cp.asnumpy(fft_gpu.fftn(array_gpu))

np.allclose(fft_sent_back, fft_cpu)

True

In [None]:
cp.random.randint(0, 255, size=(4000,4000))

array([[143,  93,  62, ..., 147, 183,  80],
       [103, 252,  77, ..., 116,  18,  95],
       [ 58, 106, 124, ...,  56, 204, 217],
       ...,
       [108,  76,  75, ...,  75,  77, 230],
       [ 43, 107, 116, ..., 168, 216,  94],
       [ 12, 106, 199, ...,  43, 239, 137]])

In [None]:
from numba import cuda, float32

In [None]:
cuda.detect()

Found 1 CUDA devices
id 0            b'Tesla K80'                              [SUPPORTED]
                      compute capability: 3.7
                           pci device id: 4
                              pci bus id: 0
Summary:
	1/1 devices are supported


True

In [None]:
x_cpu = np.random.randint(0, 10, size=(2000,2000))
print(x_cpu)
print(cp.asarray(x_cpu))
d_array = cuda.to_device(x_cpu)
d_array

[[1 1 1 ... 6 7 1]
 [2 9 9 ... 2 4 4]
 [2 9 9 ... 9 4 4]
 ...
 [5 6 6 ... 2 9 5]
 [9 3 2 ... 6 6 6]
 [6 9 5 ... 6 0 1]]
[[1 1 1 ... 6 7 1]
 [2 9 9 ... 2 4 4]
 [2 9 9 ... 9 4 4]
 ...
 [5 6 6 ... 2 9 5]
 [9 3 2 ... 6 6 6]
 [6 9 5 ... 6 0 1]]


<numba.cuda.cudadrv.devicearray.DeviceNDArray at 0x7fb8a1c949d0>

In [None]:
cp.asarray(d_array)

array([[1, 1, 1, ..., 6, 7, 1],
       [2, 9, 9, ..., 2, 4, 4],
       [2, 9, 9, ..., 9, 4, 4],
       ...,
       [5, 6, 6, ..., 2, 9, 5],
       [9, 3, 2, ..., 6, 6, 6],
       [6, 9, 5, ..., 6, 0, 1]])

In [None]:
d_array.copy_to_host()

array([[1, 1, 1, ..., 6, 7, 1],
       [2, 9, 9, ..., 2, 4, 4],
       [2, 9, 9, ..., 9, 4, 4],
       ...,
       [5, 6, 6, ..., 2, 9, 5],
       [9, 3, 2, ..., 6, 6, 6],
       [6, 9, 5, ..., 6, 0, 1]])

In [None]:
@cuda.jit
def add_one_kernel(A):
    """CUDA kernel: add one, in place, to every element of the 2-D array ``A``."""
    ## absolute (row, column) position of this thread in the 2-D launch grid
    r, c = cuda.grid(2)

    ## the grid may be larger than the array: surplus threads do nothing
    if r >= A.shape[0] or c >= A.shape[1]:
        return
    A[r, c] += 1

In [None]:
@cuda.jit
def matmul(A, B, C):
    """CUDA kernel writing the matrix product of ``A`` and ``B`` into ``C``.

    Each thread of the 2-D grid computes a single element ``C[row, col]`` as
    the dot product of row ``row`` of ``A`` with column ``col`` of ``B``.
    """
    row, col = cuda.grid(2)
    # Threads that fall outside C have nothing to compute.
    if row >= C.shape[0] or col >= C.shape[1]:
        return

    inner = A.shape[1]
    acc = 0.
    for k in range(inner):
        acc += A[row, k] * B[k, col]
    C[row, col] = acc

In [None]:
cp.random.seed(42)
A = cp.random.uniform(1, 10, size=(2000,2000), dtype=np.float64)
B = cp.random.uniform(1, 10, size=(2000,2000), dtype=np.float64)
C = cp.zeros((2000,2000), dtype=np.float64)
C

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
threadsperblock = (16, 16)  ## each block holds 16*16 threads; typical block sizes are 128-512 threads
tpb_x, tpb_y = threadsperblock
## ceiling division: enough blocks along each axis to cover every element of C
blockspergrid_x = -(-C.shape[0] // tpb_x)
blockspergrid_y = -(-C.shape[1] // tpb_y)
blockspergrid = (blockspergrid_x, blockspergrid_y)
print(blockspergrid)
print(f"The kernel will be executed up to element {tpb_x*blockspergrid_x}")

(125, 125)
The kernel will be executed up to element 2000


In [None]:
## execution of the kernel
matmul[blockspergrid, threadsperblock](A, B, C)
C

array([[59394.46607842, 58001.66377549, 58910.89964126, ...,
        58755.23643036, 59265.65525416, 58447.86197932],
       [59656.82462269, 58635.04995946, 59080.54393462, ...,
        59327.90030958, 60391.24930458, 59425.35827899],
       [62192.77335924, 60700.17680915, 60538.34933653, ...,
        61027.03460329, 61711.10155029, 60544.69882075],
       ...,
       [60649.27416407, 59951.20972379, 60170.2004206 , ...,
        60203.88074659, 60934.19598791, 59613.28418599],
       [61620.11922557, 61264.33868343, 62076.33462258, ...,
        61227.57661876, 62642.97523374, 61841.46799761],
       [61535.95697543, 59600.43760873, 59927.620961  , ...,
        60738.55627077, 61429.70009593, 59662.34901713]])

In [None]:
A_ = cp.asnumpy(A)
B_ = cp.asnumpy(B)
C_ = A_@B_
C_

array([[59394.46607842, 58001.66377549, 58910.89964126, ...,
        58755.23643036, 59265.65525416, 58447.86197932],
       [59656.82462269, 58635.04995946, 59080.54393462, ...,
        59327.90030958, 60391.24930458, 59425.35827899],
       [62192.77335924, 60700.17680915, 60538.34933653, ...,
        61027.03460329, 61711.10155029, 60544.69882075],
       ...,
       [60649.27416407, 59951.20972379, 60170.2004206 , ...,
        60203.88074659, 60934.19598791, 59613.28418599],
       [61620.11922557, 61264.33868343, 62076.33462258, ...,
        61227.57661876, 62642.97523374, 61841.46799761],
       [61535.95697543, 59600.43760873, 59927.620961  , ...,
        60738.55627077, 61429.70009593, 59662.34901713]])

In [None]:
## faster multiplication can be obtained by making use of shared memory between threads in the same block
## this requires more thinking about non-obvious implementation

from numba import float32, int32, float64

## Controls threads per block and shared memory usage.
## the computation will be done on blocks of TPB*TPB elements.

TPB = 16

@cuda.jit
def fast_matmul(A, B, C):
    """CUDA kernel computing ``C = A @ B`` using TPB x TPB tiles in shared memory.

    Must be launched with ``(TPB, TPB)`` threads per block, because the shared
    tiles are indexed directly by ``threadIdx``. Each thread produces one
    element ``C[x, y]``; the dot product is accumulated tile by tile.

    Shapes that are not multiples of ``TPB`` are supported: tile entries that
    fall outside ``A`` or ``B`` are filled with zeros, and only threads that
    map to a valid element of ``C`` store a result.
    """
    ## Define the tiles in shared memory.
    ## The size and type of the arrays must be known at compile time.
    sA = cuda.shared.array(shape=(TPB, TPB), dtype=float32)
    sB = cuda.shared.array(shape=(TPB, TPB), dtype=float32)
    x, y = cuda.grid(2)

    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y

    ## Number of tiles needed to cover the shared dimension (ceiling division).
    ## Derived from A rather than from the grid size so that it is also
    ## correct for non-square problems.
    n_tiles = (A.shape[1] + TPB - 1) // TPB

    ## No early return for out-of-range threads: every thread of the block has
    ## to reach both cuda.syncthreads() calls below, and it may still have to
    ## load tile entries that in-range threads of the same block depend on.
    tmp = 0.
    for i in range(n_tiles):
        a_col = ty + i*TPB
        b_row = tx + i*TPB

        ## Preload one tile of A and one of B, padding with zeros out of range
        if x < A.shape[0] and a_col < A.shape[1]:
            sA[tx, ty] = A[x, a_col]
        else:
            sA[tx, ty] = 0.
        if b_row < B.shape[0] and y < B.shape[1]:
            sB[tx, ty] = B[b_row, y]
        else:
            sB[tx, ty] = 0.

        ## Wait until all threads finish preloading
        cuda.syncthreads()

        ## Computes partial product on the shared memory
        for j in range(TPB):
            tmp += sA[tx, j] * sB[j, ty]

        ## Wait until all threads finish computing
        cuda.syncthreads()

    ## Only threads that map to an element of C store their result
    if x < C.shape[0] and y < C.shape[1]:
        C[x, y] = tmp

In [None]:
## operands for the kernel benchmarks (this cell only prepares data; the launches follow below)
size_ = 4000
A = np.random.uniform(1, 10, size=(size_,size_))
B = np.random.uniform(1, 10, size=(size_,size_))
C_slow = np.zeros((size_,size_), dtype=np.float32)
C_fast = np.zeros((size_,size_), dtype=np.float32)

## CuPy copies of the inputs, used by the cp.dot timing
A_ = cp.asarray(A)
B_ = cp.asarray(B)

In [None]:
## Square launch configuration: TPB x TPB threads per block and enough blocks
## along each axis to cover all size_ rows/columns (ceiling division).
threadsperblock = (TPB, TPB)
blocks_per_axis = -(-size_ // TPB)
blockspergrid = (blocks_per_axis,) * 2

In [None]:
cuda.synchronize()
matmul[blockspergrid, threadsperblock](A, B, C_slow)
cuda.synchronize()

In [None]:
cuda.synchronize()
fast_matmul[blockspergrid, threadsperblock](A, B, C_fast)
cuda.synchronize()

In [None]:
C_slow

array([[121041.234, 121214.516, 121292.28 , ..., 120658.375, 119565.27 ,
        121197.305],
       [121181.07 , 121006.195, 122100.47 , ..., 120782.23 , 119880.945,
        121381.54 ],
       [121515.59 , 121542.29 , 122722.3  , ..., 121815.69 , 120581.414,
        121508.74 ],
       ...,
       [122981.805, 121512.88 , 123252.85 , ..., 120886.31 , 122188.38 ,
        122298.336],
       [121443.734, 120890.71 , 122000.84 , ..., 120886.4  , 119577.62 ,
        120687.38 ],
       [122228.36 , 122134.336, 122875.516, ..., 121710.97 , 120776.67 ,
        122397.84 ]], dtype=float32)

In [None]:
C_fast

array([[121041.234, 121214.516, 121292.28 , ..., 120658.375, 119565.27 ,
        121197.305],
       [121181.07 , 121006.195, 122100.47 , ..., 120782.23 , 119880.945,
        121381.54 ],
       [121515.59 , 121542.29 , 122722.3  , ..., 121815.69 , 120581.414,
        121508.74 ],
       ...,
       [122981.805, 121512.88 , 123252.85 , ..., 120886.31 , 122188.38 ,
        122298.336],
       [121443.734, 120890.71 , 122000.84 , ..., 120886.4  , 119577.62 ,
        120687.38 ],
       [122228.36 , 122134.336, 122875.516, ..., 121710.97 , 120776.67 ,
        122397.84 ]], dtype=float32)

In [None]:
np.allclose(C_slow, C_fast)

True

In [None]:
%%time
## NOTE(review): A, B and C_slow are NumPy (host) arrays here, so Numba presumably copies
## them to the device and back on every launch -- the time below likely includes those transfers.
cuda.synchronize()
matmul[blockspergrid, threadsperblock](A, B, C_slow)
cuda.synchronize()

CPU times: user 7.76 s, sys: 23.9 ms, total: 7.78 s
Wall time: 7.75 s


In [None]:
%%time
## NOTE(review): A, B and C_fast are NumPy (host) arrays here, so Numba presumably copies
## them to the device and back on every launch -- the time below likely includes those transfers.
cuda.synchronize()
fast_matmul[blockspergrid, threadsperblock](A, B, C_fast)
cuda.synchronize()

CPU times: user 1.97 s, sys: 99.2 ms, total: 2.07 s
Wall time: 2.06 s


In [None]:
%%time
cp.dot(A_, B_)

CPU times: user 222 ms, sys: 46.6 ms, total: 268 ms
Wall time: 1.41 s


array([[121041.2365611 , 121214.51706939, 121292.27846453, ...,
        120658.37388934, 119565.27369506, 121197.30656048],
       [121181.07044843, 121006.19275931, 122100.46991976, ...,
        120782.2227743 , 119880.94490864, 121381.54114099],
       [121515.59650052, 121542.28540184, 122722.30067156, ...,
        121815.68900558, 120581.41577918, 121508.7425842 ],
       ...,
       [122981.80686798, 121512.87950477, 123252.84819006, ...,
        120886.31328923, 122188.38543583, 122298.33331723],
       [121443.73444946, 120890.70756655, 122000.8415079 , ...,
        120886.39966811, 119577.61949362, 120687.37916706],
       [122228.35696776, 122134.33857391, 122875.51225891, ...,
        121710.96885137, 120776.67019064, 122397.84222271]])

In [None]:
%%time
np.dot(A, B)

CPU times: user 7.2 s, sys: 72.2 ms, total: 7.27 s
Wall time: 3.79 s


array([[121041.2365611 , 121214.51706939, 121292.27846453, ...,
        120658.37388934, 119565.27369506, 121197.30656048],
       [121181.07044843, 121006.19275931, 122100.46991976, ...,
        120782.2227743 , 119880.94490864, 121381.54114099],
       [121515.59650052, 121542.28540184, 122722.30067156, ...,
        121815.68900558, 120581.41577918, 121508.7425842 ],
       ...,
       [122981.80686798, 121512.87950477, 123252.84819006, ...,
        120886.31328923, 122188.38543583, 122298.33331723],
       [121443.73444946, 120890.70756655, 122000.8415079 , ...,
        120886.39966811, 119577.61949362, 120687.37916706],
       [122228.35696776, 122134.33857391, 122875.51225891, ...,
        121710.96885137, 120776.67019064, 122397.84222271]])

In [None]:
x = np.random.randn(16900,25)
x_ = cp.asarray(x)
w = np.random.randn(25,1)
w_ = cp.asarray(w)
out = np.zeros((16900,1))
out_ = np.zeros((16900,1))
out_c = cp.asarray(out)

In [None]:
## 64 x 4 = 256 threads per block
threads_per_block = (64, 4)

## grid size (number of blocks): ceil(extent / threads) along each axis of `out`
blocks_per_grid = tuple(-(-extent // threads) for extent, threads in zip(out.shape, threads_per_block))
blocks_per_grid_x, blocks_per_grid_y = blocks_per_grid
print(blocks_per_grid)
print(f"The kernel will be executed up to element {threads_per_block[0]*blocks_per_grid_x}")

(265, 1)
The kernel will be executed up to element 16960


In [None]:
cuda.synchronize()
matmul[blocks_per_grid,threads_per_block](x,w,out)
cuda.synchronize()

In [None]:
%%timeit
cuda.synchronize()
matmul[blocks_per_grid,threads_per_block](x,w,out)
cuda.synchronize()

100 loops, best of 5: 3.47 ms per loop


In [None]:
%%timeit
np.dot(x,w)

The slowest run took 10.84 times longer than the fastest. This could mean that an intermediate result is being cached.
1000 loops, best of 5: 190 µs per loop


In [None]:
%%timeit
cp.dot(x_,w_)

The slowest run took 11.85 times longer than the fastest. This could mean that an intermediate result is being cached.
10000 loops, best of 5: 69.7 µs per loop


In [None]:
%%timeit
cp.asarray(x)

The slowest run took 27.17 times longer than the fastest. This could mean that an intermediate result is being cached.
1000 loops, best of 5: 300 µs per loop


In [None]:
@cuda.jit
def add_one_kernel(A):
    """CUDA kernel that increments each element of the 2-D array ``A`` in place."""
    ## global 2-D index of the current thread
    i, j = cuda.grid(2)
    n_rows = A.shape[0]
    n_cols = A.shape[1]

    ## threads beyond the array bounds are idle
    in_bounds = i < n_rows and j < n_cols
    if in_bounds:
        A[i, j] += 1

convolution 곱을 실행할 때, 출력의 크기는 2차원으로 존재한다. 예를 들어 (1,28,28) 데이터에 (1,5,5) 필터를 적용하면(stride=1, pad=0 이라고 하자), output 크기는

$$
h_{out} = 1 + \frac{h + 2 \cdot pad - fh}{stride} = 1 + \frac{28 + 2 \cdot 0 - 5}{1} = 24
$$

이므로 24가 된다.
그럼 (24,24)의 block을 생성하고 그 안에 (5,5)의 thread를 생성해 계산을 수행하면 되지 않을까 싶다.

In [None]:
## handle the padding beforehand
# @cuda.jit
# def convolution_multiply(data, filter, stride, output):
#     tx, ty = cuda.threadIdx.x, cuda.threadIdx.y     ## threads, shape: filter.shape
#     bx, by
#     ...

배치 학습 버리기????

```
@cuda.jit
def matmul(A, B, C):
    i, j = cuda.grid(2)
    if i < C.shape[0] and j < C.shape[1]:
        tmp = 0.
        for k in range(A.shape[1]):
            tmp += A[i, k] * B[k, j]
        C[i, j] = tmp
```

# Here

In [None]:
from numba import cuda, jit
import numpy as np
import cupy as cp

In [None]:
# Toy convolution problem: a 28x28 "image" holding 0..783 and a 5x5 filter of 0.1s.
fh, fw = 5, 5
stride = 2
data = np.arange(28 * 28).reshape(28, 28)

# Output extent without padding: floor((input - filter) / stride) + 1.
out_h = (data.shape[0] - fh) // stride + 1
out_w = (data.shape[1] - fw) // stride + 1

weight = np.ones((fh, fw)) / 10
output = np.zeros((out_h, out_w))

In [None]:
@jit(nopython=True)
def matmul(data, weight):
    """Return the sum of the element-wise products ``data[r, c] * weight[r, c]``.

    Despite the name this is not a matrix product: it multiplies the two
    arrays entry by entry over the extent of ``data`` and adds everything up.
    """
    n_rows = data.shape[0]
    n_cols = data.shape[1]

    total = 0
    for r in range(n_rows):
        for c in range(n_cols):
            total += data[r, c] * weight[r, c]
    return total

In [None]:
# first call: the measured time includes JIT compilation
%time matmul(data[0:5,0:5], weight)

CPU times: user 168 ms, sys: 1.9 ms, total: 170 ms
Wall time: 169 ms


145.0

In [None]:
# compiled: second call with the same argument types, no compilation overhead
%time matmul(data[0:5,0:5], weight)

CPU times: user 32 µs, sys: 1e+03 ns, total: 33 µs
Wall time: 40.5 µs


145.0

In [None]:
@cuda.jit
def convolution_test(data, weight, stride, output):
    """CUDA kernel: strided sliding-window product of a 2-D ``data`` with ``weight``.

    Each thread of the 2-D grid fills one element of ``output``: it takes the
    window of ``data`` that starts at ``(stride*row, stride*column)`` and has
    the shape of ``weight``, and stores ``matmul(window, weight)``.
    """
    out_r, out_c = cuda.grid(2)
    # Threads outside the output array have nothing to do.
    if out_r >= output.shape[0] or out_c >= output.shape[1]:
        return

    fh, fw = weight.shape
    top = stride*out_r
    left = stride*out_c
    window = data[top:top + fh, left:left + fw]
    output[out_r, out_c] = matmul(window, weight)

In [None]:
convolution_test[1,(16,16)](data, weight, stride, output)
output

array([[ 145.,  150.,  155.,  160.,  165.,  170.,  175.,  180.,  185.,
         190.,  195.,  200.],
       [ 285.,  290.,  295.,  300.,  305.,  310.,  315.,  320.,  325.,
         330.,  335.,  340.],
       [ 425.,  430.,  435.,  440.,  445.,  450.,  455.,  460.,  465.,
         470.,  475.,  480.],
       [ 565.,  570.,  575.,  580.,  585.,  590.,  595.,  600.,  605.,
         610.,  615.,  620.],
       [ 705.,  710.,  715.,  720.,  725.,  730.,  735.,  740.,  745.,
         750.,  755.,  760.],
       [ 845.,  850.,  855.,  860.,  865.,  870.,  875.,  880.,  885.,
         890.,  895.,  900.],
       [ 985.,  990.,  995., 1000., 1005., 1010., 1015., 1020., 1025.,
        1030., 1035., 1040.],
       [1125., 1130., 1135., 1140., 1145., 1150., 1155., 1160., 1165.,
        1170., 1175., 1180.],
       [1265., 1270., 1275., 1280., 1285., 1290., 1295., 1300., 1305.,
        1310., 1315., 1320.],
       [1405., 1410., 1415., 1420., 1425., 1430., 1435., 1440., 1445.,
        1450., 1455.

In [None]:
np.sum(data[0:5,0:5]) / 10

145.0

In [None]:
## image to column
def im2col(input_data, filter_h, filter_w, stride=1, pad=0):
    """Unfold every filter-sized window of a 4-D batch into one row of a 2-D array.

    Parameters
    ----------
    input_data : array of shape (N, C, H, W)
    filter_h, filter_w : height and width of the sliding window
    stride : step between consecutive windows
    pad : number of zeros added on each side of the two spatial axes

    Returns
    -------
    Array of shape (N*out_h*out_w, C*filter_h*filter_w); each row is one
    window, flattened in (channel, filter row, filter column) order.
    """
    N, C, H, W = input_data.shape
    out_h = (H + 2*pad - filter_h)//stride + 1
    out_w = (W + 2*pad - filter_w)//stride + 1

    # zero-pad the two spatial axes only
    spatial_pad = (pad, pad)
    img = np.pad(input_data, ((0, 0), (0, 0), spatial_pad, spatial_pad), mode="constant")

    col = np.zeros((N, C, filter_h, filter_w, out_h, out_w))
    for fy, fx in np.ndindex(filter_h, filter_w):
        # for a fixed filter offset, a strided slice picks that offset out of
        # every window at once, yielding an (out_h, out_w) grid
        rows = slice(fy, fy + stride*out_h, stride)
        cols = slice(fx, fx + stride*out_w, stride)
        col[:, :, fy, fx] = img[:, :, rows, cols]

    # reorder to (N, out_h, out_w, C, filter_h, filter_w), then flatten to one
    # row per window; -1 lets numpy infer the row length
    return np.moveaxis(col, (4, 5), (1, 2)).reshape(N*out_h*out_w, -1)

def col2im(col, input_shape, filter_h, filter_w, stride=1, pad=0):
    """Fold a 2-D window matrix (as produced by ``im2col``) back into an image batch.

    Parameters
    ----------
    col : array of shape (N*out_h*out_w, C*filter_h*filter_w)
    input_shape : target shape (N, C, H, W)
    filter_h, filter_w, stride, pad : the window geometry used to build ``col``

    Returns
    -------
    Array of shape (N, C, H, W). Where windows overlap, values are
    overwritten (plain assignment), not accumulated.
    """
    N, C, H, W = input_shape
    out_h = (H + 2*pad - filter_h)//stride + 1
    out_w = (W + 2*pad - filter_w)//stride + 1

    # back to (N, C, filter_h, filter_w, out_h, out_w)
    patches = np.moveaxis(col.reshape(N, out_h, out_w, C, filter_h, filter_w), (1, 2), (4, 5))

    # padded canvas, with stride - 1 extra rows/columns of slack
    canvas = np.zeros((N, C, H + 2*pad + stride - 1, W + 2*pad + stride - 1))
    for fy, fx in np.ndindex(filter_h, filter_w):
        rows = slice(fy, fy + stride*out_h, stride)
        cols = slice(fx, fx + stride*out_w, stride)
        canvas[:, :, rows, cols] = patches[:, :, fy, fx]

    # strip the padding again
    return canvas[:, :, pad:pad + H, pad:pad + W]

In [None]:
x = np.random.randn(100,1,200,200)
w = cp.random.randn(100,1)

In [None]:
%%timeit

col = im2col(x, 15, 15, stride=7, pad=1)
col_gpu = cp.asarray(col)
# a dot product would also happen here => it would take even longer
# NOTE(review): col2im is fed the host array `col`, not `col_gpu`, so `img` is already
# a NumPy array when it reaches cp.asnumpy -- confirm this is the intended round trip
img = col2im(col, x.shape, 15, 15, stride=7, pad=1)
col_cpu = cp.asnumpy(img)

1 loop, best of 5: 315 ms per loop


In [None]:
# Benchmark inputs: one 200x200 image and a 15x15 filter, slid with stride 7.
stride = 7
data = np.random.randn(200, 200)
weight = np.random.randn(15, 15)
fh, fw = weight.shape

# Output extent without padding: floor((input - filter) / stride) + 1.
out_h = (data.shape[0] - fh) // stride + 1
out_w = (data.shape[1] - fw) // stride + 1
output = np.zeros((out_h, out_w))

# Device-side buffers: copies of the inputs plus an output buffer shaped like `output`.
data_device, weight_device = cuda.to_device(data), cuda.to_device(weight)
output_device = cuda.device_array_like(output)

In [None]:
%%timeit

convolution_test[1,(32,32)](data_device, weight_device, stride, output_device)
cuda.synchronize()
output_device.copy_to_host()

1000 loops, best of 5: 1.01 ms per loop


In [None]:
x = np.random.randn(100,1,28,28)
w = cp.random.randn(100,1)

In [None]:
%%timeit

col = im2col(x, 5, 5, stride=2, pad=1)
col_gpu = cp.asarray(col)
# a dot product would also happen here => it would take even longer
# NOTE(review): col2im is fed the host array `col`, not `col_gpu`, so `img` is already
# a NumPy array when it reaches cp.asnumpy -- confirm this is the intended round trip
img = col2im(col, x.shape, 5, 5, stride=2, pad=1)
col_cpu = cp.asnumpy(img)

100 loops, best of 5: 5.01 ms per loop


In [None]:
# Benchmark inputs: one 28x28 image and a 5x5 filter, slid with stride 2.
stride = 2
data = np.random.randn(28, 28)
weight = np.random.randn(5, 5)
fh, fw = weight.shape

# Output extent without padding: floor((input - filter) / stride) + 1.
out_h = (data.shape[0] - fh) // stride + 1
out_w = (data.shape[1] - fw) // stride + 1
output = np.zeros((out_h, out_w))

# Device-side buffers: copies of the inputs plus an output buffer shaped like `output`.
data_device, weight_device = cuda.to_device(data), cuda.to_device(weight)
output_device = cuda.device_array_like(output)

In [None]:
%%timeit

convolution_test[1,(16,16)](data_device, weight_device, stride, output_device)
cuda.synchronize()
output_device.copy_to_host()

The slowest run took 18.88 times longer than the fastest. This could mean that an intermediate result is being cached.
1000 loops, best of 5: 450 µs per loop


# What?

In [None]:
from numba import cuda, jit
import numpy as np

In [None]:
@jit(nopython=True)
def matmul(data, weight):
    """Sum of ``data[i, j] * weight[i, j]`` over every index of the 2-D ``data``.

    Note that this is an element-wise multiply-and-sum, not a matrix product.
    """
    acc = 0
    height = data.shape[0]
    width = data.shape[1]

    for i in range(height):
        for j in range(width):
            acc += data[i, j] * weight[i, j]
    return acc

In [None]:
@cuda.jit
def convolution_multiply(data, weight, stride, output):
    """CUDA kernel: batched strided sliding-window product of ``data`` with ``weight``.

    The 3-D grid is indexed as (sample, output row, output column). Each
    thread takes the window of ``data[sample]`` that starts at
    ``(stride*row, stride*column)`` and has the shape of ``weight``, and
    stores ``matmul(window, weight)`` in ``output[sample, row, column]``.
    """
    n, out_r, out_c = cuda.grid(3)
    # Threads outside the output array have nothing to do.
    if n >= output.shape[0] or out_r >= output.shape[1] or out_c >= output.shape[2]:
        return

    fh, fw = weight.shape
    top = stride*out_r
    left = stride*out_c
    output[n, out_r, out_c] = matmul(data[n, top:top + fh, left:left + fw], weight)

In [None]:
%%timeit 
# NOTE: the batched data_device / weight_device / stride / output_device used here are created
# by the setup cell further down (the one that builds the (100, 200, 200) `data`); run it first.
convolution_multiply[100,(1,32,32)](data_device, weight_device, stride, output_device)
cuda.synchronize()
output_device.copy_to_host()

100 loops, best of 5: 4.57 ms per loop


In [None]:
%%timeit
# NOTE: `data_` is defined by the setup cell further down (the one that builds the
# (100, 200, 200) `data`); run that cell first.
col = im2col(data_, 15, 15, stride=7)
col2im(col, data_.shape, 15, 15, stride=7)

1 loop, best of 5: 281 ms per loop


In [None]:
# Batched benchmark inputs: 100 images of 200x200 holding consecutive integers,
# plus the same data with an explicit channel axis for im2col/col2im.
stride = 7
data = np.arange(100 * 200 * 200).reshape(100, 200, 200)
data_ = np.expand_dims(data, axis=1)
num = data.shape[0]
weight = np.ones((15, 15))
fh, fw = weight.shape[0], weight.shape[1]

# Output extent without padding: floor((input - filter) / stride) + 1.
out_h = (data.shape[1] - fh) // stride + 1
out_w = (data.shape[2] - fw) // stride + 1

output = np.zeros((num, out_h, out_w))
# Device-side copies of the inputs and of the zero-filled output.
data_device, weight_device = cuda.to_device(data), cuda.to_device(weight)
output_device = cuda.to_device(output)

print(output.shape)

(100, 27, 27)


In [None]:
output_device.copy_to_host()[0,0,0]

316575.0

In [None]:
np.sum(data[0,0:15,0:15])

316575