In [1]:
import numpy as np
from numba import cuda, float64
import time
import math

# 1 The data

In [2]:
# Parse the CSV into a NumPy array: the first line is skipped (presumably a
# header) and only columns 1..31 are kept.
# Slow step: takes more than 40s to load data.
arr = np.loadtxt(
    '1.csv',
    delimiter=',',
    skiprows=1,
    usecols=range(1, 32),
)
print(arr.shape)

(3912038, 31)


In [3]:
# First row of the data; later cells use it as the reference row for the comparison.
arr[0]

array([0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 1., 0., 1., 0.,
       1., 1., 1., 0., 1., 0., 1., 0., 1., 0., 0., 1., 0., 0.])

In [4]:
# Show the whole array (NumPy abbreviates the printed form of large arrays).
arr

array([[ 0.,  1.,  0., ...,  1.,  0.,  0.],
       [ 0.,  0.,  0., ...,  1.,  0.,  0.],
       [ 0.,  1.,  0., ...,  1.,  0.,  0.],
       ...,
       [ 0.,  0.,  2., ..., 27.,  0.,  0.],
       [ 0.,  0.,  2., ..., 27.,  0.,  0.],
       [ 0.,  0.,  2., ..., 27.,  0.,  0.]])

# 2 Problem

对任意两行逐列比较：同一列上的两个单元格若值相等，则保留原值；若不相等，则置为零。

由于两两比较的输出太大，这里只将每一行与第一行做比较。

# 3 Solve it in CPU

In [17]:
def compare_two_lines(line1, line2, output, indx):
    """Write the element-wise comparison of two rows into ``output[indx]``.

    For every position ``i`` in ``range(len(line1))``, ``output[indx, i]`` is
    set to ``line2[i]`` when it equals ``line1[i]`` and to ``0`` otherwise.
    The result is written in place; nothing is returned.

    Args:
        line1: Reference row; its length decides how many positions are compared.
        line2: Row being compared, indexed at the same positions as ``line1``.
        output: Destination indexed as ``output[indx, i]`` (e.g. a 2-D NumPy array).
        indx: Row of ``output`` to fill.
    """
    for col in range(len(line1)):
        candidate = line2[col]
        # Keep the original value on a match, otherwise zero the cell.
        output[indx, col] = candidate if candidate == line1[col] else 0

In [18]:
# CPU baseline: compare every row of `arr` against its first row with a
# plain Python loop, and report the wall-clock time of the whole pass.
start = time.time()

first_line = arr[0]
output_cpu = np.zeros(arr.shape)
print(first_line)
for i, row in enumerate(arr):
    compare_two_lines(first_line, row, output_cpu, i)

elapsed = time.time() - start
print('\ntime: {}s'.format(elapsed))

[0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 1. 0. 1. 0. 1. 1. 1. 0. 1. 0. 1.
 0. 1. 0. 0. 1. 0. 0.]

time: 36.77397704124451s


In [19]:
# Show the result computed by the CPU loop.
output_cpu

array([[0., 1., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 1., 0., ..., 1., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

# 4 Solve it in GPU

In [8]:
@cuda.jit
def kernel(first, array, output):
    """CUDA kernel: each thread compares one element of ``array`` with ``first``.

    The thread's 2-D grid position ``(row, col)`` selects the element. When
    ``array[row, col]`` equals ``first[col]`` the value is copied into
    ``output[row, col]``; otherwise ``output[row, col]`` is set to 0.

    Args:
        first: 1-D reference row, indexed by column.
        array: 2-D input, indexed as ``[row, col]``.
        output: 2-D destination; its shape bounds which threads do any work.
    """
    row, col = cuda.grid(2)
    # Threads that fall outside the output have nothing to do.
    if row >= output.shape[0] or col >= output.shape[1]:
        return
    if array[row, col] == first[col]:
        output[row, col] = array[row, col]
    else:
        output[row, col] = 0

In [9]:
def host(array, tpb=32):
    """Run ``kernel`` on the GPU to compare every row of ``array`` with row 0.

    Copies ``array`` and its first row to the device, launches ``kernel`` on
    a 2-D grid of ``tpb`` x ``tpb`` thread blocks sized to cover every
    element, and copies the device result back to the host.

    Args:
        array: 2-D NumPy array; row 0 is the reference row.
        tpb: Threads per block along each of the two block dimensions, so a
            block holds ``tpb * tpb`` threads (this must stay within the
            device's per-block thread limit). Defaults to 32, the value that
            was previously hard-coded.

    Returns:
        A float64 NumPy array with the same shape as ``array``, holding the
        values written by ``kernel``. For an input with no elements, an
        empty float64 array of that shape is returned without using the GPU.
    """
    if array.size == 0:
        # No elements to process (and possibly no first row to read), so
        # skip the device transfers and the kernel launch entirely.
        return np.zeros(array.shape, np.float64)

    d_first = cuda.to_device(array[0])  # changing to integer will speed up ?
    d_array = cuda.to_device(array)
    d_output = cuda.device_array(array.shape, np.float64)

    threadsperblock = (tpb, tpb)
    # Round up so that partially filled blocks still cover the array edges.
    blockspergrid_x = math.ceil(array.shape[0] / tpb)
    blockspergrid_y = math.ceil(array.shape[1] / tpb)
    blockspergrid = (blockspergrid_x, blockspergrid_y)
    print('blocks per grid ', blockspergrid)
    # The label used to read "threads per grid", but the value printed is
    # the per-block thread layout.
    print('threads per block', threadsperblock)

    kernel[blockspergrid, threadsperblock](d_first, d_array, d_output)

    return d_output.copy_to_host()

In [10]:
# Time the complete GPU path: everything host() does (device transfers,
# kernel launch, copy back) falls inside the measured interval.
start = time.time()

output_gpu = host(arr)

elapsed = time.time() - start
print('\ntime: {}s'.format(elapsed))

blocks per grid  (122252, 1)
threads per grid (32, 32)

time: 0.7456362247467041s


In [11]:
# Show the result returned by the GPU path.
output_gpu

array([[0., 1., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 1., 0., ..., 1., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

# 5 Check CPU and GPU are getting the same results

In [20]:
# Element-wise difference between the two results, computed once; if both
# its minimum and maximum are 0.0, every element matches.
gpu_minus_cpu = output_gpu - output_cpu
gpu_minus_cpu.min(), gpu_minus_cpu.max()

(0.0, 0.0)

结果一模一样。

时间从36.7s减少到0.74s，速度提升大约50倍（GPU 的计时包含了主机与设备之间的数据拷贝）。