# 线程束洗牌
使用 **__shfl_down** 实现对线程束中所有线程的局部变量求和。
书中的函数 **__shfl_down** 已在CUDA 9之后弃用，变更为 **__shfl_down_sync** 。文档参考[此处](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#warp-shuffle-functions)。

In [2]:
from __future__ import division
import numpy as np
from pycuda.compiler import SourceModule
import pycuda.autoinit
from pycuda import gpuarray


ShflSumCode='''
__global__ void shfl_sum_ker(int *input, int *out) {
 int temp = input[threadIdx.x];
 for (int i=1; i < 32; i *= 2)
     temp += __shfl_down_sync (temp, i, 32);
 if (threadIdx.x == 0)
     *out = temp;
}'''

shfl_mod = SourceModule(ShflSumCode)
shfl_sum_ker = shfl_mod.get_function('shfl_sum_ker')

array_in = gpuarray.to_gpu(np.int32(range(32)))
out = gpuarray.empty((1,), dtype=np.int32)

shfl_sum_ker(array_in, out, grid=(1,1,1), block=(32,1,1))

print('Input array: %s' % array_in.get())
print('Summed value: %s' % out.get()[0])
print('Does this match with Python''s sum? : %s' % (out.get()[0] == sum(array_in.get()) ))

Input array: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31]
Summed value: 31
Does this match with Pythons sum? : False
