# 线程束洗牌
使用 **__shfl_xor** 在索引为偶数和奇数的束内线程之间交换特定变量的值。
书中的函数 **__shfl_xor** 已在CUDA 9之后弃用，变更为 **__shfl_xor_sync** 。文档参考[此处](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#warp-shuffle-functions)。

In [2]:
from __future__ import division
import numpy as np
from pycuda.compiler import SourceModule
import pycuda.autoinit
from pycuda import gpuarray


ShflCode='''
__global__ void shfl_xor_ker(int *input, int * output) {
int temp = input[threadIdx.x];
temp = __shfl_xor_sync(temp, 1, blockDim.x);
output[threadIdx.x] = temp;
}'''

shfl_mod = SourceModule(ShflCode)
shfl_ker = shfl_mod.get_function('shfl_xor_ker')

dinput = gpuarray.to_gpu(np.int32(range(32)))
doutout = gpuarray.empty_like(dinput)

shfl_ker(dinput, doutout, grid=(1,1,1), block=(32,1,1))

print('input array: %s' % dinput.get())
print('array after __shfl_xor: %s' % doutout.get())

input array: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31]
array after __shfl_xor: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
