In [1]:
# Prints a fixed greeting; presumably a quick check that the kernel executes cells.
print("hello  world")

hello  world


In [19]:
import numpy as np
import matplotlib as plt
import IPythonMagic
import pycuda.driver as cuda_driver
import pycuda.compiler as cuda_compiler
from pycuda.gpuarray import GPUArray
from Timer import Timer 

In [20]:
# Line magic (presumably registered by the IPythonMagic import) that sets up logging.
# On re-execution it only reports that the global logger is already initialized.
%setup_logging

Global logger already initialized!


In [21]:
# Line magic (presumably registered by the IPythonMagic import) that registers a CUDA
# context handler in the user workspace under the name `context`; re-running it is
# reported and ignored.
%cuda_context_handler context

Registering context in user workspace
Context already registered! Ignoring


In [41]:
kernel_src="""
#define MAX_THREADS 128  // capacity of the shared-memory scratch buffer

/**
 * Computes the maximum of input[0 .. size-1] with a single thread block.
 *
 * Launch configuration: exactly one block with 1 <= blockDim.x <= MAX_THREADS.
 * (The block index is ignored, so additional blocks would only repeat the work.)
 *
 * output: must hold at least blockDim.x floats. On return output[0] is the
 *         maximum of the input; output[1 .. blockDim.x-1] hold the per-thread
 *         partial maxima (kept as a debugging aid).
 * input:  array of `size` floats.
 * size:   number of valid elements in input. If size == 0 the result is -inf.
 */
__global__ void shmemReduction(float* output, float* input, int size) {
    const unsigned int tid = threadIdx.x;

    // Phase 1: every thread strides through global memory and keeps a private
    // maximum. -INFINITY is the identity element of max, so inputs of any
    // magnitude (including values below -999999.99) are handled correctly.
    float max_value = -INFINITY;
    for (int i = tid; i < size; i += blockDim.x) {
        max_value = fmaxf(max_value, input[i]);
    }

    // Debugging aid: expose each thread's partial maximum to the host.
    // Slot 0 is overwritten with the final result at the end of the kernel.
    output[tid] = max_value;

    // Phase 2: publish the per-thread maxima in shared memory.
    __shared__ float max_shared[MAX_THREADS];
    max_shared[tid] = max_value;

    // Make every thread's value visible to the whole block before reducing.
    __syncthreads();

    // Phase 3: tree reduction 128 -> 64 -> 32 -> ... -> 1.
    // Each stage folds the upper half of the active range onto the lower half.
    // The `tid + stride < blockDim.x` guard skips slots that no thread wrote,
    // so block sizes smaller than MAX_THREADS are reduced correctly as well.
    // The barrier after every stage is required even inside a single warp:
    // without it a thread may read a slot its neighbour has not yet updated.
    for (unsigned int stride = MAX_THREADS / 2; stride > 0; stride >>= 1) {
        if (tid < stride && tid + stride < blockDim.x) {
            max_shared[tid] = fmaxf(max_shared[tid], max_shared[tid + stride]);
        }
        __syncthreads();
    }

    // Phase 4: thread 0 holds the block-wide maximum; write it out.
    if (tid == 0) {
        output[0] = max_shared[0];
    }
}
"""
# Compile the CUDA source and fetch a callable handle to the kernel.
kernel_module=cuda_compiler.SourceModule(kernel_src)
kernel_function=kernel_module.get_function("shmemReduction")





In [42]:
# Host input: n uniform random samples in [0, 1), stored as a 1 x n float32 row.
n = 256
a = np.random.random(size=(1, n)).astype(np.float32)

# Device-side copy of the input.
a_g = GPUArray(a.shape, a.dtype)
a_g.set(a)

# Output buffers (host and device): one float32 slot per thread of the launch block.
# The host buffer is left uninitialized; it is filled when results are downloaded.
num_threads = 128
b = np.empty((1, num_threads), dtype=np.float32)
b_g = GPUArray(b.shape, b.dtype)

In [43]:
# Launch configuration: a single block of num_threads threads.
grid_size = (1, 1, 1)
block_size = (num_threads, 1, 1)

# Run the reduction, then download the device output into the host buffer b.
kernel_function(b_g, a_g, np.int32(n), block=block_size, grid=grid_size)
b_g.get(b)

# Show the input, the kernel output, and NumPy's reference maximum for comparison.
print(a, b, np.max(a), sep="\n")

[[0.32190987 0.73669046 0.79639834 0.5749647  0.37169003 0.93374985
  0.13384005 0.14896119 0.06422897 0.27464053 0.5667491  0.82810557
  0.15946524 0.5317411  0.2200729  0.5245224  0.17068164 0.81901866
  0.15178077 0.09561454 0.71968585 0.8010821  0.41355345 0.91683173
  0.8387901  0.26513612 0.50686264 0.8826764  0.21542163 0.5074297
  0.8487877  0.24108186 0.7996128  0.42004842 0.02840959 0.82682496
  0.96313024 0.445275   0.5112033  0.33561134 0.1547946  0.8641053
  0.08838812 0.37754104 0.5601205  0.84132665 0.49553826 0.30761972
  0.7204642  0.23711719 0.09768553 0.25728643 0.42013776 0.253011
  0.18745714 0.7056812  0.569525   0.42929474 0.6766775  0.08781328
  0.9807683  0.413401   0.64580375 0.42392704 0.10422944 0.17686765
  0.00446184 0.908137   0.88811606 0.4777462  0.5021096  0.04944073
  0.728923   0.94791824 0.41820842 0.39544752 0.01220619 0.16160762
  0.3169896  0.36709872 0.21053743 0.7822698  0.35975596 0.69406986
  0.4851765  0.65120757 0.98760766 0.801135   0.8476