# License

    IPython notebook for running shared memory reductions
    Copyright (C) 2018 Andre.Brodtkorb@ifi.uio.no

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.

In [9]:
import numpy as np
import matplotlib as plt

import pycuda.driver as cuda_driver
import pycuda.compiler as cuda_compiler
from pycuda.gpuarray import GPUArray

import IPythonMagic
from Timer import Timer

import pytest
from ipytest import run_pytest, clean_tests

In [10]:
# IPython line magic that configures logging for this notebook
# (presumably registered by the IPythonMagic import above -- TODO confirm).
%setup_logging

Global logger already initialized!


In [11]:
# IPython line magic that registers a CUDA context handler in the notebook
# namespace under the name `context`
# (presumably registered by the IPythonMagic import above -- TODO confirm).
%cuda_context_handler context

Registering context in user workspace
Context already registered! Ignoring


In [12]:
# CUDA source for a single-block maximum reduction in shared memory.
# The kernel must be launched with exactly ONE block of 512 threads: the shared
# buffer is statically sized to 512 floats, the reduction steps are unrolled
# for that size, and only threadIdx.x (never blockIdx.x) is used for indexing.
kernel_src = """

#include <float.h>

// Warp-level barrier. __syncwarp() is available from CUDA 9.0 onwards; with
// older compilers this expands to nothing and the code relies on implicit
// lockstep execution within a warp.
#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
#define WARP_SYNC() __syncwarp()
#else
#define WARP_SYNC()
#endif

/**
 * Computes the maximum of input[0..size-1] and stores it in output[0].
 *
 * Launch configuration: one block of 512 threads.
 * If size is 0, output[0] is set to -FLT_MAX.
 */
__global__ void shmemReduction(float* output, float* input, int size) {
    // First we stride through global memory and compute
    // the maximum for every thread.
    // -FLT_MAX is the lowest finite float, so it never beats a finite input.
    float max_value = -FLT_MAX;
    for (int i = threadIdx.x; i < size; i = i + blockDim.x) {
        max_value = fmaxf(max_value, input[i]);
    }

    // Store the per-thread maximum in shared memory
    __shared__ float max_shared[512];
    max_shared[threadIdx.x] = max_value;

    // Synchronize so that all threads see the same shared memory
    __syncthreads();

    // Find the maximum in shared memory
    //Reduce from 512 to 256 elements
    if (threadIdx.x < 256) {
        max_shared[threadIdx.x] = fmaxf(max_shared[threadIdx.x], max_shared[threadIdx.x + 256]);
    }
    __syncthreads();

    //Reduce from 256 to 128 elements
    if (threadIdx.x < 128) {
        max_shared[threadIdx.x] = fmaxf(max_shared[threadIdx.x], max_shared[threadIdx.x + 128]);
    }
    __syncthreads();

    //Reduce from 128 to 64 elements
    if (threadIdx.x < 64) {
        max_shared[threadIdx.x] = fmaxf(max_shared[threadIdx.x], max_shared[threadIdx.x + 64]);
    }
    __syncthreads();

    //Reduce the remaining 64 elements down to 1.
    //Only the first warp (threadIdx.x < 32) is active from here on, so the
    //block-wide __syncthreads() is no longer needed; a warp-level barrier
    //is enough. Every step reads into a register, synchronizes, and only
    //then writes, so no thread overwrites a value that another thread of
    //the warp still has to read.
    volatile float* p = &max_shared[0]; //volatile: do not cache shared memory values in registers
    if (threadIdx.x < 32) {
        float v = p[threadIdx.x];
        v = fmaxf(v, p[threadIdx.x + 32]); WARP_SYNC();
        p[threadIdx.x] = v;                WARP_SYNC();
        v = fmaxf(v, p[threadIdx.x + 16]); WARP_SYNC();
        p[threadIdx.x] = v;                WARP_SYNC();
        v = fmaxf(v, p[threadIdx.x + 8]);  WARP_SYNC();
        p[threadIdx.x] = v;                WARP_SYNC();
        v = fmaxf(v, p[threadIdx.x + 4]);  WARP_SYNC();
        p[threadIdx.x] = v;                WARP_SYNC();
        v = fmaxf(v, p[threadIdx.x + 2]);  WARP_SYNC();
        p[threadIdx.x] = v;                WARP_SYNC();
        v = fmaxf(v, p[threadIdx.x + 1]);  WARP_SYNC();
        p[threadIdx.x] = v;
    }

    // Finally write out to output.
    // Thread 0 wrote p[0] itself, so no further barrier is required.
    if (threadIdx.x == 0) {
        output[0] = p[0];
    }
}
"""

# Compile the kernel and fetch a callable handle to it.
kernel_module = cuda_compiler.SourceModule(kernel_src)
kernel_function = kernel_module.get_function("shmemReduction")

In [13]:
def findMaxGPU(a):
    """Return the maximum element of ``a``, computed on the GPU.

    Args:
        a: Non-empty array-like of numbers. It is converted to a C-contiguous
            float32 array before upload, because the kernel reads its input
            as a flat buffer of 32-bit floats.

    Returns:
        A float32 NumPy array of shape (1, 1) holding the maximum.

    Raises:
        ValueError: If ``a`` contains no elements.
    """
    # The kernel takes a ``float*``; uploading any other dtype (or a strided
    # view) would make it reinterpret the raw bytes and return garbage.
    a = np.ascontiguousarray(a, dtype=np.float32)
    if a.size == 0:
        raise ValueError("findMaxGPU requires a non-empty array")

    # Upload the input
    a_g = GPUArray(a.shape, a.dtype)
    a_g.set(a)

    # Host and device buffers for the single-element result
    b = np.empty((1, 1), dtype=np.float32)
    b_g = GPUArray(b.shape, b.dtype)

    # The kernel is written for exactly one block of 512 threads
    num_threads = 512
    block_size = (num_threads, 1, 1)
    grid_size = (1, 1, 1)

    # Pass the total element count so that every element is reduced,
    # independent of the shape of the input.
    kernel_function(b_g, a_g, np.int32(a.size), grid=grid_size, block=block_size)

    # Download the result into b
    b_g.get(b)
    return b

In [19]:
# Clear tests left over from earlier runs of this cell before defining new
# ones (presumably what ipytest's clean_tests does -- TODO confirm).
clean_tests()

def test_findMaxGPU():
    """Check findMaxGPU against a moving single peak and a random dataset."""
    n = 1024

    # Move a single peak of 2.0 through a field of ones; the maximum must be
    # found regardless of which position holds it.
    data = np.ones((1, n), dtype=np.float32)
    for peak in range(n):
        data[0, peak] = 2.0

        result = findMaxGPU(data)

        # Restore the field before checking, ready for the next position
        data[0, peak] = 1.0

        assert result == 2.0

    # Compare against NumPy on a random dataset
    data = np.random.random((1, n)).astype(np.float32)
    result = findMaxGPU(data)
    assert np.max(data) == result
        
# Run the tests defined in this notebook through pytest with verbose output.
run_pytest(filename='13 Shared memory.ipynb', pytest_options=['-vvv'])


platform linux -- Python 3.6.6, pytest-3.8.2, py-1.6.0, pluggy-0.7.1 -- /usr/bin/python3
cachedir: .pytest_cache
rootdir: /home/ubuntu/jupyter_notebooks/Andre Brodtkorb/MilanoGPU2018/notebooks, inifile:
collecting ... collected 1 item

13 Shared memory.py::test_findMaxGPU <- <ipython-input-19-e561f4232239> PASSED [100%]



0