Pinned memory (page locked)


In [1]:
import pycuda.autoinit
import pycuda.driver as drv
import numpy as np
from pycuda.compiler import SourceModule

In [2]:
class GameOfLife(object):
    """Conway's Game of Life on a 2-D boolean grid, stepped on the GPU with PyCUDA.

    The grid is padded with dead cells up to a whole number of CUDA blocks.
    Host/device transfers are staged through a single page-locked host buffer
    and two device buffers that are allocated once in :meth:`reset` and reused
    by every call to :meth:`iterate`.

    Attributes set by the instance:
        shape: ``(rows, cols)`` of the visible grid.
        array: current state, a ``np.bool_`` array of ``shape``.
        blockSizeX, blockSizeY: CUDA block dimensions (threads per block).
        nBlocksX, nBlocksY: CUDA grid dimensions (blocks per axis).
        nbThreadX, nbThreadY: ``np.uint32`` copies of the block counts, passed
            to the kernel (despite the name they hold block counts).
        cpu_locked_buff: page-locked host staging buffer of the padded shape.
        gpu_buff: device buffer holding the kernel input.
        gpu_out_buff: device buffer receiving the kernel output.
        cuda_func: compiled ``bit_gol2`` kernel.
    """

    def __init__(self, shape):
        """Create a randomly initialised grid of ``shape`` and compile the kernel.

        Args:
            shape: ``(rows, cols)``; both must be positive.

        Raises:
            ValueError: if ``shape`` is not two positive dimensions.
        """
        if len(shape) != 2 or shape[0] <= 0 or shape[1] <= 0:
            raise ValueError(
                "shape must be (rows, cols) with positive sizes, got %r" % (shape,))
        self.shape = shape
        self.reset()
        self.init_cuda()

    def reset(self):
        """Randomise the grid and (re)allocate the host and device buffers."""
        self.array = np.random.randint(0, 2, (self.shape)).astype(np.bool_)

        # 30 x 30 = 900 threads per block.
        self.blockSizeX = 30
        self.blockSizeY = 30
        # Ceiling division: enough blocks to cover every column / row.
        self.nBlocksX = int((self.shape[1] + self.blockSizeX - 1) // self.blockSizeX)
        self.nBlocksY = int((self.shape[0] + self.blockSizeY - 1) // self.blockSizeY)
        self.nbThreadX = np.uint32(self.nBlocksX)
        self.nbThreadY = np.uint32(self.nBlocksY)

        # Page-locked optimisation: one pinned host buffer of the padded size,
        # plus separate device buffers for the kernel's input and output.
        sizeBuff = (self.blockSizeY * self.nBlocksY, self.blockSizeX * self.nBlocksX)
        self.cpu_locked_buff = drv.pagelocked_empty(sizeBuff, np.bool_)
        self.gpu_buff = drv.mem_alloc(self.cpu_locked_buff.nbytes)
        self.gpu_out_buff = drv.mem_alloc(self.cpu_locked_buff.nbytes)

    def iterate(self, array):
        """Return the next generation of ``array``.

        Args:
            array: grid of ``self.shape``; values are converted to ``np.bool_``
                when copied into the staging buffer.

        Returns:
            A new ``np.bool_`` array of ``self.shape``. It is a copy, so it is
            not affected by later calls that reuse the staging buffer.
        """
        rows, cols = self.shape[0], self.shape[1]
        staging = self.cpu_locked_buff

        # The padding must be dead on every step: the previous download may
        # have left live cells outside the visible region.
        staging.fill(False)
        staging[0:rows, 0:cols] = array

        drv.memcpy_htod(self.gpu_buff, staging)
        self.cuda_func(
            self.gpu_out_buff, self.gpu_buff, self.nbThreadX, self.nbThreadY,
            block=(self.blockSizeX, self.blockSizeY, 1),
            grid=(self.nBlocksX, self.nBlocksY))
        drv.memcpy_dtoh(staging, self.gpu_out_buff)

        return staging[0:rows, 0:cols].copy()

    def run(self, nbIteration=1):
        """Advance ``self.array`` by ``nbIteration`` generations."""
        for _ in range(nbIteration):
            self.array = self.iterate(self.array)

    def init_cuda(self):
        """Compile the ``bit_gol2`` kernel and store it in ``self.cuda_func``.

        Each thread handles one cell of the padded grid: it counts the eight
        neighbours ``N`` and writes the cell as alive when it was alive with
        ``N`` in {2, 3} or dead with ``N == 3``. Cells on the outer edge of the
        padded grid use ``N = 0`` and therefore always come out dead.
        """
        mod = SourceModule("""
            __global__ void bit_gol2(bool *dest, bool *a,
                unsigned int nbThreadX,unsigned int nbThreadY)
            {

                unsigned int x =  blockIdx.x*blockDim.x + threadIdx.x;
                unsigned int y =  blockIdx.y*blockDim.y + threadIdx.y;
                unsigned int nbRow = blockDim.y*nbThreadY;
                unsigned int nbCol = blockDim.x*nbThreadX;
                const unsigned int i = x + y*nbCol;
                unsigned int N;

                if(x > 0 && y > 0 && x < nbCol-1 && y < nbRow-1){
                    N = ( a[x-1 + (y-1)*nbCol] + a[x + (y-1)*nbCol] + a[x+1 + (y-1)*nbCol] +
                          a[x-1 + ( y )*nbCol] +                    + a[x+1 + ( y )*nbCol] +
                          a[x-1 + (y+1)*nbCol] + a[x + (y+1)*nbCol] + a[x+1 + (y+1)*nbCol] );
                }else{
                    N = 0;
                }

                dest[i] = (a[i] & (((N==2) | (N==3)))) 
                          | (!a[i] & N==3);            
                

            }
            """)

        self.cuda_func = mod.get_function("bit_gol2")

       
                    

   
