In [1]:
#Let's have matplotlib "inline"
%matplotlib inline

#Import packages we need
import numpy as np
from matplotlib import animation, rc
from matplotlib import pyplot as plt

#Notebook-wide matplotlib defaults:
# - large default figure size (note: too large for article-quality figures)
# - show animations as HTML5 video
plt.rcParams.update({
    'figure.figsize': (16.0, 12.0),
    'animation.html': 'html5',
})

In [2]:
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
from pycuda import gpuarray
from pycuda.tools import PageLockedMemoryPool

import time

In [3]:
#CUDA source for the Mandelbrot kernel: one thread per output pixel.
kernel_src = """
__global__ void mandelbrotKernel(float* output, unsigned int pitch, 
            unsigned int nx, unsigned int ny, 
            unsigned int iterations, 
            float x0, float y0, 
            float dx, float dy) {

    //Pixel coordinates handled by this thread
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    int j = blockIdx.y*blockDim.y + threadIdx.y;

    //Threads outside the nx-by-ny image do nothing
    if (i < nx && j < ny) {
        //Point in the complex plane for this pixel
        float x = i*dx + x0;
        float y = j*dy + y0;

        float2 z0 = make_float2(x, y);
        float2 z = z0;
        int k = 0;

        //Iterate z <- z*z + z0 until |z|^2 >= 25 or the iteration limit is hit
        while (z.x*z.x + z.y*z.y < 25.0 && k < iterations) {
            float tmp = z.x*z.x - z.y*z.y + z0.x;
            z.y = 2 * z.x*z.y + z0.y;
            z.x = tmp;
            ++k;
        }

        //pitch is the row stride in bytes; widen before multiplying so the
        //byte offset cannot wrap around in 32-bit arithmetic
        float* row = (float*)((char*)output + (size_t)j*pitch);

        //Write out result to GPU memory
        if (k < iterations) {
            //Escaped: fractional iteration count, scaled by 1/100 and wrapped by fmod
            row[i] = fmod((k - log(log(sqrt(z.x*z.x + z.y*z.y)) / log(5.0)) / log(2.0)) / 100, 1.0);
        }
        else {
            //Did not escape within the iteration limit
            row[i] = 0.0f;
        }
    }
}
"""
mandelbrotModule = SourceModule(kernel_src)
mandelbrotKernel = mandelbrotModule.get_function("mandelbrotKernel")

#Argument layout for prepared calls, as struct-module format characters:
#P = pointer, I = unsigned int, f = float.
#"I" matches the kernel's unsigned int parameters (pitch, nx, ny, iterations).
mandelbrotKernel.prepare("PIIIIffff")

#Pool of page-locked host memory, used for the downloaded images
memorypool = PageLockedMemoryPool()

In [4]:
def mandelbrot(nx, ny, iterations,
              x0, y0, 
              dx, dy,
              block_width=8, block_height=8):
    """Render one Mandelbrot image per zoom level on the GPU and download them.

    All kernel launches are enqueued on a single CUDA stream, followed by all
    downloads. Every launch and every download is bracketed by a pair of CUDA
    events, and a timing report is printed for both phases.

    Parameters
    ----------
    nx, ny : int
        Image width and height in pixels.
    iterations : int
        Maximum number of iterations per pixel.
    x0, y0 : sequence of float
        Coordinate of pixel (0, 0) for each zoom level.
    dx, dy : sequence of float
        Pixel spacing for each zoom level.
    block_width, block_height : int, optional
        CUDA thread-block dimensions.

    Returns
    -------
    list
        One (ny, nx) float32 host array per zoom level, allocated from the
        module-level page-locked ``memorypool``.

    Raises
    ------
    AssertionError
        If x0, y0, dx and dy do not all have the same length.
    """
    num_zooms = len(x0)
    assert num_zooms == len(y0)
    assert num_zooms == len(dx)
    assert num_zooms == len(dy)
    zooms = range(num_zooms)

    #Create block dimensions and grid dimensions.
    #Ceil-divide so partially filled blocks still cover the image edge:
    #x is sized from nx and y from ny.
    block = (block_width, block_height, 1)
    grid = (int((nx + block_width - 1) // block_width),
            int((ny + block_height - 1) // block_height),
            1)

    #Allocate GPU data, one image per zoom level
    output_gpu = [gpuarray.zeros((ny, nx), dtype=np.float32) for _ in zooms]

    #Create stream
    stream = cuda.Stream()

    #Create timing events (reused by the compute and the download phase)
    start_events = [cuda.Event() for _ in zooms]
    end_events = [cuda.Event() for _ in zooms]

    def run_phase(label, enqueue):
        """Enqueue enqueue(i) for every zoom level, wait, and print timings.

        Returns (host seconds spent enqueueing, summed GPU milliseconds).
        """
        #perf_counter is a monotonic, high-resolution clock for short intervals
        enqueue_start = time.perf_counter()
        for i in zooms:
            start_events[i].record(stream)
            enqueue(i)
            end_events[i].record(stream)
        enqueue_end = time.perf_counter()

        #Synchronize and accumulate the per-item GPU time
        sync_start = time.perf_counter()
        gpu_milliseconds = 0.0
        for i in zooms:
            end_events[i].synchronize()
            milliseconds = end_events[i].time_since(start_events[i])
            print("Iteration {:d} took {:f} ms".format(i, milliseconds))
            gpu_milliseconds += milliseconds
        sync_end = time.perf_counter()

        print(label)
        print("Enqueue:  {:f} s".format(enqueue_end - enqueue_start))
        print("Sync:     {:f} s".format(sync_end - sync_start))
        print("CPU time: {:f} s".format(enqueue_end + sync_end - enqueue_start - sync_start))
        print("GPU time: {:f} s".format(gpu_milliseconds * 1.0e-3))

        return enqueue_end - enqueue_start, gpu_milliseconds

    #Run kernel and generate images
    def launch(i):
        mandelbrotKernel.prepared_async_call(grid, block, stream, 
                                            output_gpu[i].ptr, np.uint32(output_gpu[i].strides[0]),
                                            np.uint32(nx), np.uint32(ny), np.uint32(iterations),
                                            np.float32(x0[i]), np.float32(y0[i]),
                                            np.float32(dx[i]), np.float32(dy[i]))

    enqueue_compute, gpu_time_compute = run_phase("Compute", launch)

    #Allocate CPU data from the page-locked pool
    retval = [memorypool.allocate((ny, nx), np.float32) for _ in zooms]

    #Download from GPU to CPU (device-to-host copy on the same stream)
    def download(i):
        output_gpu[i].get_async(stream=stream, ary=retval[i])

    enqueue_dl, gpu_time_dl = run_phase("Download", download)

    print("========")
    print("Averages")
    print("Enqueue compute:  {:f} ms".format(1.0e3*enqueue_compute / num_zooms))
    print("Enqueue download: {:f} ms".format(1.0e3*enqueue_dl / num_zooms))
    print("Kernel:           {:f} ms".format(gpu_time_compute / num_zooms))
    print("Download:         {:f} ms".format(gpu_time_dl / num_zooms))
    print("========")

    return retval

In [5]:
#Image resolution, iteration limit and number of frames
n = 1024
nx, ny = 3*n, 2*n
iterations = 5000
num_zooms = 5

#Point the frames are centred on, and per-frame shrink factor
x_center = -0.75 + 0.0025
y_center = 0.1
factor = 0.95

#Per-frame coordinate of pixel (0, 0) and per-frame pixel spacing
x0, y0, dx, dy = (np.empty(num_zooms, dtype=np.float32) for _ in range(4))

#First frame: a 3.0 x 2.0 window
dx[0] = 3.0 / nx
dy[0] = 2.0 / ny
x0[0] = x_center - 1.5
y0[0] = y_center - 1.0

#Each following frame shrinks the window by `factor`, keeping the centre fixed
for level in range(1, num_zooms):
    dx[level] = dx[level-1] * factor
    dy[level] = dy[level-1] * factor

    width = dx[level]*nx
    height = dy[level]*ny

    x0[level] = x_center - width/2
    y0[level] = y_center - height/2

    print("{:f} x {:f}".format(width, height))

results = mandelbrot(nx, ny, iterations, x0, y0, dx, dy)

2.850000 x 1.900000
2.707500 x 1.805000
2.572125 x 1.714750
2.443519 x 1.629012
Iteration 0 took 320.320770 ms
Iteration 1 took 337.097015 ms
Iteration 2 took 370.438354 ms
Iteration 3 took 405.441223 ms
Iteration 4 took 444.737946 ms
Compute
Enqueue:  0.001001 s
Sync:     1.880371 s
CPU time: 1.881371 s
GPU time: 1.878035 s
Iteration 0 took 15.237504 ms
Iteration 1 took 15.062144 ms
Iteration 2 took 15.012000 ms
Iteration 3 took 14.974656 ms
Iteration 4 took 15.241632 ms
Download
Enqueue:  0.000985 s
Sync:     0.076056 s
CPU time: 0.077041 s
GPU time: 0.075528 s
Averages
Enqueue compute:  0.200129 ms
Enqueue download: 0.197077 ms
Kernel:           375.607062 ms
Download:         15.105587 ms


In [6]:
dpi=300

#Figure sized so that it is nx-by-ny pixels at the chosen dpi,
#with a single axes covering the whole canvas and no axis decorations
fig = plt.figure(figsize=(nx/dpi, ny/dpi), dpi=dpi)
ax = fig.add_axes([0, 0, 1, 1])
im = ax.imshow(results[0], origin='lower', cmap="terrain", vmin=0.0, vmax=1.0)
ax.axis('off')
fig.tight_layout()

def animate(frame):
    """Show the image for the given frame index in the existing artist."""
    im.set_data(results[frame])

anim = animation.FuncAnimation(fig, animate, frames=range(len(results)), interval=150)
#Close the figure so the notebook does not also show a static image
plt.close(fig)

from matplotlib.animation import FFMpegWriter
from IPython.display import display, HTML

#Encode the animation to an mp4 file and embed it in the notebook
video_file = "mandelbrot.mp4"
writer = FFMpegWriter(fps=25)
anim.save(video_file, writer=writer)

video_html = """
<div align="middle">
<video width="80%" controls>
<source src="{:s}" type="video/mp4">
</video>
</div>
""".format(video_file)
display(HTML(video_html))

animation.py:350 - babrodtk-fix




