# Introduction to GPU Programming with Python
## Solutions to Notebook 5

### Polynomial evaluation

In [None]:
import numpy as np
from numba import jit,cuda

In [None]:
#Part 3: Modify polynomial function to make it work with numba.cuda
@cuda.jit
def cuda_polyval(result, array, coeffs):
    """Evaluate a polynomial at every element of ``array`` with Horner's method.

    Each CUDA thread processes the single element at its global 1-D index.

    Args:
        result: Output array; ``result[i]`` receives the polynomial value at
            ``array[i]``. Assumed to be at least as long as ``array``.
        array: Input array of evaluation points.
        coeffs: Polynomial coefficients, given in descending order of degree.
    """
    i = cuda.grid(1)  # equivalent to cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
    # Guard against surplus threads: when the launch grid contains more
    # threads than elements, the extras must not touch memory out of bounds.
    if i < array.size:
        x = array[i]
        val = coeffs[0]
        for coeff in coeffs[1:]:
            val = val * x + coeff
        result[i] = val

In [None]:
# Part 1: Allocate the input array (2048 * 1024 standard-normal samples stored
# as float32), the polynomial coefficients 1..9 (float32), and an uninitialised
# result array with the same shape and dtype as the input.
array = np.random.randn(2048 * 1024).astype(np.float32)
coeffs = np.arange(1, 10, dtype=np.float32)
result = np.empty(array.shape, dtype=array.dtype)

In [None]:
# Part 2: Prepare the launch grid: 2048 blocks of 1024 threads each
# (2048 * 1024 threads in total).
blocks, threads = 2048, 1024

In [None]:
#Part 4: Call the kernel and measure execution time
# NOTE(review): result, array and coeffs are host NumPy arrays here, so the
# measured time presumably includes host<->device copies in addition to the
# kernel itself — confirm against the Numba CUDA memory-management docs.
%timeit cuda_polyval[blocks,threads](result, array, coeffs)

In [None]:
#Part 5: Call the built-in NumPy polynomial function np.polyval(coeffs, array) and compare results
numpy_result = np.polyval(coeffs, array)
# np.max(np.abs(a - b)) is the largest element-wise absolute difference
# (nothing is divided by the reference value), so it is labelled as such.
print('Maximum absolute error compared to numpy.polyval:', np.max(np.abs(numpy_result - result)))

In [None]:
#Part 6: Go back to the kernel (Part 3) and modify it to make it work on CPU with @jit
# nopython=True is requested explicitly so compilation fails loudly instead of
# silently falling back to slow object mode on Numba releases where a bare
# @jit still permits that fallback.
@jit(nopython=True)
def host_polyval_CPU(result, array, coeffs):
    """Evaluate a polynomial over ``array`` on the CPU with Horner's method.

    Args:
        result: Output array; ``result[i]`` receives the polynomial value at
            ``array[i]``. Written in place.
        array: Input array of evaluation points.
        coeffs: Polynomial coefficients, given in descending order of degree.
    """
    for i in range(len(array)):
        x = array[i]
        val = coeffs[0]
        for coeff in coeffs[1:]:
            val = val * x + coeff
        result[i] = val

In [None]:
%timeit host_polyval_CPU(result, array, coeffs)
print('Maximum relative error compared to numpy.polyval:', np.max(np.abs(numpy_result - result)))

### Mandelbrot Example

In [None]:
import numpy as np
from matplotlib.pyplot import imshow, show
from timeit import default_timer as timer
from numba import jit,cuda

In [None]:
def mandel(x, y, max_iters):
    """Return the escape iteration of the point ``x + y*1j``.

    Starting from zero, the value is repeatedly squared and the point added.
    The index of the first iteration whose squared magnitude reaches 4 is
    returned; if that never happens within ``max_iters`` iterations,
    ``max_iters`` itself is returned.
    """
    point = complex(x, y)
    orbit = 0.0j
    for step in range(max_iters):
        orbit = orbit * orbit + point
        re_part, im_part = orbit.real, orbit.imag
        # Compare the squared magnitude against 4 (i.e. |orbit| >= 2).
        if re_part * re_part + im_part * im_part >= 4:
            return step
    return max_iters

In [None]:
#Part1 : Make a create_fractal function
def create_fractal(min_x, max_x, min_y, max_y, image, iters):
    """Fill ``image`` in place with Mandelbrot escape counts.

    The rectangle [min_x, max_x) x [min_y, max_y) of the complex plane is
    sampled on a grid of ``image.shape[1]`` columns by ``image.shape[0]``
    rows; ``image[row, col]`` receives ``mandel(real, imag, iters)`` for the
    corresponding sample point.
    """
    n_rows, n_cols = image.shape[0], image.shape[1]

    # Size of one pixel along each axis of the complex plane.
    step_re = (max_x - min_x) / n_cols
    step_im = (max_y - min_y) / n_rows

    for col in range(n_cols):
        re_part = min_x + col * step_re
        for row in range(n_rows):
            im_part = min_y + row * step_im
            image[row, col] = mandel(re_part, im_part, iters)


In [None]:
#Part 2: Next we create an empty array, size 1024x1024, type np.uint8. Call create_fractal with appropriate coordinates 
#to fit the whole Mandelbrot set. Then show the image. Measure the execution time.
# create_fractal fills `image` in place; each pixel stores the value returned
# by mandel for that sample point, with 20 as the iteration limit.
image = np.zeros((1024, 1024), dtype = np.uint8)
%timeit create_fractal(-2.0, 1.0, -1.0, 1.0, image, 20) 

# Display the filled array as an image.
imshow(image)
show()

In [None]:
#Part 3: Modify both mandel and create_fractal function and optimize/parallelize them with jit decorator 
#to work on the CPU
# nopython=True is requested explicitly so compilation fails loudly instead of
# silently falling back to slow object mode on Numba releases where a bare
# @jit still permits that fallback.
@jit(nopython=True)
def mandel(x, y, max_iters):
  """Return the escape iteration of the point ``x + y*1j``.

  Starting from zero, the value is repeatedly squared and the point added.
  The index of the first iteration whose squared magnitude reaches 4 is
  returned; otherwise ``max_iters`` is returned.
  """
  c = complex(x, y)
  z = 0.0j
  for i in range(max_iters):
    z = z*z + c
    # Squared magnitude >= 4 is the escape test (|z| >= 2).
    if (z.real*z.real + z.imag*z.imag) >= 4:
      return i

  return max_iters

# nopython=True is requested explicitly so compilation fails loudly instead of
# silently falling back to slow object mode on Numba releases where a bare
# @jit still permits that fallback.
@jit(nopython=True)
def create_fractal(min_x, max_x, min_y, max_y, image, iters):
  """Fill ``image`` in place with Mandelbrot escape counts.

  The rectangle [min_x, max_x) x [min_y, max_y) is sampled on a grid of
  ``image.shape[1]`` columns by ``image.shape[0]`` rows; ``image[y, x]``
  receives ``mandel(real, imag, iters)`` for the corresponding sample point.
  """
  height = image.shape[0]
  width = image.shape[1]

  # Size of one pixel along each axis of the complex plane.
  pixel_size_x = (max_x - min_x) / width
  pixel_size_y = (max_y - min_y) / height

  for x in range(width):
    real = min_x + x * pixel_size_x
    for y in range(height):
      imag = min_y + y * pixel_size_y
      image[y, x] = mandel(real, imag, iters)

In [None]:
#Part 4: Run again and measure the execution time
# Start from a freshly zeroed image so every pixel shown is written by this run.
image = np.zeros((1024, 1024), dtype = np.uint8)
%timeit create_fractal(-2.0, 1.0, -1.0, 1.0, image, 20) 

# Display the filled array as an image.
imshow(image)
show()

In [None]:
#Part 5: Write the kernel function mandel_kernel  with numba.cuda. Also modify mandel to mandel_gpu with cuda.jit
# Build the CUDA device function from the plain Python implementation. When
# `mandel` has already been compiled with @jit it is a Numba dispatcher whose
# original function is exposed as `.py_func`; otherwise `mandel` is used as is.
mandel_gpu = cuda.jit(device=True)(getattr(mandel, "py_func", mandel))

@cuda.jit
def mandel_kernel(min_x, max_x, min_y, max_y, image, iters):
  """CUDA kernel that fills ``image`` with Mandelbrot escape counts.

  Each thread starts at its global (x, y) index and advances by the total
  number of launched threads along each axis (a grid-stride loop), so the
  whole image is covered even when it is larger than the launch grid.
  ``image[row, col]`` receives ``mandel_gpu(real, imag, iters)``.
  """
  n_rows = image.shape[0]
  n_cols = image.shape[1]

  # Size of one pixel along each axis of the complex plane.
  step_re = (max_x - min_x) / n_cols
  step_im = (max_y - min_y) / n_rows

  # Global index of this thread and total thread count per axis.
  first_col, first_row = cuda.grid(2)
  stride_x = cuda.gridDim.x * cuda.blockDim.x
  stride_y = cuda.gridDim.y * cuda.blockDim.y

  for col in range(first_col, n_cols, stride_x):
    re_part = min_x + col * step_re
    for row in range(first_row, n_rows, stride_y):
      im_part = min_y + row * step_im
      image[row, col] = mandel_gpu(re_part, im_part, iters)

In [None]:
# Part 6: Create the output image and the CUDA launch configuration.
blockdim = (32, 8)   # second element of the kernel launch configuration
griddim = (32, 16)   # first element of the kernel launch configuration
image = np.zeros(shape=(1024, 1024), dtype=np.uint8)

In [None]:
#Part 7: Run the kernel. Also measure the execution time.
# NOTE(review): `image` is a host NumPy array, so the measured time presumably
# includes host<->device copies in addition to the kernel itself — confirm.
%timeit mandel_kernel[griddim,blockdim](-2.0, 1.0, -1.0, 1.0, image, 20) 