# Imports

In [1]:
import numpy as np
import cupy as cp
from numba import cuda, vectorize
from numpy import format_float_scientific as fs
import matplotlib.pyplot as plt
from matplotlib import cm
import time
import math

#from numba_kernels import * 
from Timers import TimersManager
from lbmFlowAroundCylinder import inivel, obstacle_fun

## Timers definition

In [2]:
# Create the timer manager and register one named timer per profiled stage.
# "main" wraps the whole run; the other names match the kernels launched
# in the main loop.
timers = TimersManager(gpu=True)
for timer_name in (
    "main",
    "equilibrium",
    "collision",
    "streaming",
    "macroscopic",
    "rightwall",
    "leftwall",
    "fin_inflow",
    "bounceback",
):
    timers.add(timer_name)
# Timers for host<->device transfers are currently disabled.
#timers.add("move_gpu->cpu")
#timers.add("move_cpu->gpu")

## Flow definitions

In [3]:
# Simulation parameters.
maxIter = 2000                  # Total number of time iterations.
Re = 150.0                      # Reynolds number.
nx = 4200                       # Number of lattice nodes along x.
ny = 4000                       # Number of lattice nodes along y.
ly = ny - 1                     # Height of the domain in lattice units.
# Cylinder placement, derived from the grid size.
cx = nx // 4                    # x coordinate of the cylinder.
cy = ny // 2                    # y coordinate of the cylinder.
r = ny // 9                     # Cylinder size (presumably its radius -- confirm in obstacle_fun).
uLB = 0.04                      # Velocity in lattice units.
nulb = uLB * r / Re             # Viscosity in lattice units.
omega = 1 / (3 * nulb + 0.5)    # Relaxation parameter.
save_figures = False            # When True, main_profile plots the velocity norm every 100 iterations.
profile = True                  # Selects main_profile (per-kernel timers) instead of main.

## Lattice constants

In [4]:
# The 9 lattice velocity vectors (9 directions of displacement), one row per
# population. Rows are ordered with the x component going 1, 0, -1 and,
# within each x value, the y component going 1, 0, -1.
v = np.array(
    [[vx, vy] for vx in (1, 0, -1) for vy in (1, 0, -1)],
    dtype=np.int32,
)

# Weight associated with each of the 9 directions, in the same order as `v`.
t = np.array(
    [
        1/36, 1/9, 1/36,
        1/9,  4/9, 1/9,
        1/36, 1/9, 1/36,
    ],
    dtype=np.float32,
)

# Population indices grouped by the x component of their velocity:
# col1 -> vx = +1, col2 -> vx = 0, col3 -> vx = -1.
col1 = np.arange(0, 3)
col2 = np.arange(3, 6)
col3 = np.arange(6, 9)

In [5]:
@cuda.jit
def rightwall_cuda(fin):
    """Right-wall (outflow) update, one thread per y index.

    For populations 6, 7 and 8, the value at the last x index is
    overwritten with the value at the x index just before it.

    fin -- populations, indexed as fin[ipop, x, y]; modified in place.
    """
    row = cuda.grid(1)
    # Threads beyond the y extent have nothing to do.
    if row >= fin.shape[2]:
        return
    last_x = fin.shape[1] - 1
    for ipop in range(6, 9):
        fin[ipop, last_x, row] = fin[ipop, last_x - 1, row]


@cuda.jit
def macroscopic_cuda(fin, v, rho_out, u_out):
    """Compute density and velocity at every node, one thread per (x, y).

    fin     -- populations, indexed as fin[ipop, x, y] (read only).
    v       -- lattice velocities, indexed as v[ipop, component].
    rho_out -- output density, rho_out[x, y] = sum of the 9 populations.
    u_out   -- output velocity, u_out[c, x, y] = sum_i v[i, c] * fin[i, x, y] / rho.
    """
    i, j = cuda.grid(2)
    # Skip threads that fall outside the output array.
    if i >= rho_out.shape[0] or j >= rho_out.shape[1]:
        return

    density = 0
    momentum_x = 0
    momentum_y = 0
    for ipop in range(9):
        pop = fin[ipop, i, j]
        density += pop
        momentum_x += v[ipop, 0] * pop
        momentum_y += v[ipop, 1] * pop

    rho_out[i, j] = density
    u_out[0, i, j] = momentum_x / density
    u_out[1, i, j] = momentum_y / density



@cuda.jit
def leftwall_cuda(fin, vel, u_out, rho_out):
    """Left-wall (inflow) update at x = 0, one thread per y index.

    The velocity at x = 0 is overwritten with the prescribed velocity `vel`,
    and the density at x = 0 is recomputed from populations 3..8 and the
    prescribed x velocity.

    fin     -- populations, indexed as fin[ipop, x, y] (read only).
    vel     -- prescribed velocity, indexed as vel[component, x, y].
    u_out   -- velocity field, column x = 0 is overwritten.
    rho_out -- density field, column x = 0 is overwritten.
    """
    row = cuda.grid(1)
    if row < rho_out.shape[1]:
        inlet_ux = vel[0, 0, row]
        u_out[0, 0, row] = inlet_ux
        u_out[1, 0, row] = vel[1, 0, row]

        # Populations 3, 4, 5 count once; populations 6, 7, 8 count twice.
        single = fin[3, 0, row] + fin[4, 0, row] + fin[5, 0, row]
        double = fin[6, 0, row] + fin[7, 0, row] + fin[8, 0, row]
        rho_out[0, row] = (1 / (1 - inlet_ux)) * (single + 2 * double)


@cuda.jit
def equilibrium_cuda(rho, u, v, t, feq_out):
    """Compute the equilibrium populations at every node, one thread per (x, y).

    feq_out[i, x, y] = rho * t[i] * (1 + cu + 0.5 * cu^2 - 1.5 * |u|^2)
    with cu = 3 * (v[i] . u).

    rho     -- density, indexed as rho[x, y].
    u       -- velocity, indexed as u[component, x, y].
    v       -- lattice velocities, indexed as v[ipop, component].
    t       -- lattice weights, indexed as t[ipop].
    feq_out -- output populations, indexed as feq_out[ipop, x, y].
    """
    i, j = cuda.grid(2)
    if i >= feq_out.shape[1] or j >= feq_out.shape[2]:
        return

    # Node-local values, read once and reused for the 9 directions.
    vel_x = u[0, i, j]
    vel_y = u[1, i, j]
    density = rho[i, j]
    usqr = 1.5 * (vel_x * vel_x + vel_y * vel_y)

    for ipop in range(9):
        cu = 3 * (v[ipop, 0] * vel_x + v[ipop, 1] * vel_y)
        feq_out[ipop, i, j] = density * t[ipop] * (1 + cu + 0.5 * cu * cu - usqr)



@cuda.jit
def fin_inflow_cuda(feq, fin_out):
    """Rebuild populations 0, 1, 2 at x = 0, one thread per y index.

    For k in {0, 1, 2}:
        fin_out[k, 0, y] = feq[k, 0, y] + fin_out[8 - k, 0, y] - feq[8 - k, 0, y]

    feq     -- equilibrium populations, indexed as feq[ipop, x, y] (read only).
    fin_out -- populations, column x = 0 of populations 0..2 is overwritten.
    """
    row = cuda.grid(1)
    if row >= fin_out.shape[2]:
        return
    for k in range(3):
        mirror = 8 - k  # index paired with k: 0<->8, 1<->7, 2<->6
        fin_out[k, 0, row] = feq[k, 0, row] + fin_out[mirror, 0, row] - feq[mirror, 0, row]



@cuda.jit
def collision_cuda(fin, feq, f_out):
    """Collision step, one thread per (x, y).

    f_out[i, x, y] = fin[i, x, y] - omega * (fin[i, x, y] - feq[i, x, y])

    `omega` is read from the enclosing module, not passed as an argument.
    NOTE(review): Numba is documented to treat globals as compile-time
    constants, so changing `omega` after the first launch presumably has no
    effect until this kernel is redefined -- confirm.

    fin   -- incoming populations, indexed as fin[ipop, x, y] (read only).
    feq   -- equilibrium populations, same indexing (read only).
    f_out -- post-collision populations, same indexing (written).
    """
    i, j = cuda.grid(2)
    if i >= f_out.shape[1] or j >= f_out.shape[2]:
        return
    for ipop in range(9):
        pop = fin[ipop, i, j]
        f_out[ipop, i, j] = pop - omega * (pop - feq[ipop, i, j])



@cuda.jit
def bounceback_cuda(fin, obstacle, f_out):
    """Bounce-back on obstacle nodes, one thread per (x, y).

    Wherever obstacle[x, y] == 1, every population i of f_out is replaced by
    population 8 - i of fin at the same node. Other nodes are left untouched.

    fin      -- incoming populations, indexed as fin[ipop, x, y] (read only).
    obstacle -- mask indexed as obstacle[x, y]; the value 1 marks an obstacle node.
    f_out    -- outgoing populations, overwritten on obstacle nodes only.
    """
    i, j = cuda.grid(2)
    if i >= obstacle.shape[0] or j >= obstacle.shape[1]:
        return
    if obstacle[i, j] != 1:
        return
    for ipop in range(9):
        f_out[ipop, i, j] = fin[8 - ipop, i, j]



@cuda.jit
def streaming_cuda(fout, v, fin_out):
    """Streaming step with periodic wrap-around, one thread per (x, y).

    fin_out[i, x, y] = fout[i, (x - v[i, 0]) mod size_x, (y - v[i, 1]) mod size_y]

    The domain extents are taken from `fout.shape` rather than from the
    module-level `nx` / `ny`, so the wrap-around always matches the arrays
    actually passed in (the other kernels already derive their bounds from
    the array shapes).

    fout    -- post-collision populations, indexed as fout[ipop, x, y] (read only).
    v       -- lattice velocities, indexed as v[ipop, component]; each
               component is expected to be -1, 0 or 1 so that a single
               correction is enough to wrap the index.
    fin_out -- streamed populations, same indexing as fout (written).
    """
    x, y = cuda.grid(2)
    size_x = fout.shape[1]
    size_y = fout.shape[2]
    if x < size_x and y < size_y:
        for ipop in range(9):
            # Source node along x, wrapped periodically.
            i_out = x - v[ipop, 0]
            if i_out < 0:
                i_out += size_x
            if i_out > size_x - 1:
                i_out -= size_x
            # Source node along y, wrapped periodically.
            j_out = y - v[ipop, 1]
            if j_out < 0:
                j_out += size_y
            if j_out > size_y - 1:
                j_out -= size_y
            fin_out[ipop, x, y] = fout[ipop, i_out, j_out]

In [6]:
def macroscopic(fin):
    """Compute macroscopic variables (density, velocity) on the host.

    The fluid density is the 0th moment of the distribution functions and
    the fluid velocity components are their 1st-order moments.

    The output shape is derived from `fin` itself instead of the module-level
    `nx` / `ny`, so the function works for any grid size.

    Parameters
    ----------
    fin : ndarray, shape (9, X, Y)
        Populations, indexed as fin[ipop, x, y].

    Returns
    -------
    rho : ndarray, shape (X, Y)
        Sum of the 9 populations at each node.
    u : ndarray, shape (2, X, Y), float64
        Velocity: sum_i v[i, c] * fin[i] / rho for c in {0, 1}.
    """
    rho = np.sum(fin, axis=0)
    u = np.zeros((2,) + fin.shape[1:])
    for i in range(9):
        u[0] += v[i, 0] * fin[i]
        u[1] += v[i, 1] * fin[i]
    u /= rho
    return rho, u

def equilibrium(rho, u):
    """Equilibrium distribution function, computed on the host.

    feq[i] = rho * t[i] * (1 + cu + 0.5 * cu**2 - 1.5 * |u|**2)
    with cu = 3 * (v[i] . u).

    The output shape is derived from `u` instead of the module-level
    `nx` / `ny`, so the function works for any grid size.

    Parameters
    ----------
    rho : scalar or ndarray, shape (X, Y)
        Density (a scalar is broadcast over the whole grid).
    u : ndarray, shape (2, X, Y)
        Velocity, indexed as u[component, x, y].

    Returns
    -------
    feq : ndarray, shape (9, X, Y), float64
        Axis 0 is the direction of displacement of the particle;
        axes 1 and 2 are the x and y position.
    """
    usqr = 3/2 * (u[0]**2 + u[1]**2)
    feq = np.zeros((9,) + u.shape[1:])
    for i in range(9):
        cu = 3 * (v[i, 0]*u[0] + v[i, 1]*u[1])
        feq[i] = rho*t[i] * (1 + cu + 0.5*cu**2 - usqr)
    return feq

# Code main functions

## Kernel parameters

In [7]:
# Launch configuration for every kernel.
# 1D kernels (wall conditions) use one thread per y index; 2D kernels use one
# thread per (x, y) node. In both cases the number of blocks is the node
# count divided by the block size, rounded up.
def _blocks_needed(n_nodes, threads):
    """Return the number of blocks of `threads` threads needed to cover `n_nodes`."""
    return math.ceil(n_nodes / threads)


##### Right Wall #####
rig_threadsperblock = 16
rig_blockspergrid_y = _blocks_needed(ny, rig_threadsperblock)
rig_blockspergrid = rig_blockspergrid_y  # plain int (1D grid)

##### Macroscopic #####
mac_threadsperblock = (8, 8)
mac_blockspergrid_x = _blocks_needed(nx, mac_threadsperblock[0])
mac_blockspergrid_y = _blocks_needed(ny, mac_threadsperblock[1])
mac_blockspergrid = (mac_blockspergrid_x, mac_blockspergrid_y)

##### Left Wall #####
lef_threadsperblock = 16
lef_blockspergrid_y = _blocks_needed(ny, lef_threadsperblock)
lef_blockspergrid = lef_blockspergrid_y  # plain int (1D grid)

##### Equilibrium #####
equ_threadsperblock = (8, 8)
equ_blockspergrid_x = _blocks_needed(nx, equ_threadsperblock[0])
equ_blockspergrid_y = _blocks_needed(ny, equ_threadsperblock[1])
equ_blockspergrid = (equ_blockspergrid_x, equ_blockspergrid_y)

##### Fin_Inflow #####
inf_threadsperblock = 16
inf_blockspergrid_y = _blocks_needed(ny, inf_threadsperblock)
inf_blockspergrid = inf_blockspergrid_y  # plain int (1D grid)

##### Collision #####
col_threadsperblock = (8, 8)
col_blockspergrid_x = _blocks_needed(nx, col_threadsperblock[0])
col_blockspergrid_y = _blocks_needed(ny, col_threadsperblock[1])
col_blockspergrid = (col_blockspergrid_x, col_blockspergrid_y)

##### BounceBack #####
bou_threadsperblock = (8, 8)
bou_blockspergrid_x = _blocks_needed(nx, bou_threadsperblock[0])
bou_blockspergrid_y = _blocks_needed(ny, bou_threadsperblock[1])
bou_blockspergrid = (bou_blockspergrid_x, bou_blockspergrid_y)

##### Streaming #####
str_threadsperblock = (8, 8)
str_blockspergrid_x = _blocks_needed(nx, str_threadsperblock[0])
str_blockspergrid_y = _blocks_needed(ny, str_threadsperblock[1])
str_blockspergrid = (str_blockspergrid_x, str_blockspergrid_y)

### Main loop

In [8]:
def oneLoop(obstacle, vel, v, t, fin, rho, u, feq, fout):
    """Run one time iteration by launching the eight kernels in sequence.

    obstacle -- obstacle mask, indexed [x, y].
    vel      -- prescribed inflow velocity, indexed [component, x, y].
    v, t     -- lattice velocities and weights.
    fin      -- populations (updated in place by the wall kernels and
                overwritten by the streaming step).
    rho, u   -- density and velocity work arrays (overwritten).
    feq      -- equilibrium populations work array (overwritten).
    fout     -- post-collision populations work array (overwritten).
    """
    # (kernel, blocks per grid, threads per block, arguments), in launch order.
    stages = (
        (rightwall_cuda, rig_blockspergrid, rig_threadsperblock, (fin,)),
        (macroscopic_cuda, mac_blockspergrid, mac_threadsperblock, (fin, v, rho, u)),
        (leftwall_cuda, lef_blockspergrid, lef_threadsperblock, (fin, vel, u, rho)),
        (equilibrium_cuda, equ_blockspergrid, equ_threadsperblock, (rho, u, v, t, feq)),
        (fin_inflow_cuda, inf_blockspergrid, inf_threadsperblock, (feq, fin)),
        (collision_cuda, col_blockspergrid, col_threadsperblock, (fin, feq, fout)),
        (bounceback_cuda, bou_blockspergrid, bou_threadsperblock, (fin, obstacle, fout)),
        (streaming_cuda, str_blockspergrid, str_threadsperblock, (fout, v, fin)),
    )
    for kernel, blocks, threads, args in stages:
        kernel[blocks, threads](*args)
        

In [9]:
def main(v, t):
    """Run the simulation for `maxIter` iterations without per-kernel timers.

    All work arrays are allocated on the device once, then `oneLoop` is
    called `maxIter` times.

    v -- host array of lattice velocities, indexed [ipop, component].
    t -- host array of lattice weights, indexed [ipop].
    """
    # Copy the lattice constants to the device once. Passing the host arrays
    # to every kernel launch would make Numba transfer them implicitly on
    # each call.
    v_device = cuda.to_device(v)
    t_device = cuda.to_device(t)

    # create obstacle mask array from element-wise function
    obstacle_device = cuda.to_device(np.fromfunction(obstacle_fun, (nx,ny), dtype=np.float32))

    # initial velocity field vx,vy from element-wise function
    # vel is also used for inflow border condition
    vel = np.fromfunction(inivel, (2,nx,ny), dtype=np.float32)
    vel_device = cuda.to_device(vel)

    # Initialization of the populations at equilibrium
    # with the given velocity.
    fin_device = cuda.to_device(equilibrium(1, vel).astype(np.float32))

    # Work arrays are sized from the shape of the populations; this avoids
    # handing the device array itself to NumPy (np.zeros_like).
    pop_shape = tuple(fin_device.shape)
    rho_device = cuda.to_device(np.zeros(pop_shape[1:], dtype=np.float32))
    u_device = cuda.to_device(np.zeros((2, nx, ny), dtype=np.float32))
    feq_device = cuda.to_device(np.zeros(pop_shape, dtype=np.float32))
    fout_device = cuda.to_device(np.zeros(pop_shape, dtype=np.float32))

    # The loop index is unused; `_` also avoids shadowing the `time` module.
    for _ in range(maxIter):
        oneLoop(obstacle_device, vel_device, v_device, t_device, fin_device,
                rho_device, u_device, feq_device, fout_device)
        

def main_profile(v, t):
    """Run the simulation for `maxIter` iterations with one timer per kernel.

    Each kernel launch is bracketed by the start()/end() calls of the timer
    of the same name and is launched on that timer's stream.

    Changes with respect to the previous version:
      * the initial populations are computed from the prescribed velocity
        `vel` (as in `main`), not from the all-zero `u` array;
      * the initial equilibrium launch uses the device copies of `v` and `t`;
      * the collision kernel is launched on its timer's stream, like every
        other kernel;
      * the loop index no longer shadows the imported `time` module.

    v -- host array of lattice velocities, indexed [ipop, component].
    t -- host array of lattice weights, indexed [ipop].
    """
    v_device = cuda.to_device(v)
    t_device = cuda.to_device(t)

    # create obstacle mask array from element-wise function
    obstacle = np.fromfunction(obstacle_fun, (nx,ny), dtype=np.float32)
    obstacle_device = cuda.to_device(obstacle)

    # initial velocity field vx,vy from element-wise function
    # vel is also used for inflow border condition
    vel = np.fromfunction(inivel, (2,nx,ny), dtype=np.float32)
    vel_device = cuda.to_device(vel)

    # Unit density everywhere for the initial state.
    rho = np.ones(shape=(nx, ny), dtype=np.float32)
    rho_device = cuda.to_device(rho)

    # Host copy of the velocity is kept: it is the target of copy_to_host
    # when figures are plotted.
    u = np.zeros((2, nx, ny), dtype=np.float32)
    u_device = cuda.to_device(u)

    fin_device = cuda.to_device(np.zeros((9, nx, ny), dtype=np.float32))
    feq_device = cuda.to_device(np.zeros((9, nx, ny), dtype=np.float32))
    fout_device = cuda.to_device(np.zeros((9, nx, ny), dtype=np.float32))

    # Initialization of the populations at equilibrium
    # with the given velocity.
    equilibrium_cuda[equ_blockspergrid, equ_threadsperblock](
        rho_device, vel_device, v_device, t_device, fin_device)

    ###### Main time loop ########
    for it in range(maxIter):
        # Right wall: outflow condition.
        # we only need here to specify distrib. function for velocities
        # that enter the domain (other that go out, are set by the streaming step)
        timers.get("rightwall").start()
        rightwall_cuda[rig_blockspergrid, rig_threadsperblock,
                       timers.get("rightwall").getStream()](fin_device)
        timers.get("rightwall").end()

        # Compute macroscopic variables, density and velocity.
        timers.get("macroscopic").start()
        macroscopic_cuda[mac_blockspergrid, mac_threadsperblock,
                         timers.get("macroscopic").getStream()](fin_device, v_device,
                                                                rho_device, u_device)
        timers.get("macroscopic").end()

        # Left wall: inflow condition.
        timers.get("leftwall").start()
        leftwall_cuda[lef_blockspergrid, lef_threadsperblock,
                      timers.get("leftwall").getStream()](fin_device, vel_device,
                                                          u_device, rho_device)
        timers.get("leftwall").end()

        # Compute equilibrium.
        timers.get("equilibrium").start()
        equilibrium_cuda[equ_blockspergrid, equ_threadsperblock,
                         timers.get("equilibrium").getStream()](rho_device, u_device,
                                                                v_device, t_device, feq_device)
        timers.get("equilibrium").end()

        # Rebuild the incoming populations at the inflow wall:
        # fin[[0,1,2],0,:] = feq[[0,1,2],0,:] + fin[[8,7,6],0,:] - feq[[8,7,6],0,:]
        timers.get("fin_inflow").start()
        fin_inflow_cuda[inf_blockspergrid, inf_threadsperblock,
                        timers.get("fin_inflow").getStream()](feq_device, fin_device)
        timers.get("fin_inflow").end()

        # Collision step: fout = fin - omega * (fin - feq)
        timers.get("collision").start()
        collision_cuda[col_blockspergrid, col_threadsperblock,
                       timers.get("collision").getStream()](fin_device, feq_device, fout_device)
        timers.get("collision").end()

        # Bounce-back condition for obstacle.
        timers.get("bounceback").start()
        bounceback_cuda[bou_blockspergrid, bou_threadsperblock,
                        timers.get("bounceback").getStream()](fin_device, obstacle_device,
                                                              fout_device)
        timers.get("bounceback").end()

        # Streaming step.
        timers.get("streaming").start()
        streaming_cuda[str_blockspergrid, str_threadsperblock,
                       timers.get("streaming").getStream()](fout_device, v_device, fin_device)
        timers.get("streaming").end()

        # Every 100 iterations, optionally display the velocity norm.
        if (it % 100 == 0) and save_figures:
            plt.clf()
            u_device.copy_to_host(u)
            plt.imshow(np.sqrt(u[0]**2+u[1]**2).transpose(), cmap=cm.Reds)
            plt.show()
            #plt.savefig("figures/vel.{0:04d}.png".format(it//100))


In [10]:
# Run the simulation once, bracketed by the "main" timer. `profile` selects
# the variant with per-kernel timers (main_profile) or the plain one (main).
run_simulation = main_profile if profile else main
timers.get("main").start()
run_simulation(v, t)
timers.get("main").end()

# Warnings ignorés pour l'instant, à régler à la fin



In [11]:
# Report: total of the "main" timer measures, then the per-timer summaries
# printed by the timer manager.
main_measures = timers.get("main").getMeasures()
total = np.sum(main_measures)
print(f"Total time : {total:4.2f}s")
timers.printInfo()
# NOTE(review): the last argument is presumably the size in bytes of one
# value (float32 = 4) -- confirm against TimersManager.printBd.
timers.printBd(nx, ny, 4)
timers.printGflops(nx, ny)

Total time : 17.61s
--> Timer 'main         ' : N =    1 | Mean 1.761e+01 +- 0.e+00     | 100.0% of total time.
--> Timer 'equilibrium  ' : N = 2000 | Mean 1.641e-03 +- 1.08e-04   | 18.64% of total time.
--> Timer 'collision    ' : N = 2000 | Mean 2.293e-03 +- 1.421e-03  | 26.04% of total time.
--> Timer 'streaming    ' : N = 2000 | Mean 1.78e-03  +- 1.833e-03  | 20.22% of total time.
--> Timer 'macroscopic  ' : N = 2000 | Mean 1.159e-03 +- 1.765e-03  | 13.16% of total time.
--> Timer 'rightwall    ' : N = 2000 | Mean 9.272e-05 +- 1.151e-03  |  1.05% of total time.
--> Timer 'leftwall     ' : N = 2000 | Mean 1.764e-04 +- 3.408e-03  |   2.0% of total time.
--> Timer 'fin_inflow   ' : N = 2000 | Mean 1.15e-04  +- 1.599e-03  |  1.31% of total time.
--> Timer 'bounceback   ' : N = 2000 | Mean 3.358e-04 +- 1.235e-03  |  3.81% of total time.
--> Timer 'remains      ' : N =    1 | Mean 2.424e+00 | 13.76% of total time
mem bandwidth equilibrium   : 818.983 GB/s
mem bandwidth collision     : 10

## Tests