<h1>Introduction</h1>

This notebook demonstrates how to scale a synthetic form of Full Waveform Inversion (FWI) using multiple FPGAs. The FWI algorithm uses a Finite Difference Forward model and a Conjugate Gradient Descent optimization algorithm for the inversion. In the first step, we connect to an existing Dask cluster using its scheduler's IP address.

In [1]:
from dask.distributed import Client, progress, get_worker
import os
import binascii

# Address of the Dask scheduler. Set the DASK_SCHEDULER_ADDRESS environment
# variable to point the notebook at another cluster without editing this cell;
# otherwise the address the notebook was originally run against is used.
SCHEDULER_ADDRESS = os.environ.get("DASK_SCHEDULER_ADDRESS", "tcp://10.1.212.127:8786")
client = Client(SCHEDULER_ADDRESS)
client

0,1
Connection method: Direct,
Dashboard: http://10.1.212.127:8787/status,

0,1
Comm: tcp://10.1.212.127:8786,Workers: 2
Dashboard: http://10.1.212.127:8787/status,Total threads: 2
Started: 1 minute ago,Total memory: 0 B

0,1
Comm: tcp://10.1.212.127:35789,Total threads: 1
Dashboard: http://10.1.212.127:45347/status,Memory: 0 B
Nanny: None,
Local directory: /mnt/scratch/ldierick/octoray/fwi/dask-worker-space/worker-roe782m7,Local directory: /mnt/scratch/ldierick/octoray/fwi/dask-worker-space/worker-roe782m7
Tasks executing: 0,Tasks in memory: 0
Tasks ready: 0,Tasks in flight: 0
CPU usage: 2.0%,Last seen: Just now
Memory usage: 134.21 MiB,Spilled bytes: 0 B
Read bytes: 2.28 kiB,Write bytes: 2.28 kiB

0,1
Comm: tcp://10.1.212.127:42299,Total threads: 1
Dashboard: http://10.1.212.127:36361/status,Memory: 0 B
Nanny: None,
Local directory: /mnt/scratch/ldierick/octoray/fwi/dask-worker-space/worker-6xwsf6g3,Local directory: /mnt/scratch/ldierick/octoray/fwi/dask-worker-space/worker-6xwsf6g3
Tasks executing: 0,Tasks in memory: 0
Tasks ready: 0,Tasks in flight: 0
CPU usage: 2.0%,Last seen: Just now
Memory usage: 135.97 MiB,Spilled bytes: 0 B
Read bytes: 4.37 kiB,Write bytes: 6.61 kiB


<h1> Define experiment parameters</h1>

DIR_PATH => path to the directory with the input/output folders  

XCLBIN_PATH_DEFAULT => Default path for the .xclbin file if one is not provided via command line args  

DEVICE_NAME_DEFAULT => Default name for the FPGA device if one is not provided via command line args  

In [2]:
# Directory that holds the input/output folders.
DIR_PATH = "default/"

# Bitstream used when no other .xclbin is supplied.
XCLBIN_PATH_DEFAULT = "bitstreams/u280_xclbin/500_250_HBM/FullW.xclbin"

# Bitstream loaded by the multi-compute-unit runs (version_1 / version_2);
# it currently points at the same file as the default.
XCLBIN_PATH_MULTCU = "bitstreams/u280_xclbin/500_250_HBM/FullW.xclbin"

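Start each Dask worker with the following command, replacing the address with the address of your Dask scheduler:

`dask-worker tcp://10.1.212.126:8786 --preload pynqimport.py --memory-limit 0 --no-nanny --nthreads 1`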


In [3]:
# Problem size shared by the helper functions below: number of rows and
# columns of the 2-D test matrices.
ROW, COL = 500, 250

def init_hw_dotProduct(s_in_V,s2_in_V):
    """Fill the dot-product input buffers in place with a deterministic pattern.

    The sizes are taken from the buffers themselves rather than from
    hard-coded constants, so the function stays correct when the problem
    size changes and does not depend on notebook-level globals.

    Parameters
    ----------
    s_in_V : 2-D indexable (rows x cols), written as ``s_in_V[row][col]``
        Receives ``complex(row, row * col * 0.33)``.
    s2_in_V : 1-D indexable (cols)
        Receives ``float(col)``.

    Returns
    -------
    tuple
        The same two objects ``(s_in_V, s2_in_V)`` after being filled.
    """
    n_rows = len(s_in_V)
    n_cols = len(s2_in_V)

    for col in range(n_cols):
        s2_in_V[col] = col * 1.0
        for row in range(n_rows):
            s_in_V[row][col] = complex(row * 1.0, row * col * 0.33)

    return s_in_V, s2_in_V

def init_hw_updateDirection(resV_V,kappa_V):
    """Fill the update-direction input buffers in place with a deterministic pattern.

    The sizes are taken from the buffers themselves, so the function does not
    depend on notebook-level constants.

    Parameters
    ----------
    resV_V : 1-D indexable (rows)
        Receives ``complex(row, row * 0.33)``.
    kappa_V : 2-D indexable (rows x cols), written as ``kappa_V[row][col]``
        Receives ``complex(row, row * col * 0.33)``.

    Returns
    -------
    tuple
        The same two objects ``(resV_V, kappa_V)`` after being filled.
    """
    for row in range(len(resV_V)):
        resV_V[row] = complex(row * 1.0, row * 0.33)
        n_cols = len(kappa_V[row])
        for col in range(n_cols):
            kappa_V[row][col] = complex(row * 1.0, row * col * 0.33)
    return resV_V, kappa_V


def init_sw_dotProduct(rows=None, cols=None):
    """Build the software reference inputs for the dot-product kernel.

    Produces the same test pattern that ``init_hw_dotProduct`` writes into the
    device buffers, but in freshly allocated NumPy arrays.

    Parameters
    ----------
    rows, cols : int, optional
        Matrix dimensions. When omitted, the notebook-level ``ROW`` and
        ``COL`` constants are used.

    Returns
    -------
    (s_in_SW, s2_in_SW)
        ``s_in_SW`` is a ``(rows, cols)`` complex64 array with
        ``s_in_SW[r, c] = r + 1j * (r * c * 0.33)``;
        ``s2_in_SW`` is a ``(cols,)`` float32 array with ``s2_in_SW[c] = c``.
    """
    # Imported locally: the notebook has no module-level numpy import, and the
    # other functions in this notebook import their dependencies the same way.
    import numpy as np

    n_rows = ROW if rows is None else rows
    n_cols = COL if cols is None else cols

    row_idx = np.arange(n_rows)
    col_idx = np.arange(n_cols)

    s_in_SW = np.zeros(shape=(n_rows, n_cols), dtype=np.complex64)
    # Real and imaginary parts are filled separately so each is rounded to
    # float32 exactly as an element-by-element assignment would round it.
    s_in_SW.real = row_idx[:, None]
    s_in_SW.imag = row_idx[:, None] * col_idx[None, :] * 0.33

    s2_in_SW = col_idx.astype(np.float32)

    return s_in_SW, s2_in_SW

def init_sw_updateDirection(rows=None, cols=None):
    """Build the software reference inputs for the update-direction kernel.

    Produces the same test pattern that ``init_hw_updateDirection`` writes
    into the device buffers, but in freshly allocated NumPy arrays.

    Parameters
    ----------
    rows, cols : int, optional
        Dimensions of ``kappa``. When omitted, the notebook-level ``ROW`` and
        ``COL`` constants are used.

    Returns
    -------
    (res_SW, kappa_SW)
        ``res_SW`` is a ``(rows,)`` complex64 array with
        ``res_SW[r] = r + 1j * (r * 0.33)``;
        ``kappa_SW`` is a ``(rows, cols)`` complex64 array with
        ``kappa_SW[r, c] = r + 1j * (r * c * 0.33)``.
    """
    # Imported locally: the notebook has no module-level numpy import.
    import numpy as np

    n_rows = ROW if rows is None else rows
    n_cols = COL if cols is None else cols

    row_idx = np.arange(n_rows)
    col_idx = np.arange(n_cols)

    res_SW = np.zeros(shape=(n_rows,), dtype=np.complex64)
    res_SW.real = row_idx
    res_SW.imag = row_idx * 0.33

    kappa_SW = np.zeros(shape=(n_rows, n_cols), dtype=np.complex64)
    kappa_SW.real = row_idx[:, None]
    kappa_SW.imag = row_idx[:, None] * col_idx[None, :] * 0.33

    return res_SW, kappa_SW

def dotProduct_SW(s_in, s2_in):
    """Software reference for the dot-product kernel: a matrix-vector product.

    Parameters
    ----------
    s_in : array-like, shape (rows, cols)
        Complex input matrix.
    s2_in : array-like, shape (cols,)
        Real input vector.

    Returns
    -------
    numpy.ndarray, shape (rows,), dtype complex64
        ``out[r] = sum_c s_in[r, c] * s2_in[c]``.
    """
    # Imported locally: the notebook has no module-level numpy import.
    import numpy as np

    # One matrix-vector product replaces the per-row loop; the output length
    # follows the input instead of a notebook-level constant.
    return np.dot(np.asarray(s_in), np.asarray(s2_in)).astype(np.complex64)

def updateDirection_SW(res, kappa):
    """Software reference for the update-direction kernel.

    Accumulates, over all rows, the residual multiplied by the complex
    conjugate of the matching row of ``kappa``.

    Parameters
    ----------
    res : array-like, shape (rows,)
        Complex residual vector.
    kappa : array-like, shape (rows, cols)
        Complex matrix.

    Returns
    -------
    numpy.ndarray, shape (cols,), dtype complex64
        ``out[c] = sum_r res[r] * conj(kappa[r, c])``.
    """
    # Imported locally: the notebook has no module-level numpy import.
    import numpy as np

    res = np.asarray(res)
    kappa = np.asarray(kappa)

    # ndarray.conjugate() returns a new array and does not modify in place,
    # so the conjugated values must be used explicitly in the product.
    # NOTE(review): confirm that the hardware kernel also applies the
    # conjugate before comparing results against this reference.
    return np.dot(res, np.conj(kappa)).astype(np.complex64)

<h1> Setup Overlay </h1>

In [4]:
def setup_multcu():
    """Download the multi-compute-unit bitstream onto the worker's first device.

    Intended to be submitted to a Dask worker. Errors are reported through the
    return value rather than raised, so the caller always gets a printable
    status string.

    Returns
    -------
    str
        ``'setup succesful'`` on success, otherwise ``" error: <message>"``.
    """
    try:
        # pynq is imported here because it is only needed on the worker.
        from pynq import Device, Overlay

        # Use the shared path constant so this stays in sync with
        # version_1 / version_2, which load the same bitstream.
        # Only the download matters; the overlay handle is not needed afterwards.
        Overlay(XCLBIN_PATH_MULTCU, download=True, device=Device.devices[0])
        print("Overlay downloaded.")
    except Exception as e:
        return f" error: {e}"
    return 'setup succesful'

<h1> Version 1 function </h1>

In [5]:
def version_1(cu):
    """Run three timed dot-product + update-direction rounds on one compute unit.

    Meant to be executed on a Dask worker, one task per compute unit. The
    overlay is opened with ``download=False``, so the bitstream is expected to
    be on the device already.

    Parameters
    ----------
    cu : int
        Compute unit to use. ``1`` selects ``dotprod_1`` / ``update_1``; any
        other value selects ``dotprod_2`` / ``update_2``. The value also
        shifts the HBM banks used for the buffers by ``(cu - 1) * 3``.

    Returns
    -------
    str
        Summary with the per-iteration wall times and their sum.
    """
    import numpy as np
    import time
    from pynq import Overlay, Device, allocate

    ol = Overlay(XCLBIN_PATH_MULTCU, download=False, device=Device.devices[0])

    # Pick the kernel pair that belongs to this compute unit.
    kernel_id = 1 if cu == 1 else 2
    dotprod = getattr(ol, f"dotprod_{kernel_id}")
    update = getattr(ol, f"update_{kernel_id}")

    n_rows = 500
    n_cols = 250
    bank_shift = (cu - 1) * 3

    def hbm(index):
        # Memory bank attribute of the overlay, e.g. "HBM0", "HBM7".
        return getattr(ol, "HBM" + str(index + bank_shift))

    # Buffers for the dot-product kernel: inputs A, B and output C.
    A = allocate(shape=(n_rows, n_cols), dtype=np.complex64, target=hbm(0))
    B = allocate(shape=(n_cols,), dtype=np.float32, target=hbm(1))
    C = allocate(shape=(n_rows,), dtype=np.complex64, target=hbm(2))

    # Buffers for the update kernel: inputs D, E and output F.
    D = allocate(shape=(n_rows, n_cols), dtype=np.complex64, target=hbm(6))
    E = allocate(shape=(n_rows,), dtype=np.complex64, target=hbm(7))
    F = allocate(shape=(n_cols), dtype=np.complex64, target=hbm(8))

    elapsed = []
    for _ in range(3):
        # The timed span includes the host-side buffer initialisation.
        t0 = time.time()
        A, B = init_hw_dotProduct(A, B)
        E, D = init_hw_updateDirection(E, D)

        print(f'start dotprod_{cu}')
        A.sync_to_device()
        B.sync_to_device()
        dotprod.call(A, B, C)
        C.sync_from_device()
        print(f'end dotprod_{cu}')

        print(f'start update_{cu}')
        D.sync_to_device()
        E.sync_to_device()
        update.call(D, E, F)
        F.sync_from_device()
        print(f'end update_{cu}')

        elapsed.append(time.time() - t0)

    return f"cu {cu} iterations: {elapsed} total time: {np.sum(elapsed)}"
    

<h1> Version 2 function </h1>

In [None]:
def version_2():
    """Download the overlay once, then drive both compute units from two processes.

    The overlay is downloaded in the calling process. Two child processes are
    then started, one per compute unit; each opens the overlay with
    ``download=False`` and runs three timed dot-product + update rounds.

    A failing child reports an error string instead of a timing summary, and
    the parent stops waiting if a child exits without reporting anything, so
    the task cannot hang indefinitely on a crashed child.

    Returns
    -------
    str
        ``'TOTAL TIME: <seconds>, r1: <result cu 1>, r2:<result cu 2>'``.
    """
    from multiprocessing import Process, Queue
    from queue import Empty

    ROW = 500
    COL = 250

    resolution = ROW
    gridsize = COL
    from pynq import Overlay, allocate, Device, lib
    import numpy as np
    import time

    devices = Device.devices

    # import and download the overlay to the PL.
    ol = Overlay(XCLBIN_PATH_MULTCU, download=True, device=devices[0])

    def run_process(q, cu):
        """Child-process body: run three timed rounds on compute unit ``cu``.

        Exactly one string is put on ``q``: the timing summary on success or
        an error description on failure.
        """
        try:
            ol = Overlay(XCLBIN_PATH_MULTCU, download=False, device=devices[0])

            if cu == 1:
                # set up the kernel IP's
                dotprod = ol.dotprod_1
                update = ol.update_1
            else:
                dotprod = ol.dotprod_2
                update = ol.update_2

            # Allocate the buffers; each compute unit uses its own HBM banks.
            A = allocate(shape=(resolution,gridsize), dtype=np.complex64, target=getattr(ol,"HBM"+str(0 + (cu-1)*3)))
            B = allocate(shape=(gridsize,), dtype=np.float32, target=getattr(ol,"HBM"+str(1 + (cu-1)*3)))
            C = allocate(shape=(resolution,), dtype=np.complex64, target=getattr(ol,"HBM"+str(2 + (cu-1)*3)))

            D = allocate(shape=(resolution,gridsize), dtype=np.complex64,  target=getattr(ol,"HBM"+str(6 + (cu-1)*3)))
            E = allocate(shape=(resolution,),dtype=np.complex64,  target=getattr(ol,"HBM"+str(7 + (cu-1)*3)))
            F = allocate(shape=(gridsize), dtype=np.complex64, target=getattr(ol,"HBM"+str(8 + (cu-1)*3)))

            iteration_times = []
            for _ in range(3):
                start_i = time.time()
                A, B = init_hw_dotProduct(A, B)
                E, D = init_hw_updateDirection(E,D)

                print(f'start dotprod_{cu}')
                A.sync_to_device()
                B.sync_to_device()
                dotprod.call(A,B,C)
                C.sync_from_device()
                print(f'end dotprod_{cu}')

                print(f'start update_{cu}')
                D.sync_to_device()
                E.sync_to_device()
                update.call(D,E,F)
                F.sync_from_device()
                print(f'end update_{cu}')

                iteration_times.append(time.time()-start_i)

            q.put(f"cu {cu} iterations: {iteration_times} total time: {np.sum(iteration_times)}")
        except Exception as e:
            # Report the failure so the parent does not wait for a result
            # that will never arrive.
            q.put(f"cu {cu} error: {e}")

    def collect_result(proc, q):
        """Return the child's message, or a notice if it died without sending one."""
        while True:
            try:
                return q.get(timeout=1.0)
            except Empty:
                if not proc.is_alive():
                    # The child is gone; allow for a message that was still in flight.
                    try:
                        return q.get(timeout=1.0)
                    except Empty:
                        return f"process exited with code {proc.exitcode} without reporting a result"

    start = time.time()
    q1 = Queue()
    q2 = Queue()
    # NOTE(review): run_process is a nested function, so this presumably relies
    # on the 'fork' start method (the target cannot be pickled) -- confirm on
    # the worker platform.
    p1 = Process(target=run_process,args=(q1,1))
    p2 = Process(target=run_process,args=(q2,2))

    try:
        p1.start()
        p2.start()

        # Drain the queues before joining, as in the usual multiprocessing pattern.
        r1 = collect_result(p1, q1)
        r2 = collect_result(p2, q2)

        p1.join()
        p2.join()
    finally:
        # Release the overlay even when starting or collecting fails.
        ol.free()

    return  f'TOTAL TIME: {time.time()-start}, r1: {r1}, r2:{r2}'



<h1> Execute version two setup </h1>

In [None]:
import time

# Start the wall clock right before the task is handed to the cluster.
wall_time_s = time.time()

# version_2 is submitted as one task; it spawns the two per-compute-unit
# processes itself on the worker that receives it.
futures = client.submit(version_2)
res = client.gather([futures])

# Despite the "_no_dl" suffix this span contains the bitstream download,
# because version_2 opens the overlay with download=True inside the task.
wall_time_no_dl = time.time() - wall_time_s

print(f"Wall Time: {wall_time_no_dl}")
print(res)

<h1> Execute version one setup </h1> 

In [9]:
import time

# Reference point for the "including download" measurement.
dl_time = time.time()

# Optional bitstream download, one call per worker. While these lines stay
# commented out no download happens here, so the two wall times printed below
# are practically identical.
# f = client.submit(setup_multcu,workers="tcp://10.1.212.127:35789")
# print(client.gather([f]))

# f = client.submit(setup_multcu,workers="tcp://10.1.212.127:42299")
# print(client.gather([f]))

# Reference point for the compute-only measurement.
wall_time_s = time.time()

# One task per compute unit (cu=1 and cu=2).
futures = client.map(version_1,[1,2])
res = client.gather(futures)

wall_time_no_dl = time.time() - wall_time_s
wall_time_dl = time.time() - dl_time

print(f"Wall Time: {wall_time_no_dl}")
print(f"Wall time including download: {wall_time_dl}")
print(res[0])

print(res[1])

Wall Time: 2.9305472373962402
Wall time including download: 2.9306225776672363
cu 1 iterations: [0.9020271301269531, 0.9187393188476562, 0.917022705078125] total time: 2.7377891540527344
cu 2 iterations: [0.94364333152771, 0.9393250942230225, 0.9420070648193359] total time: 2.8249754905700684


distributed.client - ERROR - Failed to reconnect to scheduler after 30.00 seconds, closing client


<h1> Automatic setup of cluster </h1>

In [None]:
from dask.distributed import Client, progress, get_worker
from SSHCluster import OctoSSHCluster
import json

# Read the cluster description (scheduler host, worker hosts and the option
# dictionaries passed on below) from the JSON file next to the notebook.
with open("cluster_config.json") as f:
    config = json.load(f)

# The scheduler host is listed first, followed by the worker hosts.
cluster = OctoSSHCluster(
    hosts=[config["scheduler"], *config["hosts"]],
    connect_options=config["connect_options"],
    worker_options=config["worker_options"],
    worker_class=config["worker_class"],
    scheduler_options=config["scheduler_options"],
)



In [None]:
# Connect a client to the cluster created in the previous cell; this rebinds
# the notebook-level `client` name used by the execution cells.
client = Client(cluster)
client