# Scalar Multiplication 1

In this notebook, we will explore how the **`DefaultSharedMemorySync` pass** inserts `__syncthreads()` tasklets. We will specifically observe its behavior when **reusing shared memory** during a scalar multiplication. Our example involves multiplying a scalar by a long vector; we will import a consecutive subset of the vector into shared memory, perform the multiplication, and then restore it.
Only one thread block is used, and it processes each consecutive chunk using a **sequential map**. **Scalar Multiplication 2** does the same but uses a **for loop** instead.

In [1]:
# needed modules, nothing interesting :)
import dace
from IPython.display import Code
from dace.transformation import pass_pipeline
from dace.transformation.auto import auto_optimize
from dace.transformation.passes.shared_memory_synchronization import DefaultSharedMemorySync

### Inspiration

Here are some example SDFGs using the Python frontend that perform scalar multiplication. These served as inspiration to implement the same operation—this time using shared memory instead of a temporary local variable.

Why not use shared memory in the Python frontend? Because we want more control over the program and prefer to focus on the concept itself, rather than the capabilities provided by the Python frontend.

Note that we have several similar examples. They differ in where the sequential map is placed within the nested map. A sequential map **outside** the kernel (i.e., outside GPU schedules) does **not** require synchronization after the sequential iteration, as we simply launch the kernel again and do not reuse shared memory.


In [2]:
# Variant 1: the Sequential map over k is the OUTERMOST scope, so it encloses the
# GPU_Device map. A is a 128-element int32 array in GPU global memory and is
# updated in place: every element is multiplied by `scalar`.
@dace.program
def scalarMultiplication1(A: dace.int32[128] @ dace.dtypes.StorageType.GPU_Global, scalar: dace.int32):
    # k selects one of the 4 consecutive chunks of 32 elements.
    for k in dace.map[0:4] @ dace.dtypes.ScheduleType.Sequential:
        # Range 0:32 with step 32 -> a single iteration (i = 0).
        for i in dace.map[0:32:32] @ dace.dtypes.ScheduleType.GPU_Device:
            # j is the element offset inside the current chunk (0..31).
            for j in dace.map[0:32] @ dace.dtypes.ScheduleType.GPU_ThreadBlock:
                # Read into a temporary, then write the scaled value back to the same slot.
                tmp = A[k * 32 + j]
                A[k * 32 + j] = scalar * tmp

# Variant 2: the Sequential map over k sits BETWEEN the GPU_Device map and the
# GPU_ThreadBlock map. A is a 128-element int32 array in GPU global memory and is
# updated in place: every element is multiplied by `scalar`.
@dace.program
def scalarMultiplication2(A: dace.int32[128] @ dace.dtypes.StorageType.GPU_Global, scalar: dace.int32):
    # Range 0:32 with step 32 -> a single iteration (i = 0).
    for i in dace.map[0:32:32] @ dace.dtypes.ScheduleType.GPU_Device:
        # k selects one of the 4 consecutive chunks of 32 elements.
        for k in dace.map[0:4] @ dace.dtypes.ScheduleType.Sequential:
            # j is the element offset inside the current chunk (0..31).
            for j in dace.map[0:32] @ dace.dtypes.ScheduleType.GPU_ThreadBlock:
                # Read into a temporary, then write the scaled value back to the same slot.
                tmp = A[k * 32 + j]
                A[k * 32 + j] = scalar * tmp

# Variant 3: the Sequential map over k is the INNERMOST scope, nested inside the
# GPU_ThreadBlock map. A is a 128-element int32 array in GPU global memory and is
# updated in place: every element is multiplied by `scalar`.
@dace.program
def scalarMultiplication3(A: dace.int32[128] @ dace.dtypes.StorageType.GPU_Global, scalar: dace.int32):
    # Range 0:32 with step 32 -> a single iteration (i = 0).
    for i in dace.map[0:32:32] @ dace.dtypes.ScheduleType.GPU_Device:
        # j is the element offset inside a chunk (0..31).
        for j in dace.map[0:32] @ dace.dtypes.ScheduleType.GPU_ThreadBlock:
            # For a fixed j, k walks the 4 chunks, i.e. elements j, j+32, j+64, j+96.
            for k in dace.map[0:4] @ dace.dtypes.ScheduleType.Sequential:
                # Read into a temporary, then write the scaled value back to the same slot.
                tmp = A[k * 32 + j]
                A[k * 32 + j] = scalar * tmp


# Choose the SDFG you want to inspect below: swap in scalarMultiplication1 or
# scalarMultiplication2 to look at the other map nestings.
sdfg_inspiration = scalarMultiplication3.to_sdfg()
# Bare expression as the last statement of the cell, so the notebook displays the SDFG.
sdfg_inspiration

Tip: collapse the functions below and focus on one at a time. They are quite similar; the only difference is where the sequential map occurs.
Select one and then observe whether the post-synchronization is inserted when it is required and omitted when it is unnecessary.

In [None]:
# Expectation for this variant: NO post-synchronization, because the sequential map is OUTSIDE of the kernel.
def scalarMultiplication1_smem():
    """Hand-build the scalar-multiplication SDFG with the sequential map outermost.

    Scope nesting, outer to inner: Sequential ``k`` (0:4) -> GPU_Device ``i``
    (0:32:32) -> GPU_ThreadBlock ``j`` (0:32). Inside the innermost scope the
    product ``A[j + 32*k] * scalar`` is written to the shared-memory transient
    ``S[j]`` and then copied from ``S[j]`` back to ``A[j + 32*k]``.

    Returns:
        The constructed ``dace.SDFG`` (one state, ``"main"``), with scope
        connectors filled in.
    """
    store = dace.dtypes.StorageType
    sched = dace.dtypes.ScheduleType
    cpp = dace.dtypes.Language.CPP

    graph = dace.SDFG("scalarMultiplication1_smem")
    main = graph.add_state("main")

    # Data containers: global vector A, the scalar factor, and the shared-memory buffer S.
    graph.add_array("A", (128,), dace.uint32, storage=store.GPU_Global)
    graph.add_scalar("scalar", dace.uint32)
    graph.add_array(
        "S",
        (32,),
        dace.uint32,
        storage=store.GPU_Shared,
        transient=True,
        lifetime=dace.dtypes.AllocationLifetime.Scope,
    )

    # Access nodes (A gets separate read and write nodes).
    a_in = main.add_read("A")
    a_out = main.add_write("A")
    factor = main.add_access("scalar")
    shared = main.add_access("S")

    # Map scopes; they are nested purely through the edges added further down.
    k_entry, k_exit = main.add_map("seq_map", {"k": "0:4"}, schedule=sched.Sequential)
    dev_entry, dev_exit = main.add_map("gpu_map", {"i": "0:32:32"}, schedule=sched.GPU_Device)
    blk_entry, blk_exit = main.add_map("tb", {"j": "0:32"}, schedule=sched.GPU_ThreadBlock)

    # Tasklet 1 multiplies one element by the scalar (result goes to S).
    multiply = main.add_tasklet(
        "addMult",
        inputs={"__inp_A", "__inp_scalar"},
        outputs={"__out"},
        code="__out = __inp_A * __inp_scalar;",
        language=cpp,
    )
    # Tasklet 2 copies the value from S back towards A.
    write_back = main.add_tasklet(
        "store_to_global",
        inputs={"__inp"},
        outputs={"__out"},
        code="__out = __inp;",
        language=cpp,
    )

    # Memlet expressions used by the edges below.
    whole = "A[0:128]"
    chunk = "A[32 * k: 32 * (k+1)]"
    elem = "A[j + 32* k]"
    one = "scalar[0]"
    slot = "S[j]"

    def link(src, src_conn, dst, dst_conn, expr):
        # Every edge gets its own freshly created Memlet object.
        main.add_edge(src, src_conn, dst, dst_conn, dace.Memlet(expr))

    # Inputs enter the outermost (sequential) scope ...
    link(a_in, None, k_entry, None, whole)
    link(factor, None, k_entry, None, one)

    # ... and are forwarded scope by scope down to the thread-block map.
    for outer, inner in ((k_entry, dev_entry), (dev_entry, blk_entry)):
        link(outer, None, inner, None, chunk)
        link(outer, None, inner, None, one)

    # Innermost scope: A, scalar -> multiply -> S -> write_back.
    link(blk_entry, None, multiply, "__inp_A", elem)
    link(blk_entry, None, multiply, "__inp_scalar", one)
    link(multiply, "__out", shared, None, slot)
    link(shared, None, write_back, "__inp", slot)

    # Results leave through every map exit and end in the write node of A.
    link(write_back, "__out", blk_exit, None, elem)
    link(blk_exit, None, dev_exit, None, chunk)
    link(dev_exit, None, k_exit, None, chunk)
    link(k_exit, None, a_out, None, whole)

    # Scope edges were added with connector None; let DaCe create the connectors.
    graph.fill_scope_connectors()
    return graph


# Expectation for this variant: post-synchronization IS inserted.
def scalarMultiplication2_smem():
    """Hand-build the scalar-multiplication SDFG with the sequential map in the middle.

    Scope nesting, outer to inner: GPU_Device ``i`` (0:32:32) -> Sequential
    ``k`` (0:4) -> GPU_ThreadBlock ``j`` (0:32). Inside the innermost scope the
    product ``A[j + 32*k] * scalar`` is written to the shared-memory transient
    ``S[j]`` and then copied from ``S[j]`` back to ``A[j + 32*k]``.

    Returns:
        The constructed ``dace.SDFG`` (one state, ``"main"``), with scope
        connectors filled in.
    """
    store = dace.dtypes.StorageType
    sched = dace.dtypes.ScheduleType
    cpp = dace.dtypes.Language.CPP

    graph = dace.SDFG("scalarMultiplication2_smem")
    main = graph.add_state("main")

    # Data containers: global vector A, the scalar factor, and the shared-memory buffer S.
    graph.add_array("A", (128,), dace.uint32, storage=store.GPU_Global)
    graph.add_scalar("scalar", dace.uint32)
    graph.add_array(
        "S",
        (32,),
        dace.uint32,
        storage=store.GPU_Shared,
        transient=True,
        lifetime=dace.dtypes.AllocationLifetime.Scope,
    )

    # Access nodes (A gets separate read and write nodes).
    a_in = main.add_read("A")
    a_out = main.add_write("A")
    factor = main.add_access("scalar")
    shared = main.add_access("S")

    # Map scopes; they are nested purely through the edges added further down.
    # In this variant the sequential map ends up between the device and thread-block maps.
    k_entry, k_exit = main.add_map("seq_map", {"k": "0:4"}, schedule=sched.Sequential)
    dev_entry, dev_exit = main.add_map("gpu_map", {"i": "0:32:32"}, schedule=sched.GPU_Device)
    blk_entry, blk_exit = main.add_map("tb", {"j": "0:32"}, schedule=sched.GPU_ThreadBlock)

    # Tasklet 1 multiplies one element by the scalar (result goes to S).
    multiply = main.add_tasklet(
        "addMult",
        inputs={"__inp_A", "__inp_scalar"},
        outputs={"__out"},
        code="__out = __inp_A * __inp_scalar;",
        language=cpp,
    )
    # Tasklet 2 copies the value from S back towards A.
    write_back = main.add_tasklet(
        "store_to_global",
        inputs={"__inp"},
        outputs={"__out"},
        code="__out = __inp;",
        language=cpp,
    )

    # Memlet expressions used by the edges below.
    whole = "A[0:128]"
    chunk = "A[32 * k: 32 * (k+1)]"
    elem = "A[j + 32* k]"
    one = "scalar[0]"
    slot = "S[j]"

    def link(src, src_conn, dst, dst_conn, expr):
        # Every edge gets its own freshly created Memlet object.
        main.add_edge(src, src_conn, dst, dst_conn, dace.Memlet(expr))

    # Inputs enter the outermost (GPU device) scope ...
    link(a_in, None, dev_entry, None, whole)
    link(factor, None, dev_entry, None, one)

    # ... pass through the sequential map with the full range of A ...
    link(dev_entry, None, k_entry, None, whole)
    link(dev_entry, None, k_entry, None, one)

    # ... and reach the thread-block map narrowed to the k-th chunk.
    link(k_entry, None, blk_entry, None, chunk)
    link(k_entry, None, blk_entry, None, one)

    # Innermost scope: A, scalar -> multiply -> S -> write_back.
    link(blk_entry, None, multiply, "__inp_A", elem)
    link(blk_entry, None, multiply, "__inp_scalar", one)
    link(multiply, "__out", shared, None, slot)
    link(shared, None, write_back, "__inp", slot)

    # Results leave through every map exit and end in the write node of A.
    link(write_back, "__out", blk_exit, None, elem)
    link(blk_exit, None, k_exit, None, chunk)
    link(k_exit, None, dev_exit, None, whole)
    link(dev_exit, None, a_out, None, whole)

    # Scope edges were added with connector None; let DaCe create the connectors.
    graph.fill_scope_connectors()
    return graph


# Expectation for this variant (same as variant 2): post-synchronization IS inserted.
def scalarMultiplication3_smem():
    """Hand-build the scalar-multiplication SDFG with the sequential map innermost.

    Scope nesting, outer to inner: GPU_Device ``i`` (0:32:32) ->
    GPU_ThreadBlock ``j`` (0:32) -> Sequential ``k`` (0:4). Inside the
    innermost scope the product ``A[j + 32*k] * scalar`` is written to the
    shared-memory transient ``S[j]`` and then copied from ``S[j]`` back to
    ``A[j + 32*k]``.

    Returns:
        The constructed ``dace.SDFG`` (one state, ``"main"``), with scope
        connectors filled in.
    """
    store = dace.dtypes.StorageType
    sched = dace.dtypes.ScheduleType
    cpp = dace.dtypes.Language.CPP

    graph = dace.SDFG("scalarMultiplication3_smem")
    main = graph.add_state("main")

    # Data containers: global vector A, the scalar factor, and the shared-memory buffer S.
    graph.add_array("A", (128,), dace.uint32, storage=store.GPU_Global)
    graph.add_scalar("scalar", dace.uint32)
    graph.add_array(
        "S",
        (32,),
        dace.uint32,
        storage=store.GPU_Shared,
        transient=True,
        lifetime=dace.dtypes.AllocationLifetime.Scope,
    )

    # Access nodes (A gets separate read and write nodes).
    a_in = main.add_read("A")
    a_out = main.add_write("A")
    factor = main.add_access("scalar")
    shared = main.add_access("S")

    # Map scopes; they are nested purely through the edges added further down.
    # In this variant the sequential map ends up innermost, inside the thread-block map.
    k_entry, k_exit = main.add_map("seq_map", {"k": "0:4"}, schedule=sched.Sequential)
    dev_entry, dev_exit = main.add_map("gpu_map", {"i": "0:32:32"}, schedule=sched.GPU_Device)
    blk_entry, blk_exit = main.add_map("tb", {"j": "0:32"}, schedule=sched.GPU_ThreadBlock)

    # Tasklet 1 multiplies one element by the scalar (result goes to S).
    multiply = main.add_tasklet(
        "addMult",
        inputs={"__inp_A", "__inp_scalar"},
        outputs={"__out"},
        code="__out = __inp_A * __inp_scalar;",
        language=cpp,
    )
    # Tasklet 2 copies the value from S back towards A.
    write_back = main.add_tasklet(
        "store_to_global",
        inputs={"__inp"},
        outputs={"__out"},
        code="__out = __inp;",
        language=cpp,
    )

    # Memlet expressions used by the edges below.
    whole = "A[0:128]"
    # NOTE(review): the original author marked this range as "weird, but it is like
    # this in the inspiration". It spans A[j..j+3], whereas the tasklets touch the
    # strided elements A[j + 32*k] for k in 0..3 -- confirm whether this matters.
    per_thread = "A[j: j + 4]"
    elem = "A[j + 32* k]"
    one = "scalar[0]"
    slot = "S[j]"

    def link(src, src_conn, dst, dst_conn, expr):
        # Every edge gets its own freshly created Memlet object.
        main.add_edge(src, src_conn, dst, dst_conn, dace.Memlet(expr))

    # Inputs enter the outermost (GPU device) scope ...
    link(a_in, None, dev_entry, None, whole)
    link(factor, None, dev_entry, None, one)

    # ... pass through the thread-block map with the full range of A ...
    link(dev_entry, None, blk_entry, None, whole)
    link(dev_entry, None, blk_entry, None, one)

    # ... and reach the innermost sequential map.
    link(blk_entry, None, k_entry, None, per_thread)
    link(blk_entry, None, k_entry, None, one)

    # Innermost scope: A, scalar -> multiply -> S -> write_back.
    link(k_entry, None, multiply, "__inp_A", elem)
    link(k_entry, None, multiply, "__inp_scalar", one)
    link(multiply, "__out", shared, None, slot)
    link(shared, None, write_back, "__inp", slot)

    # Results leave through every map exit and end in the write node of A.
    link(write_back, "__out", k_exit, None, elem)
    link(k_exit, None, blk_exit, None, per_thread)
    link(blk_exit, None, dev_exit, None, whole)
    link(dev_exit, None, a_out, None, whole)

    # Scope edges were added with connector None; let DaCe create the connectors.
    graph.fill_scope_connectors()
    return graph


# Choose which of the three hand-built versions (scalarMultiplication1_smem,
# scalarMultiplication2_smem or scalarMultiplication3_smem) the pass is applied to.
sdfg = scalarMultiplication2_smem()
# Bare expression as the last statement of the cell, so the notebook displays the SDFG
# as it looks BEFORE the synchronization pass runs.
sdfg

Observe how the pass inserts the synchronization barriers correctly:

In [4]:
# Insert synchronization barriers by running the pass on the SDFG selected above.
# The second argument is passed as None (presumably the results of earlier
# pipeline passes, of which there are none here -- confirm against the pass API).
DefaultSharedMemorySync().apply_pass(sdfg, None)
# Display the SDFG again so the effect of the pass can be inspected.
sdfg