In [1]:
%%bash
mkdir src
mkdir report
mkdir build
rm -rf sample_data/

# Naive Approach

## Without share memory

In [25]:
%%bash 

hipcc -O3 -std=c++17 src/naive-main.cpp -o build/naive-main --offload-arch=gfx90a
PARAMS=(
    "1024 1024 1024"
    "1024 1024 128"
    "512 2048 4096"
    "8192 8192 8192"
)

# Create output directory if it doesn't exist.
mkdir -p report

# Loop over each parameter set.
for params in "${PARAMS[@]}"; do
    # Parse M, N, K.
    set -- $params
    M=$1
    N=$2
    K=$3

    output="report/naive-main-${M}_${N}_${K}.txt"
    echo "Running configuration: M=$M, N=$N, K=$K" | tee $output
    # Run the executable with srun (adjust --time if needed)
    srun --time=1:00 ./build/naive-main -m $M -n $N -k $K >> $output 2>&1
done

   61 |   hipDeviceSynchronize();
      |   ^~~~~~~~~~~~~~~~~~~~
  154 |   hipMalloc((int **)&d_a, M * K * sizeof(int));
      |   ^~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  155 |   hipMalloc((int **)&d_b, K * N * sizeof(int));
      |   ^~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  156 |   hipMalloc((int **)&d_c, M * N* sizeof(int));
      |   ^~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  162 |   hipEventCreate(&start);
      |   ^~~~~~~~~~~~~~ ~~~~~~
  163 |   hipEventCreate(&stop);
      |   ^~~~~~~~~~~~~~ ~~~~~
  165 |   hipEventRecord( start, 0 );
      |   ^~~~~~~~~~~~~~  ~~~~~~~~
  169 |   hipEventRecord( stop, 0 );
      |   ^~~~~~~~~~~~~~  ~~~~~~~
  170 |   hipEventSynchronize( stop );
      |   ^~~~~~~~~~~~~~~~~~~  ~~~~
  172 |   hipEventElapsedTime( &time, start, stop );
      |   ^~~~~~~~~~~~~~~~~~~  ~~~~~~~~~~~~~~~~~~
  174 |   hipEventDestroy( start );
      |   ^~~~~~~~~~~~~~~  ~~~~~
  175 |   hipEventDestroy( stop );
      |   ^~~~~~~~~~~~~~~  ~~~~
   61 |   hipDevice

Running configuration: M=1024, N=1024, K=1024
Running configuration: M=1024, N=1024, K=128
Running configuration: M=512, N=2048, K=4096
Running configuration: M=8192, N=8192, K=8192
Error while terminating subprocess (pid=1557213): 


TypeError: %d format: a real number is required, not NoneType

## With share memory

In [19]:
%%bash 

hipcc -O3 -std=c++17 src/naive-share.cpp -o build/naive-share --offload-arch=gfx90a
srun --time=1:00 ./build/naive-share -m 10 -n 10 -k 10 > report/naive-share.log

   17 |     extern __shared__ T As[16][K];
      |                                ^
src/naive-share.cpp:17:32: note: function parameter 'K' with unknown value cannot be used in a constant expression
src/naive-share.cpp:13:79: note: declared here
   13 | __global__ void matmul_kernel(const T *a, const T *b, T *c, int M, int N, int K) {
      |                                                                               ^
src/naive-share.cpp:17:12: error: __shared__ variable 'As' cannot be 'extern'
   17 |     extern __shared__ T As[16][K];
      |            ^
/opt/rocm-6.3.3/lib/llvm/lib/clang/18/include/__clang_hip_runtime_wrapper.h:24:35: note: expanded from macro '__shared__'
   24 | #define __shared__ __attribute__((shared))
      |                                   ^
src/naive-share.cpp:17:25: error: variably modified type declaration cannot have 'extern' linkage
   17 |     extern __shared__ T As[16][K];
      |                         ^
   18 |     extern __shared__ T Bs[K][64]

In [2]:
%%bash 

hipcc -O3 -std=c++17 src/tiled-main.cpp -o build/tiled-main --offload-arch=gfx90a
PARAMS=(
    "1024 1024 1024"
    "1024 1024 128"
    "512 2048 4096"
    "8192 8192 8192"
)

# Create output directory if it doesn't exist.
mkdir -p report

# Loop over each parameter set.
for params in "${PARAMS[@]}"; do
    # Parse M, N, K.
    set -- $params
    M=$1
    N=$2
    K=$3

    output="report/tiled-main-${M}_${N}_${K}.txt"
    echo "Running configuration: M=$M, N=$N, K=$K" | tee $output
    # Run the executable with srun (adjust --time if needed)
    srun --time=1:00 ./build/tiled-main -m $M -n $N -k $K >> $output 2>&1
done

  100 |   hipDeviceSynchronize();
      |   ^~~~~~~~~~~~~~~~~~~~
  187 |   hipMalloc((float **)&d_a, M * K * sizeof(float));
      |   ^~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  188 |   hipMalloc((float **)&d_b, K * N * sizeof(float));
      |   ^~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  189 |   hipMalloc((float **)&d_c, M * N * sizeof(float));
      |   ^~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  195 |   hipEventCreate(&start);
      |   ^~~~~~~~~~~~~~ ~~~~~~
  196 |   hipEventCreate(&stop);
      |   ^~~~~~~~~~~~~~ ~~~~~
  198 |   hipEventRecord( start, 0 );
      |   ^~~~~~~~~~~~~~  ~~~~~~~~
  202 |   hipEventRecord( stop, 0 );
      |   ^~~~~~~~~~~~~~  ~~~~~~~
  203 |   hipEventSynchronize( stop );
      |   ^~~~~~~~~~~~~~~~~~~  ~~~~
  205 |   hipEventElapsedTime( &time, start, stop );
      |   ^~~~~~~~~~~~~~~~~~~  ~~~~~~~~~~~~~~~~~~
  207 |   hipEventDestroy( start );
      |   ^~~~~~~~~~~~~~~  ~~~~~
  208 |   hipEventDestroy( stop );
      |   ^~~~~~~~~~~~~~~

Running configuration: M=1024, N=1024, K=1024
Running configuration: M=1024, N=1024, K=128
Running configuration: M=512, N=2048, K=4096
Running configuration: M=8192, N=8192, K=8192


CalledProcessError: Command 'b'\nhipcc -O3 -std=c++17 src/tiled-main.cpp -o build/tiled-main --offload-arch=gfx90a\nPARAMS=(\n    "1024 1024 1024"\n    "1024 1024 128"\n    "512 2048 4096"\n    "8192 8192 8192"\n)\n\n# Create output directory if it doesn\'t exist.\nmkdir -p report\n\n# Loop over each parameter set.\nfor params in "${PARAMS[@]}"; do\n    # Parse M, N, K.\n    set -- $params\n    M=$1\n    N=$2\n    K=$3\n\n    output="report/tiled-main-${M}_${N}_${K}.txt"\n    echo "Running configuration: M=$M, N=$N, K=$K" | tee $output\n    # Run the executable with srun (adjust --time if needed)\n    srun --time=1:00 ./build/tiled-main -m $M -n $N -k $K >> $output 2>&1\ndone\n'' returned non-zero exit status 143.

In [3]:
%%bash 

hipcc -O3 -std=c++17 src/tiled-ilp-1D-main.cpp -o build/tiled-ilp-1D-main --offload-arch=gfx90a
PARAMS=(
    "1024 1024 1024"
    "1024 1024 128"
    "512 2048 4096"
    "8192 8192 8192"
)

# Create output directory if it doesn't exist.
mkdir -p report

# Loop over each parameter set.
for params in "${PARAMS[@]}"; do
    # Parse M, N, K.
    set -- $params
    M=$1
    N=$2
    K=$3

    output="report/tiled-ilp-1D-main-${M}_${N}_${K}.txt"
    echo "Running configuration: M=$M, N=$N, K=$K" | tee $output
    # Run the executable with srun (adjust --time if needed)
    srun --time=1:00 ./build/tiled-ilp-1D-main -m $M -n $N -k $K >> $output 2>&1
done

  108 |   hipDeviceSynchronize();
      |   ^~~~~~~~~~~~~~~~~~~~
  189 |   hipMalloc((float **)&d_a, M * K * sizeof(float));
      |   ^~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  190 |   hipMalloc((float **)&d_b, K * N * sizeof(float));
      |   ^~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  191 |   hipMalloc((float **)&d_c, M * N * sizeof(float));
      |   ^~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  197 |   hipEventCreate(&start);
      |   ^~~~~~~~~~~~~~ ~~~~~~
  198 |   hipEventCreate(&stop);
      |   ^~~~~~~~~~~~~~ ~~~~~
  200 |   hipEventRecord(start, 0);
      |   ^~~~~~~~~~~~~~ ~~~~~~~~
  206 |   hipEventRecord(stop, 0);
      |   ^~~~~~~~~~~~~~ ~~~~~~~
  207 |   hipEventSynchronize(stop);
      |   ^~~~~~~~~~~~~~~~~~~ ~~~~
  209 |   hipEventElapsedTime(&time, start, stop);
      |   ^~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~
  211 |   hipEventDestroy(start);
      |   ^~~~~~~~~~~~~~~ ~~~~~
  212 |   hipEventDestroy(stop);
      |   ^~~~~~~~~~~~~~~ ~~~~
  108 |   h

Running configuration: M=1024, N=1024, K=1024
Running configuration: M=1024, N=1024, K=128
Running configuration: M=512, N=2048, K=4096
Running configuration: M=8192, N=8192, K=8192


In [5]:
%%bash 

hipcc -O3 -std=c++17 src/tiled-ilp-2D-main.cpp -o build/tiled-ilp-2D-main --offload-arch=gfx90a
PARAMS=(
    "1024 1024 1024"
    "1024 1024 128"
    "512 2048 4096"
    "8192 8192 8192"
)

# Create output directory if it doesn't exist.
mkdir -p report

# Loop over each parameter set.
for params in "${PARAMS[@]}"; do
    # Parse M, N, K.
    set -- $params
    M=$1
    N=$2
    K=$3

    output="report/tiled-ilp-2D-main-${M}_${N}_${K}.txt"
    echo "Running configuration: M=$M, N=$N, K=$K" | tee $output
    # Run the executable with srun (adjust --time if needed)
    srun --time=1:00 ./build/tiled-ilp-2D-main -m $M -n $N -k $K >> $output 2>&1
done

  125 |   hipDeviceSynchronize();
      |   ^~~~~~~~~~~~~~~~~~~~
  212 |   hipMalloc((float **)&d_a, M * K * sizeof(float));
      |   ^~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  213 |   hipMalloc((float **)&d_b, K * N * sizeof(float));
      |   ^~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  214 |   hipMalloc((float **)&d_c, M * N * sizeof(float));
      |   ^~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  220 |   hipEventCreate(&start);
      |   ^~~~~~~~~~~~~~ ~~~~~~
  221 |   hipEventCreate(&stop);
      |   ^~~~~~~~~~~~~~ ~~~~~
  223 |   hipEventRecord( start, 0 );
      |   ^~~~~~~~~~~~~~  ~~~~~~~~
  227 |   hipEventRecord( stop, 0 );
      |   ^~~~~~~~~~~~~~  ~~~~~~~
  228 |   hipEventSynchronize( stop );
      |   ^~~~~~~~~~~~~~~~~~~  ~~~~
  230 |   hipEventElapsedTime( &time, start, stop );
      |   ^~~~~~~~~~~~~~~~~~~  ~~~~~~~~~~~~~~~~~~
  232 |   hipEventDestroy( start );
      |   ^~~~~~~~~~~~~~~  ~~~~~
  233 |   hipEventDestroy( stop );
      |   ^~~~~~~~~~~~~~~

Running configuration: M=1024, N=1024, K=1024
Running configuration: M=1024, N=1024, K=128
Running configuration: M=512, N=2048, K=4096
Error while terminating subprocess (pid=1696467): 


TypeError: %d format: a real number is required, not NoneType

Running configuration: M=8192, N=8192, K=8192


In [9]:
%%bash 

hipcc -O3 -std=c++17 src/vectorize.cpp -o build/vectorize --offload-arch=gfx90a
PARAMS=(
    "1024 1024 1024"
    "1024 1024 128"
    "512 2048 4096"
    "8192 8192 8192"
)

# Create output directory if it doesn't exist.
mkdir -p report

# Loop over each parameter set.
for params in "${PARAMS[@]}"; do
    # Parse M, N, K.
    set -- $params
    M=$1
    N=$2
    K=$3

    output="report/vectorize_${M}_${N}_${K}.txt"
    echo "Running configuration: M=$M, N=$N, K=$K" | tee $output
    # Run the executable with srun (adjust --time if needed)
    srun --time=1:00 ./build/vectorize -m $M -n $N -k $K >> $output 2>&1
done

  101 |   hipDeviceSynchronize();
      |   ^~~~~~~~~~~~~~~~~~~~
  186 |   hipMalloc((float **)&d_a, M * K * sizeof(float));
      |   ^~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  187 |   hipMalloc((float **)&d_b, K * N * sizeof(float));
      |   ^~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  188 |   hipMalloc((float **)&d_c, M * N * sizeof(float));
      |   ^~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  194 |   hipEventCreate(&start);
      |   ^~~~~~~~~~~~~~ ~~~~~~
  195 |   hipEventCreate(&stop);
      |   ^~~~~~~~~~~~~~ ~~~~~
  197 |   hipEventRecord(start, 0);
      |   ^~~~~~~~~~~~~~ ~~~~~~~~
  202 |   hipEventRecord(stop, 0);
      |   ^~~~~~~~~~~~~~ ~~~~~~~
  203 |   hipEventSynchronize(stop);
      |   ^~~~~~~~~~~~~~~~~~~ ~~~~
  204 |   hipEventElapsedTime(&time, start, stop);
      |   ^~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~
  206 |   hipEventDestroy(start);
      |   ^~~~~~~~~~~~~~~ ~~~~~
  207 |   hipEventDestroy(stop);
      |   ^~~~~~~~~~~~~~~ ~~~~
  101 |   h

Running configuration: M=1024, N=1024, K=1024
Running configuration: M=1024, N=1024, K=128
Running configuration: M=512, N=2048, K=4096
Running configuration: M=8192, N=8192, K=8192
Error while terminating subprocess (pid=1824945): 


TypeError: %d format: a real number is required, not NoneType

In [7]:
%%bash 

hipcc -O3 -std=c++17 src/warp-tiling-main.cpp -o build/warp-tiling-main --offload-arch=gfx90a
PARAMS=(
    "1024 1024 1024"
    "1024 1024 128"
    "512 2048 4096"
    "8192 8192 8192"
)

# Create output directory if it doesn't exist.
mkdir -p report

# Loop over each parameter set.
for params in "${PARAMS[@]}"; do
    # Parse M, N, K.
    set -- $params
    M=$1
    N=$2
    K=$3

    output="report/warp-tiling-main_${M}_${N}_${K}.txt"
    echo "Running configuration: M=$M, N=$N, K=$K" | tee $output
    # Run the executable with srun (adjust --time if needed)
    srun --time=1:00 ./build/warp-tiling-main -m $M -n $N -k $K >> $output 2>&1
done

  199 |   hipDeviceSynchronize();
      |   ^~~~~~~~~~~~~~~~~~~~
  286 |   hipMalloc((float **)&d_a, M * K * sizeof(float));
      |   ^~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  287 |   hipMalloc((float **)&d_b, K * N * sizeof(float));
      |   ^~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  288 |   hipMalloc((float **)&d_c, M * N * sizeof(float));
      |   ^~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  294 |   hipEventCreate(&start);
      |   ^~~~~~~~~~~~~~ ~~~~~~
  295 |   hipEventCreate(&stop);
      |   ^~~~~~~~~~~~~~ ~~~~~
  297 |   hipEventRecord( start, 0 );
      |   ^~~~~~~~~~~~~~  ~~~~~~~~
  302 |   hipEventRecord( stop, 0 );
      |   ^~~~~~~~~~~~~~  ~~~~~~~
  303 |   hipEventSynchronize( stop );
      |   ^~~~~~~~~~~~~~~~~~~  ~~~~
  305 |   hipEventElapsedTime( &time, start, stop );
      |   ^~~~~~~~~~~~~~~~~~~  ~~~~~~~~~~~~~~~~~~
  307 |   hipEventDestroy( start );
      |   ^~~~~~~~~~~~~~~  ~~~~~
  308 |   hipEventDestroy( stop );
      |   ^~~~~~~~~~~~~~~

Running configuration: M=1024, N=1024, K=1024
Running configuration: M=1024, N=1024, K=128
Running configuration: M=512, N=2048, K=4096
Running configuration: M=8192, N=8192, K=8192


CalledProcessError: Command 'b'\nhipcc -O3 -std=c++17 src/warp-tiling-main.cpp -o build/warp-tiling-main --offload-arch=gfx90a\nPARAMS=(\n    "1024 1024 1024"\n    "1024 1024 128"\n    "512 2048 4096"\n    "8192 8192 8192"\n)\n\n# Create output directory if it doesn\'t exist.\nmkdir -p report\n\n# Loop over each parameter set.\nfor params in "${PARAMS[@]}"; do\n    # Parse M, N, K.\n    set -- $params\n    M=$1\n    N=$2\n    K=$3\n\n    output="report/warp-tiling-main_${M}_${N}_${K}.txt"\n    echo "Running configuration: M=$M, N=$N, K=$K" | tee $output\n    # Run the executable with srun (adjust --time if needed)\n    srun --time=1:00 ./build/warp-tiling-main -m $M -n $N -k $K >> $output 2>&1\ndone\n'' returned non-zero exit status 143.

In [11]:
%%bash 

hipcc -O3 -std=c++17 src/best-gemm.cpp -o build/best-gemm --offload-arch=gfx90a
PARAMS=(
    "1024 1024 1024"
    "1024 1024 128"
    "512 2048 4096"
    "8192 8192 8192"
)

# Create output directory if it doesn't exist.
mkdir -p report

# Loop over each parameter set.
for params in "${PARAMS[@]}"; do
    # Parse M, N, K.
    set -- $params
    M=$1
    N=$2
    K=$3

    output="report/best-gemm_${M}_${N}_${K}.txt"
    echo "Running configuration: M=$M, N=$N, K=$K" | tee $output
    # Run the executable with srun (adjust --time if needed)
    srun --time=1:00 ./build/best-gemm -m $M -n $N -k $K >> $output 2>&1
done

  112 |   hipDeviceSynchronize();
      |   ^~~~~~~~~~~~~~~~~~~~
  197 |   hipMalloc((float **)&d_a, M * K * sizeof(float));
      |   ^~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  198 |   hipMalloc((float **)&d_b, K * N * sizeof(float));
      |   ^~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  199 |   hipMalloc((float **)&d_c, M * N * sizeof(float));
      |   ^~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  205 |   hipEventCreate(&start);
      |   ^~~~~~~~~~~~~~ ~~~~~~
  206 |   hipEventCreate(&stop);
      |   ^~~~~~~~~~~~~~ ~~~~~
  208 |   hipEventRecord(start, 0);
      |   ^~~~~~~~~~~~~~ ~~~~~~~~
  213 |   hipEventRecord(stop, 0);
      |   ^~~~~~~~~~~~~~ ~~~~~~~
  214 |   hipEventSynchronize(stop);
      |   ^~~~~~~~~~~~~~~~~~~ ~~~~
  215 |   hipEventElapsedTime(&time, start, stop);
      |   ^~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~
  217 |   hipEventDestroy(start);
      |   ^~~~~~~~~~~~~~~ ~~~~~
  218 |   hipEventDestroy(stop);
      |   ^~~~~~~~~~~~~~~ ~~~~
  112 |   h

Running configuration: M=1024, N=1024, K=1024
Running configuration: M=1024, N=1024, K=128
Running configuration: M=512, N=2048, K=4096
Running configuration: M=8192, N=8192, K=8192
