In [1]:
] activate ../

[32m[1mActivating[22m[39m environment at `C:\Users\carsten\Desktop\Oulu2020\Project.toml`


## Problem

Optimize the following function.

In [2]:
"""
    work(mat, s, v, N)

Baseline kernel that the rest of the notebook sets out to optimize.

For every `i, j in 1:N`, store `s[i, j] * (sin(x)^2 - cos(x)^2)` with
`x = mod(v[i], 256)` into `mat[i, j]`. `mat` is modified in place; the function returns
`nothing`.
"""
function work(mat, s, v, N)
    # `row` is the outer loop and `col` the inner one, exactly as in the nested original;
    # the trigonometric factor is intentionally recomputed for every element.
    for row in 1:N, col in 1:N
        phase = mod(v[row], 256)
        mat[row, col] = s[row, col] * (sin(phase) * sin(phase) - cos(phase) * cos(phase))
    end
    return nothing
end

work (generic function with 1 method)

In [3]:
using Test
# Spot-check, at one random point, the trigonometric identities used to simplify the
# expression in `work`:  sin(x)^2 - cos(x)^2 == 1 - 2cos(x)^2 == -cos(2x).
x = rand()
@test 1-2*cos(x)*cos(x) ≈ sin(x)*sin(x)-cos(x)*cos(x)
@test -cos(2*x) ≈ sin(x)*sin(x)-cos(x)*cos(x)

[32m[1mTest Passed[22m[39m

In [4]:
# pulling out + analytical opt
"""
    opt1(mat, s, v, N)

Store `s[i, j] * -cos(2 * mod(v[i], 256))` into `mat[i, j]` for all `i, j in 1:N` and
return `mat`.

Relative to `work`, the factor that depends only on `i` is computed once per outer
iteration, and `sin(x)^2 - cos(x)^2` is replaced by the equivalent `-cos(2x)`. The loop
order (`i` outer, `j` inner) is the same as in `work`.

# Arguments
- `mat`: output matrix, overwritten in place for indices `1:N × 1:N`.
- `s`: input matrix, read at the same indices.
- `v`: input vector, read at indices `1:N`.
- `N`: extent of both loops.

# Throws
- `BoundsError` if `1:N` is not a valid index range for `v` or for either dimension of
  `mat` or `s`.
"""
function opt1(mat, s, v, N)
    # The loops run under `@inbounds`, so validate the index ranges once up front;
    # otherwise an oversized `N` would silently access memory outside the arrays.
    checkbounds(mat, 1:N, 1:N)
    checkbounds(s, 1:N, 1:N)
    checkbounds(v, 1:N)

    @inbounds for i in 1:N
        factor = -cos(2 * mod(v[i], 256))
        for j in 1:N
            mat[i, j] = s[i, j] * factor
        end
    end
    return mat
end

opt1 (generic function with 1 method)

In [5]:
# pulling out + analytical opt + allocate values
"""
    opt12(mat, s, v, N)

Same result as `opt1`, but the per-row factors `-cos(2 * mod(v[i], 256))` are first
collected in a temporary `Float64` array and then applied in a second pass. Returns `mat`.

# Arguments
- `mat`: output matrix, overwritten in place for indices `1:N × 1:N`.
- `s`: input matrix, read at the same indices.
- `v`: input vector; a factor is computed for every element of `v`.
- `N`: extent of both loops.

# Throws
- `BoundsError` if `1:N` is not a valid index range for `v` or for either dimension of
  `mat` or `s`.
"""
function opt12(mat, s, v, N)
    # Validate once: the fill loop below runs under `@inbounds`.
    checkbounds(mat, 1:N, 1:N)
    checkbounds(s, 1:N, 1:N)
    checkbounds(v, 1:N)

    # `similar` gives `val` the same indices as `v`, so `val[i]` is valid wherever `v[i]`
    # is. The fused in-place broadcast fills it in a single pass without temporaries.
    val = similar(v, Float64)
    @. val = -cos(2 * mod(v, 256))

    @inbounds for i in 1:N
        for j in 1:N
            mat[i, j] = s[i, j] * val[i]
        end
    end
    return mat
end

opt12 (generic function with 1 method)

In [6]:
# reordering loops
"""
    opt2(mat, s, v, N)

Store `s[i, j] * -cos(2 * mod(v[i], 256))` into `mat[i, j]` for all `i, j in 1:N`, with
the loops ordered so that the first index `i` varies fastest (`j` outer, `i` inner).
`mat` is modified in place; the function returns `nothing`.

The factor depends only on `i` but is recomputed for every element here; `opt22`
precomputes it instead.

# Throws
- `BoundsError` if `1:N` is not a valid index range for `v` or for either dimension of
  `mat` or `s`.
"""
function opt2(mat, s, v, N)
    # The loops run under `@inbounds`, so validate the index ranges once up front;
    # otherwise an oversized `N` would silently access memory outside the arrays.
    checkbounds(mat, 1:N, 1:N)
    checkbounds(s, 1:N, 1:N)
    checkbounds(v, 1:N)

    @inbounds for j in 1:N
        for i in 1:N
            factor = -cos(2 * mod(v[i], 256))
            mat[i, j] = s[i, j] * factor
        end
    end
    return nothing
end

opt2 (generic function with 1 method)

In [7]:
# reordering loops + allocate values
"""
    opt22(mat, s, v, N)

Combine both optimizations: the factors `-cos(2 * mod(v[i], 256))` are precomputed into a
temporary `Float64` array, and the main loops run with `j` outer and `i` inner so that the
first index varies fastest. `mat` is modified in place; the function returns `nothing`.

# Arguments
- `mat`: output matrix, overwritten in place for indices `1:N × 1:N`.
- `s`: input matrix, read at the same indices.
- `v`: input vector; a factor is computed for every element of `v`.
- `N`: extent of both loops.

# Throws
- `BoundsError` if `1:N` is not a valid index range for `v` or for either dimension of
  `mat` or `s`.
"""
function opt22(mat, s, v, N)
    # Validate once: the fill loop below runs under `@inbounds`.
    checkbounds(mat, 1:N, 1:N)
    checkbounds(s, 1:N, 1:N)
    checkbounds(v, 1:N)

    # `similar` gives `val` the same indices as `v`, so `val[i]` is valid wherever `v[i]`
    # is. The fused in-place broadcast fills it in a single pass without temporaries.
    val = similar(v, Float64)
    @. val = -cos(2 * mod(v, 256))

    @inbounds for j in 1:N
        for i in 1:N
            mat[i, j] = s[i, j] * val[i]
        end
    end
    return nothing
end

opt22 (generic function with 1 method)

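Further possible optimizations: loop blocking (tiling), tabulating the values of `val` in a lookup table (for integer `v`, `mod(v[i], 256)` takes at most 256 distinct values), ...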

## Single thread benchmarks

In [8]:
using BenchmarkTools
# Inputs shared by the benchmark cells that follow.
# Problem size: the kernels loop over 1:N in both dimensions.
N = 4000
# Output matrix that each kernel overwrites.
mat = zeros(N,N)
# Input matrix of random Float64 values.
s = rand(N,N)
# Random Int values; the kernels reduce them with mod(v[i], 256).
v = rand(Int, N);

In [9]:
# Time the baseline `work` kernel with BenchmarkTools' @belapsed (elapsed time in seconds;
# arguments are `$`-interpolated so global-variable access is not part of the measurement).
runtime = @belapsed work($mat, $s, $v, $N);
# N*N inner iterations per call, scaled by 1e-6 to millions of iterations per second.
perf = N*N*1e-6/runtime # MIt/s
println("Performance: $perf MIt/s")

Performance: 18.881034356306554 MIt/s


In [10]:
# Time `opt1` (hoisted factor + analytical simplification) on the same inputs.
runtime = @belapsed opt1($mat, $s, $v, $N);
# N*N inner iterations per call, scaled by 1e-6 to millions of iterations per second.
perf = N*N*1e-6/runtime # MIt/s
println("Performance: $perf MIt/s")

Performance: 78.34772483103824 MIt/s


In [11]:
# Time `opt12` (as `opt1`, with the factors precomputed into a temporary vector).
runtime = @belapsed opt12($mat, $s, $v, $N);
# N*N inner iterations per call, scaled by 1e-6 to millions of iterations per second.
perf = N*N*1e-6/runtime # MIt/s
println("Performance: $perf MIt/s")

Performance: 72.13995511993042 MIt/s


In [12]:
# Time `opt2` (reordered loops, factor still recomputed per element).
runtime = @belapsed opt2($mat, $s, $v, $N);
# N*N inner iterations per call, scaled by 1e-6 to millions of iterations per second.
perf = N*N*1e-6/runtime # MIt/s
println("Performance: $perf MIt/s")

Performance: 56.743544712494646 MIt/s


In [13]:
# Time `opt22` (reordered loops + precomputed factors).
runtime = @belapsed opt22($mat, $s, $v, $N);
# N*N inner iterations per call, scaled by 1e-6 to millions of iterations per second.
perf = N*N*1e-6/runtime # MIt/s
println("Performance: $perf MIt/s")

Performance: 1009.6803099718552 MIt/s


## Multi-threading

In [14]:
using Hwloc
# Ask Hwloc for the number of physical CPU cores on this machine.
Hwloc.num_physical_cores()

4

In [15]:
# Number of threads available to this Julia session.
Base.Threads.nthreads()

4

In [16]:
"""
    opt22_threaded(mat, s, v, N)

Multi-threaded variant of `opt22` that returns `mat`.

The factors `-cos(2 * mod(v[i], 256))` are precomputed serially into a temporary `Float64`
array. The columns `j in 1:N` are then distributed over the available threads with
`Threads.@threads`. Each iteration writes only column `j` of `mat` and otherwise only
reads `s` and the precomputed factors, so the iterations are independent.

# Arguments
- `mat`: output matrix, overwritten in place for indices `1:N × 1:N`.
- `s`: input matrix, read at the same indices.
- `v`: input vector; a factor is computed for every element of `v`.
- `N`: extent of both loops.

# Throws
- `BoundsError` if `1:N` is not a valid index range for `v` or for either dimension of
  `mat` or `s`.
"""
function opt22_threaded(mat, s, v, N)
    # Validate once: the threaded loop body below runs under `@inbounds`.
    checkbounds(mat, 1:N, 1:N)
    checkbounds(s, 1:N, 1:N)
    checkbounds(v, 1:N)

    # `similar` gives `val` the same indices as `v`, so `val[i]` is valid wherever `v[i]`
    # is. The fused in-place broadcast fills it in a single pass without temporaries.
    val = similar(v, Float64)
    @. val = -cos(2 * mod(v, 256))

    # `@inbounds` must sit inside the threaded body: `Threads.@threads` wraps the body in
    # a closure, and an `@inbounds` applied outside the macro does not carry into it.
    Threads.@threads for j in 1:N
        @inbounds for i in 1:N
            mat[i, j] = s[i, j] * val[i]
        end
    end
    return mat
end

opt22_threaded (generic function with 1 method)

In [17]:
# Time the multi-threaded kernel on the same inputs.
runtime = @belapsed opt22_threaded($mat, $s, $v, $N);
# N*N inner iterations per call, scaled by 1e-6 to millions of iterations per second.
perf = N*N*1e-6/runtime # MIt/s
println("Performance: $perf MIt/s")
# Reference numbers, presumably recorded from earlier runs:
# Performance (1 thread):  678.7915812874131 MIt/s
# Performance (4 threads): 1172.5993721170087 MIt/s

Performance: 1075.5796365885303 MIt/s


## Maximal performance?

"Roofline model"

In [18]:
# Upper bound on the iteration rate if the kernel is limited purely by memory bandwidth:
# bandwidth [GB/s] divided by memory traffic per iteration [B/iter] gives GIt/s.
bs = 29 # [GB/s] max memory bandwidth (estimated from STREAM benchmark, 34 GB/s is the Intel value)
traffic = 24 # [B/iter] in each iteration we have: LOAD s, LOAD + STORE mat, each contributing 8 B
println("Memory bounded performance: ", round(bs/traffic, digits=2), " GIt/s")

Memory bounded performance: 1.21 GIt/s
