In [7]:
] activate ../

[32m[1mActivating[22m[39m environment at `~/Project.toml`


## Problem

Optimize the following function.

In [10]:
"""
    work(mat, s, v, N)

Reference (deliberately unoptimised) kernel of this notebook.

For every `i` and `j` in `1:N`, store `s[i, j] * (sin(x)^2 - cos(x)^2)` in `mat[i, j]`,
where `x = mod(v[i], 256)`. The factor is recomputed for every element and the matrix is
traversed with the second index running fastest; later cells optimise both.

`mat` is modified in place and the function returns `nothing`.
"""
function work(mat, s, v, N)
    for i in 1:N, j in 1:N
        x = mod(v[i], 256)
        mat[i, j] = s[i, j] * (sin(x) * sin(x) - cos(x) * cos(x))
    end
    return nothing
end

work (generic function with 1 method)

In [11]:
using Test
# Numerically confirm, at one random point, the identities the optimised kernels rely on:
#   sin(x)^2 - cos(x)^2 == 1 - 2*cos(x)^2 == -cos(2x)
x = rand()
@test isapprox(sin(x)*sin(x) - cos(x)*cos(x), 1 - 2*cos(x)*cos(x))
@test isapprox(sin(x)*sin(x) - cos(x)*cos(x), -cos(2*x))

[32m[1mTest Passed[22m[39m

In [12]:
# pulling out + analytical opt
"""
    opt1(mat, s, v, N)

Fill `mat[i, j] = s[i, j] * -cos(2 * mod(v[i], 256))` for all `i, j` in `1:N`.

Optimisation step: the factor depends only on `i`, so it is computed once per row instead
of once per element, and `sin(x)^2 - cos(x)^2` is replaced by the equivalent `-cos(2x)`.
The loop order of the baseline (second index innermost) is kept on purpose.

# Returns
`mat`, modified in place.

# Throws
`BoundsError` if `1:N` is not a valid index range for `mat`, `s` (both dimensions) or `v`.
"""
function opt1(mat, s, v, N)
    # The loops run under `@inbounds`, so validate the caller-supplied `N` once up front;
    # otherwise a too-large `N` would silently read and write out of bounds.
    checkbounds(mat, 1:N, 1:N)
    checkbounds(s, 1:N, 1:N)
    checkbounds(v, 1:N)

    @inbounds for i in 1:N
        # Row factor: one `mod` and one `cos` per row.
        val = -cos(2 * mod(v[i], 256))
        for j in 1:N
            mat[i, j] = s[i, j] * val
        end
    end
    return mat
end

opt1 (generic function with 1 method)

In [13]:
# pulling out + analytical opt + allocate values
"""
    opt12(mat, s, v, N)

Fill `mat[i, j] = s[i, j] * -cos(2 * mod(v[i], 256))` for all `i, j` in `1:N`.

Variant of `opt1` that first tabulates the per-row factor for every element of `v` in a
freshly allocated `Vector{Float64}` and then applies it. The loop order of the baseline
(second index innermost) is kept on purpose.

# Returns
`mat`, modified in place.

# Throws
`BoundsError` if `1:N` is not a valid index range for `mat`, `s` (both dimensions) or `v`.
"""
function opt12(mat, s, v, N)
    # The loops run under `@inbounds`, so validate the caller-supplied `N` once up front.
    checkbounds(mat, 1:N, 1:N)
    checkbounds(s, 1:N, 1:N)
    checkbounds(v, 1:N)

    # Table of row factors; `eachindex(val, v)` guarantees both arrays share the indices.
    val = Vector{Float64}(undef, length(v))
    @inbounds for i in eachindex(val, v)
        val[i] = -cos(2 * mod(v[i], 256))
    end

    @inbounds for i in 1:N
        for j in 1:N
            mat[i, j] = s[i, j] * val[i]
        end
    end
    return mat
end

opt12 (generic function with 1 method)

In [14]:
# reordering loops
"""
    opt2(mat, s, v, N)

Fill `mat[i, j] = s[i, j] * -cos(2 * mod(v[i], 256))` for all `i, j` in `1:N`.

Optimisation step: the loops are reordered so that the first index `i` runs innermost,
which matches Julia's column-major array layout. The factor is still recomputed for every
element.

# Returns
`mat`, modified in place (like the other optimised variants).

# Throws
`BoundsError` if `1:N` is not a valid index range for `mat`, `s` (both dimensions) or `v`.
"""
function opt2(mat, s, v, N)
    # The loops run under `@inbounds`, so validate the caller-supplied `N` once up front.
    checkbounds(mat, 1:N, 1:N)
    checkbounds(s, 1:N, 1:N)
    checkbounds(v, 1:N)

    @inbounds for j in 1:N
        for i in 1:N
            val = -cos(2 * mod(v[i], 256))
            mat[i, j] = s[i, j] * val
        end
    end
    return mat
end

opt2 (generic function with 1 method)

In [15]:
# reordering loops + allocate values
"""
    opt22(mat, s, v, N)

Fill `mat[i, j] = s[i, j] * -cos(2 * mod(v[i], 256))` for all `i, j` in `1:N`.

Combines both optimisations: the per-row factor is tabulated once for every element of `v`
in a freshly allocated `Vector{Float64}`, and the main loops run with the first index `i`
innermost, matching Julia's column-major array layout.

# Returns
`mat`, modified in place (like the other optimised variants).

# Throws
`BoundsError` if `1:N` is not a valid index range for `mat`, `s` (both dimensions) or `v`.
"""
function opt22(mat, s, v, N)
    # The loops run under `@inbounds`, so validate the caller-supplied `N` once up front.
    checkbounds(mat, 1:N, 1:N)
    checkbounds(s, 1:N, 1:N)
    checkbounds(v, 1:N)

    # Table of row factors; `eachindex(val, v)` guarantees both arrays share the indices.
    val = Vector{Float64}(undef, length(v))
    @inbounds for i in eachindex(val, v)
        val[i] = -cos(2 * mod(v[i], 256))
    end

    @inbounds for j in 1:N
        for i in 1:N
            mat[i, j] = s[i, j] * val[i]
        end
    end
    return mat
end

opt22 (generic function with 1 method)

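Further possible optimizations: loop blocking (tiling), or tabulating the values of `val` — for integer `v`, `mod(v[i], 256)` can take only 256 distinct values, so the cosine could be looked up in a 256-entry table.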

## Single thread benchmarks

In [22]:
using BenchmarkTools
# Benchmark inputs shared by all cells below.
N = 4000 # problem size: every kernel processes the N×N block of `mat` and `s`
mat = zeros(N,N) # output matrix, overwritten by each kernel
s = rand(N,N) # input matrix of random Float64 values
v = rand(Int, N); # one random Int per row; the kernels reduce it with mod(·, 256)

In [24]:
# Time the baseline `work`; `$` interpolates the global inputs into the benchmark expression.
runtime = @belapsed work($mat, $s, $v, $N);
# One call performs N*N inner-loop iterations; with `runtime` in seconds this is
# millions of iterations per second.
perf = N*N*1e-6/runtime # MIt/s
println("Performance: $perf MIt/s")

Performance: 8.832933179745996 MIt/s


In [25]:
# Time `opt1` (factor hoisted out of the inner loop) on the same inputs.
runtime = @belapsed opt1($mat, $s, $v, $N);
# N*N inner-loop iterations per call; with `runtime` in seconds this is
# millions of iterations per second.
perf = N*N*1e-6/runtime # MIt/s
println("Performance: $perf MIt/s")

Performance: 18.954256315496306 MIt/s


In [26]:
# Time `opt12` (factor tabulated in a vector first) on the same inputs.
runtime = @belapsed opt12($mat, $s, $v, $N);
# N*N inner-loop iterations per call; with `runtime` in seconds this is
# millions of iterations per second.
perf = N*N*1e-6/runtime # MIt/s
println("Performance: $perf MIt/s")

Performance: 18.667771913881896 MIt/s


In [27]:
# Time `opt2` (loops reordered, first index innermost) on the same inputs.
runtime = @belapsed opt2($mat, $s, $v, $N);
# N*N inner-loop iterations per call; with `runtime` in seconds this is
# millions of iterations per second.
perf = N*N*1e-6/runtime # MIt/s
println("Performance: $perf MIt/s")

Performance: 39.56193729495743 MIt/s


In [28]:
# Time `opt22` (reordered loops plus tabulated factor) on the same inputs.
runtime = @belapsed opt22($mat, $s, $v, $N);
# N*N inner-loop iterations per call; with `runtime` in seconds this is
# millions of iterations per second.
perf = N*N*1e-6/runtime # MIt/s
println("Performance: $perf MIt/s")

Performance: 641.2379740326675 MIt/s


In [29]:
# Speed-up of `opt22` (≈641 MIt/s) relative to a baseline of 21.47 MIt/s.
# NOTE(review): the `work` benchmark printed above reports ≈8.83 MIt/s, not 21.47 —
# presumably 21.47 comes from an earlier run; confirm which baseline is intended.
641/21.47

52.957615277130884

## Multi-threading

In [34]:
using Hwloc
# Query the number of physical CPU cores via Hwloc.
Hwloc.num_physical_cores()

8

In [35]:
# Number of threads available to this Julia session (used by `Threads.@threads` below).
Base.Threads.nthreads()

8

In [None]:
"""
    opt22_threaded(mat, s, v, N)

Multi-threaded version of `opt22`: fill `mat[i, j] = s[i, j] * -cos(2 * mod(v[i], 256))`
for all `i, j` in `1:N`.

The per-row factor is tabulated serially; the columns `j` are then distributed over the
available threads with `Threads.@threads`. Each iteration writes only to column `j` of
`mat`, so the iterations are independent.

# Returns
`mat`, modified in place.

# Throws
`BoundsError` if `1:N` is not a valid index range for `mat`, `s` (both dimensions) or `v`.
"""
function opt22_threaded(mat, s, v, N)
    # The loops run under `@inbounds`, so validate the caller-supplied `N` once up front.
    checkbounds(mat, 1:N, 1:N)
    checkbounds(s, 1:N, 1:N)
    checkbounds(v, 1:N)

    # Table of row factors; `eachindex(val, v)` guarantees both arrays share the indices.
    val = Vector{Float64}(undef, length(v))
    @inbounds for i in eachindex(val, v)
        val[i] = -cos(2 * mod(v[i], 256))
    end

    Threads.@threads for j in 1:N
        # `@inbounds` is placed inside the threaded body: `@threads` turns the body into a
        # closure, and an `@inbounds` written outside the macro may not apply to it.
        @inbounds for i in 1:N
            mat[i, j] = s[i, j] * val[i]
        end
    end
    return mat
end

In [30]:
# Time the threaded kernel on the same inputs.
runtime = @belapsed opt22_threaded($mat, $s, $v, $N);
# N*N inner-loop iterations per call; with `runtime` in seconds this is
# millions of iterations per second.
perf = N*N*1e-6/runtime # MIt/s
println("Performance: $perf MIt/s")
# Results recorded from runs with other thread counts:
# Performance (1 thread):   678.7915812874131 MIt/s
# Performance (4 threads): 1172.5993721170087 MIt/s

Performance: 1710.8674621986506 MIt/s


## Maximal performance?

"Roofline model"

In [84]:
# Memory-bound upper limit on the iteration rate: bandwidth divided by bytes moved per iteration.
bs = 50 # [GB/s] max memory bandwidth (only an estimate for the unknown CPU model)
# 3 transfers of 8 B (one Float64) each; the LOAD of `mat` before its STORE is presumably
# the write-allocate of the cache line — TODO confirm for the target CPU.
traffic = 24 # [B/iter] in each iteration we have: LOAD s, LOAD + STORE mat, each contributing 8 B
# [GB/s] / [B/iter] = [GIt/s]
println("Memory bounded performance: ", round(bs/traffic, digits=2), " GIt/s")

Memory bounded performance: 2.08 GIt/s
