In [50]:
] activate .

[32m[1mActivating[22m[39m new environment at `C:\Users\carsten\Desktop\Oulu2020\backup\Project.toml`


In [14]:
using BenchmarkTools

In [15]:
# Reference kernel: mat[i,j] = s[i,j] * (sin(x)^2 - cos(x)^2) with x = mod(v[i], 256).
#   mat : output array, overwritten in place for indices 1:N in both dimensions
#   s   : input factors, read at the same indices
#   v   : per-row values, read at indices 1:N
#   N   : loop extent in both dimensions
# Returns nothing.
function work(mat, s, v, N)
    for row in 1:N, col in 1:N
        # x depends only on the row, yet it is recomputed for every element here
        x = mod(v[row], 256)
        mat[row, col] = s[row, col] * (sin(x) * sin(x) - cos(x) * cos(x))
    end
    return nothing
end

work (generic function with 1 method)

In [16]:
using Test
# Check, at one random point, the identities behind the analytical optimisation:
#   sin(x)^2 - cos(x)^2 == 1 - 2*cos(x)^2 == -cos(2x)
x = rand()
reference = sin(x)*sin(x) - cos(x)*cos(x)
@test 1 - 2*cos(x)*cos(x) ≈ reference
@test -cos(2*x) ≈ reference

[32m[1mTest Passed[22m[39m

In [17]:
# Variant 1: hoist the row-dependent factor out of the inner loop and replace
# sin^2 - cos^2 by the equivalent -cos(2x).
# Writes mat[i,j] = s[i,j] * (-cos(2*mod(v[i], 256))) for i, j in 1:N and returns `mat`.
# Bounds checks are disabled, so all three arrays must cover 1:N.
function opt1(mat, s, v, N)
    @inbounds for row in 1:N
        factor = -cos(2 * mod(v[row], 256))
        for col in 1:N
            mat[row, col] = s[row, col] * factor
        end
    end
    return mat
end

opt1 (generic function with 1 method)

In [18]:
# Variant 1b: like opt1, but the per-row factors -cos(2*mod(v[i], 256)) are first
# stored in a freshly allocated Float64 vector (one entry per element of `v`),
# then applied in a separate row-outer / column-inner sweep.
# Writes mat[i,j] for i, j in 1:N and returns `mat`. Bounds checks are disabled.
function opt12(mat, s, v, N)
    factors = Vector{Float64}(undef, length(v))
    @inbounds for k in eachindex(factors)
        factors[k] = -cos(2 * mod(v[k], 256))
    end

    @inbounds for row in 1:N, col in 1:N
        mat[row, col] = s[row, col] * factors[row]
    end
    return mat
end

opt12 (generic function with 1 method)

In [19]:
# Variant 2: swap the loop order so the first array index varies fastest
# (columns outer, rows inner). The factor -cos(2*mod(v[i], 256)) is still
# evaluated for every element.
# Writes mat[i,j] for i, j in 1:N and returns nothing. Bounds checks are disabled.
function opt2(mat, s, v, N)
    @inbounds for col in 1:N, row in 1:N
        factor = -cos(2 * mod(v[row], 256))
        mat[row, col] = s[row, col] * factor
    end
    return nothing
end

opt2 (generic function with 1 method)

In [20]:
# Variant 2b: columns-outer / rows-inner loop order combined with a precomputed
# Float64 vector of per-row factors -cos(2*mod(v[i], 256)), so the inner loop is
# a plain multiply.
# Writes mat[i,j] for i, j in 1:N and returns nothing. Bounds checks are disabled.
function opt22(mat, s, v, N)
    factors = Vector{Float64}(undef, length(v))
    @inbounds for k in eachindex(factors)
        factors[k] = -cos(2 * mod(v[k], 256))
    end

    @inbounds for col in 1:N, row in 1:N
        mat[row, col] = s[row, col] * factors[row]
    end
    return nothing
end

opt22 (generic function with 1 method)

In [45]:
# The kernel is memory bound. Data traffic per inner-loop iteration:
#   s   → 8 bytes (read)
#   mat → 8 + 8 bytes (write, plus write allocate)
B_c = 8 + (8 + 8) # bytes/it
# I = bs / B_c

24

In [38]:
# Query the cache sizes of this machine. The recorded output has three entries,
# presumably L1/L2/L3 in bytes — confirm against the package providing `cachesize`.
cachesize()

(32768, 262144, 6291456)

In [43]:
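# Idea: precalculate val in a lookup table — mod(v[i], 256) takes only 256 distinct values.
# vtab(i) = -0.5*cos(2*i)   (note: the kernels above use -cos(2*val), without the factor 0.5)
# Index the table with IAND(v(i),255), i.e. v[i] & 255 in Julia.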

In [None]:
# Variant 3: opt22 plus blocking over rows.
# The rows 1:N are split into `Nblock` contiguous blocks; for each block all
# columns are swept with the row index innermost, so only that block's slice of
# the precomputed factors is touched while the block is processed.
#
#   mat    : output array, overwritten in place for indices 1:N in both dimensions
#   s      : input factors, read at the same indices
#   v      : per-row values; the factor for row i is -cos(2*mod(v[i], 256))
#   N      : loop extent in both dimensions
#   Nblock : number of row blocks (keyword, default 2); must be >= 1
#
# Returns nothing. Throws ArgumentError if Nblock < 1.
# Bounds checks are disabled in the loops, so all three arrays must cover 1:N.
function opt_blocking(mat, s, v, N; Nblock=2)
    Nblock >= 1 || throw(ArgumentError("Nblock must be at least 1, got $Nblock"))

    val = Vector{Float64}(undef, length(v))
    @inbounds for i in eachindex(val)
        val[i] = -cos(2 * mod(v[i], 256))
    end

    # Integer block length, rounded up so that N need not be divisible by Nblock.
    block_size = cld(N, Nblock)
    @inbounds for iblock in 1:Nblock
        ifirst = (iblock - 1) * block_size + 1
        ilast = min(iblock * block_size, N) # last block may be shorter (or empty)
        for j in 1:N
            for i in ifirst:ilast
                mat[i, j] = s[i, j] * val[i]
            end
        end
    end
    return nothing
end

In [21]:
using Hwloc
# Ask Hwloc for the number of physical cores (recorded output: 4).
Hwloc.num_physical_cores()

4

In [22]:
# Threaded variant of opt22: the per-row factors -cos(2*mod(v[i], 256)) are
# precomputed serially, then the column loop is distributed with Threads.@threads.
# Each iteration of the threaded loop writes a different column of `mat`.
# Writes mat[i,j] for i, j in 1:N and returns `mat`.
# NOTE(review): `@inbounds` sits outside `Threads.@threads`; confirm it still
# removes bounds checks inside the threaded loop body.
function opt22_threaded(mat, s, v, N)
    factors = Vector{Float64}(undef, length(v))
    @inbounds for k in eachindex(factors)
        factors[k] = -cos(2 * mod(v[k], 256))
    end

    @inbounds Threads.@threads for col in 1:N
        for row in 1:N
            mat[row, col] = s[row, col] * factors[row]
        end
    end
    return mat
end

opt22_threaded (generic function with 1 method)

In [23]:
# Problem setup shared by the benchmark cells.
N = 4000                      # extent of the square matrices
mat = zeros(Float64, N, N)    # output buffer, overwritten by every kernel
s = rand(Float64, N, N)       # input factors
v = rand(Int, N);             # per-row integer values

In [24]:
# Run the threaded kernel once on the full problem (fills `mat`; output suppressed).
opt22_threaded(mat, s, v, N);

In [25]:
# Benchmark the baseline kernel `work` and report its throughput.
runtime = @belapsed work($mat, $s, $v, $N);
# N^2 inner-loop iterations per call, scaled to millions per second
perf = N^2 * 1e-6 / runtime # MIt/s
println("Performance: $perf MIt/s")

Performance: 20.388401573030933 MIt/s


In [26]:
# Benchmark `opt1` (hoisted factor, row-outer loops) and report its throughput.
runtime = @belapsed opt1($mat, $s, $v, $N);
# N^2 inner-loop iterations per call, scaled to millions per second
perf = N^2 * 1e-6 / runtime # MIt/s
println("Performance: $perf MIt/s")

Performance: 78.76380212563812 MIt/s


In [27]:
# Benchmark `opt12` (precomputed factors, row-outer loops) and report its throughput.
runtime = @belapsed opt12($mat, $s, $v, $N);
# N^2 inner-loop iterations per call, scaled to millions per second
perf = N^2 * 1e-6 / runtime # MIt/s
println("Performance: $perf MIt/s")

Performance: 71.90096720282318 MIt/s


In [28]:
# Benchmark `opt2` (column-outer loops, factor computed per element) and report its throughput.
runtime = @belapsed opt2($mat, $s, $v, $N);
# N^2 inner-loop iterations per call, scaled to millions per second
perf = N^2 * 1e-6 / runtime # MIt/s
println("Performance: $perf MIt/s")

Performance: 56.04036307150225 MIt/s


In [29]:
# Benchmark `opt22` (column-outer loops, precomputed factors) and report its throughput.
runtime = @belapsed opt22($mat, $s, $v, $N);
# N^2 inner-loop iterations per call, scaled to millions per second
perf = N^2 * 1e-6 / runtime # MIt/s
println("Performance: $perf MIt/s")

Performance: 1138.3282227139166 MIt/s


In [18]:
# Ratio of two throughput figures — presumably opt22 (≈1137 MIt/s) over the baseline
# `work` (21.47 MIt/s, which differs from the ≈20.39 recorded above, so likely from
# another run) — TODO confirm.
1137/21.47

52.957615277130884

In [30]:
# Benchmark `opt22_threaded` and report its throughput.
runtime = @belapsed opt22_threaded($mat, $s, $v, $N);
# N^2 inner-loop iterations per call, scaled to millions per second
perf = N^2 * 1e-6 / runtime # MIt/s
println("Performance: $perf MIt/s")
# Recorded results:
# Performance (1 thread):   678.7915812874131 MIt/s
# Performance (4 threads): 1172.5993721170087 MIt/s

Performance: 1172.5993721170087 MIt/s


In [93]:
# Reference figure labelled "emmy" — presumably the baseline throughput in MIt/s
# measured on another machine; TODO confirm source and units.
emmy = 86

86

In [None]:
# Reference figure for the optimised kernel on "emmy" — presumably MIt/s; TODO confirm.
emmy_opt = 600

In [None]:
# Reference figure for the threaded optimised kernel on "emmy".
# NOTE(review): the magnitude (1.8e9) suggests It/s rather than the MIt/s that
# `emmy` and `emmy_opt` appear to use — confirm units before comparing.
emmy_opt_threaded = 1.8 * 1e9

# Multiple `N`

In [18]:
using Plots

# Benchmark the baseline kernel; `work` takes the problem size `N` as its fourth
# argument, so it has to be interpolated into the benchmarked call as well.
runtime = @belapsed work($mat, $s, $v, $N);
# N*N inner-loop iterations per call, scaled to millions per second
perf = N*N*1e-6/runtime # MIt/s
println("Performance: $perf MIt/s")