# Exercise: Performance Optimization 1

Optimize the following code.

(The type and size of the input is fixed/may not be changed.)

In [13]:
function work!(A, N)
    D = zeros(N,N)
    for i in 1:N
        D = b[i]*c*A
        b[i] = sum(D)
    end
end

N = 100
A = rand(N,N)
b = rand(N)
c = 1.23

work!(A,N)

In [14]:
using BenchmarkTools
@btime work!($A, $N);

  556.375 μs (402 allocations: 7.71 MiB)


## Optimizations

### Avoiding globals

In [15]:
@code_warntype work!(A,N)

MethodInstance for work!(::Matrix{Float64}, ::Int64)
  from work!([90mA[39m, [90mN[39m)[90m @[39m [90mMain[39m [90m~/repos/JuliaHLRS23/exercises/Day2/1_perf_optimization/solution_proposal/[39m[90m[4mperf_optimization1_solution.ipynb:1[24m[39m
Arguments
  #self#[36m::Core.Const(work!)[39m
  A[36m::Matrix{Float64}[39m
  N[36m::Int64[39m
Locals
  @_4[33m[1m::Union{Nothing, Tuple{Int64, Int64}}[22m[39m
  D[91m[1m::Any[22m[39m
  i[36m::Int64[39m
Body[36m::Nothing[39m
[90m1 ─[39m       (D = Main.zeros(N, N))
[90m│  [39m %2  = (1:N)[36m::Core.PartialStruct(UnitRange{Int64}, Any[Core.Const(1), Int64])[39m
[90m│  [39m       (@_4 = Base.iterate(%2))
[90m│  [39m %4  = (@_4 === nothing)[36m::Bool[39m
[90m│  [39m %5  = Base.not_int(%4)[36m::Bool[39m
[90m└──[39m       goto #4 if not %5
[90m2 ┄[39m %7  = @_4[36m::Tuple{Int64, Int64}[39m
[90m│  [39m       (i = Core.getfield(%7, 1))
[90m│  [39m %9  = Core.getfield(%7, 2)[36m::Int64[39m
[90m│ 

In [16]:
function work1!(A, N, b, c) # b and c are now function arguments
    D = zeros(N,N)
    for i in 1:N
        D = b[i]*c*A
        b[i] = sum(D)
    end
end

work1! (generic function with 1 method)

In [17]:
@code_warntype work1!(A,N,b,c)

MethodInstance for work1!(::Matrix{Float64}, ::Int64, ::Vector{Float64}, ::Float64)
  from work1!([90mA[39m, [90mN[39m, [90mb[39m, [90mc[39m)[90m @[39m [90mMain[39m [90m~/repos/JuliaHLRS23/exercises/Day2/1_perf_optimization/solution_proposal/[39m[90m[4mperf_optimization1_solution.ipynb:1[24m[39m
Arguments
  #self#[36m::Core.Const(work1!)[39m
  A[36m::Matrix{Float64}[39m
  N[36m::Int64[39m
  b[36m::Vector{Float64}[39m
  c[36m::Float64[39m
Locals
  @_6[33m[1m::Union{Nothing, Tuple{Int64, Int64}}[22m[39m
  D[36m::Matrix{Float64}[39m
  i[36m::Int64[39m
Body[36m::Nothing[39m
[90m1 ─[39m       (D = Main.zeros(N, N))
[90m│  [39m %2  = (1:N)[36m::Core.PartialStruct(UnitRange{Int64}, Any[Core.Const(1), Int64])[39m
[90m│  [39m       (@_6 = Base.iterate(%2))
[90m│  [39m %4  = (@_6 === nothing)[36m::Bool[39m
[90m│  [39m %5  = Base.not_int(%4)[36m::Bool[39m
[90m└──[39m       goto #4 if not %5
[90m2 ┄[39m %7  = @_6[36m::Tuple{Int64, Int64}[

In [18]:
@btime work1!($A, $N, $b, $c);

  529.542 μs (202 allocations: 7.71 MiB)


### Avoiding globals + temporary allocations

In [19]:
function work2!(A, N, b)
    D = zeros(N,N)
    for i in 1:N
        @. D = b[i]*c*A
        b[i] = sum(D)
    end
end

@btime work2!($A, $N, $b);

  355.209 μs (302 allocations: 85.98 KiB)


In [20]:
function work3!(A, N, b, c)
    D = zeros(N,N)
    for i in 1:N
        @inbounds for j in eachindex(D)
            D[j] = b[i]*c*A[j]
        end
        b[i] = sum(D)
    end
end

@btime work3!($A, $N, $b, $c);

  310.042 μs (2 allocations: 78.17 KiB)


### Avoiding globals + temporary allocations and merging `sum` with loop

In [21]:
function work4!(A, N, b, c)
    D = zeros(N,N)
    for i in 1:N
        s = zero(eltype(D))
        @inbounds @simd for j in eachindex(D)
            D[j] = b[i]*c*A[j]
            s += D[j]
        end
        b[i] = s
    end
end

@btime work4!($A, $N, $b, $c);

  233.958 μs (2 allocations: 78.17 KiB)


### Realizing that one can factor out `b` and `c`

In [22]:
# function work!(A, N)
#     D = zeros(N,N)
#     for i in 1:N
#         D = b[i]*c*A
#         b[i] = sum(D)
#     end
# end

# function work!(A, N)
#     for i in 1:N
#         b[i] = sum(b[i]*c*A)
#     end
# end

# function work!(A, N)
#     for i in 1:N
#         b[i] = b[i]*c*sum(A)
#     end
# end

# function work!(A, N)
#     D = c*sum(A)
#     for i in 1:N
#         b[i] *= D
#     end
# end

function work5!(A, N, b, c)
    D = c * sum(A)
    b .*= D
end

@btime work5!($A, $N, $b, $c);

  1.712 μs (0 allocations: 0 bytes)
