# Performance optimization exercise 1

Optimize the following code.

(The type and size of the input is fixed/may not be changed.)

In [74]:
function work!(A, N)
    D = zeros(N,N)
    for i in 1:N
        D = b[i]*c*A
        b[i] = sum(D)
    end
end

N = 100
A = rand(N,N)
b = rand(N)
c = 1.23

work!(A,N)

In [75]:
using BenchmarkTools
@btime work!($A, $N);

  2.796 ms (502 allocations: 7.72 MiB)


## Optimizations

### Avoiding globals

In [76]:
@code_warntype work!(A,N)

Variables
  #self#[36m::Core.Compiler.Const(work!, false)[39m
  A[36m::Array{Float64,2}[39m
  N[36m::Int64[39m
  D[91m[1m::Any[22m[39m
  @_5[33m[1m::Union{Nothing, Tuple{Int64,Int64}}[22m[39m
  i[36m::Int64[39m

Body[36m::Nothing[39m
[90m1 ─[39m       (D = Main.zeros(N, N))
[90m│  [39m %2  = (1:N)[36m::Core.Compiler.PartialStruct(UnitRange{Int64}, Any[Core.Compiler.Const(1, false), Int64])[39m
[90m│  [39m       (@_5 = Base.iterate(%2))
[90m│  [39m %4  = (@_5 === nothing)[36m::Bool[39m
[90m│  [39m %5  = Base.not_int(%4)[36m::Bool[39m
[90m└──[39m       goto #4 if not %5
[90m2 ┄[39m %7  = @_5::Tuple{Int64,Int64}[36m::Tuple{Int64,Int64}[39m
[90m│  [39m       (i = Core.getfield(%7, 1))
[90m│  [39m %9  = Core.getfield(%7, 2)[36m::Int64[39m
[90m│  [39m %10 = Base.getindex(Main.b, i)[91m[1m::Any[22m[39m
[90m│  [39m       (D = %10 * Main.c * A)
[90m│  [39m %12 = Main.sum(D)[91m[1m::Any[22m[39m
[90m│  [39m       Base.setindex!(Main.b,

In [77]:
function work1!(A, N, b, c)
    D = zeros(N,N)
    for i in 1:N
        D = b[i]*A
        b[i] = sum(D)
    end
end

work1! (generic function with 2 methods)

In [78]:
@code_warntype work1!(A,N,b,c)

Variables
  #self#[36m::Core.Compiler.Const(work1!, false)[39m
  A[36m::Array{Float64,2}[39m
  N[36m::Int64[39m
  b[36m::Array{Float64,1}[39m
  c[36m::Float64[39m
  D[36m::Array{Float64,2}[39m
  @_7[33m[1m::Union{Nothing, Tuple{Int64,Int64}}[22m[39m
  i[36m::Int64[39m

Body[36m::Nothing[39m
[90m1 ─[39m       (D = Main.zeros(N, N))
[90m│  [39m %2  = (1:N)[36m::Core.Compiler.PartialStruct(UnitRange{Int64}, Any[Core.Compiler.Const(1, false), Int64])[39m
[90m│  [39m       (@_7 = Base.iterate(%2))
[90m│  [39m %4  = (@_7 === nothing)[36m::Bool[39m
[90m│  [39m %5  = Base.not_int(%4)[36m::Bool[39m
[90m└──[39m       goto #4 if not %5
[90m2 ┄[39m %7  = @_7::Tuple{Int64,Int64}[36m::Tuple{Int64,Int64}[39m
[90m│  [39m       (i = Core.getfield(%7, 1))
[90m│  [39m %9  = Core.getfield(%7, 2)[36m::Int64[39m
[90m│  [39m %10 = Base.getindex(b, i)[36m::Float64[39m
[90m│  [39m       (D = %10 * A)
[90m│  [39m %12 = Main.sum(D)[36m::Float64[39m
[90m│

In [79]:
@btime work1!($A, $N, $b, $c);

  2.774 ms (202 allocations: 7.71 MiB)


### Avoiding globals + temporary allocations

In [46]:
function work2!(A, N, b)
    D = zeros(N,N)
    for i in 1:N
        @. D = b[i]*A
        b[i] = sum(D)
    end
end

@btime work2!($A, $N, $b);

  258.407 μs (2 allocations: 78.20 KiB)


In [47]:
function work3!(A, N, b, c)
    D = zeros(N,N)
    for i in 1:N
        @inbounds for j in eachindex(D)
            D[j] = b[i]*c*A[j]
        end
        b[i] = sum(D)
    end
end

@btime work3!($A, $N, $b, $c);

  236.812 μs (2 allocations: 78.20 KiB)


### Avoiding globals + temporary allocations and merging `sum` with loop

In [50]:
function work4!(A, N, b, c)
    D = zeros(N,N)
    for i in 1:N
        s = 0.0
        @inbounds @simd for j in eachindex(D)
            D[j] = b[i]*c*A[j]
            s += D[j]
        end
        b[i] = s
    end
end

@btime work4!($A, $N, $b, $c);

  148.491 μs (2 allocations: 78.20 KiB)
