# Profiling

In [7]:
# Silly, slow example: repeatedly square a matrix and rescale it by its largest entry

In [20]:
using Profile
using LinearAlgebra

In [4]:
A = randn(30, 30)

30×30 Matrix{Float64}:
 -0.914795    -1.96561    -0.903287   …   1.18039    -1.5253      0.101923
  0.00843821  -0.243326    1.43056       -0.394222   -0.947004   -0.0743636
  0.0720819    0.0656803   1.41185       -0.460671    0.434779   -0.10102
  0.367917     1.88246    -0.645694      -0.227686   -0.643866    0.14855
  0.0829269   -0.220208    0.771834      -0.885813   -1.5749      0.387158
 -0.862632    -0.345017    0.560816   …   0.43014    -0.152852    0.717941
  0.608336    -0.732846    0.959007       1.05239     0.509366    1.3045
  1.91098     -0.24113    -0.872244       1.47294    -0.127014   -0.00670381
 -0.685637    -0.742322   -0.680385      -2.44592     0.0817437   0.822828
 -0.17129     -1.31511     0.355184      -2.88387     0.710741   -0.609816
 -0.413969     0.28541    -1.29035    …   1.74428     0.299926   -0.73581
  0.678496    -0.24379    -2.48961        1.41738    -0.839976   -0.862686
 -0.677874    -0.137215    0.240916       1.04924    -0.920848   -1.15836
  ⋮  

In [18]:
# Deliberately slow baseline: `A` is a global, and every pass builds a new matrix for the
# product and another one for the rescaled result.
@time for _ in 1:10^6
    A = A * A
    A = A / maximum(A)
end

  4.695663 seconds (3.00 M allocations: 13.724 GiB, 6.78% gc time)


In [13]:
# Run the same slow loop under the sampling profiler; view the samples with Profile.print()
@profile for _ in 1:10^6
    A = A * A
    A = A / maximum(A)
end

In [14]:
Profile.print()

Overhead ╎ [+additional indent] Count File:Line; Function
   1╎2     @Base/arraymath.jl:24; /(A::Matrix{Float64}, B::Float64)
    ╎ 1     @Base/broadcast.jl:849; broadcast_preserving_zero_d
    ╎  1     @Base/broadcast.jl:860; materialize
    ╎   1     @Base/broadcast.jl:885; copy
    ╎    1     @Base/broadcast.jl:211; similar
    ╎     1     @Base/broadcast.jl:212; similar
    ╎    ╎ 1     .../abstractarray.jl:840; similar
    ╎    ╎  1     ...abstractarray.jl:841; similar
    ╎    ╎   1     @Base/boot.jl:476; Array
    ╎    ╎    1     @Base/boot.jl:469; Array
   1╎    ╎     1     @Base/boot.jl:461; Array
   1╎1     @Base/range.jl:352; steprange_last(start::Int64, s...
   1╎1     @Base/reduce.jl:0; mapreduce_impl(f::typeof(ident...
    ╎2     @Base/reduce.jl:645; mapreduce_impl(f::typeof(ident...
    ╎ 2     @Base/range.jl:22; Colon
    ╎  2     @Base/range.jl:24; _colon
    ╎   2     @Base/range.jl:369; StepRange
    ╎    2     @Base/range.jl:316; StepRange
   2╎     2     @Base/rang

In [26]:
# Put code into a function
"""
    squaremany(A; iters=10^6)

Repeatedly square `A` and divide the result by its largest entry, `iters` times, and
return the final matrix.

Each pass builds a new matrix for the product and another for the rescaled result. Only
the local name `A` is rebound, so the caller's array is left untouched.

# Keywords
- `iters::Integer=10^6`: number of square-and-rescale passes.
"""
function squaremany(A; iters::Integer=10^6)
    for _ in 1:iters
        A = A * A
        A /= maximum(A)
    end
    return A
end

squaremany (generic function with 1 method)

In [28]:
# Fresh input; `squaremany` rebinds only its local `A`, so this array is left unmodified.
A = randn(30, 30)
@time squaremany(A);

  4.782865 seconds (2.00 M allocations: 13.709 GiB, 8.04% gc time)


In [33]:
# Manual memory management
"""
    squaremany_fast!(A; iters=10^6)

In-place variant of the square-and-rescale loop: `iters` times, compute `A * A` into a
scratch buffer and write the product divided by its largest entry back into `A`.

`A` is overwritten and returned. The scratch buffer is allocated once, before the loop.

# Keywords
- `iters::Integer=10^6`: number of square-and-rescale passes.
"""
function squaremany_fast!(A; iters::Integer=10^6)
    # Separate scratch buffer: the output of `mul!` must not alias either input.
    B = similar(A)
    for _ in 1:iters
        mul!(B, A, A)
        A .= B ./ maximum(B)
    end
    return A
end


squaremany_fast! (generic function with 1 method)

In [41]:
# Fresh random input; `squaremany_fast!` overwrites this array in place.
A = randn(30, 30)
@time squaremany_fast!(A);

  3.246797 seconds (1 allocation: 7.188 KiB)


In [39]:
# Discard previously collected samples so the report reflects only this run
Profile.clear()
@profile squaremany_fast!(A);
Profile.print()

Overhead ╎ [+additional indent] Count File:Line; Function
   4╎4    @Base/range.jl:352; steprange_last(start::Int64, st...
   1╎1    @Base/reduce.jl:0; _mapreduce(f::typeof(identity),...
   1╎1    @Base/reduce.jl:423; _mapreduce(f::typeof(identity),...
   1╎1    @Base/reduce.jl:0; mapreduce_impl(f::typeof(identi...
    ╎2    @Base/reduce.jl:638; mapreduce_impl(f::typeof(identi...
   1╎ 1    @Base/array.jl:924; getindex
   1╎ 1    @Base/reduce.jl:636; getindex
    ╎1    @Base/reduce.jl:645; mapreduce_impl(f::typeof(identi...
    ╎ 1    @Base/range.jl:22; Colon
    ╎  1    @Base/range.jl:24; _colon
    ╎   1    @Base/range.jl:369; StepRange
    ╎    1    @Base/range.jl:316; StepRange
   1╎     1    @Base/range.jl:321; steprange_last(start::Int64,...
    ╎3213 @Base/task.jl:484; (::IJulia.var"#15#18")()
    ╎ 3213 ...lia/src/eventloop.jl:8; eventloop(socket::ZMQ.Socket)
    ╎  3213 @Base/essentials.jl:726; invokelatest
    ╎   3213 @Base/essentials.jl:729; #invokelatest#2
    ╎    3213 ..

# Distributed Computing

In [42]:
using Distributed

In [43]:
nworkers()

1

In [44]:
# Start 4 additional worker processes; the returned vector holds their process ids
addprocs(4)

4-element Vector{Int64}:
 2
 3
 4
 5

In [46]:
nworkers()

4

In [47]:
# Input for the serial `map` example (reused by the `pmap` calls)
a = collect(1:10)

10-element Vector{Int64}:
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10

In [53]:
# Serial baseline: `map` applies the function to each element in the current process
b = map(x -> 2x + 1, a)

10-element Vector{Int64}:
  3
  5
  7
  9
 11
 13
 15
 17
 19
 21

In [57]:
# Same computation, with the elements handed out to the worker processes by `pmap`
c = pmap(x -> 2x + 1, a)

10-element Vector{Int64}:
  3
  5
  7
  9
 11
 13
 15
 17
 19
 21

In [60]:
# Each call sleeps 5 s; the 10 elements take about 15 s with the 4 workers added above
@time c = pmap(x -> (sleep(5); 2x + 1), a)

 15.100866 seconds (117.63 k allocations: 6.013 MiB, 0.47% compilation time)


10-element Vector{Int64}:
  3
  5
  7
  9
 11
 13
 15
 17
 19
 21

# Multi-threading

In [61]:
# Number of threads available to this session; with 1, `Threads.@threads` loops run serially
Threads.nthreads()

1

In [66]:
# Fill `c` from `a` in a threaded loop. The iterations are independent because each one
# writes only its own `c[i]`, so no synchronization is needed.
a = collect(1:10)
c = similar(a)
Threads.@threads for i in eachindex(a, c)
    c[i] = 2 * a[i] + 1
end