**Most of a program's run time is spent inside loops**

More precisely, mostly in function calls made inside those loops.

**The act of calling these functions had better be fast itself!**

### But it doesn't look like it should be fast!

To execute `foo(x,y)`, Julia chooses the **method for `foo` that is _most_ specific to the types of `x` and `y`**.

**Looking up this method can be expensive!** There are hundreds of methods for `+(x,y)`, and picking the most specific one can involve a lot of head-scratching.

But Julia can do it when **compiling** the code, once and forever!

In [75]:
"""
    sum1(xs)

Add up the elements of `xs` with an index loop, starting from the zero of the
first element. `xs` must be non-empty, since `xs[1]` is read up front.
"""
function sum1(xs)
    s = zero(xs[1])  # start from the additive zero of the first element
    for i in 1:length(xs)
        # `+` written as an ordinary function call
        s = +(s, xs[i])
    end
    return s
end

sum1 (generic function with 1 method)

In [76]:
@code_typed optimize=false sum1([1+2im, 7+9im])

CodeInfo(
[90m[77G│[1G[39m[90m2 [39m1 ─ %1  = (Base.getindex)(xs, 1)[36m::Complex{Int64}[39m
[90m[77G│[1G[39m[90m  [39m│         (s = (Main.zero)(%1))[90m::Complex{Int64}[39m
[90m[77G│[1G[39m[90m4 [39m│   %3  = (Main.length)(xs)[36m::Int64[39m
[90m[77G│[1G[39m[90m  [39m│   %4  = (1:%3)[36m::UnitRange{Int64}[39m
[90m[77G│[1G[39m[90m  [39m│         (#temp# = (Base.iterate)(%4))[90m::Union{Nothing, Tuple{Int64,Int64}}[39m
[90m[77G│[1G[39m[90m  [39m│   %6  = (#temp# === nothing)[36m::Bool[39m
[90m[77G│[1G[39m[90m  [39m│   %7  = (Base.not_int)(%6)[36m::Bool[39m
[90m[77G│[1G[39m[90m  [39m└──       goto #4 if not %7
[90m[77G│[1G[39m[90m  [39m2 ┄ %9  = #temp#::Tuple{Int64,Int64}[36m::Tuple{Int64,Int64}[39m
[90m[77G│[1G[39m[90m  [39m│         (i = (Core.getfield)(%9, 1))[90m::Int64[39m
[90m[77G│[1G[39m[90m  [39m│   %11 = (Core.getfield)(%9, 2)[36m::Int64[39m
[90m[77G│[1G[39m[90m5 [39m│   %12 = s[36m::Co

In [123]:
@code_typed sum1([1+2im, 7+9im]) # Also gets inlined -- no call to + on Complex anymore!

CodeInfo(
[90m[55G│╻     getindex[1G[39m[90m2 [39m1 ──       (Base.arrayref)(true, xs, 1)[90m::Complex{Int64}[39m
[90m[55G││╻╷╷╷  oftype[1G[39m[90m  [39m│    %2  = %new(Complex{Int64}, 0, 0)[36m::Complex{Int64}[39m
[90m[55G│╻     length[1G[39m[90m4 [39m│    %3  = (Base.arraylen)(xs)[36m::Int64[39m
[90m[55G││╻╷╷╷  Type[1G[39m[90m  [39m│    %4  = (Base.sle_int)(1, %3)[36m::Bool[39m
[90m[55G│││╻     unitrange_last[1G[39m[90m  [39m│          (Base.sub_int)(%3, 1)[90m::Int64[39m
[90m[55G││││  [1G[39m[90m  [39m│    %6  = (Base.ifelse)(%4, %3, 0)[36m::Int64[39m
[90m[55G││╻╷╷   isempty[1G[39m[90m  [39m│    %7  = (Base.slt_int)(%6, 1)[36m::Bool[39m
[90m[55G││    [1G[39m[90m  [39m└───       goto #3 if not %7
[90m[55G││    [1G[39m[90m  [39m2 ──       goto #4
[90m[55G││    [1G[39m[90m  [39m3 ──       goto #4
[90m[55G│     [1G[39m[90m  [39m4 ┄─ %11 = φ (#2 => true, #3 => false)[36m::Bool[39m
[90m[55G│     [1G[39m

In [82]:
# Addition wrapped in a function marked `@noinline` to forcefully avoid inlining.
@noinline function add(x, y)
    return x + y
end

add (generic function with 1 method)

In [83]:
"""
    sum2(xs)

Running sum over `xs` in which every addition goes through `add`.
`xs` must be non-empty, since `xs[1]` is read up front.
"""
function sum2(xs)
    total = zero(xs[1])
    for k in 1:length(xs)
        total = add(total, xs[k])
    end
    return total
end

sum2 (generic function with 1 method)

In [84]:
@code_native sum2([1+1im, 2+3im])

	.text
; Function sum2 {
; Location: In[83]:2
	pushq	%rbp
	movq	%rsp, %rbp
	pushq	%r15
	pushq	%r14
	pushq	%r13
	pushq	%r12
	pushq	%rbx
	subq	$56, %rsp
	movq	%rsi, %rbx
; Function getindex; {
; Location: array.jl:731
	movq	8(%rbx), %r14
	testq	%r14, %r14
	je	L217
;}
; Location: In[83]:4
	jle	L191
	movq	%rdi, -48(%rbp)
; Location: In[83]:4
	vxorps	%xmm0, %xmm0, %xmm0
	vmovaps	%xmm0, -80(%rbp)
	movl	$1, %r15d
	xorl	%r12d, %r12d
	leaq	-96(%rbp), %r13
	nopw	%cs:(%rax,%rax)
; Location: In[83]:5
; Function getindex; {
; Location: array.jl:731
L80:
	movq	(%rbx), %rax
	vmovups	(%rax,%r12), %xmm0
	vmovaps	%xmm0, -96(%rbp)
;}
	leaq	-64(%rbp), %rdi
	leaq	-80(%rbp), %rsi
	movq	%r13, %rdx
	movabsq	$julia_add_36169, %rax
	callq	*%rax
; Function iterate; {
; Location: range.jl:575
; Function ==; {
; Location: promotion.jl:425
	cmpq	%r15, %r14
;}}
	je	L180
	vmovaps	-64(%rbp), %xmm0
; Location: In[83]:4
	vmovaps	%xmm0, -80(%rbp)
; Location: In[83]:5
; Function getindex; {
; Location: array.jl:731
	addq	

Let's try to create a function whose result type cannot be inferred.

In [128]:
# Three small vectors whose element types differ; `@show` prints each container type.
vec1 = [1,2]                        # only Ints
@show typeof(vec1)

vec2 = Union{Int, Float64}[1,2.3]   # element type spelled out as a small Union
@show typeof(vec2)

vec3 = [1,"hi"]                     # an Int next to a String
@show typeof(vec3)
# Index the vector defined above (`vec` alone is not defined in this cell).
vec3[1] # what is the output type of vec3[i] for all i?


"""
    bad_add(x, y)

Return `x + y` plus a `0` that is fetched from a `Vector{Any}` built inside the
function.
"""
function bad_add(x, y)
    # intentionally create a Vector{Any}: an Int stored next to a String
    untyped = [0, "yes"]
    return x + y + untyped[1]
end

typeof(vec1) = Array{Int64,1}
typeof(vec2) = Array{Union{Float64, Int64},1}
typeof(vec3) = Array{Any,1}


bad_add (generic function with 1 method)

In [117]:


"""
    sum3(xs)

Running sum over `xs` in which every addition goes through `bad_add`.
`xs` must be non-empty, since `xs[1]` is read up front.
"""
function sum3(xs)
    acc = zero(xs[1])
    for idx in 1:length(xs)
        acc = bad_add(acc, xs[idx])
    end
    return acc
end

sum3 (generic function with 1 method)

In [118]:
@code_typed bad_add(1,2)

CodeInfo(
[90m[61G│     [1G[39m[90m2 [39m1 ── %1  = (Core.tuple)(0)[36m::Tuple{Int64}[39m
[90m[61G│╻     getindex[1G[39m[90m  [39m│    %2  = $(Expr(:foreigncall, :(:jl_alloc_array_1d), Array{Any,1}, svec(Any, Int64), :(:ccall), 2, Array{Any,1}, 1, 1))[36m::Array{Any,1}[39m
[90m[61G││╻╷╷   Colon[1G[39m[90m  [39m│          (Base.ifelse)(true, 1, 0)[90m::Int64[39m
[90m[61G│││╻╷╷   isempty[1G[39m[90m  [39m│    %4  = (Base.slt_int)(1, 1)[36m::Bool[39m
[90m[61G│││   [1G[39m[90m  [39m└───       goto #3 if not %4
[90m[61G│││   [1G[39m[90m  [39m2 ──       goto #4
[90m[61G│││   [1G[39m[90m  [39m3 ──       goto #4
[90m[61G││    [1G[39m[90m  [39m4 ┄─ %8  = φ (#2 => true, #3 => false)[36m::Bool[39m
[90m[61G││    [1G[39m[90m  [39m│    %9  = φ (#3 => 1)[36m::Int64[39m
[90m[61G││    [1G[39m[90m  [39m│    %10 = φ (#3 => 1)[36m::Int64[39m
[90m[61G││    [1G[39m[90m  [39m│    %11 = (Base.not_int)(%8)[36m::Bool[39m
[90m[61G

In [119]:
@code_warntype sum3([1,2])

Body[91m[1m::Any[22m[39m
[90m[55G│╻     getindex[1G[39m[90m7  [39m1 ──       (Base.arrayref)(true, xs, 1)
[90m[55G│╻     length[1G[39m[90m9  [39m│    %2  = (Base.arraylen)(xs)[36m::Int64[39m
[90m[55G││╻╷╷╷  Type[1G[39m[90m   [39m│    %3  = (Base.sle_int)(1, %2)[36m::Bool[39m
[90m[55G│││╻     unitrange_last[1G[39m[90m   [39m│          (Base.sub_int)(%2, 1)
[90m[55G││││  [1G[39m[90m   [39m│    %5  = (Base.ifelse)(%3, %2, 0)[36m::Int64[39m
[90m[55G││╻╷╷   isempty[1G[39m[90m   [39m│    %6  = (Base.slt_int)(%5, 1)[36m::Bool[39m
[90m[55G││    [1G[39m[90m   [39m└───       goto #3 if not %6
[90m[55G││    [1G[39m[90m   [39m2 ──       goto #4
[90m[55G││    [1G[39m[90m   [39m3 ──       goto #4
[90m[55G│     [1G[39m[90m   [39m4 ┄─ %10 = φ (#2 => true, #3 => false)[36m::Bool[39m
[90m[55G│     [1G[39m[90m   [39m│    %11 = φ (#3 => 1)[36m::Int64[39m
[90m[55G│     [1G[39m[90m   [39m│    %12 = φ (#3 => 1)[36m::Int6

### Common pitfall number 1: use of global variables in loops

In [15]:
# Non-const global vector; the function below reads it straight from global scope.
xs = rand(10^6)

"""
    sum1_global()

Sum the global vector `xs` with an index loop and return the total.
"""
function sum1_global()
    total = zero(xs[1])
    for k in 1:length(xs)
        total = +(total, xs[k])
    end
    return total
end

sum1_global (generic function with 1 method)

In [24]:
@code_warntype sum1_global()

Body[91m[1m::Any[22m[39m
[90m[56G│    [1G[39m[90m3 [39m1 ─ %1  = (Base.getindex)(Main.xs, 1)[91m[1m::Any[22m[39m
[90m[56G│    [1G[39m[90m  [39m│   %2  = (Main.zero)(%1)[91m[1m::Any[22m[39m
[90m[56G│    [1G[39m[90m5 [39m│   %3  = (Main.length)(Main.xs)[91m[1m::Any[22m[39m
[90m[56G│    [1G[39m[90m  [39m│   %4  = (isa)(%3, Int64)[36m::Bool[39m
[90m[56G│    [1G[39m[90m  [39m└──       goto #3 if not %4
[90m[56G│    [1G[39m[90m  [39m2 ─ %6  = π (%3, [36mInt64[39m)
[90m[56G│╻╷╷╷ Colon[1G[39m[90m  [39m│   %7  = (Base.sle_int)(1, %6)[36m::Bool[39m
[90m[56G││╻    Type[1G[39m[90m  [39m│         (Base.sub_int)(%6, 1)
[90m[56G│││┃│   unitrange_last[1G[39m[90m  [39m│   %9  = (Base.sub_int)(1, 1)[36m::Int64[39m
[90m[56G││││ [1G[39m[90m  [39m│   %10 = (Base.ifelse)(%7, %6, %9)[36m::Int64[39m
[90m[56G│││  [1G[39m[90m  [39m│   %11 = %new(UnitRange{Int64}, 1, %10)[36m::UnitRange{Int64}[39m
[90m[56G│    [1G[3

### Fixes
- Make the global an argument!
- Make the global a `const`

In [111]:
# Same kind of data as before, but bound to a `const` global.
const arr = rand(10^6)

"""
    sum2_global()

Sum the constant global vector `arr` with an index loop and return the total.
"""
function sum2_global()
    total = zero(arr[1])
    for k in 1:length(arr)
        total += arr[k]
    end
    return total
end



sum2_global (generic function with 1 method)

In [114]:
@time sum1_global()
@time sum2_global()

  0.126528 seconds (5.00 M allocations: 91.537 MiB, 5.34% gc time)
  0.001476 seconds (5 allocations: 176 bytes)


500242.1887876529

**Let's practice propagating type information in our head**

Run through the program and find out the types assigned to each variable over time.

In [27]:
"""
    logint2(n)

Count how many times `n` is halved with `/` until it is no longer greater than 1.
"""
function logint2(n)
    r = 0
    while n > 1
        n /= 2
        r = r + 1
    end
    return r
end

logint2 (generic function with 1 method)

In [32]:
# Type walk-through for a Float64 argument: each variable keeps a single type.
function logint2(n)      # assume n::Float64
    r = 0                # r :: Int
    while n > 1          # n :: Float64
        n = n / 2        # Float64 / Int is Float64, so n :: Float64
        r = r + 1        # r :: Int
    end
    return r             # Int
end

logint2 (generic function with 1 method)

In [42]:
@code_typed logint2(10.0)

CodeInfo(
[90m[69G│   [1G[39m[90m  [39m1 ─       nothing[90m::Nothing[39m
[90m[69G│   [1G[39m[90m3 [39m2 ┄ %2  = φ (#1 => 0, #3 => %12)[36m::Int64[39m
[90m[69G│   [1G[39m[90m  [39m│   %3  = φ (#1 => _2, #3 => %11)[36m::Float64[39m
[90m[69G│╻   >[1G[39m[90m  [39m│   %4  = π (1.0, [36mFloat64[39m)
[90m[69G││┃│  <[1G[39m[90m  [39m│   %5  = (Base.lt_float)(%4, %3)[36m::Bool[39m
[90m[69G│││ [1G[39m[90m  [39m│   %6  = π (1.0, [36mFloat64[39m)
[90m[69G│││╻   ==[1G[39m[90m  [39m│   %7  = (Base.eq_float)(%6, %3)[36m::Bool[39m
[90m[69G│││╻   &[1G[39m[90m  [39m│   %8  = (Base.and_int)(%7, false)[36m::Bool[39m
[90m[69G│││╻   |[1G[39m[90m  [39m│   %9  = (Base.or_int)(%5, %8)[36m::Bool[39m
[90m[69G│   [1G[39m[90m  [39m└──       goto #4 if not %9
[90m[69G│╻╷  /[1G[39m[90m4 [39m3 ─ %11 = (Base.div_float)(%3, 2.0)[36m::Float64[39m
[90m[69G│╻   +[1G[39m[90m5 [39m│   %12 = (Base.add_int)(%2, 1)[36m::Int64[39m
[

In [33]:
# Type walk-through for an Int argument: `n` enters as Int, but `/` yields Float64.
function logint2(n)      # assume n::Int
    r = 0                # r :: Int
    while n > 1          # n :: Union{Int, Float64} (propagated from below)
        n = n / 2        # right-hand n :: Union{Int, Float64}; the result is Float64
        r = r + 1        # r :: Int
    end
    return r             # Int
end

logint2 (generic function with 1 method)

In [43]:
@code_typed logint2(10)

CodeInfo(
[90m[57G│     [1G[39m[90m  [39m1 ──       nothing[90m::Nothing[39m
[90m[57G│     [1G[39m[90m3 [39m2 ┄─ %2  = φ (#1 => 0, #13 => %41)[36m::Int64[39m
[90m[57G│     [1G[39m[90m  [39m│    %3  = φ (#1 => _2, #13 => %40)[36m::Union{Float64, Int64}[39m
[90m[57G│     [1G[39m[90m  [39m│    %4  = (isa)(%3, Float64)[36m::Bool[39m
[90m[57G│     [1G[39m[90m  [39m└───       goto #4 if not %4
[90m[57G│     [1G[39m[90m  [39m3 ── %6  = π (%3, [36mFloat64[39m)
[90m[57G│╻╷    >[1G[39m[90m  [39m│    %7  = (Base.sitofp)(Float64, 1)[36m::Float64[39m
[90m[57G││╻     <[1G[39m[90m  [39m│    %8  = (Base.lt_float)(%7, %6)[36m::Bool[39m
[90m[57G│││╻     ==[1G[39m[90m  [39m│    %9  = (Base.eq_float)(%7, %6)[36m::Bool[39m
[90m[57G││││  [1G[39m[90m  [39m│    %10 = (Base.eq_float)(%7, 9.22337e18)[36m::Bool[39m
[90m[57G│││╻     unsafe_trunc[1G[39m[90m  [39m│    %11 = (Base.fptosi)(Int64, %7)[36m::Int64[39m
[90m[57G│││╻   

In [44]:
"""
    sumlog(xs)

Add up `logint2(x)` for every element `x` of `xs`, starting from the zero of the
first element. `xs` must be non-empty, since `xs[1]` is read up front.
"""
function sumlog(xs)
    s = zero(xs[1])
    for i in 1:length(xs)
        # If type of xs[i] is known, result of logint2 will be known
        s += logint2(xs[i])
    end
    return s
end

sumlog (generic function with 1 method)

In [57]:
@code_typed optimize=false sumlog(xs)

CodeInfo(
[90m[77G│[1G[39m[90m2 [39m1 ─ %1  = (Base.getindex)(xs, 1)[36m::Float64[39m
[90m[77G│[1G[39m[90m  [39m│         (s = (Main.zero)(%1))[90m::Const(0.0, false)[39m
[90m[77G│[1G[39m[90m4 [39m│   %3  = (Main.length)(xs)[36m::Int64[39m
[90m[77G│[1G[39m[90m  [39m│   %4  = (1:%3)[36m::UnitRange{Int64}[39m
[90m[77G│[1G[39m[90m  [39m│         (#temp# = (Base.iterate)(%4))[90m::Union{Nothing, Tuple{Int64,Int64}}[39m
[90m[77G│[1G[39m[90m  [39m│   %6  = (#temp# === nothing)[36m::Bool[39m
[90m[77G│[1G[39m[90m  [39m│   %7  = (Base.not_int)(%6)[36m::Bool[39m
[90m[77G│[1G[39m[90m  [39m└──       goto #4 if not %7
[90m[77G│[1G[39m[90m  [39m2 ┄ %9  = #temp#::Tuple{Int64,Int64}[36m::Tuple{Int64,Int64}[39m
[90m[77G│[1G[39m[90m  [39m│         (i = (Core.getfield)(%9, 1))[90m::Int64[39m
[90m[77G│[1G[39m[90m  [39m│   %11 = (Core.getfield)(%9, 2)[36m::Int64[39m
[90m[77G│[1G[39m[90m5 [39m│   %12 = s[36m::Float6

In [65]:
@time logint2.([i for i in 1:2^20]);

  0.097134 seconds (55.88 k allocations: 18.758 MiB, 1.91% gc time)


# Summary

**Uncertainty** is the root cause of slowness.

It can come in many forms!

1. Globals -- uncertainty about type after compilation
2. Type instability -- uncertainty about the input or output type
3. Not fixed length -- arrays or strings don't have a fixed size
4. Mutability -- uncertainty that someone else may mutate the object
5. Boxing -- uncertainty about the type or size of values

Use `@time` and pay attention to allocations -- uncertainty leads to boxing which leads to GC overhead, so you will see it!

## References

- Performance tips from the manual https://docs.julialang.org/en/v1/manual/performance-tips/
- Engineering Julia for Speed - Lionel Zoubritzky
 https://www.youtube.com/watch?v=XWIZ_dCO6X8
- Information overload: tools for making program analysis and debugging manageable - Jameson Nash JuliaCon 2018