### Measure performance with @time and pay attention to memory allocation

In [1]:
function sum_global()
 global x
 s = 0.0
 for i in x
  s += i
 end
 return s
end

x = rand(1)
print("Pierwsza kompilacja jest niemiarodajna\t\t")
@time sum_global(); # żeby skompilować

Pierwsza kompilacja jest niemiarodajna		  0.023568 seconds (6.83 k allocations: 340.258 KiB)


In [2]:
x = rand(1000)
@time sum_global();

  0.000256 seconds (3.49 k allocations: 70.313 KiB)


In [3]:
function sum_arg(x)
 s = 0.0
 for i in x
  s += i
 end
 return s
end

x = rand(1)
print("Pierwsza kompilacja jest niemiarodajna\t\t")
@time sum_arg(x);

Pierwsza kompilacja jest niemiarodajna		  0.027336 seconds (5.27 k allocations: 242.211 KiB)


In [4]:
x = rand(1000)
@time sum_arg(x);

  0.000025 seconds (5 allocations: 176 bytes)


In [5]:
time_sum(x) = @time sum_arg(x)
time_sum(x);

  0.000013 seconds


In [9]:
import BenchmarkTools: @btime

### Avoid containers with abstract type parameters

In [7]:
f(v) = v[1];

In [8]:
a = Real[]
push!(a, 1)
push!(a, 2.0)
push!(a, π)

3-element Array{Real,1}:
 1  
 2.0
 π  

In [9]:
@code_native f(a)

	.text
; ┌ @ In[7]:1 within `f'
	pushq	%rbp
	movq	%rsp, %rbp
	subq	$16, %rsp
	movq	%rsi, -8(%rbp)
	movq	(%rsi), %rdi
; │┌ @ array.jl:744 within `getindex'
	cmpq	$0, 8(%rdi)
	je	L38
	movq	(%rdi), %rax
	movq	(%rax), %rax
	testq	%rax, %rax
	je	L73
; │└
	movq	%rbp, %rsp
	popq	%rbp
	retq
; │┌ @ array.jl:744 within `getindex'
L38:
	movq	%rsp, %rax
	leaq	-16(%rax), %rsi
	movq	%rsi, %rsp
	movq	$1, -16(%rax)
	movabsq	$jl_bounds_error_ints, %rax
	movl	$1, %edx
	callq	*%rax
L73:
	movabsq	$jl_throw, %rax
	movabsq	$jl_system_image_data, %rdi
	callq	*%rax
; └└


In [10]:
a = Float64[]
push!(a, 1)
push!(a, 2.0)
push!(a, π)

3-element Array{Float64,1}:
 1.0              
 2.0              
 3.141592653589793

In [11]:
@code_native f(a)

	.text
; ┌ @ In[7]:1 within `f'
; │┌ @ In[7]:1 within `getindex'
	cmpq	$0, 8(%rdi)
	je	L15
	movq	(%rdi), %rax
	vmovsd	(%rax), %xmm0           # xmm0 = mem[0],zero
; │└
	retq
L15:
	pushq	%rbp
	movq	%rsp, %rbp
; │ @ In[7]:1 within `f'
; │┌ @ array.jl:744 within `getindex'
	movq	%rsp, %rax
	leaq	-16(%rax), %rsi
	movq	%rsi, %rsp
	movq	$1, -16(%rax)
	movabsq	$jl_bounds_error_ints, %rax
	movl	$1, %edx
	callq	*%rax
	nopw	%cs:(%rax,%rax)
; └└


### Avoid fields with abstract types

In [12]:
struct MyAmbiguousType
 a
end

b = MyAmbiguousType("Hello")
c = MyAmbiguousType(17)

@show typeof(b);
@show typeof(c);

typeof(b) = MyAmbiguousType
typeof(c) = MyAmbiguousType


In [13]:
mutable struct MyStillAmbiguousType
 a :: AbstractFloat
end

mutable struct EquivalentByConcreteType{T <: AbstractFloat}
 a :: T
end

t = MyStillAmbiguousType(3.2)
m = EquivalentByConcreteType(3.2)

@show typeof(t);
@show typeof(m);
println()
@show typeof(t.a);
t.a = 4.5f0
@show typeof(t.a);
println()
@show typeof(m.a);
m.a = 4.5f0
@show typeof(m.a);

typeof(t) = MyStillAmbiguousType
typeof(m) = EquivalentByConcreteType{Float64}

typeof(t.a) = Float64
typeof(t.a) = Float32

typeof(m.a) = Float64
typeof(m.a) = Float64


In [14]:
r = EquivalentByConcreteType{AbstractFloat}(3.2)

@show typeof(r.a);
r.a = 4.5f0
@show typeof(r.a);

typeof(r.a) = Float64
typeof(r.a) = Float32


In [15]:
func(m::EquivalentByConcreteType) = m.a + 1
@code_native func(m)

	.text
; ┌ @ In[15]:1 within `func'
; │┌ @ In[15]:1 within `getproperty'
	vmovsd	(%rdi), %xmm0           # xmm0 = mem[0],zero
	movabsq	$140067473019728, %rax  # imm = 0x7F63FFF95350
; │└
; │┌ @ promotion.jl:311 within `+' @ float.jl:401
	vaddsd	(%rax), %xmm0, %xmm0
; │└
	retq
	nopw	%cs:(%rax,%rax)
; └


In [16]:
func(m::MyStillAmbiguousType) = m.a + 1
@code_native func(t)

	.text
; ┌ @ In[16]:1 within `func'
	pushq	%rbx
	subq	$48, %rsp
	vxorps	%xmm0, %xmm0, %xmm0
	vmovaps	%xmm0, (%rsp)
	movq	$0, 16(%rsp)
	movq	%rsi, 40(%rsp)
	movq	%fs:0, %rax
	movq	$2, (%rsp)
	movq	-15712(%rax), %rcx
	movq	%rcx, 8(%rsp)
	movq	%rsp, %rcx
	movq	%rcx, -15712(%rax)
	leaq	-15712(%rax), %rbx
	movq	(%rsi), %rax
; │┌ @ Base.jl:20 within `getproperty'
	movq	(%rax), %rdi
; │└
	movq	-8(%rdi), %rax
	shrq	$4, %rax
	movabsq	$8754233578515, %rcx    # imm = 0x7F640FB9413
	movq	%rdi, 16(%rsp)
	cmpq	%rcx, %rax
	je	L162
	movq	%rdi, 24(%rsp)
	movabsq	$140067642601568, %rax  # imm = 0x7F640A14F060
	movq	%rax, 32(%rsp)
	movabsq	$jl_apply_generic, %rax
	movabsq	$jl_system_image_data, %rdi
	leaq	24(%rsp), %rsi
	movl	$2, %edx
	callq	*%rax
	jmp	L179
L162:
	movabsq	$"+", %rax
	movl	$1, %esi
	callq	*%rax
L179:
	movq	8(%rsp), %rcx
	movq	%rcx, (%rbx)
; │ @ In[16]:1 within `func'
	addq	$48, %rsp
	popq	%rbx
	retq
; └


### Avoid fields with abstract containers

In [17]:
struct MyConcreteContainer{A <: AbstractVector}
 a :: A
end

struct MyAmbiguousContainer{T}
 a :: AbstractVector{T}
end
ur = 1:10  # zwraca zakres
ar =[1:10;]# zwraca kolejne wartości z zakresu

@show ur;
@show ar;

ur = 1:10
ar = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]


In [18]:
@show typeof(MyConcreteContainer(ur));
@show typeof(MyConcreteContainer(ar));
@show typeof(MyAmbiguousContainer(ur));
@show typeof(MyAmbiguousContainer(ar));

typeof(MyConcreteContainer(ur)) = MyConcreteContainer{UnitRange{Int64}}
typeof(MyConcreteContainer(ar)) = MyConcreteContainer{Array{Int64,1}}
typeof(MyAmbiguousContainer(ur)) = MyAmbiguousContainer{Int64}
typeof(MyAmbiguousContainer(ar)) = MyAmbiguousContainer{Int64}


In [19]:
function sumfoo(c :: MyConcreteContainer)
 s = 0
 for x in c.a
  s += foo(x)
 end
 s
end
foo(x::Integer) = x;
foo(x::AbstractFloat) = round(x);

In [20]:
function bar(c :: MyConcreteContainer{ <:AbstractArray{ <:Integer }})
 return c.a[1] + 1
end

function bar(c :: MyConcreteContainer{ <:AbstractArray{ <:AbstractFloat }})
 return c.a[1] + 2
end

function bar(c :: MyConcreteContainer{ Vector{T} }) where T <: Integer
 return c.a[1] + 3
end

@show bar(MyConcreteContainer(1:3));
@show bar(MyConcreteContainer(1.0:3));
@show bar(MyConcreteContainer([1:3;]));

bar(MyConcreteContainer(1:3)) = 2
bar(MyConcreteContainer(1.0:3)) = 3.0
bar(MyConcreteContainer([1:3;])) = 4


### Break functions into multiple definitions

In [21]:
using LinearAlgebra

function _norm(A)
    if isa(A, Vector)
        return sqrt(real(dot(A,A)))
    elseif isa(A, Matrix)
        return maximum(svdvals(A))
    else
        error("_norm: invalid argument")
    end
end

_norm (generic function with 1 method)

In [22]:
_norm(x :: Vector) = sqrt(real(dot(x, x)))
_norm(A :: Matrix) = maximum(svdvals(A))

_norm (generic function with 3 methods)

### Write "type-stable" functions

In [23]:
pos(x) = x < 0 ? 0 : x
@code_warntype pos(5.0)

Variables
  #self#[36m::Core.Compiler.Const(pos, false)[39m
  x[36m::Float64[39m

Body[91m[1m::Union{Float64, Int64}[22m[39m
[90m1 ─[39m %1 = (x < 0)[36m::Bool[39m
[90m└──[39m      goto #3 if not %1
[90m2 ─[39m      return 0
[90m3 ─[39m      return x


In [24]:
pos(x) = x < 0 ? zero(x) : x
@code_warntype pos(5.0)

Variables
  #self#[36m::Core.Compiler.Const(pos, false)[39m
  x[36m::Float64[39m

Body[36m::Float64[39m
[90m1 ─[39m %1 = (x < 0)[36m::Bool[39m
[90m└──[39m      goto #3 if not %1
[90m2 ─[39m %3 = Main.zero(x)[36m::Core.Compiler.Const(0.0, false)[39m
[90m└──[39m      return %3
[90m3 ─[39m      return x


### Avoid changing the type of a variable inside a function

In [25]:
function baz()
 x = 1
 # x = 1.0
 # x :: Float64 = 1
 # x = oneunit(Float64)
 for i = 1:10
  x /= rand()
 end
 return x
end
@code_warntype baz()

Variables
  #self#[36m::Core.Compiler.Const(baz, false)[39m
  x[91m[1m::Union{Float64, Int64}[22m[39m
  @_3[33m[1m::Union{Nothing, Tuple{Int64,Int64}}[22m[39m
  i[36m::Int64[39m

Body[36m::Float64[39m
[90m1 ─[39m       (x = 1)
[90m│  [39m %2  = (1:10)[36m::Core.Compiler.Const(1:10, false)[39m
[90m│  [39m       (@_3 = Base.iterate(%2))
[90m│  [39m %4  = (@_3::Core.Compiler.Const((1, 1), false) === nothing)[36m::Core.Compiler.Const(false, false)[39m
[90m│  [39m %5  = Base.not_int(%4)[36m::Core.Compiler.Const(true, false)[39m
[90m└──[39m       goto #4 if not %5
[90m2 ┄[39m %7  = @_3::Tuple{Int64,Int64}[36m::Tuple{Int64,Int64}[39m
[90m│  [39m       (i = Core.getfield(%7, 1))
[90m│  [39m %9  = Core.getfield(%7, 2)[36m::Int64[39m
[90m│  [39m %10 = x[91m[1m::Union{Float64, Int64}[22m[39m
[90m│  [39m %11 = Main.rand()[36m::Float64[39m
[90m│  [39m       (x = %10 / %11)
[90m│  [39m       (@_3 = Base.iterate(%2, %9))
[90m│  [39m %14 = (@_3

### Separate kernel functions (aka, function barriers)

In [26]:
function strange_twos(n)
 a = Vector{rand() > 0.5 ? Int64 : Float64}(undef, n)
 for i = 1:n
  a[i] = 2
 end
 return a
end
@code_warntype strange_twos(1)
@show strange_twos(1);
@show strange_twos(1);
@show strange_twos(1);
@show strange_twos(1);

Variables
  #self#[36m::Core.Compiler.Const(strange_twos, false)[39m
  n[36m::Int64[39m
  a[91m[1m::Array{_A,1} where _A[22m[39m
  @_4[33m[1m::Union{Nothing, Tuple{Int64,Int64}}[22m[39m
  i[36m::Int64[39m
  @_6[91m[1m::Union{Type{Float64}, Type{Int64}}[22m[39m

Body[91m[1m::Array{_A,1} where _A[22m[39m
[90m1 ─[39m       Core.NewvarNode(:(a))
[90m│  [39m       Core.NewvarNode(:(@_4))
[90m│  [39m %3  = Main.rand()[36m::Float64[39m
[90m│  [39m %4  = (%3 > 0.5)[36m::Bool[39m
[90m└──[39m       goto #3 if not %4
[90m2 ─[39m       (@_6 = Main.Int64)
[90m└──[39m       goto #4
[90m3 ─[39m       (@_6 = Main.Float64)
[90m4 ┄[39m %9  = @_6[91m[1m::Union{Type{Float64}, Type{Int64}}[22m[39m
[90m│  [39m %10 = Core.apply_type(Main.Vector, %9)[91m[1m::Type{Array{_A,1}} where _A[22m[39m
[90m│  [39m       (a = (%10)(Main.undef, n))
[90m│  [39m %12 = (1:n)[36m::Core.Compiler.PartialStruct(UnitRange{Int64}, Any[Core.Compiler.Const(1, false), Int64]

In [27]:
function fill_twos!(a)
 for i = eachindex(a)
  a[i] = 2
 end
end

function strange_twos(n)
 a = Vector{rand(Bool) ? Int64 : Float64}(undef, n)
 fill_twos!(a)
 return a
end
@code_warntype strange_twos(1)

Variables
  #self#[36m::Core.Compiler.Const(strange_twos, false)[39m
  n[36m::Int64[39m
  a[91m[1m::Array{_A,1} where _A[22m[39m
  @_4[91m[1m::Union{Type{Float64}, Type{Int64}}[22m[39m

Body[91m[1m::Array{_A,1} where _A[22m[39m
[90m1 ─[39m      Core.NewvarNode(:(a))
[90m│  [39m %2 = Main.rand(Main.Bool)[36m::Bool[39m
[90m└──[39m      goto #3 if not %2
[90m2 ─[39m      (@_4 = Main.Int64)
[90m└──[39m      goto #4
[90m3 ─[39m      (@_4 = Main.Float64)
[90m4 ┄[39m %7 = @_4[91m[1m::Union{Type{Float64}, Type{Int64}}[22m[39m
[90m│  [39m %8 = Core.apply_type(Main.Vector, %7)[91m[1m::Type{Array{_A,1}} where _A[22m[39m
[90m│  [39m      (a = (%8)(Main.undef, n))
[90m│  [39m      Main.fill_twos!(a)
[90m└──[39m      return a


### Access arrays in memory order, along columns

In [28]:
function copy_cols!(out::Array{T,2}, x::Vector{T}) where T
    inds = axes(x, 1);
    for i = inds
        out[:, i] = x
    end
    return out
end

function copy_rows!(out::Array{T,2}, x::Vector{T}) where T
    inds = axes(x, 1);
    for i = inds
        out[i, :] = x
    end
    return out
end

function copy_col_row!(out::Array{T,2}, x::Vector{T}) where T
    inds = axes(x, 1);
    for col = inds, row = inds
        out[row, col] = x[row]
    end
    return out
end

function copy_row_col!(out::Array{T,2}, x::Vector{T}) where T
    inds = axes(x, 1);
    for row = inds, col = inds
        out[row, col] = x[col]
    end
    return out
end
n = 10_000;
x = randn(n);
out = similar(Array{Float64}, n, n);

In [29]:
@btime copy_cols!(out, x);

  114.268 ms (0 allocations: 0 bytes)


In [30]:
@btime copy_rows!(out, x);

  2.882 s (0 allocations: 0 bytes)


In [31]:
@btime copy_col_row!(out, x);

  85.068 ms (0 allocations: 0 bytes)


In [32]:
@btime copy_row_col!(out, x);

  2.880 s (0 allocations: 0 bytes)


### Pre-allocating outputs

In [33]:
# Trivial
function xinc(x)
 return [x, x+1, x+2]
end

function loopinc()
 y = 0
 for i = 1:10^7
  ret = xinc(i)
  y += ret[2]
 end
 return y
end

loopinc (generic function with 1 method)

In [34]:
# Using pre-allocation
function xinc!(ret, x)
 ret[1] = x
 ret[2] = x+1
 ret[3] = x+2
 nothing
end

function loopinc_prealloc()
 ret = Vector{Int}(undef, 3)
 y = 0
 for i = 1:10^7
  xinc!(ret, i)
  y += ret[2]
 end
 return y
end

loopinc_prealloc (generic function with 1 method)

In [35]:
@btime loopinc();
@btime loopinc_prealloc();

  1.021 s (10000000 allocations: 1.04 GiB)
  10.694 ms (1 allocation: 112 bytes)


### More dots: Fuse vectorized operations

In [36]:
f(x) =    3x.^2 + 4x + 7x.^3;
g(x) = @. 3x^2  + 4x + 7x^3;

x = rand(10^6);

In [37]:
@btime f(x);
@btime g(x);
@btime f.(x);

  8.855 ms (12 allocations: 45.78 MiB)
  1.139 ms (2 allocations: 7.63 MiB)
  1.119 ms (5 allocations: 7.63 MiB)


### Consider using views for slices

In [38]:
       fcopy(x) = sum(x[2:end-1]);
@views fview(x) = sum(x[2:end-1]);

In [39]:
x = rand(10^6);
@btime fcopy(x);
@btime fview(x);

  1.283 ms (3 allocations: 7.63 MiB)
  460.700 μs (2 allocations: 64 bytes)


### Copying data is not always bad

In [10]:
using Random
N, n = 1_000_000, 500_000;
inds = shuffle(1:N)[1:n];
println("Indeksy: ", inds[1:5], "...")

x = randn(N);
A = randn(50, N);

x̂ = zeros(n);
Â = zeros(50, n);

@btime @views sum(A[:, inds] * x[inds])
@btime @views begin
           copyto!(x̂, x[inds])
           copyto!(Â, A[:, inds])
           sum(Â * x̂)
       end

Indeksy: [312390, 17432, 853856, 62834, 420377]...
  323.159 ms (12 allocations: 784 bytes)
  245.241 ms (12 allocations: 784 bytes)


-5008.961654302468

### Use performance annotations

In [11]:
@noinline function inner(x, y)
    s = zero(eltype(x))
    for i=eachindex(x)
        @inbounds s += x[i]*y[i]
    end
    return s
end

@noinline function innersimd(x, y)
    s = zero(eltype(x))
    @simd for i = eachindex(x)
        @inbounds s += x[i] * y[i]
    end
    return s
end

function timeit(n, reps)
    x = rand(Float32, n)
    y = rand(Float32, n)
    s = zero(Float64)
    time = @elapsed for j in 1:reps
        s += inner(x, y)
    end
    println("GFlop/sec        = ", 2n*reps / time*1E-9)
    time = @elapsed for j in 1:reps
        s += innersimd(x, y)
    end
    println("GFlop/sec (SIMD) = ", 2n*reps / time*1E-9)
end

timeit(0, 0); # aby Julia skompilowała funkcje

GFlop/sec        = 0.0
GFlop/sec (SIMD) = 0.0


In [12]:
timeit(1000, 1000)

GFlop/sec        = 1.1638056444573757
GFlop/sec (SIMD) = 15.163002274450342


In [13]:
function init!(u::Vector)
    n = length(u)
    dx = 1.0 / (n-1)
    @fastmath @inbounds @simd for i in 1:n #by asserting that `u` is a `Vector` we can assume it has 1-based indexing
        u[i] = sin(2pi*dx*i)
    end
end

function deriv!(u::Vector, du)
    n = length(u)
    dx = 1.0 / (n-1)
    @fastmath @inbounds du[1] = (u[2] - u[1]) / dx
    @fastmath @inbounds @simd for i in 2:n-1
        du[i] = (u[i+1] - u[i-1]) / (2*dx)
    end
    @fastmath @inbounds du[n] = (u[n] - u[n-1]) / dx
end

function mynorm(u::Vector)
    n = length(u)
    T = eltype(u)
    s = zero(T)
    @fastmath @inbounds @simd for i in 1:n
        s += u[i]^2
    end
    @fastmath @inbounds return sqrt(s)
end

function main()
    n = 2000
    u = Vector{Float64}(undef, n)
    init!(u)
    du = similar(u)

    deriv!(u, du)
    nu = mynorm(du)

    @time for i in 1:10^6
        deriv!(u, du)
        nu = mynorm(du)
    end

    println("ν = ", nu)
end

print("Niemiarodajny wynik: \t\t")
main();
println()
main();

Niemiarodajny wynik: 		  0.632163 seconds
ν = 198.74110382490196

  0.676987 seconds
ν = 198.74110382490196


In [14]:
function init!(u::Vector)
    n = length(u)
    dx = 1.0 / (n-1)
    @inbounds @simd for i in 1:n #by asserting that `u` is a `Vector` we can assume it has 1-based indexing
        u[i] = sin(2pi*dx*i)
    end
end

function deriv!(u::Vector, du)
    n = length(u)
    dx = 1.0 / (n-1)
    @inbounds du[1] = (u[2] - u[1]) / dx
    @inbounds @simd for i in 2:n-1
        du[i] = (u[i+1] - u[i-1]) / (2*dx)
    end
    @inbounds du[n] = (u[n] - u[n-1]) / dx
end

function mynorm(u::Vector)
    n = length(u)
    T = eltype(u)
    s = zero(T)
    @inbounds @simd for i in 1:n
        s += u[i]^2
    end
    @inbounds return sqrt(s)
end

function main()
    n = 2000
    u = Vector{Float64}(undef, n)
    init!(u)
    du = similar(u)

    deriv!(u, du)
    nu = mynorm(du)

    @time for i in 1:10^6
        deriv!(u, du)
        nu = mynorm(du)
    end

    println("ν = ", nu)
end

print("Niemiarodajny wynik: \t\t")
main();
println()
main();

Niemiarodajny wynik: 		  1.669284 seconds
ν = 198.74110382490198

  1.685399 seconds
ν = 198.74110382490198


### Closures and performance of captured variables

In [15]:
function abmult(r :: Int)
    if r < 0
        r = -r
    end
    f = x -> x * r
    return f
end
@code_llvm abmult(-5)


;  @ In[15]:2 within `abmult'
define nonnull %jl_value_t addrspace(10)* @julia_abmult_17714(i64) {
top:
  %1 = alloca %jl_value_t addrspace(10)*, i32 2
  %gcframe = alloca %jl_value_t addrspace(10)*, i32 4
  %2 = bitcast %jl_value_t addrspace(10)** %gcframe to i8*
  call void @llvm.memset.p0i8.i32(i8* %2, i8 0, i32 32, i32 0, i1 false)
  %thread_ptr = call i8* asm "movq %fs:0, $0", "=r"()
  %ptls_i8 = getelementptr i8, i8* %thread_ptr, i64 -15712
  %ptls = bitcast i8* %ptls_i8 to %jl_value_t***
; ┌ @ boot.jl:338 within `Box'
   %3 = getelementptr %jl_value_t addrspace(10)*, %jl_value_t addrspace(10)** %gcframe, i32 0
   %4 = bitcast %jl_value_t addrspace(10)** %3 to i64*
   store i64 4, i64* %4
   %5 = getelementptr %jl_value_t**, %jl_value_t*** %ptls, i32 0
   %6 = getelementptr %jl_value_t addrspace(10)*, %jl_value_t addrspace(10)** %gcframe, i32 1
   %7 = bitcast %jl_value_t addrspace(10)** %6 to %jl_value_t***
   %8 = load %jl_value_t**, %jl_value_t*** %5
   store %jl_value_t** %8

In [16]:
function abmult2(r0 :: Int)
    r :: Int = r0
    if r < 0
        r = -r
    end
    f = x -> x * r
    return f
end
@code_llvm abmult2(-5)


;  @ In[16]:2 within `abmult2'
define nonnull %jl_value_t addrspace(10)* @julia_abmult2_17715(i64) {
top:
  %gcframe = alloca %jl_value_t addrspace(10)*, i32 3
  %1 = bitcast %jl_value_t addrspace(10)** %gcframe to i8*
  call void @llvm.memset.p0i8.i32(i8* %1, i8 0, i32 24, i32 0, i1 false)
  %thread_ptr = call i8* asm "movq %fs:0, $0", "=r"()
  %ptls_i8 = getelementptr i8, i8* %thread_ptr, i64 -15712
  %ptls = bitcast i8* %ptls_i8 to %jl_value_t***
; ┌ @ boot.jl:339 within `Box'
   %2 = getelementptr %jl_value_t addrspace(10)*, %jl_value_t addrspace(10)** %gcframe, i32 0
   %3 = bitcast %jl_value_t addrspace(10)** %2 to i64*
   store i64 2, i64* %3
   %4 = getelementptr %jl_value_t**, %jl_value_t*** %ptls, i32 0
   %5 = getelementptr %jl_value_t addrspace(10)*, %jl_value_t addrspace(10)** %gcframe, i32 1
   %6 = bitcast %jl_value_t addrspace(10)** %5 to %jl_value_t***
   %7 = load %jl_value_t**, %jl_value_t*** %4
   store %jl_value_t** %7, %jl_value_t*** %6
   %8 = bitcast %jl_value_

### Comparison with Numba

In [17]:
import BenchmarkTools: @btime
function go(a)
    trace = 0
    for i in 1:size(a, 1)
        trace += tanh(a[i, i])
    end
    return a .+ trace
end

x = reshape(1:100, 10, 10)
@btime go(x);

  315.267 ns (1 allocation: 896 bytes)
