**Important note:** This notebook uses `Cxx.jl`, which only works on Julia 1.3.1 :(

https://github.com/mitmath/18S096/blob/master/lectures/lecture1/Performance-variation.ipynb
https://github.com/mitmath/18S096/blob/master/lectures/lecture1/Boxes-and-registers.ipynb

In [8]:
using BenchmarkTools
using Cxx
using PyCall

In [9]:
# 10^7 random Float64 values; every benchmark below sums this data (directly or converted)
x = rand(10^7);
d = Dict() # to store the measurement results: implementation name => minimum time in ms

Dict{Any,Any} with 0 entries

## Hand-written C

In [40]:
# C source for a straightforward summation loop over `n` doubles, kept as a Julia
# string so that it can be piped to the C compiler
c_code = """
#include <stddef.h>
double c_sum(size_t n, double *X) {
    double s = 0.0;
    for (size_t i = 0; i < n; ++i) {
        s += X[i];
    }
    return s;
}
"""

"#include <stddef.h>\ndouble c_sum(size_t n, double *X) {\n    double s = 0.0;\n    for (size_t i = 0; i < n; ++i) {\n        s += X[i];\n    }\n    return s;\n}\n"

In [41]:
# compile to a shared library by piping c_code to gcc:
# (only works if you have gcc installed)
# Clib is a fresh temporary path *without* the shared-library extension; the
# extension (Libdl.dlext) is appended when gcc writes the library
const Clib = tempname()
using Libdl

In [42]:
# Feed the C source to gcc on stdin ("-xc ... -") and have it write the shared
# library to Clib plus the platform's shared-library extension.
let libpath = Clib * "." * Libdl.dlext
    open(`gcc -fPIC -O3 -msse3 -xc -shared -o $libpath -`, "w") do gcc_stdin
        print(gcc_stdin, c_code)
    end
end

In [43]:
"""
    c_sum(X::Array{Float64})

Sum the elements of `X` by calling the C function `c_sum` from the shared library
`Clib`, passing the element count and the array data.
"""
function c_sum(X::Array{Float64})
    return ccall(("c_sum", Clib), Cdouble, (Csize_t, Ptr{Cdouble}), length(X), X)
end

c_sum (generic function with 1 method)

In [44]:
c_sum(x) ≈ sum(x)

true

In [45]:
b = @benchmark c_sum($x)

BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     11.135 ms (0.00% GC)
  median time:      12.231 ms (0.00% GC)
  mean time:        12.287 ms (0.00% GC)
  maximum time:     15.974 ms (0.00% GC)
  --------------
  samples:          407
  evals/sample:     1

In [16]:
d["C"] = minimum(b.times) / 1e6

11.241185

## Hand-written C++

In [17]:
# C++ version of the same summation loop, handed to Cxx.jl through its cxx string macro
cpp_code = cxx"""
double sum_array(size_t n, double *X) {
    double s = 0.0;
    for (size_t i = 0; i < n; ++i) {
        s += X[i];
    }
    return s;
}
"""

true

In [18]:
# Sum `xs` by calling the C++ function `sum_array` through Cxx.jl's `@cxx` macro,
# passing the element count and a raw pointer to the data.
function cpp_sum(xs)
    n = length(xs)
    data = pointer(xs)
    return @cxx sum_array(n, data)
end

cpp_sum (generic function with 1 method)

In [19]:
b = @benchmark cpp_sum($x)

BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     24.008 ms (0.00% GC)
  median time:      24.773 ms (0.00% GC)
  mean time:        25.394 ms (0.00% GC)
  maximum time:     30.628 ms (0.00% GC)
  --------------
  samples:          197
  evals/sample:     1

In [20]:
d["C++"] = minimum(b.times) / 1e6

24.007968

## Built-in Python / numpy `sum`

In [21]:
# call a low-level PyCall function to get a Python list, because
# by default PyCall will convert to a NumPy array instead (we benchmark NumPy below):
xpy_list = PyCall.array2py(x)
# get the Python built-in "sum" function:
pysum = pybuiltin("sum")

# interpolate `pysum` as well as its argument: both are non-constant globals, and an
# un-interpolated global adds dynamic-lookup overhead to the measured time (this also
# matches how the numpy and hand-written Python benchmarks below are written)
b = @benchmark $pysum($xpy_list)

BenchmarkTools.Trial: 
  memory estimate:  48 bytes
  allocs estimate:  3
  --------------
  minimum time:     51.067 ms (0.00% GC)
  median time:      54.054 ms (0.00% GC)
  mean time:        54.420 ms (0.00% GC)
  maximum time:     59.876 ms (0.00% GC)
  --------------
  samples:          92
  evals/sample:     1

In [22]:
d["Python (built-in)"] = minimum(b.times) / 1e6

51.06728

In [23]:
numpy_sum = pyimport("numpy").sum
xpy_numpy = PyObject(x) # converts to a numpy array by default
b = @benchmark $numpy_sum($xpy_numpy)

BenchmarkTools.Trial: 
  memory estimate:  48 bytes
  allocs estimate:  3
  --------------
  minimum time:     3.866 ms (0.00% GC)
  median time:      4.186 ms (0.00% GC)
  mean time:        4.363 ms (0.00% GC)
  maximum time:     6.830 ms (0.00% GC)
  --------------
  samples:          1144
  evals/sample:     1

In [24]:
d["Python (numpy)"] = minimum(b.times) / 1e6

3.865609

## Hand-written Python

In [25]:
# define a plain Python summation loop `mysum` through PyCall's py string macro,
# then fetch it into Julia so that it can be called (and benchmarked) from here
py"""
def mysum(a):
    s = 0.0
    for x in a:
        s = s + x
    return s
"""
mysum_py = py"mysum"

PyObject <function mysum at 0x7f92a1f31c20>

In [26]:
b = @benchmark $mysum_py($xpy_list)

BenchmarkTools.Trial: 
  memory estimate:  48 bytes
  allocs estimate:  3
  --------------
  minimum time:     235.610 ms (0.00% GC)
  median time:      238.732 ms (0.00% GC)
  mean time:        241.121 ms (0.00% GC)
  maximum time:     252.385 ms (0.00% GC)
  --------------
  samples:          21
  evals/sample:     1

In [27]:
d["Python (hand-written)"] = minimum(b.times) / 1e6

235.609548

In [26]:
# @btime $mysum_py($xpy_numpy)

## Built-in Julia `sum`

In [28]:
b = @benchmark sum($x)

BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     3.569 ms (0.00% GC)
  median time:      4.765 ms (0.00% GC)
  mean time:        4.888 ms (0.00% GC)
  maximum time:     10.608 ms (0.00% GC)
  --------------
  samples:          1021
  evals/sample:     1

In [29]:
d["Julia (built-in)"] = minimum(b.times) / 1e6

3.569286

In [31]:
# same values as `x`, but stored in a Vector whose element type is Any instead of Float64
x_any = Vector{Any}(x)
b = @benchmark sum($x_any)

BenchmarkTools.Trial: 
  memory estimate:  152.59 MiB
  allocs estimate:  9999999
  --------------
  minimum time:     202.155 ms (0.00% GC)
  median time:      220.342 ms (0.00% GC)
  mean time:        225.099 ms (8.63% GC)
  maximum time:     253.700 ms (17.68% GC)
  --------------
  samples:          23
  evals/sample:     1

In [32]:
d["Julia (built-in, Any)"] = minimum(b.times) / 1e6

202.155479

## Hand-written Julia

In [33]:
"""
    mysum1(A)

Return the sum of the elements of `A`, accumulated one element at a time in iteration
order, starting from `zero(eltype(A))`.
"""
function mysum1(A)
    total = zero(eltype(A)) # accumulator starts at the zero of A's element type
    for elem in A
        total = total + elem
    end
    return total
end

mysum1 (generic function with 1 method)

In [34]:
b = @benchmark mysum1($x)

BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     11.023 ms (0.00% GC)
  median time:      12.436 ms (0.00% GC)
  mean time:        12.518 ms (0.00% GC)
  maximum time:     15.818 ms (0.00% GC)
  --------------
  samples:          400
  evals/sample:     1

In [35]:
d["Julia (hand-written)"] = minimum(b.times) / 1e6

11.023287

In [36]:
"""
    mysum(A)

Return the sum of the elements of `A`, starting from `zero(eltype(A))`. The loop is
annotated with `@simd`, which permits the compiler to reorder the additions.
"""
function mysum(A)
    total = zero(eltype(A)) # accumulator starts at the zero of A's element type
    @simd for elem in A
        total = total + elem
    end
    return total
end

mysum (generic function with 1 method)

In [37]:
b = @benchmark mysum($x)

BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     3.556 ms (0.00% GC)
  median time:      4.872 ms (0.00% GC)
  mean time:        4.901 ms (0.00% GC)
  maximum time:     7.262 ms (0.00% GC)
  --------------
  samples:          1018
  evals/sample:     1

In [38]:
d["Julia (hand-written, simd)"] = minimum(b.times) / 1e6

3.555818

## Summary

In [39]:
# Print the results fastest-first: implementation name left-aligned in 30 columns,
# time (ms, rounded to 2 decimals) right-aligned in 10 columns, both padded with dots.
foreach(sort(collect(d); by=last)) do (label, ms)
    println(rpad(label, 30, "."), lpad(round(ms, digits=2), 10, "."))
end

Julia (hand-written, simd)..........3.56
Julia (built-in)....................3.57
Python (numpy)......................3.87
Julia (hand-written)...............11.02
C..................................11.24
C++................................24.01
Python (built-in)..................51.07
Julia (built-in, Any).............202.16
Python (hand-written).............235.61
