# SIMD

SIMD stands for "Single Instruction Multiple Data" and falls into the category of instruction level parallelism (vector instructions).

In [1]:
function mysum(X)
    acc = zero(eltype(X))
    for i in 1:length(X)
        @inbounds acc += X[i]
    end
    return acc
end

mysum (generic function with 1 method)

Think about if each loop iteration is independent.

Integer addition is **associative** and the order of operations has no impact. Floating-point addition is **non-associative** and the order of operations is important.

By using `@simd`, we are asserting several properties of the loop:

* It is safe to execute iterations in arbitrary or overlapping order, with special consideration for reduction variables.
* Floating-point operations on reduction variables can be reordered, possibly causing different results than without `@simd`.

In [2]:
function mysum_simd(X)
    acc = zero(eltype(X))
    @simd for i in 1:length(X)
        @inbounds acc += X[i]
    end
    return acc
end

mysum_simd (generic function with 1 method)

In [5]:
using BenchmarkTools

In [6]:
X = rand(Float64, 1000)
@btime mysum($X);
@btime mysum_simd($X);

  964.571 ns (0 allocations: 0 bytes)
  63.594 ns (0 allocations: 0 bytes)


In [7]:
X = rand(Int64, 1000)
@btime mysum($X);
@btime mysum_simd($X);

  42.304 ns (0 allocations: 0 bytes)
  42.062 ns (0 allocations: 0 bytes)


In [60]:
X = rand(Float64, 1000)
s = mysum(X);
s_simd = mysum_simd(X);

In [61]:
s == s_simd # will sometimes be false!!!

false

In [62]:
abs(s-s_simd)

3.979039320256561e-13

In [85]:
@code_native debuginfo=:none mysum(X)

	.section	__TEXT,__text,regular,pure_instructions
	movq	8(%rdi), %rax
	testq	%rax, %rax
	jle	L46
	movq	%rax, %rcx
	sarq	$63, %rcx
	andnq	%rax, %rcx, %rax
	movq	(%rdi), %rcx
	vxorpd	%xmm0, %xmm0, %xmm0
	xorl	%edx, %edx
	nop
L32:
	vaddsd	(%rcx,%rdx,8), %xmm0, %xmm0
	incq	%rdx
	cmpq	%rdx, %rax
	jne	L32
	retq
L46:
	vxorps	%xmm0, %xmm0, %xmm0
	retq
	nopw	%cs:(%rax,%rax)
	nopl	(%rax)


In [86]:
@code_native debuginfo=:none mysum_simd(X)

	.section	__TEXT,__text,regular,pure_instructions
	movq	8(%rdi), %rax
	testq	%rax, %rax
	jle	L38
	movq	%rax, %rcx
	sarq	$63, %rcx
	andnq	%rax, %rcx, %rax
	movq	(%rdi), %rcx
	cmpq	$16, %rax
	jae	L43
	vxorpd	%xmm0, %xmm0, %xmm0
	xorl	%edx, %edx
	jmp	L160
L38:
	vxorps	%xmm0, %xmm0, %xmm0
	retq
L43:
	movl	%eax, %esi
	andl	$15, %esi
	movq	%rax, %rdx
	subq	%rsi, %rdx
	vxorpd	%xmm0, %xmm0, %xmm0
	xorl	%esi, %esi
	vxorpd	%xmm1, %xmm1, %xmm1
	vxorpd	%xmm2, %xmm2, %xmm2
	vxorpd	%xmm3, %xmm3, %xmm3
	nopl	(%rax,%rax)
L80:
	vaddpd	(%rcx,%rsi,8), %ymm0, %ymm0
	vaddpd	32(%rcx,%rsi,8), %ymm1, %ymm1
	vaddpd	64(%rcx,%rsi,8), %ymm2, %ymm2
	vaddpd	96(%rcx,%rsi,8), %ymm3, %ymm3
	addq	$16, %rsi
	cmpq	%rsi, %rdx
	jne	L80
	vaddpd	%ymm0, %ymm1, %ymm0
	vaddpd	%ymm0, %ymm2, %ymm0
	vaddpd	%ymm0, %ymm3, %ymm0
	vextractf128	$1, %ymm0, %xmm1
	vaddpd	%xmm1, %xmm0, %xmm0
	vpermilpd	$1, %xmm0, %xmm1 ## xmm1 = xmm0[1,0]
	vaddsd	%xmm1, %xmm0, %xmm0
	cmpq	%rdx, %rax
	je	L173
	nopw	%cs:(%rax,%rax)
	nop
L160:
	vaddsd	(%rcx,

## Structure of Array vs Array of Structure

In [71]:
struct MyComplex
  real::Float64
  imag::Float64
end

In [72]:
# Array of structure
AoS = [MyComplex(rand(),rand()) for i in 1:100]

100-element Array{MyComplex,1}:
 MyComplex(0.6592548704085348, 0.9304586794954257)
 MyComplex(0.9913134870338749, 0.3373358399535811)
 MyComplex(0.7663231112437918, 0.16124195631290505)
 MyComplex(0.5755773381794937, 0.9431803344795369)
 MyComplex(0.9956338880282287, 0.4319106210605499)
 MyComplex(0.49960033171605334, 0.7529216102553398)
 MyComplex(0.8204880588928809, 0.9863152363918228)
 MyComplex(0.5705486006928855, 0.1738368480210566)
 MyComplex(0.4068115374711485, 0.029177320500149984)
 MyComplex(0.21301055949797165, 0.9679714353523348)
 MyComplex(0.9974383981581558, 0.29947623977721083)
 MyComplex(0.9848752636488678, 0.32791550924604396)
 MyComplex(0.5236772849803057, 0.3775565120013684)
 ⋮
 MyComplex(0.7185535695725727, 0.7598989393612539)
 MyComplex(0.6001965780226173, 0.5802951959128866)
 MyComplex(0.49588186913586174, 0.10012096401953086)
 MyComplex(0.16766488639028188, 0.9472293366031512)
 MyComplex(0.2615159764839159, 0.4192320802218239)
 MyComplex(0.09092995990517339, 0.913

In [73]:
struct MyComplexes
  real::Vector{Float64}
  imag::Vector{Float64}
end

In [74]:
# Structure of arrays
SoA = MyComplexes(rand(100),rand(100))

MyComplexes([0.7670543146544782, 0.11064727011172515, 0.5310624348465611, 0.6791947059200489, 0.8883372308097439, 0.08006495273449965, 0.11686879735999023, 0.36517639577782957, 0.5646585684295811, 0.5431547534003083  …  0.3201719903361038, 0.9266966687298588, 0.7167174950881661, 0.8281757203583004, 0.18757444662413092, 0.16869451871160956, 0.32797033191750136, 0.6487781588729336, 0.9048863406349708, 0.4908804023503073], [0.3262879318395253, 0.4781541926714652, 0.38157165655704484, 0.34136149073340305, 0.08022845373449461, 0.16884335386306448, 0.3578108892995484, 0.6680231518357203, 0.9845223962560516, 0.6942841065189596  …  0.6107080731144634, 0.7420895417132569, 0.8071728571275285, 0.17683738666930893, 0.7977909381993387, 0.9343701479446693, 0.6575204448982133, 0.9416595975440156, 0.7872602039537329, 0.6915809774733781])

In [75]:
# Array of structure (MyComplex)
Base.:+(x::MyComplex,y::MyComplex) = MyComplex(x.real+y.real,x.imag+y.imag)
Base.:/(x::MyComplex,y::Int) = MyComplex(x.real/y,x.imag/y)
average(x::Vector{MyComplex}) = sum(x)/length(x)

# Structure of array (MyComplexes)
average(x::MyComplexes) = MyComplex(sum(x.real),sum(x.imag))/length(x.real)

average (generic function with 2 methods)

In [76]:
using BenchmarkTools

In [78]:
@btime average(AoS);
@btime average(SoA);

  37.275 ns (0 allocations: 0 bytes)
  28.785 ns (0 allocations: 0 bytes)
