# Introduction to DataFrames
**[Bogumił Kamiński](http://bogumilkaminski.pl/about/), Dec 10, 2017**

A brief introduction to basic usage of `DataFrames`. Tested under `DataFrames` master on 2017-12-05.
I will try to keep it up to date as the package evolves.

In [1]:
using DataFrames
using BenchmarkTools

## Performance tips

### Access by column number is faster than by name

In [2]:
# a DataFrame built from a 5x1000 random matrix, used to compare the two lookups below
x = DataFrame(rand(5, 1000))
# NOTE(review): `x` is a non-constant global and is not interpolated with `$`,
# so both timings presumably include global-variable access overhead
@btime x[500]; # lookup by column number
@btime x[:x500]; # lookup by column name

  41.054 ns (0 allocations: 0 bytes)
  88.641 ns (1 allocation: 16 bytes)


### When working with data in a `DataFrame`, use barrier functions or type annotations

In [3]:
# Deliberately slow example: the columns are pulled out of the `DataFrame` and the
# loop runs in the same function, so the accumulator `p` is inferred as `Any`
# (see the `@code_warntype` output for this function).
function f_bad() # this function will be slow
    srand(1); x = DataFrame(rand(1000000,2)) # seed the RNG, then wrap a 1000000x2 random matrix
    y, z = x[1], x[2] # extract both columns by number
    p = 0.0
    for i in 1:nrow(x)
        p += y[i]*z[i] # accumulate the sum of element-wise products
    end
    p
end

# benchmark the slow version as a baseline
@btime f_bad();

  193.012 ms (5999029 allocations: 122.06 MiB)


In [4]:
# the reason is that Julia does not know the types of the columns stored in a `DataFrame`,
# so the accumulator `p` is reported as `Any` in the output below
@code_warntype f_bad()

Variables:
  #self# <optimized out>
  i::Int64
  #temp#@_3::Int64
  x::DataFrames.DataFrame
  y <optimized out>
  z <optimized out>
  p[1m[91m::Any[39m[22m
  selected_column@_8 <optimized out>
  selected_column@_9 <optimized out>
  #temp#@_10::Int64

Body:
  begin 
      $(Expr(:invoke, MethodInstance for srand(::Int64), :(Main.srand), 1)) # line 2:
      $(Expr(:inbounds, false))
      # meta: location random.jl rand 285
      SSAValue(9) = 1000000
      SSAValue(10) = 2
      # meta: location random.jl rand 284
      # meta: location random.jl rand 387
      # meta: location random.jl rand 390
      SSAValue(8) = $(Expr(:foreigncall, :(:jl_alloc_array_2d), Array{Float64,2}, svec(Any, Int64, Int64), Array{Float64,2}, 0, SSAValue(9), 0, SSAValue(10), 0))
      # meta: pop location
      # meta: pop location
      # meta: pop location
      # meta: pop location
      $(Expr(:inbounds, :pop))
      SSAValue(11) = $(Expr(:invoke, MethodInstance for rand!(::MersenneTwister, ::Array{Flo… [output truncated]

In [5]:
# solution 1 is to use barrier function (it should be possible to use it in almost any code)
# Return the sum of the element-wise products `y[i]*z[i]` for `i` in `1:length(y)`.
# Taking the columns as arguments puts the loop behind a function barrier, so it is
# compiled for the concrete types of `y` and `z`.
function f_inner(y,z)
    total = 0.0
    n = length(y)
    i = 1
    while i <= n
        total += y[i]*z[i]
        i += 1
    end
    return total
end

# Barrier-function variant: build the data here, then hand the two columns to
# `f_inner` so that the loop itself runs in a separate function.
function f_barrier() # extract the work to an inner function
    srand(1)
    df = DataFrame(rand(1000000,2))
    first_col, second_col = df[1], df[2]
    return f_inner(first_col, second_col)
end

# Same computation as the barrier variant, but delegated to the existing `dot`
# function instead of a hand-written loop.
function f_inbuilt() # or use inbuilt function if possible
    srand(1)
    df = DataFrame(rand(1000000,2))
    return dot(df[1], df[2])
end

# time both fixes; compare with the `@btime f_bad()` result above
@btime f_barrier();
@btime f_inbuilt();

  16.228 ms (51 allocations: 30.52 MiB)
  15.168 ms (51 allocations: 30.52 MiB)


In [6]:
# solution 2 is to provide the types of extracted columns
# it is simpler but there are cases in which you will not know these types
function f_typed()
    srand(1)
    df = DataFrame(rand(1000000,2))
    # declare the concrete type of each extracted column so the loop below
    # works on variables of known type
    a::Vector{Float64} = df[1]
    b::Vector{Float64} = df[2]
    total = 0.0
    for i in 1:nrow(df)
        total += a[i]*b[i]
    end
    return total
end

# benchmark the type-annotated variant
@btime f_typed();

  15.634 ms (51 allocations: 30.52 MiB)


### Consider using a delayed `DataFrame` creation technique

In [7]:
# Fill a pre-created `DataFrame` with random numbers, one column at a time.
function f1()
    df = DataFrame(Float64, 10^4, 100) # we work with DataFrame directly
    for j in 1:ncol(df)
        col = df[j] # column fetched from the DataFrame inside this function
        for i in 1:nrow(df)
            col[i] = rand()
        end
    end
    return df
end

# Build 100 random `Vector{Float64}` columns of length 10^4 first and only wrap
# them in a `DataFrame` at the very end.
function f2()
    cols = Vector{Any}(100)
    for j in 1:length(cols)
        col = Vector{Float64}(10^4)
        for i in 1:length(col)
            col[i] = rand()
        end
        cols[j] = col
    end
    return DataFrame(cols) # we delay creation of DataFrame after we have our job done
end

# compare filling an existing DataFrame (f1) with building the columns first (f2)
@btime f1();
@btime f2();

  38.753 ms (1950236 allocations: 37.43 MiB)
  4.018 ms (1035 allocations: 7.69 MiB)


### You can add rows to a `DataFrame` in place and it is fast

In [8]:
x = DataFrame(rand(10^6, 5)) # built from a 10^6 x 5 random matrix
y = DataFrame(1.0:5.0) # presumably a single row matching the 5 columns of `x` - TODO confirm
z = [1.0:5.0;] # plain vector holding 1.0, 2.0, 3.0, 4.0, 5.0

@btime vcat($x, $y); # creates a new DataFrame - slow
# NOTE(review): `append!` mutates `x`, so `x` keeps growing across benchmark evaluations
@btime append!($x, $y); # in place - fast

x = DataFrame(rand(10^6, 5)) # reset to the same starting point
# NOTE(review): `push!` likewise mutates `x` on every benchmark evaluation
@btime push!($x, $z); # add a single row in place - fastest

  14.132 ms (115 allocations: 38.15 MiB)
  23.793 μs (12 allocations: 576 bytes)
  345.576 ns (5 allocations: 80 bytes)


### Allowing `missing` values as well as using `categorical` arrays slows down computations

In [9]:
using StatsBase

# Print the element type of `data`, draw 10^6 values from it, and benchmark
# `countmap` first on the plain sample and then on its categorical version.
function test(data) # uses countmap function to test performance
    println(eltype(data))
    sample = rand(data, 10^6)
    categ = categorical(sample)
    println(" raw:")
    @btime countmap($sample)
    println(" categorical:")
    @btime countmap($categ)
    return nothing
end

test(1:10) # integers
test([randstring() for i in 1:10]) # random strings
test(allowmissing(1:10)) # integers, element type also allows `missing`
test(allowmissing([randstring() for i in 1:10])) # random strings, element type also allows `missing`


Int64
 raw:
  41.466 ms (4 allocations: 608 bytes)
 categorical:
  82.488 ms (4 allocations: 608 bytes)
String
 raw:
  69.105 ms (4 allocations: 608 bytes)
 categorical:
  88.669 ms (4 allocations: 608 bytes)
Union{Int64, Missings.Missing}
 raw:
  119.277 ms (1989774 allocations: 30.36 MiB)
 categorical:
  143.282 ms (1989774 allocations: 30.36 MiB)
Union{Missings.Missing, String}
 raw:
  147.912 ms (1989774 allocations: 30.36 MiB)
 categorical:
  182.565 ms (1989774 allocations: 30.36 MiB)
