# Introduction to DataFrames
**[Bogumił Kamiński](http://bogumilkaminski.pl/about/), August 16, 2019**

In [1]:
using Pkg
Pkg.activate(".")

"d:\\Dev\\Julia\\DataFrames_Tutorial\\Project.toml"

In [2]:
using DataFrames
using BenchmarkTools

## Performance tips

### Access by column number is faster than by name

In [3]:
# 5 rows x 1000 columns of random numbers; the columns get the names x1, x2, ...
x = DataFrame(rand(5, 1000))
# Look the column up by its integer position.
@btime $x[!, 500];
# Look the same column up by its name.
@btime $x.x500;

  6.305 ns (0 allocations: 0 bytes)
  19.153 ns (0 allocations: 0 bytes)


### When working with a `DataFrame` use barrier functions or type annotations

In [4]:
using Random
# Deliberately slow example: the two columns are extracted from the `DataFrame`
# and looped over inside the same function, so the loop body works on values
# whose types Julia could not infer.
function f_bad()
    Random.seed!(1)  # fixed seed so every run sees the same data
    x = DataFrame(rand(1000000,2))
    first_col = x[!, 1]
    second_col = x[!, 2]
    total = 0.0
    for row in 1:nrow(x)
        total = total + first_col[row] * second_col[row]
    end
    return total
end

@btime f_bad();
# If you run `@code_warntype f_bad()` you will notice
# that Julia does not know the column types of a `DataFrame`.


  210.469 ms (5999023 allocations: 122.06 MiB)


In [5]:
# Solution 1: use a barrier function (this should be possible in almost any code)
# Barrier kernel: return the sum of the element-wise products of `y` and `z`.
#
# It receives the already-extracted columns as arguments, so Julia compiles a
# method specialized for their concrete types and the loop runs fast.
#
# `y` and `z` must share the same indices. `eachindex(y, z)` throws a
# `DimensionMismatch` otherwise, instead of silently ignoring surplus elements
# of `z`, and it also works for arrays that are not 1-based.
# The accumulator starts at the `Float64` value `0.0`.
function f_inner(y,z)
    p = 0.0
    for i in eachindex(y, z)
        p += y[i]*z[i]
    end
    return p
end

# Barrier-function variant: the data frame is built here, but the loop itself
# lives in `f_inner`, which receives the two columns as arguments.
function f_barrier()
    Random.seed!(1)  # fixed seed so every run sees the same data
    x = DataFrame(rand(1000000,2))
    first_col = x[!, 1]
    second_col = x[!, 2]
    return f_inner(first_col, second_col)
end

using LinearAlgebra
# Same computation as the barrier version, but the work is delegated to the
# existing `dot` function from LinearAlgebra, which acts as the barrier.
function f_inbuilt()
    Random.seed!(1)  # fixed seed so every run sees the same data
    x = DataFrame(rand(1000000,2))
    first_col = x[!, 1]
    second_col = x[!, 2]
    return dot(first_col, second_col)
end

# Time both fixes; compare with the `f_bad()` timing reported earlier.
@btime f_barrier();
@btime f_inbuilt();

  26.627 ms (45 allocations: 30.52 MiB)
  27.351 ms (45 allocations: 30.52 MiB)


In [6]:
# Solution 2: provide the types of the extracted columns.
# It is simpler, but there are cases in which you will not know these types.
# This example assumes that you have DataFrames master at least from August 31, 2018.
# Type-annotation variant: the extracted columns are stored in locals declared
# as `Vector{Float64}`, so the loop below runs on variables of known type.
function f_typed()
    Random.seed!(1)  # fixed seed so every run sees the same data
    x = DataFrame(rand(1000000,2))
    left::Vector{Float64} = x[!, 1]
    right::Vector{Float64} = x[!, 2]
    total = 0.0
    for row in 1:nrow(x)
        total = total + left[row] * right[row]
    end
    return total
end

# Time the type-annotated variant.
@btime f_typed();

  28.875 ms (45 allocations: 30.52 MiB)


### Consider using delayed `DataFrame` creation technique

Also notice the difference in performance between `DataFrame` and `DataFrame!` (copying vs non-copying data frame creation).

In [7]:
# Create the data frame first (non-copying `DataFrame!` constructor), then fill
# every column element by element while working with the data frame directly.
function f1()
    x = DataFrame!([Vector{Float64}(undef, 10^4) for _ in 1:100])
    for col in 1:ncol(x)
        column = x[!, col]
        for row in 1:nrow(x)
            column[row] = rand()
        end
    end
    return x
end

# Same as `f1`, except that the data frame is created with the copying
# `DataFrame` constructor before its columns are filled element by element.
function f1a()
    x = DataFrame([Vector{Float64}(undef, 10^4) for _ in 1:100])
    for col in 1:ncol(x)
        column = x[!, col]
        for row in 1:nrow(x)
            column[row] = rand()
        end
    end
    return x
end

# Delayed creation: fill 100 plain `Vector{Float64}` columns first and wrap them
# in a data frame (non-copying `DataFrame!` constructor) only at the very end.
function f2()
    columns = Vector{Any}(undef, 100)
    for slot in eachindex(columns)
        fresh = Vector{Float64}(undef, 10^4)
        for k in eachindex(fresh)
            fresh[k] = rand()
        end
        columns[slot] = fresh
    end
    return DataFrame!(columns)
end

# Delayed creation, copying flavour: fill 100 plain `Vector{Float64}` columns
# first and pass them to the copying `DataFrame` constructor at the very end.
function f2a()
    columns = Vector{Any}(undef, 100)
    for slot in eachindex(columns)
        fresh = Vector{Float64}(undef, 10^4)
        for k in eachindex(fresh)
            fresh[k] = rand()
        end
        columns[slot] = fresh
    end
    return DataFrame(columns)
end

@btime f1();  # fill an existing data frame, created with DataFrame!
@btime f1a(); # fill an existing data frame, created with DataFrame
@btime f2();  # fill plain vectors, then DataFrame!
@btime f2a(); # fill plain vectors, then DataFrame

  41.633 ms (1949837 allocations: 37.42 MiB)
  29.323 ms (1950037 allocations: 45.06 MiB)
  2.867 ms (937 allocations: 7.69 MiB)
  5.765 ms (1137 allocations: 15.32 MiB)


### You can add rows to a `DataFrame` in place and it is fast

In [8]:
x = DataFrame(rand(10^6, 5)) # 10^6 rows, 5 columns
y = DataFrame(transpose(1.0:5.0)) # the row to add, stored as a one-row data frame
z = [1.0:5.0;] # the same row stored as a plain Vector

@btime vcat($x, $y); # creates a new DataFrame - slow
# NOTE(review): append! mutates x, so x presumably keeps growing while the
# benchmark repeats the call - confirm this does not distort the timing.
@btime append!($x, $y); # in place - fast

x = DataFrame(rand(10^6, 5)) # reset to the same starting point
@btime push!($x, $z); # add a single row in place - fast

  30.904 ms (174 allocations: 38.16 MiB)
  743.667 ns (11 allocations: 176 bytes)
  546.491 ns (16 allocations: 256 bytes)


### Allowing `missing` as well as `categorical` slows down computations

In [9]:
using StatsBase

# Benchmark `countmap` on 10^6 values sampled from `data`: first on the plain
# sample, then on its categorical version. Prints the element type of `data`
# and a label before each timing; returns `nothing`.
function test(data)
    println(eltype(data))
    sample = rand(data, 10^6)
    sample_cat = categorical(sample)
    println(" raw:")
    @btime countmap($sample)
    println(" categorical:")
    @btime countmap($sample_cat)
    return nothing
end

# Run the benchmark for integers and strings, without and with `missing` allowed.
test(1:10)
test([randstring() for i in 1:10])
test(allowmissing(1:10))
test(allowmissing([randstring() for i in 1:10]))


Int64
 raw:
  5.544 ms (7 allocations: 7.63 MiB)
 categorical:
  47.402 ms (4 allocations: 608 bytes)
String
 raw:
  67.141 ms (4 allocations: 608 bytes)
 categorical:
  85.638 ms (4 allocations: 608 bytes)
Union{Missing, Int64}
 raw:
  11.672 ms (4 allocations: 624 bytes)
 categorical:
  23.086 ms (4 allocations: 608 bytes)
Union{Missing, String}
 raw:
  46.571 ms (4 allocations: 608 bytes)
 categorical:
  71.580 ms (4 allocations: 608 bytes)


### When aggregating use a column selector and prefer a categorical or pooled array grouping variable

In [10]:
# 10^7 rows: a random Char key drawn from 'a':'d' in :x and the value 1 in :y.
df = DataFrame(x=rand('a':'d', 10^7), y=1);

In [11]:
# Group by :x and apply an anonymous function to each group `v`.
@btime by($df, :x, v -> sum(v.y)) # traditional syntax, slow

  727.792 ms (196 allocations: 433.19 MiB)


Unnamed: 0_level_0,x,x1
Unnamed: 0_level_1,Char,Int64
1,'d',2500861
2,'b',2497802
3,'c',2502939
4,'a',2498398


In [12]:
# Same aggregation, but only column :y is passed to `sum`.
@btime by($df, :x, :y=>sum) # use column selector

  546.222 ms (159 allocations: 356.89 MiB)


Unnamed: 0_level_0,x,y_sum
Unnamed: 0_level_1,Char,Int64
1,'d',2500861
2,'b',2497802
3,'c',2502939
4,'a',2498398


In [13]:
# Convert the grouping column :x to a categorical column in place.
categorical!(df, :x);

In [14]:
# Column-selector aggregation again, now grouping on the categorical :x.
@btime by($df, :x, :y=>sum)

  229.163 ms (178 allocations: 152.60 MiB)


Unnamed: 0_level_0,x,y_sum
Unnamed: 0_level_1,Categorical…,Int64
1,'a',2498398
2,'b',2497802
3,'c',2502939
4,'d',2500861


In [15]:
using PooledArrays

In [16]:
# Replace the grouping column with a PooledArray of Char values.
df.x = PooledArray{Char}(df.x)

10000000-element PooledArray{Char,UInt8,1,Array{UInt8,1}}:
 'd'
 'b'
 'd'
 'd'
 'd'
 'd'
 'c'
 'd'
 'd'
 'd'
 'c'
 'd'
 'c'
 ⋮  
 'd'
 'd'
 'c'
 'c'
 'b'
 'c'
 'd'
 'a'
 'b'
 'a'
 'c'
 'd'

In [17]:
# Column-selector aggregation again, now grouping on the pooled :x.
@btime by($df, :x, :y=>sum)

  230.642 ms (168 allocations: 152.60 MiB)


Unnamed: 0_level_0,x,y_sum
Unnamed: 0_level_1,Char,Int64
1,'d',2500861
2,'b',2497802
3,'c',2502939
4,'a',2498398


### Use views instead of materializing a new DataFrame

In [18]:
# 100 rows x 1000 columns of random numbers, used by the indexing benchmarks below.
x = DataFrame(rand(100, 1000))

Unnamed: 0_level_0,x1,x2,x3,x4,x5,x6,x7,x8
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64
1,0.637801,0.499658,0.946076,0.381518,0.541173,0.33955,0.988044,0.789436
2,0.582944,0.450625,0.576639,0.509009,0.674542,0.291518,0.839739,0.231549
3,0.25576,0.924204,0.509361,0.100793,0.73373,0.897498,0.509463,0.00914906
4,0.448119,0.459261,0.397853,0.150296,0.686973,0.616155,0.771118,0.424968
5,0.600548,0.725675,0.177003,0.570394,0.283683,0.754999,0.139011,0.751878
6,0.545967,0.0630025,0.836106,0.958643,0.723384,0.870861,0.786544,0.783141
7,0.318235,0.197158,0.0245412,0.147027,0.191275,0.690648,0.868467,0.79229
8,0.0332237,0.982657,0.0736654,0.151617,0.805835,0.668803,0.989204,0.86314
9,0.635475,0.642825,0.95715,0.855208,0.205377,0.35821,0.39587,0.663429
10,0.433199,0.0185481,0.399948,0.974413,0.828756,0.0820863,0.325566,0.795758


In [19]:
# Row range with all columns: materializes a new one-row DataFrame (the slow baseline).
@btime $x[1:1, :]

  136.701 μs (1511 allocations: 194.41 KiB)


Unnamed: 0_level_0,x1,x2,x3,x4,x5,x6,x7,x8,x9
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64
1,0.637801,0.499658,0.946076,0.381518,0.541173,0.33955,0.988044,0.789436,0.908469


In [20]:
# Single integer row index with all columns.
# NOTE(review): presumably returns a lightweight row view rather than a copy -
# confirm against the DataFrames indexing documentation.
@btime $x[1, :]

  49.698 ns (1 allocation: 32 bytes)


Unnamed: 0_level_0,x1,x2,x3,x4,x5,x6,x7,x8,x9
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64
1,0.637801,0.499658,0.946076,0.381518,0.541173,0.33955,0.988044,0.789436,0.908469


In [21]:
# View of the same row range with all columns, instead of a materialized DataFrame.
@btime view($x, 1:1, :)

  62.449 ns (1 allocation: 48 bytes)


Unnamed: 0_level_0,x1,x2,x3,x4,x5,x6,x7,x8,x9
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64
1,0.637801,0.499658,0.946076,0.381518,0.541173,0.33955,0.988044,0.789436,0.908469


In [22]:
# Row range with the first 20 columns: materializes a new DataFrame.
@btime $x[1:1, 1:20]

  7.900 μs (55 allocations: 7.23 KiB)


Unnamed: 0_level_0,x1,x2,x3,x4,x5,x6,x7,x8,x9
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64
1,0.637801,0.499658,0.946076,0.381518,0.541173,0.33955,0.988044,0.789436,0.908469


In [23]:
# Single integer row index with the first 20 columns.
# NOTE(review): presumably a lightweight row view, as with `x[1, :]` - confirm.
@btime $x[1, 1:20]

  28.995 ns (2 allocations: 80 bytes)


Unnamed: 0_level_0,x1,x2,x3,x4,x5,x6,x7,x8,x9
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64
1,0.637801,0.499658,0.946076,0.381518,0.541173,0.33955,0.988044,0.789436,0.908469


In [24]:
# View of the same row range and the first 20 columns, instead of a materialized DataFrame.
@btime view($x, 1:1, 1:20)

  34.256 ns (2 allocations: 96 bytes)


Unnamed: 0_level_0,x1,x2,x3,x4,x5,x6,x7,x8,x9
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64
1,0.637801,0.499658,0.946076,0.381518,0.541173,0.33955,0.988044,0.789436,0.908469
