# Introduction to DataFrames
**[Bogumił Kamiński](http://bogumilkaminski.pl/about/), July 17, 2021**

In [1]:
using DataFrames

In [2]:
using BenchmarkTools

In [3]:
using CategoricalArrays

In [4]:
using PooledArrays

## Performance tips

### Access by column number is faster than by name

In [5]:
# 5 rows and 1000 columns; :auto generates the column names (x500 is used below)
x = DataFrame(rand(5, 1000), :auto)
# look up the 500th column by its integer position
@btime $x[!, 500];
# look up a column by name through property access
@btime $x.x500;

  5.000 ns (0 allocations: 0 bytes)
  36.714 ns (1 allocation: 32 bytes)


### When working with a `DataFrame` use barrier functions or type annotations

In [6]:
using Random
function f_bad() # this function will be slow
    Random.seed!(1)
    df = DataFrame(rand(1000000, 2), :auto)
    # the columns are pulled out and used in the same function, so the compiler
    # does not know their types when it compiles the loop below
    first_col = df[!, 1]
    second_col = df[!, 2]
    acc = 0.0
    for row in 1:nrow(df)
        acc += first_col[row] * second_col[row]
    end
    return acc
end

@btime f_bad();
# if you run @code_warntype f_bad() you will notice
# that Julia does not know the column types of the `DataFrame`


  111.635 ms (5999015 allocations: 122.06 MiB)


In [7]:
# solution 1 is to use a barrier function (it should be possible to use it in almost any code)
"""
    f_inner(y, z)

Return the sum of the products `y[i] * z[i]` over all indices, accumulated from
left to right starting at `0.0`.

This is the kernel of the function-barrier pattern: because the columns arrive
as arguments, Julia compiles the loop for their concrete types.

Throws a `DimensionMismatch` when `y` and `z` do not have the same indices.
"""
function f_inner(y, z)
    p = 0.0
    # eachindex(y, z) verifies that both inputs share the same indices, so a
    # longer `z` is reported instead of having its extra elements ignored
    for i in eachindex(y, z)
        p += y[i] * z[i]
    end
    return p
end

function f_barrier() # extract the work to an inner function
    Random.seed!(1)
    df = DataFrame(rand(1000000, 2), :auto)
    # the columns are passed as arguments, so the loop runs inside f_inner
    return f_inner(df[!, 1], df[!, 2])
end

using LinearAlgebra
function f_inbuilt() # or use inbuilt function if possible
    Random.seed!(1)
    df = DataFrame(rand(1000000, 2), :auto)
    first_col, second_col = df[!, 1], df[!, 2]
    # LinearAlgebra.dot computes the same sum of products as the manual loop
    return dot(first_col, second_col)
end

# both variants hand the extracted columns to another function that does the arithmetic
@btime f_barrier();
@btime f_inbuilt();

  8.125 ms (35 allocations: 30.52 MiB)
  7.509 ms (35 allocations: 30.52 MiB)


In [8]:
# solution 2 is to provide the types of the extracted columns
# it is simpler but there are cases in which you will not know these types
# This example assumes that you have DataFrames master at least from August 31, 2018
function f_typed()
    Random.seed!(1)
    df = DataFrame(rand(1000000, 2), :auto)
    # the annotations on the locals state the column types explicitly
    y::Vector{Float64} = df[!, 1]
    z::Vector{Float64} = df[!, 2]
    total = 0.0
    for row in 1:nrow(df)
        total += y[row] * z[row]
    end
    return total
end

# time the type-annotated variant
@btime f_typed();

  8.166 ms (35 allocations: 30.52 MiB)


In general for tall and narrow tables it is often useful to use `Tables.rowtable`, `Tables.columntable` or `Tables.namedtupleiterator` for intermediate processing of data in a type-stable way.

### Consider using delayed `DataFrame` creation technique

Also notice the difference in performance between copying and non-copying data frame creation.

In [9]:
function f1()
    # we work with a DataFrame directly; copycols=false is the non-copying creation
    columns = [Vector{Float64}(undef, 10^4) for i in 1:100]
    df = DataFrame(columns, :auto, copycols=false)
    for col_idx in 1:ncol(df)
        col = df[!, col_idx]
        for row_idx in 1:nrow(df)
            col[row_idx] = rand()
        end
    end
    return df
end

function f1a()
    # we work with a DataFrame directly; without copycols=false this is the
    # copying counterpart of f1
    columns = [Vector{Float64}(undef, 10^4) for i in 1:100]
    df = DataFrame(columns, :auto)
    for col_idx in 1:ncol(df)
        col = df[!, col_idx]
        for row_idx in 1:nrow(df)
            col[row_idx] = rand()
        end
    end
    return df
end

function f2()
    columns = Vector{Any}(undef, 100)
    for col_idx in eachindex(columns)
        # fill a plain, concretely typed vector before storing it
        col = Vector{Float64}(undef, 10^4)
        for row_idx in eachindex(col)
            col[row_idx] = rand()
        end
        columns[col_idx] = col
    end
    # we delay creation of DataFrame after we have our job done
    return DataFrame(columns, :auto, copycols=false)
end

function f2a()
    columns = Vector{Any}(undef, 100)
    for col_idx in eachindex(columns)
        # fill a plain, concretely typed vector before storing it
        col = Vector{Float64}(undef, 10^4)
        for row_idx in eachindex(col)
            col[row_idx] = rand()
        end
        columns[col_idx] = col
    end
    # we delay creation of DataFrame after we have our job done
    return DataFrame(columns, :auto)
end

@btime f1(); # fill a DataFrame that was created with copycols=false
@btime f1a(); # fill a DataFrame that was created without copycols=false
@btime f2(); # fill plain vectors first, then wrap them with copycols=false
@btime f2a(); # fill plain vectors first, then wrap them without copycols=false

  32.414 ms (1949523 allocations: 37.40 MiB)
  35.522 ms (1949723 allocations: 45.04 MiB)
  4.253 ms (623 allocations: 7.66 MiB)
  5.616 ms (823 allocations: 15.30 MiB)


### You can add rows to a `DataFrame` in place and it is fast

In [10]:
x = DataFrame(rand(10^6, 5), :auto)
y = DataFrame(transpose(1.0:5.0), :auto) # a single row with 5 columns
z = [1.0:5.0;] # the same five values as a plain Vector

@btime vcat($x, $y); # creates a new DataFrame - slow
# note: append! modifies x itself, so x gains a row each time the expression is evaluated
@btime append!($x, $y); # in place - fast

x = DataFrame(rand(10^6, 5), :auto) # reset to the same starting point
@btime push!($x, $z); # add a single row in place - fast

  6.926 ms (153 allocations: 38.16 MiB)
  1.270 μs (17 allocations: 832 bytes)
  553.226 ns (16 allocations: 256 bytes)


### Allowing `missing` as well as `categorical` slows down computations

In [11]:
using StatsBase

function test(data) # uses countmap function to test performance
    println(eltype(data))
    # draw 10^6 values from `data` and build a categorical version of the sample
    sample = rand(data, 10^6)
    sample_cat = categorical(sample)
    println(" raw:")
    @btime countmap($sample)
    println(" categorical:")
    @btime countmap($sample_cat)
    return nothing
end

test(1:10) # integers
test([randstring() for i in 1:10]) # 10 random strings
test(allowmissing(1:10)) # integers, element type also allows missing
test(allowmissing([randstring() for i in 1:10])) # strings, element type also allows missing


Int64
 raw:
  3.558 ms (7 allocations: 7.63 MiB)
 categorical:
  21.997 ms (1000004 allocations: 30.52 MiB)
String
 raw:
  28.855 ms (4 allocations: 608 bytes)
 categorical:
  36.338 ms (1000004 allocations: 30.52 MiB)
Union{Missing, Int64}
 raw:
  6.906 ms (4 allocations: 624 bytes)
 categorical:
  19.068 ms (1000004 allocations: 30.52 MiB)
Union{Missing, String}
 raw:
  21.905 ms (4 allocations: 608 bytes)
 categorical:
  30.331 ms (1000004 allocations: 30.52 MiB)


### When aggregating use column selector and prefer integer, categorical, or pooled array grouping variable

In [12]:
# 10^7 rows: x holds random Chars drawn from 'a':'d', y is 1 in every row
df = DataFrame(x=rand('a':'d', 10^7), y=1);

In [13]:
# group the rows by the value of the Char column x
gdf = groupby(df, :x)

Unnamed: 0_level_0,x,y
Unnamed: 0_level_1,Char,Int64
1,d,1
2,d,1
3,d,1
4,d,1
5,d,1
6,d,1
7,d,1
8,d,1
9,d,1
10,d,1

Unnamed: 0_level_0,x,y
Unnamed: 0_level_1,Char,Int64
1,c,1
2,c,1
3,c,1
4,c,1
5,c,1
6,c,1
7,c,1
8,c,1
9,c,1
10,c,1


In [14]:
# the anonymous function is given each group as `v` and sums its y column
@btime combine(v -> sum(v.y), $gdf) # traditional syntax, slow

  56.403 ms (342 allocations: 19.09 MiB)


Unnamed: 0_level_0,x,x1
Unnamed: 0_level_1,Char,Int64
1,d,2499188
2,a,2499372
3,b,2498500
4,c,2502940


In [15]:
# name the input column and the function instead of passing a closure over the group
@btime combine($gdf, :y=>sum) # use column selector

  10.352 ms (231 allocations: 15.03 KiB)


Unnamed: 0_level_0,x,y_sum
Unnamed: 0_level_1,Char,Int64
1,d,2499188
2,a,2499372
3,b,2498500
4,c,2502940


In [16]:
# replace column x in place with the result of categorical applied to it
transform!(df, :x => categorical => :x);

In [17]:
# rebuild the grouping now that column x has been replaced by a categorical column
gdf = groupby(df, :x)

Unnamed: 0_level_0,x,y
Unnamed: 0_level_1,Cat…,Int64
1,a,1
2,a,1
3,a,1
4,a,1
5,a,1
6,a,1
7,a,1
8,a,1
9,a,1
10,a,1

Unnamed: 0_level_0,x,y
Unnamed: 0_level_1,Cat…,Int64
1,d,1
2,d,1
3,d,1
4,d,1
5,d,1
6,d,1
7,d,1
8,d,1
9,d,1
10,d,1


In [18]:
# same aggregation, now on the grouping built from the categorical column
@btime combine($gdf, :y=>sum)

  11.954 ms (238 allocations: 15.69 KiB)


Unnamed: 0_level_0,x,y_sum
Unnamed: 0_level_1,Cat…,Int64
1,a,2499372
2,b,2498500
3,c,2502940
4,d,2499188


In [19]:
# replace column x in place with a PooledArray{Char} built from it
transform!(df, :x => PooledArray{Char} => :x)

Unnamed: 0_level_0,x,y
Unnamed: 0_level_1,Char,Int64
1,d,1
2,d,1
3,a,1
4,a,1
5,a,1
6,d,1
7,a,1
8,a,1
9,d,1
10,b,1


In [20]:
# rebuild the grouping now that column x has been replaced by a pooled array
gdf = groupby(df, :x)

Unnamed: 0_level_0,x,y
Unnamed: 0_level_1,Char,Int64
1,d,1
2,d,1
3,d,1
4,d,1
5,d,1
6,d,1
7,d,1
8,d,1
9,d,1
10,d,1

Unnamed: 0_level_0,x,y
Unnamed: 0_level_1,Char,Int64
1,c,1
2,c,1
3,c,1
4,c,1
5,c,1
6,c,1
7,c,1
8,c,1
9,c,1
10,c,1


In [21]:
# same aggregation, now on the grouping built from the pooled column
@btime combine($gdf, :y=>sum)

  10.550 ms (233 allocations: 15.09 KiB)


Unnamed: 0_level_0,x,y_sum
Unnamed: 0_level_1,Char,Int64
1,d,2499188
2,a,2499372
3,b,2498500
4,c,2502940


### Use views instead of materializing a new DataFrame

In [22]:
# 100 rows and 1000 columns of random values with auto-generated column names
x = DataFrame(rand(100, 1000), :auto)

Unnamed: 0_level_0,x1,x2,x3,x4,x5,x6,x7,x8
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64
1,0.822841,0.627123,0.0787094,0.880556,0.315632,0.232786,0.577346,0.371028
2,0.402965,0.0229243,0.768259,0.03316,0.0257622,0.0568977,0.648605,0.8735
3,0.910438,0.440111,0.926592,0.982104,0.701911,0.809936,0.644246,0.35027
4,0.114476,0.88898,0.856592,0.955567,0.537406,0.614434,0.781799,0.399502
5,0.139316,0.436851,0.806506,0.222235,0.5632,0.221634,0.804874,0.253417
6,0.346014,0.00208929,0.305866,0.102841,0.649045,0.512559,0.793464,0.142623
7,0.279502,0.125278,0.46532,0.508624,0.770491,0.250046,0.964776,0.440098
8,0.125844,0.100991,0.537101,0.15584,0.155365,0.520906,0.74011,0.0436753
9,0.864695,0.783787,0.523708,0.733103,0.306951,0.867453,0.168743,0.904454
10,0.98536,0.994226,0.656058,0.403829,0.821321,0.275431,0.474932,0.0293327


In [23]:
# a row range with all columns: this is the materializing case (see the allocations reported)
@btime $x[1:1, :]

  204.700 μs (2985 allocations: 190.69 KiB)


Unnamed: 0_level_0,x1,x2,x3,x4,x5,x6,x7,x8
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64
1,0.822841,0.627123,0.0787094,0.880556,0.315632,0.232786,0.577346,0.371028


In [24]:
# a single integer row index with all columns: no allocations are reported for this form
@btime $x[1, :]

  29.032 ns (0 allocations: 0 bytes)


Unnamed: 0_level_0,x1,x2,x3,x4,x5,x6,x7,x8
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64
1,0.822841,0.627123,0.0787094,0.880556,0.315632,0.232786,0.577346,0.371028


In [25]:
# the same row range as an explicit view: no allocations are reported for this form
@btime view($x, 1:1, :)

  27.520 ns (0 allocations: 0 bytes)


Unnamed: 0_level_0,x1,x2,x3,x4,x5,x6,x7,x8
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64
1,0.822841,0.627123,0.0787094,0.880556,0.315632,0.232786,0.577346,0.371028


In [26]:
# a row range restricted to the first 20 columns: still allocates, but less
@btime $x[1:1, 1:20]

  4.643 μs (50 allocations: 4.16 KiB)


Unnamed: 0_level_0,x1,x2,x3,x4,x5,x6,x7,x8
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64
1,0.822841,0.627123,0.0787094,0.880556,0.315632,0.232786,0.577346,0.371028


In [27]:
# a single integer row index restricted to the first 20 columns
@btime $x[1, 1:20]

  31.552 ns (0 allocations: 0 bytes)


Unnamed: 0_level_0,x1,x2,x3,x4,x5,x6,x7,x8
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64
1,0.822841,0.627123,0.0787094,0.880556,0.315632,0.232786,0.577346,0.371028


In [28]:
# an explicit view of the first row restricted to the first 20 columns
@btime view($x, 1:1, 1:20)

  31.621 ns (0 allocations: 0 bytes)


Unnamed: 0_level_0,x1,x2,x3,x4,x5,x6,x7,x8
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64
1,0.822841,0.627123,0.0787094,0.880556,0.315632,0.232786,0.577346,0.371028
