# Introduction to DataFrames
**[Bogumił Kamiński](http://bogumilkaminski.pl/about/), Dec 5, 2017**

A brief introduction to the basic usage of `DataFrames`. Tested against the `DataFrames` master branch on 2017-12-05.
I will try to keep it up to date as the package evolves.

In [1]:
using DataFrames # load package
srand(1); # seed the global RNG so the random values shown below can be reproduced

## Manipulating rows of a DataFrame

### Reordering rows

In [2]:
# ids 1:10, random x values (hopefully unsorted), and five 0.0s followed by five 1.0s in y
x = DataFrame(id = 1:10, x = rand(10), y = vcat(zeros(5), ones(5)))

Unnamed: 0,id,x,y
1,1,0.236033,0.0
2,2,0.346517,0.0
3,3,0.312707,0.0
4,4,0.00790928,0.0
5,5,0.488613,0.0
6,6,0.210968,1.0
7,7,0.951916,1.0
8,8,0.999905,1.0
9,9,0.251662,1.0
10,10,0.986666,1.0


In [3]:
sort!(x; cols = :x) # reorder the rows of x in place by ascending x

Unnamed: 0,id,x,y
1,4,0.00790928,0.0
2,6,0.210968,1.0
3,1,0.236033,0.0
4,9,0.251662,1.0
5,3,0.312707,0.0
6,2,0.346517,0.0
7,5,0.488613,0.0
8,7,0.951916,1.0
9,10,0.986666,1.0
10,8,0.999905,1.0


In [4]:
y = sort(x; cols = :id) # sorted result is a new DataFrame bound to y

Unnamed: 0,id,x,y
1,1,0.236033,0.0
2,2,0.346517,0.0
3,3,0.312707,0.0
4,4,0.00790928,0.0
5,5,0.488613,0.0
6,6,0.210968,1.0
7,7,0.951916,1.0
8,8,0.999905,1.0
9,9,0.251662,1.0
10,10,0.986666,1.0


In [5]:
# sort on two keys: y descending (rev = true), then x ascending (rev = false)
sort(x; cols = (:y, :x), rev = (true, false))

Unnamed: 0,id,x,y
1,6,0.210968,1.0
2,9,0.251662,1.0
3,7,0.951916,1.0
4,10,0.986666,1.0
5,8,0.999905,1.0
6,4,0.00790928,0.0
7,1,0.236033,0.0
8,3,0.312707,0.0
9,2,0.346517,0.0
10,5,0.488613,0.0


In [6]:
# same ordering expressed per column: `order` attaches rev = true to :y only
sort(x; cols = (order(:y; rev = true), :x))

Unnamed: 0,id,x,y
1,6,0.210968,1.0
2,9,0.251662,1.0
3,7,0.951916,1.0
4,10,0.986666,1.0
5,8,0.999905,1.0
6,4,0.00790928,0.0
7,1,0.236033,0.0
8,3,0.312707,0.0
9,2,0.346517,0.0
10,5,0.488613,0.0


In [7]:
# a custom sort key: y descending, then x ordered by its fractional part rem(v, 1)
sort(x; cols = (order(:y; rev = true), order(:x; by = v -> rem(v, 1))))

Unnamed: 0,id,x,y
1,6,0.210968,1.0
2,9,0.251662,1.0
3,7,0.951916,1.0
4,10,0.986666,1.0
5,8,0.999905,1.0
6,4,0.00790928,0.0
7,1,0.236033,0.0
8,3,0.312707,0.0
9,2,0.346517,0.0
10,5,0.488613,0.0


In [8]:
# reorder rows by indexing with a permutation of 1:10 (here a random one)
x[shuffle(1:10), :]

Unnamed: 0,id,x,y
1,8,0.999905,1.0
2,10,0.986666,1.0
3,7,0.951916,1.0
4,2,0.346517,0.0
5,5,0.488613,0.0
6,1,0.236033,0.0
7,6,0.210968,1.0
8,9,0.251662,1.0
9,4,0.00790928,0.0
10,3,0.312707,0.0


In [9]:
sort!(x; cols = :id) # put the rows back in id order first
x[[1, 10], :] = x[[10, 1], :] # exchange the first and last rows in a single assignment
x

Unnamed: 0,id,x,y
1,10,0.986666,1.0
2,2,0.346517,0.0
3,3,0.312707,0.0
4,4,0.00790928,0.0
5,5,0.488613,0.0
6,6,0.210968,1.0
7,7,0.951916,1.0
8,8,0.999905,1.0
9,9,0.251662,1.0
10,1,0.236033,0.0


In [10]:
# swap again, this time with a tuple assignment; the right-hand side is evaluated
# before either row is overwritten
x[1,:], x[10,:] = x[10,:], x[1,:]
x

Unnamed: 0,id,x,y
1,1,0.236033,0.0
2,2,0.346517,0.0
3,3,0.312707,0.0
4,4,0.00790928,0.0
5,5,0.488613,0.0
6,6,0.210968,1.0
7,7,0.951916,1.0
8,8,0.999905,1.0
9,9,0.251662,1.0
10,10,0.986666,1.0


### Merging/adding rows

In [11]:
x = DataFrame(rand(3, 5)) # 3x5 random matrix; columns get the automatic names x1 ... x5

Unnamed: 0,x1,x2,x3,x4,x5
1,0.0856352,0.185821,0.0516146,0.279395,0.370971
2,0.553206,0.111981,0.53803,0.178246,0.894166
3,0.46335,0.976312,0.455692,0.548983,0.648054


In [12]:
# stack rows - data frames must have the same column names; `[x; x]` is equivalent syntax
vcat(x, x)

Unnamed: 0,x1,x2,x3,x4,x5
1,0.0856352,0.185821,0.0516146,0.279395,0.370971
2,0.553206,0.111981,0.53803,0.178246,0.894166
3,0.46335,0.976312,0.455692,0.548983,0.648054
4,0.0856352,0.185821,0.0516146,0.279395,0.370971
5,0.553206,0.111981,0.53803,0.178246,0.894166
6,0.46335,0.976312,0.455692,0.548983,0.648054


In [13]:
append!(x, x) # the same rows as above, but added to x in place

Unnamed: 0,x1,x2,x3,x4,x5
1,0.0856352,0.185821,0.0516146,0.279395,0.370971
2,0.553206,0.111981,0.53803,0.178246,0.894166
3,0.46335,0.976312,0.455692,0.548983,0.648054
4,0.0856352,0.185821,0.0516146,0.279395,0.370971
5,0.553206,0.111981,0.53803,0.178246,0.894166
6,0.46335,0.976312,0.455692,0.548983,0.648054


In [14]:
# add one row at the end of x; must give the correct number of values and correct types
# (the integers 1:5 end up stored as 1.0 ... 5.0)
push!(x, 1:5)
x

Unnamed: 0,x1,x2,x3,x4,x5
1,0.0856352,0.185821,0.0516146,0.279395,0.370971
2,0.553206,0.111981,0.53803,0.178246,0.894166
3,0.46335,0.976312,0.455692,0.548983,0.648054
4,0.0856352,0.185821,0.0516146,0.279395,0.370971
5,0.553206,0.111981,0.53803,0.178246,0.894166
6,0.46335,0.976312,0.455692,0.548983,0.648054
7,1.0,2.0,3.0,4.0,5.0


In [15]:
# also works with a dictionary mapping every column name to its new value
push!(x, Dict(:x1=> 11, :x2=> 12, :x3=> 13, :x4=> 14, :x5=> 15))
x

Unnamed: 0,x1,x2,x3,x4,x5
1,0.0856352,0.185821,0.0516146,0.279395,0.370971
2,0.553206,0.111981,0.53803,0.178246,0.894166
3,0.46335,0.976312,0.455692,0.548983,0.648054
4,0.0856352,0.185821,0.0516146,0.279395,0.370971
5,0.553206,0.111981,0.53803,0.178246,0.894166
6,0.46335,0.976312,0.455692,0.548983,0.648054
7,1.0,2.0,3.0,4.0,5.0
8,11.0,12.0,13.0,14.0,15.0


### Subsetting/removing rows

In [16]:
x[1:2, :] # select rows 1 and 2 by index, keeping all columns

Unnamed: 0,x1,x2,x3,x4,x5
1,0.0856352,0.185821,0.0516146,0.279395,0.370971
2,0.553206,0.111981,0.53803,0.178246,0.894166


In [17]:
view(x, 1:2) # the same rows, but as a view

Unnamed: 0,x1,x2,x3,x4,x5
1,0.0856352,0.185821,0.0516146,0.279395,0.370971
2,0.553206,0.111981,0.53803,0.178246,0.894166


In [18]:
# select rows with a Bool mask (here rows 1, 3, 5, 7) and columns 1:3;
# the mask must have exactly one entry per row
x[repmat([true, false], 4), 1:3]

Unnamed: 0,x1,x2,x3
1,0.0856352,0.185821,0.0516146
2,0.46335,0.976312,0.455692
3,0.553206,0.111981,0.53803
4,1.0,2.0,3.0


In [19]:
view(x, repmat([true, false], 4), 1:3) # the same row mask and columns, as a view

Unnamed: 0,x1,x2,x3
1,0.0856352,0.185821,0.0516146
2,0.46335,0.976312,0.455692
3,0.553206,0.111981,0.53803
4,1.0,2.0,3.0


In [20]:
deleterows!(x, 7) # delete row 7 from x in place

Unnamed: 0,x1,x2,x3,x4,x5
1,0.0856352,0.185821,0.0516146,0.279395,0.370971
2,0.553206,0.111981,0.53803,0.178246,0.894166
3,0.46335,0.976312,0.455692,0.548983,0.648054
4,0.0856352,0.185821,0.0516146,0.279395,0.370971
5,0.553206,0.111981,0.53803,0.178246,0.894166
6,0.46335,0.976312,0.455692,0.548983,0.648054
7,11.0,12.0,13.0,14.0,15.0


In [21]:
deleterows!(x, 6:7) # delete a collection of rows (here rows 6 and 7) in place

Unnamed: 0,x1,x2,x3,x4,x5
1,0.0856352,0.185821,0.0516146,0.279395,0.370971
2,0.553206,0.111981,0.53803,0.178246,0.894166
3,0.46335,0.976312,0.455692,0.548983,0.648054
4,0.0856352,0.185821,0.0516146,0.279395,0.370971
5,0.553206,0.111981,0.53803,0.178246,0.894166


In [22]:
x = DataFrame([1:4, 2:5, 3:6]) # each range becomes one column, auto-named x1, x2, x3

Unnamed: 0,x1,x2,x3
1,1,2,3
2,2,3,4
3,3,4,5
4,4,5,6


In [23]:
# create a new DataFrame; the predicate receives each row as a DataFrameRow
filter(r -> r[:x1] > 2.5, x)

Unnamed: 0,x1,x2,x3
1,3,4,5
2,4,5,6


In [24]:
# filter x in place; the predicate is supplied with do-block syntax
filter!(x) do r
    # rows with x1 above 2.5 are judged on x2, all other rows on x3
    return r[:x1] > 2.5 ? r[:x2] < 4.5 : r[:x3] < 3.5
end

Unnamed: 0,x1,x2,x3
1,1,2,3
2,3,4,5


### Deduplicating

In [25]:
x = DataFrame(A=[1,2], B=["x","y"])
append!(x, x) # duplicate both rows so that A and B contain repeats
x[:C] = 1:4 # add a column whose values are all distinct
x

Unnamed: 0,A,B,C
1,1,x,1
2,2,y,2
3,1,x,3
4,2,y,4


In [26]:
unique(x, [1,2]) # keep the first row for each distinct combination of columns 1 and 2

Unnamed: 0,A,B,C
1,1,x,1
2,2,y,2


In [27]:
unique(x) # now whole rows are compared; C makes every row distinct, so nothing is dropped

Unnamed: 0,A,B,C
1,1,x,1
2,2,y,2
3,1,x,3
4,2,y,4


In [28]:
nonunique(x, :A) # Bool indicator per row: true where the value of :A repeats an earlier row

4-element Array{Bool,1}:
 false
 false
  true
  true

In [29]:
unique!(x, :B) # deduplicate on column :B, modifying x in place

Unnamed: 0,A,B,C
1,1,x,1
2,2,y,2
