# Introduction to DataFrames
**[Bogumił Kamiński](http://bogumilkaminski.pl/about/), October 4, 2018**

In [1]:
# DataFrames provides the tabular API; Random provides seeding and `shuffle`.
using DataFrames, Random # load packages
# Seed the global RNG so that the random examples below are reproducible.
Random.seed!(1);

## Manipulating rows of a DataFrame

### Reordering rows

In [2]:
# 10-row example frame: sequential `id`, random `x`, and `y` = five 0.0s followed by five 1.0s.
x = DataFrame(id=1:10, x = rand(10), y = [zeros(5); ones(5)]) # with random values we expect x[:x] not to be sorted :)

Unnamed: 0_level_0,id,x,y
Unnamed: 0_level_1,Int64,Float64,Float64
1,1,0.236033,0.0
2,2,0.346517,0.0
3,3,0.312707,0.0
4,4,0.00790928,0.0
5,5,0.488613,0.0
6,6,0.210968,1.0
7,7,0.951916,1.0
8,8,0.999905,1.0
9,9,0.251662,1.0
10,10,0.986666,1.0


In [3]:
# First call checks the rows of the whole DataFrame; second call checks column :x only.
issorted(x), issorted(x, :x) # check if a DataFrame or a subset of its columns is sorted

(true, false)

In [4]:
sort!(x, :x) # sort the rows of x in place by column :x (ascending)

Unnamed: 0_level_0,id,x,y
Unnamed: 0_level_1,Int64,Float64,Float64
1,4,0.00790928,0.0
2,6,0.210968,1.0
3,1,0.236033,0.0
4,9,0.251662,1.0
5,3,0.312707,0.0
6,2,0.346517,0.0
7,5,0.488613,0.0
8,7,0.951916,1.0
9,10,0.986666,1.0
10,8,0.999905,1.0


In [5]:
y = sort(x, :id) # non-mutating sort: the rows ordered by :id are returned as a new DataFrame

Unnamed: 0_level_0,id,x,y
Unnamed: 0_level_1,Int64,Float64,Float64
1,1,0.236033,0.0
2,2,0.346517,0.0
3,3,0.312707,0.0
4,4,0.00790928,0.0
5,5,0.488613,0.0
6,6,0.210968,1.0
7,7,0.951916,1.0
8,8,0.999905,1.0
9,9,0.251662,1.0
10,10,0.986666,1.0


In [6]:
sort(x, (:y, :x), rev=(true, false)) # sort by two columns: :y decreasing (rev=true), then :x increasing (rev=false)

Unnamed: 0_level_0,id,x,y
Unnamed: 0_level_1,Int64,Float64,Float64
1,6,0.210968,1.0
2,9,0.251662,1.0
3,7,0.951916,1.0
4,10,0.986666,1.0
5,8,0.999905,1.0
6,4,0.00790928,0.0
7,1,0.236033,0.0
8,3,0.312707,0.0
9,2,0.346517,0.0
10,5,0.488613,0.0


In [7]:
sort(x, (order(:y, rev=true), :x)) # same ordering as the previous cell, with the direction set per column via order()

Unnamed: 0_level_0,id,x,y
Unnamed: 0_level_1,Int64,Float64,Float64
1,6,0.210968,1.0
2,9,0.251662,1.0
3,7,0.951916,1.0
4,10,0.986666,1.0
5,8,0.999905,1.0
6,4,0.00790928,0.0
7,1,0.236033,0.0
8,3,0.312707,0.0
9,2,0.346517,0.0
10,5,0.488613,0.0


In [8]:
sort(x, (order(:y, rev=true), order(:x, by=v->-v))) # :y decreasing, then :x compared on its negated value, i.e. also decreasing

Unnamed: 0_level_0,id,x,y
Unnamed: 0_level_1,Int64,Float64,Float64
1,8,0.999905,1.0
2,10,0.986666,1.0
3,7,0.951916,1.0
4,9,0.251662,1.0
5,6,0.210968,1.0
6,5,0.488613,0.0
7,2,0.346517,0.0
8,3,0.312707,0.0
9,1,0.236033,0.0
10,4,0.00790928,0.0


In [9]:
x[shuffle(1:10), :] # reorder rows by indexing with a permutation of the row numbers (here a random one)

Unnamed: 0_level_0,id,x,y
Unnamed: 0_level_1,Int64,Float64,Float64
1,8,0.999905,1.0
2,10,0.986666,1.0
3,7,0.951916,1.0
4,2,0.346517,0.0
5,5,0.488613,0.0
6,1,0.236033,0.0
7,6,0.210968,1.0
8,9,0.251662,1.0
9,4,0.00790928,0.0
10,3,0.312707,0.0


In [10]:
sort!(x, :id) # put the rows back in :id order first
x[[1,10],:] = x[[10,1],:] # swap rows 1 and 10
x # display the modified DataFrame

Unnamed: 0_level_0,id,x,y
Unnamed: 0_level_1,Int64,Float64,Float64
1,10,0.986666,1.0
2,2,0.346517,0.0
3,3,0.312707,0.0
4,4,0.00790928,0.0
5,5,0.488613,0.0
6,6,0.210968,1.0
7,7,0.951916,1.0
8,8,0.999905,1.0
9,9,0.251662,1.0
10,1,0.236033,0.0


In [11]:
x[1,:], x[10,:] = x[10,:], x[1,:] # swap rows 1 and 10 back, this time via tuple assignment
x # display the modified DataFrame

Unnamed: 0_level_0,id,x,y
Unnamed: 0_level_1,Int64,Float64,Float64
1,1,0.236033,0.0
2,2,0.346517,0.0
3,3,0.312707,0.0
4,4,0.00790928,0.0
5,5,0.488613,0.0
6,6,0.210968,1.0
7,7,0.951916,1.0
8,8,0.999905,1.0
9,9,0.251662,1.0
10,10,0.986666,1.0


### Merging/adding rows

In [12]:
x = DataFrame(rand(3, 5)) # build from a 3x5 random matrix; columns get the automatic names x1..x5

Unnamed: 0_level_0,x1,x2,x3,x4,x5
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Float64
1,0.0856352,0.185821,0.0516146,0.279395,0.370971
2,0.553206,0.111981,0.53803,0.178246,0.894166
3,0.46335,0.976312,0.455692,0.548983,0.648054


In [13]:
[x; x] # concatenate by rows - data frames must have the same column names; equivalent to vcat(x, x)

Unnamed: 0_level_0,x1,x2,x3,x4,x5
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Float64
1,0.0856352,0.185821,0.0516146,0.279395,0.370971
2,0.553206,0.111981,0.53803,0.178246,0.894166
3,0.46335,0.976312,0.455692,0.548983,0.648054
4,0.0856352,0.185821,0.0516146,0.279395,0.370971
5,0.553206,0.111981,0.53803,0.178246,0.894166
6,0.46335,0.976312,0.455692,0.548983,0.648054


In [14]:
reduce(vcat, [x, x, x]) # a whole vector of DataFrames can be stacked efficiently with reduce(vcat, ...)

Unnamed: 0_level_0,x1,x2,x3,x4,x5
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Float64
1,0.0856352,0.185821,0.0516146,0.279395,0.370971
2,0.553206,0.111981,0.53803,0.178246,0.894166
3,0.46335,0.976312,0.455692,0.548983,0.648054
4,0.0856352,0.185821,0.0516146,0.279395,0.370971
5,0.553206,0.111981,0.53803,0.178246,0.894166
6,0.46335,0.976312,0.455692,0.548983,0.648054
7,0.0856352,0.185821,0.0516146,0.279395,0.370971
8,0.553206,0.111981,0.53803,0.178246,0.894166
9,0.46335,0.976312,0.455692,0.548983,0.648054


In [15]:
y = x[reverse(names(x))] # y has the same columns as x, but in reversed order (x5, ..., x1)

Unnamed: 0_level_0,x5,x4,x3,x2,x1
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Float64
1,0.370971,0.279395,0.0516146,0.185821,0.0856352
2,0.894166,0.178246,0.53803,0.111981,0.553206
3,0.648054,0.548983,0.455692,0.976312,0.46335


In [16]:
vcat(x, y) # vcat matches columns by name, not by position, so the differing column order of y does not matter

Unnamed: 0_level_0,x1,x2,x3,x4,x5
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Float64
1,0.0856352,0.185821,0.0516146,0.279395,0.370971
2,0.553206,0.111981,0.53803,0.178246,0.894166
3,0.46335,0.976312,0.455692,0.548983,0.648054
4,0.0856352,0.185821,0.0516146,0.279395,0.370971
5,0.553206,0.111981,0.53803,0.178246,0.894166
6,0.46335,0.976312,0.455692,0.548983,0.648054


In [17]:
vcat(x, y[1:3]) # errors: y[1:3] keeps only columns x5, x4 and x3, so x1 and x2 have no match

ArgumentError: column(s) x1 and x2 are missing from argument(s) 2

In [18]:
append!(x, x) # like vcat(x, x), but the rows are appended to x in place

Unnamed: 0_level_0,x1,x2,x3,x4,x5
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Float64
1,0.0856352,0.185821,0.0516146,0.279395,0.370971
2,0.553206,0.111981,0.53803,0.178246,0.894166
3,0.46335,0.976312,0.455692,0.548983,0.648054
4,0.0856352,0.185821,0.0516146,0.279395,0.370971
5,0.553206,0.111981,0.53803,0.178246,0.894166
6,0.46335,0.976312,0.455692,0.548983,0.648054


In [19]:
append!(x, y) # errors: y has the same names as x but in a different order, and append! needs an exact match

ErrorException: Column names do not match

In [20]:
repeat(x, 2) # the standard repeat function repeats the rows; `inner` and `outer` keyword arguments are also accepted

Unnamed: 0_level_0,x1,x2,x3,x4,x5
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Float64
1,0.0856352,0.185821,0.0516146,0.279395,0.370971
2,0.553206,0.111981,0.53803,0.178246,0.894166
3,0.46335,0.976312,0.455692,0.548983,0.648054
4,0.0856352,0.185821,0.0516146,0.279395,0.370971
5,0.553206,0.111981,0.53803,0.178246,0.894166
6,0.46335,0.976312,0.455692,0.548983,0.648054
7,0.0856352,0.185821,0.0516146,0.279395,0.370971
8,0.553206,0.111981,0.53803,0.178246,0.894166
9,0.46335,0.976312,0.455692,0.548983,0.648054
10,0.0856352,0.185821,0.0516146,0.279395,0.370971


In [21]:
push!(x, 1:5) # add one row at the end; needs one value per column, of a compatible type (the Ints are stored as Float64 here)
x # display the modified DataFrame

Unnamed: 0_level_0,x1,x2,x3,x4,x5
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Float64
1,0.0856352,0.185821,0.0516146,0.279395,0.370971
2,0.553206,0.111981,0.53803,0.178246,0.894166
3,0.46335,0.976312,0.455692,0.548983,0.648054
4,0.0856352,0.185821,0.0516146,0.279395,0.370971
5,0.553206,0.111981,0.53803,0.178246,0.894166
6,0.46335,0.976312,0.455692,0.548983,0.648054
7,1.0,2.0,3.0,4.0,5.0


In [22]:
push!(x, Dict(:x1=> 11, :x2=> 12, :x3=> 13, :x4=> 14, :x5=> 15)) # a Dict mapping column names to values also works
x # display the modified DataFrame

Unnamed: 0_level_0,x1,x2,x3,x4,x5
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Float64
1,0.0856352,0.185821,0.0516146,0.279395,0.370971
2,0.553206,0.111981,0.53803,0.178246,0.894166
3,0.46335,0.976312,0.455692,0.548983,0.648054
4,0.0856352,0.185821,0.0516146,0.279395,0.370971
5,0.553206,0.111981,0.53803,0.178246,0.894166
6,0.46335,0.976312,0.455692,0.548983,0.648054
7,1.0,2.0,3.0,4.0,5.0
8,11.0,12.0,13.0,14.0,15.0


In [23]:
push!(x, (x2=2, x1=1, x4=4, x3=3, x5=5)) # a NamedTuple is matched by field name, so the field order does not matter

Unnamed: 0_level_0,x1,x2,x3,x4,x5
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Float64
1,0.0856352,0.185821,0.0516146,0.279395,0.370971
2,0.553206,0.111981,0.53803,0.178246,0.894166
3,0.46335,0.976312,0.455692,0.548983,0.648054
4,0.0856352,0.185821,0.0516146,0.279395,0.370971
5,0.553206,0.111981,0.53803,0.178246,0.894166
6,0.46335,0.976312,0.455692,0.548983,0.648054
7,1.0,2.0,3.0,4.0,5.0
8,11.0,12.0,13.0,14.0,15.0
9,1.0,2.0,3.0,4.0,5.0


### Subsetting/removing rows

In [24]:
x = DataFrame(id=1:10, val='a':'j') # 10-row example frame: integer ids and Char values 'a' through 'j'

Unnamed: 0_level_0,id,val
Unnamed: 0_level_1,Int64,Char
1,1,'a'
2,2,'b'
3,3,'c'
4,4,'d'
5,5,'e'
6,6,'f'
7,7,'g'
8,8,'h'
9,9,'i'
10,10,'j'


In [25]:
x[1:2, :] # select rows by integer index, keeping all columns

Unnamed: 0_level_0,id,val
Unnamed: 0_level_1,Int64,Char
1,1,'a'
2,2,'b'


In [26]:
view(x, 1:2, :) # the same rows as x[1:2, :], but as a view; the explicit `:` makes clear that 1:2 indexes rows

Unnamed: 0_level_0,id,val
Unnamed: 0_level_1,Int64,Char
1,1,'a'
2,2,'b'


In [27]:
x[repeat([true, false], 5), :] # select rows with a Bool mask; its length must equal the number of rows

Unnamed: 0_level_0,id,val
Unnamed: 0_level_1,Int64,Char
1,1,'a'
2,3,'c'
3,5,'e'
4,7,'g'
5,9,'i'


In [28]:
view(x, repeat([true, false], 5), :) # the same Bool-mask row selection, but returned as a view

Unnamed: 0_level_0,id,val
Unnamed: 0_level_1,Int64,Char
1,1,'a'
2,3,'c'
3,5,'e'
4,7,'g'
5,9,'i'


In [29]:
deleterows!(x, 7) # delete row 7 from x in place

Unnamed: 0_level_0,id,val
Unnamed: 0_level_1,Int64,Char
1,1,'a'
2,2,'b'
3,3,'c'
4,4,'d'
5,5,'e'
6,6,'f'
7,8,'h'
8,9,'i'
9,10,'j'


In [30]:
deleterows!(x, 6:7) # delete a collection of rows in place: rows 6 and 7 of the current x (ids 6 and 8)

Unnamed: 0_level_0,id,val
Unnamed: 0_level_1,Int64,Char
1,1,'a'
2,2,'b'
3,3,'c'
4,4,'d'
5,5,'e'
6,9,'i'
7,10,'j'


In [31]:
x = DataFrame([1:4, 2:5, 3:6]) # build from a vector of columns; names default to x1, x2, x3

Unnamed: 0_level_0,x1,x2,x3
Unnamed: 0_level_1,Int64,Int64,Int64
1,1,2,3
2,2,3,4
3,3,4,5
4,4,5,6


In [32]:
filter(r -> r.x1 > 2.5, x) # keep rows with x1 > 2.5 in a new DataFrame; the predicate receives each row as a DataFrameRow

Unnamed: 0_level_0,x1,x2,x3
Unnamed: 0_level_1,Int64,Int64,Int64
1,3,4,5
2,4,5,6


In [33]:
# filter x in place; the do-block supplies the row predicate
filter!(x) do row
    # rows with x1 > 2.5 are judged on x2, all remaining rows on x3
    row.x1 > 2.5 ? row.x2 < 4.5 : row.x3 < 3.5
end

Unnamed: 0_level_0,x1,x2,x3
Unnamed: 0_level_1,Int64,Int64,Int64
1,1,2,3
2,3,4,5


### Deduplicating

In [34]:
x = DataFrame(A=[1,2], B=["x","y"]) # two distinct rows
append!(x, x) # append a copy of the rows, so (A, B) pairs now repeat
x.C = 1:4 # add a column whose values differ in every row
x # display the resulting DataFrame

Unnamed: 0_level_0,A,B,C
Unnamed: 0_level_1,Int64,String,Int64
1,1,x,1
2,2,y,2
3,1,x,3
4,2,y,4


In [35]:
unique(x, [1,2]) # keep the first row for each distinct combination of columns 1 and 2 (A and B)

Unnamed: 0_level_0,A,B,C
Unnamed: 0_level_1,Int64,String,Int64
1,1,x,1
2,2,y,2


In [36]:
unique(x) # now whole rows are compared; column C makes every row distinct, so nothing is dropped

Unnamed: 0_level_0,A,B,C
Unnamed: 0_level_1,Int64,String,Int64
1,1,x,1
2,2,y,2
3,1,x,3
4,2,y,4


In [37]:
nonunique(x, :A) # Bool indicators: true marks rows whose :A value already appeared in an earlier row

4-element Array{Bool,1}:
 false
 false
  true
  true

In [38]:
unique!(x, :B) # deduplicate on column :B, modifying x in place

Unnamed: 0_level_0,A,B,C
Unnamed: 0_level_1,Int64,String,Int64
1,1,x,1
2,2,y,2


### Extracting one row from a `DataFrame` into a vector

In [39]:
x = DataFrame(x=[1,missing,2], y=["a", "b", missing], z=[true,false,true]) # columns :x and :y contain missing values, :z does not

Unnamed: 0_level_0,x,y,z
Unnamed: 0_level_1,Int64⍰,String⍰,Bool
1,1,a,True
2,missing,b,False
3,2,missing,True


In [40]:
cols = [:x, :y] # the columns to extract
[x[1, col] for col in cols] # values of row 1 for the chosen subset of columns

2-element Array{Any,1}:
 1   
  "a"

In [41]:
[[x[i, col] for col in names(x)] for i in 1:nrow(x)] # vector of vectors: one inner vector per row, holding that full row of x

3-element Array{Array{Any,1},1}:
 [1, "a", true]       
 [missing, "b", false]
 [2, missing, true]   

In [42]:
Tuple(x[1, col] for col in cols) # the same construct, collecting the row values into a Tuple instead of a vector

(1, "a")

In [43]:
vec(Matrix(x[1, cols])) # alternatively, convert the one-row selection to a Matrix and flatten it with vec

2-element Array{Any,1}:
 1   
  "a"