# Introduction to DataFrames
**[Bogumił Kamiński](http://bogumilkaminski.pl/about/), Dec 2, 2017**

A brief introduction to the basic usage of the `DataFrames` package. Tested under `DataFrames` version `0.11`.
I will try to keep it up to date as the package evolves.

In [55]:
using DataFrames # load package

## Manipulating rows of DataFrame

### Reordering rows

In [56]:
# Example frame with 10 rows: sequential ids, random x values, and a y column made of
# five 0.0s followed by five 1.0s. The x column is random, so we hope it is not sorted :)
x = DataFrame(id = 1:10, x = rand(10), y = vcat(zeros(5), ones(5)))

Unnamed: 0,id,x,y
1,1,0.581494,0.0
2,2,0.378324,0.0
3,3,0.828198,0.0
4,4,0.29051,0.0
5,5,0.611678,0.0
6,6,0.239892,1.0
7,7,0.307028,1.0
8,8,0.314251,1.0
9,9,0.681489,1.0
10,10,0.027058,1.0


In [57]:
sort!(x, cols=:x) # sort the rows of x in place, ascending by column :x

Unnamed: 0,id,x,y
1,10,0.027058,1.0
2,6,0.239892,1.0
3,4,0.29051,0.0
4,7,0.307028,1.0
5,8,0.314251,1.0
6,2,0.378324,0.0
7,1,0.581494,0.0
8,5,0.611678,0.0
9,9,0.681489,1.0
10,3,0.828198,0.0


In [58]:
y = sort(x, cols=:id) # sort by :id into a new DataFrame bound to y (no `!`, so not in place)

Unnamed: 0,id,x,y
1,1,0.581494,0.0
2,2,0.378324,0.0
3,3,0.828198,0.0
4,4,0.29051,0.0
5,5,0.611678,0.0
6,6,0.239892,1.0
7,7,0.307028,1.0
8,8,0.314251,1.0
9,9,0.681489,1.0
10,10,0.027058,1.0


In [59]:
# sort by :y in descending order, then by :x in ascending order within equal :y
sort(x, cols = (:y, :x), rev=(true, false))

Unnamed: 0,id,x,y
1,10,0.027058,1.0
2,6,0.239892,1.0
3,7,0.307028,1.0
4,8,0.314251,1.0
5,9,0.681489,1.0
6,4,0.29051,0.0
7,2,0.378324,0.0
8,1,0.581494,0.0
9,5,0.611678,0.0
10,3,0.828198,0.0


In [60]:
sort(x, cols = (order(:y, rev=true), :x)) # same ordering as the previous cell, with rev set per column via order()

Unnamed: 0,id,x,y
1,10,0.027058,1.0
2,6,0.239892,1.0
3,7,0.307028,1.0
4,8,0.314251,1.0
5,9,0.681489,1.0
6,4,0.29051,0.0
7,2,0.378324,0.0
8,1,0.581494,0.0
9,5,0.611678,0.0
10,3,0.828198,0.0


In [61]:
# order() also accepts by=: here :x values are compared after applying v -> rem(v, 1);
# the displayed result matches the previous cell
sort(x, cols = (order(:y, rev=true), order(:x, by=v->rem(v,1))))

Unnamed: 0,id,x,y
1,10,0.027058,1.0
2,6,0.239892,1.0
3,7,0.307028,1.0
4,8,0.314251,1.0
5,9,0.681489,1.0
6,4,0.29051,0.0
7,2,0.378324,0.0
8,1,0.581494,0.0
9,5,0.611678,0.0
10,3,0.828198,0.0


In [62]:
x[shuffle(1:10), :] # reorder rows by indexing with a permutation of the row numbers (here a random one)

Unnamed: 0,id,x,y
1,6,0.239892,1.0
2,3,0.828198,0.0
3,5,0.611678,0.0
4,9,0.681489,1.0
5,7,0.307028,1.0
6,2,0.378324,0.0
7,4,0.29051,0.0
8,8,0.314251,1.0
9,1,0.581494,0.0
10,10,0.027058,1.0


In [63]:
sort!(x, cols=:id) # put x back into :id order first
x[[1,10],:] = x[[10,1],:] # swap rows 1 and 10 by assigning them in reversed order
x # display the modified data frame

Unnamed: 0,id,x,y
1,10,0.027058,1.0
2,2,0.378324,0.0
3,3,0.828198,0.0
4,4,0.29051,0.0
5,5,0.611678,0.0
6,6,0.239892,1.0
7,7,0.307028,1.0
8,8,0.314251,1.0
9,9,0.681489,1.0
10,1,0.581494,0.0


In [64]:
x[1,:], x[10,:] = x[10,:], x[1,:] # swap rows 1 and 10 back, this time with a tuple assignment
x # display the modified data frame

Unnamed: 0,id,x,y
1,1,0.581494,0.0
2,2,0.378324,0.0
3,3,0.828198,0.0
4,4,0.29051,0.0
5,5,0.611678,0.0
6,6,0.239892,1.0
7,7,0.307028,1.0
8,8,0.314251,1.0
9,9,0.681489,1.0
10,10,0.027058,1.0


### Merging/adding rows

In [65]:
x = DataFrame(rand(3, 5)) # build from a 3x5 random matrix; columns are displayed as x1 ... x5

Unnamed: 0,x1,x2,x3,x4,x5
1,0.918193,0.730542,0.17528,0.290656,0.32454
2,0.99533,0.0354157,0.317524,0.873675,0.43646
3,0.00393182,0.613515,0.571327,0.399088,0.704549


In [66]:
[x; x] # stack rows; the data frames must have the same column names; equivalent to vcat(x, x)

Unnamed: 0,x1,x2,x3,x4,x5
1,0.918193,0.730542,0.17528,0.290656,0.32454
2,0.99533,0.0354157,0.317524,0.873675,0.43646
3,0.00393182,0.613515,0.571327,0.399088,0.704549
4,0.918193,0.730542,0.17528,0.290656,0.32454
5,0.99533,0.0354157,0.317524,0.873675,0.43646
6,0.00393182,0.613515,0.571327,0.399088,0.704549


In [67]:
append!(x, x) # same row-wise stacking as [x; x], but modifies x in place

Unnamed: 0,x1,x2,x3,x4,x5
1,0.918193,0.730542,0.17528,0.290656,0.32454
2,0.99533,0.0354157,0.317524,0.873675,0.43646
3,0.00393182,0.613515,0.571327,0.399088,0.704549
4,0.918193,0.730542,0.17528,0.290656,0.32454
5,0.99533,0.0354157,0.317524,0.873675,0.43646
6,0.00393182,0.613515,0.571327,0.399088,0.704549


In [68]:
push!(x, 1:5) # append one row at the end of x; needs one value per column, each convertible to the column's type
x # display x: the integers 1:5 are shown as 1.0 ... 5.0

Unnamed: 0,x1,x2,x3,x4,x5
1,0.918193,0.730542,0.17528,0.290656,0.32454
2,0.99533,0.0354157,0.317524,0.873675,0.43646
3,0.00393182,0.613515,0.571327,0.399088,0.704549
4,0.918193,0.730542,0.17528,0.290656,0.32454
5,0.99533,0.0354157,0.317524,0.873675,0.43646
6,0.00393182,0.613515,0.571327,0.399088,0.704549
7,1.0,2.0,3.0,4.0,5.0


In [69]:
push!(x, Dict(:x1=> 11, :x2=> 12, :x3=> 13, :x4=> 14, :x5=> 15)) # a row can also be given as a Dict of column name => value
x # display the modified data frame

Unnamed: 0,x1,x2,x3,x4,x5
1,0.918193,0.730542,0.17528,0.290656,0.32454
2,0.99533,0.0354157,0.317524,0.873675,0.43646
3,0.00393182,0.613515,0.571327,0.399088,0.704549
4,0.918193,0.730542,0.17528,0.290656,0.32454
5,0.99533,0.0354157,0.317524,0.873675,0.43646
6,0.00393182,0.613515,0.571327,0.399088,0.704549
7,1.0,2.0,3.0,4.0,5.0
8,11.0,12.0,13.0,14.0,15.0


### Subsetting/removing rows

In [70]:
x[1:2, :] # select rows by index (here rows 1 and 2); `:` keeps all columns

Unnamed: 0,x1,x2,x3,x4,x5
1,0.918193,0.730542,0.17528,0.290656,0.32454
2,0.99533,0.0354157,0.317524,0.873675,0.43646


In [71]:
x[repmat([true, false], 4), :] # select rows with a Bool mask; its length must equal the number of rows (here 8)

Unnamed: 0,x1,x2,x3,x4,x5
1,0.918193,0.730542,0.17528,0.290656,0.32454
2,0.00393182,0.613515,0.571327,0.399088,0.704549
3,0.99533,0.0354157,0.317524,0.873675,0.43646
4,1.0,2.0,3.0,4.0,5.0


In [72]:
deleterows!(x, 7) # delete a single row (row 7) from x in place

Unnamed: 0,x1,x2,x3,x4,x5
1,0.918193,0.730542,0.17528,0.290656,0.32454
2,0.99533,0.0354157,0.317524,0.873675,0.43646
3,0.00393182,0.613515,0.571327,0.399088,0.704549
4,0.918193,0.730542,0.17528,0.290656,0.32454
5,0.99533,0.0354157,0.317524,0.873675,0.43646
6,0.00393182,0.613515,0.571327,0.399088,0.704549
7,11.0,12.0,13.0,14.0,15.0


In [73]:
deleterows!(x, 6:7) # delete several rows at once (here rows 6 and 7) from x in place

Unnamed: 0,x1,x2,x3,x4,x5
1,0.918193,0.730542,0.17528,0.290656,0.32454
2,0.99533,0.0354157,0.317524,0.873675,0.43646
3,0.00393182,0.613515,0.571327,0.399088,0.704549
4,0.918193,0.730542,0.17528,0.290656,0.32454
5,0.99533,0.0354157,0.317524,0.873675,0.43646


### Deduplicating

In [75]:
x = DataFrame(A=[1,2], B=["x","y"])
append!(x, x) # duplicate both rows so that columns A and B contain repeated values
x[:C] = 1:4 # add a column whose value is different in every row
x # display the resulting data frame

Unnamed: 0,A,B,C
1,1,x,1
2,2,y,2
3,1,x,3
4,2,y,4


In [78]:
unique(x, [1,2]) # keep the first row for each distinct combination of values in columns 1 and 2 (A and B)

Unnamed: 0,A,B,C
1,1,x,1
2,2,y,2


In [79]:
unique(x) # now whole rows are compared; C differs in every row, so all four rows are kept

Unnamed: 0,A,B,C
1,1,x,1
2,2,y,2
3,1,x,3
4,2,y,4


In [81]:
nonunique(x, :A) # Bool vector: true marks a row whose :A value already occurred in an earlier row

4-element Array{Bool,1}:
 false
 false
  true
  true

In [82]:
unique!(x, :B) # drop rows whose :B value repeats an earlier row, modifying x in place

Unnamed: 0,A,B,C
1,1,x,1
2,2,y,2
