# Introduction to DataFrames
**[Bogumił Kamiński](http://bogumilkaminski.pl/about/), Dec 2, 2017**

A brief introduction to the basic usage of the `DataFrames` package, tested under version `0.11`.
I will try to keep it up to date as the package evolves.

In [28]:
using DataFrames # load package

## Manipulating rows of a DataFrame

### Reordering rows

In [29]:
# ten rows: sequential ids, random x values (hopefully not already sorted),
# and y equal to 0.0 for the first five rows and 1.0 for the last five
x = DataFrame(id = 1:10, x = rand(10), y = vcat(zeros(5), ones(5)))

Unnamed: 0,id,x,y
1,1,0.753558,0.0
2,2,0.407035,0.0
3,3,0.244962,0.0
4,4,0.68109,0.0
5,5,0.232874,0.0
6,6,0.575719,1.0
7,7,0.184303,1.0
8,8,0.183109,1.0
9,9,0.625754,1.0
10,10,0.381387,1.0


In [30]:
sort!(x, cols=:x) # sort x in place by column :x (ascending); x itself is modified

Unnamed: 0,id,x,y
1,8,0.183109,1.0
2,7,0.184303,1.0
3,5,0.232874,0.0
4,3,0.244962,0.0
5,10,0.381387,1.0
6,2,0.407035,0.0
7,6,0.575719,1.0
8,9,0.625754,1.0
9,4,0.68109,0.0
10,1,0.753558,0.0


In [31]:
y = sort(x, cols=:id) # returns a new DataFrame sorted by :id and binds it to y

Unnamed: 0,id,x,y
1,1,0.753558,0.0
2,2,0.407035,0.0
3,3,0.244962,0.0
4,4,0.68109,0.0
5,5,0.232874,0.0
6,6,0.575719,1.0
7,7,0.184303,1.0
8,8,0.183109,1.0
9,9,0.625754,1.0
10,10,0.381387,1.0


In [32]:
# sort by :y in descending order, then by :x ascending within equal :y;
# each entry of `rev` applies to the column at the same position in `cols`
sort(x, cols = (:y, :x), rev=(true, false))

Unnamed: 0,id,x,y
1,8,0.183109,1.0
2,7,0.184303,1.0
3,10,0.381387,1.0
4,6,0.575719,1.0
5,9,0.625754,1.0
6,5,0.232874,0.0
7,3,0.244962,0.0
8,2,0.407035,0.0
9,4,0.68109,0.0
10,1,0.753558,0.0


In [33]:
# order() attaches the direction to a single column: :y descending, then :x ascending
sort(x, cols = (order(:y, rev=true), :x)) # the same as above

Unnamed: 0,id,x,y
1,8,0.183109,1.0
2,7,0.184303,1.0
3,10,0.381387,1.0
4,6,0.575719,1.0
5,9,0.625754,1.0
6,5,0.232874,0.0
7,3,0.244962,0.0
8,2,0.407035,0.0
9,4,0.68109,0.0
10,1,0.753558,0.0


In [34]:
# order() also accepts `by`: :x is compared after applying v -> rem(v, 1), :y stays
# descending (all :x values here are below 1, so the result matches the previous cell)
sort(x, cols = (order(:y, rev=true), order(:x, by=v->rem(v,1)))) # some more fancy sorting stuff

Unnamed: 0,id,x,y
1,8,0.183109,1.0
2,7,0.184303,1.0
3,10,0.381387,1.0
4,6,0.575719,1.0
5,9,0.625754,1.0
6,5,0.232874,0.0
7,3,0.244962,0.0
8,2,0.407035,0.0
9,4,0.68109,0.0
10,1,0.753558,0.0


In [35]:
# reorder rows (here randomly) by indexing with a permutation of all row numbers;
# size(x, 1) is the row count, so the same line works for a frame of any length
x[shuffle(1:size(x, 1)), :]

Unnamed: 0,id,x,y
1,8,0.183109,1.0
2,6,0.575719,1.0
3,7,0.184303,1.0
4,1,0.753558,0.0
5,2,0.407035,0.0
6,5,0.232874,0.0
7,9,0.625754,1.0
8,10,0.381387,1.0
9,3,0.244962,0.0
10,4,0.68109,0.0


In [36]:
sort!(x, cols=:id) # restore the original row order (ascending :id) first
x[[1,10],:] = x[[10,1],:] # swap rows: rows 10 and 1 are written into positions 1 and 10
x # display the modified data frame

Unnamed: 0,id,x,y
1,10,0.381387,1.0
2,2,0.407035,0.0
3,3,0.244962,0.0
4,4,0.68109,0.0
5,5,0.232874,0.0
6,6,0.575719,1.0
7,7,0.184303,1.0
8,8,0.183109,1.0
9,9,0.625754,1.0
10,1,0.753558,0.0


In [37]:
# and swap again: both right-hand rows are read before either assignment happens,
# so rows 1 and 10 exchange places
x[1,:], x[10,:] = x[10,:], x[1,:] # and swap again
x # display the modified data frame

Unnamed: 0,id,x,y
1,1,0.753558,0.0
2,2,0.407035,0.0
3,3,0.244962,0.0
4,4,0.68109,0.0
5,5,0.232874,0.0
6,6,0.575719,1.0
7,7,0.184303,1.0
8,8,0.183109,1.0
9,9,0.625754,1.0
10,10,0.381387,1.0


### Merging/adding rows

In [38]:
# build a 3-row, 5-column data frame from a random matrix;
# the columns show up with the generated names x1 ... x5
x = DataFrame(rand(3, 5))

Unnamed: 0,x1,x2,x3,x4,x5
1,0.328494,0.16344,0.0850722,0.920347,0.729588
2,0.0473365,0.238552,0.805049,0.586261,0.554625
3,0.676467,0.28824,0.00808763,0.83394,0.345393


In [39]:
[x; x] # stack data frames by rows (they must have the same column names); same as vcat(x, x)

Unnamed: 0,x1,x2,x3,x4,x5
1,0.328494,0.16344,0.0850722,0.920347,0.729588
2,0.0473365,0.238552,0.805049,0.586261,0.554625
3,0.676467,0.28824,0.00808763,0.83394,0.345393
4,0.328494,0.16344,0.0850722,0.920347,0.729588
5,0.0473365,0.238552,0.805049,0.586261,0.554625
6,0.676467,0.28824,0.00808763,0.83394,0.345393


In [40]:
append!(x, x) # same resulting rows as [x; x], but modifies x in place

Unnamed: 0,x1,x2,x3,x4,x5
1,0.328494,0.16344,0.0850722,0.920347,0.729588
2,0.0473365,0.238552,0.805049,0.586261,0.554625
3,0.676467,0.28824,0.00808763,0.83394,0.345393
4,0.328494,0.16344,0.0850722,0.920347,0.729588
5,0.0473365,0.238552,0.805049,0.586261,0.554625
6,0.676467,0.28824,0.00808763,0.83394,0.345393


In [41]:
# add one row to x at the end; must give correct number of values and correct types
# (here the integers 1:5 end up stored as 1.0 ... 5.0)
push!(x, 1:5)
x # display the modified data frame

Unnamed: 0,x1,x2,x3,x4,x5
1,0.328494,0.16344,0.0850722,0.920347,0.729588
2,0.0473365,0.238552,0.805049,0.586261,0.554625
3,0.676467,0.28824,0.00808763,0.83394,0.345393
4,0.328494,0.16344,0.0850722,0.920347,0.729588
5,0.0473365,0.238552,0.805049,0.586261,0.554625
6,0.676467,0.28824,0.00808763,0.83394,0.345393
7,1.0,2.0,3.0,4.0,5.0


In [42]:
# a row can also be given as a dictionary that maps column names to values
push!(x, Dict(:x1=> 11, :x2=> 12, :x3=> 13, :x4=> 14, :x5=> 15)) # also works with dictionaries
x # display the modified data frame

Unnamed: 0,x1,x2,x3,x4,x5
1,0.328494,0.16344,0.0850722,0.920347,0.729588
2,0.0473365,0.238552,0.805049,0.586261,0.554625
3,0.676467,0.28824,0.00808763,0.83394,0.345393
4,0.328494,0.16344,0.0850722,0.920347,0.729588
5,0.0473365,0.238552,0.805049,0.586261,0.554625
6,0.676467,0.28824,0.00808763,0.83394,0.345393
7,1.0,2.0,3.0,4.0,5.0
8,11.0,12.0,13.0,14.0,15.0


### Subsetting/removing rows

In [43]:
x[1:2, :] # select rows 1 and 2 by index, keeping all columns

Unnamed: 0,x1,x2,x3,x4,x5
1,0.328494,0.16344,0.0850722,0.920347,0.729588
2,0.0473365,0.238552,0.805049,0.586261,0.554625


In [44]:
view(x, 1:2) # the same rows as x[1:2, :], but returned as a view of x

Unnamed: 0,x1,x2,x3,x4,x5
1,0.328494,0.16344,0.0850722,0.920347,0.729588
2,0.0473365,0.238552,0.805049,0.586261,0.554625


In [45]:
# select rows with a Bool mask; the mask needs exactly one entry per row
# (x has 8 rows here, so the two-element pattern is repeated 4 times)
x[repmat([true, false], 4), :] # by Bool, exact length required

Unnamed: 0,x1,x2,x3,x4,x5
1,0.328494,0.16344,0.0850722,0.920347,0.729588
2,0.676467,0.28824,0.00808763,0.83394,0.345393
3,0.0473365,0.238552,0.805049,0.586261,0.554625
4,1.0,2.0,3.0,4.0,5.0


In [46]:
view(x, repmat([true, false], 4)) # the same Bool mask, but returned as a view of x

Unnamed: 0,x1,x2,x3,x4,x5
1,0.328494,0.16344,0.0850722,0.920347,0.729588
2,0.676467,0.28824,0.00808763,0.83394,0.345393
3,0.0473365,0.238552,0.805049,0.586261,0.554625
4,1.0,2.0,3.0,4.0,5.0


In [47]:
deleterows!(x, 7) # delete one row (row 7) from x in place

Unnamed: 0,x1,x2,x3,x4,x5
1,0.328494,0.16344,0.0850722,0.920347,0.729588
2,0.0473365,0.238552,0.805049,0.586261,0.554625
3,0.676467,0.28824,0.00808763,0.83394,0.345393
4,0.328494,0.16344,0.0850722,0.920347,0.729588
5,0.0473365,0.238552,0.805049,0.586261,0.554625
6,0.676467,0.28824,0.00808763,0.83394,0.345393
7,11.0,12.0,13.0,14.0,15.0


In [48]:
deleterows!(x, 6:7) # delete a collection of rows (here the range 6:7) from x in place

Unnamed: 0,x1,x2,x3,x4,x5
1,0.328494,0.16344,0.0850722,0.920347,0.729588
2,0.0473365,0.238552,0.805049,0.586261,0.554625
3,0.676467,0.28824,0.00808763,0.83394,0.345393
4,0.328494,0.16344,0.0850722,0.920347,0.729588
5,0.0473365,0.238552,0.805049,0.586261,0.554625


### Deduplicating

In [49]:
# build a frame whose (A, B) pairs repeat while column C stays unique per row
x = DataFrame(A=[1,2], B=["x","y"])
append!(x, x) # duplicate the two rows in place, giving four rows
x[:C] = 1:4 # add a column C that differs on every row
x # display the result

Unnamed: 0,A,B,C
1,1,x,1
2,2,y,2
3,1,x,3
4,2,y,4


In [50]:
unique(x, [1,2]) # keep the first row for each distinct combination of columns 1 and 2 (A, B)

Unnamed: 0,A,B,C
1,1,x,1
2,2,y,2


In [51]:
unique(x) # now whole rows are compared; every row differs in C, so all four are kept

Unnamed: 0,A,B,C
1,1,x,1
2,2,y,2
3,1,x,3
4,2,y,4


In [52]:
nonunique(x, :A) # Bool per row: true where the value of :A already appeared in an earlier row

4-element Array{Bool,1}:
 false
 false
  true
  true

In [53]:
unique!(x, :B) # drop rows that repeat an earlier :B value, modifying x in place

Unnamed: 0,A,B,C
1,1,x,1
2,2,y,2
