# Introduction to DataFrames
**[Bogumił Kamiński](http://bogumilkaminski.pl/about/), August 26, 2019**

In [1]:
# use the project environment (Project.toml) found in the current directory
using Pkg
Pkg.activate(".")

[32m[1mActivating[22m[39m environment at `d:\Dev\Julia\DataFrames_Tutorial\Project.toml`


In [2]:
using DataFrames

## Split-apply-combine

In [3]:
# example data: integer key columns `id` and `id2` plus a random Float64 column `v`
x = DataFrame(id=[1,2,3,4,1,2,3,4], id2=[1,2,1,2,1,2,1,2], v=rand(8))

Unnamed: 0_level_0,id,id2,v
Unnamed: 0_level_1,Int64,Int64,Float64
1,1,1,0.271526
2,2,2,0.642008
3,3,1,0.303488
4,4,2,0.660327
5,1,1,0.423707
6,2,2,0.833898
7,3,1,0.267755
8,4,2,0.865183


In [4]:
gx1 = groupby(x, :id) # split x into groups by the values of column :id (only the first and last group are displayed)

Unnamed: 0_level_0,id,id2,v
Unnamed: 0_level_1,Int64,Int64,Float64
1,1,1,0.271526
2,1,1,0.423707

Unnamed: 0_level_0,id,id2,v
Unnamed: 0_level_1,Int64,Int64,Float64
1,4,2,0.660327
2,4,2,0.865183


In [5]:
gx2 = groupby(x, [:id, :id2]) # pass a vector of column names to group by several columns at once

Unnamed: 0_level_0,id,id2,v
Unnamed: 0_level_1,Int64,Int64,Float64
1,1,1,0.271526
2,1,1,0.423707

Unnamed: 0_level_0,id,id2,v
Unnamed: 0_level_1,Int64,Int64,Float64
1,4,2,0.660327
2,4,2,0.865183


In [6]:
parent(gx2) # get the parent DataFrame from which gx2 was created

Unnamed: 0_level_0,id,id2,v
Unnamed: 0_level_1,Int64,Int64,Float64
1,1,1,0.271526
2,2,2,0.642008
3,3,1,0.303488
4,4,2,0.660327
5,1,1,0.423707
6,2,2,0.833898
7,3,1,0.267755
8,4,2,0.865183


In [7]:
vcat(gx2...) # back to a DataFrame, but with rows ordered group by group rather than as in the original

Unnamed: 0_level_0,id,id2,v
Unnamed: 0_level_1,Int64,Int64,Float64
1,1,1,0.271526
2,1,1,0.423707
3,2,2,0.642008
4,2,2,0.833898
5,3,1,0.303488
6,3,1,0.267755
7,4,2,0.660327
8,4,2,0.865183


In [8]:
DataFrame(gx2) # gives the same result as vcat(gx2...)

Unnamed: 0_level_0,id,id2,v
Unnamed: 0_level_1,Int64,Int64,Float64
1,1,1,0.271526
2,1,1,0.423707
3,2,2,0.642008
4,2,2,0.833898
5,3,1,0.303488
6,3,1,0.267755
7,4,2,0.660327
8,4,2,0.865183


In [9]:
groupvars(gx2) # vector of names (Symbols) of the grouping variables

2-element Array{Symbol,1}:
 :id 
 :id2

In [10]:
groupindices(gx2) # for each row of parent(gx2), the index of the group that row belongs to

8-element Array{Union{Missing, Int64},1}:
 1
 2
 3
 4
 1
 2
 3
 4

In [11]:
# a data frame whose grouping column `id` contains missing values
x = DataFrame(id = [missing, 5, 1, 3, missing], x = 1:5)

Unnamed: 0_level_0,id,x
Unnamed: 0_level_1,Int64⍰,Int64
1,missing,1
2,5,2
3,1,3
4,3,4
5,missing,5


In [12]:
groupby(x, :id) # by default groups include missing values and are not sorted

Unnamed: 0_level_0,id,x
Unnamed: 0_level_1,Int64⍰,Int64
1,missing,1
2,missing,5

Unnamed: 0_level_0,id,x
Unnamed: 0_level_1,Int64⍰,Int64
1,3,4


In [13]:
groupby(x, :id, sort=true, skipmissing=true) # but we can change both: sort the groups and drop the group of missing values

Unnamed: 0_level_0,id,x
Unnamed: 0_level_1,Int64⍰,Int64
1,1,3

Unnamed: 0_level_0,id,x
Unnamed: 0_level_1,Int64⍰,Int64
1,5,2


In [14]:
using Statistics # provides `mean`
# 100 rows with a random group label 'a'..'d' and a random value; `;` suppresses the output
x = DataFrame(id=rand('a':'d', 100), v=rand(100));
by(x, :id, :v=>mean) # apply a function to a column within each group of a data frame; the result column is auto-named v_mean

Unnamed: 0_level_0,id,v_mean
Unnamed: 0_level_1,Char,Float64
1,'b',0.558413
2,'c',0.442001
3,'a',0.522355
4,'d',0.502747


In [15]:
by(x, :id, :v=>mean, sort=true) # we can sort the output by the grouping column

Unnamed: 0_level_0,id,v_mean
Unnamed: 0_level_1,Char,Float64
1,'a',0.522355
2,'b',0.558413
3,'c',0.442001
4,'d',0.502747


In [16]:
by(x, :id, res=:v=>mean) # passing the operation as a keyword argument sets the name of the result column

Unnamed: 0_level_0,id,res
Unnamed: 0_level_1,Char,Float64
1,'b',0.558413
2,'c',0.442001
3,'a',0.522355
4,'d',0.502747


In [17]:
by(x, :id, res1=:v=>mean, res2=:v=>sum) # you can have multiple operations, each producing its own result column

Unnamed: 0_level_0,id,res1,res2
Unnamed: 0_level_1,Char,Float64,Float64
1,'b',0.558413,15.6356
2,'c',0.442001,9.72401
3,'a',0.522355,11.4918
4,'d',0.502747,14.0769


In [18]:
# 100 rows with a random group label 'a'..'d' and two random value columns
x = DataFrame(id=rand('a':'d', 100), x1=rand(100), x2=rand(100))
aggregate(x, :id, sum) # apply a function to every column other than id, within groups given by id

Unnamed: 0_level_0,id,x1_sum,x2_sum
Unnamed: 0_level_1,Char,Float64,Float64
1,'c',12.3439,12.8312
2,'a',8.39491,7.5936
3,'b',14.0745,17.1006
4,'d',13.3796,15.5769


In [19]:
aggregate(x, :id, sum, sort=true) # the result can also be sorted by the grouping column

Unnamed: 0_level_0,id,x1_sum,x2_sum
Unnamed: 0_level_1,Char,Float64,Float64
1,'a',8.39491,7.5936
2,'b',14.0745,17.1006
3,'c',12.3439,12.8312
4,'d',13.3796,15.5769


A new feature is the `mapcols` convenience function

In [20]:
# a 3x5 random matrix turned into a data frame; columns are auto-named x1..x5
x = DataFrame(rand(3, 5))

Unnamed: 0_level_0,x1,x2,x3,x4,x5
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Float64
1,0.873986,0.315663,0.720103,0.577715,0.241681
2,0.77242,0.714456,0.154359,0.661758,0.0682483
3,0.499805,0.555766,0.663409,0.678376,0.627233


In [21]:
mapcols(mean, x) # apply a function to each column; the result is again a data frame (here with one row)

Unnamed: 0_level_0,x1,x2,x3,x4,x5
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Float64
1,0.715404,0.528629,0.512624,0.639283,0.312388


In [22]:
map(mean, eachcol(x, false)) # map a function over the values of each column (without names) and return a vector

5-element Array{Float64,1}:
 0.7154036917288855
 0.528628590378507 
 0.5126235198745835
 0.6392830678401201
 0.3123876657450813

In [23]:
foreach(c -> println(c[1], ": ", mean(c[2])), eachcol(x, true)) # with `true`, each iteration yields a Pair of column name and column values

x1: 0.7154036917288855
x2: 0.528628590378507
x3: 0.5126235198745835
x4: 0.6392830678401201
x5: 0.3123876657450813


In [24]:
map(r -> r.x1/r.x2, eachrow(x)) # each element passed to the function is a DataFrameRow, which works similarly to a one-row DataFrame

3-element Array{Float64,1}:
 2.768727152127032 
 1.0811298965724347
 0.899308361719669 

In [25]:
er = eachrow(x) # the row iterator prints like a data frame; only the caption differs, so that you know the type of the object

Unnamed: 0_level_0,x1,x2,x3,x4,x5
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Float64
1,0.873986,0.315663,0.720103,0.577715,0.241681
2,0.77242,0.714456,0.154359,0.661758,0.0682483
3,0.499805,0.555766,0.663409,0.678376,0.627233


In [26]:
er.x1 # you can access columns of the parent data frame directly through the row iterator

3-element Array{Float64,1}:
 0.8739860966417328
 0.7724196235040937
 0.4998053550408301

In [27]:
ec = eachcol(x) # the column iterator prints like a data frame; only the caption differs, so that you know the type of the object

Unnamed: 0_level_0,x1,x2,x3,x4,x5
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Float64
1,0.873986,0.315663,0.720103,0.577715,0.241681
2,0.77242,0.714456,0.154359,0.661758,0.0682483
3,0.499805,0.555766,0.663409,0.678376,0.627233


In [28]:
ec.x1 # you can access columns of the parent data frame directly through the column iterator

3-element Array{Float64,1}:
 0.8739860966417328
 0.7724196235040937
 0.4998053550408301