# Introduction to DataFrames
**[Bogumił Kamiński](http://bogumilkaminski.pl/about/), October 4, 2017**

In [1]:
using DataFrames # load package

## Joining DataFrames

### Preparing DataFrames for a join

In [2]:
# Left table for the join examples; its key column `ID` contains one `missing` entry.
x = DataFrame(
    ID = [1, 2, 3, 4, missing],
    name = ["Alice", "Bob", "Conor", "Dave", "Zed"]
)

Unnamed: 0_level_0,ID,name
Unnamed: 0_level_1,Int64⍰,String
1,1,Alice
2,2,Bob
3,3,Conor
4,4,Dave
5,missing,Zed


In [3]:
# Right table for the join examples; its key column is spelled `id` (lowercase)
# and also contains one `missing` entry.
y = DataFrame(
    id = [1, 2, 5, 6, missing],
    age = [21, 22, 23, 24, 99]
)

Unnamed: 0_level_0,id,age
Unnamed: 0_level_1,Int64⍰,Int64
1,1,21
2,2,22
3,5,23
4,6,24
5,missing,99


In [4]:
# The columns we join on must have the same name in both tables,
# so rename `ID` to `id` in `x` (the `!` variant changes `x` itself).
rename!(x, :ID => :id)

Unnamed: 0_level_0,id,name
Unnamed: 0_level_1,Int64⍰,String
1,1,Alice
2,2,Bob
3,3,Conor
4,4,Dave
5,missing,Zed


### Standard joins: inner, left, right, outer, semi, anti

In [5]:
# No `kind` is given, so this is an :inner join: only ids present in both tables
# are kept. Note that the rows whose id is `missing` are matched with each other.
join(x, y; on = :id)

Unnamed: 0_level_0,id,name,age
Unnamed: 0_level_1,Int64⍰,String,Int64
1,1,Alice,21
2,2,Bob,22
3,missing,Zed,99


In [6]:
# :left join: every row of `x` is kept; `age` is `missing` where `y` has no matching id.
join(x, y; on = :id, kind = :left)

Unnamed: 0_level_0,id,name,age
Unnamed: 0_level_1,Int64⍰,String,Int64⍰
1,1,Alice,21
2,2,Bob,22
3,3,Conor,missing
4,4,Dave,missing
5,missing,Zed,99


In [7]:
# :right join: every row of `y` is kept; `name` is `missing` where `x` has no matching id.
join(x, y; on = :id, kind = :right)

Unnamed: 0_level_0,id,name,age
Unnamed: 0_level_1,Int64⍰,String⍰,Int64
1,1,Alice,21
2,2,Bob,22
3,missing,Zed,99
4,5,missing,23
5,6,missing,24


In [8]:
# :outer join: rows from both tables are kept; columns coming from the side
# without a matching id are filled with `missing`.
join(x, y; on = :id, kind = :outer)

Unnamed: 0_level_0,id,name,age
Unnamed: 0_level_1,Int64⍰,String⍰,Int64⍰
1,1,Alice,21
2,2,Bob,22
3,3,Conor,missing
4,4,Dave,missing
5,missing,Zed,99
6,5,missing,23
7,6,missing,24


In [9]:
# :semi join: the rows of `x` whose id also occurs in `y`; only the columns of `x` are returned.
join(x, y; on = :id, kind = :semi)

Unnamed: 0_level_0,id,name
Unnamed: 0_level_1,Int64⍰,String
1,1,Alice
2,2,Bob
3,missing,Zed


In [10]:
# :anti join: the rows of `x` whose id does NOT occur in `y`; only the columns of `x` are returned.
join(x, y; on = :id, kind = :anti)

Unnamed: 0_level_0,id,name
Unnamed: 0_level_1,Int64⍰,String
1,3,Conor
2,4,Dave


### Cross join

In [11]:
# :cross join: no `on` key is given; every row of the first table is paired
# with every row of the second (2 × 3 = 6 rows).
join(
    DataFrame(x = [1, 2]),
    DataFrame(y = ["a", "b", "c"]);
    kind = :cross
)

Unnamed: 0_level_0,x,y
Unnamed: 0_level_1,Int64,String
1,1,a
2,1,b
3,1,c
4,2,a
5,2,b
6,2,c


### Complex cases of joins

In [12]:
# Rebind `x` to a new left table with a two-column key (`id1`, `id2`).
# `id1` contains repeated values, and both key columns contain `missing`.
x = DataFrame(
    id1 = [1, 1, 2, 2, missing, missing],
    id2 = [1, 11, 2, 21, missing, 99],
    name = ["Alice", "Bob", "Conor", "Dave", "Zed", "Zoe"]
)

Unnamed: 0_level_0,id1,id2,name
Unnamed: 0_level_1,Int64⍰,Int64⍰,String
1,1,1,Alice
2,1,11,Bob
3,2,2,Conor
4,2,21,Dave
5,missing,missing,Zed
6,missing,99,Zoe


In [13]:
# Rebind `y` to a new right table with the same two key columns (`id1`, `id2`).
# `id1` again contains repeated values, and both key columns contain `missing`.
y = DataFrame(
    id1 = [1, 1, 3, 3, missing, missing],
    id2 = [11, 1, 31, 3, missing, 999],
    age = [21, 22, 23, 24, 99, 100]
)

Unnamed: 0_level_0,id1,id2,age
Unnamed: 0_level_1,Int64⍰,Int64⍰,Int64
1,1,11,21
2,1,1,22
3,3,31,23
4,3,3,24
5,missing,missing,99
6,missing,999,100


In [14]:
# Joining on two columns: rows match only when both `id1` and `id2` agree
# (an :inner join, since no `kind` is given).
join(x, y; on = [:id1, :id2])

Unnamed: 0_level_0,id1,id2,name,age
Unnamed: 0_level_1,Int64⍰,Int64⍰,String,Int64
1,1,1,Alice,22
2,1,11,Bob,21
3,missing,missing,Zed,99


In [15]:
# `id1` is duplicated in both tables, so all combinations of matching rows are produced.
# `makeunique=true` lets the clashing `id2` column from `y` appear as `id2_1`, and
# `indicator=:source` adds a `source` column telling where each row came from.
join(
    x, y;
    on = [:id1],
    makeunique = true,
    kind = :outer,
    indicator = :source
)

Unnamed: 0_level_0,id1,id2,name,id2_1,age,source
Unnamed: 0_level_1,Int64⍰,Int64⍰,String⍰,Int64⍰,Int64⍰,Categorical…
1,1,1,Alice,11,21,both
2,1,1,Alice,1,22,both
3,1,11,Bob,11,21,both
4,1,11,Bob,1,22,both
5,2,2,Conor,missing,missing,left_only
6,2,21,Dave,missing,missing,left_only
7,missing,missing,Zed,missing,99,both
8,missing,missing,Zed,999,100,both
9,missing,99,Zoe,missing,99,both
10,missing,99,Zoe,999,100,both


In [16]:
# You can force validation of the uniqueness of the key on which you join:
# `validate=(true,true)` checks the key in both tables. Here `id1` is duplicated
# in `x` and in `y`, so `join` throws an ArgumentError. The error is the point of
# this cell, so it is caught and printed instead of aborting "Restart & Run All".
try
    join(x, y; on = [:id1], makeunique = true, validate = (true, true))
catch err
    err isa ArgumentError || rethrow()  # handle only the expected validation failure
    showerror(stdout, err)
end

ArgumentError: ArgumentError: Merge key(s) are not unique in both df1 and df2. First duplicate in df1 at 2. First duplicate in df2 at 2

In [17]:
# Unlike the other join kinds, a :semi join does not produce all combinations for
# duplicated keys (that would duplicate rows of `x`): each matching row of `x`
# appears exactly once, even though `id1` is repeated in `y`.
join(x, y; on = [:id1], kind = :semi)

Unnamed: 0_level_0,id1,id2,name
Unnamed: 0_level_1,Int64⍰,Int64⍰,String
1,1,1,Alice
2,1,11,Bob
3,missing,missing,Zed
4,missing,99,Zoe
