# Mod16 Concatenate and Merge

## Simple Concatenation with pd.concat

In [1]:
import pandas as pd
import numpy as np

By default, concatenation takes place row-wise, i.e. the ``DataFrame`` objects are stacked along ``axis=0``.

In [2]:
# First demo frame: two string columns, rows labelled 1 and 2.
df1 = pd.DataFrame(
    {'A': ['A1', 'A2'],
     'B': ['B1', 'B2']},
    index=[1, 2],
)
df1

Unnamed: 0,A,B
1,A1,B1
2,A2,B2


In [9]:
# Second demo frame: same columns as df1, rows labelled 3 and 4.
df2 = pd.DataFrame(
    {'A': ['A3', 'A4'],
     'B': ['B3', 'B4']},
    index=[3, 4],
)
df2

Unnamed: 0,A,B
3,A3,B3
4,A4,B4


In [10]:
# Stack df2 underneath df1; axis=0 (row-wise) is the default, spelled out here.
pd.concat(objs=[df1, df2], axis=0)

Unnamed: 0,A,B
1,A1,B1
2,A2,B2
3,A3,B3
4,A4,B4


#### The append() method (deprecated in pandas 1.4, removed in 2.0 — use pd.concat)

In [11]:
# DataFrame.append() was deprecated in pandas 1.4 and removed in pandas 2.0,
# so `df1.append(df2)` raises AttributeError on current versions.
# pd.concat produces the same row-wise result and works on every version.
pd.concat([df1, df2])

Unnamed: 0,A,B
1,A1,B1
2,A2,B2
3,A3,B3
4,A4,B4


#### Duplicate indices

In [12]:
# Frame x: rows labelled 0 and 1 (the same labels y uses below).
x = pd.DataFrame(
    {'A': ['A0', 'A1'],
     'B': ['B0', 'B1']},
    index=[0, 1],
)
x

Unnamed: 0,A,B
0,A0,B0
1,A1,B1


In [13]:
# Frame y: different values from x but identical index labels 0 and 1.
y = pd.DataFrame(
    {'A': ['A2', 'A3'],
     'B': ['B2', 'B3']},
    index=[0, 1],
)
y

Unnamed: 0,A,B
0,A2,B2
1,A3,B3


In [19]:
# Label 0 is unique in y, so this selects a single row (returned as a Series).
y.loc[0, :]

A    A2
B    B2
Name: 0, dtype: object

In [28]:
# concat keeps each input's index labels, so 0 and 1 both appear twice in z.
z = pd.concat([x, y])
z

Unnamed: 0,A,B
0,A0,B0
1,A1,B1
0,A2,B2
1,A3,B3


In [35]:
# Label 0 occurs twice in z, so the lookup returns both matching rows.
z.loc[0, :]

Unnamed: 0,A,B
0,A0,B0
0,A2,B2


In [36]:
# Checkpoint: verify_integrity=True makes concat raise a ValueError when the
# inputs share index labels (x and y both use 0 and 1).
# The error is the point of this demo, so catch it and show the message
# instead of letting the exception abort a "Restart & Run All".
try:
    pd.concat([x, y], verify_integrity=True)
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: Indexes have overlapping values: Int64Index([0, 1], dtype='int64')

In [37]:
# Discard the original labels and number the combined rows 0..n-1.
# (Original author's caveat: not suggested for big data.)
pd.concat(objs=[x, y], ignore_index=True)

Unnamed: 0,A,B
0,A0,B0
1,A1,B1
2,A2,B2
3,A3,B3


In [None]:
# Hierarchical indexing: tag the rows of each input with an outer key
# ('x' or 'y') so the repeated inner labels stay distinguishable.
pd.concat(
    objs=[x, y],
    keys=['x', 'y'],
)

## Data Merge

#### One-to-One

In [38]:
# Employee -> group table, written one record per row.
# Note: this re-binds the name df1 used in the concat section above.
df1 = pd.DataFrame(
    [('Bob', 'Accounting'),
     ('Jake', 'Engineering'),
     ('Lisa', 'Engineering'),
     ('Sue', 'HR')],
    columns=['employee', 'group'],
)
df1

Unnamed: 0,employee,group
0,Bob,Accounting
1,Jake,Engineering
2,Lisa,Engineering
3,Sue,HR


In [39]:
# Employee -> hire year table; rows are in a different order than df1.
# Note: this re-binds the name df2 used in the concat section above.
df2 = pd.DataFrame(dict(
    employee=['Lisa', 'Bob', 'Jake', 'Sue'],
    hire_date=[2004, 2008, 2012, 2014],
))
df2

Unnamed: 0,employee,hire_date
0,Lisa,2004
1,Bob,2008
2,Jake,2012
3,Sue,2014


In [40]:
# One-to-one join; with no key given, merge uses the columns the two frames
# have in common (here only 'employee').
df1.merge(df2)

Unnamed: 0,employee,group,hire_date
0,Bob,Accounting,2008
1,Jake,Engineering,2012
2,Lisa,Engineering,2004
3,Sue,HR,2014


In [41]:
# Same join as above, but naming the key column explicitly via `on`.
df1.merge(df2, on='employee')

Unnamed: 0,employee,group,hire_date
0,Bob,Accounting,2008
1,Jake,Engineering,2012
2,Lisa,Engineering,2004
3,Sue,HR,2014


In [42]:
# Salary table whose key column is called 'name' rather than 'employee'.
df3 = pd.DataFrame(dict(
    name=['Bob', 'Jake', 'Lisa', 'Sue'],
    salary=[70000, 80000, 120000, 90000],
))
df3

Unnamed: 0,name,salary
0,Bob,70000
1,Jake,80000
2,Lisa,120000
3,Sue,90000


In [44]:
# The key column is named differently in each frame, so pair them up with
# the `left_on` and `right_on` arguments, then drop the redundant 'name'
# column that the merge carries over from df3.
(
    df1
    .merge(df3, left_on="employee", right_on="name")
    .drop(columns='name')
)

Unnamed: 0,employee,group,salary
0,Bob,Accounting,70000
1,Jake,Engineering,80000
2,Lisa,Engineering,120000
3,Sue,HR,90000


#### Many-to-One

In [45]:
# Group -> supervisor lookup table, one row per group.
df4 = pd.DataFrame(
    [('Accounting', 'Carly'),
     ('Engineering', 'Guido'),
     ('HR', 'Steve')],
    columns=['group', 'supervisor'],
)
df4

Unnamed: 0,group,supervisor
0,Accounting,Carly
1,Engineering,Guido
2,HR,Steve


In [46]:
# Keep the employee/group/hire_date join under its own name for the next step.
df5 = df1.merge(df2)
df5

Unnamed: 0,employee,group,hire_date
0,Bob,Accounting,2008
1,Jake,Engineering,2012
2,Lisa,Engineering,2004
3,Sue,HR,2014


In [47]:
# Many-to-one: several employees share a group, so each group's supervisor
# is repeated on every matching employee row.
df5.merge(df4)

Unnamed: 0,employee,group,hire_date,supervisor
0,Bob,Accounting,2008,Carly
1,Jake,Engineering,2012,Guido
2,Lisa,Engineering,2004,Guido
3,Sue,HR,2014,Steve


#### Many-to-Many

In [48]:
# Redisplay df1 (employee -> group) ahead of the many-to-many merge below.
df1

Unnamed: 0,employee,group
0,Bob,Accounting
1,Jake,Engineering
2,Lisa,Engineering
3,Sue,HR


In [49]:
# Group -> skill table; every group is listed twice, once per skill.
df6 = pd.DataFrame(
    [('Accounting', 'math'),
     ('Accounting', 'spreadsheets'),
     ('Engineering', 'coding'),
     ('Engineering', 'linux'),
     ('HR', 'spreadsheets'),
     ('HR', 'organization')],
    columns=['group', 'skills'],
)
df6

Unnamed: 0,group,skills
0,Accounting,math
1,Accounting,spreadsheets
2,Engineering,coding
3,Engineering,linux
4,HR,spreadsheets
5,HR,organization


In [50]:
# Many-to-many: 'group' repeats on both sides, so each employee row is
# paired with every skill row of the same group.
pd.merge(left=df1, right=df6)

Unnamed: 0,employee,group,skills
0,Bob,Accounting,math
1,Bob,Accounting,spreadsheets
2,Jake,Engineering,coding
3,Jake,Engineering,linux
4,Lisa,Engineering,coding
5,Lisa,Engineering,linux
6,Sue,HR,spreadsheets
7,Sue,HR,organization
