In [1]:
import pandas as pd

In [2]:
# Left-hand frame: five rows, each with a distinct ID.
df1 = pd.DataFrame(
    {
        'ID': [1, 2, 3, 5, 9],
        'Col_1': [1, 2, 3, 4, 5],
        'Col_2': [6, 7, 8, 9, 10],
        'Col_3': [11, 12, 13, 14, 15],
        'Col_4': ['apple', 'orange', 'banana', 'strawberry', 'raspberry'],
    }
)

# Right-hand frame: four rows; ID 1 appears twice and Col_4 is shared with df1.
df2 = pd.DataFrame(
    {
        'ID': [1, 1, 3, 5],
        'Col_A': [8, 9, 10, 11],
        'Col_B': [12, 13, 15, 17],
        'Col_4': ['apple', 'orange', 'banana', 'kiwi'],
        # Alternative numeric Col_4 used by a later experiment; swap it in for
        # the string version above when that cell asks for it.
        # 'Col_4': [1, 2, 3, 4],
    }
)

In [3]:
df1

Unnamed: 0,ID,Col_1,Col_2,Col_3,Col_4
0,1,1,6,11,apple
1,2,2,7,12,orange
2,3,3,8,13,banana
3,5,4,9,14,strawberry
4,9,5,10,15,raspberry


In [4]:
df2

Unnamed: 0,ID,Col_A,Col_B,Col_4
0,1,8,12,apple
1,1,9,13,orange
2,3,10,15,banana
3,5,11,17,kiwi


# pd.merge()
* For combining data on common columns
* Most flexible, but also the most complex, of the methods we'll discuss
* many-to-one and many-to-many joins are possible
* Side-by-side merge

In [5]:
# We need to tell merge which column(s) to merge on.
# Merging adds more columns to the DataFrame, i.e. it places the two DataFrames side by side.
# We only keep the rows that are common to both DataFrames (inner join - this is the default).
# In this example df1 is the left DataFrame and df2 is the right one (this can be changed by switching the argument order).
# If you pass the index to left_on and right_on you get an extra key_0 column; this can be avoided with left_index=True, right_index=True.
# One-to-many: the multiple ID-1 rows in df2 are merged with the single ID-1 row in df1 (df1 repeats its one row to match each of df2's).

In [6]:
# With no `on` argument, merge joins on every column name the two frames share
# (here both ID and Col_4) and uses the default inner join, so only rows that
# match on both columns are kept.
inner = pd.merge(df1, df2)
inner

Unnamed: 0,ID,Col_1,Col_2,Col_3,Col_4,Col_A,Col_B
0,1,1,6,11,apple,8,12
1,3,3,8,13,banana,10,15


In [7]:
# It's a good idea to specify `on` explicitly.
# Joining on ID alone leaves Col_4 as an ordinary column in both frames, so it
# comes back twice, distinguished by the default _x / _y suffixes.
pd.merge(df1, df2, on='ID')

Unnamed: 0,ID,Col_1,Col_2,Col_3,Col_4_x,Col_A,Col_B,Col_4_y
0,1,1,6,11,apple,8,12,apple
1,1,1,6,11,apple,9,13,orange
2,3,3,8,13,banana,10,15,banana
3,5,4,9,14,strawberry,11,17,kiwi


In [8]:
# this is the same as the first merge because we are specifying both shared columns
pd.merge(df1, df2, on=['ID','Col_4'])

Unnamed: 0,ID,Col_1,Col_2,Col_3,Col_4,Col_A,Col_B
0,1,1,6,11,apple,8,12
1,3,3,8,13,banana,10,15


In [9]:
# Suffixes & differently named key columns:
# left_on / right_on join df1.Col_2 against df2.Col_A, and `suffixes` replaces
# the default _x / _y on the column names both frames share (ID and Col_4).
pd.merge(df1, df2, suffixes=['_l','_r'], left_on='Col_2', right_on='Col_A')

Unnamed: 0,ID_l,Col_1,Col_2,Col_3,Col_4_l,ID_r,Col_A,Col_B,Col_4_r
0,3,3,8,13,banana,1,8,12,apple
1,5,4,9,14,strawberry,1,9,13,orange
2,9,5,10,15,raspberry,3,10,15,banana


In [10]:
# Join on the row index of both frames instead of on a column.
pd.merge(df1, df2, suffixes=['_l','_r'], left_index=True, right_index=True)

Unnamed: 0,ID_l,Col_1,Col_2,Col_3,Col_4_l,ID_r,Col_A,Col_B,Col_4_r
0,1,1,6,11,apple,1,8,12,apple
1,2,2,7,12,orange,1,9,13,orange
2,3,3,8,13,banana,3,10,15,banana
3,5,4,9,14,strawberry,5,11,17,kiwi


#### INNER
in an inner join, you will lose rows that don’t have a match in the other DataFrame’s key column. <br>
#### OUTER
in an outer join (also known as a full outer join), all rows from both DataFrames will be present in the new DataFrame.<br>
#### LEFT
Using a left outer join will leave your new merged DataFrame with all rows from the left DataFrame, while discarding rows from the right DataFrame that don’t have a match in the key column of the left DataFrame.
<br>
#### RIGHT
The right join (or right outer join) is the mirror-image version of the left join. With this join, all rows from the right DataFrame will be retained, while rows in the left DataFrame without a match in the key column of the right DataFrame will be discarded.


In [11]:
# Changing the type of merge with `how`:
# besides the default 'inner' you can also use 'outer', 'left' and 'right'.
# 'outer' includes all rows from both frames and fills missing values with NaN
# (note that the integer columns are displayed as floats once NaN appears).

pd.merge(df1, df2, on='Col_4', how='outer', suffixes=['_l','_r'],)

Unnamed: 0,ID_l,Col_1,Col_2,Col_3,Col_4,ID_r,Col_A,Col_B
0,1.0,1.0,6.0,11.0,apple,1.0,8.0,12.0
1,2.0,2.0,7.0,12.0,orange,1.0,9.0,13.0
2,3.0,3.0,8.0,13.0,banana,3.0,10.0,15.0
3,5.0,4.0,9.0,14.0,strawberry,,,
4,9.0,5.0,10.0,15.0,raspberry,,,
5,,,,,kiwi,5.0,11.0,17.0


In [12]:
# how='left' keeps every row of df1; df1 rows with no Col_4 match in df2 get NaN
# in the df2 columns, and df2-only keys (kiwi) are dropped.
pd.merge(df1, df2, on='Col_4', how='left', suffixes=['_l','_r'],)

Unnamed: 0,ID_l,Col_1,Col_2,Col_3,Col_4,ID_r,Col_A,Col_B
0,1,1,6,11,apple,1.0,8.0,12.0
1,2,2,7,12,orange,1.0,9.0,13.0
2,3,3,8,13,banana,3.0,10.0,15.0
3,5,4,9,14,strawberry,,,
4,9,5,10,15,raspberry,,,


In [13]:
# how='right' keeps every row of df2; df2 rows with no Col_4 match in df1 (kiwi)
# get NaN in the df1 columns, and df1-only keys are dropped.
pd.merge(df1, df2, on='Col_4', how='right', suffixes=['_l','_r'],)

Unnamed: 0,ID_l,Col_1,Col_2,Col_3,Col_4,ID_r,Col_A,Col_B
0,1.0,1.0,6.0,11.0,apple,1,8,12
1,2.0,2.0,7.0,12.0,orange,1,9,13
2,3.0,3.0,8.0,13.0,banana,3,10,15
3,,,,,kiwi,5,11,17


In [19]:
# We get a KeyError because Col_1 is not in both DataFrames
# pd.merge(df1, df2, on = 'Col_1' )    

In [16]:
# Experiment: in df2, comment out the string Col_4 and use the numeric version
# instead, then re-run this merge to see what happens when the key dtypes differ.
pd.merge(df1, df2, on = 'Col_4' ) 

Unnamed: 0,ID_x,Col_1,Col_2,Col_3,Col_4,ID_y,Col_A,Col_B
0,1,1,6,11,apple,1,8,12
1,2,2,7,12,orange,1,9,13
2,3,3,8,13,banana,3,10,15


In [18]:
# Make sure to remove `on` when you use left_on / right_on (do not pass both, as in the call below)
# pd.merge(df1, df2, on='ID', suffixes=['_l','_r'], left_on='Col_2', right_on='Col_A')

* MergeError: No common columns to perform merge on. Merge options: left_on=None, right_on=None, left_index=False, right_index=False

# df.join()
* Anything you can do with .join you can do with .merge

In [20]:
# The default join type is 'left'.
# `on` is optional (without it the join is index-to-index), but be explicit about it.
# Careful: df.join always matches against the *index* of the other frame. `on='ID'`
# only names the df1 column that supplies the keys, so df1.ID is looked up in df2's
# row index (0-3), not in df2.ID -- which is why ID 2 is paired with df2's row 2
# (banana) in the output. To match on df2's ID column, join against
# df2.set_index('ID') or use pd.merge(df1, df2, on='ID').
# lsuffix / rsuffix disambiguate ID and Col_4, which exist in both frames.
df1.join(df2, on='ID', lsuffix='_l',rsuffix='_r')

Unnamed: 0,ID_l,Col_1,Col_2,Col_3,Col_4_l,ID_r,Col_A,Col_B,Col_4_r
0,1,1,6,11,apple,1.0,9.0,13.0,orange
1,2,2,7,12,orange,3.0,10.0,15.0,banana
2,3,3,8,13,banana,5.0,11.0,17.0,kiwi
3,5,4,9,14,strawberry,,,,
4,9,5,10,15,raspberry,,,,


In [21]:
# Same join with how='inner': only df1 rows whose ID value is found in df2's
# row index are kept (IDs 1, 2 and 3 here).
df1.join(df2, on='ID', how='inner', lsuffix='_l',rsuffix='_r')

Unnamed: 0,ID,ID_l,Col_1,Col_2,Col_3,Col_4_l,ID_r,Col_A,Col_B,Col_4_r
0,1,1,1,6,11,apple,1,9,13,orange
1,2,2,2,7,12,orange,3,10,15,banana
2,3,3,3,8,13,banana,5,11,17,kiwi


# pd.concat()
* good for side by side and stacking on top of each other


Concatenation is a bit different from the merging techniques you saw above. With merging, you can expect the resulting dataset to have rows from the parent datasets mixed in together, often based on some commonality. Depending on the type of merge, you might also lose rows that don’t have matches in the other dataset.

With concatenation, your datasets are just stitched together along an axis — either the row axis or column axis.

https://realpython.com/pandas-merge-join-and-concat/

In [22]:
# default axis is 0 (stack)
pd.concat([df1, df2])

Unnamed: 0,ID,Col_1,Col_2,Col_3,Col_4,Col_A,Col_B
0,1,1.0,6.0,11.0,apple,,
1,2,2.0,7.0,12.0,orange,,
2,3,3.0,8.0,13.0,banana,,
3,5,4.0,9.0,14.0,strawberry,,
4,9,5.0,10.0,15.0,raspberry,,
0,1,,,,apple,8.0,12.0
1,1,,,,orange,9.0,13.0
2,3,,,,banana,10.0,15.0
3,5,,,,kiwi,11.0,17.0


In [23]:
# reset the index
pd.concat([df1, df2], ignore_index=True)

Unnamed: 0,ID,Col_1,Col_2,Col_3,Col_4,Col_A,Col_B
0,1,1.0,6.0,11.0,apple,,
1,2,2.0,7.0,12.0,orange,,
2,3,3.0,8.0,13.0,banana,,
3,5,4.0,9.0,14.0,strawberry,,
4,9,5.0,10.0,15.0,raspberry,,
5,1,,,,apple,8.0,12.0
6,1,,,,orange,9.0,13.0
7,3,,,,banana,10.0,15.0
8,5,,,,kiwi,11.0,17.0


In [24]:
# Concatenate side by side by specifying axis=1.
pd.concat([df1, df2], axis=1)

# Note: concatenating along columns (axis=1) aligns rows by index label.
#     If the indices differ between the datasets, then by default the extra
#     index labels (rows) are also added and NaN values are filled in as
#     applicable (see row 4 in the output, which exists only in df1),
# so no data is lost by default.

Unnamed: 0,ID,Col_1,Col_2,Col_3,Col_4,ID.1,Col_A,Col_B,Col_4.1
0,1,1,6,11,apple,1.0,8.0,12.0,apple
1,2,2,7,12,orange,1.0,9.0,13.0,orange
2,3,3,8,13,banana,3.0,10.0,15.0,banana
3,5,4,9,14,strawberry,5.0,11.0,17.0,kiwi
4,9,5,10,15,raspberry,,,,


In [25]:
# ignore_index=True resets the labels along the concatenation axis. With axis=1
# that means the column names are replaced by 0..8; the row index is unchanged.
pd.concat([df1, df2], axis=1, ignore_index=True)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,1,1,6,11,apple,1.0,8.0,12.0,apple
1,2,2,7,12,orange,1.0,9.0,13.0,orange
2,3,3,8,13,banana,3.0,10.0,15.0,banana
3,5,4,9,14,strawberry,5.0,11.0,17.0,kiwi
4,9,5,10,15,raspberry,,,,


In [26]:
# The default join is 'outer'; with join='inner' only the index labels present in
# both frames are kept, so rows without a counterpart are dropped from whichever
# frame has them (here df1's row 4, which has no matching label in df2).
pd.concat([df1, df2], axis=1, join = 'inner')

Unnamed: 0,ID,Col_1,Col_2,Col_3,Col_4,ID.1,Col_A,Col_B,Col_4.1
0,1,1,6,11,apple,1,8,12,apple
1,2,2,7,12,orange,1,9,13,orange
2,3,3,8,13,banana,3,10,15,banana
3,5,4,9,14,strawberry,5,11,17,kiwi


In [27]:
# Stacking (axis=0) with join='inner' keeps only the columns common to both
# frames (ID and Col_4); every row from both frames is retained.
pd.concat([df1, df2], axis=0, join = 'inner')

Unnamed: 0,ID,Col_4
0,1,apple
1,2,orange
2,3,banana
3,5,strawberry
4,9,raspberry
0,1,apple
1,1,orange
2,3,banana
3,5,kiwi


# df.append()

In [28]:
# DataFrame.append was deprecated in pandas 1.4 (it emitted a FutureWarning) and
# removed in pandas 2.0. pd.concat is the supported replacement and stacks the
# frames row-wise just as append did: all columns are kept and gaps become NaN.
pd.concat([df1, df2])

  df1.append(df2)


Unnamed: 0,ID,Col_1,Col_2,Col_3,Col_4,Col_A,Col_B
0,1,1.0,6.0,11.0,apple,,
1,2,2.0,7.0,12.0,orange,,
2,3,3.0,8.0,13.0,banana,,
3,5,4.0,9.0,14.0,strawberry,,
4,9,5.0,10.0,15.0,raspberry,,
0,1,,,,apple,8.0,12.0
1,1,,,,orange,9.0,13.0
2,3,,,,banana,10.0,15.0
3,5,,,,kiwi,11.0,17.0


In [29]:
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat is the supported replacement.
# sort=True sorts the non-concatenation axis, so the column labels come back in
# alphabetical order (Col_1 ... Col_B, ID).
pd.concat([df1, df2], sort=True)

  df1.append(df2, sort=True)


Unnamed: 0,Col_1,Col_2,Col_3,Col_4,Col_A,Col_B,ID
0,1.0,6.0,11.0,apple,,,1
1,2.0,7.0,12.0,orange,,,2
2,3.0,8.0,13.0,banana,,,3
3,4.0,9.0,14.0,strawberry,,,5
4,5.0,10.0,15.0,raspberry,,,9
0,,,,apple,8.0,12.0,1
1,,,,orange,9.0,13.0,1
2,,,,banana,10.0,15.0,3
3,,,,kiwi,11.0,17.0,5


https://realpython.com/pandas-merge-join-and-concat/