# Concatenating, Appending, Joining DataFrames

This is a complex topic.  The examples below just demonstrate the basics -- you will need to do more investigation and/or experimentation when you have real work to do.

In [None]:
import numpy as np
import pandas as pd
np.__version__, pd.__version__

In [None]:
# First example frame: integer columns "A" and "B" on the default 0, 1, 2 index.
A = pd.DataFrame(
    {
        "A": [1, 2, 3],
        "B": [4, 5, 6],
    }
)
A

In [None]:
# Second example frame: same row labels as A, but different column names ("C", "D").
B = pd.DataFrame(
    {
        "C": [1, 2, 3],
        "D": [4, 5, 6],
    }
)
B

In [None]:
# Careful ... indices don't have to be unique -- confusing.
# Stacking A and B row-wise (the default, axis=0) keeps each frame's own
# 0, 1, 2 labels, so every label appears twice in the result.  A and B share
# no column names, so the result has the union of columns (A, B, C, D) with
# NaN wherever a frame had no such column.
pd.concat([A, B])

In [None]:
# try extracting row '1' - using the dictionary-type (explicit index) method
# Label 1 occurs twice in the concatenated index, so .loc[1] returns both
# matching rows rather than a single row.
pd.concat([A,B]).loc[1]

In [None]:
# or row '1' - using Python implicit numbering style
# .iloc[1] selects by position, so it returns exactly one row (the second
# one) no matter how often a label repeats.
pd.concat([A,B]).iloc[1]

In [None]:
# Catching repeats
# verify_integrity=True makes concat raise a ValueError when the combined
# index would contain duplicate labels, instead of silently keeping them.
# We didn't go over exception handling -- look up 'try/except' for details :-)
try:
    pd.concat([A, B], verify_integrity=True)
except ValueError as e:
    print("ValueError:", e)

In [None]:
# if indexes don't matter -- ignore them and create a new
# explicit index with the combined dataframe
# (ignore_index=True discards the original row labels and numbers the
# six stacked rows 0 through 5)
pd.concat([A, B], ignore_index=True)

In [None]:
# Now the explicit index matches the implicit index:
# the row labels are simply the positions 0 through 5, with no repeats.
pd.concat([A, B], ignore_index=True).index

In [None]:
# specify that the concatenation should happen along axis 1
# The frames are placed side by side and rows are aligned on their index
# labels.  A and B share the labels 0, 1, 2, so the result has three rows
# and the columns A, B, C, D.
pd.concat([A, B], axis=1)


In [None]:
# Third example frame: same column names as A ("A", "B"), different values.
C = pd.DataFrame(
    {
        "A": [7, 8, 9],
        "B": [10, 11, 12],
    }
)
C

In [None]:
# A and C have identical column names, so stacking them row-wise produces
# no missing values -- but the row labels 0, 1, 2 still appear twice.
pd.concat([A,C])

In [None]:
# Same row-wise stack, but with the rows relabelled 0 through 5.
pd.concat([A,C], ignore_index=True)

In [None]:
# Side by side: the column labels are now the ones that repeat
# (A, B, A, B) -- column labels don't have to be unique either.
pd.concat([A,C], axis=1)

In [None]:
# Because two columns are labelled 'B', selecting 'B' returns both of them
# (a DataFrame) rather than a single column.
pd.concat([A,C], axis=1)['B']

In [None]:
# ignore_index applies to the axis being concatenated: with axis=1 it is
# the column labels that are replaced, here by 0, 1, 2, 3.
pd.concat([A,C], axis=1, ignore_index=True)

In [None]:
# Frame with B's column names ("C", "D") but string row labels 'x', 'y', 'z'.
D = pd.DataFrame(
    {
        "C": [7, 8, 9],
        "D": [10, 11, 12],
    },
    index=['x', 'y', 'z'],
)
D

In [None]:
# Frame sharing one column name with A ("A") and one with D ("D"),
# on the same string row labels as D.
E = pd.DataFrame(
    {
        "A": [7, 8, 9],
        "D": [10, 11, 12],
    },
    index=['x', 'y', 'z'],
)
E

In [None]:
# A and D share neither column names nor row labels: the row-wise stack has
# six uniquely labelled rows (0, 1, 2, x, y, z) and columns A, B, C, D,
# with NaN wherever a frame had no such column.
pd.concat([A,D])

In [None]:
# Side by side, rows are aligned on their labels.  No label is common to
# A and D, so every row of the result is NaN in either A's or D's columns.
pd.concat([A,D], axis=1)

In [None]:
# Row-wise stack with the mixed labels (0, 1, 2, x, y, z) replaced by 0..5.
pd.concat([A,D], ignore_index=True)

In [None]:
# With axis=1, ignore_index renumbers the columns (0..3); the row labels
# are still aligned by value and are left untouched.
pd.concat([A,D], axis=1, ignore_index=True)

In [None]:
# concat accepts any number of frames: all five are stacked row-wise
# (3 rows each, 15 in total) under the union of their columns, A, B, C, D.
pd.concat([A,B,C,D,E])

In [None]:
# All five frames side by side: every frame contributes its own columns
# (so labels repeat), and rows are aligned across the labels 0, 1, 2 and
# x, y, z.
pd.concat([A,B,C,D,E], axis=1)

## Joins

Joins implement a subset of <em>relational algebra</em>.  If you have a database background, you will recognize this topic.  If not, we'll do a quick run-through here and will provide some links/references for further information.

In [None]:
# Left-hand table for the join examples: key column 'A' plus a value column 'B'
x = pd.DataFrame(
    {
        'A': [100, 200, 400],
        'B': [1, 3, 4],
    }
)
x

In [None]:
# Second table: it shares the key column 'A' with x and adds a value column 'C'
y = pd.DataFrame(
    {
        'A': [100, 200, 900, 1000],
        'C': [18, 13, 12, 22],
    }
)
y

In [None]:
# Concatenate - try with axis = 0, 1
# axis=0 stacks the rows of x on top of the rows of y; values in 'A' are
# not matched up in any way.  sort=False asks pandas not to sort the other
# axis (here, the column labels).
pd.concat([x, y], sort=False, axis=0)

In [None]:
# axis=1 places x and y side by side, aligned on row label only: the result
# has two separate 'A' columns, and x's columns are NaN for the one row
# label (3) that only y has.
pd.concat([x, y], sort=False, axis=1)

In [None]:
# With a join, we want something different -- we want to join rows that have a common
# key value so that we can use columns from both sets.
#
# Default merge - include only rows whose values in the shared column(s) match
# (an 'inner join').  With no on= argument, merge joins on the columns the two
# frames have in common -- here just 'A'.
pd.merge(x, y)

In [None]:
# try inner, left, right, outer joins
# how= selects which key values survive: "inner" keeps keys present in both
# frames, "left" / "right" keep every key from that side, and "outer" keeps
# keys from either side (missing values are filled with NaN).
pd.merge(x, y, how="inner")

In [None]:
# Note that when you flip the order of arguments, x and y, you flip the mapping of left and right
# Here x is the right-hand frame, so how="right" keeps every row of x
# (A = 100, 200, 400); y's column C is NaN where y has no matching key.
pd.merge(y, x, how="right")

In [None]:
# Note that there is also a dataframe.merge() method
x.merge(y, how="left")
# https://stackoverflow.com/questions/62143713/difference-between-pd-merge-and-dataframe-merge
# pd.merge() is a wrapping style function and dataframe.merge() is a chaining style function.
# Another interesting description:
# https://towardsdatascience.com/the-unreasonable-effectiveness-of-method-chaining-in-pandas-15c2109e3c69

### Multi-table joins

In [None]:
# Third table for the multi-table join: key column 'A' plus a value column 'D'
z = pd.DataFrame(
    {
        'A': [100, 400, 1000, 2000],
        'D': [9, 6, 4, 18],
    }
)
z

In [None]:
# again, test with inner, left, right, outer -- in all combinations
# The inner merge of x and y is itself a DataFrame, so it can be merged
# with z in turn.  With the z defined just above, A == 100 is the only key
# present in all three tables, so inner/inner leaves a single row.
pd.merge(pd.merge(x, y, how="inner"), z, how="inner")

In [None]:
# note the chaining version
# Same two joins as the nested pd.merge() calls, written left to right.
x.merge(y, how="inner").merge(z, how="inner")

### Many-to-one relationships

In [None]:
# What happens when a key value occurs more than once?
# Redefine z (key column 'A', value column 'D') so that several of its
# records carry the same A value.
z = pd.DataFrame(
    {
        'A': [100, 100, 100, 2700, 100],
        'D': [9, 6, 4, 18, 16],
    }
)
z

In [None]:
# Reminder of the left-hand side of the next join: the inner merge of x and
# y has one row per key value (A == 100 and A == 200).
pd.merge(x, y, how="inner")

In [None]:
# Many-to-one relationship
# With the z defined just above, four rows of z have A == 100.  Each is
# paired with the single A == 100 row of the x/y merge, so that row's B and
# C values are repeated four times.  A == 200 has no match in z and drops
# out of the inner join.
pd.merge(pd.merge(x, y, how="inner"), z, how="inner")

In [None]:
# and we can have an arbitrary number of records with different join (key) values
z = pd.DataFrame(
    {
        'A': [100, 100, 100, 2700, 100, 500, 200, 200, 300, 200],
        'D': [9, 6, 4, 18, 16, 22, 78, 19, 23, 42],
    }
)
z

In [None]:
# Many-to-one relationship
# With the larger z defined just above, A == 100 matches four rows of z and
# A == 200 matches three, giving seven result rows.  Keys that occur only
# in z (2700, 500, 300) are dropped by the inner join.
pd.merge(pd.merge(x, y, how="inner"), z, how="inner")

## Order-Item-SKU Example - Moved to separate Notebook - Order List Example Using Pandas Joins