# Concatenating, Appending, Joining DataFrames

This is a complex topic.  The examples below just demonstrate the basics.

In [None]:
import numpy as np
import pandas as pd
# Report the library versions in use (handy when reproducing results).
(np.__version__, pd.__version__)

In [None]:
# First example frame: integer columns "A" and "B", default 0..2 row labels.
A = pd.DataFrame(
    data={
        "A": [1, 2, 3],
        "B": [4, 5, 6],
    }
)
A

In [None]:
# Second example frame: same row labels as A, but columns "C" and "D".
B = pd.DataFrame(
    data={
        "C": [1, 2, 3],
        "D": [4, 5, 6],
    }
)
B

In [None]:
# Careful: row labels do not have to be unique. Stacking A and B keeps each
# frame's own labels, so 0, 1, 2 appear twice in the result -- confusing.
pd.concat(objs=[A, B])

In [None]:
# Label-based (dictionary-style) lookup of row '1'.
# The label 1 occurs in both A and B, so every matching row comes back.
pd.concat(objs=[A, B]).loc[1]

In [None]:
# Position-based lookup (Python's implicit 0-based numbering):
# .iloc[1] is simply the second row of the combined frame.
pd.concat(objs=[A, B]).iloc[1]

In [None]:
# Catching repeats
# verify_integrity=True asks concat to check the combined index for
# duplicates; the ValueError it raises is caught and printed here.
# We didn't go over exception handling -- look up 'try/except' for details :-)
try:
    pd.concat(objs=[A, B], verify_integrity=True)
except ValueError as err:
    print("ValueError:", err)

In [None]:
# When the original row labels carry no meaning, discard them:
# ignore_index=True gives the combined frame a fresh implicit 0..n-1 index.
pd.concat(objs=[A, B], ignore_index=True)

In [None]:
# Concatenate along axis 1: place the frames side by side (matching rows
# by label) instead of stacking them on top of each other.
pd.concat(objs=[A, B], axis=1)

In [None]:
# Third example frame: same column names as A, different values.
C = pd.DataFrame(
    data={
        "A": [7, 8, 9],
        "B": [10, 11, 12],
    }
)
C

In [None]:
# A and C have the same columns, so stacking them lines the values up;
# the row labels 0, 1, 2 still repeat.
pd.concat(objs=[A, C])

In [None]:
# Same stack, but with the row labels renumbered 0..n-1.
pd.concat(objs=[A, C], ignore_index=True)

In [None]:
# Side by side: the result carries the column names A, B, A, B -- column
# labels are allowed to repeat as well.
pd.concat(objs=[A, C], axis=1)

In [None]:
# Selecting 'B' from the side-by-side result returns every column
# named 'B' (one from A, one from C).
pd.concat(objs=[A, C], axis=1)['B']

In [None]:
# With axis=1, ignore_index applies to the concatenation axis, i.e. the
# columns: they are relabelled 0..n-1, which removes the duplicate names.
pd.concat(objs=[A, C], axis=1, ignore_index=True)

In [None]:
# Frame with string row labels 'x', 'y', 'z' and columns "C" and "D".
D = pd.DataFrame(
    data={
        "C": [7, 8, 9],
        "D": [10, 11, 12],
    },
    index=['x', 'y', 'z'],
)
D

In [None]:
# Frame with string row labels 'x', 'y', 'z' and columns "A" and "D".
E = pd.DataFrame(
    data={
        "A": [7, 8, 9],
        "D": [10, 11, 12],
    },
    index=['x', 'y', 'z'],
)
E

In [None]:
# A is labelled 0, 1, 2 and D is labelled x, y, z. Side-by-side
# concatenation keeps the union of the row labels by default and fills
# the gaps with NaN.
pd.concat(objs=[A, D], axis=1)

In [None]:
# join="inner" keeps only row labels present in both frames; A and D
# share none, so no rows survive.
pd.concat(objs=[A, D], axis=1, join="inner")

In [None]:
# join="outer" (the default) keeps the union of the row labels.
pd.concat(objs=[A, D], axis=1, join="outer")

In [None]:
# Stack A and E but keep only A's columns, in A's order.
# The `join_axes` keyword was deprecated in pandas 0.25 and removed in 1.0
# (passing it now raises TypeError); the documented replacement is to
# reindex the concatenated result.
pd.concat([A, E]).reindex(columns=A.columns)

In [None]:
# Display A again for reference.
A

In [None]:
# Display E again for reference.
E

## Joins

If you have a database background, you will recognize this topic.  If not, we'll do a quick run-through here and will then circle back later in the semester when we get to the MySQL component.

In [None]:
# Suppose we have a dataframe listing orders: one row per order, with the
# SKU that was sold and the price it sold for.
orders = pd.DataFrame(
    data={
        'order': [123, 456, 789, 823, 950, 1024],
        'sku': ['A109', 'A227', 'A876', 'A109', 'A227', 'B552'],
        'sales_price': [765.55, 227.83, 23.50, 745.87, 235.25, 86.50],
    }
)
orders

In [None]:
# A separate dataframe holds the SKU information: one row per SKU with
# its name and cost.
skus = pd.DataFrame(
    data={
        'sku': ['A100', 'A109', 'A200', 'A227', 'A300', 'A876', 'A904'],
        'name': ['Widget1', 'Widget2', 'Widget3', 'Widget4', 'Widget5', 'Widget6', 'Widget7'],
        'cost': [12.50, 423.50, 96.50, 86.34, 1850.45, 3.23, 7.50],
    }
)
skus

In [None]:
# Join (merge) the two frames so each order row also carries its SKU
# information. With no key given, merge joins on the column(s) the two
# frames have in common -- here that is 'sku'.
sales = orders.merge(skus)
sales

In [None]:
# With price and cost now in the same frame, add a calculated column:
# profit = sales price minus cost, computed row by row.
sales['profit'] = sales['sales_price'].sub(sales['cost'])
sales

In [None]:
# Notice that the previous merge did not include every SKU or every order:
# SKUs that were never ordered (e.g. A100) and orders whose SKU is not in
# the SKU table (B552) were dropped.
# Join types: inner, left, right, outer
# Asking for how="inner" explicitly keeps only keys found in both frames.
sales = orders.merge(skus, how="inner")
sales