# Concatenating, Appending, Joining DataFrames

This is a complex topic.  The examples below just demonstrate the basics.

In [115]:
import numpy as np
import pandas as pd
# Show the NumPy and pandas versions this notebook was run with.
np.__version__, pd.__version__

('1.14.0', '0.22.0')

In [116]:
# First example frame: integer columns "A" and "B" on the default 0..2 index.
A = pd.DataFrame(
    dict(
        A=[1, 2, 3],
        B=[4, 5, 6],
    )
)
A

Unnamed: 0,A,B
0,1,4
1,2,5
2,3,6


In [117]:
# Second example frame: same values and index as the first one, but the
# columns are named "C" and "D".
B = pd.DataFrame(
    dict(
        C=[1, 2, 3],
        D=[4, 5, 6],
    )
)
B

Unnamed: 0,C,D
0,1,4
1,2,5
2,3,6


In [118]:
# Careful: by default concat stacks the frames row-wise and keeps each frame's
# own index labels, so the result's index is NOT unique (0, 1, 2, 0, 1, 2).
# Columns that exist in only one of the frames are filled with NaN.
pd.concat([A, B])

Unnamed: 0,A,B,C,D
0,1.0,4.0,,
1,2.0,5.0,,
2,3.0,6.0,,
0,,,1.0,4.0
1,,,2.0,5.0
2,,,3.0,6.0


In [123]:
# Detecting repeated index labels: with verify_integrity=True, concat raises
# a ValueError instead of returning a frame whose index has duplicates.
try:
    pd.concat([A, B], verify_integrity=True)
except ValueError as err:
    print("ValueError:", err)

ValueError: Indexes have overlapping values: [0, 1, 2]


In [119]:
# axis=1 places the frames side by side, matching rows by index label.
# A and B share the labels 0..2, so every row is fully populated.
pd.concat([A, B], axis=1)

Unnamed: 0,A,B,C,D
0,1,4,1,4
1,2,5,2,5
2,3,6,3,6


In [120]:
# A frame with the same column names ("A", "B") as the first example frame,
# but different values.
C = pd.DataFrame(
    dict(
        A=[7, 8, 9],
        B=[10, 11, 12],
    )
)
C

Unnamed: 0,A,B
0,7,10
1,8,11
2,9,12


In [121]:
# Both frames have the same columns, so the rows are simply stacked with no
# NaN padding. The index labels 0..2 still appear twice.
pd.concat([A,C])

Unnamed: 0,A,B
0,1,4
1,2,5
2,3,6
0,7,10
1,8,11
2,9,12


In [122]:
# Side by side: C's columns are placed to the right of A's, even though the
# column names of the two frames collide.
pd.concat([A,C], axis=1)

Unnamed: 0,A,B,A.1,B.1
0,1,4,7,10
1,2,5,8,11
2,3,6,9,12


In [111]:
# A frame with columns "C" and "D" whose rows are labelled 'x', 'y', 'z'
# instead of the default integer index.
D = pd.DataFrame(
    dict(
        C=[7, 8, 9],
        D=[10, 11, 12],
    ),
    index=["x", "y", "z"],
)
D

Unnamed: 0,C,D
x,7,10
y,8,11
z,9,12


In [112]:
# A frame with columns "A" and "D", also labelled 'x', 'y', 'z'.
E = pd.DataFrame(
    dict(
        A=[7, 8, 9],
        D=[10, 11, 12],
    ),
    index=["x", "y", "z"],
)
E

Unnamed: 0,A,D
x,7,10
y,8,11
z,9,12


In [25]:
# A is indexed 0..2 and D is indexed x..z, so they have no row labels in common.
# Placing them side by side yields the union of the labels, with NaN wherever
# a frame has no row for that label.
pd.concat([A,D], axis=1)

Unnamed: 0,A,B,C,D
0,1.0,4.0,,
1,2.0,5.0,,
2,3.0,6.0,,
x,,,7.0,10.0
y,,,8.0,11.0
z,,,9.0,12.0


In [41]:
# join="inner" keeps only the row labels present in both frames.
# A and D share none, so the result has the four columns but no rows.
pd.concat([A,D], axis=1, join="inner")

Unnamed: 0,A,B,C,D


In [30]:
# join="outer" keeps the union of the row labels. This produces the same
# frame as the axis=1 call without join=, i.e. "outer" is the default.
pd.concat([A,D], axis=1, join="outer")

Unnamed: 0,A,B,C,D
0,1.0,4.0,,
1,2.0,5.0,,
2,3.0,6.0,,
x,,,7.0,10.0
y,,,8.0,11.0
z,,,9.0,12.0


In [51]:
# Stack A and E row-wise but keep only A's columns (A, B): E's extra column D
# is dropped, and E's rows get NaN in B because E has no such column.
# The `join_axes` argument was deprecated in pandas 0.25 and removed in 1.0;
# reindexing the concatenated result on A.columns gives the same frame and
# works on both old and current pandas.
pd.concat([A, E]).reindex(columns=A.columns)

Unnamed: 0,A,B
0,1,4.0
1,2,5.0
2,3,6.0
x,7,
y,8,
z,9,


In [45]:
# Redisplay A for reference.
A

Unnamed: 0,A,B
0,1,4
1,2,5
2,3,6


In [46]:
# Redisplay E for reference.
E

Unnamed: 0,A,D
x,7,10
y,8,11
z,9,12


## Joins

If you have a database background, you will recognize this topic.  If not, we'll do a quick run-through here and will then circle back later in the semester when we get to the MySQL component.

In [113]:
# Suppose we have a table of orders: one row per order, giving the order
# number, the SKU that was sold and the price it was sold for.
orders = pd.DataFrame(
    dict(
        order=[123, 456, 789, 823, 950, 1024],
        sku=['A109', 'A227', 'A876', 'A109', 'A227', 'B552'],
        sales_price=[765.55, 227.83, 23.50, 745.87, 235.25, 86.50],
    )
)
orders

Unnamed: 0,order,sales_price,sku
0,123,765.55,A109
1,456,227.83,A227
2,789,23.5,A876
3,823,745.87,A109
4,950,235.25,A227
5,1024,86.5,B552


In [114]:
# The SKU details live in a separate table: one row per SKU with its
# product name and unit cost.
skus = pd.DataFrame(
    dict(
        sku=['A100', 'A109', 'A200', 'A227', 'A300', 'A876', 'A904'],
        name=['Widget1', 'Widget2', 'Widget3', 'Widget4', 'Widget5', 'Widget6', 'Widget7'],
        cost=[12.50, 423.50, 96.50, 86.34, 1850.45, 3.23, 7.50],
    )
)
skus

Unnamed: 0,cost,name,sku
0,12.5,Widget1,A100
1,423.5,Widget2,A109
2,96.5,Widget3,A200
3,86.34,Widget4,A227
4,1850.45,Widget5,A300
5,3.23,Widget6,A876
6,7.5,Widget7,A904


In [105]:
# We'd like to join (merge) the data so that the sku information can be
# easily combined with the order information.
# Name the join key explicitly: without `on=`, merge joins on every column
# name the two frames share, so adding another common column later would
# silently change the result.
# The default join type is "inner": only orders whose sku also appears in
# `skus` are kept.
sales = pd.merge(orders, skus, on="sku")
sales

Unnamed: 0,order,sales_price,sku,cost,name
0,123,765.55,A109,423.5,Widget2
1,823,745.87,A109,423.5,Widget2
2,456,227.83,A227,86.34,Widget4
3,950,235.25,A227,86.34,Widget4
4,789,23.5,A876,3.23,Widget6


In [107]:
# With sales price and cost now side by side in one frame, derive the
# per-order profit as a new column.
sales['profit'] = sales['sales_price'].sub(sales['cost'])
sales

Unnamed: 0,order,sales_price,sku,cost,name,profit
0,123,765.55,A109,423.5,Widget2,342.05
1,823,745.87,A109,423.5,Widget2,322.37
2,456,227.83,A227,86.34,Widget4,141.49
3,950,235.25,A227,86.34,Widget4,148.91
4,789,23.5,A876,3.23,Widget6,20.27


In [102]:
# `how` selects the join type: "inner", "left", "right" or "outer".
# "inner" (the default) keeps only rows whose sku is present in both frames.
# The join key is named explicitly with `on=` so the merge does not depend on
# which column names the two frames happen to share.
sales = pd.merge(orders, skus, how="inner", on="sku")
sales

Unnamed: 0,order,sales_price,sku,cost,name
0,123,765.55,A109,423.5,Widget2
1,823,745.87,A109,423.5,Widget2
2,456,227.83,A227,86.34,Widget4
3,950,235.25,A227,86.34,Widget4
4,789,23.5,A876,3.23,Widget6
