# Combining Datasets: Concat and Append

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Helper for building small, self-describing DataFrames to experiment with

def make_df(cols, ind):
    """Build a DataFrame whose cells spell out their own column and row label.

    Parameters
    ----------
    cols : iterable
        Column labels; iterating a string such as 'ABC' yields one column
        per character.
    ind : iterable
        Index labels, also used as the row part of every cell value.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by ``ind`` where the cell in column ``c`` and row ``i``
        holds the string ``str(c) + str(i)`` (for example ``'A0'``).
    """
    columns = {}
    for col in cols:
        # Each cell is the column label followed by the row label
        columns[col] = [f"{col!s}{row!s}" for row in ind]
    return pd.DataFrame(columns, index=ind)

In [3]:
# Preview the helper: columns A-F over the integer index 0-7
make_df(cols='ABCDEF', ind=range(8))

Unnamed: 0,A,B,C,D,E,F
0,A0,B0,C0,D0,E0,F0
1,A1,B1,C1,D1,E1,F1
2,A2,B2,C2,D2,E2,F2
3,A3,B3,C3,D3,E3,F3
4,A4,B4,C4,D4,E4,F4
5,A5,B5,C5,D5,E5,F5
6,A6,B6,C6,D6,E6,F6
7,A7,B7,C7,D7,E7,F7


In [4]:
# Helper class for showing several DataFrames next to each other

class display(object):
    """Display HTML representation of multiple objects.

    Each positional argument is a *string* containing a Python expression
    (for example ``'df1'`` or ``'pd.concat([df1, df2])'``). The expression
    text is used as a caption and the expression is evaluated to obtain the
    object shown beneath it.

    NOTE: the expressions are run through ``eval`` using the global namespace
    of the module (or notebook) that defines this class, so only pass
    trusted, hand-written strings.
    """
    # One left-floated <div> per object: {0} is the caption (expression text),
    # {1} is the HTML representation of the evaluated object.
    template = """<div style="float: left; padding: 10px;">
    <p style='font-family:"Courier New", Courier, monospace'>{0}</p>{1}
    </div>"""

    def __init__(self, *args):
        """Store the expression strings to be evaluated when rendering."""
        self.args = args

    def _repr_html_(self):
        """Return the HTML blocks for all expressions, joined by newlines.

        The caption is HTML-escaped so that an expression containing ``<``,
        ``>`` or ``&`` (e.g. a comparison) is shown literally instead of being
        parsed as markup. Quotes are left untouched because the caption is
        element text, not an attribute value.
        """
        from html import escape

        return '\n'.join(
            self.template.format(escape(a, quote=False),
                                 eval(a)._repr_html_())
            for a in self.args)

    def __repr__(self):
        """Return the plain-text fallback: each expression above its repr."""
        return '\n\n'.join(a + '\n' + repr(eval(a))
                           for a in self.args)

### Recall: Concatenation of NumPy Arrays

In [15]:
x, y, z = [1, 2, 3], [4, 5, 6], [7, 8, 9]

# Wrap each list so it becomes one row, then join the three rows into a 3x3 array
xyz = np.concatenate(([x], [y], [z]), axis=0)

# axis=0 places the rows of the second array below those of the first
np.concatenate((xyz, xyz), axis=0)

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9],
       [1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

In [16]:
# axis=1 places the columns of the second array to the right of the first
np.concatenate((xyz, xyz), axis=1)

array([[1, 2, 3, 1, 2, 3],
       [4, 5, 6, 4, 5, 6],
       [7, 8, 9, 7, 8, 9]])

### Simple concatenation

In [17]:
# pd.concat(...) is the pandas counterpart of NumPy's np.concatenate function

ser1 = pd.Series(list('ABC'), index=[1, 2, 3])
ser2 = pd.Series(list('DEF'), index=[4, 5, 6])

# The result keeps each Series' own index labels
pd.concat([ser1, ser2])

1    A
2    B
3    C
4    D
5    E
6    F
dtype: object

In [18]:
# pd.concat also works with higher-dimensional objects such as DataFrames,
# not just Series

df1, df2 = make_df('AB', [1, 2]), make_df('AB', [3, 4])
display(
    'df1',
    'df2',
    'pd.concat([df1, df2])',
)

Unnamed: 0,A,B
1,A1,B1
2,A2,B2

Unnamed: 0,A,B
3,A3,B3
4,A4,B4

Unnamed: 0,A,B
1,A1,B1
2,A2,B2
3,A3,B3
4,A4,B4


In [23]:
# The axis to concatenate along can be given explicitly when needed;
# here the two frames are placed side by side

df3, df4 = make_df('AB', [0, 1]), make_df('CD', [0, 1])
display(
    'df3',
    'df4',
    "pd.concat([df3, df4], axis='columns')",
)

Unnamed: 0,A,B
0,A0,B0
1,A1,B1

Unnamed: 0,C,D
0,C0,D0
1,C1,D1

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1


### Pandas preserves the index labels along the axis being concatenated, even when they repeat

In [33]:
# Compare both axes: along 'rows' each frame keeps its original index labels,
# so the labels 0 and 1 appear twice in the result
display(
    'df3',
    'df4',
    "pd.concat([df3, df4], axis='columns')",
    "pd.concat([df3, df4], axis='rows')",
)

Unnamed: 0,A,B
0,A0,B0
1,A1,B1

Unnamed: 0,C,D
0,C0,D0
1,C1,D1

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1

Unnamed: 0,A,B,C,D
0,A0,B0,,
1,A1,B1,,
0,,,C0,D0
1,,,C1,D1


In [37]:
# Pass verify_integrity=True to make pd.concat raise an error
# when the concatenated index would contain repeated labels

try:
    pd.concat(
        [df3, df4],
        axis='rows',
        verify_integrity=True,
    )
except ValueError as err:
    # The exception is the demonstration: show its message instead of a traceback
    print(f"ValueError: {err!s}")

ValueError: Indexes have overlapping values: Int64Index([0, 1], dtype='int64')


In [38]:
# Pass ignore_index=True to discard the original index labels altogether;
# the result gets a fresh default integer index starting from 0
pd.concat(
    [df3, df4],
    axis='rows',
    ignore_index=True,
)

Unnamed: 0,A,B,C,D
0,A0,B0,,
1,A1,B1,,
2,,,C0,D0
3,,,C1,D1


In [41]:
# You can also build a MultiIndex by passing a list of labels to the 'keys'
# keyword; each label tags the rows coming from the corresponding DataFrame.

display(
    'df3',
    'df4',
    "pd.concat([df3, df4], axis='rows', keys=['Barrel','Bung'])",
)

Unnamed: 0,A,B
0,A0,B0
1,A1,B1

Unnamed: 0,C,D
0,C0,D0
1,C1,D1

Unnamed: 0,Unnamed: 1,A,B,C,D
Barrel,0,A0,B0,,
Barrel,1,A1,B1,,
Bung,0,,,C0,D0
Bung,1,,,C1,D1


### Concatenation with Joins

In [42]:
# By default pd.concat performs an outer join: every column from both
# DataFrames is kept, and entries a frame does not have are left missing

df5, df6 = make_df('ABC', [1, 2]), make_df('BCD', [3, 4])
display(
    'df5',
    'df6',
    'pd.concat([df5, df6])',
)

Unnamed: 0,A,B,C
1,A1,B1,C1
2,A2,B2,C2

Unnamed: 0,B,C,D
3,B3,C3,D3
4,B4,C4,D4

Unnamed: 0,A,B,C,D
1,A1,B1,C1,
2,A2,B2,C2,
3,,B3,C3,D3
4,,B4,C4,D4


In [50]:
# Set the keyword 'join' equal to 'inner' to perform an inner join and
# keep only the columns that the two DataFrames have in common
# (here 'B' and 'C'); columns present in just one frame are dropped.
display('df5', 'df6', "pd.concat([df5, df6], join='inner')")

Unnamed: 0,A,B,C
1,A1,B1,C1
2,A2,B2,C2

Unnamed: 0,B,C,D
3,B3,C3,D3
4,B4,C4,D4

Unnamed: 0,B,C
1,B1,C1
2,B2,C2
3,B3,C3
4,B4,C4


In [54]:
# Another option is to state explicitly which columns to keep:
# call '.reindex()' on the concatenated result along axis=1,
# here with the columns of df5

display(
    'df5',
    'df6',
    'pd.concat([df5, df6]).reindex(df5.columns, axis=1)',
)

Unnamed: 0,A,B,C
1,A1,B1,C1
2,A2,B2,C2

Unnamed: 0,B,C,D
3,B3,C3,D3
4,B4,C4,D4

Unnamed: 0,A,B,C
1,A1,B1,C1
2,A2,B2,C2
3,,B3,C3
4,,B4,C4


In [60]:
# DataFrames used to offer an '.append(___)' method as a shorthand for this
# kind of row-wise combination, but it was deprecated in pandas 1.4 and
# removed in pandas 2.0, so 'df1.append(df2)' fails on current versions.
# The equivalent, supported spelling is the 'pd.concat(___)' function.
#
# Even where '.append(__)' is still available, prefer a single 'pd.concat(___)'
# call when joining large dataframes or performing multiple joins, because
# every '.append(__)' call creates a new dataframe, making it inefficient.

display('df1', 'df2', 'pd.concat([df1, df2])')

Unnamed: 0,A,B
1,A1,B1
2,A2,B2

Unnamed: 0,A,B
3,A3,B3
4,A4,B4

Unnamed: 0,A,B
1,A1,B1
2,A2,B2
3,A3,B3
4,A4,B4
