In [3]:
%pylab inline
import pandas as pd
from pandas import DataFrame, Series

Populating the interactive namespace from numpy and matplotlib


In [4]:
# Left-hand frame for the merge examples: seven rows whose 'key' column
# holds repeated 'a'/'b' values and a single 'c'.
df1 = DataFrame(
    {
        'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
        'data1': range(7),
    }
)
df1

Unnamed: 0,data1,key
0,0,b
1,1,b
2,2,a
3,3,c
4,4,a
5,5,a
6,6,b


In [5]:
# Right-hand frame: exactly one row per key ('a', 'b', 'd').
df2 = DataFrame(
    {
        'key': ['a', 'b', 'd'],
        'data2': range(3),
    }
)
df2

Unnamed: 0,data2,key
0,0,a
1,1,b
2,2,d


The following keeps rows where the 'key' values (the only matching columns) have the same values, and combines the other columns - data1 and data2 here. Rows with values for the 'key' column that don't have a matching value in the other DataFrame - here, rows with 'key' values of 'c' and 'd' - are left out of the resulting merged DataFrame. This is an inner join, which is merge's default. See below for other types of join.

In [6]:
# No key is given, so merge joins on the one column name the frames share
# ('key'), using the default inner join.
pd.merge(df1, df2)

Unnamed: 0,data1,key,data2
0,0,b,1
1,1,b,1
2,6,b,1
3,2,a,0
4,4,a,0
5,5,a,0


The merge function uses the overlapping (identically named) columns as keys if no keys are specified. It's good practice to specify the keys explicitly.

In [7]:
# Same inner join, but with the join column named explicitly via `on`.
pd.merge(df1, df2, on='key')

Unnamed: 0,data1,key,data2
0,0,b,1
1,1,b,1
2,6,b,1
3,2,a,0
4,4,a,0
5,5,a,0


In [8]:
# Same data as the first left-hand frame, but the key column is named
# 'lkey' so it no longer matches the right-hand frame's column name.
df3 = DataFrame(
    {
        'lkey': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
        'data1': range(7),
    }
)
df3

Unnamed: 0,data1,lkey
0,0,b
1,1,b
2,2,a
3,3,c
4,4,a
5,5,a
6,6,b


In [9]:
# Right-hand counterpart whose key column is named 'rkey'.
df4 = DataFrame(
    {
        'rkey': ['a', 'b', 'd'],
        'data2': range(3),
    }
)
df4

Unnamed: 0,data2,rkey
0,0,a
1,1,b
2,2,d


In [10]:
# The key columns are named differently in each frame, so name them
# separately; both 'lkey' and 'rkey' are kept in the result.
pd.merge(df3, df4, left_on='lkey', right_on='rkey')

Unnamed: 0,data1,lkey,data2,rkey
0,0,b,1,b
1,1,b,1,b
2,6,b,1,b
3,2,a,0,a
4,4,a,0,a
5,5,a,0,a


You can do more than just the default inner join by using the 'how' parameter. In this example, an outer join keeps the 'c' and 'd' key values and their associated data. An outer join is a combination of the left and right joins, which can also be specified using 'how'. Considered differently, an inner join is the intersection of the keys, and an outer join is the union of the keys.

In [11]:
# Outer join: keeps keys found in either frame and fills the missing side
# with NaN. The join column is named explicitly rather than inferred from
# the overlapping column names.
pd.merge(df1, df2, on='key', how='outer')

Unnamed: 0,data1,key,data2
0,0.0,b,1.0
1,1.0,b,1.0
2,6.0,b,1.0
3,2.0,a,0.0
4,4.0,a,0.0
5,5.0,a,0.0
6,3.0,c,
7,,d,2.0


Up to this point we've been doing many-to-one merges, because the second DataFrame has had only a single row for each key value. You can also do many-to-many merges.

In [12]:
# Rebinds df1 for the many-to-many example: six rows with repeated keys.
df1 = DataFrame(
    {
        'key': ['b', 'b', 'a', 'c', 'a', 'b'],
        'data1': range(6),
    }
)
df1

Unnamed: 0,data1,key
0,0,b
1,1,b
2,2,a
3,3,c
4,4,a
5,5,b


In [13]:
# Rebinds df2 so that keys 'a' and 'b' each appear twice on the right side.
df2 = DataFrame(
    {
        'key': ['a', 'b', 'a', 'b', 'd'],
        'data2': range(5),
    }
)
df2

Unnamed: 0,data2,key
0,0,a
1,1,b
2,2,a
3,3,b
4,4,d


In [14]:
# Left join with repeated keys on both sides: each left row is paired with
# every matching right row; 'c' has no match, so its data2 is NaN.
pd.merge(df1, df2, on='key', how='left')

Unnamed: 0,data1,key,data2
0,0,b,1.0
1,0,b,3.0
2,1,b,1.0
3,1,b,3.0
4,2,a,0.0
5,2,a,2.0
6,3,c,
7,4,a,0.0
8,4,a,2.0
9,5,b,1.0


There's more on pages 180 and 181 about merging on more than one key, and about handling overlapping column names by specifying suffixes to append to them. (You can also rename axis labels, as explained later.)

## Merging on index

By default the merge keys are assumed to be in columns. You can also merge using rows/indexes.

In [15]:
# Left frame whose 'key' column will be matched against another frame's
# row index.
left1 = DataFrame(
    {
        'key': ['a', 'b', 'a', 'a', 'b', 'c'],
        'value': range(6),
    }
)
left1

Unnamed: 0,key,value
0,a,0
1,b,1
2,a,2
3,a,3
4,b,4
5,c,5


In [16]:
# Right frame keyed by its row index ('a', 'b') rather than by a column.
right1 = DataFrame(
    data={'group_val': [3.5, 7.0]},
    index=list('ab'),
)
right1

Unnamed: 0,group_val
a,3.5
b,7.0


In [18]:
# For the right side, use the row index as the key. This is an inner join
# (the default), so left1's 'c' row, which has no label in right1's index,
# is dropped.
pd.merge(left1, right1, left_on='key', right_index=True)

Unnamed: 0,key,value,group_val
0,a,0,3.5
2,a,2,3.5
3,a,3,3.5
1,b,1,7.0
4,b,4,7.0


In [19]:
# Outer variant of the column-to-index merge: the unmatched 'c' row is kept,
# with NaN for group_val.
pd.merge(left1, right1, left_on='key', right_index=True, how='outer')

Unnamed: 0,key,value,group_val
0,a,0,3.5
2,a,2,3.5
3,a,3,3.5
1,b,1,7.0
4,b,4,7.0
5,c,5,


### Merging hierarchically-indexed data

In [20]:
# Left frame holding the two join keys (state, year) as ordinary columns,
# plus a float 'data' column.
lefth = DataFrame(
    {
        'key1': ['Ohio'] * 3 + ['Nevada'] * 2,
        'key2': [2000, 2001, 2002, 2001, 2002],
        'data': np.arange(5, dtype=float),
    }
)
lefth

Unnamed: 0,data,key1,key2
0,0,Ohio,2000
1,1,Ohio,2001
2,2,Ohio,2002
3,3,Nevada,2001
4,4,Nevada,2002


In [22]:
# Right frame with a two-level (state, year) row index; note that
# ('Ohio', 2000) appears twice.
righth = DataFrame(
    np.arange(12).reshape(6, 2),
    index=[
        ['Nevada', 'Nevada', 'Ohio', 'Ohio', 'Ohio', 'Ohio'],
        [2001, 2000, 2000, 2000, 2001, 2002],
    ],
    columns=['event1', 'event2'],
)
righth

Unnamed: 0,Unnamed: 1,event1,event2
Nevada,2001,0,1
Nevada,2000,2,3
Ohio,2000,4,5
Ohio,2000,6,7
Ohio,2001,8,9
Ohio,2002,10,11


In [23]:
# Match lefth's (key1, key2) column pair against righth's two-level row
# index; the key columns are passed as a list. Inner join by default.
pd.merge(lefth, righth, left_on=['key1','key2'], right_index=True)

Unnamed: 0,data,key1,key2,event1,event2
0,0,Ohio,2000,4,5
0,0,Ohio,2000,6,7
1,1,Ohio,2001,8,9
2,2,Ohio,2002,10,11
3,3,Nevada,2001,0,1


In [24]:
# Outer variant: key pairs present on only one side are kept and the
# missing side is filled with NaN.
pd.merge(lefth, righth, left_on=['key1','key2'],
         right_index=True, how='outer')

Unnamed: 0,data,key1,key2,event1,event2
0,0.0,Ohio,2000,4.0,5.0
0,0.0,Ohio,2000,6.0,7.0
1,1.0,Ohio,2001,8.0,9.0
2,2.0,Ohio,2002,10.0,11.0
3,3.0,Nevada,2001,0.0,1.0
4,4.0,Nevada,2002,,
4,,Nevada,2000,2.0,3.0


In [25]:
# Float frame indexed by 'a', 'c', 'e', used for the index-on-index merges.
left2 = DataFrame(
    {
        'Ohio': [1., 3., 5.],
        'Nevada': [2., 4., 6.],
    },
    index=['a', 'c', 'e'],
)
left2

Unnamed: 0,Ohio,Nevada
a,1,2
c,3,4
e,5,6


In [27]:
# Float frame indexed by 'b'..'e'; it overlaps the other frame's index on
# 'c' and 'e' only.
right2 = DataFrame(
    {
        'Missouri': [7., 9., 11., 13.],
        'Alabama': [8., 10., 12., 14.],
    },
    index=['b', 'c', 'd', 'e'],
)
right2

Unnamed: 0,Missouri,Alabama
b,7,8
c,9,10
d,11,12
e,13,14


In [28]:
# Use the row index as the key on both sides; the outer join yields the
# union of the two indexes, with NaN where a frame has no such label.
pd.merge(left2, right2, how='outer', left_index=True, right_index=True)

Unnamed: 0,Ohio,Nevada,Missouri,Alabama
a,1.0,2.0,,
b,,,7.0,8.0
c,3.0,4.0,9.0,10.0
d,,,11.0,12.0
e,5.0,6.0,13.0,14.0


You can also use the 'more convenient' 'join' instance method to merge by index, and also to combine many DataFrame objects that have the same or similar indices but non-overlapping columns. You could do the previous example as follows:

In [29]:
# Index-on-index outer join written with the DataFrame.join method.
left2.join(right2, how='outer')

Unnamed: 0,Ohio,Nevada,Missouri,Alabama
a,1.0,2.0,,
b,,,7.0,8.0
c,3.0,4.0,9.0,10.0
d,,,11.0,12.0
e,5.0,6.0,13.0,14.0


You can also join the index of the passed DataFrame on one of the columns of the calling DataFrame.

In [30]:
# Match left1's 'key' column against right1's index. All of left1's rows
# are kept, so 'c' gets NaN for group_val.
left1.join(right1, on='key')

Unnamed: 0,key,value,group_val
0,a,0,3.5
1,b,1,7.0
2,a,2,3.5
3,a,3,3.5
4,b,4,7.0
5,c,5,


Finally, to do a 'simple index-on-index merge', you can pass a list of DataFrame instances to join. This is an alternative to the more general concat function described later.

In [31]:
# Third float frame, indexed by 'a', 'c', 'e', 'f', for the multi-frame join.
another = DataFrame(
    {
        'New York': [7., 9., 11., 16.],
        'Oregon': [8., 10., 12., 17.],
    },
    index=['a', 'c', 'e', 'f'],
)
another

Unnamed: 0,New York,Oregon
a,7,8
c,9,10
e,11,12
f,16,17


In [32]:
# Redisplay left2 for reference before the multi-frame join.
left2

Unnamed: 0,Ohio,Nevada
a,1,2
c,3,4
e,5,6


In [33]:
# Redisplay right2 for reference before the multi-frame join.
right2

Unnamed: 0,Missouri,Alabama
b,7,8
c,9,10
d,11,12
e,13,14


In [36]:
# Join several frames at once by index. Only left2's labels (a, c, e)
# appear in the result; 'a' is absent from right2, so it gets NaN there.
left2.join([right2, another])

Unnamed: 0,Ohio,Nevada,Missouri,Alabama,New York,Oregon
a,1,2,,,7,8
c,3,4,9.0,10.0,9,10
e,5,6,13.0,14.0,11,12


In [35]:
# Outer variant: the result index is the union of all three frames' indexes.
left2.join([right2, another], how='outer')

Unnamed: 0,Ohio,Nevada,Missouri,Alabama,New York,Oregon
a,1.0,2.0,,,7.0,8.0
b,,,7.0,8.0,,
c,3.0,4.0,9.0,10.0,9.0,10.0
d,,,11.0,12.0,,
e,5.0,6.0,13.0,14.0,11.0,12.0
f,,,,,16.0,17.0


## Concatenating along an axis - "concatenating", "binding", "stacking"

NumPy has a concatenate function.

In [37]:
# 3x4 integer array holding 0..11 in row-major order.
arr = np.reshape(np.arange(12), (3, 4))
arr

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [38]:
# axis=1 places the arrays side by side: (3, 4) + (3, 4) -> (3, 8).
np.concatenate([arr, arr], axis=1)

array([[ 0,  1,  2,  3,  0,  1,  2,  3],
       [ 4,  5,  6,  7,  4,  5,  6,  7],
       [ 8,  9, 10, 11,  8,  9, 10, 11]])

In [39]:
# The default axis (0) stacks the arrays vertically: (3, 4) + (3, 4) -> (6, 4).
np.concatenate([arr, arr])

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

Pandas objects - Series, DataFrame - have labeled axes, which 'further generalize' how you do array concatenation.

In [40]:
# First of three integer Series with non-overlapping index labels.
s1 = Series(data=[0, 1], index=list('ab'))
s1

a    0
b    1
dtype: int64

In [41]:
# Second Series: labels 'c', 'd', 'e'.
s2 = Series(data=[2, 3, 4], index=['c', 'd', 'e'])
s2

c    2
d    3
e    4
dtype: int64

In [42]:
# Third Series: labels 'f', 'g'.
s3 = Series(data=[5, 6], index=list('fg'))
s3

f    5
g    6
dtype: int64

In [44]:
# Glue the three Series end to end (default axis=0): both the values and
# the index labels are concatenated, giving one longer Series.
pd.concat([s1, s2, s3])

a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: int64

By default, concat works along axis=0 (rows) and produces another Series. Passing axis=1 gets you a DataFrame, because axis=1 is columns.

In [45]:
# axis=1 turns each Series into a column; rows are the union of the index
# labels, with NaN where a Series has no value for a label.
pd.concat([s1, s2, s3], axis=1)

Unnamed: 0,0,1,2
a,0.0,,
b,1.0,,
c,,2.0,
d,,3.0,
e,,4.0,
f,,,5.0
g,,,6.0


Above there's no overlap in the index values; the resulting axis is the sorted union (outer join) of the indexes. To intersect them instead (an inner join), pass join='inner':

In [51]:
# Redisplay s1 for reference.
s1

a    0
b    1
dtype: int64

In [47]:
# Build a Series that shares labels 'a' and 'b' with s1 (values scaled by 5)
# and also carries s3's 'f' and 'g' entries.
s4 = pd.concat([s1.mul(5), s3])
s4

a    0
b    5
f    5
g    6
dtype: int64

In [49]:
# Default (outer) join along axis=1: labels 'f' and 'g' exist only in s4,
# so column 0 is NaN for them.
pd.concat([s1, s4], axis=1)

Unnamed: 0,0,1
a,0.0,0
b,1.0,5
f,,5
g,,6


In [50]:
# join='inner' keeps only the labels present in both Series ('a' and 'b').
pd.concat([s1, s4], axis=1, join='inner')

Unnamed: 0,0,1
a,0,0
b,1,5


In the above examples, the concatenated pieces can't be identified in the result. "You might want a hierarchical index on the concatenation axis", which you can do with the 'keys' argument:

In [52]:
# `keys` labels each concatenated piece, producing a two-level index whose
# outer level is the key and whose inner level is the original label.
result = pd.concat([s1, s1, s3], keys=['one','two','three'])
result

one    a    0
       b    1
two    a    0
       b    1
three  f    5
       g    6
dtype: int64

In [53]:
# Then unstack to rotate the inner level of the MultiIndex into columns;
# labels a piece does not have become NaN.
result.unstack()

Unnamed: 0,a,b,f,g
one,0.0,1.0,,
two,0.0,1.0,,
three,,,5.0,6.0


And if you combine Series along axis=1 (concatenate by adding columns), the keys become the DataFrame column headers.

In [54]:
# Along axis=1 the keys become the column headers of the resulting DataFrame.
pd.concat([s1, s2, s3], axis=1, keys=['one','two','three'])

Unnamed: 0,one,two,three
a,0.0,,
b,1.0,,
c,,2.0,
d,,3.0,
e,,4.0,
f,,,5.0
g,,,6.0


And the same applies when you concatenate DataFrames as below, instead of Series as above.

In [55]:
# Rebinds df1 for the DataFrame concat example: 3x2 integers indexed a-c.
df1 = DataFrame(
    np.array([[0, 1], [2, 3], [4, 5]]),
    index=['a', 'b', 'c'],
    columns=['one', 'two'],
)
df1

Unnamed: 0,one,two
a,0,1
b,2,3
c,4,5


In [56]:
# Rebinds df2: 2x2 integers indexed by 'a' and 'c' (no 'b' row).
df2 = DataFrame(
    np.array([[0, 1], [2, 3]]),
    index=list('ac'),
    columns=['three', 'four'],
)
df2

Unnamed: 0,three,four
a,0,1
c,2,3


In [57]:
# With DataFrames along axis=1, the keys become the outer level of a
# two-level column index; row 'b' is missing from df2, so it is NaN there.
pd.concat([df1, df2], axis=1, keys=['level1','level2'])

Unnamed: 0_level_0,level1,level1,level2,level2
Unnamed: 0_level_1,one,two,three,four
a,0,1,0.0,1.0
b,2,3,,
c,4,5,2.0,3.0


There's more on p187 and 188, about how to affect how the hierarchical index is created, including use of 'names' and 'ignore_index' when the row index isn't meaningful in context.

## Combining data with overlap