# Merge, join, and concatenate

## concat

In [1]:
import pandas as pd
import numpy as np

pandas 使用 `pd.concat` 函数沿指定的轴（`axis`）拼接多个 DataFrame：`axis=0` 按行纵向拼接，`axis=1` 按列横向拼接。

In [2]:
# Seed NumPy's global RNG so the random prices below -- and the ones drawn by
# the later cells that also call np.random.randn -- are identical on every
# Restart & Run All.
np.random.seed(0)

# Simulated "open" prices: 5 consecutive dates (rows, index named "trade_date")
# by 2 stock codes (columns, index named "code").
df_open = pd.DataFrame(
    np.random.randn(5, 2),
    index=pd.date_range("2000-01-01", periods=5, name="trade_date"),
    columns=pd.Index(["000001.SZ", "000002.SZ"], name="code"),
)

In [3]:
# Simulated "close" prices: random normal values for 5 consecutive dates
# starting 2000-01-01 (rows) and 2 stock codes (columns).
df_close = pd.DataFrame(
    data=np.random.randn(5, 2),
    columns=pd.Index(["000001.SZ", "000002.SZ"], name="code"),
    index=pd.date_range(start="2000-01-01", periods=5, name="trade_date"),
)

In [4]:
# Simulated "high" prices: random normal values for 5 consecutive dates
# starting 2000-01-01 (rows) and 2 stock codes (columns).
df_high = pd.DataFrame(
    data=np.random.randn(5, 2),
    columns=pd.Index(["000001.SZ", "000002.SZ"], name="code"),
    index=pd.date_range(start="2000-01-01", periods=5, name="trade_date"),
)

In [5]:
# Collect the three price frames in a list; this list is the input to the
# pd.concat calls below. The bare name on the last line displays it.
frames = [df_open, df_close, df_high]
frames

[code        000001.SZ  000002.SZ
 trade_date                      
 2000-01-01  -2.104930  -0.629700
 2000-01-02   0.247390   1.427145
 2000-01-03  -0.127110  -0.952104
 2000-01-04   0.392625  -0.036554
 2000-01-05  -0.437633  -0.042132, code        000001.SZ  000002.SZ
 trade_date                      
 2000-01-01  -0.972513  -0.253390
 2000-01-02   0.604554   0.952528
 2000-01-03  -0.318157   0.557226
 2000-01-04   0.667385  -0.653972
 2000-01-05  -0.414903   0.572259, code        000001.SZ  000002.SZ
 trade_date                      
 2000-01-01   1.136805  -1.238213
 2000-01-02  -0.617595   0.329081
 2000-01-03  -0.639690  -0.187827
 2000-01-04  -1.122278  -0.963806
 2000-01-05  -0.029967   0.142067]

In [6]:
# axis=0 stacks the frames on top of each other (rows are appended).
result = pd.concat(objs=frames, axis=0)
result

code,000001.SZ,000002.SZ
trade_date,Unnamed: 1_level_1,Unnamed: 2_level_1
2000-01-01,-2.10493,-0.6297
2000-01-02,0.24739,1.427145
2000-01-03,-0.12711,-0.952104
2000-01-04,0.392625,-0.036554
2000-01-05,-0.437633,-0.042132
2000-01-01,-0.972513,-0.25339
2000-01-02,0.604554,0.952528
2000-01-03,-0.318157,0.557226
2000-01-04,0.667385,-0.653972
2000-01-05,-0.414903,0.572259


In [7]:
# axis=1 places the frames side by side, aligned on the row index.
result = pd.concat(objs=frames, axis=1)
result

code,000001.SZ,000002.SZ,000001.SZ,000002.SZ,000001.SZ,000002.SZ
trade_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2000-01-01,-2.10493,-0.6297,-0.972513,-0.25339,1.136805,-1.238213
2000-01-02,0.24739,1.427145,0.604554,0.952528,-0.617595,0.329081
2000-01-03,-0.12711,-0.952104,-0.318157,0.557226,-0.63969,-0.187827
2000-01-04,0.392625,-0.036554,0.667385,-0.653972,-1.122278,-0.963806
2000-01-05,-0.437633,-0.042132,-0.414903,0.572259,-0.029967,0.142067


如果指定 `keys` 参数，那么拼接所沿的轴就会变成 MultiIndex：每个 key 作为最外层的标签，标记对应数据来自哪一个 DataFrame。

In [8]:
# One label per frame, in the same order as `frames`.
keys = ['open', 'close', 'high']

In [9]:
# Label each frame's columns with its key, producing two-level column labels.
result = pd.concat(objs=frames, axis=1, keys=keys)

In [10]:
# Display the keyed concatenation result.
result 

Unnamed: 0_level_0,open,open,close,close,high,high
code,000001.SZ,000002.SZ,000001.SZ,000002.SZ,000001.SZ,000002.SZ
trade_date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
2000-01-01,-2.10493,-0.6297,-0.972513,-0.25339,1.136805,-1.238213
2000-01-02,0.24739,1.427145,0.604554,0.952528,-0.617595,0.329081
2000-01-03,-0.12711,-0.952104,-0.318157,0.557226,-0.63969,-0.187827
2000-01-04,0.392625,-0.036554,0.667385,-0.653972,-1.122278,-0.963806
2000-01-05,-0.437633,-0.042132,-0.414903,0.572259,-0.029967,0.142067


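或者干脆直接传入一个以 key 为键的 dict，效果是一样的。需要注意列的顺序：较旧的 pandas 版本会按 dict 的键排序（如下方输出中的 close、high、open），较新的版本则保留 dict 的插入顺序；如果需要固定顺序，可以同时传入 `keys` 参数。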

In [11]:
# Pair each label in `keys` with the frame at the same position in `frames`.
frame_dict = dict(zip(keys, frames))
frame_dict

{'close': code        000001.SZ  000002.SZ
 trade_date                      
 2000-01-01  -0.972513  -0.253390
 2000-01-02   0.604554   0.952528
 2000-01-03  -0.318157   0.557226
 2000-01-04   0.667385  -0.653972
 2000-01-05  -0.414903   0.572259, 'high': code        000001.SZ  000002.SZ
 trade_date                      
 2000-01-01   1.136805  -1.238213
 2000-01-02  -0.617595   0.329081
 2000-01-03  -0.639690  -0.187827
 2000-01-04  -1.122278  -0.963806
 2000-01-05  -0.029967   0.142067, 'open': code        000001.SZ  000002.SZ
 trade_date                      
 2000-01-01  -2.104930  -0.629700
 2000-01-02   0.247390   1.427145
 2000-01-03  -0.127110  -0.952104
 2000-01-04   0.392625  -0.036554
 2000-01-05  -0.437633  -0.042132}

In [12]:
# Passing a mapping: the dict keys are used as the outer column labels.
result2 = pd.concat(objs=frame_dict, axis=1)
result2

Unnamed: 0_level_0,close,close,high,high,open,open
code,000001.SZ,000002.SZ,000001.SZ,000002.SZ,000001.SZ,000002.SZ
trade_date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
2000-01-01,-0.972513,-0.25339,1.136805,-1.238213,-2.10493,-0.6297
2000-01-02,0.604554,0.952528,-0.617595,0.329081,0.24739,1.427145
2000-01-03,-0.318157,0.557226,-0.63969,-0.187827,-0.12711,-0.952104
2000-01-04,0.667385,-0.653972,-1.122278,-0.963806,0.392625,-0.036554
2000-01-05,-0.414903,0.572259,-0.029967,0.142067,-0.437633,-0.042132


此外，旧版本的 pandas 还提供 DataFrame.append 方法用于合并 DataFrame（该方法已在 pandas 1.4 中被弃用，并在 pandas 2.0 中移除，新代码请直接使用 pd.concat）。无论使用哪种方式，都要注意不要在循环里逐步拼接：如果需要在每一个循环里向一个 DataFrame 添加新的 DataFrame，更好的方式是用一个 list 将循环里每一步生成的 DataFrame 存储下来，在循环结束后再用 concat 函数将它们一次性合并成新的 DataFrame，效率会有巨大提升。

## merge

`pd.merge` 和 `DataFrame.merge` 的原理是一样的，用哪个取决于个人习惯，这里以 `pd.merge` 为例进行介绍。

In [13]:
# Reshape close prices to long format (one row per trade_date/code pair),
# keep the first 8 rows, and turn the index levels back into columns.
df1 = (
    df_close
    .stack()
    .rename("close")
    .iloc[:8]
    .reset_index()
)
df1

Unnamed: 0,trade_date,code,close
0,2000-01-01,000001.SZ,-0.972513
1,2000-01-01,000002.SZ,-0.25339
2,2000-01-02,000001.SZ,0.604554
3,2000-01-02,000002.SZ,0.952528
4,2000-01-03,000001.SZ,-0.318157
5,2000-01-03,000002.SZ,0.557226
6,2000-01-04,000001.SZ,0.667385
7,2000-01-04,000002.SZ,-0.653972


In [14]:
# Reshape open prices to long format (one row per trade_date/code pair),
# keep the last 8 rows, and turn the index levels back into columns.
df2 = (
    df_open
    .stack()
    .rename("open")
    .iloc[-8:]
    .reset_index()
)
df2

Unnamed: 0,trade_date,code,open
0,2000-01-02,000001.SZ,0.24739
1,2000-01-02,000002.SZ,1.427145
2,2000-01-03,000001.SZ,-0.12711
3,2000-01-03,000002.SZ,-0.952104
4,2000-01-04,000001.SZ,0.392625
5,2000-01-04,000002.SZ,-0.036554
6,2000-01-05,000001.SZ,-0.437633
7,2000-01-05,000002.SZ,-0.042132


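如果想要将两个 DataFrame 按照 trade_date 和 code 进行对齐，需要使用 merge 函数。`how` 参数决定保留哪些键：`inner`（默认）只保留两边都有的键，`outer` 保留两边所有的键，`left` / `right` 分别以左表 / 右表的键为准；缺失的值用 NaN 填充。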

In [16]:
# Inner join (also the default for `how`): keep only keys present in both frames.
pd.merge(left=df1, right=df2, how="inner", on=["trade_date", "code"])

Unnamed: 0,trade_date,code,close,open
0,2000-01-02,000001.SZ,0.604554,0.24739
1,2000-01-02,000002.SZ,0.952528,1.427145
2,2000-01-03,000001.SZ,-0.318157,-0.12711
3,2000-01-03,000002.SZ,0.557226,-0.952104
4,2000-01-04,000001.SZ,0.667385,0.392625
5,2000-01-04,000002.SZ,-0.653972,-0.036554


In [17]:
# Outer join: keep every key found in either frame.
pd.merge(left=df1, right=df2, how="outer", on=["trade_date", "code"])

Unnamed: 0,trade_date,code,close,open
0,2000-01-01,000001.SZ,-0.972513,
1,2000-01-01,000002.SZ,-0.25339,
2,2000-01-02,000001.SZ,0.604554,0.24739
3,2000-01-02,000002.SZ,0.952528,1.427145
4,2000-01-03,000001.SZ,-0.318157,-0.12711
5,2000-01-03,000002.SZ,0.557226,-0.952104
6,2000-01-04,000001.SZ,0.667385,0.392625
7,2000-01-04,000002.SZ,-0.653972,-0.036554
8,2000-01-05,000001.SZ,,-0.437633
9,2000-01-05,000002.SZ,,-0.042132


In [18]:
# Left join: keep every key of the left frame (df1).
pd.merge(left=df1, right=df2, how="left", on=["trade_date", "code"])

Unnamed: 0,trade_date,code,close,open
0,2000-01-01,000001.SZ,-0.972513,
1,2000-01-01,000002.SZ,-0.25339,
2,2000-01-02,000001.SZ,0.604554,0.24739
3,2000-01-02,000002.SZ,0.952528,1.427145
4,2000-01-03,000001.SZ,-0.318157,-0.12711
5,2000-01-03,000002.SZ,0.557226,-0.952104
6,2000-01-04,000001.SZ,0.667385,0.392625
7,2000-01-04,000002.SZ,-0.653972,-0.036554


In [19]:
# Right join: keep every key of the right frame (df2).
pd.merge(left=df1, right=df2, how="right", on=["trade_date", "code"])

Unnamed: 0,trade_date,code,close,open
0,2000-01-02,000001.SZ,0.604554,0.24739
1,2000-01-02,000002.SZ,0.952528,1.427145
2,2000-01-03,000001.SZ,-0.318157,-0.12711
3,2000-01-03,000002.SZ,0.557226,-0.952104
4,2000-01-04,000001.SZ,0.667385,0.392625
5,2000-01-04,000002.SZ,-0.653972,-0.036554
6,2000-01-05,000001.SZ,,-0.437633
7,2000-01-05,000002.SZ,,-0.042132


更详细的操作请参考官方文档：https://pandas.pydata.org/docs/user_guide/merging.html