# 时间数据重采样

## resample

In [1]:
import pandas as pd
import numpy as np

date_rng = pd.date_range('20170101', periods=100, freq='D')
ser_obj = pd.Series(range(len(date_rng)), index=date_rng)
print(ser_obj.head(10))

2017-01-01    0
2017-01-02    1
2017-01-03    2
2017-01-04    3
2017-01-05    4
2017-01-06    5
2017-01-07    6
2017-01-08    7
2017-01-09    8
2017-01-10    9
Freq: D, dtype: int32


In [2]:
# 统计每个月的数据总和
resample_month_sum = ser_obj.resample('M').sum()
# 统计每个月的数据平均
resample_month_mean = ser_obj.resample('M').mean()

print('按月求和：', resample_month_sum)
print('按月求均值：', resample_month_mean)

按月求和： 2017-01-31     465
2017-02-28    1246
2017-03-31    2294
2017-04-30     945
Freq: M, dtype: int32
按月求均值： 2017-01-31    15.0
2017-02-28    44.5
2017-03-31    74.0
2017-04-30    94.5
Freq: M, dtype: float64


## 降采样

In [3]:
# 将数据聚合到5天的频率
five_day_sum_sample = ser_obj.resample('5D').sum()
five_day_mean_sample = ser_obj.resample('5D').mean()
five_day_ohlc_sample = ser_obj.resample('5D').ohlc()

print('降采样，sum')
print(five_day_sum_sample)

降采样，sum
2017-01-01     10
2017-01-06     35
2017-01-11     60
2017-01-16     85
2017-01-21    110
2017-01-26    135
2017-01-31    160
2017-02-05    185
2017-02-10    210
2017-02-15    235
2017-02-20    260
2017-02-25    285
2017-03-02    310
2017-03-07    335
2017-03-12    360
2017-03-17    385
2017-03-22    410
2017-03-27    435
2017-04-01    460
2017-04-06    485
Freq: 5D, dtype: int32


In [4]:
print('降采样，mean')
print(five_day_mean_sample)

降采样，mean
2017-01-01     2
2017-01-06     7
2017-01-11    12
2017-01-16    17
2017-01-21    22
2017-01-26    27
2017-01-31    32
2017-02-05    37
2017-02-10    42
2017-02-15    47
2017-02-20    52
2017-02-25    57
2017-03-02    62
2017-03-07    67
2017-03-12    72
2017-03-17    77
2017-03-22    82
2017-03-27    87
2017-04-01    92
2017-04-06    97
Freq: 5D, dtype: int32


In [6]:
# 使用groupby降采样
print(ser_obj.groupby(lambda x: x.month).sum())

1     465
2    1246
3    2294
4     945
dtype: int32


In [7]:
print(ser_obj.groupby(lambda x: x.weekday).sum())

0    750
1    665
2    679
3    693
4    707
5    721
6    735
dtype: int32


## 升采样

In [16]:
df = pd.DataFrame(np.random.randn(5, 3),
                 index=pd.date_range('20170101', periods=5, freq='W-MON'),
                 columns=['S1', 'S2', 'S3'])
print(df)

                  S1        S2        S3
2017-01-02 -1.144930  2.056606  1.280437
2017-01-09  0.665305 -1.167066  0.652643
2017-01-16  1.725148  0.310244  0.577190
2017-01-23 -0.155053 -0.563262  0.588269
2017-01-30  0.352137 -0.223055  0.638517


In [18]:
# 直接重采样会产生空值
print(df.resample('D').asfreq())

                  S1        S2        S3
2017-01-02 -1.144930  2.056606  1.280437
2017-01-03       NaN       NaN       NaN
2017-01-04       NaN       NaN       NaN
2017-01-05       NaN       NaN       NaN
2017-01-06       NaN       NaN       NaN
2017-01-07       NaN       NaN       NaN
2017-01-08       NaN       NaN       NaN
2017-01-09  0.665305 -1.167066  0.652643
2017-01-10       NaN       NaN       NaN
2017-01-11       NaN       NaN       NaN
2017-01-12       NaN       NaN       NaN
2017-01-13       NaN       NaN       NaN
2017-01-14       NaN       NaN       NaN
2017-01-15       NaN       NaN       NaN
2017-01-16  1.725148  0.310244  0.577190
2017-01-17       NaN       NaN       NaN
2017-01-18       NaN       NaN       NaN
2017-01-19       NaN       NaN       NaN
2017-01-20       NaN       NaN       NaN
2017-01-21       NaN       NaN       NaN
2017-01-22       NaN       NaN       NaN
2017-01-23 -0.155053 -0.563262  0.588269
2017-01-24       NaN       NaN       NaN
2017-01-25      

In [21]:
#ffill
print(df.resample('D').ffill(2))

                  S1        S2        S3
2017-01-02 -1.144930  2.056606  1.280437
2017-01-03 -1.144930  2.056606  1.280437
2017-01-04 -1.144930  2.056606  1.280437
2017-01-05       NaN       NaN       NaN
2017-01-06       NaN       NaN       NaN
2017-01-07       NaN       NaN       NaN
2017-01-08       NaN       NaN       NaN
2017-01-09  0.665305 -1.167066  0.652643
2017-01-10  0.665305 -1.167066  0.652643
2017-01-11  0.665305 -1.167066  0.652643
2017-01-12       NaN       NaN       NaN
2017-01-13       NaN       NaN       NaN
2017-01-14       NaN       NaN       NaN
2017-01-15       NaN       NaN       NaN
2017-01-16  1.725148  0.310244  0.577190
2017-01-17  1.725148  0.310244  0.577190
2017-01-18  1.725148  0.310244  0.577190
2017-01-19       NaN       NaN       NaN
2017-01-20       NaN       NaN       NaN
2017-01-21       NaN       NaN       NaN
2017-01-22       NaN       NaN       NaN
2017-01-23 -0.155053 -0.563262  0.588269
2017-01-24 -0.155053 -0.563262  0.588269
2017-01-25 -0.15

In [20]:
print(df.resample('D').bfill())

                  S1        S2        S3
2017-01-02 -1.144930  2.056606  1.280437
2017-01-03  0.665305 -1.167066  0.652643
2017-01-04  0.665305 -1.167066  0.652643
2017-01-05  0.665305 -1.167066  0.652643
2017-01-06  0.665305 -1.167066  0.652643
2017-01-07  0.665305 -1.167066  0.652643
2017-01-08  0.665305 -1.167066  0.652643
2017-01-09  0.665305 -1.167066  0.652643
2017-01-10  1.725148  0.310244  0.577190
2017-01-11  1.725148  0.310244  0.577190
2017-01-12  1.725148  0.310244  0.577190
2017-01-13  1.725148  0.310244  0.577190
2017-01-14  1.725148  0.310244  0.577190
2017-01-15  1.725148  0.310244  0.577190
2017-01-16  1.725148  0.310244  0.577190
2017-01-17 -0.155053 -0.563262  0.588269
2017-01-18 -0.155053 -0.563262  0.588269
2017-01-19 -0.155053 -0.563262  0.588269
2017-01-20 -0.155053 -0.563262  0.588269
2017-01-21 -0.155053 -0.563262  0.588269
2017-01-22 -0.155053 -0.563262  0.588269
2017-01-23 -0.155053 -0.563262  0.588269
2017-01-24  0.352137 -0.223055  0.638517
2017-01-25  0.35

In [23]:
print(df.resample('D').fillna('ffill'))

                  S1        S2        S3
2017-01-02 -1.144930  2.056606  1.280437
2017-01-03 -1.144930  2.056606  1.280437
2017-01-04 -1.144930  2.056606  1.280437
2017-01-05 -1.144930  2.056606  1.280437
2017-01-06 -1.144930  2.056606  1.280437
2017-01-07 -1.144930  2.056606  1.280437
2017-01-08 -1.144930  2.056606  1.280437
2017-01-09  0.665305 -1.167066  0.652643
2017-01-10  0.665305 -1.167066  0.652643
2017-01-11  0.665305 -1.167066  0.652643
2017-01-12  0.665305 -1.167066  0.652643
2017-01-13  0.665305 -1.167066  0.652643
2017-01-14  0.665305 -1.167066  0.652643
2017-01-15  0.665305 -1.167066  0.652643
2017-01-16  1.725148  0.310244  0.577190
2017-01-17  1.725148  0.310244  0.577190
2017-01-18  1.725148  0.310244  0.577190
2017-01-19  1.725148  0.310244  0.577190
2017-01-20  1.725148  0.310244  0.577190
2017-01-21  1.725148  0.310244  0.577190
2017-01-22  1.725148  0.310244  0.577190
2017-01-23 -0.155053 -0.563262  0.588269
2017-01-24 -0.155053 -0.563262  0.588269
2017-01-25 -0.15

In [24]:
print(df.resample('D').interpolate('linear'))

                  S1        S2        S3
2017-01-02 -1.144930  2.056606  1.280437
2017-01-03 -0.886325  1.596082  1.190753
2017-01-04 -0.627720  1.135557  1.101068
2017-01-05 -0.369115  0.675032  1.011383
2017-01-06 -0.110510  0.214508  0.921698
2017-01-07  0.148095 -0.246017  0.832013
2017-01-08  0.406700 -0.706542  0.742328
2017-01-09  0.665305 -1.167066  0.652643
2017-01-10  0.816711 -0.956022  0.641864
2017-01-11  0.968118 -0.744978  0.631085
2017-01-12  1.119524 -0.533933  0.620306
2017-01-13  1.270930 -0.322889  0.609527
2017-01-14  1.422336 -0.111845  0.598748
2017-01-15  1.573742  0.099200  0.587969
2017-01-16  1.725148  0.310244  0.577190
2017-01-17  1.456548  0.185457  0.578773
2017-01-18  1.187948  0.060671  0.580355
2017-01-19  0.919347 -0.064116  0.581938
2017-01-20  0.650747 -0.188903  0.583521
2017-01-21  0.382147 -0.313689  0.585104
2017-01-22  0.113547 -0.438476  0.586686
2017-01-23 -0.155053 -0.563262  0.588269
2017-01-24 -0.082597 -0.514661  0.595447
2017-01-25 -0.01