# Indexing pandas time series

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Reading and slicing times

In [11]:
# Load the CSV with default settings: 'Date' is kept as an ordinary column
# and the rows receive a default integer index.
df = pd.read_csv(filepath_or_buffer='data/temperature_with_date.csv')
df.head(n=5)

Unnamed: 0,Temperature (deg F),Dew Point (deg F),Pressure (atm),Date
0,70.1,50.0,0.0,2014-01-01 00:00:00
1,70.1,50.0,0.0,2014-01-01 01:00:00
2,70.2,50.1,0.0,2014-01-01 02:00:00
3,70.2,50.1,0.0,2014-01-01 03:00:00
4,70.2,50.1,0.0,2014-01-01 04:00:00


In [12]:
# Re-read the file, this time using the 'Date' column as the row index and
# asking pandas to parse that index into datetimes.
df = pd.read_csv(
    'data/temperature_with_date.csv',
    parse_dates=True,
    index_col='Date',
)
df.head(n=5)

Unnamed: 0_level_0,Temperature (deg F),Dew Point (deg F),Pressure (atm)
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2014-01-01 00:00:00,70.1,50.0,0.0
2014-01-01 01:00:00,70.1,50.0,0.0
2014-01-01 02:00:00,70.2,50.1,0.0
2014-01-01 03:00:00,70.2,50.1,0.0
2014-01-01 04:00:00,70.2,50.1,0.0


In [14]:
# Look up one row by its timestamp label; the result is a Series holding
# that row's column values.
df.loc['2014-01-01 01:00:00']

Temperature (deg F)    70.1
Dew Point (deg F)      50.0
Pressure (atm)          0.0
Name: 2014-01-01 01:00:00, dtype: float64

# Creating and using a DatetimeIndex

In [34]:
# NOTE(review): `datetime` is not used by this cell; kept in case a later
# cell relies on it.
from datetime import datetime

# Build 100 hourly timestamps and render them as strings in a non-ISO layout
# ('YYYYMMDD HH:MM'), giving deliberately awkward input for the parsing step.
# An explicit Hour offset is used instead of the upper-case 'H' string alias,
# which recent pandas releases deprecate.
dates = (
    pd.date_range('2014-01-01 00:00:00', periods=100, freq=pd.offsets.Hour())
    .strftime('%Y%m%d %H:%M')
    .tolist()
)
dates[0:5]

['20140101 00:00',
 '20140101 01:00',
 '20140101 02:00',
 '20140101 03:00',
 '20140101 04:00']

In [40]:
# Parse the strings into a proper DatetimeIndex. The format has to describe
# the strings as they are actually written ('20140101 00:00' corresponds to
# '%Y%m%d %H:%M'); a format that does not match the input is rejected when
# pandas parses strictly.
datetimes = pd.to_datetime(dates, format='%Y%m%d %H:%M')
datetimes[0:5]

DatetimeIndex(['2014-01-01 00:00:00', '2014-01-01 01:00:00',
               '2014-01-01 02:00:00', '2014-01-01 03:00:00',
               '2014-01-01 04:00:00'],
              dtype='datetime64[ns]', freq=None)

In [43]:
# Build a Series indexed by the DatetimeIndex. A locally seeded generator
# makes the random values identical on every run (without touching NumPy's
# global random state), and the sample count follows the index length
# rather than a hard-coded 100.
series = pd.Series(np.random.RandomState(0).randn(len(datetimes)), index=datetimes)
series.head()

2014-01-01 00:00:00    1.179114
2014-01-01 01:00:00    1.144815
2014-01-01 02:00:00   -0.139964
2014-01-01 03:00:00    0.394016
2014-01-01 04:00:00    1.326695
dtype: float64

# Partial string indexing and slicing

In [45]:
# Show the first five entries again as a reference for the lookups below.
series.head(n=5)

2014-01-01 00:00:00    1.179114
2014-01-01 01:00:00    1.144815
2014-01-01 02:00:00   -0.139964
2014-01-01 03:00:00    0.394016
2014-01-01 04:00:00    1.326695
dtype: float64

In [47]:
# A fully specified timestamp string selects a single element and returns
# its scalar value.
series.loc['2014-01-01 02:00:00']

-0.13996423306414421

In [48]:
# A date-only string selects every entry that falls on that day
# (here all 24 hourly values of 2014-01-01).
series.loc['2014-01-01']

2014-01-01 00:00:00    1.179114
2014-01-01 01:00:00    1.144815
2014-01-01 02:00:00   -0.139964
2014-01-01 03:00:00    0.394016
2014-01-01 04:00:00    1.326695
2014-01-01 05:00:00    0.846242
2014-01-01 06:00:00   -0.284385
2014-01-01 07:00:00    0.132625
2014-01-01 08:00:00   -1.489304
2014-01-01 09:00:00    0.845247
2014-01-01 10:00:00   -0.190809
2014-01-01 11:00:00   -0.052209
2014-01-01 12:00:00    1.838720
2014-01-01 13:00:00   -0.726570
2014-01-01 14:00:00    1.089657
2014-01-01 15:00:00   -2.598899
2014-01-01 16:00:00   -0.752987
2014-01-01 17:00:00    0.432658
2014-01-01 18:00:00   -2.153558
2014-01-01 19:00:00   -1.261777
2014-01-01 20:00:00   -0.054899
2014-01-01 21:00:00    1.733527
2014-01-01 22:00:00    0.288773
2014-01-01 23:00:00   -0.865243
dtype: float64

In [49]:
# Slicing with timestamp strings includes both endpoints: the 05:00 entry
# is part of the result.
series.loc['2014-01-01 01:00:00':'2014-01-01 05:00:00']

2014-01-01 01:00:00    1.144815
2014-01-01 02:00:00   -0.139964
2014-01-01 03:00:00    0.394016
2014-01-01 04:00:00    1.326695
2014-01-01 05:00:00    0.846242
dtype: float64

# Reindexing the Index

* Reindexing is useful in preparation for adding or otherwise combining two time series data sets. 
* To reindex the data, we provide a new index and ask pandas to try and match the old data to the new index. 
* If data is unavailable for one of the new index dates or times, you must tell pandas how to fill it in. 
* Otherwise, pandas will fill with NaN by default.

In [56]:
# Twenty consecutive daily values 0..19, starting on 2016-07-01.
ts1 = pd.Series(range(20), index=pd.date_range('2016-07-01', periods=20))
ts1

2016-07-01     0
2016-07-02     1
2016-07-03     2
2016-07-04     3
2016-07-05     4
2016-07-06     5
2016-07-07     6
2016-07-08     7
2016-07-09     8
2016-07-10     9
2016-07-11    10
2016-07-12    11
2016-07-13    12
2016-07-14    13
2016-07-15    14
2016-07-16    15
2016-07-17    16
2016-07-18    17
2016-07-19    18
2016-07-20    19
Freq: D, dtype: int64

In [57]:
# Twenty consecutive daily values 0..19, starting ten days later (2016-07-11),
# so the second half of ts1 overlaps the first half of ts2.
ts2 = pd.Series(range(20), index=pd.date_range('2016-07-11', periods=20))
ts2

2016-07-11     0
2016-07-12     1
2016-07-13     2
2016-07-14     3
2016-07-15     4
2016-07-16     5
2016-07-17     6
2016-07-18     7
2016-07-19     8
2016-07-20     9
2016-07-21    10
2016-07-22    11
2016-07-23    12
2016-07-24    13
2016-07-25    14
2016-07-26    15
2016-07-27    16
2016-07-28    17
2016-07-29    18
2016-07-30    19
Freq: D, dtype: int64

In [59]:
# Conform ts2 to ts1's dates: labels that ts2 does not contain come back
# as NaN (the default when no fill method is given).
ts3 = ts2.reindex(index=ts1.index)
ts3

2016-07-01    NaN
2016-07-02    NaN
2016-07-03    NaN
2016-07-04    NaN
2016-07-05    NaN
2016-07-06    NaN
2016-07-07    NaN
2016-07-08    NaN
2016-07-09    NaN
2016-07-10    NaN
2016-07-11    0.0
2016-07-12    1.0
2016-07-13    2.0
2016-07-14    3.0
2016-07-15    4.0
2016-07-16    5.0
2016-07-17    6.0
2016-07-18    7.0
2016-07-19    8.0
2016-07-20    9.0
Freq: D, dtype: float64

In [60]:
# Same reindex with forward fill. The dates before ts2's first entry have no
# earlier value to carry forward, so they remain NaN.
ts4 = ts2.reindex(index=ts1.index, method='ffill')
ts4

2016-07-01    NaN
2016-07-02    NaN
2016-07-03    NaN
2016-07-04    NaN
2016-07-05    NaN
2016-07-06    NaN
2016-07-07    NaN
2016-07-08    NaN
2016-07-09    NaN
2016-07-10    NaN
2016-07-11    0.0
2016-07-12    1.0
2016-07-13    2.0
2016-07-14    3.0
2016-07-15    4.0
2016-07-16    5.0
2016-07-17    6.0
2016-07-18    7.0
2016-07-19    8.0
2016-07-20    9.0
Freq: D, dtype: float64

In [61]:
# Same reindex with backward fill. The dates before ts2's first entry take
# the next available value, i.e. ts2's first value (0).
ts5 = ts2.reindex(index=ts1.index, method='bfill')
ts5

2016-07-01    0
2016-07-02    0
2016-07-03    0
2016-07-04    0
2016-07-05    0
2016-07-06    0
2016-07-07    0
2016-07-08    0
2016-07-09    0
2016-07-10    0
2016-07-11    0
2016-07-12    1
2016-07-13    2
2016-07-14    3
2016-07-15    4
2016-07-16    5
2016-07-17    6
2016-07-18    7
2016-07-19    8
2016-07-20    9
Freq: D, dtype: int64

In [62]:
# Addition aligns on the union of both indexes; dates present in only one
# of the two series produce NaN.
ts1.add(ts2)

2016-07-01     NaN
2016-07-02     NaN
2016-07-03     NaN
2016-07-04     NaN
2016-07-05     NaN
2016-07-06     NaN
2016-07-07     NaN
2016-07-08     NaN
2016-07-09     NaN
2016-07-10     NaN
2016-07-11    10.0
2016-07-12    12.0
2016-07-13    14.0
2016-07-14    16.0
2016-07-15    18.0
2016-07-16    20.0
2016-07-17    22.0
2016-07-18    24.0
2016-07-19    26.0
2016-07-20    28.0
2016-07-21     NaN
2016-07-22     NaN
2016-07-23     NaN
2016-07-24     NaN
2016-07-25     NaN
2016-07-26     NaN
2016-07-27     NaN
2016-07-28     NaN
2016-07-29     NaN
2016-07-30     NaN
Freq: D, dtype: float64

In [63]:
# ts3 already shares ts1's index, so the result keeps those 20 dates; the
# NaN entries of ts3 propagate into the sum.
ts1.add(ts3)

2016-07-01     NaN
2016-07-02     NaN
2016-07-03     NaN
2016-07-04     NaN
2016-07-05     NaN
2016-07-06     NaN
2016-07-07     NaN
2016-07-08     NaN
2016-07-09     NaN
2016-07-10     NaN
2016-07-11    10.0
2016-07-12    12.0
2016-07-13    14.0
2016-07-14    16.0
2016-07-15    18.0
2016-07-16    20.0
2016-07-17    22.0
2016-07-18    24.0
2016-07-19    26.0
2016-07-20    28.0
Freq: D, dtype: float64

In [64]:
# The forward-filled series still has NaN on its first ten dates, so the sum
# is NaN there as well.
ts1.add(ts4)

2016-07-01     NaN
2016-07-02     NaN
2016-07-03     NaN
2016-07-04     NaN
2016-07-05     NaN
2016-07-06     NaN
2016-07-07     NaN
2016-07-08     NaN
2016-07-09     NaN
2016-07-10     NaN
2016-07-11    10.0
2016-07-12    12.0
2016-07-13    14.0
2016-07-14    16.0
2016-07-15    18.0
2016-07-16    20.0
2016-07-17    22.0
2016-07-18    24.0
2016-07-19    26.0
2016-07-20    28.0
Freq: D, dtype: float64

In [65]:
# The backward-filled series has no gaps, so every date gets a value and the
# result stays integer-typed.
ts1.add(ts5)

2016-07-01     0
2016-07-02     1
2016-07-03     2
2016-07-04     3
2016-07-05     4
2016-07-06     5
2016-07-07     6
2016-07-08     7
2016-07-09     8
2016-07-10     9
2016-07-11    10
2016-07-12    12
2016-07-13    14
2016-07-14    16
2016-07-15    18
2016-07-16    20
2016-07-17    22
2016-07-18    24
2016-07-19    26
2016-07-20    28
Freq: D, dtype: int64