In [42]:
import pandas as pd
from datetime import datetime
from datetime import timedelta

In [5]:
# Capture the current local date and time, then echo it as the cell output.
now = datetime.now()
now

datetime.datetime(2022, 11, 24, 19, 26, 6, 408863)

In [14]:
# A datetime exposes its calendar components as plain attributes.
now.year, now.month, now.day

(2022, 11, 24)

In [12]:
# Year, month and day of `now`, displayed as a tuple.
now.year,now.month, now.day

(2022, 11, 24)

In [18]:
# Subtracting one datetime from another produces a timedelta.
later = datetime(2022, 11, 24)
earlier = datetime(2021, 5, 5, 15, 15)
delta = later - earlier
delta

datetime.timedelta(days=567, seconds=31500)

In [19]:
# Whole-day part of the difference.
delta.days

567

In [20]:
# Leftover seconds within the last partial day -- NOT the total duration
# (use delta.total_seconds() for that).
delta.seconds

31500

In [22]:
# Adding a timedelta shifts a datetime forward; here by 12 days.
start = datetime(2021, 5, 5)
start + timedelta(days=12)

datetime.datetime(2021, 5, 17, 0, 0)

In [24]:
# A timedelta can be scaled by an integer: step back two 12-day spans.
start - 2 * timedelta(days=12)

datetime.datetime(2021, 4, 11, 0, 0)

### 字符串和datetime的相互转换

#### datetime转换成字符串

In [27]:
# str() renders a datetime as text, here '2021-05-05 00:00:00'.
stamp = datetime(2021, 5, 5)
str(stamp)

'2021-05-05 00:00:00'

In [28]:
# Format with an explicit pattern: %Y = 4-digit year, %m = month, %d = day.
stamp.strftime('%Y-%m-%d')


'2021-05-05'

#### 字符串转换成datetime

In [31]:
# strptime goes the other way: parse text into a datetime using format codes.
value = '2021-05-05'
datetime.strptime(value, '%Y-%m-%d')

datetime.datetime(2021, 5, 5, 0, 0)

In [32]:
# Parse several month/day/year strings with one shared format.
datestrs = ['7/6/2011', '8/6/2011']
us_date_format = '%m/%d/%Y'
[datetime.strptime(text, us_date_format) for text in datestrs]

[datetime.datetime(2011, 7, 6, 0, 0), datetime.datetime(2011, 8, 6, 0, 0)]

datetime.strptime是通过已知格式进行日期解析的最佳方式。但是每次都要编写格式定义是很麻烦
的事情，尤其是对于一些常见的日期格式。这种情况下，你可以用dateutil这个第三方包中的
parser.parse方法（安装pandas时会自动安装dateutil）：

In [35]:
# dateutil's parser works out the format from the string itself,
# so no explicit format pattern has to be supplied.
from dateutil.parser import parse
parse('2011-01-03')

datetime.datetime(2011, 1, 3, 0, 0)

In [36]:
# Free-form text with a month name and a 12-hour clock is understood too.
parse('Jan 31, 1998 10:45 PM')

datetime.datetime(1998, 1, 31, 22, 45)

在国际通用的格式中，日出现在月的前面很普遍，传入dayfirst=True即可解决这个问题：

In [38]:
# dayfirst=True reads '6.12.2011' as 6 December rather than June 12.
parse('6.12.2011', dayfirst=True)

datetime.datetime(2011, 12, 6, 0, 0)

In [43]:
# pandas usually works with whole groups of dates at once:
# to_datetime converts a list of strings into a DatetimeIndex.
datestrs = ['2021-05-09 12:00:00','1998-10-01 12:00:00']
pd.to_datetime(datestrs)

DatetimeIndex(['2021-05-09 12:00:00', '1998-10-01 12:00:00'], dtype='datetime64[ns]', freq=None)

它还可以处理缺失值（如None）：

In [45]:
# A None entry is converted to NaT in the resulting DatetimeIndex.
idx = pd.to_datetime(datestrs +[None])
idx

DatetimeIndex(['2021-05-09 12:00:00', '1998-10-01 12:00:00', 'NaT'], dtype='datetime64[ns]', freq=None)

In [46]:
# The third element is the missing-value marker NaT.
idx[2]

NaT

In [47]:
# Boolean mask that is True wherever the index holds NaT.
pd.isnull(idx)

array([False, False,  True])

NaT（Not a Time）是pandas中时间戳数据的null值。

### 时间序列基础

pandas最基本的时间序列类型就是以时间戳（通常以Python字符串或datetime对象表示）为索引
的Series：

In [55]:
from datetime import datetime
import numpy as np

In [53]:
# Six irregularly spaced dates in January 2022.
dates = [
    datetime(2022, 1, day)
    for day in (2, 5, 7, 11, 18, 20)
]

In [56]:
# Six standard-normal draws labelled by `dates`.
# NOTE(review): no random seed is set in the visible cells, so the values
# shown in the outputs will not be reproduced on a fresh run.
ts = pd.Series(np.random.randn(6), index=dates)

In [57]:
# Display the series.
ts

2022-01-02   -1.622742
2022-01-05    0.155666
2022-01-07    1.324892
2022-01-11   -1.472709
2022-01-18   -0.068213
2022-01-20   -0.370113
dtype: float64

In [58]:
# The list of datetime objects was converted to a DatetimeIndex.
ts.index

DatetimeIndex(['2022-01-02', '2022-01-05', '2022-01-07', '2022-01-11',
               '2022-01-18', '2022-01-20'],
              dtype='datetime64[ns]', freq=None)

跟其他Series一样，不同索引的时间序列之间的算术运算会自动按日期对齐：

In [63]:
# ts[::2] keeps every second row; the addition aligns on dates, so dates
# missing from the right-hand side come out as NaN.
ts + ts[::2]

2022-01-02   -3.245485
2022-01-05         NaN
2022-01-07    2.649784
2022-01-11         NaN
2022-01-18   -0.136425
2022-01-20         NaN
dtype: float64

In [64]:
# Index values are stored as NumPy datetime64 at nanosecond resolution.
ts.index.dtype

dtype('<M8[ns]')

In [68]:
# A single element of a DatetimeIndex is a pandas Timestamp.
stamp = ts.index[0]
stamp

Timestamp('2022-01-02 00:00:00')

只要有需要，Timestamp可以随时自动转换为datetime对象。此外，它还可以存储频率信息（如果
有的话），且知道如何执行时区转换以及其他操作。稍后将对此进行详细讲解。

## 索引、选取、子集构造

In [129]:
# Look up one value using a Timestamp taken from the index as the label.
stamp = ts.index[2]
ts[stamp]

1.3248918369387133

In [130]:
# Show which Timestamp `stamp` currently holds.
stamp

Timestamp('2022-01-07 00:00:00')

还有一种更为方便的用法：传入一个可以被解释为日期的字符串


In [77]:
# A date string is parsed and matched against the index.
ts['2022-01-05']

0.1556658601825808

In [79]:
# A different spelling of the same date selects the same row.
ts['2022/1/5']

0.1556658601825808

对于较长的时间序列，只需传入“年”或“年月”即可轻松选取数据的切片:

In [84]:
# 1000 consecutive daily observations starting on 2000-01-01.
daily_index = pd.date_range('1/1/2000', periods=1000)
longer_ts = pd.Series(np.random.randn(1000), index=daily_index)
longer_ts

2000-01-01    2.688546
2000-01-02   -0.448028
2000-01-03    1.090136
2000-01-04   -1.008766
2000-01-05    1.333529
                ...   
2002-09-22   -1.482377
2002-09-23    1.066108
2002-09-24   -1.470787
2002-09-25   -1.066910
2002-09-26    0.798489
Freq: D, Length: 1000, dtype: float64

In [85]:
# A bare year string selects every row in that year.
longer_ts['2001']

2001-01-01   -1.081483
2001-01-02    0.173587
2001-01-03    0.828907
2001-01-04    0.621203
2001-01-05   -0.023710
                ...   
2001-12-27   -0.673518
2001-12-28    0.944483
2001-12-29   -0.021893
2001-12-30    1.605607
2001-12-31    0.447632
Freq: D, Length: 365, dtype: float64

In [86]:
# Total number of observations in the full series.
longer_ts.shape

(1000,)

In [87]:
# A 'year-month' string selects every row in that month.
longer_ts['2001-05']

2001-05-01   -0.172704
2001-05-02    0.772071
2001-05-03    1.630154
2001-05-04    1.338632
2001-05-05    0.201156
2001-05-06    2.139703
2001-05-07   -0.388343
2001-05-08   -1.043599
2001-05-09    0.756941
2001-05-10   -0.698600
2001-05-11    0.301018
2001-05-12   -0.306102
2001-05-13    0.244888
2001-05-14    0.157064
2001-05-15   -1.312037
2001-05-16   -0.920206
2001-05-17    1.088420
2001-05-18   -0.916746
2001-05-19    0.552032
2001-05-20   -0.457060
2001-05-21    0.512361
2001-05-22    0.890011
2001-05-23   -0.799416
2001-05-24    0.747376
2001-05-25    1.607080
2001-05-26   -0.593533
2001-05-27   -0.253000
2001-05-28    1.355293
2001-05-29    0.482863
2001-05-30    1.731761
2001-05-31    0.241620
Freq: D, dtype: float64

In [88]:
# Open-ended slice: keep everything from the given datetime onward.
# NOTE(review): every date in `ts` is in 2022, so a 2011 start returns the
# whole series; presumably datetime(2022, 1, 7) was intended -- confirm.
ts[datetime(2011,1,7):]

2022-01-02   -1.622742
2022-01-05    0.155666
2022-01-07    1.324892
2022-01-11   -1.472709
2022-01-18   -0.068213
2022-01-20   -0.370113
dtype: float64

在进行范围查询之前，先回顾一下ts的内容：

In [90]:
# Show the series again for reference before slicing it by range.
ts


2022-01-02   -1.622742
2022-01-05    0.155666
2022-01-07    1.324892
2022-01-11   -1.472709
2022-01-18   -0.068213
2022-01-20   -0.370113
dtype: float64

由于大部分时间序列数据都是按照时间先后排序的，因此你也可以用不存在于该时间序列中的时间
戳对其进行切片（即范围查询）：

In [132]:
# A slice bound need not be an index entry ('2011/6/1' is not);
# the end label (2022-01-07) is included in the result.
ts['2011/6/1':'2022-01-07 00:00:00']

2022-01-02   -1.622742
2022-01-05    0.155666
2022-01-07    1.324892
dtype: float64

跟之前一样，你可以传入字符串日期、datetime或Timestamp。注意，这样切片所产生的是原时间
序列的视图，跟NumPy数组的切片运算是一样的。
这意味着，没有数据被复制，对切片进行修改会反映到原始数据上（在启用了写时复制
Copy-on-Write的pandas版本中除外，此时修改切片不会影响原始数据）。
此外，还有一个等价的实例方法truncate也可以截取两个日期之间的时间序列：

In [95]:
# Drop everything after 2022-01-18; the cutoff date itself is kept.
ts.truncate(after = '2022/1/18')

2022-01-02   -1.622742
2022-01-05    0.155666
2022-01-07    1.324892
2022-01-11   -1.472709
2022-01-18   -0.068213
dtype: float64

上面这些操作对DataFrame也有效。例如，对DataFrame的行进行索引：

In [133]:
# 100 weekly timestamps, each on a Wednesday, counted from 2022-01-01 onward.
dates = pd.date_range(start='2022/1/1', freq='W-WED', periods=100)
dates

DatetimeIndex(['2022-01-05', '2022-01-12', '2022-01-19', '2022-01-26',
               '2022-02-02', '2022-02-09', '2022-02-16', '2022-02-23',
               '2022-03-02', '2022-03-09', '2022-03-16', '2022-03-23',
               '2022-03-30', '2022-04-06', '2022-04-13', '2022-04-20',
               '2022-04-27', '2022-05-04', '2022-05-11', '2022-05-18',
               '2022-05-25', '2022-06-01', '2022-06-08', '2022-06-15',
               '2022-06-22', '2022-06-29', '2022-07-06', '2022-07-13',
               '2022-07-20', '2022-07-27', '2022-08-03', '2022-08-10',
               '2022-08-17', '2022-08-24', '2022-08-31', '2022-09-07',
               '2022-09-14', '2022-09-21', '2022-09-28', '2022-10-05',
               '2022-10-12', '2022-10-19', '2022-10-26', '2022-11-02',
               '2022-11-09', '2022-11-16', '2022-11-23', '2022-11-30',
               '2022-12-07', '2022-12-14', '2022-12-21', '2022-12-28',
               '2023-01-04', '2023-01-11', '2023-01-18', '2023-01-25',
      

In [98]:
# 100 x 4 frame of random values, indexed by `dates`, one column per state.
state_names = ['Colorado', 'Texas', 'New York', 'Ohio']
long_df = pd.DataFrame(
    np.random.randn(100, 4),
    index=dates,
    columns=state_names,
)

In [100]:
# Display the frame (pandas elides the middle rows in the output).
long_df

Unnamed: 0,Colorado,Texas,New York,Ohio
2022-01-05,-0.488573,-0.037909,-0.616836,0.138221
2022-01-12,0.044128,-1.553364,0.091441,-0.468735
2022-01-19,-1.890373,-1.533116,-0.213571,-0.283070
2022-01-26,-0.759586,0.750122,0.237385,0.722104
2022-02-02,0.158307,1.749732,0.222498,1.805427
...,...,...,...,...
2023-11-01,0.026888,-0.205883,-0.806361,0.805693
2023-11-08,1.239165,-0.466644,0.387494,0.730094
2023-11-15,0.294777,1.252454,0.245323,0.189063
2023-11-22,0.720570,-0.495632,0.278254,0.831433


In [99]:
# A 'month-year' string passed to .loc selects all rows in May 2022.
long_df.loc['5-2022']

Unnamed: 0,Colorado,Texas,New York,Ohio
2022-05-04,1.616759,-0.156337,-0.342254,-0.102106
2022-05-11,-0.17934,1.029836,-1.139,-0.196827
2022-05-18,-0.954412,-1.007934,-0.781335,0.474228
2022-05-25,-1.089094,0.135177,-0.747347,0.994612


# 带有重复索引的时间序列

In [103]:
# Index in which 2000-01-02 appears three times.
dates = pd.DatetimeIndex([
    '1/1/2000',
    '1/2/2000', '1/2/2000', '1/2/2000',
    '1/3/2000',
])

In [106]:
# Values 0..4 labelled by the index that contains repeated dates.
dup_ts = pd.Series(np.arange(5), index=dates)
dup_ts

2000-01-01    0
2000-01-02    1
2000-01-02    2
2000-01-02    3
2000-01-03    4
dtype: int32

通过检查索引的is_unique属性，我们就可以知道它是不是唯一的:

In [108]:
# False here because 2000-01-02 occurs more than once in the index.
dup_ts.index.is_unique

False

对这个时间序列进行索引，要么产生标量值，要么产生切片，具体要看所选的时间点是否重复：

In [116]:
dup_ts['2000/1/3'] # not duplicated: a label that occurs once yields a scalar

4

In [120]:

dup_ts['2000/1/2'] #  duplicated: a repeated label yields a Series of all matches

2000-01-02    1
2000-01-02    2
2000-01-02    3
dtype: int32

假设你想要对具有非唯一时间戳的数据进行聚合。一个办法是使用groupby，并传入level=0：


In [126]:
# level=0 groups by the index values themselves, i.e. by timestamp.
grouped = dup_ts.groupby(level = 0)

In [124]:
# Average the values that share a timestamp.
grouped.mean()

2000-01-01    0
2000-01-02    2
2000-01-03    4
dtype: int32

# 日期的范围、频率以及移动