In [21]:
import pandas as pd
import numpy as np

from pytz import all_timezones

In [8]:
# Weekly timestamps from which the calendar/clock features are derived
df = pd.DataFrame({'date': pd.date_range('1/1/2001', periods=150, freq='W')})

# Add year, month, day, hour and minute features via the .dt accessor
df = df.assign(
    year=df['date'].dt.year,
    month=df['date'].dt.month,
    day=df['date'].dt.day,
    hour=df['date'].dt.hour,
    minute=df['date'].dt.minute,
)

df.head()

Unnamed: 0,date,year,month,day,hour,minute
0,2001-01-07,2001,1,7,0,0
1,2001-01-14,2001,1,14,0,0
2,2001-01-21,2001,1,21,0,0
3,2001-01-28,2001,1,28,0,0
4,2001-02-04,2001,2,4,0,0


In [13]:
# Compute the difference between two dates
df = pd.DataFrame()

# Two datetime features
df['Arrived'] = [pd.Timestamp('01-01-2017'), pd.Timestamp('01-04-2017')]
df['Left'] = [pd.Timestamp('01-01-2017'), pd.Timestamp('01-06-2017')]

# Duration between the two features as a timedelta Series. It is computed once
# and reused; a bare expression here would not be rendered because it is not
# the cell's last expression.
stay_duration = df['Left'] - df['Arrived']

# Whole days of each duration, read with the vectorized .dt accessor instead
# of iterating over the individual timedeltas in Python
stay_duration.dt.days

0    0
1    2
dtype: int64

In [15]:
# Convert strings to timestamps

# Raw date strings, written day-month-year with a 12-hour clock and AM/PM marker
raw_dates = [
    '03-04-2005 11:35 PM',
    '23-05-2010 12:01 AM',
    '04-09-2009 09:09 PM',
]
date_strings = np.array(raw_dates)
date_strings

array(['03-04-2005 11:35 PM', '23-05-2010 12:01 AM',
       '04-09-2009 09:09 PM'], dtype='<U19')

In [20]:
# Convert to datetime: parse the whole array in one vectorized call, then
# materialize the result as a list of Timestamps. errors='coerce' turns
# strings that do not match the format into NaT instead of raising.
list(pd.to_datetime(date_strings, format='%d-%m-%Y %I:%M %p', errors='coerce'))

[Timestamp('2005-04-03 23:35:00'),
 Timestamp('2010-05-23 00:01:00'),
 Timestamp('2009-09-04 21:09:00')]

#### 时区

In [30]:
# Converting time zones

# First ten time zone names provided by pytz. This is not the cell's last
# expression, so the notebook does not render it.
all_timezones[0:10]

# Create ten month-end dates. An offset object is passed instead of the 'M'
# string alias, which pandas deprecated in 2.2 (it was renamed to 'ME'); the
# object form is accepted by both older and newer pandas versions.
dates = pd.Series(pd.date_range('3/3/2002', periods=10, freq=pd.offsets.MonthEnd()))
dates.head()

0   2002-03-31
1   2002-04-30
2   2002-05-31
3   2002-06-30
4   2002-07-31
dtype: datetime64[ns]

In [31]:
# Attach a time zone to the naive datetimes in `dates`
dates_with_abidjan_time_zone = dates.dt.tz_localize('Africa/Abidjan')
dates_with_abidjan_time_zone.head()

0   2002-03-31 00:00:00+00:00
1   2002-04-30 00:00:00+00:00
2   2002-05-31 00:00:00+00:00
3   2002-06-30 00:00:00+00:00
4   2002-07-31 00:00:00+00:00
dtype: datetime64[ns, Africa/Abidjan]

In [32]:
# Convert the time-zone-aware series to another time zone
dates_with_london_time_zone = dates_with_abidjan_time_zone.dt.tz_convert('Europe/London')

# Preview the converted pandas Series
dates_with_london_time_zone.head()

0   2002-03-31 00:00:00+00:00
1   2002-04-30 01:00:00+01:00
2   2002-05-31 01:00:00+01:00
3   2002-06-30 01:00:00+01:00
4   2002-07-31 01:00:00+01:00
dtype: datetime64[ns, Europe/London]

#### 编码星期几

In [33]:
# Three month-end dates. An offset object is passed instead of the 'M' string
# alias, which pandas deprecated in 2.2 (it was renamed to 'ME'); the object
# form is accepted by both older and newer pandas versions.
dates = pd.Series(pd.date_range('2/2/2002', periods=3, freq=pd.offsets.MonthEnd()))

# Show the data
dates

0   2002-02-28
1   2002-03-31
2   2002-04-30
dtype: datetime64[ns]

In [40]:
# Day of the week of each date as an integer (pandas counts Monday=0 ... Sunday=6)
dates.dt.weekday

0    3
1    6
2    1
dtype: int64

#### 处理时间序列中的缺失值

In [41]:
# Five month-end dates for the index. An offset object is passed instead of
# the 'M' string alias, which pandas deprecated in 2.2 (it was renamed to
# 'ME'); the object form is accepted by both older and newer pandas versions.
time_index = pd.date_range('01/01/2010', periods=5, freq=pd.offsets.MonthEnd())

# Build the frame on that index with one feature containing missing values
df = pd.DataFrame({'Sales': [1.0, 2.0, np.nan, np.nan, 5.0]}, index=time_index)

# Fill the missing values by interpolation (pandas defaults to linear)
df.interpolate()

Unnamed: 0,Sales
2010-01-31,1.0
2010-02-28,2.0
2010-03-31,3.0
2010-04-30,4.0
2010-05-31,5.0


In [42]:
# Forward fill: replace each missing value with the last preceding valid value
df.ffill()

Unnamed: 0,Sales
2010-01-31,1.0
2010-02-28,2.0
2010-03-31,2.0
2010-04-30,2.0
2010-05-31,5.0


In [43]:
# Backward fill: replace each missing value with the next following valid value
df.bfill()

Unnamed: 0,Sales
2010-01-31,1.0
2010-02-28,2.0
2010-03-31,5.0
2010-04-30,5.0
2010-05-31,5.0


In [46]:
# Interpolate missing values, filling at most one consecutive gap entry and
# working in the forward direction
df.interpolate(limit=1, limit_direction='forward')

Unnamed: 0,Sales
2010-01-31,1.0
2010-02-28,2.0
2010-03-31,3.0
2010-04-30,
2010-05-31,5.0


#### 平移时间特征

In [52]:
# Daily prices; each row is then paired with the following row's price
df = pd.DataFrame({
    'date': pd.date_range('2/2/2021', periods=5, freq='D'),
    'stock_price': [1.1, 2.2, 3.3, 4.4, 5.5],
})

# shift(-1) moves values up by one row, so the last row has no successor (NaN)
df = df.assign(next_stock_price=df['stock_price'].shift(-1))
df

Unnamed: 0,date,stock_price,next_stock_price
0,2021-02-02,1.1,2.2
1,2021-02-03,2.2,3.3
2,2021-02-04,3.3,4.4
3,2021-02-05,4.4,5.5
4,2021-02-06,5.5,


#### 滑动窗口

In [58]:
# Daily DatetimeIndex carrying a single price column
index = pd.date_range('2/2/2021', periods=5, freq='D')
df = pd.DataFrame({'stock_price': [1.1, 2.2, 3.3, 4.4, 5.5]}, index=index)

# Mean over a sliding window of two rows; the first row has no complete window
two_row_window = df.rolling(window=2)
two_row_window.mean()

Unnamed: 0,stock_price
2021-02-02,
2021-02-03,1.65
2021-02-04,2.75
2021-02-05,3.85
2021-02-06,4.95


In [61]:
# Maximum value within each rolling window of two rows
df.rolling(window=2).max()

Unnamed: 0,stock_price
2021-02-02,
2021-02-03,2.2
2021-02-04,3.3
2021-02-05,4.4
2021-02-06,5.5


#### 选择日期时间范围

In [65]:
df = pd.DataFrame()

# Create 1000 hourly datetimes. An offset object is passed instead of the 'H'
# string alias, which pandas deprecated in 2.2 (it was renamed to 'h'); the
# object form is accepted by both older and newer pandas versions.
df['date'] = pd.date_range('1/1/2002', periods=1000, freq=pd.offsets.Hour())

# Keep rows strictly after 01:00 and up to and including 04:00 on 2002-01-01
df[(df['date'] > '2002-1-1 01:00:00') & (df['date'] <= '2002-1-1 04:00:00')]

Unnamed: 0,date
2,2002-01-01 02:00:00
3,2002-01-01 03:00:00
4,2002-01-01 04:00:00


In [66]:
# Use the date values as the index. Passing the Series itself (rather than the
# column name) leaves 'date' in place as a regular column as well.
df = df.set_index(df['date'])

# Select the observations between two datetimes; the .loc label slice
# includes both endpoints
df.loc['2002-1-1 01:00:00':'2002-1-1 04:00:00']

Unnamed: 0_level_0,date
date,Unnamed: 1_level_1
2002-01-01 01:00:00,2002-01-01 01:00:00
2002-01-01 02:00:00,2002-01-01 02:00:00
2002-01-01 03:00:00,2002-01-01 03:00:00
2002-01-01 04:00:00,2002-01-01 04:00:00
