# Chapter 7 Handling Dates and Times

In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Show the value of every top-level expression in a cell, not only the last
# one, so a single cell can display several results one after another.
InteractiveShell.ast_node_interactivity = 'all'

import numpy as np
import pandas as pd

## 7.1 Converting Strings to Dates

In [2]:
# Day-first date strings with a 12-hour clock and an AM/PM marker
date_strings = np.array([
    '03-04-2005 11:35 PM',
    '23-05-2010 12:01 AM',
    '04-09-2009 09:09 PM',
])
# %d-%m-%Y = day-month-year, %I:%M = 12-hour clock, %p = AM/PM
DATE_FORMAT = "%d-%m-%Y %I:%M %p"

# Converting: parse each string into a Timestamp using the explicit format
[pd.to_datetime(text, format=DATE_FORMAT) for text in date_strings]
# errors='coerce' does not raise on a string that fails to parse; it yields NaT
[pd.to_datetime(text, format=DATE_FORMAT, errors='coerce') for text in date_strings]

[Timestamp('2005-04-03 23:35:00'),
 Timestamp('2010-05-23 00:01:00'),
 Timestamp('2009-09-04 21:09:00')]

[Timestamp('2005-04-03 23:35:00'),
 Timestamp('2010-05-23 00:01:00'),
 Timestamp('2009-09-04 21:09:00')]

For a complete list of Python string time-format codes, see [strftime.org](https://strftime.org).

## 7.2 Handling Time Zones

In [3]:
# Attach a time zone while constructing the timestamp, via the tz argument
date = pd.Timestamp('2017-05-01 06:00:00', tz='Europe/London')
date

# Or start from a naive timestamp and attach the zone with tz_localize
# (the wall-clock time is kept; only the zone information is added)
date = pd.Timestamp('2017-05-01 06:00:00')
date_in_london = date.tz_localize('Europe/London')
date_in_london

# Or convert an aware timestamp to another zone with tz_convert
# (same instant, different wall-clock time)
date_in_abidjan = date_in_london.tz_convert('Africa/Abidjan')
date_in_abidjan

# Vectorized: the .dt accessor applies tz_localize to every element of a Series.
# A MonthEnd offset object is passed instead of the 'M' string alias, which is
# deprecated since pandas 2.2; the offset object yields the same month-end dates
# on both older and newer pandas versions.
dates = pd.Series(pd.date_range('2/2/2022', periods=3, freq=pd.offsets.MonthEnd()))
dates.dt.tz_localize('Europe/London')

Timestamp('2017-05-01 06:00:00+0100', tz='Europe/London')

Timestamp('2017-05-01 06:00:00+0100', tz='Europe/London')

Timestamp('2017-05-01 05:00:00+0000', tz='Africa/Abidjan')

0   2022-02-28 00:00:00+00:00
1   2022-03-31 00:00:00+01:00
2   2022-04-30 00:00:00+01:00
dtype: datetime64[ns, Europe/London]

In [4]:
# To see every time zone name that pytz knows about
from pytz import all_timezones
timezone_count = len(all_timezones)
first_two_zones = all_timezones[:2]
print(timezone_count)
print(first_two_zones)

593
['Africa/Abidjan', 'Africa/Accra']


## 7.3 Selecting Dates and Times

In [5]:
# 100,000 hourly timestamps starting at 2021-01-01 00:00.
# An Hour offset object is passed instead of the 'H' string alias, which is
# deprecated since pandas 2.2; the offset object works on older and newer pandas.
df = pd.DataFrame()
df['date'] = pd.date_range('1/1/2021', periods=100000, freq=pd.offsets.Hour())
# Using a boolean mask: the strict inequalities exclude both end points
df[(df['date'] > '2022-1-1 01:00:00') & (df['date'] < '2022-1-1 04:00:00')]
# Or make the dates the index and slice by label with loc;
# label slicing includes both end points
df = df.set_index(df['date'])
df.loc['2022-1-1 01:00:00':'2022-1-1 04:00:00']

Unnamed: 0,date
8762,2022-01-01 02:00:00
8763,2022-01-01 03:00:00


Unnamed: 0_level_0,date
date,Unnamed: 1_level_1
2022-01-01 01:00:00,2022-01-01 01:00:00
2022-01-01 02:00:00,2022-01-01 02:00:00
2022-01-01 03:00:00,2022-01-01 03:00:00
2022-01-01 04:00:00,2022-01-01 04:00:00


## 7.4 Breaking Up Dates Data into Multiple Features

In [6]:
# 150 weekly timestamps
df = pd.DataFrame()
df['date'] = pd.date_range(start='1/1/2001', periods=150, freq='W')
# The .dt accessor exposes each calendar component of the datetime column;
# every component becomes its own feature column.
parts = df['date'].dt
df = df.assign(
    year=parts.year,
    month=parts.month,
    day=parts.day,
    hour=parts.hour,
    minute=parts.minute,
)
df.head()

Unnamed: 0,date,year,month,day,hour,minute
0,2001-01-07,2001,1,7,0,0
1,2001-01-14,2001,1,14,0,0
2,2001-01-21,2001,1,21,0,0
3,2001-01-28,2001,1,28,0,0
4,2001-02-04,2001,2,4,0,0


## 7.5 Calculating the Difference Between Dates

In [7]:
# Two visits, each with an arrival and a departure timestamp
arrivals = [pd.Timestamp(day) for day in ('01-01-2017', '01-04-2017')]
departures = [pd.Timestamp(day) for day in ('01-01-2017', '01-06-2017')]
df = pd.DataFrame()
df['Arrived'] = arrivals
df['Left'] = departures
# Subtracting one datetime column from another gives a timedelta column
stay = df['Left'] - df['Arrived']
stay
# The .dt.days accessor reduces each timedelta to its whole number of days
stay.dt.days

0   0 days
1   2 days
dtype: timedelta64[ns]

0    0
1    2
dtype: int64

## 7.6 Encoding Days of the Week

In [8]:
# Three month-end dates. A MonthEnd offset object is passed instead of the 'M'
# string alias, which is deprecated since pandas 2.2; the dates are unchanged.
dates = pd.Series(pd.date_range('2/2/2022', periods=3, freq=pd.offsets.MonthEnd()))
# The name of the weekday
dates.dt.day_name()
# The weekday as a number, where Monday is 0
dates.dt.weekday

0      Monday
1    Thursday
2    Saturday
dtype: object

0    0
1    3
2    5
dtype: int64

## 7.7 Creating a Lagged Feature

In [9]:
# Five consecutive days of prices
df = pd.DataFrame()
df['dates'] = pd.date_range(start='1/1/2021', periods=5, freq='D')
df['stock_price'] = [1.1, 2.2, 3.3, 4.4, 5.5]
# Shift the prices down by one row so each row carries the previous row's
# value; the first row has no predecessor and becomes NaN.
prices = df['stock_price']
df['previous_day_stock_price'] = prices.shift(periods=1)
df

Unnamed: 0,dates,stock_price,previous_day_stock_price
0,2021-01-01,1.1,
1,2021-01-02,2.2,1.1
2,2021-01-03,3.3,2.2
3,2021-01-04,4.4,3.3
4,2021-01-05,5.5,4.4


## 7.8 Using Rolling Time Windows

In [10]:
# Five month-end timestamps as the index. A MonthEnd offset object is passed
# instead of the 'M' string alias, which is deprecated since pandas 2.2.
time_index = pd.date_range('1/1/2021', periods=5, freq=pd.offsets.MonthEnd())
df = pd.DataFrame(index=time_index)
df['stock_price'] = [1, 2, 3, 4, 5]
# Roll over the price column only, so columns added earlier in this cell are
# not needlessly averaged as well. The first (window - 1) rows do not have
# enough observations and are NaN.
df['window=2'] = df['stock_price'].rolling(window=2).mean()
df['window=3'] = df['stock_price'].rolling(window=3).mean()
df

Unnamed: 0,stock_price,window=2,window=3
2021-01-31,1,,
2021-02-28,2,1.5,
2021-03-31,3,2.5,2.0
2021-04-30,4,3.5,3.0
2021-05-31,5,4.5,4.0


## 7.9 Handling Missing Data in Time Series

In [11]:
# Five month-end timestamps as the index. A MonthEnd offset object is passed
# instead of the 'M' string alias, which is deprecated since pandas 2.2.
time_index = pd.date_range('1/1/2021', periods=5, freq=pd.offsets.MonthEnd())
df = pd.DataFrame(index=time_index)
df['sales'] = [1.0, 2.0, np.nan, np.nan, 5.0]
# Every strategy below is applied to the 'sales' column directly, so the
# columns added earlier in this cell are not reprocessed each time.
sales = df['sales']
# Linear interpolation between the known neighbours
df['interpolate'] = sales.interpolate()
# Carry the last known value forward / the next known value backward
df['ffill'] = sales.ffill()
df['bfill'] = sales.bfill()
# Quadratic interpolation (pandas delegates this method to SciPy)
df['interpolate_quadratic'] = sales.interpolate(method='quadratic')
# Fill at most one consecutive missing value, working forward
df['interpolate_limit'] = sales.interpolate(limit=1, limit_direction='forward')
df

Unnamed: 0,sales,interpolate,ffill,bfill,interpolate_quadratic,interpolate_limit
2021-01-31,1.0,1.0,1.0,1.0,1.0,1.0
2021-02-28,2.0,2.0,2.0,2.0,2.0,2.0
2021-03-31,,3.0,2.0,5.0,3.059808,3.0
2021-04-30,,4.0,2.0,5.0,4.038069,
2021-05-31,5.0,5.0,5.0,5.0,5.0,5.0
