In [2]:
import pandas as pd
import numpy as np

In [66]:
from pytz import all_timezones
import warnings 
# Suppress every warning for the rest of the session so cell output stays tidy.
# NOTE(review): this blanket filter also hides deprecation/future warnings from
# pandas — confirm that silencing those is intended.
warnings.filterwarnings("ignore")

<h2>Converting Strings to Date</h2>

In [17]:
# Day-first date strings using a 12-hour clock with an AM/PM marker.
date_string = np.array(['03-04-2005 11:35 AM',
                        '23-05-2012 12:01 PM',
                        '09-11-2024 6:00 PM'])
# Parse the whole array in one vectorized call (instead of one to_datetime call
# per element); .tolist() yields a plain list of Timestamp objects.
pd.to_datetime(date_string, format="%d-%m-%Y %I:%M %p").tolist()

[Timestamp('2005-04-03 11:35:00'),
 Timestamp('2012-05-23 12:01:00'),
 Timestamp('2024-11-09 18:00:00')]

In [19]:
# errors="coerce" turns any string that fails to parse into NaT instead of raising.
# The array is parsed in a single vectorized call and returned as a list of Timestamps.
pd.to_datetime(date_string, format="%d-%m-%Y %I:%M %p", errors="coerce").tolist()

[Timestamp('2005-04-03 11:35:00'),
 Timestamp('2012-05-23 12:01:00'),
 Timestamp('2024-11-09 18:00:00')]

<div style="font-size:18px;font-family:Calibri">
    If <code>errors="coerce"</code>, then any problem that occurs will not raise an error (the default behavior) but will instead set the value causing the error to <code>NaT</code> (a missing value). This allows you to deal with outliers by filling them with null values, as opposed to troubleshooting errors for individual records in the data.
</div>

<h2>Handling Time Zones</h2>

In [30]:
# Build a timestamp that carries a time zone from the start by passing tz.
pd.Timestamp("2017-05-01 06:00:00", tz = "Europe/London")

Timestamp('2017-05-01 06:00:00+0100', tz='Europe/London')

In [44]:
# Start from a timestamp created without a tz argument ...
date = pd.Timestamp("2017-05-01 06:00:00")
# ... then attach the Europe/London zone to it with tz_localize.
date_in_london = date.tz_localize("Europe/London")
date_in_london

Timestamp('2017-05-01 06:00:00+0100', tz='Europe/London')

In [48]:
# Convert the London-localized timestamp to the Africa/Abidjan zone.
date_in_london.tz_convert("Africa/Abidjan")

Timestamp('2017-05-01 05:00:00+0000', tz='Africa/Abidjan')

In [62]:
# Three month-end dates ("ME" frequency) generated from 2 Feb 2002, wrapped in a Series.
dates = pd.Series(pd.date_range("2/2/2002", periods = 3, freq = "ME"))
dates

0   2002-02-28
1   2002-03-31
2   2002-04-30
dtype: datetime64[ns]

In [64]:
# Attach a time zone to every element at once through the .dt accessor.
dates.dt.tz_localize("Africa/Abidjan")

0   2002-02-28 00:00:00+00:00
1   2002-03-31 00:00:00+00:00
2   2002-04-30 00:00:00+00:00
dtype: datetime64[ns, Africa/Abidjan]

In [92]:
# Report how many zone names the pytz all_timezones list contains.
print(f"There are {len(all_timezones)} different timezones.")

There are 596 different timezones.


In [94]:
# Peek at the first five zone names.
all_timezones[:5]

['Africa/Abidjan',
 'Africa/Accra',
 'Africa/Addis_Ababa',
 'Africa/Algiers',
 'Africa/Asmara']

<h2>Selecting Dates and Times</h2>

In [101]:
# 100,000 hourly timestamps starting 2001-01-01, stored in a single "date" column.
# The lowercase "h" alias replaces the deprecated uppercase "H" spelling and is
# consistent with the newer-style "ME" alias this notebook uses for month ends.
df = pd.DataFrame({"date": pd.date_range("1/1/2001", periods=100000, freq="h")})
df.head()

Unnamed: 0,date
0,2001-01-01 00:00:00
1,2001-01-01 01:00:00
2,2001-01-01 02:00:00
3,2001-01-01 03:00:00
4,2001-01-01 04:00:00
5,2001-01-01 05:00:00
6,2001-01-01 06:00:00
7,2001-01-01 07:00:00
8,2001-01-01 08:00:00
9,2001-01-01 09:00:00


In [109]:
# Keep rows strictly after 01:00 and up to and including 04:00 on 1 Jan 2002
# (left bound exclusive, right bound inclusive).
df[df["date"].between("2002/1/1 01:00:00", "2002/1/1 04:00:00", inclusive="right")]

Unnamed: 0,date
8762,2002-01-01 02:00:00
8763,2002-01-01 03:00:00
8764,2002-01-01 04:00:00


In [113]:
# Promote the "date" column to the index while also keeping it as a regular column,
# then select a label range with .loc.
df = df.set_index("date", drop=False)
df.loc["2002/1/1 01:00:00":"2002/1/1 04:00:00"]

Unnamed: 0_level_0,date
date,Unnamed: 1_level_1
2002-01-01 01:00:00,2002-01-01 01:00:00
2002-01-01 02:00:00,2002-01-01 02:00:00
2002-01-01 03:00:00,2002-01-01 03:00:00
2002-01-01 04:00:00,2002-01-01 04:00:00


In [117]:
# Fresh frame whose only column, "date", holds 150 weekly ("W") timestamps
# generated from 1 Jan 2001.
df = pd.DataFrame({"date": pd.date_range("1/1/2001", periods=150, freq="W")})
df.head()

Unnamed: 0,date
0,2001-01-07
1,2001-01-14
2,2001-01-21
3,2001-01-28
4,2001-02-04


In [131]:
# Add one column per calendar/clock component, each read from the .dt accessor
# of the "date" column under the attribute of the same name.
for part in ("year", "month", "day", "hour", "minute"):
    df[part] = getattr(df["date"].dt, part)

In [133]:
# Show the first rows of the frame.
df.head()

Unnamed: 0,date,year,month,day,hour,minute
0,2001-01-07,2001,1,7,0,0
1,2001-01-14,2001,1,14,0,0
2,2001-01-21,2001,1,21,0,0
3,2001-01-28,2001,1,28,0,0
4,2001-02-04,2001,2,4,0,0


<h2>Calculating the Differences Between Dates</h2>

In [136]:
# Start again from an empty frame.
df = pd.DataFrame()

In [142]:
# Two records, each with an arrival and a departure timestamp, added as columns.
df["Arrived"] = [pd.Timestamp("01-01-2017"), pd.Timestamp("01-04-2017")]
df["Left"] = [pd.Timestamp("01-01-2017"), pd.Timestamp("01-06-2017")]

In [144]:
# Display the frame.
df

Unnamed: 0,Arrived,Left
0,2017-01-01,2017-01-01
1,2017-01-04,2017-01-06


In [146]:
# Subtract the arrival column from the departure column to get each record's duration.
df["Left"] - df["Arrived"]

0   0 days
1   2 days
dtype: timedelta64[ns]

In [152]:
# Whole-day length of each stay, read through the vectorized .dt accessor of the
# timedelta Series instead of looping over individual Timedelta objects in Python.
(df["Left"] - df["Arrived"]).dt.days

0    0
1    2
dtype: int64

<h2>Encoding Days of the Week</h2>

In [165]:
# Three month-start dates ("MS" frequency) beginning 1 Jan 2001, wrapped in a Series.
dates = pd.Series(pd.date_range("1/1/2001", periods = 3, freq="MS"))

In [167]:
# Name of the weekday for each date.
dates.dt.day_name()

0      Monday
1    Thursday
2    Thursday
dtype: object

In [171]:
# Weekday as an integer; in the output recorded for this cell, Monday maps to 0.
dates.dt.weekday

0    0
1    3
2    3
dtype: int32

<h2>Creating a Lagged Feature</h2>

In [178]:
# Five consecutive daily dates paired with a price for each day.
df = pd.DataFrame(
    {
        "dates": pd.date_range("1/1/2005", periods=5, freq="D"),
        "stock_price": [1.1, 2.2, 3.4, 4.9, 5.2],
    }
)
df

Unnamed: 0,dates,stock_price
0,2005-01-01,1.1
1,2005-01-02,2.2
2,2005-01-03,3.4
3,2005-01-04,4.9
4,2005-01-05,5.2


In [188]:
# Lag the price by one row so each row carries the previous row's value;
# the first row has no predecessor and is left missing.
df["prev_day_stock_price"] = df["stock_price"].shift(1)

In [192]:
# Show the first three rows.
df.head(3)

Unnamed: 0,dates,stock_price,prev_day_stock_price
0,2005-01-01,1.1,
1,2005-01-02,2.2,1.1
2,2005-01-03,3.4,2.2


<h2>Using Rolling Time Windows</h2>
A moving average (rolling mean) is a statistic that captures the average value of a data series over a sliding window of time, smoothing out short-term fluctuations.

In [197]:
time_idx = pd.date_range("01/01/2005", periods = 5, freq = "M")
time_idx

DatetimeIndex(['2005-01-31', '2005-02-28', '2005-03-31', '2005-04-30',
               '2005-05-31'],
              dtype='datetime64[ns]', freq='ME')

In [211]:
# Price series indexed by the month-end timestamps in time_idx.
df = pd.DataFrame({"stock_price": [1, 2.1, 5.3, 4, 1.5]}, index=time_idx)

In [213]:
# Mean over a sliding window of two consecutive rows.
df.rolling(window=2).mean()

Unnamed: 0,stock_price
2005-01-31,
2005-02-28,1.55
2005-03-31,3.7
2005-04-30,4.65
2005-05-31,2.75


<h2>Handling Missing Data in Time Series</h2>

In [232]:
# Blank out the rows at positions 2 and 3 with one slice assignment.
df.iloc[2:4] = np.nan

In [234]:
# Display the frame, including the rows just set to NaN.
df

Unnamed: 0,stock_price
2005-01-31,1.0
2005-02-28,2.1
2005-03-31,
2005-04-30,
2005-05-31,1.5


In [236]:
# Fill the missing values by interpolating between the neighboring known points
# (called with no arguments, so pandas' default method is used).
df.interpolate()

Unnamed: 0,stock_price
2005-01-31,1.0
2005-02-28,2.1
2005-03-31,1.9
2005-04-30,1.7
2005-05-31,1.5


In [238]:
# Forward-fill: carry the last known value forward into the gap.
df.ffill()

Unnamed: 0,stock_price
2005-01-31,1.0
2005-02-28,2.1
2005-03-31,2.1
2005-04-30,2.1
2005-05-31,1.5


In [240]:
# Back-fill: pull the next known value backward into the gap.
df.bfill()

Unnamed: 0,stock_price
2005-01-31,1.0
2005-02-28,2.1
2005-03-31,1.5
2005-04-30,1.5
2005-05-31,1.5


In [242]:
# Interpolate with the "quadratic" method instead of the default one.
df.interpolate(method = "quadratic")

Unnamed: 0,stock_price
2005-01-31,1.0
2005-02-28,2.1
2005-03-31,2.619675
2005-04-30,2.424023
2005-05-31,1.5


In [244]:
# Interpolate in the forward direction but fill at most one consecutive missing
# value; anything beyond that limit stays NaN.
df.interpolate(limit = 1, limit_direction = "forward")

Unnamed: 0,stock_price
2005-01-31,1.0
2005-02-28,2.1
2005-03-31,1.9
2005-04-30,
2005-05-31,1.5
