In [3]:
import numpy as np
import pandas as pd

pd.set_option('display.notebook_repr_html', False)
pd.set_option('display.max_columns', 10)
pd.set_option('display.max_rows', 7)
pd.set_option('precision', 7)

# useful for date/time manipulations
import datetime
from datetime import datetime

# And some items for matplotlib
import matplotlib.pyplot as plt
%matplotlib inline 


# Time-series data and the DatetimeIndex

In [6]:
# create a a DatetimeIndex from an array of datetime's
dates = [datetime(2014, 8, 1), datetime(2014, 8, 2)]
dti = pd.DatetimeIndex(dates)
dti

DatetimeIndex(['2014-08-01', '2014-08-02'], dtype='datetime64[ns]', freq=None)

In [8]:
# a Series given a datetime list will automatically create
# a DatetimeIndex as its index
np.random.seed(123456)
ts = pd.Series(np.random.randn(2), dates)
type(ts.index)

pandas.core.indexes.datetimes.DatetimeIndex

In [10]:
# retrieve a value using a datetime object
ts[datetime(2014, 8, 2)]

-0.2828633443286633

In [12]:
# this can also be performed with a string
ts['2014-8-2']

-0.2828633443286633

In [14]:
# create a Series with a DatetimeIndex using strings as dates
np.random.seed(123456)
dates = ['2014-08-01', '2014-08-02']
ts = pd.Series(np.random.randn(2), dates)
ts

2014-08-01    0.4691123
2014-08-02   -0.2828633
dtype: float64

In [16]:
# convert a list of items to a DatetimeIndex
dti = pd.to_datetime(['Aug 1, 2014', '2014-08-02', 
                      '2014.8.3', None])
dti

DatetimeIndex(['2014-08-01', '2014-08-02', '2014-08-03', 'NaT'], dtype='datetime64[ns]', freq=None)

In [18]:
# watch out as a failure to convert an item on the list
# to a date/time will result in the return value being a
# NumPy array instead of a DatetimeIndex
dti2 = pd.to_datetime(['Aug 1, 2014', 'foo'])
type(dti2)

ParserError: Unknown string format: foo

In [20]:
# coerce pandas to convert all to datetime and a DatetimeIndex
# substituting NaT where values can not be converted
pd.to_datetime(['Aug 1, 2014', 'foo'], coerce=True)

TypeError: to_datetime() got an unexpected keyword argument 'coerce'

In [22]:
# demonstrate two representations of the same date, one 
# month first, the other day first, converting to the 
# same date representation in pandas
dti1 = pd.to_datetime(['8/1/2014'])
dti2 = pd.to_datetime(['1/8/2014'], dayfirst=True)
dti1[0], dti2[0]

(Timestamp('2014-08-01 00:00:00'), Timestamp('2014-08-01 00:00:00'))

In [24]:
# create a Series with a DatetimeIndex starting at 8/1/2014
# and consisting of 10 consequtive days
np.random.seed(123456)
dates = pd.date_range('8/1/2014', periods=10)
s1 = pd.Series(np.random.randn(10), dates)
s1[:5]

2014-08-01    0.4691123
2014-08-02   -0.2828633
2014-08-03   -1.5090585
2014-08-04   -1.1356324
2014-08-05    1.2121120
Freq: D, dtype: float64

In [38]:
import yfinance as yf
import datetime

# Define the start and end dates
start = '2012-01-01'
end = '2013-12-30'

# Fetch the data
msft = yf.download("MSFT", start=start, end=end)

# Display the first five rows of the data
print(msft.head(5))

[*********************100%%**********************]  1 of 1 completed

                  Open        High         Low       Close   Adj Close  \
Date                                                                     
2012-01-03  26.5499992  26.9599991  26.3899994  26.7700005  21.2005119   
2012-01-04  26.8199997  27.4699993  26.7800007  27.3999996  21.6994438   
2012-01-05  27.3799992  27.7299995  27.2900009  27.6800003  21.9211941   
2012-01-06  27.5300007  28.1900005  27.5300007  28.1100006  22.2617207   
2012-01-09  28.0499992  28.1000004  27.7199993  27.7399998  21.9687099   

              Volume  
Date                  
2012-01-03  64731500  
2012-01-04  80516100  
2012-01-05  56081400  
2012-01-06  99455500  
2012-01-09  59706800  





In [40]:
# extract just the Adj Close values
msftAC = msft['Adj Close']
msftAC.head(3)

Date
2012-01-03    21.2005119
2012-01-04    21.6994438
2012-01-05    21.9211941
Name: Adj Close, dtype: float64

In [42]:
# slicing using a DatetimeIndex nicely works with dates 
# passed as strings
msft['2012-01-01':'2012-01-05']

                  Open        High         Low       Close   Adj Close  \
Date                                                                     
2012-01-03  26.5499992  26.9599991  26.3899994  26.7700005  21.2005119   
2012-01-04  26.8199997  27.4699993  26.7800007  27.3999996  21.6994438   
2012-01-05  27.3799992  27.7299995  27.2900009  27.6800003  21.9211941   

              Volume  
Date                  
2012-01-03  64731500  
2012-01-04  80516100  
2012-01-05  56081400  

In [44]:
# returns a Series representing all the values of the 
# single row indexed by the column names
msft.loc['2012-01-03']

Open         2.6549999e+01
High         2.6959999e+01
Low          2.6389999e+01
Close        2.6770000e+01
Adj Close    2.1200512e+01
Volume       6.4731500e+07
Name: 2012-01-03 00:00:00, dtype: float64

In [46]:
# this is an error as this tries to retrieve a column
# named '2012-01-03'
# msft['2012-01-03'] # commented to prevent killing the notebook

In [48]:
# this is a Series, so the lookup works
msftAC['2012-01-03']

21.200511932373047

In [54]:
# we can lookup using partial date specifications
# such as only year and month
msft.loc['2012-02'].head(5)

                  Open        High         Low       Close   Adj Close  \
Date                                                                     
2012-02-01  29.7900009  30.0499992  29.7600002  29.8899994  23.6714039   
2012-02-02  29.8999996  30.1700001  29.7099991  29.9500008  23.7189159   
2012-02-03  30.1399994  30.3999996  30.0900002  30.2399998  23.9485855   
2012-02-06  30.0400009  30.2199993  29.9699993  30.2000008  23.9169064   
2012-02-07  30.1499996  30.4899998  30.0499992  30.3500004  24.0356960   

              Volume  
Date                  
2012-02-01  67409900  
2012-02-02  52223300  
2012-02-03  41838500  
2012-02-06  28039700  
2012-02-07  39242400  

In [56]:
# slice starting at the beginning of Feb 2012 and 
# end on Feb 9 2012
msft['2012-02':'2012-02-09'][:5]

                  Open        High         Low       Close   Adj Close  \
Date                                                                     
2012-02-01  29.7900009  30.0499992  29.7600002  29.8899994  23.6714039   
2012-02-02  29.8999996  30.1700001  29.7099991  29.9500008  23.7189159   
2012-02-03  30.1399994  30.3999996  30.0900002  30.2399998  23.9485855   
2012-02-06  30.0400009  30.2199993  29.9699993  30.2000008  23.9169064   
2012-02-07  30.1499996  30.4899998  30.0499992  30.3500004  24.0356960   

              Volume  
Date                  
2012-02-01  67409900  
2012-02-02  52223300  
2012-02-03  41838500  
2012-02-06  28039700  
2012-02-07  39242400  

# Creating time-series with specific frequencies

In [59]:
# create a time-series with one minute frequency
bymin = pd.Series(np.arange(0, 90*60*24),
                  pd.date_range('2014-08-01', 
                                '2014-10-29 23:59:00',
                                freq='T'))
bymin

2014-08-01 00:00:00         0
2014-08-01 00:01:00         1
2014-08-01 00:02:00         2
                        ...  
2014-10-29 23:57:00    129597
2014-10-29 23:58:00    129598
2014-10-29 23:59:00    129599
Freq: T, Length: 129600, dtype: int64

In [61]:
# slice at the minute level
bymin['2014-08-01 12:30':'2014-08-01 12:59']

2014-08-01 12:30:00    750
2014-08-01 12:31:00    751
2014-08-01 12:32:00    752
                      ... 
2014-08-01 12:57:00    777
2014-08-01 12:58:00    778
2014-08-01 12:59:00    779
Freq: T, Length: 30, dtype: int64

# Representing intervals of time using periods

In [64]:
# create a period representing a start of 
# 2014-08 and for a duration of one month
aug2014 = pd.Period('2014-08', freq='M')
aug2014

Period('2014-08', 'M')

In [66]:
# pandas determined the following start and end
# for the period
aug2014.start_time, aug2014.end_time

(Timestamp('2014-08-01 00:00:00'), Timestamp('2014-08-31 23:59:59.999999999'))

In [68]:
# what is the one month period following the given period?
sep2014 = aug2014 + 1
sep2014

Period('2014-09', 'M')

In [70]:
# the calculated start and end are
sep2014.start_time, sep2014.end_time

(Timestamp('2014-09-01 00:00:00'), Timestamp('2014-09-30 23:59:59.999999999'))

In [72]:
# create a pandas PeriodIndex
mp2013 = pd.period_range('1/1/2013', '12/31/2013', freq='M')
mp2013

PeriodIndex(['2013-01', '2013-02', '2013-03', '2013-04', '2013-05', '2013-06',
             '2013-07', '2013-08', '2013-09', '2013-10', '2013-11', '2013-12'],
            dtype='period[M]')

In [78]:
# dump all the calculated periods
for p in mp2013: 
    print ("{0} {1} {2} {3}".format(p, 
                                   p.freq, 
                                   p.start_time, 
                                   p.end_time))

2013-01 <MonthEnd> 2013-01-01 00:00:00 2013-01-31 23:59:59.999999999
2013-02 <MonthEnd> 2013-02-01 00:00:00 2013-02-28 23:59:59.999999999
2013-03 <MonthEnd> 2013-03-01 00:00:00 2013-03-31 23:59:59.999999999
2013-04 <MonthEnd> 2013-04-01 00:00:00 2013-04-30 23:59:59.999999999
2013-05 <MonthEnd> 2013-05-01 00:00:00 2013-05-31 23:59:59.999999999
2013-06 <MonthEnd> 2013-06-01 00:00:00 2013-06-30 23:59:59.999999999
2013-07 <MonthEnd> 2013-07-01 00:00:00 2013-07-31 23:59:59.999999999
2013-08 <MonthEnd> 2013-08-01 00:00:00 2013-08-31 23:59:59.999999999
2013-09 <MonthEnd> 2013-09-01 00:00:00 2013-09-30 23:59:59.999999999
2013-10 <MonthEnd> 2013-10-01 00:00:00 2013-10-31 23:59:59.999999999
2013-11 <MonthEnd> 2013-11-01 00:00:00 2013-11-30 23:59:59.999999999
2013-12 <MonthEnd> 2013-12-01 00:00:00 2013-12-31 23:59:59.999999999


In [80]:
# and now create a Series using the PeriodIndex
np.random.seed(123456)
ps = pd.Series(np.random.randn(12), mp2013)
ps

2013-01    0.4691123
2013-02   -0.2828633
2013-03   -1.5090585
             ...    
2013-10   -2.1045692
2013-11   -0.4949293
2013-12    1.0718038
Freq: M, Length: 12, dtype: float64

# Shifting and lagging time-series data

In [83]:
# refresh our memory on the data in the MSFT closing prices Series
msftAC[:5]

Date
2012-01-03    21.2005119
2012-01-04    21.6994438
2012-01-05    21.9211941
2012-01-06    22.2617207
2012-01-09    21.9687099
Name: Adj Close, dtype: float64

In [85]:
# shift the prices one index position forward
shifted_forward = msftAC.shift(1)
shifted_forward[:5]

Date
2012-01-03           NaN
2012-01-04    21.2005119
2012-01-05    21.6994438
2012-01-06    21.9211941
2012-01-09    22.2617207
Name: Adj Close, dtype: float64

In [87]:
# the last item is also shifted away 
msftAC.tail(5), shifted_forward.tail(5)

(Date
 2013-12-20    30.8747978
 2013-12-23    30.7237720
 2013-12-24    31.1097050
 2013-12-26    31.4117527
 2013-12-27    31.2859097
 Name: Adj Close, dtype: float64,
 Date
 2013-12-20    30.4133492
 2013-12-23    30.8747978
 2013-12-24    30.7237720
 2013-12-26    31.1097050
 2013-12-27    31.4117527
 Name: Adj Close, dtype: float64)

In [89]:
# shift backwards 2 index labels
shifted_backwards = msftAC.shift(-2)
shifted_backwards[:5]

Date
2012-01-03    21.9211941
2012-01-04    22.2617207
2012-01-05    21.9687099
2012-01-06    22.0478973
2012-01-09    21.9528694
Name: Adj Close, dtype: float64

In [91]:
# this has resulted in 2 NaN values at 
# the end of the resulting Series
shifted_backwards.tail(5)

Date
2013-12-20    31.1097050
2013-12-23    31.4117527
2013-12-24    31.2859097
2013-12-26           NaN
2013-12-27           NaN
Name: Adj Close, dtype: float64

In [93]:
# shift by a different frequency does not realign
# and ends up essentially changing the index labels by
# the specific amount of time
msftAC.shift(1, freq="S")

Date
2012-01-03 00:00:01    21.2005119
2012-01-04 00:00:01    21.6994438
2012-01-05 00:00:01    21.9211941
                          ...    
2013-12-24 00:00:01    31.1097050
2013-12-26 00:00:01    31.4117527
2013-12-27 00:00:01    31.2859097
Name: Adj Close, Length: 500, dtype: float64

In [107]:
# Shift the index labels by one day using shift with DateOffset
msftAC_shifted = msftAC.shift(periods=1, freq=pd.DateOffset(days=1))

# Display the first few rows of the shifted series
print(msftAC_shifted.head(5))

Date
2012-01-04    21.2005119
2012-01-05    21.6994438
2012-01-06    21.9211941
2012-01-07    22.2617207
2012-01-10    21.9687099
Name: Adj Close, dtype: float64


In [109]:
# calculate the percentage change in closing price
msftAC / msftAC.shift(1) - 1

Date
2012-01-03          NaN
2012-01-04    0.0235340
2012-01-05    0.0102192
                ...    
2013-12-24    0.0125614
2013-12-26    0.0097091
2013-12-27   -0.0040062
Name: Adj Close, Length: 500, dtype: float64

# Frequency conversion of time-series data

In [112]:
# take a two item sample of the msftAC data for demonstrations
sample = msftAC[:2]
sample

Date
2012-01-03    21.2005119
2012-01-04    21.6994438
Name: Adj Close, dtype: float64

In [114]:
# demonstrate resampling to hour intervals
# realignment causes many NaN's
sample.asfreq("H")

Date
2012-01-03 00:00:00    21.2005119
2012-01-03 01:00:00           NaN
2012-01-03 02:00:00           NaN
                          ...    
2012-01-03 22:00:00           NaN
2012-01-03 23:00:00           NaN
2012-01-04 00:00:00    21.6994438
Freq: H, Name: Adj Close, Length: 25, dtype: float64

In [116]:
# fill NaN's with the last know non-NaN valuen
sample.asfreq("H", method="ffill")

Date
2012-01-03 00:00:00    21.2005119
2012-01-03 01:00:00    21.2005119
2012-01-03 02:00:00    21.2005119
                          ...    
2012-01-03 22:00:00    21.2005119
2012-01-03 23:00:00    21.2005119
2012-01-04 00:00:00    21.6994438
Freq: H, Name: Adj Close, Length: 25, dtype: float64

In [118]:
# fill with the "next known" value
sample.asfreq("H", method="bfill")

Date
2012-01-03 00:00:00    21.2005119
2012-01-03 01:00:00    21.6994438
2012-01-03 02:00:00    21.6994438
                          ...    
2012-01-03 22:00:00    21.6994438
2012-01-03 23:00:00    21.6994438
2012-01-04 00:00:00    21.6994438
Freq: H, Name: Adj Close, Length: 25, dtype: float64

## Up and down resampling of time-series

In [121]:
# calculate the cumulative daily returns for MSFT
msft_cum_ret = (1 + (msftAC / msftAC.shift() - 1)).cumprod()
msft_cum_ret

Date
2012-01-03          NaN
2012-01-04    1.0235340
2012-01-05    1.0339936
                ...    
2013-12-24    1.4674035
2013-12-26    1.4816507
2013-12-27    1.4757148
Name: Adj Close, Length: 500, dtype: float64

In [123]:
# resample to a monthly cumulative return
msft_monthly_cum_ret = msft_cum_ret.resample("M")
msft_monthly_cum_ret

<pandas.core.resample.DatetimeIndexResampler object at 0x325f62f10>

In [125]:
# verify the monthly average for 2012-01
msft_cum_ret['2012-01'].mean()

1.0686747873246352

In [131]:
# Calculate cumulative returns
msft['Cumulative Return'] = (msft['Adj Close'] / msft['Adj Close'].iloc[0]) - 1

# Resample cumulative returns on a monthly basis and take the mean (default behavior)
msft_cum_ret = msft['Cumulative Return']
msft_cum_ret_resampled = msft_cum_ret.resample("M").mean()

print(msft_cum_ret_resampled)

Date
2012-01-31    0.0652410
2012-02-29    0.1556976
2012-03-31    0.2105697
                ...    
2013-10-31    0.3503986
2013-11-30    0.4719148
2013-12-31    0.4827126
Freq: M, Name: Cumulative Return, Length: 24, dtype: float64


In [135]:
# Resample to monthly and get open, high, low, close
msft_monthly_ohlc = msft.resample("M").ohlc()

# Display the first 5 rows
print(msft_monthly_ohlc.head(5))

                  Open                                            High  ...  \
                  open        high         low       close        open  ...   
Date                                                                    ...   
2012-01-31  26.5499992  29.6599998  26.5499992  29.6599998  26.9599991  ...   
2012-02-29  29.7900009  31.8899994  29.7900009  31.8899994  30.0499992  ...   
2012-03-31  31.9300003  32.9099998  31.5400009  32.4000015  32.3899994  ...   
2012-04-30  32.2200012  32.3100014  30.4300003  31.9799995  32.4599991  ...   
2012-05-31  32.0499992  32.0499992  29.1000004  29.2999992  32.3400002  ...   

              Volume Cumulative Return                                   
               close              open       high        low      close  
Date                                                                     
2012-01-31  50572400         0.0000000  0.1105718  0.0000000  0.1031005  
2012-02-29  59323600         0.1165487  0.1983492  0.1165487  0.1934614

In [139]:
# Calculate cumulative returns
msft['Cumulative Return'] = (msft['Adj Close'] / msft['Adj Close'].iloc[0]) - 1

# Resample to monthly periods and calculate the mean
by_periods = msft['Cumulative Return'].resample('M', kind='period').mean()

# Print the start and end times along with the mean values
for i in by_periods.index[:5]:
    print(f"{i.start_time}:{i.end_time} {by_periods[i]}")

2012-01-01 00:00:00:2012-01-31 23:59:59.999999999 0.06524104795840355
2012-02-01 00:00:00:2012-02-29 23:59:59.999999999 0.155697562629249
2012-03-01 00:00:00:2012-03-31 23:59:59.999999999 0.21056972125754314
2012-04-01 00:00:00:2012-04-30 23:59:59.999999999 0.18464369968477326
2012-05-01 00:00:00:2012-05-31 23:59:59.999999999 0.14051601482311069


In [141]:
# upsampling will be demonstrated using the second
# and third values (first is NaN)
sample = msft_cum_ret[1:3]
sample

Date
2012-01-04    0.0235340
2012-01-05    0.0339936
Name: Cumulative Return, dtype: float64

In [143]:
# upsampling this will have a lot of NaN's
by_hour = sample.resample("H")
by_hour

<pandas.core.resample.DatetimeIndexResampler object at 0x3273abbd0>

In [145]:
by_hour.interpolate()

Date
2012-01-04 00:00:00    0.0235340
2012-01-04 01:00:00    0.0239698
2012-01-04 02:00:00    0.0244056
                         ...    
2012-01-04 22:00:00    0.0331220
2012-01-04 23:00:00    0.0335578
2012-01-05 00:00:00    0.0339936
Freq: H, Name: Cumulative Return, Length: 25, dtype: float64