# Chapter 4 - Time Series

In [81]:
import numpy as np
import pandas as pd
import datetime
from datetime import datetime
import matplotlib.pyplot as plt

## DatetimeIndex

In [64]:
#simple set up of dates
dates = [datetime(2014, 8, 1), datetime(2014, 8, 2)]
dti = pd.DatetimeIndex(dates)
dti

DatetimeIndex(['2014-08-01', '2014-08-02'], dtype='datetime64[ns]', freq=None)

In [65]:
#A Series automaticaaly passes a list of dattime object as its index
np.random.seed(123456)
ts = pd.Series(np.random.randn(2), dates)
type(ts.index)

pandas.core.indexes.datetimes.DatetimeIndex

In [66]:
#Series objects take the datetime objects and construct a DatetimeIndex from date values
#where each value is a Timetsampo object and ea element can be used to access the corresponding value in Series
ts["2014-08-01"]

0.4691122999071863

In [67]:
#convert different formatted dates to datetime
dti = pd.to_datetime(["Aug 1, 2014", "2014-08-02" ,"2014.8.2", None]) #in bounds with dates
dti

DatetimeIndex(['2014-08-01', '2014-08-02', '2014-08-02', 'NaT'], dtype='datetime64[ns]', freq=None)

In [68]:
#to datetime will return default no.array if value not found
#dti2 = pd.to_datetime(["2014-08-01", "foo"], coerce = True)
#type(dti2)

In [95]:
#to use day first (default is month first)

dti2 = pd.to_datetime("1/8/2014", dayfirst = True)

In [70]:
#a range of timestamps at a specific frequency
np.random.seed(123456)
dates = pd.date_range("8/1/2014", periods = 10)
s1 = pd.Series(np.random.randn(10), dates) #Series with random values and dates as index
s1[:5]

2014-08-01    0.469112
2014-08-02   -0.282863
2014-08-03   -1.509059
2014-08-04   -1.135632
2014-08-05    1.212112
Freq: D, dtype: float64

In [117]:
import pandas_datareader as web
msft = web.DataReader("WIKI/MSFT", "quandl", "2012-1-1", "2013-12-30")
msft.tail(3)


Unnamed: 0_level_0,Open,High,Low,Close,Volume,ExDividend,SplitRatio,AdjOpen,AdjHigh,AdjLow,AdjClose,AdjVolume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2012-02-02,29.9,30.17,29.71,29.95,52223300.0,0.0,1.0,25.461918,25.691842,25.30012,25.504497,52223300.0
2012-02-01,29.79,30.05,29.76,29.89,67409900.0,0.0,1.0,25.368246,25.589654,25.342699,25.453403,67409900.0
2012-01-31,29.66,29.7,29.23,29.53,50572400.0,0.0,1.0,25.257542,25.291604,24.891367,25.146838,50572400.0


In [118]:
msftAC = msft["AdjClose"] #pandas Series object

"""It goes backwards"""
#slicing of datetimes on a dataframe
msft["2012-02-02":"2012-01-31"] 

Unnamed: 0_level_0,Open,High,Low,Close,Volume,ExDividend,SplitRatio,AdjOpen,AdjHigh,AdjLow,AdjClose,AdjVolume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2012-02-02,29.9,30.17,29.71,29.95,52223300.0,0.0,1.0,25.461918,25.691842,25.30012,25.504497,52223300.0
2012-02-01,29.79,30.05,29.76,29.89,67409900.0,0.0,1.0,25.368246,25.589654,25.342699,25.453403,67409900.0
2012-01-31,29.66,29.7,29.23,29.53,50572400.0,0.0,1.0,25.257542,25.291604,24.891367,25.146838,50572400.0


In [110]:
msftAC = msft["AdjClose"] #pandas Series object
msft.loc["2012-01-09"]

Unnamed: 0_level_0,Open,High,Low,Close,Volume,ExDividend,SplitRatio,AdjOpen,AdjHigh,AdjLow,AdjClose,AdjVolume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2012-01-09,28.05,28.1,27.72,27.74,59706800.0,0.0,1.0,23.886515,23.929094,23.605497,23.622529,59706800.0


# Creating time-series with specific frequencies

In [89]:
#Represent time intervals other than daily frequencies
bymin = pd.Series(np.arange(0, 90*60*24), pd.date_range("2014-08", "2014-10-29 23:59:00", freq = "T"))
bymin[:5]

2014-08-01 00:00:00    0
2014-08-01 00:01:00    1
2014-08-01 00:02:00    2
2014-08-01 00:03:00    3
2014-08-01 00:04:00    4
Freq: T, dtype: int64

In [90]:
#slice sepcific frequency time-series
bymin["2014-09-01 12:30":"2014-09-02 12:59"][:5]

2014-09-01 12:30:00    45390
2014-09-01 12:31:00    45391
2014-09-01 12:32:00    45392
2014-09-01 12:33:00    45393
2014-09-01 12:34:00    45394
Freq: T, dtype: int64

# Respresenting intervals of time using periods

In [91]:
""""bounded interval: Periods object = start time (anchor), frequency --> create the end date """
#Create a period respresenting a 1-m period anchored to aug2014
aug2014 = pd.Period("2014-08", freq = "M")
aug2014

Period('2014-08', 'M')

In [74]:
#Start and end properties
aug2014.start_time, aug2014.end_time

(Timestamp('2014-08-01 00:00:00'), Timestamp('2014-08-31 23:59:59.999999999'))

In [75]:
#calculate different periods using mathematical operations
sep2014 = aug2014 + 1
sep2014
sep2014.start_time, sep2014.end_time

(Timestamp('2014-09-01 00:00:00'), Timestamp('2014-09-30 23:59:59.999999999'))

In [76]:
#PeriodIndex = combined period objects into a collection
mp2013 = pd.period_range("1/1/2013", "12/31/2013", freq = "M")
mp2013

PeriodIndex(['2013-01', '2013-02', '2013-03', '2013-04', '2013-05', '2013-06',
             '2013-07', '2013-08', '2013-09', '2013-10', '2013-11', '2013-12'],
            dtype='period[M]', freq='M')

In [77]:
#PeriodIndex = The index labels are a period, DatetimeIndex = not a period
for p in mp2013:
    print("{0}{1}{2}{3}".format(p,p.freq, p.start_time, p.end_time))


2013-01<MonthEnd>2013-01-01 00:00:002013-01-31 23:59:59.999999999
2013-02<MonthEnd>2013-02-01 00:00:002013-02-28 23:59:59.999999999
2013-03<MonthEnd>2013-03-01 00:00:002013-03-31 23:59:59.999999999
2013-04<MonthEnd>2013-04-01 00:00:002013-04-30 23:59:59.999999999
2013-05<MonthEnd>2013-05-01 00:00:002013-05-31 23:59:59.999999999
2013-06<MonthEnd>2013-06-01 00:00:002013-06-30 23:59:59.999999999
2013-07<MonthEnd>2013-07-01 00:00:002013-07-31 23:59:59.999999999
2013-08<MonthEnd>2013-08-01 00:00:002013-08-31 23:59:59.999999999
2013-09<MonthEnd>2013-09-01 00:00:002013-09-30 23:59:59.999999999
2013-10<MonthEnd>2013-10-01 00:00:002013-10-31 23:59:59.999999999
2013-11<MonthEnd>2013-11-01 00:00:002013-11-30 23:59:59.999999999
2013-12<MonthEnd>2013-12-01 00:00:002013-12-31 23:59:59.999999999


In [78]:
#Construct a Series using the PeriodIndex as the index 
"""Time-series => index = measurement tht spans a period of time rather than just a specific time."""
np.random.seed(123456)
ps = pd.Series(np.random.randn(12), mp2013)
ps

2013-01    0.469112
2013-02   -0.282863
2013-03   -1.509059
2013-04   -1.135632
2013-05    1.212112
2013-06   -0.173215
2013-07    0.119209
2013-08   -1.044236
2013-09   -0.861849
2013-10   -2.104569
2013-11   -0.494929
2013-12    1.071804
Freq: M, dtype: float64

# Shifting and Lagging time-series data

In [79]:
"""Lag values back or forward such as to calculate percentage change from sample to sample"""
#Shift adjusted close price forward by 1 day
shifted_forward = msftAC.shift(1)
shifted_forward[:5]

Date
2013-12-30          NaN
2013-12-27    33.650110
2013-12-26    33.650110
2013-12-24    33.785469
2013-12-23    33.460608
Name: AdjClose, dtype: float64

In [120]:
#Shift forward tail it results in the loss of one or two end values
msftAC.tail(3), shifted_forward.tail(5)

(Date
 2012-01-05    23.571435
 2012-01-04    23.332995
 2012-01-03    22.792249
 Name: AdjClose, dtype: float64, Date
 2012-01-09    23.707686
 2012-01-06    23.622529
 2012-01-05    23.933352
 2012-01-04    23.571435
 2012-01-03    23.332995
 Name: AdjClose, dtype: float64)

In [129]:
#Shift values in opposite direction
shifted_backwards = msftAC.shift(-2)
shifted_backwards[:5]

Date
2013-12-30    33.785469
2013-12-27    33.460608
2013-12-26    33.045509
2013-12-24    33.207939
2013-12-23    32.711625
Name: AdjClose, dtype: float64

In [130]:
#result is 2 NaN values at the tail
shifted_backwards.tail()

Date
2012-01-09    23.571435
2012-01-06    23.332995
2012-01-05    22.792249
2012-01-04          NaN
2012-01-03          NaN
Name: AdjClose, dtype: float64

In [131]:
#Shift by different frequencies
#1 day per one second
msftAC.shift(1, freq = "S")

Date
2013-12-30 00:00:01    33.650110
2013-12-27 00:00:01    33.650110
2013-12-26 00:00:01    33.785469
2013-12-24 00:00:01    33.460608
2013-12-23 00:00:01    33.045509
2013-12-20 00:00:01    33.207939
2013-12-19 00:00:01    32.711625
2013-12-18 00:00:01    33.009414
2013-12-17 00:00:01    32.955270
2013-12-16 00:00:01    33.284642
2013-12-13 00:00:01    33.108676
2013-12-12 00:00:01    33.586943
2013-12-11 00:00:01    33.938875
2013-12-10 00:00:01    34.390070
2013-12-09 00:00:01    34.926992
2013-12-06 00:00:01    34.615667
2013-12-05 00:00:01    34.290807
2013-12-04 00:00:01    35.139053
2013-12-03 00:00:01    34.570548
2013-12-02 00:00:01    34.696882
2013-11-29 00:00:01    34.408117
2013-11-27 00:00:01    33.929851
2013-11-26 00:00:01    33.704254
2013-11-25 00:00:01    33.965947
2013-11-22 00:00:01    33.902779
2013-11-21 00:00:01    33.749373
2013-11-20 00:00:01    33.460608
2013-11-19 00:00:01    33.153796
2013-11-18 00:00:01    33.314997
2013-11-15 00:00:01    33.889054
     

In [132]:
#TSHIFT: instead of changing the alignment of the data --> new df values of index are changed
#by the n of offsets of the value of the freq param
msftAC.shift(1, freq = "D")

Date
2013-12-31    33.650110
2013-12-28    33.650110
2013-12-27    33.785469
2013-12-25    33.460608
2013-12-24    33.045509
2013-12-21    33.207939
2013-12-20    32.711625
2013-12-19    33.009414
2013-12-18    32.955270
2013-12-17    33.284642
2013-12-14    33.108676
2013-12-13    33.586943
2013-12-12    33.938875
2013-12-11    34.390070
2013-12-10    34.926992
2013-12-07    34.615667
2013-12-06    34.290807
2013-12-05    35.139053
2013-12-04    34.570548
2013-12-03    34.696882
2013-11-30    34.408117
2013-11-28    33.929851
2013-11-27    33.704254
2013-11-26    33.965947
2013-11-23    33.902779
2013-11-22    33.749373
2013-11-21    33.460608
2013-11-20    33.153796
2013-11-19    33.314997
2013-11-16    33.889054
                ...    
2012-02-15    25.930281
2012-02-14    26.040985
2012-02-11    25.968602
2012-02-10    26.202783
2012-02-09    26.109111
2012-02-08    25.845124
2012-02-07    25.717389
2012-02-04    25.751452
2012-02-03    25.504497
2012-02-02    25.453403
2012-02-01 

In [134]:
"""Calculation of daily percentage changes from the previous day (day-to-day perc change of AC)"""
#i.e. Return (Pt/Pt=1) - 1
msftAC/msftAC.shift(1) - 1

Date
2013-12-30         NaN
2013-12-27    0.000000
2013-12-26    0.004023
2013-12-24   -0.009615
2013-12-23   -0.012406
2013-12-20    0.004915
2013-12-19   -0.014946
2013-12-18    0.009103
2013-12-17   -0.001640
2013-12-16    0.009995
2013-12-13   -0.005287
2013-12-12    0.014445
2013-12-11    0.010478
2013-12-10    0.013294
2013-12-09    0.015613
2013-12-06   -0.008914
2013-12-05   -0.009385
2013-12-04    0.024737
2013-12-03   -0.016179
2013-12-02    0.003654
2013-11-29   -0.008322
2013-11-27   -0.013900
2013-11-26   -0.006649
2013-11-25    0.007764
2013-11-22   -0.001860
2013-11-21   -0.004525
2013-11-20   -0.008556
2013-11-19   -0.009169
2013-11-18    0.004862
2013-11-15    0.017231
                ...   
2012-02-14    0.006656
2012-02-13    0.004269
2012-02-10   -0.002780
2012-02-09    0.009018
2012-02-08   -0.003575
2012-02-07   -0.010111
2012-02-06   -0.004942
2012-02-03    0.001325
2012-02-02   -0.009590
2012-02-01   -0.002003
2012-01-31   -0.012044
2012-01-30    0.002709
2012-0

# Frequency Conversion of time-series data

In [145]:
#Convert frequency of data
sample = msftAC[:2]
#resample to have hourly sampling in between index labels
sample

Date
2013-12-30    33.65011
2013-12-27    33.65011
Name: AdjClose, dtype: float64

In [146]:
#resample to have hourly sampling in between index labels
sample.asfreq("H")
#but between the hours there are NaNs so fill with ffill or bfill
sample.asfreq("H", method ="ffill")

Series([], Freq: H, Name: AdjClose, dtype: float64)

# Resampling of time-series

In [149]:
"""Calculate daily cumulative returns for MSFT stock over 2012, 2013 and resample it to monthly frequency."""
#Cumulative dailt return:
msft_cum_ret = (1 + (msftAC/msftAC.shift() - 1)).cumprod()
msft_cum_ret[:5]

Date
2013-12-30         NaN
2013-12-27    1.000000
2013-12-26    1.004023
2013-12-24    0.994368
2013-12-23    0.982033
Name: AdjClose, dtype: float64

In [152]:
#Downsampling daily cum rets from d2d to m2m
msft_monthly_cum_ret = msft_cum_ret.resample("M")
msft_monthly_cum_ret[:3]

.resample() is now a deferred operation
You called __getitem__(...) on this deferred object which materialized it into a series
by implicitly taking the mean.  Use .resample(...).mean() instead
  This is separate from the ipykernel package so we can avoid doing imports until


Date
2012-01-31    0.721636
2012-02-29    0.782935
2012-03-31    0.820109
Freq: M, Name: AdjClose, dtype: float64

In [153]:
#Resample by mean
msft_cum_ret.resample("M", how = "mean")
msft_cum_ret[:5]

the new syntax is .resample(...).mean()
  


Date
2013-12-30         NaN
2013-12-27    1.000000
2013-12-26    1.004023
2013-12-24    0.994368
2013-12-23    0.982033
Name: AdjClose, dtype: float64

In [154]:
#Resample by OHLC
msft_cum_ret.resample("M", how = "ohlc")[:5]

the new syntax is .resample(...).ohlc()
  


Unnamed: 0_level_0,open,high,low,close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2012-01-31,0.677331,0.752365,0.677331,0.747303
2012-02-29,0.756414,0.811853,0.756414,0.808541
2012-03-31,0.822552,0.836817,0.803829,0.82166
2012-04-30,0.822552,0.825864,0.773005,0.815547
2012-05-31,0.815419,0.815419,0.745172,0.748506


In [156]:
#The type of index from a resampling is controlled by the kind parameterd set to; a) timestamp, default, b) period
#return an index based on periods:
by_periods = msft_cum_ret.resample("M", how = "mean", kind = "period")

for i in by_periods.index[:5]:
    print("{0}:{1} {2}".format(i.start_time, i, i.end_time, by_periods[i]))

2012-01-01 00:00:00:2012-01 2012-01-31 23:59:59.999999999
2012-02-01 00:00:00:2012-02 2012-02-29 23:59:59.999999999
2012-03-01 00:00:00:2012-03 2012-03-31 23:59:59.999999999
2012-04-01 00:00:00:2012-04 2012-04-30 23:59:59.999999999
2012-05-01 00:00:00:2012-05 2012-05-31 23:59:59.999999999


the new syntax is .resample(...).mean()
  This is separate from the ipykernel package so we can avoid doing imports until


In [157]:
#Unsampling
sample = msft_cum_ret[1:3]
sample

Date
2013-12-27    1.000000
2013-12-26    1.004023
Name: AdjClose, dtype: float64

In [158]:
#Upsample by hour
by_hour = sample.resample("H")
by_hour
#buy so many NaNs

DatetimeIndexResampler [freq=<Hour>, axis=0, closed=left, label=left, convention=start, base=0]

In [159]:
#Use ffill or bfill or interpolate
by_hour.interpolate()

Date
2013-12-26 00:00:00    1.004023
2013-12-26 01:00:00    1.003855
2013-12-26 02:00:00    1.003687
2013-12-26 03:00:00    1.003520
2013-12-26 04:00:00    1.003352
2013-12-26 05:00:00    1.003184
2013-12-26 06:00:00    1.003017
2013-12-26 07:00:00    1.002849
2013-12-26 08:00:00    1.002682
2013-12-26 09:00:00    1.002514
2013-12-26 10:00:00    1.002346
2013-12-26 11:00:00    1.002179
2013-12-26 12:00:00    1.002011
2013-12-26 13:00:00    1.001844
2013-12-26 14:00:00    1.001676
2013-12-26 15:00:00    1.001508
2013-12-26 16:00:00    1.001341
2013-12-26 17:00:00    1.001173
2013-12-26 18:00:00    1.001006
2013-12-26 19:00:00    1.000838
2013-12-26 20:00:00    1.000670
2013-12-26 21:00:00    1.000503
2013-12-26 22:00:00    1.000335
2013-12-26 23:00:00    1.000168
2013-12-27 00:00:00    1.000000
Freq: H, Name: AdjClose, dtype: float64