# Statistics Introduction Applied to Data Science
## Bonus : Two - Time Series

### Basic Time Series Calculations

* shift(): Moving data between past and future.
* div(): Calculate one-period percent change.
* diff(): Built-in time series change.
* pct_change(): Built-in time series % change.
* Multi-period returns.

In [1]:
%matplotlib inline

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
# Load the monthly Yes Bank price table (Date, Open, High, Low, Close).
csv_path = 'YesBank_StockPrices.csv'
df = pd.read_csv(csv_path, sep=',')

In [3]:
# Summarise the frame: index range, column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 185 entries, 0 to 184
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Date    185 non-null    object 
 1   Open    185 non-null    float64
 2   High    185 non-null    float64
 3   Low     185 non-null    float64
 4   Close   185 non-null    float64
dtypes: float64(4), object(1)
memory usage: 7.4+ KB


In [4]:
# Preview the first three rows.
df.head(3)

Unnamed: 0,Date,Open,High,Low,Close
0,Jul-05,13.0,14.0,11.25,12.46
1,Aug-05,12.58,14.88,12.55,13.42
2,Sep-05,13.48,14.87,12.27,13.3


In [5]:
# Preview the last three rows.
df.tail(3)

Unnamed: 0,Date,Open,High,Low,Close
182,Sep-20,14.3,15.34,12.75,13.15
183,Oct-20,13.3,14.01,12.11,12.42
184,Nov-20,12.41,14.9,12.21,14.67


In [6]:
# You can create a sequence of dates and times.
# Here: one month-end timestamp per month from Jul-2005 through Nov-2020.
# An explicit MonthEnd offset is used instead of the string alias 'M', which
# pandas 2.2 deprecated in favour of 'ME'; the offset object yields the same
# index on both older and newer pandas versions.
idx = pd.date_range(start='2005-07', end='2020-12', freq=pd.offsets.MonthEnd())
idx

DatetimeIndex(['2005-07-31', '2005-08-31', '2005-09-30', '2005-10-31',
               '2005-11-30', '2005-12-31', '2006-01-31', '2006-02-28',
               '2006-03-31', '2006-04-30',
               ...
               '2020-02-29', '2020-03-31', '2020-04-30', '2020-05-31',
               '2020-06-30', '2020-07-31', '2020-08-31', '2020-09-30',
               '2020-10-31', '2020-11-30'],
              dtype='datetime64[ns]', length=185, freq='M')

In [7]:
# Replace the default integer index with the month-end DatetimeIndex.
# Labels are attached by position, so idx must hold one entry per row.
df = df.set_index(idx)
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 185 entries, 2005-07-31 to 2020-11-30
Freq: M
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Date    185 non-null    object 
 1   Open    185 non-null    float64
 2   High    185 non-null    float64
 3   Low     185 non-null    float64
 4   Close   185 non-null    float64
dtypes: float64(4), object(1)
memory usage: 8.7+ KB


In [8]:
# Preview the first five rows, now labelled by the DatetimeIndex.
df.head()

Unnamed: 0,Date,Open,High,Low,Close
2005-07-31,Jul-05,13.0,14.0,11.25,12.46
2005-08-31,Aug-05,12.58,14.88,12.55,13.42
2005-09-30,Sep-05,13.48,14.87,12.27,13.3
2005-10-31,Oct-05,13.2,14.47,12.4,12.99
2005-11-30,Nov-05,13.35,13.88,12.88,13.41


### .shift(): Moving data between past & future

In [9]:
# Move every close one row forward in time (periods=1 is the default), so
# each row carries the previous period's close; the first row has no value.
df['Close_Shifted'] = df['Close'].shift(periods=1)
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Close_Shifted
2005-07-31,Jul-05,13.0,14.0,11.25,12.46,
2005-08-31,Aug-05,12.58,14.88,12.55,13.42,12.46
2005-09-30,Sep-05,13.48,14.87,12.27,13.3,13.42
2005-10-31,Oct-05,13.2,14.47,12.4,12.99,13.3
2005-11-30,Nov-05,13.35,13.88,12.88,13.41,12.99


In [10]:
# A negative shift moves values backwards in time, so each row receives the
# NEXT period's close and the last row has no value.
# NOTE: despite the column name, this is a lead of Close, not a lag.
df['Close_Lagged'] = df['Close'].shift(-1)
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Close_Shifted,Close_Lagged
2005-07-31,Jul-05,13.0,14.0,11.25,12.46,,13.42
2005-08-31,Aug-05,12.58,14.88,12.55,13.42,12.46,13.3
2005-09-30,Sep-05,13.48,14.87,12.27,13.3,13.42,12.99
2005-10-31,Oct-05,13.2,14.47,12.4,12.99,13.3,13.41
2005-11-30,Nov-05,13.35,13.88,12.88,13.41,12.99,13.71


In [11]:
# Last five rows: the final Close_Lagged entry is missing because no later
# period exists to pull a value from.
df.tail()

Unnamed: 0,Date,Open,High,Low,Close,Close_Shifted,Close_Lagged
2020-07-31,Jul-20,25.6,28.3,11.1,11.95,25.6,14.37
2020-08-31,Aug-20,12.0,17.16,11.85,14.37,11.95,13.15
2020-09-30,Sep-20,14.3,15.34,12.75,13.15,14.37,12.42
2020-10-31,Oct-20,13.3,14.01,12.11,12.42,13.15,14.67
2020-11-30,Nov-20,12.41,14.9,12.21,14.67,12.42,


### Calculate one-period percent change

In [12]:
# Ratio of each close to the previous period's close: x_t / x_(t-1).
# The first row has no value because it has no previous close.
df['change'] = df['Close'] / df['Close_Shifted']
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Close_Shifted,Close_Lagged,change
2005-07-31,Jul-05,13.0,14.0,11.25,12.46,,13.42,
2005-08-31,Aug-05,12.58,14.88,12.55,13.42,12.46,13.3,1.077047
2005-09-30,Sep-05,13.48,14.87,12.27,13.3,13.42,12.99,0.991058
2005-10-31,Oct-05,13.2,14.47,12.4,12.99,13.3,13.41,0.976692
2005-11-30,Nov-05,13.35,13.88,12.88,13.41,12.99,13.71,1.032333


In [13]:
# Series methods that return a Series can be chained: the result of one call
# becomes the input of the next.
# Example: subtract 1 from the price ratio, then multiply by 100, to express
# the relative change in percentage terms.

df['return'] = (
    df['change']
    .sub(1)
    .mul(100)
)
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Close_Shifted,Close_Lagged,change,return
2005-07-31,Jul-05,13.0,14.0,11.25,12.46,,13.42,,
2005-08-31,Aug-05,12.58,14.88,12.55,13.42,12.46,13.3,1.077047,7.704655
2005-09-30,Sep-05,13.48,14.87,12.27,13.3,13.42,12.99,0.991058,-0.894188
2005-10-31,Oct-05,13.2,14.47,12.4,12.99,13.3,13.41,0.976692,-2.330827
2005-11-30,Nov-05,13.35,13.88,12.88,13.41,12.99,13.71,1.032333,3.233256


### diff(): built-in time-series change

In [14]:
# Absolute change between consecutive periods: x_t - x_(t-1).
# The first row has no value because there is no earlier observation.
df['diff_Close'] = df['Close'].diff(periods=1)
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Close_Shifted,Close_Lagged,change,return,diff_Close
2005-07-31,Jul-05,13.0,14.0,11.25,12.46,,13.42,,,
2005-08-31,Aug-05,12.58,14.88,12.55,13.42,12.46,13.3,1.077047,7.704655,0.96
2005-09-30,Sep-05,13.48,14.87,12.27,13.3,13.42,12.99,0.991058,-0.894188,-0.12
2005-10-31,Oct-05,13.2,14.47,12.4,12.99,13.3,13.41,0.976692,-2.330827,-0.31
2005-11-30,Nov-05,13.35,13.88,12.88,13.41,12.99,13.71,1.032333,3.233256,0.42


### pct_change(): built-in time-series % change

In [15]:
# Percent change between two adjacent periods:
# (x_t / x_(t-1) - 1), scaled by 100 to read as a percentage.
# This reproduces the 'return' column computed manually above.
df['pct_Close'] = df['Close'].pct_change(periods=1) * 100
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Close_Shifted,Close_Lagged,change,return,diff_Close,pct_Close
2005-07-31,Jul-05,13.0,14.0,11.25,12.46,,13.42,,,,
2005-08-31,Aug-05,12.58,14.88,12.55,13.42,12.46,13.3,1.077047,7.704655,0.96,7.704655
2005-09-30,Sep-05,13.48,14.87,12.27,13.3,13.42,12.99,0.991058,-0.894188,-0.12,-0.894188
2005-10-31,Oct-05,13.2,14.47,12.4,12.99,13.3,13.41,0.976692,-2.330827,-0.31,-2.330827
2005-11-30,Nov-05,13.35,13.88,12.88,13.41,12.99,13.71,1.032333,3.233256,0.42,3.233256


### Looking ahead: Get multi-period returns

In [16]:
# Return over a three-period window: (x_t / x_(t-3) - 1) * 100.
# NOTE: the rows are monthly, so despite the "_3d" suffix this is a
# three-month return; the first three rows have no value.
df['Close_3d'] = df['Close'].pct_change(periods=3) * 100
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Close_Shifted,Close_Lagged,change,return,diff_Close,pct_Close,Close_3d
2005-07-31,Jul-05,13.0,14.0,11.25,12.46,,13.42,,,,,
2005-08-31,Aug-05,12.58,14.88,12.55,13.42,12.46,13.3,1.077047,7.704655,0.96,7.704655,
2005-09-30,Sep-05,13.48,14.87,12.27,13.3,13.42,12.99,0.991058,-0.894188,-0.12,-0.894188,
2005-10-31,Oct-05,13.2,14.47,12.4,12.99,13.3,13.41,0.976692,-2.330827,-0.31,-2.330827,4.253612
2005-11-30,Nov-05,13.35,13.88,12.88,13.41,12.99,13.71,1.032333,3.233256,0.42,3.233256,-0.074516


In [17]:
# End