### reindex()

In [1]:
import pandas as pd

In [2]:
# Read only the date and OHLC price columns; 'date' is parsed as datetime and used as the index.
cols = ['date', 'high', 'low', 'open', 'close']
apple_stock = pd.read_csv(
    'AAPL.csv',
    index_col='date',
    parse_dates=['date'],
    usecols=cols,
)
apple_stock.head(7)   # preview the first seven rows

Unnamed: 0_level_0,close,high,low,open
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2015-05-27 00:00:00+00:00,132.045,132.26,130.05,130.34
2015-05-28 00:00:00+00:00,131.78,131.95,131.1,131.86
2015-05-29 00:00:00+00:00,130.28,131.45,129.9,131.23
2015-06-01 00:00:00+00:00,130.535,131.39,130.05,131.2
2015-06-02 00:00:00+00:00,129.96,130.655,129.32,129.86
2015-06-03 00:00:00+00:00,130.12,130.94,129.9,130.66
2015-06-04 00:00:00+00:00,129.36,130.58,128.91,129.58


In [3]:
# Count how many rows of the data fall on each day of the week.
weekday_names = apple_stock.index.day_name()
weekday_names.value_counts()

date
Wednesday    257
Tuesday      257
Thursday     255
Friday       253
Monday       236
Name: count, dtype: int64

In [4]:
# Goal: build an index that also contains the weekends.
# First step: the latest date present in the index.
max_date = apple_stock.index.max()
max_date

Timestamp('2020-05-22 00:00:00+0000', tz='UTC')

In [5]:
# The earliest date present in the index.
min_date = apple_stock.index.min()
min_date

Timestamp('2015-05-27 00:00:00+0000', tz='UTC')

In [6]:
# Build a continuous calendar-day index (weekends included) running from the
# first to the last date in the data.
# 'D' is the canonical calendar-day alias (the resulting index reports freq='D');
# the lowercase spelling 'd' is deprecated in newer pandas releases.
dates_incl_weeknds = pd.date_range(start=min_date, end=max_date, freq='D')
dates_incl_weeknds

DatetimeIndex(['2015-05-27 00:00:00+00:00', '2015-05-28 00:00:00+00:00',
               '2015-05-29 00:00:00+00:00', '2015-05-30 00:00:00+00:00',
               '2015-05-31 00:00:00+00:00', '2015-06-01 00:00:00+00:00',
               '2015-06-02 00:00:00+00:00', '2015-06-03 00:00:00+00:00',
               '2015-06-04 00:00:00+00:00', '2015-06-05 00:00:00+00:00',
               ...
               '2020-05-13 00:00:00+00:00', '2020-05-14 00:00:00+00:00',
               '2020-05-15 00:00:00+00:00', '2020-05-16 00:00:00+00:00',
               '2020-05-17 00:00:00+00:00', '2020-05-18 00:00:00+00:00',
               '2020-05-19 00:00:00+00:00', '2020-05-20 00:00:00+00:00',
               '2020-05-21 00:00:00+00:00', '2020-05-22 00:00:00+00:00'],
              dtype='datetime64[ns, UTC]', length=1823, freq='D')

In [7]:
# Verify that the new index contains every day of the week, Saturdays and Sundays included.
dates_incl_weeknds.day_name().value_counts()

Wednesday    261
Thursday     261
Friday       261
Saturday     260
Sunday       260
Monday       260
Tuesday      260
Name: count, dtype: int64

#### Replace the old index with the new index

In [8]:
# Swap in the daily index. Dates that were not in the original data (the weekends)
# come back as missing values; a .fillna('CLOSED') could be chained to label them instead.
apple_stock.reindex(dates_incl_weeknds).head(7)

Unnamed: 0,close,high,low,open
2015-05-27 00:00:00+00:00,132.045,132.26,130.05,130.34
2015-05-28 00:00:00+00:00,131.78,131.95,131.1,131.86
2015-05-29 00:00:00+00:00,130.28,131.45,129.9,131.23
2015-05-30 00:00:00+00:00,,,,
2015-05-31 00:00:00+00:00,,,,
2015-06-01 00:00:00+00:00,130.535,131.39,130.05,131.2
2015-06-02 00:00:00+00:00,129.96,130.655,129.32,129.86


#### Using fill methods: ```ffill```, ```bfill``` and ```nearest```

In [9]:
# Reindex onto the daily index, forward-filling each missing date with the
# previous available row, then append the weekday name as a 'day' column.
as2 = (
    apple_stock
    .reindex(dates_incl_weeknds, method='ffill')
    .assign(day=lambda frame: frame.index.day_name())
)
as2.head(7)

Unnamed: 0,close,high,low,open,day
2015-05-27 00:00:00+00:00,132.045,132.26,130.05,130.34,Wednesday
2015-05-28 00:00:00+00:00,131.78,131.95,131.1,131.86,Thursday
2015-05-29 00:00:00+00:00,130.28,131.45,129.9,131.23,Friday
2015-05-30 00:00:00+00:00,130.28,131.45,129.9,131.23,Saturday
2015-05-31 00:00:00+00:00,130.28,131.45,129.9,131.23,Sunday
2015-06-01 00:00:00+00:00,130.535,131.39,130.05,131.2,Monday
2015-06-02 00:00:00+00:00,129.96,130.655,129.32,129.86,Tuesday


### resample()
- Essentially groups our datetime objects into separate groups depending on the frequency we provide.

In [11]:
# Load the date and OHLC columns again, with 'date' parsed and used as the index.
# NOTE(review): this repeats the read_csv call from the top of the notebook with the
# same arguments; no visible cell modifies apple_stock in between, so the reload may
# be redundant — confirm before removing.
cols = ['date', 'high', 'low', 'open', 'close']
apple_stock = pd.read_csv('AAPL.csv', usecols=cols, index_col='date', parse_dates=['date'])
apple_stock.head(7)

Unnamed: 0_level_0,close,high,low,open
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2015-05-27 00:00:00+00:00,132.045,132.26,130.05,130.34
2015-05-28 00:00:00+00:00,131.78,131.95,131.1,131.86
2015-05-29 00:00:00+00:00,130.28,131.45,129.9,131.23
2015-06-01 00:00:00+00:00,130.535,131.39,130.05,131.2
2015-06-02 00:00:00+00:00,129.96,130.655,129.32,129.86
2015-06-03 00:00:00+00:00,130.12,130.94,129.9,130.66
2015-06-04 00:00:00+00:00,129.36,130.58,128.91,129.58


In [None]:
# Group all rows by month. With 'MS' (month start) each group is labelled
# with the first day of its month.
# This only builds the lazy Resampler object; nothing is aggregated yet.
months = apple_stock.resample('MS')
months

<pandas.core.resample.DatetimeIndexResampler object at 0x00000238D4FE5AC0>

In [None]:
# key = the group's Timestamp label, value = the integer row position at which that
# group ends (exclusive), i.e. the running row count up to and including that month.
months.groups

{Timestamp('2015-05-01 00:00:00+0000', tz='UTC'): np.int64(3),
 Timestamp('2015-06-01 00:00:00+0000', tz='UTC'): np.int64(25),
 Timestamp('2015-07-01 00:00:00+0000', tz='UTC'): np.int64(47),
 Timestamp('2015-08-01 00:00:00+0000', tz='UTC'): np.int64(68),
 Timestamp('2015-09-01 00:00:00+0000', tz='UTC'): np.int64(89),
 Timestamp('2015-10-01 00:00:00+0000', tz='UTC'): np.int64(111),
 Timestamp('2015-11-01 00:00:00+0000', tz='UTC'): np.int64(131),
 Timestamp('2015-12-01 00:00:00+0000', tz='UTC'): np.int64(153),
 Timestamp('2016-01-01 00:00:00+0000', tz='UTC'): np.int64(172),
 Timestamp('2016-02-01 00:00:00+0000', tz='UTC'): np.int64(192),
 Timestamp('2016-03-01 00:00:00+0000', tz='UTC'): np.int64(214),
 Timestamp('2016-04-01 00:00:00+0000', tz='UTC'): np.int64(235),
 Timestamp('2016-05-01 00:00:00+0000', tz='UTC'): np.int64(256),
 Timestamp('2016-06-01 00:00:00+0000', tz='UTC'): np.int64(278),
 Timestamp('2016-07-01 00:00:00+0000', tz='UTC'): np.int64(298),
 Timestamp('2016-08-01 00:00:00

#### We can use the same syntax that we used with groups, but we have to be specific here: the key must be the group's full label

In [None]:
# Returns the rows that belong to the September 2016 group.
months.get_group('2016-09-01 00:00:00+0000')

Unnamed: 0_level_0,close,high,low,open
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016-09-01 00:00:00+00:00,106.73,106.8,105.62,106.14
2016-09-02 00:00:00+00:00,107.73,108.0,106.82,107.7
2016-09-06 00:00:00+00:00,107.7,108.3,107.51,107.9
2016-09-07 00:00:00+00:00,108.36,108.76,107.07,107.83
2016-09-08 00:00:00+00:00,105.52,107.27,105.24,107.25
2016-09-09 00:00:00+00:00,103.13,105.72,103.13,104.64
2016-09-12 00:00:00+00:00,105.44,105.72,102.53,102.65
2016-09-13 00:00:00+00:00,107.95,108.79,107.24,107.51
2016-09-14 00:00:00+00:00,111.77,113.03,108.6,108.73
2016-09-15 00:00:00+00:00,115.57,115.73,113.49,113.86


#### Then we can perform operations as normal

In [None]:
# Monthly mean of every column, rounded to two decimals.
# The drawback of this approach is that each row is labelled with one specific
# moment (a timestamp) even though the result describes the whole month.
months.mean().round(2)

Unnamed: 0_level_0,close,high,low,open
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2015-05-01 00:00:00+00:00,131.37,131.89,130.35,131.14
2015-06-01 00:00:00+00:00,127.81,128.76,127.24,128.06
2015-07-01 00:00:00+00:00,125.34,126.20,124.34,125.45
2015-08-01 00:00:00+00:00,113.39,115.50,111.29,113.44
2015-09-01 00:00:00+00:00,112.80,114.21,111.52,113.00
...,...,...,...,...
2020-01-01 00:00:00+00:00,311.92,314.33,308.83,311.17
2020-02-01 00:00:00+00:00,311.27,315.25,306.73,310.31
2020-03-01 00:00:00+00:00,262.44,269.69,254.85,261.07
2020-04-01 00:00:00+00:00,272.39,275.78,268.08,271.81


#### Using ```kind``` to define what kind of datetime object we want to use

In [None]:
# Groups all dates by month (not by year); with kind='period' each group is
# labelled with a monthly Period instead of a Timestamp.
# NOTE(review): the `kind` argument is deprecated in recent pandas (2.2+) and this cell
# appears to emit a warning — confirm and plan a migration (e.g. convert the index of
# the aggregated result with .to_period()).
months2 = apple_stock.resample('ME', kind='period')
months2

  months2 = apple_stock.resample('ME', kind='period')


<pandas.core.resample.DatetimeIndexResampler object at 0x00000238D6398890>

In [18]:
# The group keys are now monthly Period objects rather than Timestamps.
months2.groups

{Period('2015-05', 'M'): np.int64(3),
 Period('2015-06', 'M'): np.int64(25),
 Period('2015-07', 'M'): np.int64(47),
 Period('2015-08', 'M'): np.int64(68),
 Period('2015-09', 'M'): np.int64(89),
 Period('2015-10', 'M'): np.int64(111),
 Period('2015-11', 'M'): np.int64(131),
 Period('2015-12', 'M'): np.int64(153),
 Period('2016-01', 'M'): np.int64(172),
 Period('2016-02', 'M'): np.int64(192),
 Period('2016-03', 'M'): np.int64(214),
 Period('2016-04', 'M'): np.int64(235),
 Period('2016-05', 'M'): np.int64(256),
 Period('2016-06', 'M'): np.int64(278),
 Period('2016-07', 'M'): np.int64(298),
 Period('2016-08', 'M'): np.int64(321),
 Period('2016-09', 'M'): np.int64(342),
 Period('2016-10', 'M'): np.int64(363),
 Period('2016-11', 'M'): np.int64(384),
 Period('2016-12', 'M'): np.int64(405),
 Period('2017-01', 'M'): np.int64(425),
 Period('2017-02', 'M'): np.int64(444),
 Period('2017-03', 'M'): np.int64(467),
 Period('2017-04', 'M'): np.int64(486),
 Period('2017-05', 'M'): np.int64(508),
 Perio

In [19]:
# With Period labels each row of the result is labelled with a whole month
# (e.g. 2015-05) instead of a single timestamp, which makes the DataFrame easier to read.
months2.mean().round(2)

Unnamed: 0_level_0,close,high,low,open
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2015-05,131.37,131.89,130.35,131.14
2015-06,127.81,128.76,127.24,128.06
2015-07,125.34,126.20,124.34,125.45
2015-08,113.39,115.50,111.29,113.44
2015-09,112.80,114.21,111.52,113.00
...,...,...,...,...
2020-01,311.92,314.33,308.83,311.17
2020-02,311.27,315.25,306.73,310.31
2020-03,262.44,269.69,254.85,261.07
2020-04,272.39,275.78,268.08,271.81


In [21]:
# For every month, report the maximum and minimum of both the 'high' and 'low' columns.
monthly_extremes = {column: ['max', 'min'] for column in ('high', 'low')}
months2.agg(monthly_extremes).round(2)

Unnamed: 0_level_0,high,high,low,low
Unnamed: 0_level_1,max,min,max,min
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
2015-05,132.26,131.45,131.10,129.90
2015-06,131.39,126.12,130.05,124.48
2015-07,132.97,122.57,130.70,119.22
2015-08,122.57,108.80,117.52,92.00
2015-09,116.89,110.45,115.44,107.36
...,...,...,...,...
2020-01,327.85,299.96,321.38,292.75
2020-02,327.22,278.41,323.35,256.37
2020-03,304.00,228.50,293.13,212.61
2020-04,294.53,245.15,288.35,236.90
