In [175]:
import os

In [176]:
os.chdir('/Users/anurag_chandra/Downloads')

In [177]:
pwd

'/Users/anurag_chandra/Downloads'

In [178]:
# Loading the Holidays data

import pandas as pd
df = pd.read_csv('holidays_data.csv')

In [179]:
# Checking the starting and ending dates of Holidays data

df.head()
df.tail()

Unnamed: 0,route,date,daytype,rides
845350,X98,12/23/2019,W,1
845351,X98,12/26/2019,W,17
845352,X98,12/27/2019,W,18
845353,X98,12/28/2019,A,1
845354,X98,12/30/2019,W,14


In [180]:
df['date'] = pd.to_datetime(df['date'])  

In [181]:
# We require only the dates staring from year 2017 and ending at year 2019. So the remaining all dates can be dropped 
# from the table

mask = (df['date'] > '2017-01-01') & (df['date'] <= '2019-12-30')
df = df.loc[mask]

In [182]:
# Check for missing values

(df.isna()).sum()   



route      0
date       0
daytype    0
rides      0
dtype: int64

In [183]:
# The columns route and rides are irrelevant and hence they can be dropped

df = df.drop(columns=['route','rides'])   
df    

Unnamed: 0,date,daytype
719025,2017-02-23,W
719026,2017-03-22,W
719027,2017-02-08,W
719028,2017-03-30,W
719029,2017-01-03,W
719030,2017-01-22,U
719031,2017-02-15,W
719032,2017-02-02,W
719033,2017-01-22,U
719034,2017-02-01,W


In [184]:
# Sorting the dataframe by order of dates

df = df.sort_values(by='date')  

In [196]:
# All the duplicates of date column can be dropped as we need only one value of a particular date

data = df.drop_duplicates()
data

Unnamed: 0,date,daytype
726318,2017-01-02,U
726952,2017-01-03,W
727959,2017-01-04,W
728441,2017-01-05,W
726697,2017-01-06,W
720134,2017-01-07,A
726560,2017-01-08,U
724632,2017-01-09,W
726774,2017-01-10,W
727832,2017-01-11,W


In [197]:
holidays_2017 = ['2017-01-16','2017-02-13','2017-02-20','2017-03-06','2017-05-29',
                 '2017-07-04','2017-09-04','2017-11-23','2017-11-24','2017-12-25']

# This is the list of public holidays for calendar year 2017 which is not mentioned in dataset, so we convert these 
# dates to holidays i.e 'U' 
# These holidays include Martin Luther King Jr. day, Spring Break, Thanksgiving days, Labour day and other significant
# holidays 

In [199]:
for x in holidays_2017 :
    
    data.loc[data['date'] == x, 'daytype'] = 'U'

# Replace the weekdays of daytime column with "U" instead of "W" indicating a public holiday of year 2017

In [190]:
holidays_2018 = ['2018-01-15','2018-02-12','2018-02-19','2018-03-05','2018-05-28', 
                 '2018-07-04','2018-09-03','2018-11-06','2018-11-22','2018-11-23','2018-12-25']

# List of public holidays for calendar year 2018

In [200]:
for y in holidays_2018 :
    
    data.loc[data['date'] == y, 'daytype'] = 'U'

# Replace the public holidays of year 2018 with "U" instead of "W"

In [201]:
holidays_2019 = ['2019-01-21','2019-02-12','2019-02-18','2019-03-04','2019-05-27',
                 '2019-07-04','2019-09-02','2019-11-28','2019-11-29','2019-12-25']

# List of public holidays for calendar year 2019

In [202]:
for z in holidays_2019 :
    
    data.loc[data['date'] == z, 'daytype'] = 'U'

# Replace the public holidays of year 2019 with "U" instead of "W"

In [204]:
data[data['date'] == '2018-12-25']  # Checking whether Christmas Eve 2018 is given as 'U' or 'W' 

Unnamed: 0,date,daytype
801368,2018-12-25,U


In [212]:
# This is the cleaned Holidays data which indicates all public holidays and sundays as "U", saturdays as "A" and
# weekdays as "W"

data.set_index('date')

Unnamed: 0_level_0,daytype
date,Unnamed: 1_level_1
2017-01-02,U
2017-01-03,W
2017-01-04,W
2017-01-05,W
2017-01-06,W
2017-01-07,A
2017-01-08,U
2017-01-09,W
2017-01-10,W
2017-01-11,W
