In [1]:
import pandas as pd
# Load the raw Vancouver records from the CSV file; the relative path is
# resolved against the directory the Python process is running in.
vancouver_data = pd.read_csv("./vancouver-parent_data.csv")
# NOTE: this head() call is not the last expression of the cell, so its
# result is not rendered; only the printed row count below appears as output.
vancouver_data.head(5)
# Report how many rows were loaded.
print(len(vancouver_data))

193378


In [2]:
# Drop the columns this notebook does not need, then preview what is left.
unneeded_columns = ['TYPE', 'HUNDRED_BLOCK', 'NEIGHBOURHOOD', 'X', 'Y', 'HOUR', 'MINUTE']
vancouver_data = vancouver_data.drop(columns=unneeded_columns)
vancouver_data.head(5)

Unnamed: 0,YEAR,MONTH,DAY
0,2019,3,7
1,2019,8,27
2,2017,11,14
3,2018,3,2
4,2015,2,4


In [4]:
# Keep a single row per calendar date: the first occurrence of each
# (YEAR, MONTH, DAY) combination is retained and later repeats are removed.
vancouver_data = vancouver_data.drop_duplicates(subset=['YEAR', 'MONTH', 'DAY'])

In [5]:
# Number of rows left after removing duplicate dates.
print(len(vancouver_data))

1871


In [6]:
# Derive the weekday name for every (year, month, day) row of the frame.
import datetime
# Lookup from date.weekday() codes to names; enumerate numbers the list from 0,
# so 0 maps to 'Monday' and 6 maps to 'Sunday'.
calender = dict(enumerate(['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']))
# Column values as arrays; later cells iterate over these same arrays again.
days = vancouver_data['DAY'].values
months = vancouver_data['MONTH'].values
years = vancouver_data['YEAR'].values
days_of_the_week = [
    calender[datetime.date(y, m, d).weekday()]
    for d, m, y in zip(days, months, years)
]
    

In [7]:
# Attach the weekday names as a new 'day_of_week' column and preview the result.
vancouver_data = vancouver_data.assign(day_of_week=days_of_the_week)
vancouver_data.head()

Unnamed: 0,YEAR,MONTH,DAY,day_of_week
0,2019,3,7,Thursday
1,2019,8,27,Tuesday
2,2017,11,14,Tuesday
3,2018,3,2,Friday
4,2015,2,4,Wednesday


In [8]:
# Add a boolean 'is_weekend' column: True for Saturday and Sunday, False otherwise.
is_weekend = [name in ('Saturday', 'Sunday') for name in days_of_the_week]
vancouver_data['is_weekend'] = is_weekend
vancouver_data.head(5)

Unnamed: 0,YEAR,MONTH,DAY,day_of_week,is_weekend
0,2019,3,7,Thursday,False
1,2019,8,27,Tuesday,False
2,2017,11,14,Tuesday,False
3,2018,3,2,Friday,False
4,2015,2,4,Wednesday,False


In [9]:
# The `holidays` package must be installed before running the next cell; it is included in requirements.txt

In [10]:
# Build the British Columbia holiday calendar for every year present in the data.
import holidays
year_column = vancouver_data['YEAR']
min_year_of_ds = year_column.min()
max_year_of_ds = year_column.max()
# Inclusive span of years, hence the +1 on the upper bound.
year_interval = list(range(min_year_of_ds, max_year_of_ds + 1))
# NOTE(review): newer releases of the holidays package appear to name this keyword
# `subdiv` and deprecate `prov` - confirm against the version pinned in requirements.txt.
bc_holidays = holidays.Canada(years=year_interval, prov='BC')

In [11]:
# Add two columns: 'is_holiday' (whether the date is in the BC holiday calendar)
# and 'holiday_name' (the calendar's entry for that date, or None when it is not a holiday).
is_holiday = []
holiday_name = []
for d, m, y in zip(days, months, years):
    # Build the date once per row and reuse it for the membership test and the lookup.
    current_date = datetime.date(y, m, d)
    found = current_date in bc_holidays
    is_holiday.append(found)
    holiday_name.append(bc_holidays[current_date] if found else None)
vancouver_data['is_holiday'] = is_holiday
vancouver_data['holiday_name'] = holiday_name

In [12]:
# Preview the first 15 rows, which now include the holiday columns.
vancouver_data.iloc[:15]

Unnamed: 0,YEAR,MONTH,DAY,day_of_week,is_weekend,is_holiday,holiday_name
0,2019,3,7,Thursday,False,False,
1,2019,8,27,Tuesday,False,False,
2,2017,11,14,Tuesday,False,False,
3,2018,3,2,Friday,False,False,
4,2015,2,4,Wednesday,False,False,
5,2016,2,16,Tuesday,False,False,
6,2015,3,24,Tuesday,False,False,
7,2015,7,3,Friday,False,False,
8,2018,6,16,Saturday,True,False,
9,2016,9,17,Saturday,True,False,


In [13]:
# Lower-case the three original date columns so every column name uses the same casing.
# errors="raise" makes the rename fail if any of the listed columns is missing.
lowercase_names = {"YEAR": "year", "MONTH": "month", "DAY": "day"}
vancouver_data = vancouver_data.rename(columns=lowercase_names, errors="raise")
vancouver_data.head(10)

Unnamed: 0,year,month,day,day_of_week,is_weekend,is_holiday,holiday_name
0,2019,3,7,Thursday,False,False,
1,2019,8,27,Tuesday,False,False,
2,2017,11,14,Tuesday,False,False,
3,2018,3,2,Friday,False,False,
4,2015,2,4,Wednesday,False,False,
5,2016,2,16,Tuesday,False,False,
6,2015,3,24,Tuesday,False,False,
7,2015,7,3,Friday,False,False,
8,2018,6,16,Saturday,True,False,
9,2016,9,17,Saturday,True,False,


In [14]:
# Add a surrogate 'date_key' column holding one random (version 4) UUID per row.
# The keys are generated randomly, so they differ every time this cell runs.
import uuid
# A comprehension is used so that no variable named `id` is bound at notebook
# scope, which would shadow the built-in id() for every later cell.
date_keys = [uuid.uuid4() for _ in range(len(vancouver_data))]

vancouver_data['date_key'] = date_keys

In [15]:
# Move 'date_key' to the front of the frame; all other columns keep their current order.
# The column is selected by name rather than by position, so the cell stays correct
# if the number of columns changes or if the cell is executed a second time.
# A missing 'date_key' column raises a KeyError on the selection below.
cols = vancouver_data.columns.tolist()
new_cols = ['date_key'] + [col for col in cols if col != 'date_key']
vancouver_data = vancouver_data[new_cols]
# NOTE: head() is not the last expression of the cell, so it is not rendered;
# only the printed row count appears as output.
vancouver_data.head()
print(vancouver_data.shape[0])

1871


In [16]:
# Finally, write the finished frame to a CSV file stored in the repo,
# with a header row and without the index column.
# to_csv returns None when it is given a file path, so the result is not bound to a name.
vancouver_data.to_csv(r'./vancouver-date.csv', index=False, header=True)

In [None]:
# End