# Predicting Electricity Consumption in Toronto: Comparing Machine Learning to Time Series Forecasting
**Group:** G16

**By:** Greta Avetisian and Cesare Spinoso-Di Piano

In [1]:
import pandas as pd
import numpy as np
from datetime import timedelta

## Loading in all relevant datasets

In [2]:
#The data covers the years 2004 through 2018 (inclusive)
yearRange = range(2004, 2019) #stop is 2019 because range() excludes the stop value it is given

In [3]:
#Read every yearly Demand CSV into its own dataframe variable (Demand_2004 ... Demand_2018).
#header=3 tells pandas that row 3 (0-based) of each file holds the column names.
for year in yearRange:
    locals()["Demand_{}".format(year)] = pd.read_csv("Data/Demand/PUB_Demand_{}.csv".format(year), header=3)

In [4]:
#Hourly data: every year should have the same number of rows, except leap years,
#which carry 24 extra rows for the additional day.
for year in yearRange:
    print(locals()["Demand_{}".format(year)].shape[0], year)

8784 2004
8760 2005
8760 2006
8760 2007
8784 2008
8760 2009
8760 2010
8760 2011
8784 2012
8760 2013
8760 2014
8760 2015
8784 2016
8760 2017
8760 2018


In [5]:
#Display the 2004 frame to confirm that the first yearly Demand file was loaded correctly
Demand_2004

Unnamed: 0,Date,Hour,Market Demand,Ontario Demand
0,2004-01-01,1,15787,14703
1,2004-01-01,2,15194,14130
2,2004-01-01,3,14589,13575
3,2004-01-01,4,14294,13280
4,2004-01-01,5,14233,13219
...,...,...,...,...
8779,2004-12-31,20,19547,18131
8780,2004-12-31,21,18918,17390
8781,2004-12-31,22,18439,16737
8782,2004-12-31,23,17718,16112


In [6]:
#Display the 2018 frame to confirm that the last yearly Demand file was loaded correctly
Demand_2018

Unnamed: 0,Date,Hour,Market Demand,Ontario Demand
0,2018-01-01,1,18974,16627
1,2018-01-01,2,18447,16084
2,2018-01-01,3,18453,15866
3,2018-01-01,4,18662,15725
4,2018-01-01,5,18060,15470
...,...,...,...,...
8755,2018-12-31,20,18276,16195
8756,2018-12-31,21,18046,15668
8757,2018-12-31,22,17516,14987
8758,2018-12-31,23,17506,14560


In [7]:
#Same loading procedure for the zonal demand files: one dataframe variable per year
#(Zonal_Demand_2004 ... Zonal_Demand_2018), with the column names taken from row 3 (0-based).
for year in yearRange:
    locals()["Zonal_Demand_{}".format(year)] = pd.read_csv("Data/Zonal_Demand/PUB_DemandZonal_{}.csv".format(year), header=3)

In [8]:
#Hourly data: every year should have the same number of rows, except leap years,
#which carry 24 extra rows for the additional day.
for year in yearRange:
    print(locals()["Zonal_Demand_{}".format(year)].shape[0], year)

8784 2004
8760 2005
8760 2006
8760 2007
8784 2008
8760 2009
8760 2010
8760 2011
8784 2012
8760 2013
8760 2014
8760 2015
8784 2016
8760 2017
8760 2018


In [9]:
#Display the 2004 frame to confirm that the first yearly Zonal Demand file was loaded correctly
Zonal_Demand_2004

Unnamed: 0,Date,Hour,Ontario Demand,Northwest,Northeast,Ottawa,East,Toronto,Essa,Bruce,Southwest,Niagara,West,Zone Total,Diff
0,2004-01-01,1,14703,811,1406,1197,903,4606,859,53,2790,542,1670,14837,134
1,2004-01-01,2,14130,805,1412,1095,889,4366,794,53,2698,516,1603,14231,101
2,2004-01-01,3,13575,795,1358,1053,842,4188,747,50,2582,507,1553,13675,100
3,2004-01-01,4,13280,789,1355,1027,817,4046,718,52,2549,517,1512,13382,102
4,2004-01-01,5,13219,779,1354,1018,811,3974,709,50,2534,529,1501,13259,40
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8779,2004-12-31,20,18131,972,1524,1602,1077,5806,1110,43,3488,571,1934,18127,-4
8780,2004-12-31,21,17390,960,1505,1529,1011,5576,1038,43,3308,553,1874,17397,7
8781,2004-12-31,22,16737,943,1477,1387,1021,5358,983,40,3161,548,1825,16743,6
8782,2004-12-31,23,16112,956,1443,1295,991,5176,940,38,3014,579,1766,16198,86


In [10]:
#Display the 2018 frame to confirm that the last yearly Zonal Demand file was loaded correctly
Zonal_Demand_2018

Unnamed: 0,Date,Hour,Ontario Demand,Northwest,Northeast,Ottawa,East,Toronto,Essa,Bruce,Southwest,Niagara,West,Zone Total,Diff
0,2018-01-01,1,16627,591,1466,1066,1268,5340,1579,89,3157,477,1489,16522,-105
1,2018-01-01,2,16084,577,1420,985,1300,5211,1516,91,3061,462,1465,16086,2
2,2018-01-01,3,15866,613,1417,943,1316,5096,1471,86,3032,446,1441,15862,-4
3,2018-01-01,4,15725,656,1430,943,1303,4987,1451,81,2911,439,1413,15614,-112
4,2018-01-01,5,15470,657,1425,932,1343,4926,1422,69,2907,449,1391,15522,51
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8755,2018-12-31,20,16195,648,1413,1133,1057,5755,1152,76,3134,472,1462,16302,107
8756,2018-12-31,21,15668,646,1419,1103,1026,5544,1101,71,3034,446,1421,15811,143
8757,2018-12-31,22,14987,630,1331,1059,991,5338,1050,78,2917,447,1321,15162,175
8758,2018-12-31,23,14560,627,1312,1010,992,5091,1013,74,2804,405,1249,14577,17


In [11]:
#Same loading procedure for the price files: one dataframe variable per year
#(Price_2004 ... Price_2018), with the column names taken from row 3 (0-based).
for year in yearRange:
    locals()["Price_{}".format(year)] = pd.read_csv("Data/Price/PUB_PriceHOEPPredispOR_{}.csv".format(year), header=3)

In [12]:
#Hourly data: every year should have the same number of rows, except leap years,
#which carry 24 extra rows for the additional day.
for year in yearRange:
    print(locals()["Price_{}".format(year)].shape[0], year)

8784 2004
8760 2005
8760 2006
8760 2007
8784 2008
8760 2009
8760 2010
8760 2011
8784 2012
8760 2013
8760 2014
8760 2015
8784 2016
8760 2017
8760 2018


In [13]:
#Display the 2004 frame to confirm that the first yearly Price file was loaded correctly
Price_2004

Unnamed: 0,Date,Hour,HOEP,Hour 1 Predispatch,Hour 2 Predispatch,Hour 3 Predispatch,OR 10 Min Sync,OR 10 Min non-sync,OR 30 Min
0,2004-01-01,1,30.90,32.82,32.84,35.00,,,
1,2004-01-01,2,27.13,32.08,32.10,32.15,,,
2,2004-01-01,3,25.23,33.55,33.63,30.57,,,
3,2004-01-01,4,24.29,30.00,32.32,28.00,,,
4,2004-01-01,5,24.42,27.63,28.27,30.00,,,
...,...,...,...,...,...,...,...,...,...
8779,2004-12-31,20,31.15,37.26,50.00,50.00,,,
8780,2004-12-31,21,27.08,38.32,38.40,37.17,,,
8781,2004-12-31,22,26.72,37.48,38.00,39.14,,,
8782,2004-12-31,23,31.97,35.50,36.00,36.64,,,


In [14]:
#Display the 2018 frame to confirm that the last yearly Price file was loaded correctly
Price_2018

Unnamed: 0,Date,Hour,HOEP,Hour 1 Predispatch,Hour 2 Predispatch,Hour 3 Predispatch,OR 10 Min Sync,OR 10 Min non-sync,OR 30 Min
0,2018-01-01,1,51.29,40.62,35.50,40.32,0.20,0.20,0.19
1,2018-01-01,2,43.59,40.93,40.57,30.22,0.20,0.20,0.20
2,2018-01-01,3,93.60,49.84,45.30,40.20,0.25,0.25,0.22
3,2018-01-01,4,54.78,55.60,40.04,45.30,0.20,0.20,0.20
4,2018-01-01,5,14.35,40.04,34.80,47.47,0.20,0.20,0.20
...,...,...,...,...,...,...,...,...,...
8755,2018-12-31,20,5.62,5.92,8.80,13.33,1.44,1.44,1.44
8756,2018-12-31,21,8.95,5.83,5.96,5.92,1.24,1.24,1.24
8757,2018-12-31,22,5.81,6.00,5.57,5.00,0.45,0.20,0.20
8758,2018-12-31,23,2.87,5.86,5.92,0.00,1.26,0.19,0.15


## Stacking All the Years Together for Each DataFrame

In [15]:
#Stacking all the Demand dataframes
#All yearly frames are formatted in the same way, so they can be concatenated without further manipulation.
#A single pd.concat over the whole list avoids re-copying the accumulated frame on every loop iteration.
#globals() is used for the lookup because locals() inside a comprehension can refer to the comprehension's own scope.
#The original per-year row labels are kept (no ignore_index), exactly as the loop version produced them.
Demand = pd.concat([globals()["Demand_" + str(i)] for i in yearRange], axis = 0)

In [16]:
#Display the stacked Demand frame (all years)
Demand

Unnamed: 0,Date,Hour,Market Demand,Ontario Demand
0,2004-01-01,1,15787,14703
1,2004-01-01,2,15194,14130
2,2004-01-01,3,14589,13575
3,2004-01-01,4,14294,13280
4,2004-01-01,5,14233,13219
...,...,...,...,...
8755,2018-12-31,20,18276,16195
8756,2018-12-31,21,18046,15668
8757,2018-12-31,22,17516,14987
8758,2018-12-31,23,17506,14560


In [17]:
#Stacking all the Zonal_Demand dataframes
#All yearly frames are formatted in the same way, so they can be concatenated without further manipulation.
#A single pd.concat over the whole list avoids re-copying the accumulated frame on every loop iteration.
#globals() is used for the lookup because locals() inside a comprehension can refer to the comprehension's own scope.
#The original per-year row labels are kept (no ignore_index), exactly as the loop version produced them.
Zonal_Demand = pd.concat([globals()["Zonal_Demand_" + str(i)] for i in yearRange], axis = 0)

In [18]:
#Display the stacked Zonal_Demand frame (all years)
Zonal_Demand

Unnamed: 0,Date,Hour,Ontario Demand,Northwest,Northeast,Ottawa,East,Toronto,Essa,Bruce,Southwest,Niagara,West,Zone Total,Diff
0,2004-01-01,1,14703,811,1406,1197,903,4606,859,53,2790,542,1670,14837,134
1,2004-01-01,2,14130,805,1412,1095,889,4366,794,53,2698,516,1603,14231,101
2,2004-01-01,3,13575,795,1358,1053,842,4188,747,50,2582,507,1553,13675,100
3,2004-01-01,4,13280,789,1355,1027,817,4046,718,52,2549,517,1512,13382,102
4,2004-01-01,5,13219,779,1354,1018,811,3974,709,50,2534,529,1501,13259,40
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8755,2018-12-31,20,16195,648,1413,1133,1057,5755,1152,76,3134,472,1462,16302,107
8756,2018-12-31,21,15668,646,1419,1103,1026,5544,1101,71,3034,446,1421,15811,143
8757,2018-12-31,22,14987,630,1331,1059,991,5338,1050,78,2917,447,1321,15162,175
8758,2018-12-31,23,14560,627,1312,1010,992,5091,1013,74,2804,405,1249,14577,17


In [19]:
#Stacking all the Price dataframes
#All yearly frames are formatted in the same way, so they can be concatenated without further manipulation.
#A single pd.concat over the whole list avoids re-copying the accumulated frame on every loop iteration.
#globals() is used for the lookup because locals() inside a comprehension can refer to the comprehension's own scope.
#The original per-year row labels are kept (no ignore_index), exactly as the loop version produced them.
Price = pd.concat([globals()["Price_" + str(i)] for i in yearRange], axis = 0)

In [20]:
#Display the stacked Price frame (all years)
Price

Unnamed: 0,Date,Hour,HOEP,Hour 1 Predispatch,Hour 2 Predispatch,Hour 3 Predispatch,OR 10 Min Sync,OR 10 Min non-sync,OR 30 Min
0,2004-01-01,1,30.9,32.82,32.84,35,,,
1,2004-01-01,2,27.13,32.08,32.1,32.15,,,
2,2004-01-01,3,25.23,33.55,33.63,30.57,,,
3,2004-01-01,4,24.29,30,32.32,28,,,
4,2004-01-01,5,24.42,27.63,28.27,30,,,
...,...,...,...,...,...,...,...,...,...
8755,2018-12-31,20,5.62,5.92,8.8,13.33,1.44,1.44,1.44
8756,2018-12-31,21,8.95,5.83,5.96,5.92,1.24,1.24,1.24
8757,2018-12-31,22,5.81,6,5.57,5,0.45,0.20,0.20
8758,2018-12-31,23,2.87,5.86,5.92,0,1.26,0.19,0.15


In [21]:
#Before merging the three frames side by side, verify that their Date and Hour columns are identical.
#The Date comparisons are printed first, followed by the Hour comparisons.
for column in ('Date', 'Hour'):
    print("Demand & Demand_Zonal have the same {} column: ".format(column), Demand[column].equals(Zonal_Demand[column]))
    print("Demand & Price have the same {} column: ".format(column), Demand[column].equals(Price[column]))
    print("Price & Demand_Zonal have the same {} column: ".format(column), Price[column].equals(Zonal_Demand[column]))

Demand & Demand_Zonal have the same Date column:  True
Demand & Price have the same Date column:  True
Price & Demand_Zonal have the same Date column:  True
Demand & Demand_Zonal have the same Hour column:  True
Demand & Price have the same Hour column:  True
Price & Demand_Zonal have the same Hour column:  True


In [22]:
#The Date/Hour columns match across all three frames, so the frames are joined horizontally;
#the shared 'Date' and 'Hour' columns are kept only once (from Demand).
###TODO: This creates two Ontario Demand columns. In fact we don't really need any of the other
###region electricity demands, could you just keep the Toronto demand here? (It would be cheating/trivial
###to predict the toronto electricity demand with ontario electricity demand)
Electricity = pd.concat(
    [
        Demand,
        Zonal_Demand.drop(columns=['Date', 'Hour']),
        Price.drop(columns=['Date', 'Hour']),
    ],
    axis=1,
)

In [23]:
#Expected width: all columns of the three frames, minus the 'Date' and 'Hour'
#columns dropped from two of them (4 in total), i.e. 24 columns.
print("We expect ", sum(len(frame.columns) for frame in (Demand, Zonal_Demand, Price)) - 4, " columns.")
Electricity

We expect  24  columns.


Unnamed: 0,Date,Hour,Market Demand,Ontario Demand,Ontario Demand.1,Northwest,Northeast,Ottawa,East,Toronto,...,West,Zone Total,Diff,HOEP,Hour 1 Predispatch,Hour 2 Predispatch,Hour 3 Predispatch,OR 10 Min Sync,OR 10 Min non-sync,OR 30 Min
0,2004-01-01,1,15787,14703,14703,811,1406,1197,903,4606,...,1670,14837,134,30.9,32.82,32.84,35,,,
1,2004-01-01,2,15194,14130,14130,805,1412,1095,889,4366,...,1603,14231,101,27.13,32.08,32.1,32.15,,,
2,2004-01-01,3,14589,13575,13575,795,1358,1053,842,4188,...,1553,13675,100,25.23,33.55,33.63,30.57,,,
3,2004-01-01,4,14294,13280,13280,789,1355,1027,817,4046,...,1512,13382,102,24.29,30,32.32,28,,,
4,2004-01-01,5,14233,13219,13219,779,1354,1018,811,3974,...,1501,13259,40,24.42,27.63,28.27,30,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8755,2018-12-31,20,18276,16195,16195,648,1413,1133,1057,5755,...,1462,16302,107,5.62,5.92,8.8,13.33,1.44,1.44,1.44
8756,2018-12-31,21,18046,15668,15668,646,1419,1103,1026,5544,...,1421,15811,143,8.95,5.83,5.96,5.92,1.24,1.24,1.24
8757,2018-12-31,22,17516,14987,14987,630,1331,1059,991,5338,...,1321,15162,175,5.81,6,5.57,5,0.45,0.20,0.20
8758,2018-12-31,23,17506,14560,14560,627,1312,1010,992,5091,...,1249,14577,17,2.87,5.86,5.92,0,1.26,0.19,0.15


In [24]:
#Save the merged electricity data to disk; index=False leaves the row labels out of the file
Electricity.to_csv('IescoMerged.csv', index=False)

## Loading the Weather Data

In [25]:
#The weather files are UTC based, so the 2019 file still contains 2018 local_time data;
#that is why the range stops at 2020 (exclusive) and the 2019 file is loaded as well.
for year in range(2004, 2020):
    locals()["weather_{}".format(year)] = pd.read_csv("Data/Weather/ninja_weather_43.6535_-79.3839_{}.csv".format(year), header=3)

## Stacking All the Years Together for each Weather DF

In [26]:
#Stack the yearly weather frames (2004 through 2019) into one dataframe.
#A single pd.concat over the whole list avoids re-copying the accumulated frame on every loop iteration.
#globals() is used for the lookup because locals() inside a comprehension can refer to the comprehension's own scope.
#The original per-year row labels are kept (no ignore_index), exactly as the loop version produced them.
weather = pd.concat([globals()["weather_" + str(i)] for i in range(2004, 2020)], axis = 0)

In [27]:
#Display the stacked weather frame (all years)
weather

Unnamed: 0,time,local_time,temperature,precipitation,snowfall,snow_mass,air_density,radiation_surface,radiation_toa,cloud_cover
0,2004-01-01 00:00,2003-12-31 19:00,0.421,0.000,0.000,1.154,1.272,0.000,0.000,0.039
1,2004-01-01 01:00,2003-12-31 20:00,0.137,0.000,0.000,1.154,1.275,0.000,0.000,0.035
2,2004-01-01 02:00,2003-12-31 21:00,0.085,0.000,0.000,1.154,1.276,0.000,0.000,0.033
3,2004-01-01 03:00,2003-12-31 22:00,0.036,0.000,0.000,1.155,1.277,0.000,0.000,0.064
4,2004-01-01 04:00,2003-12-31 23:00,0.087,0.000,0.000,1.155,1.278,0.000,0.000,0.094
...,...,...,...,...,...,...,...,...,...,...
8755,2019-12-31 19:00,2019-12-31 14:00,0.327,0.194,0.184,4.193,1.253,60.453,400.510,0.902
8756,2019-12-31 20:00,2019-12-31 15:00,0.314,0.115,0.106,4.325,1.253,38.847,241.941,0.825
8757,2019-12-31 21:00,2019-12-31 16:00,0.296,0.070,0.062,4.400,1.253,7.414,51.768,0.824
8758,2019-12-31 22:00,2019-12-31 17:00,0.224,0.048,0.031,4.440,1.254,0.000,0.000,0.748


## Redefining the Weather Dataset to Only Keep what we Need

In [28]:
#Convert the two timestamp columns ('time' and 'local_time') to datetime64[ns]
weather = weather.astype(dict.fromkeys(['time', 'local_time'], 'datetime64[ns]'))

In [29]:
#Keep only the rows whose local_time falls in the years relevant to us (2004-2018, both inclusive),
#then renumber the rows from 0 so the index is continuous again.
weather = (
    weather[weather['local_time'].dt.year.between(2004, 2018)]
    .reset_index(drop=True)
)
weather

Unnamed: 0,time,local_time,temperature,precipitation,snowfall,snow_mass,air_density,radiation_surface,radiation_toa,cloud_cover
0,2004-01-01 05:00:00,2004-01-01 00:00:00,0.198,0.001,0.000,1.156,1.279,0.0,0.0,0.118
1,2004-01-01 06:00:00,2004-01-01 01:00:00,0.339,0.001,0.000,1.156,1.279,0.0,0.0,0.148
2,2004-01-01 07:00:00,2004-01-01 02:00:00,0.502,0.001,0.001,1.156,1.280,0.0,0.0,0.144
3,2004-01-01 08:00:00,2004-01-01 03:00:00,0.534,0.000,0.000,1.157,1.280,0.0,0.0,0.159
4,2004-01-01 09:00:00,2004-01-01 04:00:00,0.494,0.000,0.000,1.157,1.281,0.0,0.0,0.194
...,...,...,...,...,...,...,...,...,...,...
131491,2019-01-01 00:00:00,2018-12-31 19:00:00,2.908,2.279,0.566,3.426,1.243,0.0,0.0,0.988
131492,2019-01-01 01:00:00,2018-12-31 20:00:00,2.383,1.402,0.263,3.773,1.242,0.0,0.0,0.981
131493,2019-01-01 02:00:00,2018-12-31 21:00:00,2.611,0.244,0.041,3.866,1.237,0.0,0.0,0.985
131494,2019-01-01 03:00:00,2018-12-31 22:00:00,3.384,0.283,0.076,3.872,1.227,0.0,0.0,0.990


In [30]:
#Checking that only the relevant years (by local_time) remain
weather['local_time'].dt.year.unique()

array([2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014,
       2015, 2016, 2017, 2018], dtype=int64)

In [31]:
#Checking that the weather dataset has the same number of rows as our Electricity dataset
#(this compares row counts only, not the timestamps themselves)
len(Electricity) == len(weather)

True

In [32]:
#TODO (if we have time at the end): figure out how to merge both datasets on date and hour rather than by row position (difficult because of daylight saving time)
#Adding a 'Date' & 'Hour' column so we can easily merge the weather dataset to our Electricity one
#weather['Date'] = weather['local_time'].dt.date.astype(str) 
#weather['Hour'] = (weather['local_time'].dt.hour + 1).astype('int64')

In [43]:
#Resetting the index of both dataframes so that each has a fresh 0..n-1 index
#(needed so that the same exact rows can be compared and lined up by position)
Electricity = Electricity.reset_index(drop=True)
weather = weather.reset_index(drop=True)

In [34]:
#Electricity.Date

In [35]:
#weather.Date

In [36]:
#Checking that all our Date/Hour columns are exactly the same to see if we can simply merge these dataframes horizontally
#Electricity.Date is weather.Date
#Electricity.Hour.isin(weather.Hour).value_counts()
#print("Electricity & weather have the same Date column: ", (Electricity['Date']).eq(weather['Date']))
#result = (((Electricity['Date']).eq(weather['Date'])))
#result[result == False].index
#weather[result[result == False].index]
#print("Electricity & weather have the same Hour column: ", (Electricity['Hour']).eq(weather['Hour']))

In [37]:
#Electricity.loc[2255:2290, ['Date', 'Hour']]

In [38]:
#weather.loc[2255:2290, ['Date', 'Hour']]

In [44]:
#Merge the electricity and weather frames side by side into a single dataset, matching rows by their (reset) index.
#NOTE(review): the Date/Hour equality checks for this step are commented out earlier in the notebook, so the only
#precondition actually verified is that both frames have the same number of rows - confirm the rows line up in time.
#mergedDataset = pd.concat([Electricity, weather.drop(['Date', 'Hour'], axis=1)], axis = 1)
mergedDataset = pd.concat([Electricity, weather], axis = 1)

In [45]:
#Display the merged electricity + weather dataset
mergedDataset

Unnamed: 0,Date,Hour,Market Demand,Ontario Demand,Ontario Demand.1,Northwest,Northeast,Ottawa,East,Toronto,...,time,local_time,temperature,precipitation,snowfall,snow_mass,air_density,radiation_surface,radiation_toa,cloud_cover
0,2004-01-01,1,15787,14703,14703,811,1406,1197,903,4606,...,2004-01-01 05:00:00,2004-01-01 00:00:00,0.198,0.001,0.000,1.156,1.279,0.0,0.0,0.118
1,2004-01-01,2,15194,14130,14130,805,1412,1095,889,4366,...,2004-01-01 06:00:00,2004-01-01 01:00:00,0.339,0.001,0.000,1.156,1.279,0.0,0.0,0.148
2,2004-01-01,3,14589,13575,13575,795,1358,1053,842,4188,...,2004-01-01 07:00:00,2004-01-01 02:00:00,0.502,0.001,0.001,1.156,1.280,0.0,0.0,0.144
3,2004-01-01,4,14294,13280,13280,789,1355,1027,817,4046,...,2004-01-01 08:00:00,2004-01-01 03:00:00,0.534,0.000,0.000,1.157,1.280,0.0,0.0,0.159
4,2004-01-01,5,14233,13219,13219,779,1354,1018,811,3974,...,2004-01-01 09:00:00,2004-01-01 04:00:00,0.494,0.000,0.000,1.157,1.281,0.0,0.0,0.194
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
131491,2018-12-31,20,18276,16195,16195,648,1413,1133,1057,5755,...,2019-01-01 00:00:00,2018-12-31 19:00:00,2.908,2.279,0.566,3.426,1.243,0.0,0.0,0.988
131492,2018-12-31,21,18046,15668,15668,646,1419,1103,1026,5544,...,2019-01-01 01:00:00,2018-12-31 20:00:00,2.383,1.402,0.263,3.773,1.242,0.0,0.0,0.981
131493,2018-12-31,22,17516,14987,14987,630,1331,1059,991,5338,...,2019-01-01 02:00:00,2018-12-31 21:00:00,2.611,0.244,0.041,3.866,1.237,0.0,0.0,0.985
131494,2018-12-31,23,17506,14560,14560,627,1312,1010,992,5091,...,2019-01-01 03:00:00,2018-12-31 22:00:00,3.384,0.283,0.076,3.872,1.227,0.0,0.0,0.990


In [46]:
#Save the merged dataset without the row index, consistent with the other to_csv calls in this notebook
#(without index=False the file gets an extra unnamed index column).
mergedDataset.to_csv('MergedDataset.csv', index=False)

## Adding columns for isWeekend and isHoliday

In [48]:
#Inspect the column dtypes before deriving the calendar features
mergedDataset.dtypes
#NOTE(review): the disabled line below refers to a 'Date&Time' column that does not appear among the columns shown
#mergedDataset['Date&Time'] = pd.to_datetime(mergedDataset['Date&Time'])

Date                          object
Hour                           int64
Market Demand                  int64
Ontario Demand                 int64
Ontario Demand                 int64
Northwest                      int64
Northeast                      int64
Ottawa                         int64
East                           int64
Toronto                        int64
Essa                           int64
Bruce                          int64
Southwest                      int64
Niagara                        int64
West                           int64
Zone Total                     int64
Diff                          object
HOEP                          object
Hour 1 Predispatch            object
Hour 2 Predispatch            object
Hour 3 Predispatch            object
OR 10 Min Sync               float64
OR 10 Min non-sync           float64
OR 30 Min                    float64
time                  datetime64[ns]
local_time            datetime64[ns]
temperature                  float64
p

In [49]:
#Adding a column for isWeekend
#dayofweek runs from 0 (Monday) to 6 (Sunday), so values of 5 and above are Saturday and Sunday
mergedDataset['isWeekend'] = mergedDataset['local_time'].dt.dayofweek >= 5

In [None]:
#Dealing with holidays now
#Installing the following holiday library https://pypi.org/project/holidays/
#!pip install holidays

In [50]:
!pip install holidays



In [51]:
#Importing the newly added library
import holidays

In [54]:
#Build the Ontario holiday calendar for every year that appears in local_time
#NOTE(review): newer releases of the holidays package prefer `subdiv` over `prov` - confirm against the installed version
Toronto_Holidays = holidays.Canada(prov = 'ON', years = mergedDataset['local_time'].dt.year.unique())
print('There are ', len(Toronto_Holidays), ' holidays (days) in the year range we selected.')
#Each holiday day should cover roughly 24 hourly rows
print('There should be around ', len(Toronto_Holidays) * 24, ' rows that are true for isHoliday.')

There are  161  holidays (days) in the year range we selected.
There should be around  3864  rows that are true for isHoliday.


In [55]:
#List every holiday (date followed by name) in chronological order to sanity-check the calendar
for holiday in sorted(Toronto_Holidays.items()):
    print(*holiday)

2004-01-01 New Year's Day
2004-04-09 Good Friday
2004-05-24 Victoria Day
2004-07-01 Canada Day
2004-08-02 Civic Holiday
2004-09-06 Labour Day
2004-10-11 Thanksgiving
2004-12-24 Christmas Day (Observed)
2004-12-25 Christmas Day
2004-12-27 Boxing Day (Observed)
2004-12-31 New Year's Day (Observed)
2005-01-01 New Year's Day
2005-03-25 Good Friday
2005-05-23 Victoria Day
2005-07-01 Canada Day
2005-08-01 Civic Holiday
2005-09-05 Labour Day
2005-10-10 Thanksgiving
2005-12-25 Christmas Day
2005-12-26 Christmas Day (Observed)
2005-12-27 Boxing Day (Observed)
2006-01-01 New Year's Day
2006-01-02 New Year's Day (Observed)
2006-04-14 Good Friday
2006-05-22 Victoria Day
2006-07-01 Canada Day
2006-07-03 Canada Day (Observed)
2006-08-07 Civic Holiday
2006-09-04 Labour Day
2006-10-09 Thanksgiving
2006-12-25 Christmas Day
2006-12-26 Boxing Day
2007-01-01 New Year's Day
2007-04-06 Good Friday
2007-05-21 Victoria Day
2007-07-01 Canada Day
2007-07-02 Canada Day (Observed)
2007-08-06 Civic Holiday
2007-09

In [57]:
#Adding the isHoliday column: True when the calendar date of local_time is one of the holiday dates
#NOTE(review): isin() is given the holidays mapping itself; presumably it iterates over its keys (the dates) - confirm
mergedDataset['isHoliday'] = mergedDataset['local_time'].dt.date.isin(Toronto_Holidays)  

In [58]:
#Show only the rows flagged as holidays
mergedDataset.loc[mergedDataset['isHoliday']]

Unnamed: 0,Date,Hour,Market Demand,Ontario Demand,Ontario Demand.1,Northwest,Northeast,Ottawa,East,Toronto,...,temperature,precipitation,snowfall,snow_mass,air_density,radiation_surface,radiation_toa,cloud_cover,isWeekend,isHoliday
0,2004-01-01,1,15787,14703,14703,811,1406,1197,903,4606,...,0.198,0.001,0.000,1.156,1.279,0.0,0.0,0.118,False,True
1,2004-01-01,2,15194,14130,14130,805,1412,1095,889,4366,...,0.339,0.001,0.000,1.156,1.279,0.0,0.0,0.148,False,True
2,2004-01-01,3,14589,13575,13575,795,1358,1053,842,4188,...,0.502,0.001,0.001,1.156,1.280,0.0,0.0,0.144,False,True
3,2004-01-01,4,14294,13280,13280,789,1355,1027,817,4046,...,0.534,0.000,0.000,1.157,1.280,0.0,0.0,0.159,False,True
4,2004-01-01,5,14233,13219,13219,779,1354,1018,811,3974,...,0.494,0.000,0.000,1.157,1.281,0.0,0.0,0.194,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
131371,2018-12-26,20,18698,16414,16414,573,1448,1193,1094,5704,...,-1.023,0.001,0.001,3.779,1.295,0.0,0.0,0.400,False,True
131372,2018-12-26,21,18268,16158,16158,575,1461,1166,1068,5619,...,-1.588,0.001,0.001,3.781,1.299,0.0,0.0,0.712,False,True
131373,2018-12-26,22,17952,15744,15744,560,1426,1133,1038,5478,...,-2.113,0.001,0.001,3.783,1.302,0.0,0.0,0.883,False,True
131374,2018-12-26,23,17336,14972,14972,557,1438,1075,984,5254,...,-2.588,0.000,0.000,3.785,1.305,0.0,0.0,0.911,False,True


In [59]:
#Preview the first 25 rows, including the new isWeekend and isHoliday columns
mergedDataset.head(25)

Unnamed: 0,Date,Hour,Market Demand,Ontario Demand,Ontario Demand.1,Northwest,Northeast,Ottawa,East,Toronto,...,temperature,precipitation,snowfall,snow_mass,air_density,radiation_surface,radiation_toa,cloud_cover,isWeekend,isHoliday
0,2004-01-01,1,15787,14703,14703,811,1406,1197,903,4606,...,0.198,0.001,0.0,1.156,1.279,0.0,0.0,0.118,False,True
1,2004-01-01,2,15194,14130,14130,805,1412,1095,889,4366,...,0.339,0.001,0.0,1.156,1.279,0.0,0.0,0.148,False,True
2,2004-01-01,3,14589,13575,13575,795,1358,1053,842,4188,...,0.502,0.001,0.001,1.156,1.28,0.0,0.0,0.144,False,True
3,2004-01-01,4,14294,13280,13280,789,1355,1027,817,4046,...,0.534,0.0,0.0,1.157,1.28,0.0,0.0,0.159,False,True
4,2004-01-01,5,14233,13219,13219,779,1354,1018,811,3974,...,0.494,0.0,0.0,1.157,1.281,0.0,0.0,0.194,False,True
5,2004-01-01,6,14398,13354,13354,776,1351,1034,829,3966,...,0.331,0.0,0.0,1.158,1.282,0.0,0.0,0.137,False,True
6,2004-01-01,7,14931,13462,13462,777,1339,1067,862,4023,...,0.079,0.0,0.0,1.158,1.284,0.0,0.0,0.124,False,True
7,2004-01-01,8,14777,13538,13538,756,1334,1135,857,4082,...,-0.103,0.0,0.0,1.159,1.286,0.297,1.386,0.131,False,True
8,2004-01-01,9,14926,13675,13675,764,1347,1216,842,4112,...,0.145,0.0,0.0,1.159,1.288,61.136,129.696,0.126,False,True
9,2004-01-01,10,15058,14234,14234,769,1379,1280,879,4301,...,0.885,0.0,0.0,1.158,1.288,185.368,314.938,0.131,False,True


In [60]:
#Write the final table (now including isWeekend and isHoliday) back to MergedDataset.csv,
#overwriting the earlier export; index=False leaves the row labels out of the file
mergedDataset.to_csv('MergedDataset.csv', index=False)