### Analysis of energy demand and temperature

The purpose of this script is to analyse both energy demand and weather data, performing exploratory analysis on these two datasets

In [2]:
# Prepare the working environment
import os
import psycopg2 as pg
import sqlalchemy as sa
import numpy as np
import pandas as pd
import plotly.plotly as py
import plotly.graph_objs as go
import plotly.offline as po
from plotly import tools

# To run plotly offline
po.init_notebook_mode(connected=True)

# Set a working directory
os.chdir("C:/Users/Blake/Documents/UTS/36102_iLab_1/Client/")

# Define input parameters
user_name = r"postgres"
user_pass = r"password"
db_name = r"endgame"
server = r"localhost"

# Establishing a connection to postgres
strEngine = r'postgresql://' + user_name+ "@" + server + "/" + db_name
engine = sa.create_engine(strEngine)
connstring = "host='%s' dbname='%s' user='%s' password='%s'" % (server, db_name, user_name, user_pass)
conn = pg.connect(connstring)

Let's begin by looking at long term trends in energy demand for each region. The demand data will be aggregated to months and measured as average megawatts (MW)

In [2]:
# Aggregate to an average monthly demand
sqlAllDemand = r"SELECT a.regionid \
, a.s_perd \
, sum(a.totaldemand) / count(a.totaldemand) as totaldemand_mean \
FROM \
	( \
	SELECT regionid \
	, extract('month' from settlementdate) as s_month \
	, extract('year' from settlementdate) as s_year \
	, cast(extract('year' from settlementdate) || '-' || extract('month' from settlementdate) || '-01' as date) as s_perd \
	, totaldemand \
	FROM aemo.demand \
	) a \
group by 1,2 \
order by 1,2"

# Run the query and retrieve the data
dataAllDemand = pd.read_sql_query(sqlAllDemand, con=engine)

# Correct some column types
dataAllDemand.s_perd = pd.to_datetime(dataAllDemand['s_perd'])
dataAllDemand = dataAllDemand.set_index(['s_perd'], drop = False)


In [3]:
# View the resultant output
dataAllDemand.info()
dataAllDemand.head()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 425 entries, 2010-01-01 to 2017-01-01
Data columns (total 3 columns):
regionid            425 non-null object
s_perd              425 non-null datetime64[ns]
totaldemand_mean    425 non-null float64
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 13.3+ KB


Unnamed: 0_level_0,regionid,s_perd,totaldemand_mean
s_perd,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010-01-01,NSW1,2010-01-01,8953.783408
2010-02-01,NSW1,2010-02-01,9167.354818
2010-03-01,NSW1,2010-03-01,8712.672161
2010-04-01,NSW1,2010-04-01,8205.721965
2010-05-01,NSW1,2010-05-01,8806.852456


In [4]:
# Visualise the result using a line chart
# Create objects for NSW
LTTrendsNSW = dataAllDemand[dataAllDemand['regionid'] == 'NSW1']
plotLTTrendsNSW = go.Scatter(x = LTTrendsNSW.s_perd, y = LTTrendsNSW.totaldemand_mean, name = "NSW")

# Create objects for VIC
LTTrendsVIC = dataAllDemand[dataAllDemand['regionid'] == 'VIC1']
plotLTTrendsVIC = go.Scatter(x = LTTrendsVIC.s_perd, y = LTTrendsVIC.totaldemand_mean, name = "VIC")

# Create objects for QLD
LTTrendsQLD = dataAllDemand[dataAllDemand['regionid'] == 'QLD1']
plotLTTrendsQLD = go.Scatter(x = LTTrendsQLD.s_perd, y = LTTrendsQLD.totaldemand_mean, name = "QLD")

# Create objects for SA
LTTrendsSA = dataAllDemand[dataAllDemand['regionid'] == 'SA1']
plotLTTrendsSA = go.Scatter(x = LTTrendsSA.s_perd, y = LTTrendsSA.totaldemand_mean, name = "SA")

# Create objects for TAS
LTTrendsTAS = dataAllDemand[dataAllDemand['regionid'] == 'TAS1']
plotLTTrendsTAS = go.Scatter(x = LTTrendsTAS.s_perd, y = LTTrendsTAS.totaldemand_mean, name = "TAS")

# Plot all of the objects together
plotLayout = go.Layout(title="Average monthly demand by region",
                xaxis=dict(title='Time (Months)'),
                yaxis=dict(title='Average demand (MW)'))

# Combine the traces together for plotly
plotNational = [plotLTTrendsNSW, plotLTTrendsVIC, plotLTTrendsQLD, plotLTTrendsSA, plotLTTrendsTAS]

plotFig = go.Figure(data = plotNational, layout = plotLayout)

po.iplot(plotFig)

The plot above shows the seasonality present in energy demand, amidst a broader trend lower since 2010. This decline has been driven by improvements in energy efficiency of appliances, changing consumer preferences and the falling cost of rooftop solar as well as economic (industry composition particularly the decline of manufacturing) changes.  
  
The QLD region is the only region where demand has increased, which has largely been driven by industrial energy demand (LNG).  
  
At the start of 2017, NSW had the highest demnd of the regions closely followed by QLD, and VIC. SA and TAS have the smallest demand of the NEM with both requiring less than 2000MW on average. 

### Analysis of NSW Energy Demand 
The subsequent analysis focusses on NSW, but has been parameterised so that other regions in other time periods can also be explored

#### Extract and prepare the demand data

In [5]:
# Retrieve a year of demand data
sqlRegion = "NSW1"
sqlLowerDateGE = "2015-11-01"
sqlUpperDateLE = "2016-10-31"

# Build a query for data extraction
sqlDemand = r"select to_char(settlementdate, 'YYYY-MM-DD HH24:MI:SS') as settlementdate \
, totaldemand \
, date(settlementdate) as s_date \
, date_part('year', settlementdate) as s_year \
, date_part('month', settlementdate) as s_month \
, date_part('isodow', settlementdate) as s_dow \
, date_part('hour', settlementdate) as s_hour \
from aemo.demand \
where regionid = '" + sqlRegion + "' \
and settlementdate >= '" + sqlLowerDateGE + "' and settlementdate <= '" + sqlUpperDateLE + "'"

# Read the data into a dataframe
dataDemand = pd.read_sql_query(sqlDemand, con=engine)

The following chunk processes the data for convenient downstream visualisation

In [6]:
# Correct some datatypes
dataDemand.s_month = dataDemand.s_month.astype(int)
dataDemand.s_year = dataDemand.s_year.astype(int)
dataDemand.s_dow = dataDemand.s_dow.astype(int)
dataDemand.s_hour = dataDemand.s_hour.astype(int)
dataDemand.s_date = pd.to_datetime(dataDemand['s_date'])
dataDemand.settlementdate = pd.to_datetime(dataDemand['settlementdate'])

# Assign the settlement date as an index 
dataDemand = dataDemand.set_index(['settlementdate'])

# Create a new variable for weekday/weekend
dataDemand['weekday'] = np.where(dataDemand['s_dow'] <= 5, 1, 0)

# Create a new variable that bins the hour of day
# First variable are the conditions
hrConditions = [(dataDemand['s_hour'] >= 0) & (dataDemand['s_hour'] < 4)
, (dataDemand['s_hour'] >= 4) & (dataDemand['s_hour'] < 8)
, (dataDemand['s_hour'] >= 8) & (dataDemand['s_hour'] < 12)
, (dataDemand['s_hour'] >= 12) & (dataDemand['s_hour'] < 16)
, (dataDemand['s_hour'] >= 16) & (dataDemand['s_hour'] < 20)
, (dataDemand['s_hour'] >= 20) & (dataDemand['s_hour'] < 24)]

# Next create a vector of results
hrBin = list(range(1,7))
hrLabel = ['Midnight-4am', '4am-8am', '8am-12pm', '12pm-4pm', '4pm-8pm', '8pm-Midnight']

# Add the new column(s)
dataDemand['hr_bin'] = np.select(hrConditions, hrBin)
dataDemand['hr_label'] = np.select(hrConditions, hrLabel)

# Create a label for the month
mnthLabel = {1:'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun', 7:'Jul', 8:'Aug', 9:'Sep', 10:'Oct', 11:'Nov', 12:'Dec'}

# Replace the dataframe and now there is a month label
dataDemand['mnth_label'] = dataDemand['s_month'].replace(mnthLabel)
del mnthLabel

In [7]:
# See the outputs
dataDemand.info()
dataDemand.head()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 107137 entries, 2015-11-01 00:00:00 to 2016-10-31 00:00:00
Data columns (total 10 columns):
totaldemand    107137 non-null float64
s_date         107137 non-null datetime64[ns]
s_year         107137 non-null int32
s_month        107137 non-null int32
s_dow          107137 non-null int32
s_hour         107137 non-null int32
weekday        107137 non-null int32
hr_bin         107137 non-null int32
hr_label       107137 non-null object
mnth_label     107137 non-null object
dtypes: datetime64[ns](1), float64(1), int32(6), object(2)
memory usage: 11.5+ MB


Unnamed: 0_level_0,totaldemand,s_date,s_year,s_month,s_dow,s_hour,weekday,hr_bin,hr_label,mnth_label
settlementdate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2015-11-01 00:00:00,6525.18,2015-11-01,2015,11,7,0,0,1,Midnight-4am,Nov
2015-11-01 00:05:00,6490.08,2015-11-01,2015,11,7,0,0,1,Midnight-4am,Nov
2015-11-01 00:10:00,6550.95,2015-11-01,2015,11,7,0,0,1,Midnight-4am,Nov
2015-11-01 00:15:00,6463.13,2015-11-01,2015,11,7,0,0,1,Midnight-4am,Nov
2015-11-01 00:20:00,6352.35,2015-11-01,2015,11,7,0,0,1,Midnight-4am,Nov


These measures of demand are observed every 5 minutes. The following plot describes how demand varies over a few days

In [8]:
# Plot a sample of the most granular data
snapStart = '2016-01-05'
snapEnd = '2016-01-15'
granDemand = dataDemand[snapStart:snapEnd]

plotGranDemand = go.Scatter(x = granDemand.index, y = granDemand.totaldemand, name = sqlRegion)

plotLayout = go.Layout(title="Energy demand in " + sqlRegion,
                xaxis=dict(title='Time (5min)'),
                yaxis=dict(title='Demand (MW)'))
plotGran = [plotGranDemand]
plotFig = go.Figure(data = plotGran, layout = plotLayout)
po.iplot(plotFig)


What we can see from this plot is that on ordinary days (5th to 11th), the demand averages around the 8000MW level, which is consistent with the NSW average. There are however, days where this peak exceeds 11,000MW, which is the result of higher temperatures driving up the use of cooling appliances.   

The BOM confirms this weather anomaly in their January 2016 weather summary:  
"Two heatwaves, over 11-14 and 19-21 January, resulted Observatory Hill recording 8 days above 30 °C, well above the average of 3 days and the most hot days since January 1991."  

http://www.bom.gov.au/climate/current/month/nsw/archive/201601.sydney.shtml  
  
Also interesting in this plot is the difference in the intra-day pattern between a 'regular' day (e.g. January 6th) and an extreme weather day (e.g. January 11th). Regular days tend to have a slightly bi-modal pattern with peaks early in the day (around 9am), and then again the early evening (around 3 to 5pm). Compare this to the extreme weather days where demand rises in a linear fashion to peak around 4 to 5pm. 

#### Visualise Max, Mean and Min demand
The purpose of this next plot is to explore how maximum daily demand compares to the minimum and average. It is also useful to understand how these values have trended over time. 

In [9]:
# Aggregate to an average daily demand
sqlAggDemand = r"SELECT date(settlementdate) as s_date \
, max(totaldemand) as max_demand \
, min(totaldemand) as min_demand \
, sum(totaldemand) / count(totaldemand) as avg_demand \
FROM aemo.demand \
WHERE regionid = '" + sqlRegion + "' \
GROUP BY 1 \
ORDER BY 1 \
"
dataAggDemand = pd.read_sql_query(sqlAggDemand, con=engine)

# Correct some column types
dataAggDemand.s_date = pd.to_datetime(dataAggDemand['s_date'])
dataAggDemand = dataAggDemand.set_index(['s_date'],drop=True)

dataAggDemand.head()

Unnamed: 0_level_0,max_demand,min_demand,avg_demand
s_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010-01-01,8922.42,6157.36,7792.753066
2010-01-02,9326.64,6112.73,8012.314097
2010-01-03,8277.85,6014.91,7393.354514
2010-01-04,9522.3,6023.79,8254.502222
2010-01-05,10728.72,6287.12,8832.004931


In [10]:
# Visualise the result
# Create objects for Max demand
plotMax = go.Scatter(x = dataAggDemand.index
                     , y = dataAggDemand.max_demand
                     , name = "Max"
                     , mode = 'markers'
                     , opacity = 0.9
                     , marker = dict(size = 3, color = 'rgb(202,0,32)'))

# Create objects for Min demand
plotMin = go.Scatter(x = dataAggDemand.index
                     , y = dataAggDemand.min_demand
                     , name = "Min"
                     , mode = 'markers'
                     , opacity = 0.9
                     , marker = dict(size = 3, color = 'rgb(5,113,176)') )

# Create objects for Mean demand
plotMean = go.Scatter(x = dataAggDemand.index
                      , y = dataAggDemand.avg_demand
                      , name = "Mean"
                      , mode = 'markers'
                      , opacity = 0.5
                      , marker = dict(size = 3, color = 'rgb(180, 180, 180)'))

# Plot all of the objects together
plotLayout = go.Layout(title="Aggregate measures of daily demand for " + sqlRegion,
                xaxis=dict(title='Time (Days)'),
                yaxis=dict(title='Demand (MW)'))

# Collect all the traces for plotting
plotAggregates = [plotMax, plotMin, plotMean]

plotFig = go.Figure(data = plotAggregates, layout = plotLayout)
po.iplot(plotFig)

In NSW, there has been a downward trend for daily maximum demand. At the beginning of the series, it was not uncommon to see  observations over 12,000MW. By 2017 however, most of the time maximum demand is capped by the 12,000MW level. Minimum demand (less of an interest) has also trended downwards over time.

#### Extract and prepare the climate data
As the weather data is reported in half hour increments, the energy demand data will be aggregated to half hour intervals from five minute intervals. 

In [11]:
hfhrDemand = dataDemand.groupby(pd.TimeGrouper(freq="30min")).agg({'totaldemand':[np.mean, sum]
                                                                   , 's_date':[max]
                                                                   , 's_year':[max]
                                                                   , 's_month':[max]
                                                                   , 's_dow':[max]
                                                                   , 's_hour':[max]
                                                                   , 'weekday':[max]
                                                                   , 'hr_bin':[max]
})

# Rename the columns
hfhrDemand.columns = ['totaldemand_mean', 'totaldemand_sum', 's_date', 's_year', 's_month', 's_dow', 's_hour', 'weekday', 'hr_bin']

# Drop missing obs - seems to be missing obs in October every year
hfhrDemand = hfhrDemand.dropna(axis = 0, how = "all")

# Correct variable types 
hfhrDemand.s_year = hfhrDemand.s_year.astype(int)
hfhrDemand.s_month = hfhrDemand.s_month.astype(int)
hfhrDemand.s_dow = hfhrDemand.s_dow.astype(int)
hfhrDemand.s_hour = hfhrDemand.s_hour.astype(int)
hfhrDemand.weekday = hfhrDemand.weekday.astype(int)
hfhrDemand.hr_bin = hfhrDemand.hr_bin.astype(int)


Visualise this dataset over a year to understand how energy demand varies seasonally over a year. 

In [12]:
# Create a scatter plot
plotYrTrend = [go.Scatter(x = hfhrDemand.index
                          , y = hfhrDemand.totaldemand_mean
                          , name = sqlRegion + " Demand"
                          , mode = "line"
                          , line = dict(color = ('rgb(22, 96, 167)'), width = 1)
                          , opacity = 0.6)]

# Plot all of the objects together
plotLayout = go.Layout(title="Annual half hourly demand for energy - " + sqlRegion,
                xaxis=dict(title='Time (30min)'),
                yaxis=dict(title='Total demand (MW)'))

plotFig = go.Figure(data = plotYrTrend, layout = plotLayout)
po.iplot(plotFig)

There are two periods where energy demands peak. In the hottest of summer months (December, January) and the coldest of winter months (July, August). Interestingly, the peaks of demand in summer are far more volatile than the gradual build up to the peaks of winter. This is likely due to the nature of our climate where in summer we get bursts of very hot weather punctuated by moderately hot weather while in winter, the temperature gradually cools with fewer really cold snaps. 

### Analysis of NSW Climate Data 

The next part of this analysis will focus on air temperature data collected from the Bureau of Meterology. The weather data from Bankstown Airport has been used for this demonstration, but can be switched. Bankstown Airport was chosen for its approximation of being in the centre of Sydney.

In [13]:
# Extract and prepare the data
stationID = "66137"
stationName = "Bankstown Airport"

# Prepare a query for data extraction
sqlWeather = r"select to_char(a.index, 'YYYY-MM-DD HH24:MI:SS') as ts \
, air_temp \
, date(a.index) as i_date \
, date_part('year', a.index) as i_year \
, date_part('month', a.index) as i_month \
, date_part('isodow', a.index) as i_dow \
, date_part('hour', a.index) as i_hour \
from bom.weather a \
where a.station_id = '" + stationID + "' \
and a.index >= '" + sqlLowerDateGE + "' and a.index <= '" + sqlUpperDateLE + "';"

# Read the data into a dataframe
dataWeather = pd.read_sql_query(sqlWeather, con=engine)

# Correct some variable types
dataWeather.i_year = dataWeather.i_year.astype(int)
dataWeather.i_month = dataWeather.i_month.astype(int)
dataWeather.i_dow = dataWeather.i_dow.astype(int)
dataWeather.i_hour = dataWeather.i_hour.astype(int)
dataWeather.i_date = pd.to_datetime(dataWeather['i_date'])
dataWeather.ts = pd.to_datetime(dataWeather['ts'])

# Assign the settlement date as an index 
dataWeather = dataWeather.set_index(['ts'])

In [14]:
# Review the returned data
dataWeather.info()
dataWeather.head()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 17473 entries, 2015-12-21 01:00:00 to 2016-10-31 00:00:00
Data columns (total 6 columns):
air_temp    17473 non-null float64
i_date      17473 non-null datetime64[ns]
i_year      17473 non-null int32
i_month     17473 non-null int32
i_dow       17473 non-null int32
i_hour      17473 non-null int32
dtypes: datetime64[ns](1), float64(1), int32(4)
memory usage: 682.5 KB


Unnamed: 0_level_0,air_temp,i_date,i_year,i_month,i_dow,i_hour
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2015-12-21 01:00:00,30.2,2015-12-21,2015,12,1,1
2015-11-01 00:00:00,15.1,2015-11-01,2015,11,7,0
2015-11-01 00:30:00,15.4,2015-11-01,2015,11,7,0
2015-11-01 01:00:00,15.0,2015-11-01,2015,11,7,1
2015-11-01 01:30:00,15.1,2015-11-01,2015,11,7,1


Let's re-visit that period in early January and confirm the heatwave:

In [15]:
# Visualise an equivalent snapshot of weather
granWeather = dataWeather[snapStart:snapEnd]

plotGranDemand = go.Scatter(x = granDemand.index, y = granDemand.totaldemand, name = sqlRegion, yaxis = 'y2')

plotGranWeather = go.Scatter(x = granWeather.index
                              , y = granWeather.air_temp
                              , name = stationName + " air temp"
                              , line = go.Line(color='rgb(240,59,32)')
                              , yaxis = 'y1')

plotLayout = go.Layout(title="Air temperature at " + stationName + " vs " + sqlRegion + " Energy Demand"
                       , xaxis = dict(title='Time')
                       , yaxis = dict(title='Temperature (Deg C)'
                                      , side = 'left'
                                      , showgrid=False)
                       , yaxis2 = dict(title='Demand (MW)'
                                      , side = 'right'
                                      , overlaying='y'
                                      , showgrid=False) )

plotBoth = [plotGranDemand, plotGranWeather]
plotFig = go.Figure(data = plotBoth, layout = plotLayout)
po.iplot(plotFig)

From this small window of data using only one weather station, we can see that energy correlates well with air temperature. Let's extend the window and further explore the relationship between these two measures. 

### Analysis of both climate and demand data

In [16]:
# Create a new object that brings the two together
dataModel = hfhrDemand.merge(dataWeather, how="left", left_index=True, right_index=True)

# Drop some unnecessary columns 
rmFields = ['i_date', 'i_year', 'i_month', 'i_dow', 'i_hour']
dataModel = dataModel.drop(rmFields, axis=1)
del rmFields

In [17]:
# View the result
dataModel.info()
dataModel.head()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 17521 entries, 2015-11-01 00:00:00 to 2016-10-31 00:00:00
Data columns (total 10 columns):
totaldemand_mean    17521 non-null float64
totaldemand_sum     17521 non-null float64
s_date              17521 non-null datetime64[ns]
s_year              17521 non-null int32
s_month             17521 non-null int32
s_dow               17521 non-null int32
s_hour              17521 non-null int32
weekday             17521 non-null int32
hr_bin              17521 non-null int32
air_temp            17473 non-null float64
dtypes: datetime64[ns](1), float64(3), int32(6)
memory usage: 1.1 MB


Unnamed: 0,totaldemand_mean,totaldemand_sum,s_date,s_year,s_month,s_dow,s_hour,weekday,hr_bin,air_temp
2015-11-01 00:00:00,6446.443333,38678.66,2015-11-01,2015,11,7,0,0,1,15.1
2015-11-01 00:30:00,6306.366667,37838.2,2015-11-01,2015,11,7,0,0,1,15.4
2015-11-01 01:00:00,6038.29,36229.74,2015-11-01,2015,11,7,1,0,1,15.0
2015-11-01 01:30:00,5856.128333,35136.77,2015-11-01,2015,11,7,1,0,1,15.1
2015-11-01 02:00:00,5756.071667,34536.43,2015-11-01,2015,11,7,2,0,1,14.4


In [18]:
# Create an object for plotting
plotModel = dataModel[:].copy()

# Create a series of conditions for a time bin
hrConditions = [(plotModel['hr_bin'] == 1)
                , (plotModel['hr_bin'] == 2)
                , (plotModel['hr_bin'] == 3)
                , (plotModel['hr_bin'] == 4)
                , (plotModel['hr_bin'] == 5)
                , (plotModel['hr_bin'] == 6)]

# Next create a vector of results
hrLabel = ['Midnight-4am', '4am-8am', '8am-11pm', '12pm-4pm', '4pm-8pm', '8pm-Midnight']

# Add the new column
plotModel['hr_label'] = np.select(hrConditions, hrLabel)

# Create a series of conditions for a season bin - group autumn and spring together
snConditions = [(plotModel['s_month'].isin([12, 1, 2]))
               , (plotModel['s_month'].isin([3, 4, 5, 9, 10, 11]))
               , (plotModel['s_month'].isin([6, 7, 8]))]

# Create a vector of results
snBin = [1, 2, 3]
snLabel = ['Summer', 'Spring/Autumn', 'Winter']

# Add the new column
plotModel['sn_bin'] = np.select(snConditions, snBin)
plotModel['sn_label'] = np.select(snConditions, snLabel)

In [19]:
# View the result
plotModel.info()
plotModel.head()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 17521 entries, 2015-11-01 00:00:00 to 2016-10-31 00:00:00
Data columns (total 13 columns):
totaldemand_mean    17521 non-null float64
totaldemand_sum     17521 non-null float64
s_date              17521 non-null datetime64[ns]
s_year              17521 non-null int32
s_month             17521 non-null int32
s_dow               17521 non-null int32
s_hour              17521 non-null int32
weekday             17521 non-null int32
hr_bin              17521 non-null int32
air_temp            17473 non-null float64
hr_label            17521 non-null object
sn_bin              17521 non-null int32
sn_label            17521 non-null object
dtypes: datetime64[ns](1), float64(3), int32(7), object(2)
memory usage: 2.0+ MB


Unnamed: 0,totaldemand_mean,totaldemand_sum,s_date,s_year,s_month,s_dow,s_hour,weekday,hr_bin,air_temp,hr_label,sn_bin,sn_label
2015-11-01 00:00:00,6446.443333,38678.66,2015-11-01,2015,11,7,0,0,1,15.1,Midnight-4am,2,Spring/Autumn
2015-11-01 00:30:00,6306.366667,37838.2,2015-11-01,2015,11,7,0,0,1,15.4,Midnight-4am,2,Spring/Autumn
2015-11-01 01:00:00,6038.29,36229.74,2015-11-01,2015,11,7,1,0,1,15.0,Midnight-4am,2,Spring/Autumn
2015-11-01 01:30:00,5856.128333,35136.77,2015-11-01,2015,11,7,1,0,1,15.1,Midnight-4am,2,Spring/Autumn
2015-11-01 02:00:00,5756.071667,34536.43,2015-11-01,2015,11,7,2,0,1,14.4,Midnight-4am,2,Spring/Autumn


In [20]:
# Visualise the relationship between air temperature and average demand over half hour period
# First plot weekends vs weekdays
plotWeekend = plotModel[plotModel['weekday'] == 0]
plotWeekendTrace = go.Scatter(x = plotWeekend.air_temp
                     , y = plotWeekend.totaldemand_mean
                     , mode = 'markers'
                     , opacity = 0.75
                     , marker = dict(size = 4, color = 'rgb(44,127,184)')
                     , name = "Weekends")

plotWeekday = plotModel[plotModel['weekday'] == 1]
plotWeekdayTrace = go.Scatter(x = plotWeekday.air_temp
                     , y = plotWeekday.totaldemand_mean
                     , mode = 'markers'
                     , opacity = 0.75
                     , marker = dict(size = 4, color = 'rgb(197,27,138)')
                     , name = "Weekdays")

plotData = [plotWeekdayTrace, plotWeekendTrace]

plotLayout = go.Layout(title="Relationship between air temperature at " + stationName + " vs " + sqlRegion + " energy demand"
                       , xaxis = dict(title='Air temperature (deg C)')
                       , yaxis = dict(title='Energy Demand (MW)'))

plotFig = go.Figure(data = plotData, layout = plotLayout)
po.iplot(plotFig)


Overall the U-shaped relationship between air temperature and energy demand. The first has divided the dataset by weekdays and weekends. As expected, the trend remains the same for weekends but the demand is generally lower.  

Let's break down the data further by time segments. The hours of the day have been binned into 6 equal 4 hour groups. This allows us to see how temperature and demand varies by time of day

In [21]:
# Prepare the data for the plot
atMax = plotModel.air_temp.max()
atMin = plotModel.air_temp.min()

# Create a series of traces for each combination of weekend (end or day) and hr_bin (1 to 6)
plotBin_1 = plotModel[plotModel['hr_bin'] == 1]
plotBin_2 = plotModel[plotModel['hr_bin'] == 2]
plotBin_3 = plotModel[plotModel['hr_bin'] == 3]
plotBin_4 = plotModel[plotModel['hr_bin'] == 4]
plotBin_5 = plotModel[plotModel['hr_bin'] == 5]
plotBin_6 = plotModel[plotModel['hr_bin'] == 6]


In [22]:
# Create the plot
# Set a marker size variable
globSize = 2
globColSc = 'Bluered'

plotBin_1Trace = go.Scatter(x = plotBin_1.air_temp
                            , y = plotBin_1.totaldemand_mean
                            , mode = 'markers'
                            , marker = dict(size = globSize, color = plotBin_1.air_temp, cmin = atMin, cmax = atMax
                                            , colorscale = globColSc)
                            , name = hrLabel[0])
plotBin_2Trace = go.Scatter(x = plotBin_2.air_temp
                            , y = plotBin_2.totaldemand_mean
                            , mode = 'markers'
                            , marker = dict(size = globSize, color = plotBin_2.air_temp, cmin = atMin, cmax = atMax
                                           , colorscale = globColSc)
                            , name = hrLabel[1])
plotBin_3Trace = go.Scatter(x = plotBin_3.air_temp
                            , y = plotBin_3.totaldemand_mean
                            , mode = 'markers'
                            , marker = dict(size = globSize, color = plotBin_3.air_temp, cmin = atMin, cmax = atMax
                                           , colorscale = globColSc)
                            , name = hrLabel[2])
plotBin_4Trace = go.Scatter(x = plotBin_4.air_temp
                            , y = plotBin_4.totaldemand_mean
                            , mode = 'markers'
                            , marker = dict(size = globSize, color = plotBin_4.air_temp, cmin = atMin, cmax = atMax
                                           , colorscale = globColSc)
                            , name = hrLabel[3])
plotBin_5Trace = go.Scatter(x = plotBin_5.air_temp
                            , y = plotBin_5.totaldemand_mean
                            , mode = 'markers'
                            , marker = dict(size = globSize, color = plotBin_5.air_temp, cmin = atMin, cmax = atMax
                                           , colorscale = globColSc)
                            , name = hrLabel[4])
plotBin_6Trace = go.Scatter(x = plotBin_6.air_temp
                            , y = plotBin_6.totaldemand_mean
                            , mode = 'markers'
                            , marker = dict(size = globSize, color = plotBin_6.air_temp, cmin = atMin, cmax = atMax
                                           , colorscale = globColSc)
                            , name = hrLabel[5])

# Define a figure for the plots
fig = tools.make_subplots(rows = 2, cols = 3, shared_yaxes=True, shared_xaxes=True, subplot_titles=hrLabel)

fig.append_trace(plotBin_1Trace, 1, 1)
fig.append_trace(plotBin_2Trace, 1, 2)
fig.append_trace(plotBin_3Trace, 1, 3)
fig.append_trace(plotBin_4Trace, 2, 1)
fig.append_trace(plotBin_5Trace, 2, 2)
fig.append_trace(plotBin_6Trace, 2, 3)

fig['layout'].update(title = "Relationship between air temperature at " + stationName + " vs " + sqlRegion + " energy demand")
fig['layout']['xaxis2'].update(title='Air temp (deg C)')
fig['layout']['yaxis2'].update(title='Demand (MW)')

po.iplot(fig)

This is the format of your plot grid:
[ (1,1) x1,y1 ]  [ (1,2) x2,y1 ]  [ (1,3) x3,y1 ]
[ (2,1) x1,y2 ]  [ (2,2) x2,y2 ]  [ (2,3) x3,y2 ]



What I find interesting about these plots is that when the weather gets hot (greater than 35 degrees), the range of energy demand is quite narrow (10,000 to 12,000 MW) whereas during extremely cold weather (less than 10 degrees), the variation in demand is wider. Here the range can vary from roughly 6,000 to 10,000 MW). 

#### Output the results as a HTML file

In [3]:
os.system('jupyter nbconvert --to html ExploratoryAnalysis.ipynb')

0