# Aggregate Usage
## Description
The Pecan Street usage data is sampled at high frequency. At the utility scale, we are still working through the challenges of collecting and maintaining a dataset of this quality; in practice, we are lucky to have even daily data for some customers. Another challenge of high-frequency sampling is the sheer volume of data and the memory and processing requirements that come with it. To address both issues, we will aggregate the Pecan Street data into three separate tables: hourly, daily, and monthly.

## Imports

In [1]:
import json
import pandas as pd
from sqlalchemy import create_engine

## Database connection

In [2]:
# Read the connection settings from a JSON file located one directory above
# the working directory.
credentials_file_path = '../credentials.json'
with open(credentials_file_path) as credentials_file:
    credentials = json.load(credentials_file)

# Build the SQLAlchemy URL from the user, host and database name found in the
# credentials (no password is placed in the URL), then open one connection
# that the remaining cells share.
connection_url = 'mysql+mysqldb://{user}@{host}/{db}'.format(
    user=credentials['user'],
    host=credentials['host'],
    db=credentials['db'],
)
engine = create_engine(connection_url)
conn = engine.connect()

## Remove tables if they exist
Drop any existing aggregate tables so that re-running this notebook rebuilds them from scratch.

In [3]:
# Remove the three aggregate tables (when present) so the cells below can
# recreate them; the statements run in the same order as listed here.
for drop_statement in (
    'DROP TABLE IF EXISTS usage_hourly',
    'DROP TABLE IF EXISTS usage_daily',
    'DROP TABLE IF EXISTS usage_monthly',
):
    conn.execute(drop_statement)

<sqlalchemy.engine.result.ResultProxy at 0x18125423438>

## Aggregate to hourly

In [4]:
# Collapse the rows of `onemin` into one row per (dataid, year, month, day, hour).
#   * dt       - earliest `localminute` value found in the group
#   * cooling / furnace / heaters / heating / solar - sums of the listed
#     circuit columns; a column whose SUM is NULL contributes 0
#   * usage    - sum of the `grid` column (0 when the SUM is NULL)
hourly_query = '''
    SELECT 
        dataid,
        MIN(localminute) AS dt,
        year,
        month,
        day,
        hour,
        (COALESCE(SUM(air1),0) + COALESCE(SUM(air2),0) + COALESCE(SUM(air3),0) + COALESCE(SUM(airwindowunit1),0)) AS cooling,
        (COALESCE(SUM(furnace1),0) + COALESCE(SUM(furnace2),0)) AS furnace,
        (COALESCE(SUM(heater1),0) + COALESCE(SUM(heater2),0) + COALESCE(SUM(heater3),0)) AS heaters,
        (COALESCE(SUM(furnace1),0) + COALESCE(SUM(furnace2),0) + COALESCE(SUM(heater1),0) + COALESCE(SUM(heater2),0) + COALESCE(SUM(heater3),0)) AS heating,
        (COALESCE(SUM(solar),0) + COALESCE(SUM(solar2),0)) AS solar,
        COALESCE(SUM(grid),0) AS `usage`
    FROM onemin
    GROUP BY dataid, year, month, day, hour
'''
hourly_usage = pd.read_sql(hourly_query, conn)

hourly_usage

Unnamed: 0,dataid,dt,year,month,day,hour,cooling,furnace,heaters,heating,solar,usage
0,661,2018-01-01 00:00:00-06,2018,1,1,0,0.000,13.964,0.0,13.964,0.000,0.000
1,661,2018-01-01 01:00:00-06,2018,1,1,1,0.000,14.625,0.0,14.625,-0.130,92.778
2,661,2018-01-01 02:00:00-06,2018,1,1,2,0.000,19.866,0.0,19.866,-0.352,66.611
3,661,2018-01-01 03:00:00-06,2018,1,1,3,0.000,19.164,0.0,19.164,-0.267,58.050
4,661,2018-01-01 04:00:00-06,2018,1,1,4,0.000,20.028,0.0,20.028,-0.300,46.274
...,...,...,...,...,...,...,...,...,...,...,...,...
204932,9922,2018-12-31 19:00:00-06,2018,12,31,19,-0.152,10.372,0.0,10.372,0.000,159.199
204933,9922,2018-12-31 20:00:00-06,2018,12,31,20,-0.162,14.341,0.0,14.341,0.000,160.425
204934,9922,2018-12-31 21:00:00-06,2018,12,31,21,-0.281,14.364,0.0,14.364,0.000,258.051
204935,9922,2018-12-31 22:00:00-06,2018,12,31,22,-0.206,5.198,0.0,5.198,0.000,122.158


In [5]:
# Persist the hourly aggregate as table `usage_hourly`, without the DataFrame index.
hourly_usage.to_sql(name='usage_hourly', con=conn, index=False)

## Aggregate to daily

In [6]:
# Roll `usage_hourly` up to one row per (dataid, year, month, day): every
# measure column is summed and dt is the earliest dt present in the group.
daily_query = '''
    SELECT 
        dataid,
        MIN(dt) AS dt,
        year,
        month,
        day,
        SUM(cooling) AS cooling,
        SUM(furnace) AS furnace,
        SUM(heaters) AS heaters,
        SUM(heating) AS heating,
        SUM(solar) AS solar,
        SUM(`usage`) AS `usage`
    FROM usage_hourly
    GROUP BY dataid, year, month, day
'''
daily_usage = pd.read_sql(daily_query, conn)

daily_usage

Unnamed: 0,dataid,dt,year,month,day,cooling,furnace,heaters,heating,solar,usage
0,661,2018-01-01 00:00:00-06,2018,1,1,0.000,346.957,0.0,346.957,651.007,1630.086
1,661,2018-01-02 00:00:00-06,2018,1,2,0.000,337.424,0.0,337.424,275.095,1006.282
2,661,2018-01-03 00:00:00-06,2018,1,3,-0.089,266.800,0.0,266.800,1865.626,-415.662
3,661,2018-01-04 00:00:00-06,2018,1,4,-0.055,221.171,0.0,221.171,1491.558,-254.565
4,661,2018-01-05 00:00:00-06,2018,1,5,-0.079,169.175,0.0,169.175,1581.860,-324.504
...,...,...,...,...,...,...,...,...,...,...,...
8665,9922,2018-12-27 00:00:00-06,2018,12,27,38.109,247.978,0.0,247.978,0.000,2814.867
8666,9922,2018-12-28 00:00:00-06,2018,12,28,-4.615,432.708,0.0,432.708,0.000,2825.281
8667,9922,2018-12-29 00:00:00-06,2018,12,29,-4.765,503.116,0.0,503.116,0.000,2992.122
8668,9922,2018-12-30 00:00:00-06,2018,12,30,-4.570,443.991,0.0,443.991,0.000,2975.587


In [7]:
# Persist the daily aggregate as table `usage_daily`, without the DataFrame index.
daily_usage.to_sql(name='usage_daily', con=conn, index=False)

## Aggregate to monthly

In [8]:
# Roll `usage_daily` up to one row per (dataid, year, month): every measure
# column is summed. dt is the earliest dt present in the group, so it is not
# necessarily the first instant of the month when earlier rows are absent.
monthly_query = '''
    SELECT 
        dataid,
        MIN(dt) AS dt,
        year,
        month,
        SUM(cooling) AS cooling,
        SUM(furnace) AS furnace,
        SUM(heaters) AS heaters,
        SUM(heating) AS heating,
        SUM(solar) AS solar,
        SUM(`usage`) AS `usage`
    FROM usage_daily
    GROUP BY dataid, year, month
'''
monthly_usage = pd.read_sql(monthly_query, conn)

monthly_usage

Unnamed: 0,dataid,dt,year,month,cooling,furnace,heaters,heating,solar,usage
0,661,2018-01-01 00:00:00-06,2018,1,-1.848,5986.242,0.0,5986.242,40096.847,5295.361
1,661,2018-02-01 00:00:00-06,2018,2,136.424,4276.532,0.0,4276.532,19323.532,18581.356
2,661,2018-03-01 04:42:00-06,2018,3,1210.204,1835.335,0.0,1835.335,29010.627,-2113.905
3,661,2018-04-01 01:13:00-05,2018,4,3844.624,2905.674,0.0,2905.674,42229.823,-5099.918
4,661,2018-05-01 00:00:00-05,2018,5,28945.364,8306.782,0.0,8306.782,47116.600,28430.315
...,...,...,...,...,...,...,...,...,...,...
293,9922,2018-08-01 00:00:00-05,2018,8,42220.657,12341.621,0.0,12341.621,0.000,89525.003
294,9922,2018-09-01 00:00:00-05,2018,9,31251.492,14099.971,0.0,14099.971,0.000,113023.786
295,9922,2018-10-01 00:00:00-05,2018,10,15797.324,10865.428,0.0,10865.428,0.000,103272.510
296,9922,2018-11-04 14:15:00-06,2018,11,1946.987,9219.793,0.0,9219.793,0.000,75817.096


In [9]:
# Persist the monthly aggregate as table `usage_monthly`, without the DataFrame index.
monthly_usage.to_sql(name='usage_monthly', con=conn, index=False)