# Pre-processing of Data Sets

In [1]:
import pandas as pd
from collections import namedtuple
import numpy as np
import time
import pickle
from importlib import reload

import sys
sys.path.insert(0, '../')
import bench_util

%load_ext line_profiler

In [65]:
# Re-import bench_util so that edits made to the module while this kernel is
# running are picked up without a restart.
reload(bench_util)
# Build the shared Util helper from the utility-bill CSV export and the
# building-information spreadsheet.  Later cells use it for Btu conversion
# (fuel_btus_per_unit) and degree-day lookups (dd, degree_days_yearly).
# NOTE(review): the CSV path is relative to the notebook directory while the
# spreadsheet path goes one level up -- confirm both locations are intended.
ut = bench_util.Util('data/ExampleDataExport.csv', '../data/Other_Building_Data.xlsx')


## Utility Data

In [3]:
# Read the raw utility-bill export.  parse_dates converts the billing-period
# start ('From') and end ('Thru') columns into real Pandas dates; the other
# date columns (Bill Date, Due Date, Entry Date) are left as read from the file.
dfu = pd.read_csv('data/ExampleDataExport.csv', parse_dates=['From', 'Thru'])
dfu.head()

Unnamed: 0,Site ID,Site Name,Vendor Code,Vendor Name,Account Number,Bill Date,Due Date,Entry Date,Invoice #,Voucher #,From,Thru,Service Name,Item Description,Meter Number,Usage,Cost,Units,Account Financial Code,Site Financial Code
0,ANSBG1,FNSB-Animal Control,VF314940,Fairbanks Natural Gas,10282 (2408 DAVIS),8/30/2010,8/30/2010,1/24/2011,,,2010-07-29,2010-08-30,Natural Gas,Natural gas (CCF),,2716.0,6222.36,CCF,61837,
1,ANSBG1,FNSB-Animal Control,VF314940,Fairbanks Natural Gas,10282 (2408 DAVIS),8/30/2010,8/30/2010,1/24/2011,,,2010-07-29,2010-08-30,Natural Gas,Regulatory Cost Charge,,,21.53,,61837,
2,ANSBG1,FNSB-Animal Control,VF314940,Fairbanks Natural Gas,10282 (2408 DAVIS),8/30/2010,8/30/2010,1/24/2011,,,2010-07-29,2010-08-30,Natural Gas,Customer Charge,,,17.25,,61837,
3,ANSBG1,FNSB-Animal Control,VF314940,Fairbanks Natural Gas,10282 (2408 DAVIS),9/30/2010,9/30/2010,1/24/2011,,,2010-08-30,2010-09-30,Natural Gas,Natural gas (CCF),,3526.0,8078.07,CCF,61837,
4,ANSBG1,FNSB-Animal Control,VF314940,Fairbanks Natural Gas,10282 (2408 DAVIS),9/30/2010,9/30/2010,1/24/2011,,,2010-08-30,2010-09-30,Natural Gas,Regulatory Cost Charge,,,27.93,,61837,


In [4]:
# Survey the identifying / categorical columns: print the distinct values
# found in each one to see what the export contains.
cols = ['Site ID', 'Vendor Code', 'Vendor Name', 'Account Number', 'Service Name', 'Item Description',
       'Meter Number', 'Units', 'Account Financial Code', 'Site Financial Code']
for col in cols:
    # the column name is padded to 24 characters so the value lists line up
    print('{0:24s}: {1}'.format(col, list(dfu[col].unique())))

Site ID                 : ['ANSBG1']
Vendor Code             : ['VF314940', 'VG372746', 'VG354933']
Vendor Name             : ['Fairbanks Natural Gas', 'Golden Heart Utilities', 'Golden Valley Electric']
Account Number          : ['10282  (2408 DAVIS)', '1311001100', '31850']
Service Name            : ['Natural Gas', 'Water', 'Sewer', 'Electricity']
Item Description        : ['Natural gas (CCF)', 'Regulatory Cost Charge', 'Customer Charge', 'Gas Charge (CCF)', 'Late charge', 'Water Usage (Gallons)', 'Sewer Usage (Gallons)', 'Water Fixed Charge', 'Sewer Fixed Charge', 'Fire Protection', 'Other charges', 'Misc. credit', 'Meter charge', 'Customer Charge - Water', 'Customer Charge - Sewer', 'Plant Replacement ADJ', 'Cost of Energy Adjustmen', 'Energy charge', 'KW Charge', 'Fuel Adjustment', 'On peak demand', 'Fuel cost adjustment', 'Fuel & Purchased Power', 'Utility Charge']
Meter Number            : [nan, 89672.0]
Units                   : ['CCF', nan, 'Gallons', 'kWh', 'kW']
Account Fina

In [5]:
# Keep only the columns needed downstream and give them short snake_case names.
# Each pair below is (column name in the export, new column name).
cols = [
    ('Site ID', 'site_id'),
    ('Vendor Code', 'vendor_code'),
    ('From', 'from_dt'),
    ('Thru', 'thru_dt'),
    ('Service Name', 'service_type'),
    ('Item Description', 'item_desc'),
    ('Usage', 'usage'),
    ('Cost', 'cost'),
    ('Units', 'units'),
]

# Split the pairs into two parallel tuples: old names and new names.
old_cols, new_cols = zip(*cols)

# Select just those columns from the original dataframe, then rename them
# using the old -> new mapping built from the same pairs.
dfu1 = dfu.loc[:, list(old_cols)].rename(columns=dict(cols))
dfu1.head()

Unnamed: 0,site_id,vendor_code,from_dt,thru_dt,service_type,item_desc,usage,cost,units
0,ANSBG1,VF314940,2010-07-29,2010-08-30,Natural Gas,Natural gas (CCF),2716.0,6222.36,CCF
1,ANSBG1,VF314940,2010-07-29,2010-08-30,Natural Gas,Regulatory Cost Charge,,21.53,
2,ANSBG1,VF314940,2010-07-29,2010-08-30,Natural Gas,Customer Charge,,17.25,
3,ANSBG1,VF314940,2010-08-30,2010-09-30,Natural Gas,Natural gas (CCF),3526.0,8078.07,CCF
4,ANSBG1,VF314940,2010-08-30,2010-09-30,Natural Gas,Regulatory Cost Charge,,27.93,


### Collapse Non-Usage Charges into "Other Charge"

In [6]:
# Collapse all the non-usage charges (rows that carry no usage value) into a
# single item_desc: 'Other Charge'.  This cuts the processing time in half
# because far fewer non-consumption rows have to be split across months later.
dfu1.loc[dfu1['usage'].isnull(), 'item_desc'] = 'Other Charge'

# Pandas drops NaN keys in a GroupBy, so replace missing units with a marker.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) through attribute access: that form acts on a temporary
# Series and does not reliably update dfu1 in newer pandas versions.
dfu1['units'] = dfu1['units'].fillna('-')

# Sum usage and cost within each bill / service / item / units combination.
# NOTE(review): the displayed output shows NaN usage for 'Other Charge' rows;
# newer pandas returns 0 for an all-NaN sum unless min_count=1 is passed --
# confirm which result is wanted before relying on the usage column here.
dfu1 = (
    dfu1
    .groupby(['site_id', 'vendor_code', 'from_dt', 'thru_dt', 'service_type', 'item_desc', 'units'])
    .sum()
    .reset_index()
)
dfu1.head(20)

Unnamed: 0,site_id,vendor_code,from_dt,thru_dt,service_type,item_desc,units,usage,cost
0,ANSBG1,VF314940,2010-07-29,2010-08-30,Natural Gas,Natural gas (CCF),CCF,2716.0,6222.36
1,ANSBG1,VF314940,2010-07-29,2010-08-30,Natural Gas,Other Charge,-,,38.78
2,ANSBG1,VF314940,2010-08-30,2010-09-30,Natural Gas,Natural gas (CCF),CCF,3526.0,8078.07
3,ANSBG1,VF314940,2010-08-30,2010-09-30,Natural Gas,Other Charge,-,,45.18
4,ANSBG1,VF314940,2010-09-30,2010-10-29,Natural Gas,Natural gas (CCF),CCF,3921.0,8983.01
5,ANSBG1,VF314940,2010-09-30,2010-10-29,Natural Gas,Other Charge,-,,48.3
6,ANSBG1,VF314940,2010-10-29,2010-11-30,Natural Gas,Natural gas (CCF),CCF,5949.0,13629.16
7,ANSBG1,VF314940,2010-10-29,2010-11-30,Natural Gas,Other Charge,-,,64.33
8,ANSBG1,VF314940,2010-11-30,2010-12-29,Natural Gas,Natural gas (CCF),CCF,5098.0,11679.52
9,ANSBG1,VF314940,2010-11-30,2010-12-29,Natural Gas,Other Charge,-,,57.6


### Split Each Bill into Multiple Pieces, each within one Calendar Month

In [7]:
# Test the split_period function on a billing period that spans several
# calendar months.
bench_util.split_period('2016-01-25', '2016-06-26')
# this takes about 3.5 msec to do, which is pretty long

[PeriodSplit(cal_year=2016, cal_mo=1, bill_frac=0.042483660130718956, days_served=6.5),
 PeriodSplit(cal_year=2016, cal_mo=2, bill_frac=0.18954248366013071, days_served=29.0),
 PeriodSplit(cal_year=2016, cal_mo=3, bill_frac=0.20261437908496732, days_served=31.0),
 PeriodSplit(cal_year=2016, cal_mo=4, bill_frac=0.19607843137254902, days_served=30.0),
 PeriodSplit(cal_year=2016, cal_mo=5, bill_frac=0.20261437908496732, days_served=31.0),
 PeriodSplit(cal_year=2016, cal_mo=6, bill_frac=0.16666666666666666, days_served=25.5)]

In [8]:
# Break every bill row into calendar-month pieces and collect the pieces
# into a new DataFrame.
recs = []
for ix, bill in dfu1.iterrows():
    # a plain dictionary is *much* faster to copy and modify than a Pandas Series
    bill_fields = bill.to_dict()

    # The start and end of the billing period are only needed to compute the
    # split; take them out so they do not appear in the per-month records.
    st = bill_fields.pop('from_dt')
    en = bill_fields.pop('thru_dt')

    for piece in bench_util.split_period(st, en):
        # Copy the bill's fields, tag the piece with its calendar month and
        # days served, and scale usage and cost by the piece's bill fraction.
        recs.append(dict(
            bill_fields,
            cal_year=piece.cal_year,
            cal_mo=piece.cal_mo,
            days_served=piece.days_served,
            usage=bill_fields['usage'] * piece.bill_frac,
            cost=bill_fields['cost'] * piece.bill_frac,
        ))

dfu2 = pd.DataFrame(recs, index=range(len(recs)))
dfu2.head()

Unnamed: 0,cal_mo,cal_year,cost,days_served,item_desc,service_type,site_id,units,usage,vendor_code
0,7,2010,486.121875,2.5,Natural gas (CCF),Natural Gas,ANSBG1,CCF,212.1875,VF314940
1,8,2010,5736.238125,29.5,Natural gas (CCF),Natural Gas,ANSBG1,CCF,2503.8125,VF314940
2,7,2010,3.029688,2.5,Other Charge,Natural Gas,ANSBG1,-,,VF314940
3,8,2010,35.750312,29.5,Other Charge,Natural Gas,ANSBG1,-,,VF314940
4,8,2010,390.874355,1.5,Natural gas (CCF),Natural Gas,ANSBG1,CCF,170.612903,VF314940


### Sum Up the Pieces by Month

In [9]:
# Several bill pieces can fall in the same calendar month; add them together
# so there is one row per site / vendor / service / month / item / units.
dfu3 = (
    dfu2
    .groupby(['site_id', 'vendor_code', 'service_type', 'cal_year', 'cal_mo', 'item_desc', 'units'])
    .sum()
    .reset_index()
)
dfu3.head(10)

Unnamed: 0,site_id,vendor_code,service_type,cal_year,cal_mo,item_desc,units,cost,days_served,usage
0,ANSBG1,VF314940,Natural Gas,2010,7,Natural gas (CCF),CCF,486.121875,2.5,212.1875
1,ANSBG1,VF314940,Natural Gas,2010,7,Other Charge,-,3.029688,2.5,
2,ANSBG1,VF314940,Natural Gas,2010,8,Natural gas (CCF),CCF,6127.11248,31.0,2674.425403
3,ANSBG1,VF314940,Natural Gas,2010,8,Other Charge,-,37.936442,31.0,
4,ANSBG1,VF314940,Natural Gas,2010,9,Natural gas (CCF),CCF,7842.075128,30.0,3422.990545
5,ANSBG1,VF314940,Natural Gas,2010,9,Other Charge,-,43.82663,30.0,
6,ANSBG1,VF314940,Natural Gas,2010,10,Natural gas (CCF),CCF,9892.908642,31.0,4318.162177
7,ANSBG1,VF314940,Natural Gas,2010,10,Other Charge,-,52.493023,31.0,
8,ANSBG1,VF314940,Natural Gas,2010,11,Natural gas (CCF),CCF,12765.752909,30.0,5572.130927
9,ANSBG1,VF314940,Natural Gas,2010,11,Other Charge,-,60.297322,30.0,


In [10]:
# Spot-check the monthly rows for the electric service.
dfu3.loc[dfu3['service_type'] == 'Electricity'].head(10)

Unnamed: 0,site_id,vendor_code,service_type,cal_year,cal_mo,item_desc,units,cost,days_served,usage
165,ANSBG1,VG354933,Electricity,2010,7,Energy charge,kWh,602.3,15.5,7290.0
166,ANSBG1,VG354933,Electricity,2010,7,KW Charge,kW,0.0,15.5,16.45
167,ANSBG1,VG354933,Electricity,2010,7,Other Charge,-,699.36,15.5,
168,ANSBG1,VG354933,Electricity,2010,8,Energy charge,kWh,1212.035,31.0,14670.0
169,ANSBG1,VG354933,Electricity,2010,8,KW Charge,kW,0.0,31.0,30.75
170,ANSBG1,VG354933,Electricity,2010,8,Other Charge,-,1387.745,31.0,
171,ANSBG1,VG354933,Electricity,2010,9,Energy charge,kWh,1179.815,30.0,14280.0
172,ANSBG1,VG354933,Electricity,2010,9,KW Charge,kW,0.0,30.0,27.6
173,ANSBG1,VG354933,Electricity,2010,9,Other Charge,-,1332.65,30.0,
174,ANSBG1,VG354933,Electricity,2010,10,Energy charge,kWh,1300.944839,31.0,15746.129032


### Add Fiscal Year Info and MMBtus

In [11]:
# Add fiscal year and fiscal month columns, derived from each row's
# calendar year and month via bench_util.calendar_to_fiscal.
fiscal_pairs = [
    bench_util.calendar_to_fiscal(cyr, cmo)
    for cyr, cmo in zip(dfu3.cal_year, dfu3.cal_mo)
]
dfu3['fiscal_year'] = [fis_yr for fis_yr, fis_mo in fiscal_pairs]
dfu3['fiscal_mo'] = [fis_mo for fis_yr, fis_mo in fiscal_pairs]
dfu3.head()

Unnamed: 0,site_id,vendor_code,service_type,cal_year,cal_mo,item_desc,units,cost,days_served,usage,fiscal_year,fiscal_mo
0,ANSBG1,VF314940,Natural Gas,2010,7,Natural gas (CCF),CCF,486.121875,2.5,212.1875,2011,1
1,ANSBG1,VF314940,Natural Gas,2010,7,Other Charge,-,3.029688,2.5,,2011,1
2,ANSBG1,VF314940,Natural Gas,2010,8,Natural gas (CCF),CCF,6127.11248,31.0,2674.425403,2011,2
3,ANSBG1,VF314940,Natural Gas,2010,8,Other Charge,-,37.936442,31.0,,2011,2
4,ANSBG1,VF314940,Natural Gas,2010,9,Natural gas (CCF),CCF,7842.075128,30.0,3422.990545,2011,3


In [12]:
# Convert each row's usage into MMBtu: Btus per unit for the row's service
# type and units, times the usage, scaled from Btu to millions of Btu.
mmbtu = [
    ut.fuel_btus_per_unit(service_type, units) * usage / 1e6
    for service_type, units, usage in zip(dfu3.service_type, dfu3.units, dfu3.usage)
]
dfu3['mmbtu'] = mmbtu
dfu3.head(10)

Unnamed: 0,site_id,vendor_code,service_type,cal_year,cal_mo,item_desc,units,cost,days_served,usage,fiscal_year,fiscal_mo,mmbtu
0,ANSBG1,VF314940,Natural Gas,2010,7,Natural gas (CCF),CCF,486.121875,2.5,212.1875,2011,1,21.643125
1,ANSBG1,VF314940,Natural Gas,2010,7,Other Charge,-,3.029688,2.5,,2011,1,
2,ANSBG1,VF314940,Natural Gas,2010,8,Natural gas (CCF),CCF,6127.11248,31.0,2674.425403,2011,2,272.791391
3,ANSBG1,VF314940,Natural Gas,2010,8,Other Charge,-,37.936442,31.0,,2011,2,
4,ANSBG1,VF314940,Natural Gas,2010,9,Natural gas (CCF),CCF,7842.075128,30.0,3422.990545,2011,3,349.145036
5,ANSBG1,VF314940,Natural Gas,2010,9,Other Charge,-,43.82663,30.0,,2011,3,
6,ANSBG1,VF314940,Natural Gas,2010,10,Natural gas (CCF),CCF,9892.908642,31.0,4318.162177,2011,4,440.452542
7,ANSBG1,VF314940,Natural Gas,2010,10,Other Charge,-,52.493023,31.0,,2011,4,
8,ANSBG1,VF314940,Natural Gas,2010,11,Natural gas (CCF),CCF,12765.752909,30.0,5572.130927,2011,5,568.357355
9,ANSBG1,VF314940,Natural Gas,2010,11,Other Charge,-,60.297322,30.0,,2011,5,


In [13]:
# Spot-check the electric rows again, now that the fiscal and mmbtu columns exist.
dfu3.loc[dfu3['service_type'] == 'Electricity'].head(10)

Unnamed: 0,site_id,vendor_code,service_type,cal_year,cal_mo,item_desc,units,cost,days_served,usage,fiscal_year,fiscal_mo,mmbtu
165,ANSBG1,VG354933,Electricity,2010,7,Energy charge,kWh,602.3,15.5,7290.0,2011,1,24.87348
166,ANSBG1,VG354933,Electricity,2010,7,KW Charge,kW,0.0,15.5,16.45,2011,1,
167,ANSBG1,VG354933,Electricity,2010,7,Other Charge,-,699.36,15.5,,2011,1,
168,ANSBG1,VG354933,Electricity,2010,8,Energy charge,kWh,1212.035,31.0,14670.0,2011,2,50.05404
169,ANSBG1,VG354933,Electricity,2010,8,KW Charge,kW,0.0,31.0,30.75,2011,2,
170,ANSBG1,VG354933,Electricity,2010,8,Other Charge,-,1387.745,31.0,,2011,2,
171,ANSBG1,VG354933,Electricity,2010,9,Energy charge,kWh,1179.815,30.0,14280.0,2011,3,48.72336
172,ANSBG1,VG354933,Electricity,2010,9,KW Charge,kW,0.0,30.0,27.6,2011,3,
173,ANSBG1,VG354933,Electricity,2010,9,Other Charge,-,1332.65,30.0,,2011,3,
174,ANSBG1,VG354933,Electricity,2010,10,Energy charge,kWh,1300.944839,31.0,15746.129032,2011,4,53.725792


In [14]:
# Save the processed monthly DataFrame in two formats (CSV and pickle);
# both files are written to the current working directory.
dfu3.to_csv('dfu3.csv')
dfu3.to_pickle('dfu3.pkl')

## Other Building Info

In [15]:
# Read the 'Building' sheet of the building-data workbook, skipping the
# first 3 rows of the sheet and indexing the result by site_ID.
# The keyword is spelled sheet_name: the older 'sheetname' spelling was
# deprecated and then removed from pandas, where it raises a TypeError.
df_bldg = pd.read_excel('../data/Other_Building_Data.xlsx', sheet_name='Building', skiprows=3, index_col='site_ID')
df_bldg

Unnamed: 0_level_0,site_name,address,city,primary_func,year_built,sq_ft,onsite_gen,dd_site
site_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ANSBG1,Animal Control,2408 Davis Road,Fairbanks,Animal Shelter,1993,14000,,PAFA
BIGDIP,Big Dipper Ice Arena,1920 Lathrop Street,Fairbanks,Ice Arena,1990,23000,,PAFA


In [16]:
# Build a lookup of building information keyed by site ID.  Each value is a
# namedtuple whose fields are the columns of the building DataFrame.
BldgInfo = namedtuple('BldgInfo', list(df_bldg.columns))
bldg_info = {
    bldg_id: BldgInfo(**bldg_row.to_dict())
    for bldg_id, bldg_row in df_bldg.iterrows()
}
bldg_info

{'ANSBG1': BldgInfo(site_name='Animal Control', address='2408 Davis Road', city='Fairbanks', primary_func='Animal Shelter', year_built=1993, sq_ft=14000, onsite_gen=nan, dd_site='PAFA'),
 'BIGDIP': BldgInfo(site_name='Big Dipper Ice Arena', address='1920 Lathrop Street', city='Fairbanks', primary_func='Ice Arena', year_built=1990, sq_ft=23000, onsite_gen=nan, dd_site='PAFA')}

## Work on Other Utility Functions

### Find All Fiscal Years and Months Present in a DataFrame

In [19]:
# List the (year, month) pairs present in dfu3.  No column names are passed;
# judging by the output, the defaults are the fiscal year / fiscal month columns.
bench_util.months_present(dfu3)

[(2011, 1),
 (2011, 2),
 (2011, 3),
 (2011, 4),
 (2011, 5),
 (2011, 6),
 (2011, 7),
 (2011, 8),
 (2011, 9),
 (2011, 10),
 (2011, 11),
 (2011, 12),
 (2012, 1),
 (2012, 2),
 (2012, 3),
 (2012, 4),
 (2012, 5),
 (2012, 6),
 (2012, 7),
 (2012, 8),
 (2012, 9),
 (2012, 10),
 (2012, 11),
 (2012, 12),
 (2013, 1),
 (2013, 2),
 (2013, 3),
 (2013, 4),
 (2013, 5),
 (2013, 6),
 (2013, 7),
 (2013, 8),
 (2013, 9),
 (2013, 10),
 (2013, 11),
 (2013, 12),
 (2014, 1),
 (2014, 2),
 (2014, 3),
 (2014, 4),
 (2014, 5),
 (2014, 6),
 (2014, 7),
 (2014, 8),
 (2014, 9),
 (2014, 10),
 (2014, 11),
 (2014, 12),
 (2015, 1),
 (2015, 2),
 (2015, 3),
 (2015, 4),
 (2015, 5),
 (2015, 6),
 (2015, 7),
 (2015, 8),
 (2015, 9),
 (2015, 10),
 (2015, 11),
 (2015, 12),
 (2016, 1),
 (2016, 2),
 (2016, 3),
 (2016, 4),
 (2016, 5),
 (2016, 6),
 (2016, 7),
 (2016, 8),
 (2016, 9),
 (2016, 10),
 (2016, 11),
 (2016, 12),
 (2017, 1),
 (2017, 2),
 (2017, 3),
 (2017, 4),
 (2017, 5),
 (2017, 6),
 (2017, 7),
 (2017, 8),
 (2017, 9)]

In [21]:
# Same call, but pointed at the calendar year / month columns; show only the
# last five (year, month) pairs.
bench_util.months_present(dfu3, 'cal_year', 'cal_mo')[-5:]

[(2016, 11), (2016, 12), (2017, 1), (2017, 2), (2017, 3)]

### Degree Day Summaries

In [42]:
# Build a DataFrame of degree days, one row per (fiscal year, fiscal month)
# present in the utility data, for a single site.
mo_present = bench_util.months_present(dfu3)
site_id = 'ANSBG1'

# The degree-day site for the building is the same for every month, so look
# it up once.  ut.dd is keyed by (year, month, degree-day site).
dd_site = ut.bldg_info[site_id].dd_site

# Months with no degree-day entry get NaN.  np.nan is used because the
# np.NaN alias was removed in NumPy 2.0.
recs = [
    {'fiscal_year': yr,
     'fiscal_mo': mo,
     'dd': ut.dd.get((yr, mo, dd_site), np.nan)
    }
    for yr, mo in mo_present
]
dfdd = pd.DataFrame(data=recs)
dfdd.tail(10)

Unnamed: 0,dd,fiscal_mo,fiscal_year
71,197.0,12,2016
72,134.0,1,2017
73,,2,2017
74,568.0,3,2017
75,1232.0,4,2017
76,1825.0,5,2017
77,2245.0,6,2017
78,2302.0,7,2017
79,1797.0,8,2017
80,2084.0,9,2017


In [30]:
# Attach degree days to the utility rows: a left join on fiscal year and
# fiscal month keeps every row of dfu3 and adds the matching 'dd' value.
df_with_dd = dfu3.merge(dfdd, how='left', on=['fiscal_year', 'fiscal_mo'])
df_with_dd.head()

Unnamed: 0,site_id,vendor_code,service_type,cal_year,cal_mo,item_desc,units,cost,days_served,usage,fiscal_year,fiscal_mo,mmbtu,dd
0,ANSBG1,VF314940,Natural Gas,2010,7,Natural gas (CCF),CCF,486.121875,2.5,212.1875,2011,1,21.643125,134
1,ANSBG1,VF314940,Natural Gas,2010,7,Other Charge,-,3.029688,2.5,,2011,1,,134
2,ANSBG1,VF314940,Natural Gas,2010,8,Natural gas (CCF),CCF,6127.11248,31.0,2674.425403,2011,2,272.791391,169
3,ANSBG1,VF314940,Natural Gas,2010,8,Other Charge,-,37.936442,31.0,,2011,2,,169
4,ANSBG1,VF314940,Natural Gas,2010,9,Natural gas (CCF),CCF,7842.075128,30.0,3422.990545,2011,3,349.145036,568


In [62]:
# Total degree days per fiscal year.  The sum is taken with np.sum over the
# raw values, so a year containing a missing (NaN) month comes out as NaN
# instead of a partial total -- see fiscal year 2017 in the output.
dfdd.groupby('fiscal_year').agg({'dd': lambda x: np.sum(x.values)})

Unnamed: 0_level_0,dd
fiscal_year,Unnamed: 1_level_1
2011,13802.0
2012,12393.0
2013,12171.0
2014,13802.0
2015,12393.0
2016,12171.0
2017,


In [66]:
# Yearly degree-day totals for the same months and site, computed by the Util
# method.  NOTE(review): the output has a value for 2017 even though one of
# its months has no degree-day entry -- presumably the method fills the
# missing month somehow; confirm against bench_util.
ut.degree_days_yearly(mo_present, 'ANSBG1')

fiscal_year
2011    13802
2012    12393
2013    12171
2014    13802
2015    12393
2016    12171
2017    12356
Name: dd, dtype: int64