# Pre-processing of Data Sets

In [155]:
import pandas as pd
from collections import namedtuple
import numpy as np
import time

### Utility Data

In [82]:
# Load the utility bill export.  The billing-period boundary columns
# ('From' and 'Thru') are parsed into real Pandas dates while reading.
dfu = pd.read_csv(
    'data/ExampleDataExport.csv',
    parse_dates=['From', 'Thru'],
)
dfu.head()

Unnamed: 0,Site ID,Site Name,Vendor Code,Vendor Name,Account Number,Bill Date,Due Date,Entry Date,Invoice #,Voucher #,From,Thru,Service Name,Item Description,Meter Number,Usage,Cost,Units,Account Financial Code,Site Financial Code
0,ANSBG1,FNSB-Animal Control,VF314940,Fairbanks Natural Gas,10282 (2408 DAVIS),8/30/2010,8/30/2010,1/24/2011,,,2010-07-29,2010-08-30,Natural Gas,Natural gas (CCF),,2716.0,6222.36,CCF,61837,
1,ANSBG1,FNSB-Animal Control,VF314940,Fairbanks Natural Gas,10282 (2408 DAVIS),8/30/2010,8/30/2010,1/24/2011,,,2010-07-29,2010-08-30,Natural Gas,Regulatory Cost Charge,,,21.53,,61837,
2,ANSBG1,FNSB-Animal Control,VF314940,Fairbanks Natural Gas,10282 (2408 DAVIS),8/30/2010,8/30/2010,1/24/2011,,,2010-07-29,2010-08-30,Natural Gas,Customer Charge,,,17.25,,61837,
3,ANSBG1,FNSB-Animal Control,VF314940,Fairbanks Natural Gas,10282 (2408 DAVIS),9/30/2010,9/30/2010,1/24/2011,,,2010-08-30,2010-09-30,Natural Gas,Natural gas (CCF),,3526.0,8078.07,CCF,61837,
4,ANSBG1,FNSB-Animal Control,VF314940,Fairbanks Natural Gas,10282 (2408 DAVIS),9/30/2010,9/30/2010,1/24/2011,,,2010-08-30,2010-09-30,Natural Gas,Regulatory Cost Charge,,,27.93,,61837,


In [83]:
# List the distinct values that occur in each descriptive column of the export
cols = [
    'Site ID',
    'Vendor Code',
    'Vendor Name',
    'Account Number',
    'Service Name',
    'Item Description',
    'Meter Number',
    'Units',
    'Account Financial Code',
    'Site Financial Code',
]
for col in cols:
    print('{0:24s}: {1}'.format(col, list(dfu[col].unique())))

Site ID                 : ['ANSBG1']
Vendor Code             : ['VF314940', 'VG372746', 'VG354933']
Vendor Name             : ['Fairbanks Natural Gas', 'Golden Heart Utilities', 'Golden Valley Electric']
Account Number          : ['10282  (2408 DAVIS)', '1311001100', '31850']
Service Name            : ['Natural Gas', 'Water', 'Sewer', 'Electricity']
Item Description        : ['Natural gas (CCF)', 'Regulatory Cost Charge', 'Customer Charge', 'Gas Charge (CCF)', 'Late charge', 'Water Usage (Gallons)', 'Sewer Usage (Gallons)', 'Water Fixed Charge', 'Sewer Fixed Charge', 'Fire Protection', 'Other charges', 'Misc. credit', 'Meter charge', 'Customer Charge - Water', 'Customer Charge - Sewer', 'Plant Replacement ADJ', 'Cost of Energy Adjustmen', 'Energy charge', 'KW Charge', 'Fuel Adjustment', 'On peak demand', 'Fuel cost adjustment', 'Fuel & Purchased Power', 'Utility Charge']
Meter Number            : [nan, 89672.0]
Units                   : ['CCF', nan, 'Gallons', 'kWh', 'kW']
Account Fina

In [86]:
# Keep only the columns needed downstream and give them short names.
# Each pair is (column name in the export, new column name).
cols = [
    ('Site ID', 'site_id'),
    ('Vendor Code', 'vendor_code'),
    ('From', 'from_dt'),
    ('Thru', 'thru_dt'),
    ('Service Name', 'service_type'),
    ('Item Description', 'item_desc'),
    ('Usage', 'usage'),
    ('Cost', 'cost'),
    ('Units', 'units'),
]

# Separate the pairs into the old names and the new names
old_cols = tuple(old for old, _ in cols)
new_cols = tuple(new for _, new in cols)

# Select just those columns from the original dataframe, then rename them
dfu1 = dfu[list(old_cols)].rename(columns=dict(cols))
dfu1.head()

Unnamed: 0,site_id,vendor_code,from_dt,thru_dt,service_type,item_desc,usage,cost,units
0,ANSBG1,VF314940,2010-07-29,2010-08-30,Natural Gas,Natural gas (CCF),2716.0,6222.36,CCF
1,ANSBG1,VF314940,2010-07-29,2010-08-30,Natural Gas,Regulatory Cost Charge,,21.53,
2,ANSBG1,VF314940,2010-07-29,2010-08-30,Natural Gas,Customer Charge,,17.25,
3,ANSBG1,VF314940,2010-08-30,2010-09-30,Natural Gas,Natural gas (CCF),3526.0,8078.07,CCF
4,ANSBG1,VF314940,2010-08-30,2010-09-30,Natural Gas,Regulatory Cost Charge,,27.93,


In [178]:
# Collapse all the non-usage charges into one item_desc: 'Other Charge'.
# This cuts the processing time in half because far fewer non-consumption
# line items have to be split into calendar-month pieces afterwards.
dfu1.loc[dfu1['usage'].isnull(), 'item_desc'] = 'Other Charge'

# Pandas can't do a GroupBy on NaNs, so replace missing units with a
# placeholder.  The filled column is assigned back explicitly rather than
# calling fillna(inplace=True) on `dfu1.units`: an in-place fill through an
# attribute-accessed column is a chained assignment, which newer pandas
# versions warn about and which has no effect under Copy-on-Write.
dfu1['units'] = dfu1['units'].fillna('-')

# Sum usage and cost for every distinct bill period / service / item / units
key_cols = ['site_id', 'vendor_code', 'from_dt', 'thru_dt', 'service_type', 'item_desc', 'units']
dfu1 = dfu1.groupby(key_cols).sum().reset_index()
dfu1.head(20)

Unnamed: 0,site_id,vendor_code,from_dt,thru_dt,service_type,item_desc,units,usage,cost
0,ANSBG1,VF314940,2010-07-29,2010-08-30,Natural Gas,Natural gas (CCF),CCF,2716.0,6222.36
1,ANSBG1,VF314940,2010-07-29,2010-08-30,Natural Gas,Other Charge,-,,38.78
2,ANSBG1,VF314940,2010-08-30,2010-09-30,Natural Gas,Natural gas (CCF),CCF,3526.0,8078.07
3,ANSBG1,VF314940,2010-08-30,2010-09-30,Natural Gas,Other Charge,-,,45.18
4,ANSBG1,VF314940,2010-09-30,2010-10-29,Natural Gas,Natural gas (CCF),CCF,3921.0,8983.01
5,ANSBG1,VF314940,2010-09-30,2010-10-29,Natural Gas,Other Charge,-,,48.3
6,ANSBG1,VF314940,2010-10-29,2010-11-30,Natural Gas,Natural gas (CCF),CCF,5949.0,13629.16
7,ANSBG1,VF314940,2010-10-29,2010-11-30,Natural Gas,Other Charge,-,,64.33
8,ANSBG1,VF314940,2010-11-30,2010-12-29,Natural Gas,Natural gas (CCF),CCF,5098.0,11679.52
9,ANSBG1,VF314940,2010-11-30,2010-12-29,Natural Gas,Other Charge,-,,57.6


In [128]:
PeriodSplit = namedtuple('PeriodSplit', 'cal_year cal_mo bill_frac days_served')


def split_period(start_date, end_date):
    """Splits a range of service dates from a utility bill into pieces that
    fit within calendar months.

    For the first and last day in the date range, it is assumed that only half
    the day is served (this is typically the meter reading day).  When the
    range is a single day, that one day counts as half a day.

    Parameters
    ----------
    start_date, end_date : anything accepted by pandas.date_range
        First and last day of the billing period, both inclusive.

    Returns
    -------
    list of PeriodSplit
        One entry per calendar month touched by the range, in chronological
        order.  `days_served` is the number of days of the range falling in
        that month and `bill_frac` is that number divided by the total days
        served, so the fractions add up to 1.
    """
    # make a daily series.  The value is the fraction of the day served,
    # 1.0 for days except the first and last.
    ser = pd.Series(data=1.0, index=pd.date_range(start_date, end_date))

    # the half days at the beginning and end
    ser.iloc[0] = 0.5
    ser.iloc[-1] = 0.5

    tot_days = ser.sum()    # total days served in the bill

    # Break into monthly pieces and add up the days served.  Grouping on
    # (year, month) avoids the month-end resample alias, whose spelling
    # differs between pandas versions ('M' vs. 'ME').  Group keys come back
    # sorted, so the pieces stay in chronological order.
    pieces = ser.groupby([ser.index.year, ser.index.month]).sum()

    # Convert to plain Python numbers so the namedtuples print cleanly
    return [
        PeriodSplit(
            cal_year=int(yr),
            cal_mo=int(mo),
            bill_frac=float(days / tot_days),
            days_served=float(days),
        )
        for (yr, mo), days in pieces.items()
    ]

In [180]:
split_period('2016-01-25', '2016-06-26')
# this takes about 3.5 msec to do, which is pretty long

[PeriodSplit(cal_year=2016, cal_mo=1, bill_frac=0.042483660130718956, days_served=6.5),
 PeriodSplit(cal_year=2016, cal_mo=2, bill_frac=0.18954248366013071, days_served=29.0),
 PeriodSplit(cal_year=2016, cal_mo=3, bill_frac=0.20261437908496732, days_served=31.0),
 PeriodSplit(cal_year=2016, cal_mo=4, bill_frac=0.19607843137254902, days_served=30.0),
 PeriodSplit(cal_year=2016, cal_mo=5, bill_frac=0.20261437908496732, days_served=31.0),
 PeriodSplit(cal_year=2016, cal_mo=6, bill_frac=0.16666666666666666, days_served=25.5)]

In [181]:
# Split all the rows into calendar month pieces and make a new DataFrame

start = time.time()

# Columns copied unchanged from a bill row onto each of its monthly pieces
carry_cols = [c for c in dfu1.columns if c not in ('from_dt', 'thru_dt')]

# Build plain dict records (one per monthly piece) rather than copying and
# mutating a Series for every piece.
recs = []
for bill in dfu1.to_dict('records'):
    for piece in split_period(bill['from_dt'], bill['thru_dt']):
        new_row = {c: bill[c] for c in carry_cols}
        new_row['cal_year'] = piece.cal_year
        new_row['cal_mo'] = piece.cal_mo

        # The fiscal year starts in July and is labelled with the calendar
        # year in which it ends: Jan-Jun are fiscal months 7-12 of the same
        # year, Jul-Dec are fiscal months 1-6 of the following year.
        if piece.cal_mo <= 6:
            new_row['fiscal_year'] = piece.cal_year
            new_row['fiscal_mo'] = piece.cal_mo + 6
        else:
            new_row['fiscal_year'] = piece.cal_year + 1
            new_row['fiscal_mo'] = piece.cal_mo - 6

        new_row['days_served'] = piece.days_served

        # Pro-rate usage and cost by this piece's share of the billing period
        new_row['usage'] *= piece.bill_frac
        new_row['cost'] *= piece.bill_frac

        recs.append(new_row)

# Spell out the column order so it does not depend on dict ordering
dfu2 = pd.DataFrame(
    recs,
    columns=carry_cols + ['cal_year', 'cal_mo', 'fiscal_year', 'fiscal_mo', 'days_served'],
)

# '{:.2f}' prints two decimals; the previous '{:2f}' only set a minimum width
print('{:.2f} seconds'.format(time.time() - start))

dfu2.to_csv('dfu2.csv')
dfu2.head()

6.920082 seconds


Unnamed: 0,site_id,vendor_code,service_type,item_desc,units,usage,cost,cal_year,cal_mo,fiscal_year,fiscal_mo,days_served
0,ANSBG1,VF314940,Natural Gas,Natural gas (CCF),CCF,212.1875,486.121875,2010,7,2011,1,2.5
1,ANSBG1,VF314940,Natural Gas,Natural gas (CCF),CCF,2503.8125,5736.238125,2010,8,2011,2,29.5
2,ANSBG1,VF314940,Natural Gas,Other Charge,-,,3.029688,2010,7,2011,1,2.5
3,ANSBG1,VF314940,Natural Gas,Other Charge,-,,35.750312,2010,8,2011,2,29.5
4,ANSBG1,VF314940,Natural Gas,Natural gas (CCF),CCF,170.612903,390.874355,2010,8,2011,2,1.5


In [90]:
# Show the last 20 kWh rows, ordered by vendor and billing-period start.
# Filter first and sort second, so the boolean mask is applied to the same
# frame it was computed from; masking the already-sorted frame makes pandas
# warn that the boolean key has to be reindexed.
dfu1[dfu1['units'] == 'kWh'].sort_values(by=['vendor_code', 'from_dt']).tail(20)

  """Entry point for launching an IPython kernel.


Unnamed: 0,site_id,vendor_code,from_dt,thru_dt,service_type,item_desc,usage,cost,units
1214,ANSBG1,VG354933,2015-05-15,2015-06-15,Electricity,Utility Charge,15600.0,1665.3,kWh
1218,ANSBG1,VG354933,2015-06-15,2015-07-15,Electricity,Utility Charge,14820.0,1586.33,kWh
1222,ANSBG1,VG354933,2015-07-15,2015-08-17,Electricity,Utility Charge,15000.0,1605.6,kWh
1227,ANSBG1,VG354933,2015-08-17,2015-09-16,Electricity,Utility Charge,13800.0,1477.15,kWh
1232,ANSBG1,VG354933,2015-09-16,2015-10-15,Electricity,Utility Charge,13020.0,1393.66,kWh
1237,ANSBG1,VG354933,2015-10-15,2015-11-16,Electricity,Utility Charge,13740.0,1470.73,kWh
1242,ANSBG1,VG354933,2015-11-16,2015-12-15,Electricity,Utility Charge,14520.0,1554.22,kWh
1247,ANSBG1,VG354933,2015-12-15,2016-01-18,Electricity,Utility Charge,16500.0,1881.99,kWh
1252,ANSBG1,VG354933,2016-01-18,2016-02-16,Electricity,Utility Charge,16680.0,1902.52,kWh
1257,ANSBG1,VG354933,2016-02-16,2016-03-15,Electricity,Utility Charge,15000.0,1710.9,kWh


### Other Building Data

In [55]:
# Read the 'Building' sheet, skipping the first 3 rows of the sheet and
# indexing the result by site ID.  The sheet is selected with `sheet_name`,
# the current spelling of this read_excel keyword (formerly `sheetname`).
df_bldg = pd.read_excel(
    'data/Other_Building_Data.xlsx',
    sheet_name='Building',
    skiprows=3,
    index_col='site_ID',
)
print(df_bldg.loc['ANSBG1'])
df_bldg.head()

site_name        Animal Control
address         2408 Davis Road
city                  Fairbanks
primary_func     Animal Shelter
year_built                 1993
sq_ft                     14000
onsite_gen                  NaN
dd_site                    PAFA
Name: ANSBG1, dtype: object


Unnamed: 0_level_0,site_name,address,city,primary_func,year_built,sq_ft,onsite_gen,dd_site
site_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ANSBG1,Animal Control,2408 Davis Road,Fairbanks,Animal Shelter,1993,14000,,PAFA
BIGDIP,Big Dipper Ice Arena,1920 Lathrop Street,Fairbanks,Ice Arena,1990,23000,,PAFA


In [64]:
# Small namedtuple demo: build an instance from a sequence of values, then
# print the whole tuple along with one field read by name.
Color = namedtuple('Color', ['red', 'green', 'blue'])
c = Color(*[100, 85, 234])
print(c, c.red)

Color(red=100, green=85, blue=234) 100


### Degree Days

In [37]:
# Read the 'Degree Days' sheet, skipping the first 3 rows of the sheet and
# parsing the 'Month' column as dates.  The sheet is selected with
# `sheet_name`, the current spelling of this read_excel keyword (formerly
# `sheetname`).
df_dd = pd.read_excel(
    'data/Other_Building_Data.xlsx',
    sheet_name='Degree Days',
    skiprows=3,
    parse_dates=['Month'],
)

# Index the table by (year, month number) taken from the 'Month' dates, then
# drop the date column itself.
df_dd['year'] = df_dd['Month'].dt.year
df_dd['mo'] = df_dd['Month'].dt.month
df_dd = df_dd.set_index(['year', 'mo']).drop('Month', axis=1)

print(df_dd.loc[(2017, 7), 'PAFA'])
df_dd.tail()

86


Unnamed: 0_level_0,Unnamed: 1_level_0,PAFA
year,mo,Unnamed: 2_level_1
2017,4,863
2017,5,448
2017,6,135
2017,7,86
2017,8,248


### Fuel Information

In [53]:
# Read the 'Fuel Types' sheet, skipping the first 3 rows of the sheet and
# using its first two columns as a two-level index.  The sheet is selected
# with `sheet_name`, the current spelling of this read_excel keyword
# (formerly `sheetname`).
df_fuel = pd.read_excel(
    'data/Other_Building_Data.xlsx',
    sheet_name='Fuel Types',
    skiprows=3,
    index_col=[0, 1],
)
print(df_fuel.loc[('Electricity', 'kWh'), 'btu_per_unit'])
df_fuel

3412


Unnamed: 0_level_0,Unnamed: 1_level_0,btu_per_unit
fuel,unit,Unnamed: 2_level_1
Natural Gas,CCF,102000
Electricity,kWh,3412
Fuel Oil,Gallons,135000
