# Pre-processing of Data Sets

In [24]:
import pandas as pd
from collections import namedtuple
import numpy as np
import time

%load_ext line_profiler

The line_profiler extension is already loaded. To reload it, use:
  %reload_ext line_profiler


### Utility Data

In [2]:
# Load the utility bill export.  The 'From' and 'Thru' billing-period
# columns are parsed into real Pandas datetimes while reading.
dfu = pd.read_csv(
    'data/ExampleDataExport.csv',
    parse_dates=['From', 'Thru'],
)
dfu.head()

Unnamed: 0,Site ID,Site Name,Vendor Code,Vendor Name,Account Number,Bill Date,Due Date,Entry Date,Invoice #,Voucher #,From,Thru,Service Name,Item Description,Meter Number,Usage,Cost,Units,Account Financial Code,Site Financial Code
0,ANSBG1,FNSB-Animal Control,VF314940,Fairbanks Natural Gas,10282 (2408 DAVIS),8/30/2010,8/30/2010,1/24/2011,,,2010-07-29,2010-08-30,Natural Gas,Natural gas (CCF),,2716.0,6222.36,CCF,61837,
1,ANSBG1,FNSB-Animal Control,VF314940,Fairbanks Natural Gas,10282 (2408 DAVIS),8/30/2010,8/30/2010,1/24/2011,,,2010-07-29,2010-08-30,Natural Gas,Regulatory Cost Charge,,,21.53,,61837,
2,ANSBG1,FNSB-Animal Control,VF314940,Fairbanks Natural Gas,10282 (2408 DAVIS),8/30/2010,8/30/2010,1/24/2011,,,2010-07-29,2010-08-30,Natural Gas,Customer Charge,,,17.25,,61837,
3,ANSBG1,FNSB-Animal Control,VF314940,Fairbanks Natural Gas,10282 (2408 DAVIS),9/30/2010,9/30/2010,1/24/2011,,,2010-08-30,2010-09-30,Natural Gas,Natural gas (CCF),,3526.0,8078.07,CCF,61837,
4,ANSBG1,FNSB-Animal Control,VF314940,Fairbanks Natural Gas,10282 (2408 DAVIS),9/30/2010,9/30/2010,1/24/2011,,,2010-08-30,2010-09-30,Natural Gas,Regulatory Cost Charge,,,27.93,,61837,


In [3]:
# Print the distinct values found in each descriptive column of the export,
# one column per line, with the column name padded to 24 characters.
cols = [
    'Site ID',
    'Vendor Code',
    'Vendor Name',
    'Account Number',
    'Service Name',
    'Item Description',
    'Meter Number',
    'Units',
    'Account Financial Code',
    'Site Financial Code',
]
for col in cols:
    distinct_values = list(dfu[col].unique())
    print('{:24s}: {}'.format(col, distinct_values))

Site ID                 : ['ANSBG1']
Vendor Code             : ['VF314940', 'VG372746', 'VG354933']
Vendor Name             : ['Fairbanks Natural Gas', 'Golden Heart Utilities', 'Golden Valley Electric']
Account Number          : ['10282  (2408 DAVIS)', '1311001100', '31850']
Service Name            : ['Natural Gas', 'Water', 'Sewer', 'Electricity']
Item Description        : ['Natural gas (CCF)', 'Regulatory Cost Charge', 'Customer Charge', 'Gas Charge (CCF)', 'Late charge', 'Water Usage (Gallons)', 'Sewer Usage (Gallons)', 'Water Fixed Charge', 'Sewer Fixed Charge', 'Fire Protection', 'Other charges', 'Misc. credit', 'Meter charge', 'Customer Charge - Water', 'Customer Charge - Sewer', 'Plant Replacement ADJ', 'Cost of Energy Adjustmen', 'Energy charge', 'KW Charge', 'Fuel Adjustment', 'On peak demand', 'Fuel cost adjustment', 'Fuel & Purchased Power', 'Utility Charge']
Meter Number            : [nan, 89672.0]
Units                   : ['CCF', nan, 'Gallons', 'kWh', 'kW']
Account Fina

In [4]:
# Keep only the columns needed downstream and give them short names.
# Each pair is (name in the export, name used in this notebook).
cols = [
    ('Site ID', 'site_id'),
    ('Vendor Code', 'vendor_code'),
    ('From', 'from_dt'),
    ('Thru', 'thru_dt'),
    ('Service Name', 'service_type'),
    ('Item Description', 'item_desc'),
    ('Usage', 'usage'),
    ('Cost', 'cost'),
    ('Units', 'units'),
]

# Separate the pairs into parallel tuples of old and new column names.
old_cols, new_cols = zip(*cols)

# Select just the mapped columns from the original dataframe, then rename
# them in the same chain.
dfu1 = dfu.loc[:, list(old_cols)].rename(columns=dict(cols))
dfu1.head()

Unnamed: 0,site_id,vendor_code,from_dt,thru_dt,service_type,item_desc,usage,cost,units
0,ANSBG1,VF314940,2010-07-29,2010-08-30,Natural Gas,Natural gas (CCF),2716.0,6222.36,CCF
1,ANSBG1,VF314940,2010-07-29,2010-08-30,Natural Gas,Regulatory Cost Charge,,21.53,
2,ANSBG1,VF314940,2010-07-29,2010-08-30,Natural Gas,Customer Charge,,17.25,
3,ANSBG1,VF314940,2010-08-30,2010-09-30,Natural Gas,Natural gas (CCF),3526.0,8078.07,CCF
4,ANSBG1,VF314940,2010-08-30,2010-09-30,Natural Gas,Regulatory Cost Charge,,27.93,


In [5]:
# Now collapse all the non-usage charges (rows with no 'usage' value) into
# one item_desc: 'Other Charge'.  This cuts the processing time in half due
# to not having to split a whole bunch of non-consumption charges.
dfu1.loc[dfu1['usage'].isnull(), 'item_desc'] = 'Other Charge'

# Pandas GroupBy drops rows whose key is NaN, so replace missing units with a
# placeholder.  The result is assigned back to the column instead of calling
# fillna(inplace=True) on the attribute accessor: that chained in-place form
# operates on a temporary under Copy-on-Write and leaves dfu1 unchanged,
# which would silently discard every 'Other Charge' row in the groupby below.
dfu1['units'] = dfu1['units'].fillna('-')

# Sum usage and cost over rows sharing the same billing period and item.
# NOTE(review): the saved output shows NaN usage for 'Other Charge' rows;
# pandas >= 0.22 sums an all-NaN group to 0.0 unless min_count=1 is passed —
# confirm which is wanted downstream.
dfu1 = dfu1.groupby(
    ['site_id', 'vendor_code', 'from_dt', 'thru_dt', 'service_type', 'item_desc', 'units']
).sum().reset_index()
dfu1.head(20)

Unnamed: 0,site_id,vendor_code,from_dt,thru_dt,service_type,item_desc,units,usage,cost
0,ANSBG1,VF314940,2010-07-29,2010-08-30,Natural Gas,Natural gas (CCF),CCF,2716.0,6222.36
1,ANSBG1,VF314940,2010-07-29,2010-08-30,Natural Gas,Other Charge,-,,38.78
2,ANSBG1,VF314940,2010-08-30,2010-09-30,Natural Gas,Natural gas (CCF),CCF,3526.0,8078.07
3,ANSBG1,VF314940,2010-08-30,2010-09-30,Natural Gas,Other Charge,-,,45.18
4,ANSBG1,VF314940,2010-09-30,2010-10-29,Natural Gas,Natural gas (CCF),CCF,3921.0,8983.01
5,ANSBG1,VF314940,2010-09-30,2010-10-29,Natural Gas,Other Charge,-,,48.3
6,ANSBG1,VF314940,2010-10-29,2010-11-30,Natural Gas,Natural gas (CCF),CCF,5949.0,13629.16
7,ANSBG1,VF314940,2010-10-29,2010-11-30,Natural Gas,Other Charge,-,,64.33
8,ANSBG1,VF314940,2010-11-30,2010-12-29,Natural Gas,Natural gas (CCF),CCF,5098.0,11679.52
9,ANSBG1,VF314940,2010-11-30,2010-12-29,Natural Gas,Other Charge,-,,57.6


In [6]:
PeriodSplit = namedtuple('PeriodSplit', 'cal_year cal_mo bill_frac days_served')


def split_period(start_date, end_date):
    """Splits a range of service dates from a utility bill into pieces that
    fit within calendar months.

    For the first and last day in the date range, it is assumed that only half
    the day is served (this is typically the meter reading day).

    Parameters
    ----------
    start_date, end_date : anything accepted by ``pd.date_range``
        First and last day of the billing period (both inclusive).

    Returns
    -------
    list of PeriodSplit
        One entry per calendar month touched, in chronological order, with:
        ``cal_year`` / ``cal_mo`` - the calendar year and month of the piece,
        ``days_served`` - days of service falling in that month,
        ``bill_frac`` - ``days_served`` divided by the total days served.
    """
    # Daily series whose value is the fraction of each day served:
    # 1.0 for every day except the first and last.
    ser = pd.Series(data=1.0, index=pd.date_range(start_date, end_date))

    # the half days at the beginning and end
    ser.iloc[0] = 0.5
    ser.iloc[-1] = 0.5

    tot_days = ser.sum()    # total days served in the bill

    # Break into monthly pieces and add up the days served.  Grouping on the
    # (year, month) of the index gives the same pieces as resample('M') but
    # does not rely on the 'M' offset alias, which newer pandas deprecates.
    # Groups come back sorted by key, i.e. chronologically.
    pieces = ser.groupby([ser.index.year, ser.index.month]).sum()

    return [
        # cast the group keys so cal_year / cal_mo stay plain Python ints
        PeriodSplit(cal_year=int(yr), cal_mo=int(mo), bill_frac=days / tot_days, days_served=days)
        for (yr, mo), days in pieces.items()
    ]

In [7]:
split_period('2016-01-25', '2016-06-26')
# this takes about 3.5 msec to do, which is pretty long

[PeriodSplit(cal_year=2016, cal_mo=1, bill_frac=0.042483660130718956, days_served=6.5),
 PeriodSplit(cal_year=2016, cal_mo=2, bill_frac=0.18954248366013071, days_served=29.0),
 PeriodSplit(cal_year=2016, cal_mo=3, bill_frac=0.20261437908496732, days_served=31.0),
 PeriodSplit(cal_year=2016, cal_mo=4, bill_frac=0.19607843137254902, days_served=30.0),
 PeriodSplit(cal_year=2016, cal_mo=5, bill_frac=0.20261437908496732, days_served=31.0),
 PeriodSplit(cal_year=2016, cal_mo=6, bill_frac=0.16666666666666666, days_served=25.5)]

In [28]:
# Break every billing-period row into one record per calendar month it
# touches, then assemble all the records into a new DataFrame.
recs = []
for _, bill_row in dfu1.iterrows():
    # a plain dict is *much* faster to copy and modify than a Pandas series
    shared_fields = bill_row.to_dict()

    # The billing period boundaries are only needed to do the split, so
    # take them out of the fields that get copied into each monthly record.
    period_start = shared_fields.pop('from_dt')
    period_end = shared_fields.pop('thru_dt')

    for piece in split_period(period_start, period_end):
        new_row = dict(shared_fields)
        new_row.update(
            cal_year=piece.cal_year,
            cal_mo=piece.cal_mo,
            days_served=piece.days_served,
            # usage and cost are prorated by the share of the bill in this month
            usage=new_row['usage'] * piece.bill_frac,
            cost=new_row['cost'] * piece.bill_frac,
        )
        recs.append(new_row)

dfu2 = pd.DataFrame(recs, index=range(len(recs)))

dfu2.to_csv('dfu2.csv')
dfu2.head()

Unnamed: 0,cal_mo,cal_year,cost,days_served,item_desc,service_type,site_id,units,usage,vendor_code
0,7,2010,486.121875,2.5,Natural gas (CCF),Natural Gas,ANSBG1,CCF,212.1875,VF314940
1,8,2010,5736.238125,29.5,Natural gas (CCF),Natural Gas,ANSBG1,CCF,2503.8125,VF314940
2,7,2010,3.029688,2.5,Other Charge,Natural Gas,ANSBG1,-,,VF314940
3,8,2010,35.750312,29.5,Other Charge,Natural Gas,ANSBG1,-,,VF314940
4,8,2010,390.874355,1.5,Natural gas (CCF),Natural Gas,ANSBG1,CCF,170.612903,VF314940


In [39]:
# Combine the monthly pieces that came from different bills: every remaining
# numeric column is summed within each site / vendor / service / calendar
# month / item / unit combination.
monthly_keys = ['site_id', 'vendor_code', 'service_type', 'cal_year', 'cal_mo', 'item_desc', 'units']
dfu3 = dfu2.groupby(monthly_keys).sum().reset_index()
dfu3.head(10)

Unnamed: 0,site_id,vendor_code,service_type,cal_year,cal_mo,item_desc,units,cost,days_served,usage
0,ANSBG1,VF314940,Natural Gas,2010,7,Natural gas (CCF),CCF,486.121875,2.5,212.1875
1,ANSBG1,VF314940,Natural Gas,2010,7,Other Charge,-,3.029688,2.5,
2,ANSBG1,VF314940,Natural Gas,2010,8,Natural gas (CCF),CCF,6127.11248,31.0,2674.425403
3,ANSBG1,VF314940,Natural Gas,2010,8,Other Charge,-,37.936442,31.0,
4,ANSBG1,VF314940,Natural Gas,2010,9,Natural gas (CCF),CCF,7842.075128,30.0,3422.990545
5,ANSBG1,VF314940,Natural Gas,2010,9,Other Charge,-,43.82663,30.0,
6,ANSBG1,VF314940,Natural Gas,2010,10,Natural gas (CCF),CCF,9892.908642,31.0,4318.162177
7,ANSBG1,VF314940,Natural Gas,2010,10,Other Charge,-,52.493023,31.0,
8,ANSBG1,VF314940,Natural Gas,2010,11,Natural gas (CCF),CCF,12765.752909,30.0,5572.130927
9,ANSBG1,VF314940,Natural Gas,2010,11,Other Charge,-,60.297322,30.0,


In [38]:
# Spot-check the first monthly rows for the electricity service.
dfu3.loc[dfu3['service_type'] == 'Electricity'].head(10)

Unnamed: 0,site_id,vendor_code,service_type,cal_year,cal_mo,item_desc,units,cost,days_served,usage
165,ANSBG1,VG354933,Electricity,2010,7,Energy charge,kWh,602.3,15.5,7290.0
166,ANSBG1,VG354933,Electricity,2010,7,KW Charge,kW,0.0,15.5,16.45
167,ANSBG1,VG354933,Electricity,2010,7,Other Charge,-,699.36,15.5,
168,ANSBG1,VG354933,Electricity,2010,8,Energy charge,kWh,1212.035,31.0,14670.0
169,ANSBG1,VG354933,Electricity,2010,8,KW Charge,kW,0.0,31.0,30.75
170,ANSBG1,VG354933,Electricity,2010,8,Other Charge,-,1387.745,31.0,
171,ANSBG1,VG354933,Electricity,2010,9,Energy charge,kWh,1179.815,30.0,14280.0
172,ANSBG1,VG354933,Electricity,2010,9,KW Charge,kW,0.0,30.0,27.6
173,ANSBG1,VG354933,Electricity,2010,9,Other Charge,-,1332.65,30.0,
174,ANSBG1,VG354933,Electricity,2010,10,Energy charge,kWh,1300.944839,31.0,15746.129032


In [33]:
# Tally how many monthly rows have each days_served total.
dfu3['days_served'].value_counts()

31.0    370
30.0    213
28.0     36
29.0     17
14.5      9
15.5      8
28.5      5
5.5       4
26.5      4
22.5      4
0.5       4
2.5       3
30.5      3
29.5      2
16.5      2
17.5      2
1.0       2
27.5      1
13.5      1
Name: days_served, dtype: int64

In [26]:
# Profiling scratch, left commented out.
# NOTE(review): test_func is not defined in the visible part of this
# notebook — confirm it exists before re-enabling these lines.
# %lprun -f test_func test_func()
# %timeit test_func().head()

1 loop, best of 3: 1.98 s per loop


In [None]:
# Add fiscal-period fields to new_row: calendar months 1-6 map to fiscal
# months 7-12 of the same year number, and calendar months 7-12 map to fiscal
# months 1-6 of the following year number.
# NOTE(review): this uses `piece` and `new_row` from the bill-splitting loop;
# it looks like it was meant to live inside that loop — confirm.
fy_offset, mo_shift = (0, 6) if piece.cal_mo <= 6 else (1, -6)
new_row['fiscal_year'] = piece.cal_year + fy_offset
new_row['fiscal_mo'] = piece.cal_mo + mo_shift


### Other Building Data

In [55]:
# Load the 'Building' sheet, skipping the 3 rows above the header and
# indexing by site ID.  The keyword is `sheet_name`; the old `sheetname`
# spelling is rejected by current pandas.
df_bldg = pd.read_excel(
    'data/Other_Building_Data.xlsx',
    sheet_name='Building',
    skiprows=3,
    index_col='site_ID',
)
print(df_bldg.loc['ANSBG1'])
df_bldg.head()

site_name        Animal Control
address         2408 Davis Road
city                  Fairbanks
primary_func     Animal Shelter
year_built                 1993
sq_ft                     14000
onsite_gen                  NaN
dd_site                    PAFA
Name: ANSBG1, dtype: object


Unnamed: 0_level_0,site_name,address,city,primary_func,year_built,sq_ft,onsite_gen,dd_site
site_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ANSBG1,Animal Control,2408 Davis Road,Fairbanks,Animal Shelter,1993,14000,,PAFA
BIGDIP,Big Dipper Ice Arena,1920 Lathrop Street,Fairbanks,Ice Arena,1990,23000,,PAFA


In [64]:
# Small namedtuple demo: build an instance from a sequence of values and
# read one field back by name.
Color = namedtuple('Color', ['red', 'green', 'blue'])
c = Color(*[100, 85, 234])
print(c, c.red)

Color(red=100, green=85, blue=234) 100


### Degree Days

In [37]:
# Load the 'Degree Days' sheet, skipping the 3 rows above the header and
# parsing the 'Month' column as dates.  The keyword is `sheet_name`; the old
# `sheetname` spelling is rejected by current pandas.
df_dd = pd.read_excel(
    'data/Other_Building_Data.xlsx',
    sheet_name='Degree Days',
    skiprows=3,
    parse_dates=['Month'],
)

# Index by (year, mo) so a month's value can be looked up with a tuple;
# the 'Month' column is dropped once its parts have been extracted.
df_dd['year'] = df_dd['Month'].dt.year
df_dd['mo'] = df_dd['Month'].dt.month
df_dd = df_dd.set_index(['year', 'mo']).drop('Month', axis=1)

print(df_dd.loc[(2017, 7), 'PAFA'])
df_dd.tail()

86


Unnamed: 0_level_0,Unnamed: 1_level_0,PAFA
year,mo,Unnamed: 2_level_1
2017,4,863
2017,5,448
2017,6,135
2017,7,86
2017,8,248


### Fuel Information

In [53]:
# Load the 'Fuel Types' sheet, skipping the 3 rows above the header and
# using the first two columns as a two-level index (shown as fuel, unit in
# the output).  The keyword is `sheet_name`; the old `sheetname` spelling is
# rejected by current pandas.
df_fuel = pd.read_excel(
    'data/Other_Building_Data.xlsx',
    sheet_name='Fuel Types',
    skiprows=3,
    index_col=[0, 1],
)
print(df_fuel.loc[('Electricity', 'kWh'), 'btu_per_unit'])
df_fuel

3412


Unnamed: 0_level_0,Unnamed: 1_level_0,btu_per_unit
fuel,unit,Unnamed: 2_level_1
Natural Gas,CCF,102000
Electricity,kWh,3412
Fuel Oil,Gallons,135000
