# Pre-processing of Data Sets

In [1]:
import pandas as pd
from collections import namedtuple
import numpy as np
import time
import pickle
from importlib import reload

import sys
sys.path.insert(0, '../')
import bench_util

%load_ext line_profiler

## Utility Data

In [2]:
# Load the raw utility-bill export. `parse_dates` converts the billing
# period boundaries ('From' and 'Thru') into real Pandas dates.
# NOTE(review): this path is relative to the notebook's working directory,
# while the cell that builds `bench_util.Util` reads from '../data/...' --
# confirm that both locations are intended.
fn = 'data/20171017 AllDataExport.csv'
dfu = pd.read_csv(fn, parse_dates=['From', 'Thru'])

# Pickle it for use in the other notebook
dfu.to_pickle('df_raw.pkl')
# Display the first rows as a quick check that the load worked.
dfu.head()

Unnamed: 0,Site ID,Site Name,Vendor Code,Vendor Name,Account Number,Bill Date,Due Date,Entry Date,Invoice #,Voucher #,From,Thru,Service Name,Item Description,Meter Number,Usage,Cost,Units,Account Financial Code,Site Financial Code
0,TRGR,FNSB-Transit Garage,VP287678,Sourdough Fuel (Petro Star),00013297 (closed),09/28/2010,09/28/2010,01/26/2011,,,2008-11-19,2010-09-28,Oil #1,FED LUS TX,,,3.0,,,
1,TRGR,FNSB-Transit Garage,VP287678,Sourdough Fuel (Petro Star),00013297 (closed),09/28/2010,09/28/2010,01/26/2011,,,2008-11-19,2010-09-28,Oil #1,Fuel Oil #1 (Gallons),,3000.0,7950.0,Gallons,,
2,TRGR,FNSB-Transit Garage,VP287678,Sourdough Fuel (Petro Star),00013297 (closed),09/30/2010,09/30/2010,01/26/2011,,,2010-09-28,2010-09-30,Oil #1,FED LUS TX,,,1.31,,,
3,TRGR,FNSB-Transit Garage,VP287678,Sourdough Fuel (Petro Star),00013297 (closed),09/30/2010,09/30/2010,01/26/2011,,,2010-09-28,2010-09-30,Oil #1,Fuel Oil #1 (Gallons),,1307.0,3463.82,Gallons,,
4,TRGR,FNSB-Transit Garage,VP287678,Sourdough Fuel (Petro Star),00013297 (closed),01/14/2011,01/14/2011,07/28/2014,,,2010-09-30,2011-01-14,Oil #1,Fuel Oil #1 (Gallons),,1880.0,5545.41,Gallons,,


In [3]:
# Number of rows in the raw export DataFrame.
len(dfu)

117276

In [4]:
# Make a utility function object
# reload() re-imports bench_util so that edits made to the module after the
# first import are picked up without restarting the kernel.
reload(bench_util)
# NOTE(review): Util is constructed from the raw bill DataFrame and the path
# of a building-data spreadsheet; how it uses them is not visible from here.
ut = bench_util.Util(dfu, '../data/Other_Building_Data.xlsx')

In [5]:
# Print the distinct values found in each descriptive column of the raw
# export, one column per output line, to get a feel for the data.
cols = [
    'Site ID',
    'Vendor Code',
    'Vendor Name',
    'Account Number',
    'Service Name',
    'Item Description',
    'Meter Number',
    'Units',
    'Account Financial Code',
    'Site Financial Code',
]
for col in cols:
    distinct_vals = list(dfu[col].unique())
    print('{0:24s}: {1}'.format(col, distinct_vals))

Site ID                 : ['TRGR', 'CLXGP2', 'CLXES1', 'CLXSO1', 'CLXSM1', '11', 'TRPBG1', 'NWLBG1', '05', 'HSPSWP', '15A', 'BAOBG1', '15', '15B', 'DIPMP1', 'ANSBG1', 'MSRSWP', 'PRW', '03', '06', '09', '42', '04', '104', '13', '27', '28', '29', '44', '40', '47', '07', '08', 'CLX001', 'CLX002', 'CLX003', 'CLX004', 'VMP001', 'TRPAIR', 'GFP001', 'CACBG1', 'HEZ001', 'KWP001', 'ASLELC1', 'ASLPL1', 'ASLGP2', 'GRP001', '23', 'BALHHW', '12', 'KIP001', 'HEMBG1', '45', '22', 'WSPSWP', 'GSWNP', 'BHPCCS', 'NPP001', 'TRANS10', '14', '10', 'SHW001', 'TRANS06', 'BHPSKI3', 'BHPSKI4', '49', 'TRANS09', 'BAP001', 'ASLELC2', '34', 'KEP001', 'NWLP01', 'CBS001', 'MTP001', 'WSPP01', 'GF001', 'ASLTVR', 'ASLCHU', 'ASLCV1', 'ASLC18', 'ASLPIH', 'ASLHIS', 'ASLSEA', 'ASLC21', 'CSP001', 'WF001', 'MF001', 'MSLL001', '76', 'BHPSKI2', '37', 'GRPLFT', 'DOGPRK', 'MNPPRK', 'SF001', 'NBP001', 'STRBG1', 'MSWBG1', 'TWOCOM', 'NWP001', 'NRP001', 'MSWBG2', 'LF001', 'BENBG1', 'LEABG1', 'CRB001', 'WSPEMR', 'ASLGDM', 'ASLSQD', 'A

In [6]:
# Inspect the raw rows whose service is "Oil #2".
is_oil2 = dfu['Service Name'] == "Oil #2"
dfu[is_oil2]

Unnamed: 0,Site ID,Site Name,Vendor Code,Vendor Name,Account Number,Bill Date,Due Date,Entry Date,Invoice #,Voucher #,From,Thru,Service Name,Item Description,Meter Number,Usage,Cost,Units,Account Financial Code,Site Financial Code
18,TRGR,FNSB-Transit Garage,VP287678,Sourdough Fuel (Petro Star),00013297 (closed),04/11/2011,04/11/2011,11/19/2013,,,2011-03-12,2011-04-11,Oil #2,Tax: Regulatory,,,3.6,,,


In [7]:
# Save out the Unique Site IDs and Names
#df_sites = pd.DataFrame(data=list(set(zip(dfu['Site ID'], dfu['Site Name']))))
#df_sites.to_excel('sites.xlsx')

In [8]:
# Keep only the columns needed downstream and give each one a short name.
# Each pair is (column name in the raw export, column name used from here on).
cols = [
    ('Site ID', 'site_id'),
    ('From', 'from_dt'),
    ('Thru', 'thru_dt'),
    ('Service Name', 'service_type'),
    ('Item Description', 'item_desc'),
    ('Usage', 'usage'),
    ('Cost', 'cost'),
    ('Units', 'units'),
]

# Split the pairs into parallel tuples of old and new column names.
old_cols = tuple(old for old, new in cols)
new_cols = tuple(new for old, new in cols)

# Select just those columns from the original DataFrame and rename them
# in one step; the result is a new frame, independent of dfu.
dfu1 = dfu[list(old_cols)].rename(columns=dict(cols))
dfu1.head()

Unnamed: 0,site_id,from_dt,thru_dt,service_type,item_desc,usage,cost,units
0,TRGR,2008-11-19,2010-09-28,Oil #1,FED LUS TX,,3.0,
1,TRGR,2008-11-19,2010-09-28,Oil #1,Fuel Oil #1 (Gallons),3000.0,7950.0,Gallons
2,TRGR,2010-09-28,2010-09-30,Oil #1,FED LUS TX,,1.31,
3,TRGR,2010-09-28,2010-09-30,Oil #1,Fuel Oil #1 (Gallons),1307.0,3463.82,Gallons
4,TRGR,2010-09-30,2011-01-14,Oil #1,Fuel Oil #1 (Gallons),1880.0,5545.41,Gallons


In [9]:
# Repeat the "Oil #2" check against the trimmed, renamed DataFrame.
dfu1.query('service_type == "Oil #2"')

Unnamed: 0,site_id,from_dt,thru_dt,service_type,item_desc,usage,cost,units
18,TRGR,2011-03-12,2011-04-11,Oil #2,Tax: Regulatory,,3.6,


In [10]:
# Which (service_type, units) combinations occur on rows with positive usage?
df_usage = dfu1.query('usage > 0')
{
    (svc, unit)
    for svc, unit in zip(df_usage.service_type, df_usage.units)
}

{('Electricity', 'kVAR'),
 ('Electricity', 'kVARh'),
 ('Electricity', 'kW'),
 ('Electricity', 'kWh'),
 ('Natural Gas', 'CCF'),
 ('Oil #1', 'Gallons'),
 ('Refuse', 'Loads'),
 ('Refuse', 'Tons'),
 ('Sewer', 'Gallons'),
 ('Steam', 'MMBtu'),
 ('Steam', 'klbs'),
 ('Steam', 'lbs'),
 ('Water', 'Cgallons'),
 ('Water', 'Gallons'),
 ('Water', 'kGal')}

In [11]:
# Look at the positive-usage Electricity rows whose units are kVARh.
df_usage.query('service_type == "Electricity" and units == "kVARh"')

Unnamed: 0,site_id,from_dt,thru_dt,service_type,item_desc,usage,cost,units
10286,HSPSWP,2006-07-17,2006-08-15,Electricity,kVARh/Excess kVARh,1.0,2.47,kVARh
10298,HSPSWP,2006-09-15,2006-10-13,Electricity,kVARh/Excess kVARh,1.0,20.22,kVARh
74072,PRW,2012-09-21,2012-10-19,Electricity,kVARh/Excess kVARh,1.0,0.0,kVARh


### Collapse Non-Usage Charges into "Other Charge"

In [12]:
# Back to processing the main utility bill DataFrame

# Now collapse all the non-usage charges into one item_desc: Other Charge
# This cuts the processing time in half due to not having to split a whole
# bunch of non-consumption charges.
dfu1.loc[dfu1['usage'].isnull(), 'item_desc'] = 'Other Charge'

# Pandas drops rows whose group key is NaN, so replace missing units with a
# placeholder.  Assign the filled column back to the frame rather than calling
# fillna(inplace=True) on `dfu1.units`: that chained in-place form is
# deprecated and no longer updates the parent DataFrame under Copy-on-Write,
# which would leave the NaNs in place and silently drop those rows below.
dfu1['units'] = dfu1['units'].fillna('-')

# Sum usage and cost for rows that share the same site, billing period,
# service, item description and units; then turn the group keys back into
# ordinary columns.
group_keys = ['site_id', 'from_dt', 'thru_dt', 'service_type', 'item_desc', 'units']
dfu1 = dfu1.groupby(group_keys).sum().reset_index()
dfu1.head(20)

Unnamed: 0,site_id,from_dt,thru_dt,service_type,item_desc,units,usage,cost
0,3,2005-11-28,2005-12-29,Sewer,Other Charge,-,,285.06
1,3,2005-11-28,2005-12-29,Water,Other Charge,-,,53.25
2,3,2005-11-28,2005-12-29,Water,Water Usage (Gallons),Gallons,32400.0,240.65
3,3,2005-12-12,2006-01-12,Electricity,Electricity charge,kWh,31.0,23.2
4,3,2005-12-13,2006-01-13,Electricity,Electricity charge,kWh,36.0,23.74
5,3,2005-12-20,2006-01-23,Electricity,Electricity charge,kWh,43608.0,5546.13
6,3,2005-12-29,2006-01-30,Sewer,Other Charge,-,,210.24
7,3,2005-12-29,2006-01-30,Water,Other Charge,-,,53.25
8,3,2005-12-29,2006-01-30,Water,Water Usage (Gallons),Gallons,23800.0,180.77
9,3,2006-01-01,2006-01-31,Natural Gas,Natural gas (CCF),CCF,7394.0,9412.56


### Split Each Bill into Multiple Pieces, each within one Calendar Month

In [13]:
# Test the split_period function
# The displayed result is a list of PeriodSplit records, one per calendar
# month touched by the period, each holding the fraction of the bill and the
# number of days served that fall in that month.
bench_util.split_period('2016-01-25', '2016-06-26')
# this takes about 3.5 msec to do, which is pretty long

[PeriodSplit(cal_year=2016, cal_mo=1, bill_frac=0.042483660130718956, days_served=6.5),
 PeriodSplit(cal_year=2016, cal_mo=2, bill_frac=0.18954248366013071, days_served=29.0),
 PeriodSplit(cal_year=2016, cal_mo=3, bill_frac=0.20261437908496732, days_served=31.0),
 PeriodSplit(cal_year=2016, cal_mo=4, bill_frac=0.19607843137254902, days_served=30.0),
 PeriodSplit(cal_year=2016, cal_mo=5, bill_frac=0.20261437908496732, days_served=31.0),
 PeriodSplit(cal_year=2016, cal_mo=6, bill_frac=0.16666666666666666, days_served=25.5)]

In [14]:
# Break every bill into one record per calendar month it touches, then
# assemble all of those records into a new DataFrame.
recs = []
for ix, row in dfu1.iterrows():
    # Work from a plain dict: copying and modifying a dictionary is *much*
    # faster than doing the same with a Pandas series.
    row_tmpl = row.to_dict()

    # The billing period boundaries are only needed to compute the split, so
    # remove them from the template that each monthly record is copied from.
    st = row_tmpl.pop('from_dt')
    en = row_tmpl.pop('thru_dt')

    for piece in bench_util.split_period(st, en):
        new_row = dict(row_tmpl)
        new_row.update(
            cal_year=piece.cal_year,
            cal_mo=piece.cal_mo,
            days_served=piece.days_served,
        )
        # Allocate usage and cost to this month by its share of the bill.
        new_row['usage'] = new_row['usage'] * piece.bill_frac
        new_row['cost'] = new_row['cost'] * piece.bill_frac
        recs.append(new_row)

dfu2 = pd.DataFrame(recs, index=range(len(recs)))
dfu2.head()

Unnamed: 0,cal_mo,cal_year,cost,days_served,item_desc,service_type,site_id,units,usage
0,11,2005,22.98871,2.5,Other Charge,Sewer,3,-,
1,12,2005,262.07129,28.5,Other Charge,Sewer,3,-,
2,11,2005,4.294355,2.5,Other Charge,Water,3,-,
3,12,2005,48.955645,28.5,Other Charge,Water,3,-,
4,11,2005,19.407258,2.5,Water Usage (Gallons),Water,3,Gallons,2612.903226


In [23]:
# Write the month-split records to dfu2.csv in the working directory.
dfu2.to_csv('dfu2.csv')

### Sum Up the Pieces by Month

In [15]:
# Add up the monthly pieces that share the same site, service, calendar
# month, item description and units, then restore the keys as columns.
month_keys = ['site_id', 'service_type', 'cal_year', 'cal_mo', 'item_desc', 'units']
dfu3 = dfu2.groupby(month_keys).sum().reset_index()
dfu3.head(10)

Unnamed: 0,site_id,service_type,cal_year,cal_mo,item_desc,units,cost,days_served,usage
0,3,Electricity,2005,12,Electricity charge,kWh,1904.65788,49.5,14790.748577
1,3,Electricity,2006,1,Electricity charge,kWh,5430.493797,93.0,42665.790911
2,3,Electricity,2006,2,Electricity charge,kWh,5764.40673,84.0,45010.439348
3,3,Electricity,2006,3,Electricity charge,kWh,6349.255299,93.0,46311.547557
4,3,Electricity,2006,4,Electricity charge,kWh,5529.385224,90.0,40392.812893
5,3,Electricity,2006,5,Electricity charge,kWh,5114.850768,93.0,37585.009199
6,3,Electricity,2006,6,Electricity charge,-,23.225806,36.0,0.0
7,3,Electricity,2006,6,Electricity charge,kWh,3711.073939,54.0,26419.530303
8,3,Electricity,2006,7,Electricity charge,-,16.774194,26.0,0.0
9,3,Electricity,2006,7,Electricity charge,kWh,2982.66747,67.0,18455.905417


In [16]:
# Show the first ten Electricity rows of the monthly totals.
is_electricity = dfu3.service_type == 'Electricity'
dfu3.loc[is_electricity].head(10)

Unnamed: 0,site_id,service_type,cal_year,cal_mo,item_desc,units,cost,days_served,usage
0,3,Electricity,2005,12,Electricity charge,kWh,1904.65788,49.5,14790.748577
1,3,Electricity,2006,1,Electricity charge,kWh,5430.493797,93.0,42665.790911
2,3,Electricity,2006,2,Electricity charge,kWh,5764.40673,84.0,45010.439348
3,3,Electricity,2006,3,Electricity charge,kWh,6349.255299,93.0,46311.547557
4,3,Electricity,2006,4,Electricity charge,kWh,5529.385224,90.0,40392.812893
5,3,Electricity,2006,5,Electricity charge,kWh,5114.850768,93.0,37585.009199
6,3,Electricity,2006,6,Electricity charge,-,23.225806,36.0,0.0
7,3,Electricity,2006,6,Electricity charge,kWh,3711.073939,54.0,26419.530303
8,3,Electricity,2006,7,Electricity charge,-,16.774194,26.0,0.0
9,3,Electricity,2006,7,Electricity charge,kWh,2982.66747,67.0,18455.905417


### Add Fiscal Year Info and MMBtus

In [17]:
# Add Fiscal Year and month columns
# Convert every (calendar year, calendar month) pair once, then split the
# resulting (fiscal year, fiscal month) pairs into the two new columns.
fiscal_pairs = [
    bench_util.calendar_to_fiscal(cyr, cmo)
    for cyr, cmo in zip(dfu3.cal_year, dfu3.cal_mo)
]
fyr = [fis_yr for fis_yr, fis_mo in fiscal_pairs]
fmo = [fis_mo for fis_yr, fis_mo in fiscal_pairs]
dfu3['fiscal_year'] = fyr
dfu3['fiscal_mo'] = fmo
dfu3.head()

Unnamed: 0,site_id,service_type,cal_year,cal_mo,item_desc,units,cost,days_served,usage,fiscal_year,fiscal_mo
0,3,Electricity,2005,12,Electricity charge,kWh,1904.65788,49.5,14790.748577,2006,6
1,3,Electricity,2006,1,Electricity charge,kWh,5430.493797,93.0,42665.790911,2006,7
2,3,Electricity,2006,2,Electricity charge,kWh,5764.40673,84.0,45010.439348,2006,8
3,3,Electricity,2006,3,Electricity charge,kWh,6349.255299,93.0,46311.547557,2006,9
4,3,Electricity,2006,4,Electricity charge,kWh,5529.385224,90.0,40392.812893,2006,10


In [40]:
# Add an 'mmbtu' column: the Btu-per-unit factor for the row's service type
# and units, times the row's usage, divided by one million.
#
# Only three columns are needed, so iterate over them directly (as the
# fiscal-year cell does) instead of using iterrows(), which builds a full
# Series object for every row.
mmbtu = []
for service_type, units, usage in zip(dfu3.service_type, dfu3.units, dfu3.usage):
    row_mmbtu = ut.fuel_btus_per_unit(service_type, units) * usage / 1e6
    # A NaN result (for example, a row whose usage is NaN) is recorded as
    # zero energy so that later sums are not poisoned by NaNs.
    if np.isnan(row_mmbtu):
        row_mmbtu = 0.0
    mmbtu.append(row_mmbtu)
dfu3['mmbtu'] = mmbtu
dfu3.head(10)

Unnamed: 0,site_id,service_type,cal_year,cal_mo,item_desc,units,cost,days_served,usage,fiscal_year,fiscal_mo,mmbtu
0,3,Electricity,2005,12,Electricity charge,kWh,1904.65788,49.5,14790.748577,2006,6,50.466034
1,3,Electricity,2006,1,Electricity charge,kWh,5430.493797,93.0,42665.790911,2006,7,145.575679
2,3,Electricity,2006,2,Electricity charge,kWh,5764.40673,84.0,45010.439348,2006,8,153.575619
3,3,Electricity,2006,3,Electricity charge,kWh,6349.255299,93.0,46311.547557,2006,9,158.015
4,3,Electricity,2006,4,Electricity charge,kWh,5529.385224,90.0,40392.812893,2006,10,137.820278
5,3,Electricity,2006,5,Electricity charge,kWh,5114.850768,93.0,37585.009199,2006,11,128.240051
6,3,Electricity,2006,6,Electricity charge,-,23.225806,36.0,0.0,2006,12,0.0
7,3,Electricity,2006,6,Electricity charge,kWh,3711.073939,54.0,26419.530303,2006,12,90.143437
8,3,Electricity,2006,7,Electricity charge,-,16.774194,26.0,0.0,2007,1,0.0
9,3,Electricity,2006,7,Electricity charge,kWh,2982.66747,67.0,18455.905417,2007,1,62.971549


In [19]:
# Show the first ten Electricity rows again, now including the mmbtu column.
is_electricity = dfu3.service_type == 'Electricity'
dfu3.loc[is_electricity].head(10)

Unnamed: 0,site_id,service_type,cal_year,cal_mo,item_desc,units,cost,days_served,usage,fiscal_year,fiscal_mo,mmbtu
0,3,Electricity,2005,12,Electricity charge,kWh,1904.65788,49.5,14790.748577,2006,6,50.466034
1,3,Electricity,2006,1,Electricity charge,kWh,5430.493797,93.0,42665.790911,2006,7,145.575679
2,3,Electricity,2006,2,Electricity charge,kWh,5764.40673,84.0,45010.439348,2006,8,153.575619
3,3,Electricity,2006,3,Electricity charge,kWh,6349.255299,93.0,46311.547557,2006,9,158.015
4,3,Electricity,2006,4,Electricity charge,kWh,5529.385224,90.0,40392.812893,2006,10,137.820278
5,3,Electricity,2006,5,Electricity charge,kWh,5114.850768,93.0,37585.009199,2006,11,128.240051
6,3,Electricity,2006,6,Electricity charge,-,23.225806,36.0,0.0,2006,12,0.0
7,3,Electricity,2006,6,Electricity charge,kWh,3711.073939,54.0,26419.530303,2006,12,90.143437
8,3,Electricity,2006,7,Electricity charge,-,16.774194,26.0,0.0,2007,1,0.0
9,3,Electricity,2006,7,Electricity charge,kWh,2982.66747,67.0,18455.905417,2007,1,62.971549


In [20]:
# Save the processed monthly DataFrame in both CSV and pickle form.
dfu3.to_csv('dfu3.csv')
dfu3.to_pickle('dfu3.pkl')

### Compare to Old DataFrame to make sure format was the same

In [33]:
# Pivot cost by site (rows) and fiscal year (columns) for the new and the
# previously saved DataFrame, then subtract to see where they differ.
# NOTE(review): 'mean' is pivot_table's default aggregation, spelled out here
# so it is visible.  The comparison is therefore of the *average row cost*,
# which also shifts when the number of rows per site/year changes; if total
# cost is what should match, aggfunc='sum' is probably wanted -- confirm.
pivot_args = dict(index='site_id', values='cost', columns='fiscal_year', aggfunc='mean')
df_test = pd.pivot_table(dfu3, **pivot_args)

# NOTE(review): unpickling executes code stored in the file; only load
# 'dfu3_old.pkl' if it was produced by a trusted run of this notebook.
dfu3_old = pd.read_pickle('dfu3_old.pkl')
df_test_old = pd.pivot_table(dfu3_old, **pivot_args)

df_diff = df_test.sub(df_test_old)
df_diff.to_csv('df_diff.csv')
df_diff

fiscal_year,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018
site_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
03,,,,,,29.033144,0.0,0.000000,0.000000,0.000000,-5.152914,446.678910,
04,,,,,,31.642611,0.0,0.000000,0.000000,0.000000,-19.859050,835.383678,
05,,,,,,98.504327,0.0,0.000000,0.000000,0.000000,3.932541,1869.011458,
06,,,,,,22.854131,0.0,0.000000,0.000000,0.000000,-6.026963,676.229957,
07,,,,,,90.041916,0.0,0.000000,0.000000,0.000000,25.288608,779.637596,
08,,,,,,26.022863,0.0,0.000000,0.000000,0.000000,-13.580637,598.963289,
09,,,,,,104.593805,0.0,0.000000,0.000000,0.000000,-6.884378,748.988652,
10,,,,,,117.685866,0.0,0.000000,0.000000,0.000000,-16.576713,766.081392,
104,,,,,1986.776884,26.881476,0.0,0.000000,0.000000,0.000000,0.000000,770.819877,
11,,,,,,176.522276,0.0,0.000000,0.000000,0.000000,-38.732128,1259.936928,


In [41]:
# Spot-check one site and fiscal month in the new DataFrame ...
dfu3.query("site_id=='13' and fiscal_year==2016 and fiscal_mo==1")

Unnamed: 0,site_id,service_type,cal_year,cal_mo,item_desc,units,cost,days_served,usage,fiscal_year,fiscal_mo,mmbtu
10799,13,Electricity,2015,7,Electricity charge,kWh,4668.356356,31.0,18638.004032,2016,1,63.59287
11076,13,Refuse,2015,7,Other Charge,-,133.754194,31.0,,2016,1,0.0
11204,13,Sewer,2015,7,Other Charge,-,411.637923,31.0,,2016,1,0.0
11464,13,Water,2015,7,Other Charge,-,190.007838,31.0,,2016,1,0.0
11465,13,Water,2015,7,Water Usage (Gallons),Gallons,386.452288,31.0,29310.554435,2016,1,0.0


In [42]:
# ... and the same site and fiscal month in the previously saved DataFrame.
dfu3_old.query("site_id=='13' and fiscal_year==2016 and fiscal_mo==1")

Unnamed: 0,site_id,service_type,cal_year,cal_mo,item_desc,units,cost,days_served,usage,fiscal_year,fiscal_mo,mmbtu
6211,13,Electricity,2015,7,Electricity charge,kWh,4668.356356,31.0,18638.004032,2016,1,63.59287
6345,13,Refuse,2015,7,Other Charge,-,133.754194,31.0,,2016,1,
6413,13,Sewer,2015,7,Other Charge,-,411.637923,31.0,,2016,1,
6553,13,Water,2015,7,Other Charge,-,190.007838,31.0,,2016,1,
6554,13,Water,2015,7,Water Usage (Gallons),Gallons,386.452288,31.0,29310.554435,2016,1,0.0


## Work on Other Utility Functions

### Find All Fiscal Years and Months Present in a DataFrame

In [21]:
# List the (year, month) pairs present in dfu3 using months_present's default
# columns; per the section heading these are fiscal years and months.
# NOTE(review): the full list is long -- consider slicing it, as the
# calendar-month call below does.
bench_util.months_present(dfu3)

[(2006, 2),
 (2006, 3),
 (2006, 4),
 (2006, 5),
 (2006, 6),
 (2006, 7),
 (2006, 8),
 (2006, 9),
 (2006, 10),
 (2006, 11),
 (2006, 12),
 (2007, 1),
 (2007, 2),
 (2007, 3),
 (2007, 4),
 (2007, 5),
 (2007, 6),
 (2007, 7),
 (2007, 8),
 (2007, 9),
 (2007, 10),
 (2007, 11),
 (2007, 12),
 (2008, 1),
 (2008, 2),
 (2008, 3),
 (2008, 4),
 (2008, 5),
 (2008, 6),
 (2008, 7),
 (2008, 8),
 (2008, 9),
 (2008, 10),
 (2008, 11),
 (2008, 12),
 (2009, 1),
 (2009, 2),
 (2009, 3),
 (2009, 4),
 (2009, 5),
 (2009, 6),
 (2009, 7),
 (2009, 8),
 (2009, 9),
 (2009, 10),
 (2009, 11),
 (2009, 12),
 (2010, 1),
 (2010, 2),
 (2010, 3),
 (2010, 4),
 (2010, 5),
 (2010, 6),
 (2010, 7),
 (2010, 8),
 (2010, 9),
 (2010, 10),
 (2010, 11),
 (2010, 12),
 (2011, 1),
 (2011, 2),
 (2011, 3),
 (2011, 4),
 (2011, 5),
 (2011, 6),
 (2011, 7),
 (2011, 8),
 (2011, 9),
 (2011, 10),
 (2011, 11),
 (2011, 12),
 (2012, 1),
 (2012, 2),
 (2012, 3),
 (2012, 4),
 (2012, 5),
 (2012, 6),
 (2012, 7),
 (2012, 8),
 (2012, 9),
 (2012, 10),
 (2012, 1

In [22]:
# Same listing, but based on the calendar year/month columns; show only the
# last five entries.
bench_util.months_present(dfu3, 'cal_year', 'cal_mo')[-5:]

[(2017, 8), (2017, 9), (2017, 10), (2017, 11), (2017, 12)]

## Convert Notebook to Script

In [43]:
# Convert the notebook to a script.
#!jupyter nbconvert --to script preprocess_data.ipynb

[NbConvertApp] Converting notebook preprocess_data.ipynb to script
[NbConvertApp] Writing 5511 bytes to preprocess_data.py
