In [1]:
# Reload imported modules automatically before each cell executes, so edits
# to library code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Load the line_profiler IPython extension (line-by-line profiling magics).
%load_ext line_profiler

In [2]:
import sys
import os
sys.path.append(os.path.abspath(os.path.join('..','..','..')))
import pudl
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import dask.dataframe as dd
import matplotlib.pyplot as plt
import matplotlib as mpl
%matplotlib inline

In [3]:
# Plot styling: ggplot theme with wide, high-resolution figures.
plt.style.use('ggplot')
mpl.rcParams.update({
    'figure.figsize': (10, 4),
    'figure.dpi': 150,
})
# Render up to 56 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 56)

In [4]:
# Dtypes applied first in cems_to_parquet: the identifier columns are cast to
# plain strings before any numeric downcasting happens.
cems_init_dtypes = {
    'unitid': str,
    'facility_id': str,
    'unit_id_epa': str,
}

# Final dtypes applied last in cems_to_parquet, just before the frame is
# converted to an Arrow table. The keys also define the column set (and order)
# of the empty template frame built from this mapping.
cems_dtypes = {
    'year': 'uint16',
    'state': 'category',
    'plant_name': 'category',
    'plant_id_eia': 'uint16',
    'unitid': 'category',
    'gross_load_mw': 'float32',
    'steam_load_1000_lbs': 'float32',
    'so2_mass_lbs': 'float32',
    'so2_mass_measurement_code': 'category',
    'nox_rate_lbs_mmbtu': 'float32',
    'nox_rate_measurement_code': 'category',
    'nox_mass_lbs': 'float32',
    'nox_mass_measurement_code': 'category',
    'co2_mass_tons': 'float32',
    'co2_mass_measurement_code': 'category',
    'heat_content_mmbtu': 'float32',
    'facility_id': 'category',
    'unit_id_epa': 'category',
    # The unit must be explicit: pandas >= 2.0 rejects the unit-less
    # 'datetime64' string in astype(); older versions resolve both spellings
    # to datetime64[ns].
    'operating_datetime': 'datetime64[ns]',
    'operating_time_hours': 'float32'
}
# Convenience state subsets for restricting an extraction run
# (wecc_states is presumably the WECC region -- confirm).
wecc_states = ['CA','OR','WA','ID','NV','AZ','UT','WY','MT','CO','NM']
small_states = ['ID','MT','WY','NV']
big_states = ['CA','TX','NY','FL']

# Zero-row frame that carries the final column dtypes; converting it to Arrow
# yields the schema handed to pa.Table.from_pandas when data is written.
cems_df_template = pd.DataFrame(columns=cems_dtypes.keys()).astype(cems_dtypes)
cems_table = pa.Table.from_pandas(cems_df_template)

def downcast_numeric(df, from_dtype, to_dtype):
    """Downcast every column of a given dtype to a smaller numeric dtype.

    Args:
        df (pandas.DataFrame): frame to modify; it is updated in place.
        from_dtype: dtype selector passed to ``DataFrame.select_dtypes``
            (e.g. ``'float'`` or ``'int'``) that picks the columns to convert.
        to_dtype (str): value for the ``downcast`` argument of
            ``pandas.to_numeric`` (e.g. ``'float'`` or ``'unsigned'``).

    Returns:
        pandas.DataFrame: the same ``df`` object, with each selected column
        replaced by its downcast version.
    """
    selected = df.select_dtypes(include=[from_dtype])
    for name in selected:
        df[name] = pd.to_numeric(selected[name], downcast=to_dtype)
    return df

def year_from_operating_datetime(df):
    """Add a ``year`` column taken from the ``operating_datetime`` column.

    The frame is modified in place and is also returned, so the function can
    be used as a step in a ``DataFrame.pipe`` chain.

    Args:
        df (pandas.DataFrame): frame with a datetime-typed
            ``operating_datetime`` column.

    Returns:
        pandas.DataFrame: the same ``df`` object with ``year`` added (or
        overwritten).
    """
    timestamps = df['operating_datetime']
    df['year'] = timestamps.dt.year
    return df

def cems_to_parquet(transformed_df_dicts, root_path='epacems_dataset',
                    partition_cols=('year',), compression='snappy'):
    """Write transformed EPA CEMS dataframes to a partitioned Parquet dataset.

    Each non-empty dataframe is cast to the shared CEMS dtypes, converted to
    an Arrow table using the schema of ``cems_table``, and appended to the
    dataset under ``root_path``. Empty dataframes are reported and skipped.

    Args:
        transformed_df_dicts: iterable of mappings from a key (printed as-is
            in the progress line, e.g. ``(1995, 'AL')``) to a pandas
            DataFrame.
        root_path (str): directory of the Parquet dataset to write into.
            Defaults to ``'epacems_dataset'``.
        partition_cols: column names used for directory partitioning.
            Defaults to ``('year',)``; pass e.g. ``('year', 'state')`` for a
            ``<year>/state=XX`` layout.
        compression (str): Parquet compression codec. Defaults to
            ``'snappy'``.

    Returns:
        None. The function only prints progress and writes files.
    """
    for df_dict in transformed_df_dicts:
        for yr_st in df_dict:
            df = df_dict[yr_st]
            print(f'{yr_st}: {len(df)} records')
            if df.empty:
                # Nothing to write for this key.
                continue
            # NOTE(review): astype(str) turns missing values into the literal
            # string 'nan' and floats into e.g. '80.0' -- confirm the id
            # columns arrive clean from the transform step.
            df = (
                df
                .astype(cems_init_dtypes)
                .pipe(downcast_numeric, from_dtype='float', to_dtype='float')
                .pipe(downcast_numeric, from_dtype='int', to_dtype='unsigned')
                # 'year' must exist before the final cast; it is both a key of
                # cems_dtypes and the default partition column.
                .pipe(year_from_operating_datetime)
                .astype(cems_dtypes)
            )
            table = pa.Table.from_pandas(
                df,
                preserve_index=False,
                schema=cems_table.schema,
            )
            pq.write_to_dataset(
                table,
                root_path=root_path,
                partition_cols=list(partition_cols),
                compression=compression,
            )

In [30]:
%%time
# A little cleanup: empty the directory that cems_to_parquet writes into and
# drop any leftover epacems_df name from an earlier run.
!rm -rf epacems_dataset/*
if 'epacems_df' in locals() or 'epacems_df' in globals():
    del epacems_df

# Extract raw EPA CEMS data for all working years and all CEMS states. The
# commented-out arguments below are smaller year/state subsets kept as
# alternatives for quicker runs.
raw_dfs = pudl.extract.epacems.extract(
    epacems_years=pudl.constants.working_years['epacems'],
#    epacems_years=[1995, 2001, 2008, 2017],
#    epacems_years=[2015, 2016, 2017],
#    states=small_states,
#    states=['ID'],
#    states=big_states,
#    states=wecc_states,
    states=pudl.constants.cems_states,
    verbose=True
)

# NOTE(review): the interleaved extract/record-count output suggests extract
# and transform are lazy and only do their work while cems_to_parquet
# iterates -- confirm against the pudl implementation.
transformed_dfs = pudl.transform.epacems.transform(raw_dfs, verbose=True)

# Cast, downcast and write each transformed dataframe to the Parquet dataset.
cems_to_parquet(transformed_dfs)

Transforming tables from EPA CEMS:
Extracting EPA CEMS data...
    1995:
        AL: 1 2 3 4 5 6 7 8 9 10 11 12  
(1995, 'AL'): 122640 records
        AR: 1 2 3 4 5 6 7 8 9 10 11 12  
(1995, 'AR'): 0 records
        AZ: 1 2 3 4 5 6 7 8 9 10 11 12  
(1995, 'AZ'): 0 records
        CA: 1 2 3 4 5 6 7 8 9 10 11 12  
(1995, 'CA'): 0 records
        CO: 1 2 3 4 5 6 7 8 9 10 11 12  
(1995, 'CO'): 0 records
        CT: 1 2 3 4 5 6 7 8 9 10 11 12  
(1995, 'CT'): 0 records
        DC: 1 2 3 4 5 6 7 8 9 10 11 12  
(1995, 'DC'): 0 records
        DE: 1 2 3 4 5 6 7 8 9 10 11 12  
(1995, 'DE'): 0 records
        FL: 1 2 3 4 5 6 7 8 9 10 11 12  
(1995, 'FL'): 87600 records
        GA: 1 2 3 4 5 6 7 8 9 10 11 12  
(1995, 'GA'): 297840 records
        IA: 1 2 3 4 5 6 7 8 9 10 11 12  
(1995, 'IA'): 43800 records
        ID: 1 2 3 4 5 6 7 8 9 10 11 12  
(1995, 'ID'): 0 records
        IL: 1 2 3 4 5 6 7 8 9 10 11 12  
(1995, 'IL'): 350400 records
        IN: 1 2 3 4 5 6 7 8 9 10 11 12  
(1995, 'IN'): 3679

Timer unit: 1e-06 s

Total time: 3674.76 s
File: <ipython-input-29-48170c4ea0b9>
Function: cems_to_parquet at line 49

Line #      Hits         Time  Per Hit   % Time  Line Contents
    49                                           def cems_to_parquet(transformed_df_dicts):
    50      1128 2126722224.0 1885392.0     57.9      for df_dict in transformed_df_dicts:
    51      2254       7846.0      3.5      0.0          for yr_st in df_dict:
    52      1127     223192.0    198.0      0.0              df = df_dict[yr_st]
    53      1127     103571.0     91.9      0.0              print(f'{yr_st}: {len(df)} records')
    54      1127      19449.0     17.3      0.0              if not df.empty:
    55      1076       1134.0      1.1      0.0                  df = (df
    56      1076  630068811.0 585565.8     17.1                      .astype(cems_init_dtypes)
    57      1076   45919200.0  42675.8      1.2                      .pipe(downcast_numeric, from_dtype='float', to_dtype='float')

In [21]:
# Subset of columns to load from the Parquet dataset; passed as `columns=`
# to the dask readers in this notebook.
ops_cols = [
    'co2_mass_tons',
    'facility_id',
    'gross_load_mw',
    'heat_content_mmbtu',
    'operating_time_hours',
    'plant_id_eia',
    'state',
    'unit_id_epa',
    'unitid',
    'operating_datetime'
]
# Directory of the EPA CEMS Parquet dataset under the PUDL directory.
epacems_datadir = os.path.join(pudl.settings.PUDL_DIR,'results','parquet','epacems')
# Glob every Parquet file two directory levels below the dataset root and
# build a dask dataframe over them; rows are loaded later via .compute().
%time cems_dd = dd.read_parquet(epacems_datadir + '/*/*.parquet', columns=ops_cols)

CPU times: user 1 s, sys: 14.4 ms, total: 1.02 s
Wall time: 1.02 s


In [22]:
# Summarize the column count and dtypes of the dask dataframe.
cems_dd.info()

<class 'dask.dataframe.core.DataFrame'>
Columns: 10 entries, co2_mass_tons to operating_datetime
dtypes: category(4), datetime64[ns](1), float32(4), uint16(1)

In [12]:
# Keep only rows whose state is 'CO' and materialize the result (timed).
%time co_df = cems_dd[cems_dd.state=='CO'].compute()

CPU times: user 3min 48s, sys: 1min 38s, total: 5min 27s
Wall time: 1min 28s


In [14]:
# Peek at 10 random rows; no random_state is set, so the rows shown differ
# from run to run.
co_df.sample(10)

Unnamed: 0,co2_mass_tons,facility_id,gross_load_mw,heat_content_mmbtu,operating_time_hours,plant_id_eia,state,unit_id_epa,unitid,operating_datetime
61255,,,,,0.0,6248,CO,,1,2005-02-24 07:00:00
490394,258.799988,80.0,247.0,2556.699951,1.0,470,CO,300.0,2,2012-10-03 02:00:00
268501,,,,,0.0,6761,CO,,C,2004-07-06 13:00:00
8759,40.299999,83.0,33.0,384.600006,1.0,492,CO,313.0,5,2013-01-24 23:00:00
238066,,1333.0,,,0.0,55200,CO,4125.0,CT6,2011-05-01 10:00:00
326165,,,,,0.0,55504,CO,,L1,2006-08-29 05:00:00
478978,,79.0,,,0.0,469,CO,298.0,4,2017-10-29 10:00:00
406207,52.700001,,46.0,513.900024,1.0,468,CO,,2,2007-10-19 07:00:00
85015,,8291.0,,,0.0,50707,CO,90508.0,S005,2012-02-04 07:00:00
577350,13.1,82.0,,220.600006,1.0,478,CO,310.0,1,2010-12-02 06:00:00


In [29]:
# Glob only the Wyoming partition directories of the dataset.
# NOTE(review): this pattern assumes a <partition>/state=WY/ layout, whereas
# cems_to_parquet as defined in this notebook partitions by 'year' only by
# default -- confirm how the dataset under epacems_datadir was written.
wy_datadirs = epacems_datadir + '/*/state=WY/*.parquet'
wy_dd = dd.read_parquet(wy_datadirs, columns=ops_cols)

In [31]:
# Materialize the Wyoming dask dataframe.
wy_df = wy_dd.compute()

In [33]:
# Peek at 20 random rows; no random_state is set, so the rows shown differ
# from run to run.
wy_df.sample(20)

Unnamed: 0,co2_mass_tons,facility_id,gross_load_mw,heat_content_mmbtu,operating_time_hours,plant_id_eia,state,unit_id_epa,unitid,operating_datetime
132324,324.899994,765,300.0,3097.800049,1.0,4162,WY,2639,3,2016-08-03 12:00:00
73809,153.699997,764,150.0,1465.099976,1.0,4158,WY,2635,BW43,2017-05-14 09:00:00
132963,430.100006,819,383.0,4101.200195,1.0,6101,WY,2777,BW91,2017-08-24 03:00:00
140192,,1508,,,0.0,55477,WY,4849,CT2,2017-08-15 08:00:00
66212,331.100006,1069,305.0,3156.699951,1.0,8066,WY,3458,BW74,2017-04-29 20:00:00
15722,447.299988,8296,434.0,4265.100098,1.0,56609,WY,90531,01,2017-01-05 02:00:00
79540,,847,,,0.0,6204,WY,2845,2,2016-05-11 04:00:00
118936,574.299988,1069,545.0,5475.5,1.0,8066,WY,3455,BW71,2017-07-28 16:00:00
124233,457.5,8296,448.0,4361.899902,1.0,56609,WY,90531,01,2017-07-01 09:00:00
169491,322.0,765,299.0,3070.5,1.0,4162,WY,2639,3,2016-10-27 03:00:00
