# Notebook Preamble

## IPython Magic

In [1]:
# Reload imported modules automatically before code is executed, so edits to
# local packages are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 3

## Notebook Imports

In [2]:
# Standard Library Imports
import logging
import os
import sys
from pathlib import Path

# These environment variables must be in place before the intake catalog is
# imported below. Alternatively, export them in your shell and drop this block.
os.environ.update(
    {
        "PUDL_INTAKE_CACHE": str(Path.home().joinpath(".cache/intake")),
        "PUDL_INTAKE_PATH": "gs://intake.catalyst.coop/test",
    }
)

# Local data if you've got it! (`parent` is a property, not a method.)
# os.environ["PUDL_INTAKE_PATH"] = str(Path.cwd().parent / "data")

# 3rd Party Imports:
import intake
import pandas as pd
from pudl_catalog.helpers import year_state_filter

# Years and states used to filter the EPA CEMS data in the cells below.
TEST_STATES = ["ID", "CO", "TX"]
TEST_YEARS = [2019, 2020]

## Set up a logger

In [3]:
# Build the stdout handler first: messages are emitted bare, with no level or
# timestamp prefix.
formatter = logging.Formatter("%(message)s")
handler = logging.StreamHandler(stream=sys.stdout)
handler.setFormatter(formatter)

# Configure the root logger. Assigning the handler list outright (rather than
# calling addHandler) keeps re-running this cell from stacking duplicates.
logger = logging.getLogger()
logger.handlers = [handler]
logger.setLevel(logging.INFO)

# Test Intake & Parquet Functionality & Performance

This notebook demonstrates several different ways of organizing and accessing the same EPA CEMS data:
* Local storage on disk vs. remote storage in Google Cloud Storage buckets
* Directly accessing the data via `pandas.read_parquet()` vs. an Intake catalog.
* Using one big Parquet file for all data vs. separate small files for each combination of state & year.

## Data for local catalog testing
Download these files and place them in the `data` directory at the top level of the repo. Make sure you extract the tarball.
* Single Parquet file: https://storage.googleapis.com/catalyst.coop/intake/test/hourly_emissions_epacems.parquet
* Year-state partitioned data: https://storage.googleapis.com/catalyst.coop/intake/test/hourly_emissions_epacems.tar

## What Intake data sources are installed?

In [4]:
# List the names of the Intake catalogs available in this environment.
list(intake.cat)

['pudl_cat']

In [5]:
# Grab the PUDL catalog and list the names of the data sources it contains.
pudl_cat = intake.cat.pudl_cat
list(pudl_cat)

['hourly_emissions_epacems', 'hourly_emissions_epacems_partitioned']

In [6]:
# Display the catalog itself: its driver, description, and metadata.
pudl_cat

pudl_cat:
  args:
    path: /home/zane/code/catalyst/pudl-catalog/src/pudl_catalog/pudl_catalog.yaml
  description: A catalog of open energy system data for use by climate advocates,
    policymakers, journalists, researchers, and other members of civil society.
  driver: intake.catalog.local.YAMLFileCatalog
  metadata:
    creator:
      email: pudl@catalyst.coop
      path: https://catalyst.coop
      title: Catalyst Cooperative
    parameters:
      cache_method:
        allowed:
        - 'simplecache::'
        - ''
        default: 'simplecache::'
        description: Whether to cache data locally; empty string to disable caching.
        type: str


In [7]:
# Display the configuration and metadata of the single-file EPA CEMS source.
pudl_cat.hourly_emissions_epacems

hourly_emissions_epacems:
  args:
    engine: pyarrow
    storage_options:
      simplecache:
        cache_storage: /home/zane/.cache/intake
      token: anon
    urlpath: simplecache::gs://intake.catalyst.coop/test/hourly_emissions_epacems.parquet
  description: Hourly pollution emissions and plant operational data reported via
    Continuous Emissions Monitoring Systems (CEMS) as required by 40 CFR Part 75.
    Includes CO2, NOx, and SO2, as well as the heat content of fuel consumed and gross
    power output. Hourly values reported by US EIA ORISPL code and emissions unit
    (smokestack) ID.
  driver: intake_parquet.source.ParquetSource
  metadata:
    catalog_dir: /home/zane/code/catalyst/pudl-catalog/src/pudl_catalog/
    license:
      name: CC-BY-4.0
      path: https://creativecommons.org/licenses/by/4.0
      title: Creative Commons Attribution 4.0
    path: https://ampd.epa.gov/ampd
    provider: US Environmental Protection Agency Air Markets Program
    title: Continuous E

### Parquet metadata with `discover()`
* Categorical values showing up as integers.
* String values showing up as objects.
* No length in the shape, but 19 columns.
* `npartitions` apparently refers to the number of files, not to row-group-based partitions.

In [8]:
%%time
# NOTE(review): the recorded run below finished in under 2 s, presumably because
# the file was already in the local cache. With a cold cache this can take much
# longer, since it may download the whole dataset first.
pudl_cat.hourly_emissions_epacems.discover()

CPU times: user 714 ms, sys: 70.6 ms, total: 785 ms
Wall time: 1.75 s


{'dtype': {'plant_id_eia': 'int32',
  'unitid': 'object',
  'operating_datetime_utc': 'datetime64[ns, UTC]',
  'year': 'int32',
  'state': 'int64',
  'facility_id': 'int32',
  'unit_id_epa': 'object',
  'operating_time_hours': 'float32',
  'gross_load_mw': 'float32',
  'heat_content_mmbtu': 'float32',
  'steam_load_1000_lbs': 'float32',
  'so2_mass_lbs': 'float32',
  'so2_mass_measurement_code': 'int64',
  'nox_rate_lbs_mmbtu': 'float32',
  'nox_rate_measurement_code': 'int64',
  'nox_mass_lbs': 'float32',
  'nox_mass_measurement_code': 'int64',
  'co2_mass_tons': 'float32',
  'co2_mass_measurement_code': 'int64'},
 'shape': (None, 19),
 'npartitions': 1,
 'metadata': {'title': 'Continuous Emissions Monitoring System (CEMS) Hourly Data',
  'type': 'application/parquet',
  'provider': 'US Environmental Protection Agency Air Markets Program',
  'path': 'https://ampd.epa.gov/ampd',
  'license': {'name': 'CC-BY-4.0',
   'title': 'Creative Commons Attribution 4.0',
   'path': 'https://creat

## Normal usage

In [9]:
%%time
# Report where the data is read from and where it is cached.
print(f"Reading data from {os.getenv('PUDL_INTAKE_PATH')}")
print(f"Caching data to {os.getenv('PUDL_INTAKE_CACHE')}")

# Build one (year, state) filter per combination and show it.
filters = year_state_filter(states=TEST_STATES, years=TEST_YEARS)
display(filters)

# Apply the filters to the catalog source, then materialize the lazy dask
# dataframe into an in-memory pandas dataframe.
epacems_source = pudl_cat.hourly_emissions_epacems(filters=filters)
epacems_df = epacems_source.to_dask().compute()

Reading data from gs://intake.catalyst.coop/test
Caching data to /home/zane/.cache/intake


[[('year', '=', 2019), ('state', '=', 'ID')],
 [('year', '=', 2019), ('state', '=', 'CO')],
 [('year', '=', 2019), ('state', '=', 'TX')],
 [('year', '=', 2020), ('state', '=', 'ID')],
 [('year', '=', 2020), ('state', '=', 'CO')],
 [('year', '=', 2020), ('state', '=', 'TX')]]

CPU times: user 3.21 s, sys: 1.11 s, total: 4.32 s
Wall time: 3.86 s


In [10]:
# Fix the seed so the displayed sample is identical on every Restart & Run All.
epacems_df.sample(20, random_state=0)

Unnamed: 0,plant_id_eia,unitid,operating_datetime_utc,year,state,facility_id,unit_id_epa,operating_time_hours,gross_load_mw,heat_content_mmbtu,steam_load_1000_lbs,so2_mass_lbs,so2_mass_measurement_code,nox_rate_lbs_mmbtu,nox_rate_measurement_code,nox_mass_lbs,nox_mass_measurement_code,co2_mass_tons,co2_mass_measurement_code
2536089,55154,2,2019-07-25 15:00:00+00:00,2019,TX,1304,4033,0.0,0.0,0.0,,,,,,,,,
1929384,50109,HRSG1,2019-05-04 06:00:00+00:00,2019,TX,8132,89668,0.0,0.0,0.0,,,,,,,,,
7073589,55120,CTG-1,2020-09-17 03:00:00+00:00,2020,TX,1280,3945,0.0,0.0,0.0,,,,,,,,,
798593,4937,CT-1,2019-01-07 23:00:00+00:00,2019,TX,774,90812,1.0,127.0,1447.900024,,0.869,Measured,0.005,Measured,7.2,Measured,86.0,Measured
2416671,3548,GT-4A,2019-07-09 21:00:00+00:00,2019,TX,704,89917,1.0,22.0,350.0,,,,0.7,Calculated,245.0,LME,,
3553711,3628,1,2019-11-01 13:00:00+00:00,2019,TX,720,2493,0.0,0.0,0.0,,,,,,,,,
3246961,3507,9,2019-10-16 07:00:00+00:00,2019,TX,697,2435,0.0,0.0,0.0,,,,,,,,,
6279339,55299,CTG3,2020-06-15 09:00:00+00:00,2020,TX,1411,4479,1.0,261.0,1603.800049,,0.96,Measured,0.006,Measured,9.6,Measured,95.300003,Measured
563711,56445,CT-02,2019-11-15 06:00:00+00:00,2019,CO,8152,89737,0.0,0.0,0.0,,,,,,,,,
2564077,55358,CT2,2019-07-13 19:00:00+00:00,2019,TX,1431,4546,1.0,231.0,1625.199951,,1.0,Substitute,0.011,Measured,17.9,Measured and Substitute,96.599998,Measured and Substitute


In [11]:
# Summarize column dtypes and non-null counts, and measure the real ("deep")
# memory footprint, including the contents of object columns.
epacems_df.info(show_counts=True, memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8006424 entries, 0 to 8006423
Data columns (total 19 columns):
 #   Column                     Non-Null Count    Dtype              
---  ------                     --------------    -----              
 0   plant_id_eia               8006424 non-null  int32              
 1   unitid                     8006424 non-null  object             
 2   operating_datetime_utc     8006424 non-null  datetime64[ns, UTC]
 3   year                       8006424 non-null  int32              
 4   state                      8006424 non-null  category           
 5   facility_id                8006424 non-null  int32              
 6   unit_id_epa                8006424 non-null  object             
 7   operating_time_hours       8003928 non-null  float32            
 8   gross_load_mw              8006424 non-null  float32            
 9   heat_content_mmbtu         8006424 non-null  float32            
 10  steam_load_1000_lbs        33252 non-null 

## Test Performance of different sources

In [12]:
# Test helper shipped with pudl_catalog. Judging by the output of the cells
# below, its test_* methods read the EPA CEMS data in several ways and log the
# elapsed time of each read.
from pudl_catalog.hourly_emissions_epacems import TestEpaCemsParquet
epacems_tester = TestEpaCemsParquet()

In [13]:
# Time direct read_parquet calls for each combination of local vs. GCS storage
# and single-file vs. partitioned layout (see the logged output).
# NOTE(review): verify_df=False presumably skips comparing the resulting
# dataframes -- confirm against TestEpaCemsParquet.
epacems_tester.test_direct(years=TEST_YEARS, states=TEST_STATES, verify_df=False)

read_parquet, protocol='local', partition=False, years=[2019, 2020], states=['ID', 'CO', 'TX']:
    elapsed time: 2.37s
read_parquet, protocol='local', partition=True, years=[2019, 2020], states=['ID', 'CO', 'TX']:
    elapsed time: 2.50s
read_parquet, protocol='gs', partition=False, years=[2019, 2020], states=['ID', 'CO', 'TX']:
    elapsed time: 13.26s
read_parquet, protocol='gs', partition=True, years=[2019, 2020], states=['ID', 'CO', 'TX']:
    elapsed time: 35.42s


In [29]:
# Time the same reads through the Intake catalog, with locally cached data.
# NOTE(review): in the recorded run the single-file intake reads (~5 s) are
# within a few seconds of the direct local read (~2 s), but the partitioned
# intake reads are considerably slower (14-18 s).
epacems_tester.test_intake(years=TEST_YEARS, states=TEST_STATES, verify_df=False)

read_parquet, protocol='local', partition=False, years=[2019, 2020], states=['ID', 'CO', 'TX']:
    elapsed time: 2.20s
intake, protocol='local', partition=False, years=[2019, 2020], states=['ID', 'CO', 'TX']:
    elapsed time: 4.90s
intake, protocol='local', partition=True, years=[2019, 2020], states=['ID', 'CO', 'TX']:
    elapsed time: 14.31s
intake, protocol='gs', partition=False, years=[2019, 2020], states=['ID', 'CO', 'TX']:
    elapsed time: 5.20s
intake, protocol='gs', partition=True, years=[2019, 2020], states=['ID', 'CO', 'TX']:
    elapsed time: 18.26s


### Verify we get the same data

In [26]:
%%time
# Read a single year-state partition file straight from the bucket.
df1 = pd.read_parquet(
    path="gs://intake.catalyst.coop/test/hourly_emissions_epacems/epacems-2020-ID.parquet",
)

CPU times: user 84.5 ms, sys: 39.3 ms, total: 124 ms
Wall time: 390 ms


In [27]:
%%time
# Fetch the same year and state through the Intake catalog instead.
idaho_2020_filter = year_state_filter(years=[2020], states=["ID"])
df2 = (
    pudl_cat.hourly_emissions_epacems(filters=idaho_2020_filter)
    .to_dask()
    .compute()
)

CPU times: user 496 ms, sys: 12.3 ms, total: 509 ms
Wall time: 6.21 s


In [28]:
# Raises AssertionError if the direct read (df1) and the catalog read (df2)
# differ; no output means the two dataframes are equal.
pd.testing.assert_frame_equal(df1, df2)

In [18]:
import pyarrow.parquet as pq
from pprint import pprint

# Load one local partition file as an Arrow table so the types stored in the
# Parquet file can be inspected directly.
epacems_pq = pq.read_table("../data/hourly_emissions_epacems/epacems-2020-ID.parquet")

# Map each column name to its Arrow type, keeping the schema's column order.
epacems_schema = epacems_pq.schema
dtype_dict = dict(zip(epacems_schema.names, epacems_schema.types))
pprint(dtype_dict, indent=4, sort_dicts=False)

{   'plant_id_eia': DataType(int32),
    'unitid': DataType(string),
    'operating_datetime_utc': TimestampType(timestamp[ms, tz=UTC]),
    'year': DataType(int32),
    'state': DictionaryType(dictionary<values=string, indices=int32, ordered=0>),
    'facility_id': DataType(int32),
    'unit_id_epa': DataType(string),
    'operating_time_hours': DataType(float),
    'gross_load_mw': DataType(float),
    'heat_content_mmbtu': DataType(float),
    'steam_load_1000_lbs': DataType(float),
    'so2_mass_lbs': DataType(float),
    'so2_mass_measurement_code': DictionaryType(dictionary<values=string, indices=int32, ordered=0>),
    'nox_rate_lbs_mmbtu': DataType(float),
    'nox_rate_measurement_code': DictionaryType(dictionary<values=string, indices=int32, ordered=0>),
    'nox_mass_lbs': DataType(float),
    'nox_mass_measurement_code': DictionaryType(dictionary<values=string, indices=int32, ordered=0>),
    'co2_mass_tons': DataType(float),
    'co2_mass_measurement_code': DictionaryType(

In [19]:
# Display the full Arrow schema, including nullability and field metadata.
epacems_pq.schema

plant_id_eia: int32 not null
  -- field metadata --
  description: 'The unique six-digit facility identification number, also' + 69
unitid: string not null
  -- field metadata --
  description: 'Facility-specific unit id (e.g. Unit 4)'
operating_datetime_utc: timestamp[ms, tz=UTC] not null
  -- field metadata --
  description: 'Date and time measurement began (UTC).'
year: int32 not null
  -- field metadata --
  description: 'Year the data was reported in, used for partitioning EPA ' + 5
state: dictionary<values=string, indices=int32, ordered=0>
  -- field metadata --
  description: 'Two letter US state abbreviation.'
facility_id: int32
  -- field metadata --
  description: 'New EPA plant ID.'
unit_id_epa: string
  -- field metadata --
  description: 'Emissions (smokestake) unit monitored by EPA CEMS.'
operating_time_hours: float
  -- field metadata --
  description: 'Length of time interval measured.'
gross_load_mw: float not null
  -- field metadata --
  description: 'Average power i

In [20]:
# Check whether the schema carries pandas-specific metadata; True means that
# it does not.
epacems_pq.schema.pandas_metadata is None

True

In [21]:
# Show the year-state filters that the timing cells below reuse.
filters

[[('year', '=', 2019), ('state', '=', 'ID')],
 [('year', '=', 2019), ('state', '=', 'CO')],
 [('year', '=', 2019), ('state', '=', 'TX')],
 [('year', '=', 2020), ('state', '=', 'ID')],
 [('year', '=', 2020), ('state', '=', 'CO')],
 [('year', '=', 2020), ('state', '=', 'TX')]]

In [22]:
%%time
# Single Parquet file on local disk, filtered by year and state.
single_file_local = pd.read_parquet(
    path="../data/hourly_emissions_epacems.parquet",
    filters=filters,
)

CPU times: user 2.85 s, sys: 970 ms, total: 3.82 s
Wall time: 2.44 s


In [23]:
%%time
# Directory of year-state partitioned Parquet files on local disk.
multi_file_local = pd.read_parquet(
    path="../data/hourly_emissions_epacems",
    filters=filters,
)

CPU times: user 4.71 s, sys: 967 ms, total: 5.68 s
Wall time: 2.34 s


In [24]:
%%time
# Single Parquet file read directly from the Google Cloud Storage bucket.
single_file_remote = pd.read_parquet(
    path="gs://intake.catalyst.coop/test/hourly_emissions_epacems.parquet",
    filters=filters,
)

CPU times: user 4.21 s, sys: 1.19 s, total: 5.4 s
Wall time: 13.3 s


In [25]:
%%time
# Year-state partitioned Parquet files read directly from the bucket.
multi_file_remote = pd.read_parquet(
    path="gs://intake.catalyst.coop/test/hourly_emissions_epacems",
    filters=filters,
)

CPU times: user 14.9 s, sys: 2.79 s, total: 17.7 s
Wall time: 34.1 s
