# Notebook Preamble

## IPython Magic

In [None]:
# Enable IPython's autoreload extension so that edits to imported modules
# (e.g. the editable install of this repo) are picked up without restarting
# the kernel. Mode 3 is the most inclusive reload mode -- see the autoreload
# documentation for how it differs from mode 2.
%load_ext autoreload
%autoreload 3

## Notebook Imports

In [None]:
!pip install -e ../

In [None]:
# Standard Library Imports
import logging
import os
import sys
from pathlib import Path

# We need to set these environment variables prior to importing our intake catalog.
# You can also set them in your own shell environment instead.
# PUDL_INTAKE_CACHE is presumably the directory in which remotely read data is
# cached locally -- confirm against the catalog definition.
os.environ["PUDL_INTAKE_CACHE"] = str(Path.home() / ".cache/intake-pudl")

# Exactly one PUDL_INTAKE_PATH assignment below should be active at a time.
# The fastest remote data, requires authentication for now.
os.environ["PUDL_INTAKE_PATH"] = "gcs://catalyst.coop/intake/test"

# Available to the anonymous public, but not yet working.
# os.environ["PUDL_INTAKE_PATH"] = "https://storage.googleapis.com/catalyst.coop/intake/test"

# Local data if you've got it! (`parent` is a property of Path, not a method.)
# os.environ["PUDL_INTAKE_PATH"] = str(Path.cwd().parent / "data")

# 3rd Party Imports:
# These must come after the environment variables above have been set.
import intake
import pandas as pd
from pudl_catalog.helpers import year_state_filter

# Years and states used to subset the data in the cells that follow.
TEST_YEARS = [2019, 2020]
TEST_STATES = ["ID", "CO", "TX"]

## Set up a logger

In [None]:
# Route root-logger output at INFO and above to stdout, printing only the
# bare message text. Assigning the handler list outright (instead of appending)
# keeps this cell idempotent: re-running it never produces duplicate log lines.
formatter = logging.Formatter("%(message)s")
handler = logging.StreamHandler(stream=sys.stdout)
handler.setFormatter(formatter)

logger = logging.getLogger()
logger.handlers = [handler]
logger.setLevel(logging.INFO)

# Test Intake & Parquet Functionality & Performance

This notebook demonstrates several different ways of organizing and accessing the same EPA CEMS data:
* Local storage on disk vs. remote storage in Google Cloud Storage buckets
* Directly accessing the data via `pandas.read_parquet()` vs. an Intake catalog.
* Using one big Parquet file for all data vs. separate small files for each combination of state & year.

## Data for local catalog testing
Download these files and place them in the `data` directory at the top level of the repo. Make sure you extract the tarball.
* Single Parquet file: https://storage.googleapis.com/catalyst.coop/intake/test/hourly_emissions_epacems.parquet
* Year-state partitioned data: https://storage.googleapis.com/catalyst.coop/intake/test/hourly_emissions_epacems.tar

## What Intake data sources are installed?

In [None]:
# Names of the entries available in intake's built-in catalog.
list(intake.cat)

In [None]:
# Keep a handle on the PUDL catalog and list the data sources it contains.
pudl_cat = intake.cat.pudl_cat
list(pudl_cat)

In [None]:
# Display the catalog object's own representation.
pudl_cat

In [None]:
# Display the catalog entry for the hourly EPA CEMS data.
pudl_cat.hourly_emissions_epacems

### Parquet metadata with `discover()`
* Categorical values showing up as integers.
* String values showing up as objects.
* No length in the shape, but 19 columns.
* `npartitions` apparently refers to file-based partitions, not row-group-based partitions.

In [None]:
%%time
# Time how long it takes to retrieve the source's metadata, and display it.
pudl_cat.hourly_emissions_epacems.discover()

## Normal usage

In [None]:
%%time
print(f"Reading data from {os.getenv('PUDL_INTAKE_PATH')}")
filters = year_state_filter(
    years=TEST_YEARS,
    states=TEST_STATES,
)
display(filters)
epacems_df = (
    pudl_cat.hourly_emissions_epacems(filters=filters)
    .to_dask().compute()
)

In [None]:
# Spot-check 20 randomly chosen rows (the selection differs on every run).
epacems_df.sample(20)

In [None]:
# Column dtypes and non-null counts; memory_usage="deep" reports the real
# memory footprint, including the contents of object columns.
epacems_df.info(show_counts=True, memory_usage="deep")

## Test Performance of different sources

In [None]:
# Helper class from the package that is used in the following cells to
# exercise the different ways of reading the EPA CEMS data.
from pudl_catalog.hourly_emissions_epacems import TestEpaCemsParquet
epacems_tester = TestEpaCemsParquet()

In [None]:
# Presumably reads the test years/states directly (without the intake catalog)
# and skips dataframe verification -- confirm against TestEpaCemsParquet.
epacems_tester.test_direct(years=TEST_YEARS, states=TEST_STATES, verify_df=False)

In [None]:
# Same years and states as the direct-read test, presumably read through the
# intake catalog this time -- confirm against TestEpaCemsParquet.
# With locally cached data, the times are similar to direct reads.
epacems_tester.test_intake(years=TEST_YEARS, states=TEST_STATES, verify_df=False)

### Verify we get the same data

In [None]:
%%time
df1 = pd.read_parquet("gcs://catalyst.coop/intake/test/hourly_emissions_epacems/epacems-2020-ID.parquet")

In [None]:
%%time
df2 = pudl_cat.hourly_emissions_epacems_partitioned(filters=year_state_filter(years=[2020], states=["ID"])).to_dask().compute()

In [None]:
# Raises AssertionError if the two dataframes differ; silent when they match.
pd.testing.assert_frame_equal(df1, df2)

In [None]:
import pyarrow.parquet as pq
from pprint import pprint

# Load one local state-year file with pyarrow and print the type of every
# column, keeping the columns in schema order.
epacems_pq = pq.read_table("../data/hourly_emissions_epacems/epacems-2020-ID.parquet")
dtype_dict = dict(zip(epacems_pq.schema.names, epacems_pq.schema.types))
pprint(dtype_dict, indent=4, sort_dicts=False)

In [None]:
# Display the table's full schema.
epacems_pq.schema

In [None]:
# True when the schema carries no pandas-specific metadata.
epacems_pq.schema.pandas_metadata is None

In [None]:
# Show the year/state filters that the timed reads below reuse.
filters

In [None]:
%%time
single_file_local = pd.read_parquet("../data/hourly_emissions_epacems.parquet", filters=filters)

In [None]:
%%time
multi_file_local = pd.read_parquet("../data/hourly_emissions_epacems", filters=filters)

In [None]:
%%time
single_file_remote = pd.read_parquet("gcs://catalyst.coop/intake/test/hourly_emissions_epacems.parquet", filters=filters)

In [None]:
%%time
multi_file_remote = pd.read_parquet("gcs://catalyst.coop/intake/test/hourly_emissions_epacems", filters=filters)