# Notebook Preamble

## IPython Magic

In [None]:
# Load IPython's autoreload extension so that edited modules are re-imported
# automatically before each cell runs (no kernel restart needed).
# NOTE(review): mode 3 requires a recent IPython — confirm the minimum version.
%load_ext autoreload
%autoreload 3

## Notebook Imports

In [None]:
# Standard Library Imports
import logging
import os
import sys
from pathlib import Path

# The intake catalog is configured through these environment variables, so
# they have to be in place before the catalog is imported further down.
# Note that this unconditionally overrides any values already present in the
# environment; remove these lines if you prefer to export them from your shell.
os.environ.update(
    {
        "PUDL_INTAKE_CACHE": str(Path.home() / ".cache/intake"),
        "PUDL_INTAKE_PATH": "gs://intake.catalyst.coop/test",
    }
)

# Local data if you've got it! Uncomment to read from ../data instead:
# os.environ["PUDL_INTAKE_PATH"] = str(Path.cwd().parent / "data")

# 3rd Party Imports:
import intake
import pandas as pd
from pudl_catalog.helpers import year_state_filter

# States and years handed to year_state_filter() in the "Normal usage" cell.
TEST_STATES = ["ID", "CO", "TX"]
TEST_YEARS = [2019, 2020]

## Set up a logger

In [None]:
# Build a stdout handler that prints only the bare log message.
formatter = logging.Formatter("%(message)s")
handler = logging.StreamHandler(stream=sys.stdout)
handler.setFormatter(formatter)

# Configure the root logger to emit INFO and above.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
# Assigning the list replaces any handlers that were already attached, so
# re-running this cell leaves exactly this one handler on the root logger.
logger.handlers = [handler]

## What Intake data sources are installed?

In [None]:
# List the entries of intake's default catalog (the installed data sources).
list(intake.cat)

In [None]:
# Keep a handle on the PUDL catalog and list the entries it contains.
pudl_cat = intake.cat.pudl_cat
list(pudl_cat)

In [None]:
# Display the catalog object itself (its notebook representation).
pudl_cat

In [None]:
# Display the `hourly_emissions_epacems` entry of the PUDL catalog.
pudl_cat.hourly_emissions_epacems

## Parquet metadata with `discover()`
* Categorical values showing up as integers.
* String values showing up as objects.
* No length in the shape, but 19 columns.
* `npartitions` apparently counts file-based partitions, not row-group-based partitions.

In [None]:
%%time
# WARNING: this takes a very long time and downloads the whole dataset.
pudl_cat.hourly_emissions_epacems.discover()

## Normal usage

In [None]:
%%time
print(f"Reading data from {os.getenv('PUDL_INTAKE_PATH')}")
print(f"Caching data to {os.getenv('PUDL_INTAKE_CACHE')}")
filters = year_state_filter(
    years=TEST_YEARS,
    states=TEST_STATES,
)
display(filters)
epacems_df = (
    pudl_cat.hourly_emissions_epacems(
        filters=filters,
    )
    .to_dask().compute()
)

In [None]:
# Show 20 randomly sampled rows of the loaded data.
epacems_df.sample(20)

In [None]:
# Summarize the columns with their non-null counts; memory_usage="deep"
# inspects the actual contents of object columns instead of estimating.
epacems_df.info(show_counts=True, memory_usage="deep")

## Verify we get identical data via different methods

In [None]:
%%time
df1 = pd.read_parquet("gs://intake.catalyst.coop/test/hourly_emissions_epacems/epacems-2020-ID.parquet")

In [None]:
%%time
df2 = pudl_cat.hourly_emissions_epacems(filters=year_state_filter(years=[2020], states=["ID"])).to_dask().compute()

In [None]:
# Raises AssertionError if the two frames differ; produces no output when
# they are equal.
pd.testing.assert_frame_equal(df1, df2)

## Inspect the Parquet file metadata

In [None]:
# Read one Parquet file directly with pyarrow and print each column's Arrow
# data type.
from pprint import pprint

import fsspec
import pyarrow.parquet as pq

epacems_pq = pq.read_table(
    "gs://intake.catalyst.coop/test/hourly_emissions_epacems/epacems-2020-ID.parquet",
    filesystem=fsspec.filesystem("gs"),
)
# The schema's names and types are parallel sequences; pair them up directly
# rather than rebuilding the mapping with a comprehension.
dtype_dict = dict(zip(epacems_pq.schema.names, epacems_pq.schema.types))
# sort_dicts=False keeps the columns in schema order instead of alphabetizing.
pprint(dtype_dict, indent=4, sort_dicts=False)

In [None]:
# Display the Arrow schema of the table read with pyarrow.
epacems_pq.schema

In [None]:
# True when the schema carries no pandas-specific metadata.
epacems_pq.schema.pandas_metadata is None