<a href="https://www.kaggle.com/code/catalystcooperative/01-pudl-data-access?scriptVersionId=146018996" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
# Standard-library modules first, then third-party packages.
# The runtime is the kaggle/python Docker image
# (https://github.com/kaggle/docker-python), which ships with these
# analytics libraries pre-installed.
import os
import pathlib
import sys

import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import sqlalchemy as sa  # database connections

# Record the interpreter and library versions alongside the results.
print(f"Python version: {sys.version}")
print(f"{np.__version__=}")
print(f"{pd.__version__=}")
print(f"{sa.__version__=}")

# Attached datasets live in the read-only "../input/" directory.
# Walk it and print every file, sorted by name within each directory.
for folder, _, names in os.walk("/kaggle/input"):
    for name in sorted(names):
        print(os.path.join(folder, name))

# Up to 20GB can be written to the current directory (/kaggle/working/); it is
# preserved as output when a version is created with "Save & Run All".
# Temporary files can go in /kaggle/temp/, but they are not saved outside of
# the current session.
pudl_path = pathlib.Path("/kaggle/input/pudl-project")

Python version: 3.10.12 | packaged by conda-forge | (main, Jun 23 2023, 22:40:32) [GCC 12.3.0]
np.__version__='1.23.5'
pd.__version__='2.0.3'
sa.__version__='2.0.17'
/kaggle/input/pudl-project/censusdp1tract.sqlite
/kaggle/input/pudl-project/ferc1.sqlite
/kaggle/input/pudl-project/ferc1_xbrl.sqlite
/kaggle/input/pudl-project/ferc1_xbrl_datapackage.json
/kaggle/input/pudl-project/ferc1_xbrl_taxonomy_metadata.json
/kaggle/input/pudl-project/ferc2.sqlite
/kaggle/input/pudl-project/ferc2_xbrl.sqlite
/kaggle/input/pudl-project/ferc2_xbrl_datapackage.json
/kaggle/input/pudl-project/ferc2_xbrl_taxonomy_metadata.json
/kaggle/input/pudl-project/ferc6.sqlite
/kaggle/input/pudl-project/ferc60.sqlite
/kaggle/input/pudl-project/ferc60_xbrl.sqlite
/kaggle/input/pudl-project/ferc60_xbrl_datapackage.json
/kaggle/input/pudl-project/ferc60_xbrl_taxonomy_metadata.json
/kaggle/input/pudl-project/ferc6_xbrl.sqlite
/kaggle/input/pudl-project/ferc6_xbrl_datapackage.json
/kaggle/input/pudl-project/ferc6_xbrl_

### Visualization settings

In [2]:
import matplotlib

In [3]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [4]:
# Default to large, high-resolution figures.
matplotlib.rcParams.update(
    {
        "figure.figsize": (16, 10),
        "figure.dpi": 150,
    }
)

# Raise the pandas display limits so wide tables and long text cells are
# shown with less truncation.
for option_name, option_value in (
    ("display.max_columns", 100),
    ("display.max_rows", 100),
    ("display.max_colwidth", 1000),
):
    pd.set_option(option_name, option_value)

# Access the PUDL DB
- Most of the PUDL Project data is distributed using SQLite databases.
- Python, pandas, and many other libraries have built-in support for reading data from SQLite.
- We can use the SQLAlchemy library to create a connection to the database that Pandas can use.
- Documentation of the available tables and columns can be found in the [PUDL Data Dictionary](https://catalystcoop-pudl.readthedocs.io/en/latest/data_dictionaries/pudl_db.html)

In [5]:
# Build the SQLite URL from the database file inside the PUDL input directory.
pudl_db_file = pudl_path / "pudl.sqlite"
pudl_engine = sa.create_engine(f"sqlite:///{pudl_db_file}")

## Read EIA plant data from the PUDL SQLite Database
* There are lots of different kinds of data in the PUDL DB.
* Some tables describe attributes of utilities, plants, generators, and balancing authorities.
* Other tables contain hourly, monthly, or yearly time series of fuel consumed, operating costs, or electricity generated.
* We'll read the EIA plants table, and use `.convert_dtypes()` to ensure that we don't get any `object` columns since SQLite's data types aren't as rich as those available from Pandas.
* Power plants are industrial facilities operated by a single utility, but they can have multiple owners, and host multiple generation units.
* The `plants_eia` table contains only information that pertains to all of the equipment at the plant.
* Most of these attributes are relatively stable, but they can change slowly over time, so each plant has one record for each reporting year (identified by the `report_date` column).

In [6]:
%%time
# Read the whole table by name, then let pandas pick its nullable extension
# dtypes (Int64, string, ...) in place of generic `object` columns.
plants_eia = (
    pd.read_sql("denorm_plants_eia", con=pudl_engine)
    .convert_dtypes()
)
plants_eia.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200177 entries, 0 to 200176
Data columns (total 54 columns):
 #   Column                                        Non-Null Count   Dtype         
---  ------                                        --------------   -----         
 0   plant_id_eia                                  200177 non-null  Int64         
 1   plant_name_eia                                199398 non-null  string        
 2   city                                          190994 non-null  string        
 3   county                                        190479 non-null  string        
 4   latitude                                      192211 non-null  Float64       
 5   longitude                                     194840 non-null  Float64       
 6   state                                         199317 non-null  string        
 7   street_address                                184478 non-null  string        
 8   zip_code                                      193575 n

In [7]:
# Show ten randomly chosen plant records to get a feel for the columns.
plants_eia.sample(n=10)

Unnamed: 0,plant_id_eia,plant_name_eia,city,county,latitude,longitude,state,street_address,zip_code,timezone,report_date,ash_impoundment,ash_impoundment_lined,ash_impoundment_status,balancing_authority_code_eia,balancing_authority_name_eia,datum,energy_storage,ferc_cogen_docket_no,ferc_cogen_status,ferc_exempt_wholesale_generator_docket_no,ferc_exempt_wholesale_generator,ferc_small_power_producer_docket_no,ferc_small_power_producer,ferc_qualifying_facility_docket_no,grid_voltage_1_kv,grid_voltage_2_kv,grid_voltage_3_kv,iso_rto_code,liquefied_natural_gas_storage,natural_gas_local_distribution_company,natural_gas_storage,natural_gas_pipeline_name_1,natural_gas_pipeline_name_2,natural_gas_pipeline_name_3,nerc_region,net_metering,pipeline_notes,primary_purpose_id_naics,regulatory_status_code,reporting_frequency_code,sector_id_eia,sector_name_eia,service_area,transmission_distribution_owner_id,transmission_distribution_owner_name,transmission_distribution_owner_state,utility_id_eia,water_source,data_maturity,plant_id_pudl,utility_name_eia,utility_id_pudl,balancing_authority_code_eia_consistent_rate
133524,56896,AMERESCO Jefferson City,Jefferson City,Cole,38.552371,-92.16,MO,8432 No More Victims Rd,65101,America/Chicago,2017-01-01,False,False,,MISO,"Midcontinent Independent Transmission System Operator, Inc..",,False,,False,,False,08-71-000,True,,34.5,,,,False,,False,,,,SERC,,,22,NR,,2.0,NAICS-22 Non-Cogen,,19436.0,Union Electric Co - (MO),MO,56102,,final,5291,AMERESCO Jefferson City LLC,470,1.0
163195,59505,Sun Devil Solar,Littleton,Warren,36.432,-77.9703,NC,US Highway 158,27850,America/New_York,2014-01-01,False,False,,CPLE,Progress Energy Carolinas - EAST,,,,False,,False,,False,,,,,,,,,,,,SERC,False,,22,NR,,2.0,IPP Non-CHP,,3046.0,Duke Energy Progress - (NC),NC,58658,,final,7236,Sunlight Partners,5042,1.0
120472,56040,Sweetheart Cup Owings Mills,Owings Mills,Baltimore,39.4,-76.7667,MD,10100 Reisterstown Rd,21117,America/New_York,2002-01-01,,,,PJM,,,,,True,,,,,,,,,,,,,,,,MAAC,,,22,,,,,Baltimore Gas & Electric Co,,,,19166,Municipality,final,8160,Trigen-Cinergy Solutions Owing,8262,1.0
160850,59242,Beverly,Beverly,Essex,42.59,-70.91194,MA,3 Cailin Road,1915,America/New_York,2013-01-01,,,,ISNE,ISO New England Inc.,,,,False,,False,QF-14-132-000,True,,23.0,,,,,,,,,,NPCC,,,22,NR,,2.0,IPP Non-CHP,,11804.0,Massachusetts Electric Co,NY,58801,,final,7069,"Integrys MA Solar, LLC",2109,1.0
89076,50819,ESI,Tracy,Alameda,37.739167,-121.4322,CA,14740 Altamont Pass Road,95376,America/Los_Angeles,2020-01-01,,False,,CISO,California Independent System Operator,,False,,False,,False,QF05-78,True,,,,,,,,,,,,WECC,,,22,NR,,2.0,IPP Non-CHP,,14328.0,Pacific Gas & Electric Co,CA,58661,Municipality,final,7967,"Sustainable Power Group, LLC",3381,1.0
191170,63697,Cranberry Solar,Middleborough,Plymouth,41.871955,-70.90779,MA,97 Wood Street,2346,America/New_York,2021-01-01,,,,ISNE,ISO New England Inc.,,False,,False,,False,,False,,13.8,,,,,,,,,,NPCC,,,22,NR,A,2.0,NAICS-22 Non-Cogen,,12473.0,Town of Middleborough - (MA),MA,63409,,final,14199,"GWE Cranberry Solar RT, LLC",6748,1.0
166819,59901,Albertson Solar LLC,,Duplin,35.093021,-77.81267,NC,,28508,America/New_York,2018-01-01,False,False,,CPLE,Duke Energy Progress East,,False,,False,,False,QF 14-637-000,True,,34.5,,,,False,,False,,,,SERC,,,22,NR,A,2.0,NAICS-22 Non-Cogen,,3046.0,Duke Energy Progress - (NC),NC,59674,,final,7448,Albertson Solar LLC,436,1.0
195936,64901,WAL3320,Palmhurst,Hidalgo,26.260742,-98.315859,TX,215 E Mile 3 Rd,78573,America/Chicago,2021-01-01,,,,ERCO,"Electric Reliability Council of Texas, Inc.",,False,,False,,False,,False,,0.48,,,,False,TEXAS GAS SERVICE,False,,,,TRE,,,441,NR,A,4.0,,,3278.0,AEP Texas Central Company,TX,64315,,final,16417,"Walmart Stores Texas, LLC",7914,1.0
127906,56496,ConocoPhillips Billings Refinery,Billings,Yellowstone,45.7769,-108.4911,MT,401 South 23rd Street,59101,America/Denver,2016-01-01,False,False,,NWMT,NorthWestern Energy (NWMT),,False,,False,,False,,False,,100.0,,,,False,,False,NORTHWESTERN ENERGY,,,WECC,,,32411,NR,,6.0,Industrial NAICS Non-Cogen,,13809.0,NorthWestern Energy - (SD),SD,54916,Municipality,final,5016,Phillips 66 Billings Refinery,2841,0.909091
19647,1407,A B Paterson,New Orleans,Orleans,30.015833,-90.02556,LA,5400 Dwyer road,70126,America/Chicago,2012-01-01,,,,MISO,,,,,False,,False,,False,,115.0,,,,,,,,,,SERC,,,22,RE,,1.0,Electric Utility,,13478.0,Entergy New Orleans Inc,LA,13478,Inner Harbor Navigation Canal,final,1057,Entergy New Orleans Inc,110,1.0


# Read Hourly Generation Timeseries from Parquet
* The full hourly emissions time series for thousands of US power plants covering 1995-2022 contains almost a billion records.
* The data is stored in a single [Apache Parquet file](https://parquet.apache.org/) with row-groups defined by year and state.
* This compressed columnar format enables very efficient queries with appropriate tooling, including [Dask](https://www.dask.org/) and [PyArrow](https://arrow.apache.org/docs/python/index.html).
* Reading the entire dataset into memory at once will probably exceed the available RAM.
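* The filters use [Disjunctive Normal Form](https://blog.datasyndrome.com/python-and-parquet-performance-e71da65269ce) (DNF): the conditions within each inner list are combined with AND, and the inner lists are combined with OR.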
* Using Dask's lazy evaluation and the filter criteria, we can minimize the data read off of disk and limit memory usage.
* The Dask project has lots of [tutorials and documentation](https://www.dask.org/get-started) if you want to learn more.
* Other tools like [DuckDB](https://duckdb.org/docs/data/parquet/overview.html) ([Python API](https://duckdb.org/docs/api/python/overview)) also provide good Parquet support. 

## Using Dask to selectively read Parquet data

In [8]:
%%time 
import dask.dataframe as dd

# The filter is in Disjunctive Normal Form: the conditions inside each inner
# list are ANDed together, and the inner lists are ORed with one another.
# One inner list per (year, state) pair therefore selects every 2019 and 2020
# record for CO and WY.
state_year_filters = [
    [("year", "=", year), ("state", "=", state)]
    for year in (2019, 2020)
    for state in ("CO", "WY")
]

# Describe the filtered read lazily first, then materialize it in memory.
co_wy_lazy = dd.read_parquet(
    f"{pudl_path}/hourly_emissions_epacems.parquet",
    engine="pyarrow",
    dtype_backend="pyarrow",
    filters=state_year_filters,
)
co_wy_cems = co_wy_lazy.compute()
co_wy_cems.info()

Use the `index` argument to set a sorted column as your index to create a DataFrame collection with known `divisions`.


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1662192 entries, 0 to 1662191
Data columns (total 16 columns):
 #   Column                     Non-Null Count    Dtype                                                       
---  ------                     --------------    -----                                                       
 0   plant_id_eia               1662192 non-null  int32[pyarrow]                                              
 1   plant_id_epa               1662192 non-null  int32[pyarrow]                                              
 2   emissions_unit_id_epa      1662192 non-null  string                                                      
 3   operating_datetime_utc     1662192 non-null  timestamp[ms, tz=UTC][pyarrow]                              
 4   year                       1662192 non-null  int32[pyarrow]                                              
 5   state                      1662192 non-null  dictionary<values=string, indices=int32, ordered=0>[pyarrow]

In [9]:
# Ten random hourly records from the CO/WY selection.
co_wy_cems.sample(n=10)

Unnamed: 0,plant_id_eia,plant_id_epa,emissions_unit_id_epa,operating_datetime_utc,year,state,operating_time_hours,gross_load_mw,heat_content_mmbtu,steam_load_1000_lbs,so2_mass_lbs,so2_mass_measurement_code,nox_mass_lbs,nox_mass_measurement_code,co2_mass_tons,co2_mass_measurement_code
1250337,55453,55453,2,2019-01-20 16:00:00+00:00,2019,CO,0.0,,,,,,,,,
1165261,50707,50707,S005,2019-05-05 20:00:00+00:00,2019,CO,1.0,31.0,262.899994,,0.158,Measured,22.872,Calculated,15.6,Measured
580384,56998,56998,CT06,2020-07-31 23:00:00+00:00,2020,CO,0.82,15.0,168.755997,,0.101,Measured,1.013,Calculated,10.004,Measured
59086,478,478,2,2020-03-25 05:00:00+00:00,2020,CO,0.0,,,,,,,,,
1054418,6761,6761,C,2019-09-09 09:00:00+00:00,2019,CO,0.0,,,,,,,,,
483851,55505,55505,BR2,2020-08-04 18:00:00+00:00,2020,CO,0.0,,,,,,,,,
652158,4162,4162,2,2019-10-08 13:00:00+00:00,2019,WY,1.0,222.0,2281.899902,,409.700012,Measured,559.065979,Calculated,239.300003,Measured
1107211,10682,10682,GT2,2019-09-19 02:00:00+00:00,2019,CO,0.0,,,,,,,,,
914406,492,492,7,2019-09-15 13:00:00+00:00,2019,CO,1.0,70.0,753.0,,64.900002,Measured,164.154007,Calculated,79.0,Measured
190691,6761,6761,A,2020-03-20 18:00:00+00:00,2020,CO,0.0,,,,,,,,,


## Read all Colorado Emissions Data

In [10]:
%%time
# A single condition keeps every year of data, but only for Colorado.
colorado_lazy = dd.read_parquet(
    f"{pudl_path}/hourly_emissions_epacems.parquet",
    engine="pyarrow",
    dtype_backend="pyarrow",
    filters=[("state", "=", "CO")],
)
# Materialize the filtered rows as an in-memory DataFrame.
colorado_cems = colorado_lazy.compute()
colorado_cems.info()

Use the `index` argument to set a sorted column as your index to create a DataFrame collection with known `divisions`.


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13631472 entries, 0 to 13631471
Data columns (total 16 columns):
 #   Column                     Dtype                                                       
---  ------                     -----                                                       
 0   plant_id_eia               int32[pyarrow]                                              
 1   plant_id_epa               int32[pyarrow]                                              
 2   emissions_unit_id_epa      string                                                      
 3   operating_datetime_utc     timestamp[ms, tz=UTC][pyarrow]                              
 4   year                       int32[pyarrow]                                              
 5   state                      dictionary<values=string, indices=int32, ordered=0>[pyarrow]
 6   operating_time_hours       float[pyarrow]                                              
 7   gross_load_mw              float[pyarrow]  

In [11]:
# Ten random hourly records from the Colorado data.
colorado_cems.sample(n=10)

Unnamed: 0,plant_id_eia,plant_id_epa,emissions_unit_id_epa,operating_datetime_utc,year,state,operating_time_hours,gross_load_mw,heat_content_mmbtu,steam_load_1000_lbs,so2_mass_lbs,so2_mass_measurement_code,nox_mass_lbs,nox_mass_measurement_code,co2_mass_tons,co2_mass_measurement_code
2093594,478,478,3,2015-02-21 09:00:00+00:00,2015,CO,0.0,,,,,,,,,
11041754,56445,56445,CT-01,2019-08-06 09:00:00+00:00,2019,CO,1.0,82.0,983.799988,,0.59,Substitute,38.368,Calculated,58.5,Measured and Substitute
9039193,6761,6761,C,2004-02-06 08:00:00+00:00,2004,CO,,,,,,,,,,
12255267,55453,55453,6,2007-08-11 10:00:00+00:00,2007,CO,0.0,,,,,,,,,
4743972,6112,6112,5,2013-11-01 19:00:00+00:00,2013,CO,0.0,,,,,,,,,
1549491,525,525,H1,2010-01-14 10:00:00+00:00,2010,CO,1.0,200.0,2207.0,,297.0,Substitute,1008.598999,Calculated,226.399994,Measured and Substitute
4005374,468,468,2,2008-06-07 21:00:00+00:00,2008,CO,1.0,48.0,567.299988,,606.5,Measured,221.813995,Calculated,58.200001,Measured
2256454,6761,6761,A,2015-09-25 05:00:00+00:00,2015,CO,0.0,,,,,,,,,
377360,55283,55283,2,2022-08-01 15:00:00+00:00,2022,CO,1.0,114.0,895.400024,,0.537,Measured,26.862,Calculated,53.200001,Measured
1630011,6112,6112,5,2010-03-25 10:00:00+00:00,2010,CO,0.0,,,,,,,,,


## Visualize Hourly Power Plant Operations
* Let's find a particular power plant and look at its long-term operations.
* Say we want to investigate [Xcel Energy's troubled Comanche coal plant](https://coloradosun.com/?s=comanche%20pueblo) in Pueblo, CO.
* The EPA CEMS data only contains the EIA Plant ID, not the plant's name or any ownership information.
* The PUDL database links these IDs to much more extensive EIA data.
* We can look for the Comanche plant in the PUDL DB and use that information to select the appropriate EPA CEMS data to plot.

In [12]:
# Case-insensitive substring match on the plant name. Some plant records have
# no name, so `na=False` makes those rows explicitly non-matching rather than
# leaving <NA> entries in the boolean mask.
is_comanche = plants_eia["plant_name_eia"].str.contains(
    "comanche", case=False, na=False
)

# Identifying and location columns only; the annual records for a plant mostly
# repeat these values, so duplicates are dropped to leave one row per distinct
# combination.
plant_summary_columns = [
    "plant_id_eia",
    "plant_name_eia",
    "utility_name_eia",
    "city",
    "state",
    "latitude",
    "longitude",
]
plants_eia.loc[is_comanche, plant_summary_columns].drop_duplicates()

Unnamed: 0,plant_id_eia,plant_name_eia,utility_name_eia,city,state,latitude,longitude
7460,470,Comanche,Public Service Co of Colorado,Pueblo,CO,38.2081,-104.5747
50775,6145,Comanche Peak,Luminant Generation Company LLC,Glen Rose,TX,32.298365,-97.78552
50788,6145,Comanche Peak,TXU Generation Co LP,Glen Rose,TX,32.298365,-97.78552
67387,8059,Comanche,Public Service Co of Oklahoma,Lawton,OK,34.5431,-98.3244
164570,59656,Comanche Solar,Novatus Energy,Pueblo,CO,38.205278,-104.5667
164575,59656,Comanche Solar,Comanche LLC,Pueblo,CO,38.205278,-104.5667


In [13]:
# 470 is the EIA plant ID of the Comanche plant in Pueblo, CO, as found in the
# plant lookup.
is_comanche_plant = colorado_cems["plant_id_eia"].eq(470)
comanche_cems = colorado_cems.loc[is_comanche_plant]
comanche_cems.info()

<class 'pandas.core.frame.DataFrame'>
Index: 569760 entries, 26280 to 13044551
Data columns (total 16 columns):
 #   Column                     Non-Null Count   Dtype                                                       
---  ------                     --------------   -----                                                       
 0   plant_id_eia               569760 non-null  int32[pyarrow]                                              
 1   plant_id_epa               569760 non-null  int32[pyarrow]                                              
 2   emissions_unit_id_epa      569760 non-null  string                                                      
 3   operating_datetime_utc     569760 non-null  timestamp[ms, tz=UTC][pyarrow]                              
 4   year                       569760 non-null  int32[pyarrow]                                              
 5   state                      569760 non-null  dictionary<values=string, indices=int32, ordered=0>[pyarrow]
 6   ope

In [14]:
# Peek at the first twenty hourly records for the plant.
comanche_cems.head(n=20)

Unnamed: 0,plant_id_eia,plant_id_epa,emissions_unit_id_epa,operating_datetime_utc,year,state,operating_time_hours,gross_load_mw,heat_content_mmbtu,steam_load_1000_lbs,so2_mass_lbs,so2_mass_measurement_code,nox_mass_lbs,nox_mass_measurement_code,co2_mass_tons,co2_mass_measurement_code
26280,470,470,1,2022-01-01 07:00:00+00:00,2022,CO,1.0,253.0,2320.300049,,169.399994,Measured,292.358002,Calculated,238.5,Measured
26281,470,470,1,2022-01-01 08:00:00+00:00,2022,CO,1.0,254.0,2376.199951,,140.399994,Measured,304.153992,Calculated,244.5,Measured
26282,470,470,1,2022-01-01 09:00:00+00:00,2022,CO,1.0,254.0,2404.899902,,127.199997,Measured,317.446991,Calculated,247.300003,Measured
26283,470,470,1,2022-01-01 10:00:00+00:00,2022,CO,1.0,254.0,2427.399902,,117.599998,Measured,325.272003,Calculated,249.699997,Measured
26284,470,470,1,2022-01-01 11:00:00+00:00,2022,CO,1.0,254.0,2394.399902,,139.600006,Measured,304.088989,Calculated,246.5,Measured
26285,470,470,1,2022-01-01 12:00:00+00:00,2022,CO,1.0,254.0,2460.600098,,146.899994,Measured,327.26001,Calculated,253.300003,Measured
26286,470,470,1,2022-01-01 13:00:00+00:00,2022,CO,1.0,254.0,2452.0,,140.800003,Measured,338.376007,Calculated,252.399994,Measured
26287,470,470,1,2022-01-01 14:00:00+00:00,2022,CO,1.0,236.0,2399.600098,,145.800003,Measured,347.941986,Calculated,246.800003,Measured
26288,470,470,1,2022-01-01 15:00:00+00:00,2022,CO,1.0,209.0,2349.800049,,147.5,Measured,312.52301,Calculated,241.699997,Measured
26289,470,470,1,2022-01-01 16:00:00+00:00,2022,CO,1.0,214.0,2360.399902,,150.199997,Measured,302.131012,Calculated,242.899994,Measured
