# Transform EPA's EIA-EPA Crosswalk CSV

In [34]:
import pandas as pd
import pathlib
import sqlalchemy as sa
import importlib.resources

# local imports
import pudl
import pudl.transform.eia as pte
import pudl.constants as pc

import logging
import sys

In [35]:
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [36]:
logger=logging.getLogger()
logger.setLevel(logging.INFO)
handler = logging.StreamHandler(stream=sys.stdout)
formatter = logging.Formatter('%(message)s')
handler.setFormatter(formatter)
logger.handlers = [handler]

In [37]:
pudl_settings = pudl.workspace.setup.get_defaults()
pudl_in = pathlib.Path(pudl_settings['pudl_in'])
ds = pudl.workspace.datastore.Datastore(pudl_in, sandbox=True)
pudl_engine = sa.create_engine(pudl_settings['pudl_db'])
pudl_out = pudl.output.pudltabl.PudlTabl(pudl_engine) # freq='monthly'/'annual' (maybe other abr.)

# Older notebook version

In [18]:
eia_epacems_crosswalk_csv = (
    importlib.resources.open_text(
        'pudl.package_data.glue', 
        'epa_eia_crosswalk_from_epa.csv')
)

eia_epacems_crosswalk = (
    pd.read_csv(eia_epacems_crosswalk_csv)
    .pipe(pudl.helpers.simplify_columns)
    .rename(columns={
        'oris_code': 'plant_id_epa',
        'eia_oris': 'plant_id_eia',
        'unit_id': 'epa_point_source_unit',
        'facility_name': 'plant_name_eia',
        'unit_type': 'prime_mover_code'})
    .pipe(pudl.helpers.convert_cols_dtypes, 'eia')

#       .filter([
#           'plant_name_eia',
#           'plant_id_epa',
#           'unit_id_epa', 
#           'generator_id',
#           'boiler_id',
#           'generator_id',
#           'prime_mover_code',
#       ])
    .drop([
        'fuel_type_primary', 
        'edat_mw_cap', 
        'way_gen_id_matched', 
        'unit_op_status_date', 
        'notes',
        'annual_heat_input',
        'op_status'], axis=1)
)

In [113]:
# eia_epacems_crosswalks_crosswalk.copy()
# test[test['plant_id_eia']==55306]

### Fill in plant id mapping gaps

1. Attempt to match based on **plant name strings**


In [28]:
# Prepare EIA plant tables for integration
eia_plants = pudl_out.plants_eia860().copy()
eia_plants = eia_plants.filter(['plant_id_eia','plant_name_eia']).copy()

In [29]:
# Make subset df of EPA-EIA crosswalk *WITH* EIA ids
pre_matched_plant_eia = eia_epacems_crosswalk[eia_epacems_crosswalk['plant_id_eia'].notna()]

# Make subset df of EPA-EIA crosswalk *WITHOUT* EIA ids
missing_plant_eia = (
    eia_epacems_crosswalk
    .query("plant_id_eia.isna()")
    .drop('plant_id_eia', axis=1)
    .reset_index()
)

In [30]:
# Merge missing data on plant name
missing_merge = (
    pd.merge(
        missing_plant_eia, 
        eia_plants, 
        on='plant_name_eia', 
        how='left')
    .drop_duplicates(subset='index')
)

# Plants with a match
merge_match = missing_merge[missing_merge['plant_id_eia'].notna()]

# Plant ids that are still missing
missing_plant_eia2 = (
    missing_merge[missing_merge['plant_id_eia'].isna()]
    .drop('plant_id_eia', axis=1)
)

2. In most cases, the EPA has already associated an EIA generator_id with each EPA unit_id. In this case, if we are able to find an **EIA plant_id generator_id pair that matches an EPA plant_id generator_id pair**, we will assume that the EPA and EIA plant_ids are identical.

In [32]:
# Prepare EIA generator table for integration
eia_gen = pudl_out.gens_eia860().copy()
eia_gen = eia_gen.filter(['plant_id_eia', 'generator_id']).copy()

In [33]:
# Merge missing data on plant id and generator id
missing_merge2 = (
    pd.merge(
        missing_plant_eia2,
        eia_gen,
        left_on=['plant_id_epa', 'generator_id'],
        right_on=['plant_id_eia', 'generator_id'],
        how='left')
    .drop_duplicates(subset='index')
)

# Plants with a match
merge_match2 = missing_merge2[missing_merge2['plant_id_eia'].notna()]

# Plant ids that are still missing
missing_plant_eia3 = (
    missing_merge2[missing_merge2['plant_id_eia'].isna()]
    .drop('plant_id_eia', axis=1)
)

In [34]:
missing_merge2

Unnamed: 0,index,plant_name_eia,plant_id_epa,epa_point_source_unit,generator_id,boiler_id,prime_mover_code,plant_id_eia
0,17,Copper Station,9,CTG-1,1,,GT,9
8,49,Dolet Hills Power Station,51,1,1,1,ST,51
18,50,Smith Generating Facility,54,SCT1,GT1,,GT,54
26,51,Smith Generating Facility,54,SCT10,GT10,,GT,54
34,52,Smith Generating Facility,54,SCT2,GT2,,GT,54
...,...,...,...,...,...,...,...,...
32758,6852,Albany Green Energy LLC,60340,B0004,1,,,60340
32762,6853,Port Comfort Peaking Facility,60459,CT1,PC1,,,60459
32766,6854,Port Comfort Peaking Facility,60459,CT2,PC2,,,60459
32770,6855,"Chamon Power, LLC",60460,CT1,CH1,,,60460


3. For whatever is left, we will simply look for instances where there is an **EIA plant_id that matches an EPA plant_id**. By comparing the plant names, we should be able to identify if these plants are the same.

In [12]:
# Merge missing data on plant_id_eia and plant_id_epa
missing_merge3 = (
    pd.merge(
        missing_plant_eia3,
        eia_plants,
        left_on='plant_id_epa',
        right_on='plant_id_eia',
        how='left',
        suffixes=['_epa', '_eia'])
    .drop_duplicates(subset='index')
)

# Plants with a match
merge_match3 = missing_merge3[missing_merge3['plant_id_eia'].notna()]

# Plant ids that are still missing
missing_plant_eia4 = missing_merge3[missing_merge3['plant_id_eia'].isna()]

In [13]:
# Combine all pre-matched plants with all found matches and leftovers without a match
cleaned_eia_epacems_crosswalk = (
    pd.concat([
        pre_matched_plant_eia,
        merge_match,
        merge_match2,
        merge_match3,
        missing_plant_eia4
    ])
    .drop(['index', 'plant_name_eia_eia', 'plant_name_eia_epa'], axis=1)
)

In [14]:
cleaned_eia_epacems_crosswalk[cleaned_eia_epacems_crosswalk['plant_id_eia'].isna()]

Unnamed: 0,plant_name_eia,plant_id_epa,plant_id_eia,epa_point_source_unit,generator_id,boiler_id,prime_mover_code
0,,1594,,11,Plant not in EIA,,ST
1,,1594,,12,Plant not in EIA,,ST
2,,2440,,001,Plant not in EIA,,ST
125,,50044,,B132,Plant not in EIA,,ST
198,,50607,,23,Plant not in EIA,,ST
199,,50607,,24,Plant not in EIA,,ST
200,,50607,,26,Plant not in EIA,,ST
201,,50607,,RSB1,Plant not in EIA,,ST
202,,50607,,RSB2,Plant not in EIA,,ST
454,,55703,,P049,Plant not in EIA,,ST


# Add FRS Registry IDs 

In [171]:
#eia_epacems_crosswalk

In [117]:
from json import JSONDecodeError
from functools import lru_cache
import requests
import pandas as pd
import numpy as np
import time
import progressbar

# Cache this so we don't have to keep loading the same page
@lru_cache(512)
def download(url):
    return requests.get(url)

def one_plant_reg_id(plant_id_eia):
    url_template = "https://ofmpub.epa.gov/enviro/frs_rest_services.get_facilities?pgm_sys_acrnm=EIA-860&program_output=yes&output=JSON&pgm_sys_id={}"
    url = url_template.format(plant_id_eia)
    request = download(url)
    if request.status_code > 399:
        #print("Failed to retrieve data for ", plant_id_eia)
        return np.nan
    try:
        json_data = request.json()["Results"]
    except (JSONDecodeError, KeyError):
        #print("Failed to parse json info for ", plant_id_eia)
        return np.nan
#     try:
#         json_data2 = request.json()["Results"]["ProgramFacilities"]
#     except (JSONDecodeError, KeyError):
#         #print("Failed to parse json info for ", plant_id_eia)
#         return np.nan
    if not json_data:
        return np.nan
    frs_id = json_data["FRSFacility"][0]["RegistryId"]
    # I think program id is the same as EIA id....
    program_id = json_data["FRSFacility"][0]["ProgramFacilities"][0]["ProgramSystemId"]
    #plant_info = pd.json_normalize(json_data)
    #reg_id = plant_info.iloc[0]['RegistryId']
    return json_data

In [132]:
test = one_plant_reg_id(2)

In [133]:
test["FRSFacility"][0]["ProgramFacilities"][0]["ProgramSystemId"]

'2'

In [54]:
pudl_out.plants_eia86

Unnamed: 0,plant_id_eia,plant_name_eia,balancing_authority_code_eia,balancing_authority_name_eia,city,county,ferc_cogen_status,ferc_exempt_wholesale_generator,ferc_small_power_producer,grid_voltage_kv,...,net_metering,pipeline_notes,regulatory_status_code,transmission_distribution_owner_id,transmission_distribution_owner_name,transmission_distribution_owner_state,utility_id_eia,water_source,plant_id_pudl,utility_id_pudl
0,2,Bankhead Dam,SOCO,"Southern Company Services, Inc. - Trans",Northport,Tuscaloosa,False,False,False,115.00,...,,,RE,195,Alabama Power Co,AL,195,Black Warrior River,852,18
1,2,Bankhead Dam,SOCO,"Southern Company Services, Inc. - Trans",Northport,Tuscaloosa,False,False,False,115.00,...,,,RE,195,Alabama Power Co,AL,195,Black Warrior River,852,18
2,2,Bankhead Dam,SOCO,"Southern Company Services, Inc. - Trans",Northport,Tuscaloosa,False,False,False,115.00,...,,,RE,195,Alabama Power Co,AL,195,Black Warrior River,852,18
3,2,Bankhead Dam,SOCO,"Southern Company Services, Inc. - Trans",Northport,Tuscaloosa,False,False,False,115.00,...,,,RE,195,Alabama Power Co,AL,195,Black Warrior River,852,18
4,2,Bankhead Dam,SOCO,"Southern Company Services, Inc. - Trans",Northport,Tuscaloosa,False,False,False,115.00,...,,,RE,195,Alabama Power Co,AL,195,Black Warrior River,852,18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82116,62936,Rattlesnake,AVA,Avista Corporation,Ritzville,Adams,False,False,False,115.00,...,,,NR,20169,Avista Corp,WA,62778,,13850,
82117,62937,Athens BESS,MISO,Midcontinent Independent Transmission System O...,Athens,Isanti,False,False,False,12.47,...,,,NR,689,Connexus Energy,MN,62731,,13851,
82118,62938,Glen Ullin Energy Center,MISO,Midcontinent Independent Transmission System O...,Glen Ullin,Morton,False,False,False,230.00,...,,,NR,12647,"ALLETE, Inc.",MN,59496,,13852,450
82119,62939,South Peak Wind,NWMT,NorthWestern Energy (NWMT),Geyser,Judith Basin,False,True,True,230.00,...,,,NR,12825,NorthWestern Energy LLC - (MT),MT,59496,,13853,450


In [170]:
# HOW TO RUN THIS WITHOUT APPLY BECAUSE IT TAKES TOO LONG

#%%time
#test['FRS_regional_id'] = test['plant_id_eia'].apply(one_plant_reg_id)

#test_reg = test.assign(reg_id=lambda x: one_plant_reg_id(x.plant_id_eia))

#fill_reg_ids(eia_epacems_crosswalk)

# Test ETL
The following code is taken from the `_eia_etl` function in the `etl` module

In [42]:
eia923_years = list(range(2017, 2018))
eia860_years = list(range(2017, 2018))

In [43]:
datapkg_dir = pathlib.Path(pudl_settings["datapkg_dir"]) / 'pudl-example/eia-example'

In [46]:
# generate CSVs for the static EIA tables, return the list of tables
static_tables = pudl.etl._load_static_tables_eia(datapkg_dir)

Loading Static EIA Tables fuel_type_eia923 dataframe into CSV
Loading Static EIA Tables prime_movers_eia923 dataframe into CSV
Loading Static EIA Tables fuel_type_aer_eia923 dataframe into CSV
Loading Static EIA Tables energy_source_eia923 dataframe into CSV
Loading Static EIA Tables transport_modes_eia923 dataframe into CSV


In [47]:
sandbox = pudl_settings.get("sandbox", False)
ds = pudl.workspace.datastore.Datastore(
    pathlib.Path(pudl_settings["pudl_in"]),
    sandbox=sandbox)

In [48]:
# Extract EIA forms 923, 860
eia923_raw_dfs = pudl.extract.eia923.Extractor(ds).extract(eia923_years)
eia860_raw_dfs = pudl.extract.eia860.Extractor(ds).extract(eia860_years)


Extracting eia923 spreadsheet data.
Extracting eia860 spreadsheet data.
Columns for boiler_generator_assn are off: should be 4 but got 8
Columns for generator_existing are off: should be 76 but got 77
Columns for generator_proposed are off: should be 55 but got 56
Columns for generator_retired are off: should be 75 but got 76
Columns for ownership are off: should be 14 but got 15
Columns for plant are off: should be 46 but got 48
Columns for utility are off: should be 20 but got 21


In [49]:
# Transform EIA forms 923, 860
eia860_transformed_dfs = pudl.transform.eia860.transform(
    eia860_raw_dfs, eia860_tables=list(pc.pudl_tables['eia860']))
eia923_transformed_dfs = pudl.transform.eia923.transform(
    eia923_raw_dfs, eia923_tables=list(pc.pudl_tables['eia923']))

Transforming raw EIA 860 DataFrames for ownership_eia860 concatenated across all years.
Transforming raw EIA 860 DataFrames for generators_eia860 concatenated across all years.
Transforming raw EIA 860 DataFrames for plants_eia860 concatenated across all years.
Transforming raw EIA 860 DataFrames for boiler_generator_assn_eia860 concatenated across all years.
Transforming raw EIA 860 DataFrames for utilities_eia860 concatenated across all years.
Transforming raw EIA 923 DataFrames for generation_fuel_eia923 concatenated across all years.
Transforming raw EIA 923 DataFrames for boiler_fuel_eia923 concatenated across all years.
Transforming raw EIA 923 DataFrames for generation_eia923 concatenated across all years.
Transforming raw EIA 923 DataFrames for coalmine_eia923 concatenated across all years.
Transforming raw EIA 923 DataFrames for fuel_receipts_costs_eia923 concatenated across all years.


In [93]:
# create an eia transformed dfs dictionary
eia_transformed_dfs = eia860_transformed_dfs.copy()
eia_transformed_dfs.update(eia923_transformed_dfs.copy())

In [94]:
def _add_eia_epacems_crosswalk(eia_transformed_dfs):
    """Add the EIA-EPA crosswalk to the transformed dfs dict."""
    plants_eia = eia_transformed_dfs['plants_eia860'].copy()
    gens_eia = eia_transformed_dfs['generators_eia860'].copy()
    assn_dfs = (
        pudl.glue.eia_epacems.find_test_combine_id_matches(
            plants_eia, gens_eia)
    )
    eia_transformed_dfs.update(assn_dfs)
    
    return eia_transformed_dfs

# ADD CROSSWALK
eia_transformed_dfs = _add_eia_epacems_crosswalk(eia_transformed_dfs)

grabbing original crosswalk
separating matched from missing
running plant name match
separating matched from missing
running plant id and plant gen match
separating matched from missing
running plant id match
separating matched from missing
splitting crosswalk into three normalized tables


In [95]:
# convert types..
eia_transformed_dfs = pudl.helpers.convert_dfs_dict_dtypes(
    eia_transformed_dfs, 'eia')

  mask = arr == x
  mask = arr == x


In [96]:
# Check that the tables are there
eia_transformed_dfs.keys()

dict_keys(['ownership_eia860', 'generators_eia860', 'plants_eia860', 'boiler_generator_assn_eia860', 'utilities_eia860', 'generation_fuel_eia923', 'boiler_fuel_eia923', 'generation_eia923', 'coalmine_eia923', 'fuel_receipts_costs_eia923', 'plant_unit_epa', 'assn_plant_id_eia_epa', 'assn_gen_eia_unit_epa'])

In [98]:
entities_dfs, eia_transformed_dfs = pudl.transform.eia.transform(
    eia_transformed_dfs,
    eia860_years=eia860_years,
    eia923_years=eia923_years,
)

Harvesting IDs & consistently static attributes for EIA plants
Average consistency of static plants values is 99.98%
Harvesting IDs & consistently static attributes for EIA generators
Average consistency of static generators values is 100.00%
Harvesting IDs & consistently static attributes for EIA utilities
Average consistency of static utilities values is 100.00%
Harvesting IDs & consistently static attributes for EIA boilers
Average consistency of static boilers values is 97.58%
Inferring complete EIA boiler-generator associations.
Multiple EIA unit codes:plant_id_eia=10725, unit_id_pudl=1, unit_id_eia=['F801' 'F802']
Multiple EIA unit codes:plant_id_eia=55309, unit_id_pudl=1, unit_id_eia=['SMR1' 'SMR2']
Multiple EIA unit codes:plant_id_eia=60786, unit_id_pudl=1, unit_id_eia=['4343' '4141']


In [99]:
# convert types..
entities_dfs = pudl.helpers.convert_dfs_dict_dtypes(entities_dfs, 'eia')

# Compile transformed dfs for loading...
transformed_dfs = {"Entities": entities_dfs, "EIA": eia_transformed_dfs}
# Load step
for data_source, transformed_df in transformed_dfs.items():
    pudl.load.csv.dict_dump(transformed_df,
                            data_source,
                            datapkg_dir=datapkg_dir)

  mask = arr == x


Loading Entities plants_entity_eia dataframe into CSV
Loading Entities generators_entity_eia dataframe into CSV
Loading Entities utilities_entity_eia dataframe into CSV
Loading Entities boilers_entity_eia dataframe into CSV
Loading EIA ownership_eia860 dataframe into CSV
Loading EIA generators_eia860 dataframe into CSV
Loading EIA plants_eia860 dataframe into CSV
Loading EIA boiler_generator_assn_eia860 dataframe into CSV
Loading EIA utilities_eia860 dataframe into CSV
Loading EIA generation_fuel_eia923 dataframe into CSV
Loading EIA boiler_fuel_eia923 dataframe into CSV
Loading EIA generation_eia923 dataframe into CSV
Loading EIA coalmine_eia923 dataframe into CSV
Loading EIA fuel_receipts_costs_eia923 dataframe into CSV
Loading EIA plant_unit_epa dataframe into CSV
Loading EIA assn_plant_id_eia_epa dataframe into CSV
Loading EIA assn_gen_eia_unit_epa dataframe into CSV


# Primary Keys

In [5]:
eia_transformed_dfs.keys()

NameError: name 'eia_transformed_dfs' is not defined

In [None]:
test = eia_transformed_dfs['assn_eia_epacems']
test = test.drop_duplicates()

In [None]:
print(len(test))
print(len(test.groupby(['plant_id_epa', 'unit_id_epa', 'generator_id'])))

In [None]:
test.columns

#### Tentative Keys (to add to meta data)
**Primary Keys**: plant_id_epa, unit_id_epa, generator_id (boiler_id?...doesn't change the number of rows)
**Foreign Keys**: plant_id_eia, generator_id, prime_mover code (maybe just remove?) 

Question -- can foreign and primary keys overlap?