# Transform EPA's EIA-EPA Crosswalk CSV

In [1]:
import pandas as pd
from pathlib import Path
import sqlalchemy as sa
import importlib.resources

# local imports
import pudl
import pudl.transform.eia as pte

In [2]:
pudl_settings = pudl.workspace.setup.get_defaults()
pudl_in = Path(pudl_settings['pudl_in'])
ds = pudl.workspace.datastore.Datastore(pudl_in, sandbox=True)
pudl_engine = sa.create_engine(pudl_settings['pudl_db'])
pudl_out = pudl.output.pudltabl.PudlTabl(pudl_engine) # freq='monthly'/'annual' (maybe other abr.)

In [25]:
eia_epacems_crosswalk_csv = (
    importlib.resources.open_text(
        'pudl.package_data.glue', 
        'epa_eia_crosswalk_from_epa.csv')
)

eia_epacems_crosswalk = (
    pd.read_csv(eia_epacems_crosswalk_csv)
    .pipe(pudl.helpers.simplify_columns)
    .rename(columns={
        'oris_code': 'plant_id_epa',
        'eia_oris': 'plant_id_eia',
        'unit_id': 'epa_point_source_unit',
        'facility_name': 'plant_name_eia',
        'unit_type': 'prime_mover_code'})
    .drop([
        'fuel_type_primary', 
        'edat_mw_cap', 
        'way_gen_id_matched', 
        'unit_op_status_date', 
        'notes',
        'annual_heat_input',
        'op_status'], axis=1)
)

### Fill in plant id mapping gaps

1. Attempt to match based on **plant name strings**


In [31]:
# Prepare EIA plant tables for integration
eia_plants = pudl_out.plants_eia860().copy()
eia_plants = eia_plants.filter(['plant_id_eia','plant_name_eia']).copy()

In [32]:
# Make subset df of EPA-EIA crosswalk *WITH* EIA ids
pre_matched_plant_eia = eia_epacems_crosswalk[eia_epacems_crosswalk['plant_id_eia'].notna()]

# Make subset df of EPA-EIA crosswalk *WITHOUT* EIA ids
missing_plant_eia = (
    eia_epacems_crosswalk
    .query("plant_id_eia.isna()")
    .drop('plant_id_eia', axis=1)
    .reset_index()
)

In [35]:
# Merge missing data on plant name
missing_merge = (
    pd.merge(
        missing_plant_eia, 
        eia_plants, 
        on='plant_name_eia', 
        how='left')
    .drop_duplicates(subset='index')
)

# Plants with a match
merge_match = missing_merge[missing_merge['plant_id_eia'].notna()]

# Plant ids that are still missing
missing_plant_eia2 = (
    missing_merge[missing_merge['plant_id_eia'].isna()]
    .drop('plant_id_eia', axis=1)
)

2. In most cases, the EPA has already associated an EIA generator_id with each EPA unit_id. In this case, if we are able to find an **EIA plant_id generator_id pair that matches an EPA plant_id generator_id pair**, we will assume that the EPA and EIA plant_ids are identical.

In [37]:
# Prepare EIA generator table for integration
eia_gen = pudl_out.gens_eia860().copy()
eia_gen = eia_gen.filter(['plant_id_eia', 'generator_id']).copy()

In [38]:
# Merge missing data on plant id and generator id
missing_merge2 = (
    pd.merge(
        missing_plant_eia2,
        eia_gen,
        left_on=['plant_id_epa', 'generator_id'],
        right_on=['plant_id_eia', 'generator_id'],
        how='left')
    .drop_duplicates(subset='index')
)

# Plants with a match
merge_match2 = missing_merge2[missing_merge2['plant_id_eia'].notna()]

# Plant ids that are still missing
missing_plant_eia3 = (
    missing_merge2[missing_merge2['plant_id_eia'].isna()]
    .drop('plant_id_eia', axis=1)
)

3. For whatever is left, we will simply look for instances where there is an **EIA plant_id that matches an EPA plant_id**. By comparing the plant names, we should be able to identify if these plants are the same.

In [39]:
# Merge missing data on plant_id_eia and plant_id_epa
missing_merge3 = (
    pd.merge(
        missing_plant_eia3,
        eia_plants,
        left_on='plant_id_epa',
        right_on='plant_id_eia',
        how='left',
        suffixes=['_epa', '_eia'])
    .drop_duplicates(subset='index')
)

# Plants with a match
merge_match3 = missing_merge3[missing_merge3['plant_id_eia'].notna()]

# Plant ids that are still missing
missing_plant_eia4 = missing_merge3[missing_merge3['plant_id_eia'].isna()]

In [40]:
# Combine all pre-matched plants with all found matches and leftovers without a match
cleaned_eia_epacems_crosswalk = (
    pd.concat([
        pre_matched_plant_eia,
        merge_match,
        merge_match2,
        merge_match3,
        missing_plant_eia4
    ])
    .drop(['index', 'plant_name_eia_eia', 'plant_name_eia_epa'], axis=1)
)

In [42]:
cleaned_eia_epacems_crosswalk[cleaned_eia_epacems_crosswalk['plant_id_eia'].isna()]

Unnamed: 0,plant_name_eia,plant_id_epa,plant_id_eia,epa_point_source_unit,generator_id,boiler_id,prime_mover_code
0,,1594,,11,Plant not in EIA,,ST
1,,1594,,12,Plant not in EIA,,ST
2,,2440,,001,Plant not in EIA,,ST
125,,50044,,B132,Plant not in EIA,,ST
198,,50607,,23,Plant not in EIA,,ST
199,,50607,,24,Plant not in EIA,,ST
200,,50607,,26,Plant not in EIA,,ST
201,,50607,,RSB1,Plant not in EIA,,ST
202,,50607,,RSB2,Plant not in EIA,,ST
454,,55703,,P049,Plant not in EIA,,ST


# Fill Missing Boiler-Generator Associations (TODO)

# Add FRS Registry IDs 

In [171]:
#eia_epacems_crosswalk

In [168]:
from json import JSONDecodeError
from functools import lru_cache
import requests
import pandas as pd
import numpy as np
import time
import progressbar

# Cache this so we don't have to keep loading the same page
@lru_cache(512)
def download(url):
    return requests.get(url)

def one_plant_reg_id(plant_id_eia):
    url_template = "https://ofmpub.epa.gov/enviro/frs_rest_services.get_facilities?pgm_sys_acrnm=EIA-860&output=JSON&pgm_sys_id={}"
    url = url_template.format(plant_id_eia)
    request = download(url)
    if request.status_code > 399:
        #print("Failed to retrieve data for ", plant_id_eia)
        return np.nan
    try:
        json_data = request.json()["Results"]["FRSFacility"]
    except (JSONDecodeError, KeyError):
        #print("Failed to parse json info for ", plant_id_eia)
        return np.nan
    if not json_data:
        return np.nan
    plant_info = pd.json_normalize(json_data)
    reg_id = plant_info.iloc[0]['RegistryId']
    return reg_id

In [170]:
# HOW TO RUN THIS WITHOUT APPLY BECAUSE IT TAKES TOO LONG

#%%time
#test['FRS_regional_id'] = test['plant_id_eia'].apply(one_plant_reg_id)

#test_reg = test.assign(reg_id=lambda x: one_plant_reg_id(x.plant_id_eia))

#fill_reg_ids(eia_epacems_crosswalk)