# Transform EPA's EIA-EPA Crosswalk CSV

In [14]:
import pandas as pd
from pathlib import Path
import sqlalchemy as sa
import importlib.resources

# local imports
import pudl
import pudl.transform.eia as pte

In [15]:
pudl_settings = pudl.workspace.setup.get_defaults()
pudl_in = Path(pudl_settings['pudl_in'])
ds = pudl.workspace.datastore.Datastore(pudl_in, sandbox=True)
pudl_engine = sa.create_engine(pudl_settings['pudl_db'])
pudl_out = pudl.output.pudltabl.PudlTabl(pudl_engine) # freq='monthly'/'annual' (maybe other abr.)

In [23]:
eia_epacems_crosswalk = importlib.resources.open_text(
    'pudl.package_data.glue', 'epa_eia_crosswalk_from_epa.csv'
).readlines()

#eia_epacems_crosswalk = pd.read_csv(eia_epacems_crosswalk)

In [21]:
eia_epacems_crosswalk

Unnamed: 0,0
0,"Facility Name,ORIS Code,EIA ORIS,Unit ID,Gener..."
1,"Barry,3,,1,1,1,Pipeline Natural Gas,ST,180,184..."
2,"Barry,3,,2,2,2,Pipeline Natural Gas,ST,180,188..."
3,"Barry,3,,4,4,4,Coal,ST,400,12242519.45,X-walk ..."
4,"Barry,3,,5,5,5,Coal,ST,800,33224781.5,X-walk f..."
...,...
6859,"Salem Harbor Station NGCC,60903,,1,1,,,,,,X-wa..."
6860,"Salem Harbor Station NGCC,60903,,2,2,,,,,,X-wa..."
6861,"Salem Harbor Station NGCC,60903,,2,4,,,,,,X-wa..."
6862,"Hartwell Energy Facility,70454,54538,MAG1,MAG1..."


In [5]:
# Load raw EPA csv
path = Path.cwd().parent.parent / 'src/pudl/package_data/glue/epa_eia_crosswalk_from_epa.csv'
eia_epacems_crosswalk = pd.read_csv(path)

In [6]:
# Transform column headers
new_cols = eia_epacems_crosswalk.columns.str.lower()
new_cols = [col.replace(' ', '_') for col in new_cols]
eia_epacems_crosswalk.columns = new_cols
eia_epacems_crosswalk = eia_epacems_crosswalk.rename(
    columns={'oris_code': 'plant_id_epa',
             'eia_oris': 'plant_id_eia',
             'unit_id': 'epa_point_source_unit',
             'facility_name': 'plant_name_eia',
             'unit_type': 'prime_mover_code'}
)[['plant_name_eia', 'plant_id_eia', 'plant_id_epa', 'epa_point_source_unit', 'generator_id', 'boiler_id', 'prime_mover_code']]

### Fill in plant id mapping gaps

1. Attempt to match based on **plant name strings**# Prepare EIA plant tables for integration
eia_plants = pudl_out.plants_eia860().copy()
eia_plants = eia_plants.filter(['plant_id_eia','plant_name_eia']).copy() from EIA and EPA

In [7]:
# Prepare EIA plant tables for integration
eia_plants = pudl_out.plants_eia860().copy()
eia_plants = eia_plants.filter(['plant_id_eia','plant_name_eia']).copy()

In [8]:
# Establish subset of EPA-EIA crosswalk EIA ids
pre_matched_plant_eia = eia_epacems_crosswalk[eia_epacems_crosswalk['plant_id_eia'].notna()]

# Establish subset of EPA-EIA crosswalk with no EIA id
missing_plant_eia = (
    eia_epacems_crosswalk
    .query("plant_id_eia == 'NaN'")
    .drop('plant_id_eia', axis=1)
    .reset_index()
)

In [9]:
# Merge missing data on plant name
missing_merge = (
    pd.merge(
        missing_plant_eia, 
        eia_plants, 
        on='plant_name_eia', 
        how='left')
    .drop_duplicates(subset='index')
)

# Plants with a match
merge_match = missing_merge[missing_merge['plant_id_eia'].notna()]

# Plant ids that are still missing
missing_plant_eia2 = (
    missing_merge[missing_merge['plant_id_eia'].isna()]
    .drop('plant_id_eia', axis=1)
)

2. In most cases, the EPA has already associated an EIA generator_id with each EPA unit_id. In this case, if we are able to find an **EIA plant_id generator_id pair that matches an EPA plant_id generator_id pair**, we will assume that the EPA and EIA plant_ids are identical.

In [10]:
# Prepare EIA generator table for integration
eia_gen = pudl_out.gens_eia860().copy()
eia_gen = eia_gen.filter(['plant_id_eia', 'generator_id']).copy()

In [11]:
# Merge missing data on plant id and generator id
missing_merge2 = (
    pd.merge(
        missing_plant_eia2,
        eia_gen,
        left_on=['plant_id_epa', 'generator_id'],
        right_on=['plant_id_eia', 'generator_id'],
        how='left')
    .drop_duplicates(subset='index')
)

# Plants with a match
merge_match2 = missing_merge2[missing_merge2['plant_id_eia'].notna()]

# Plant ids that are still missing
missing_plant_eia3 = (
    missing_merge2[missing_merge2['plant_id_eia'].isna()]
    .drop('plant_id_eia', axis=1)
)

3. For whatever is left, we will simply look for instances where there is an **EIA plant_id that matches an EPA plant_id**. By comparing the plant names, we should be able to identify if these plants are the same.

In [10]:
# Merge missing data on plant_id_eia and plant_id_epa
missing_merge3 = (
    pd.merge(
        missing_plant_eia3,
        eia_plants,
        left_on='plant_id_epa',
        right_on='plant_id_eia',
        how='left',
        suffixes=['_epa', '_eia'])
    .drop_duplicates(subset='index')
)

# Plants with a match
merge_match3 = missing_merge3[missing_merge3['plant_id_eia'].notna()]

# Plant ids that are still missing
missing_plant_eia4 = missing_merge3[missing_merge3['plant_id_eia'].isna()]

In [17]:
# Combine all pre-matched plants with all found matches and leftovers without a match
cleaned_eia_epacems_crosswalk = (
    pd.concat([
        pre_matched_plant_eia,
        merge_match,
        merge_match2,
        merge_match3,
        missing_plant_eia4
    ])
    .drop(['index', 'plant_name_eia_eia', 'plant_name_eia_epa'], axis=1)
)

In [20]:
cleaned_eia_epacems_crosswalk['unit_type'].unique()

array(['CT', 'GT', 'ST', nan, 'OT'], dtype=object)

# Fill Missing Boiler-Generator Associations (TODO)