# Working with the EIA Extract / Transform
This notebook steps through PUDL's extract and transform steps for the EIA 860 and 923 datasets, to make it easier to test and add new years of data, or new tables from the various spreadsheets that haven't been integrated yet.

In [1]:
%load_ext autoreload
%autoreload 2
import pudl
from pudl import constants as pc
import logging
import sys
from pathlib import Path
import pandas as pd
pd.options.display.max_columns = None

In [2]:
logger = logging.getLogger()
logger.setLevel(logging.INFO)
handler = logging.StreamHandler(stream=sys.stdout)
formatter = logging.Formatter('%(message)s')
handler.setFormatter(formatter)
logger.handlers = [handler]

In [3]:
pudl_settings = pudl.workspace.setup.get_defaults()

## Set the scope for the Extract-Transform:

In [54]:
eia860_tables = pc.pudl_tables['eia860']
eia860_years = pudl.constants.working_partitions['eia860']["years"]
eia923_tables = pc.pudl_tables['eia923']
eia923_years = pudl.constants.working_partitions['eia923']["years"]
eia860_ytd = True
eia860m_years = ["2020-08"]
sandbox = False

In [55]:
%%time
sandbox = pudl_settings.get("sandbox", False)
ds = pudl.workspace.datastore.Datastore(
    Path(pudl_settings["pudl_in"]),
    sandbox=sandbox)
# Extract EIA forms 923, 860
eia923_raw_dfs = pudl.extract.eia923.Extractor(ds).extract(
    year=eia923_years)
eia860_raw_dfs = pudl.extract.eia860.Extractor(ds).extract(
    year=eia860_years)
# if we are trying to add the EIA 860M YTD data, then extract it and append
if eia860_ytd:
    eia860m_raw_dfs = pudl.extract.eia860m.Extractor(ds).extract(
        year_month=pc.working_partitions['eia860m']['year_month'])
    eia860_raw_dfs = pudl.extract.eia860m.append_eia860m(
        eia860_raw_dfs=eia860_raw_dfs, eia860m_raw_dfs=eia860m_raw_dfs)

# Transform EIA forms 923, 860
eia860_transformed_dfs = pudl.transform.eia860.transform(
    eia860_raw_dfs, eia860_tables=eia860_tables)
eia923_transformed_dfs = pudl.transform.eia923.transform(
    eia923_raw_dfs, eia923_tables=eia923_tables)
# create an eia transformed dfs dictionary
eia_transformed_dfs = eia860_transformed_dfs.copy()
eia_transformed_dfs.update(eia923_transformed_dfs.copy())

# # Add EIA-EPA crosswalk tables
# eia_transformed_dfs = _add_eia_epacems_crosswalk(eia_transformed_dfs)

# # convert types..
# eia_transformed_dfs = pudl.helpers.convert_dfs_dict_dtypes(
#     eia_transformed_dfs, 'eia')

Extracting eia923 spreadsheet data.

boiler_fuel

fuel_receipts_costs

generation_fuel

generator

plant_frame

stocks
Extracting eia860 spreadsheet data.

boiler_generator_assn
Columns for boiler_generator_assn are off: should be 4 but got 9
Unmapped raw columns: {'utility_name', 'plant_name', 'generator_association', 'report_year', 'steam_plant_type'}

generator
Columns for generator are off: should be 67 but got 75
Unmapped raw columns: {'fercewgdoc', 'planned_derates_net_summer_cap', 'summer_capacity', 'fercdock', 'report_year', 'ferccogen', 'fercother', 'winter_capacity'}

generator_existing

generator_proposed
Columns for generator_proposed are off: should be 62 but got 68
Unmapped raw columns: {'summer_capacity', 'winter_estimated_capacity', 'report_year', 'data_source', 'summer_estimated_capacity', 'winter_capacity'}

generator_retired

ownership

plant
Columns for plant are off: should be 47 but got 50
Unmapped raw columns: {'ownertransdist', 'ferc_exempt_wholesale_generator_d

In [56]:
from pudl.etl import _add_eia_epacems_crosswalk

print(eia_transformed_dfs.keys())
eia_transformed_dfs = _add_eia_epacems_crosswalk(eia_transformed_dfs)

# convert types..
eia_transformed_dfs = pudl.helpers.convert_dfs_dict_dtypes(
    eia_transformed_dfs, 'eia')

print(eia_transformed_dfs.keys())

dict_keys(['ownership_eia860', 'generators_eia860', 'plants_eia860', 'boiler_generator_assn_eia860', 'utilities_eia860', 'generation_fuel_eia923', 'boiler_fuel_eia923', 'generation_eia923', 'coalmine_eia923', 'fuel_receipts_costs_eia923'])
grabbing original crosswalk
splitting crosswalk into three normalized tables
dict_keys(['ownership_eia860', 'generators_eia860', 'plants_eia860', 'boiler_generator_assn_eia860', 'utilities_eia860', 'generation_fuel_eia923', 'boiler_fuel_eia923', 'generation_eia923', 'coalmine_eia923', 'fuel_receipts_costs_eia923', 'plant_unit_epa', 'assn_plant_id_eia_epa', 'assn_gen_eia_unit_epa'])


In [57]:
debug=True
# create the empty entities df to fill up
entities_dfs = {}

# for each of the entities, harvest the static and annual columns.
# the order of the entities matter! the
# for entity in pc.entities.keys():
entity = 'plants'
logger.info(f"Harvesting IDs & consistently static attributes "
            f"for EIA {entity}")

entities_dfs, eia_transformed_dfs, col_dfs = pudl.transform.eia.harvesting(
    entity, eia_transformed_dfs, entities_dfs,
    debug=debug, eia860_ytd=eia860_ytd)

Harvesting IDs & consistently static attributes for EIA plants
service_area has low consistency: 0.816.
Average consistency of static plants values is 98.71%


In [67]:
service_area_df = col_dfs['service_area']

total_number_of_plant_ids = len(service_area_df.drop_duplicates(subset=['plant_id_eia']))

plants_with_consistent_service_area = service_area_df[service_area_df["service_area_consistent"]]

print(service_area_df.shape)
print(plants_with_consistent_service_area.shape)


(19024, 7)
(14996, 7)


In [96]:
service_area_df.sort_values(by="service_area_consistent_rate", ascending=True).sample(n=10)

Unnamed: 0,plant_id_eia,report_date,service_area,entity_occurences,record_occurences,service_area_consistent_rate,service_area_consistent
7189,54999,2003-01-01,Savannah Electric & Power Co,6,6,1.0,True
449,55224,2001-01-01,PSI Energy Inc,5,5,1.0,True
11697,10879,2005-01-01,Imperial Irrigation District,6,6,1.0,True
12050,50301,2004-01-01,Pacific Gas & Electric Co,6,6,1.0,True
11747,55521,2001-01-01,Pacific Gas & Electric Co,6,1,0.166667,False
9194,55012,2003-01-01,Massachusetts Electric Co,6,6,1.0,True
12498,54535,2006-01-01,Jacksonville Electric Authority,6,4,0.666667,False
3130,7153,2004-01-01,Delmarva Power & Light Company,6,5,0.833333,True
5870,55475,2004-01-01,Niagara Mohawk Power Corp,6,6,1.0,True
12813,52106,2001-01-01,PECO Energy Co,6,6,1.0,True


In [119]:
plant_id_eia = 56311
service_area_df[service_area_df["plant_id_eia"] == plant_id_eia]

Unnamed: 0,plant_id_eia,report_date,service_area,entity_occurences,record_occurences,service_area_consistent_rate,service_area_consistent
18676,56311,2004-01-01,"TXU Energy Trading Co., LP",3,1,0.333333,False
18677,56311,2005-01-01,TXU Electric Delivery Company,3,2,0.666667,False
18678,56311,2006-01-01,TXU Electric Delivery Company,3,2,0.666667,False


In [111]:
service_area_df.query("service_area_consistent_rate < 0.7").sort_values(by=["plant_id_eia", "service_area_consistent_rate"], ascending=False).head(20)

Unnamed: 0,plant_id_eia,report_date,service_area,entity_occurences,record_occurences,service_area_consistent_rate,service_area_consistent
18838,56417,2006-01-01,Commonwealth Edison Co,2,1,0.5,False
18837,56417,2005-01-01,Commonwealth Edison Co IN Inc,2,1,0.5,False
18748,56336,2006-01-01,Public Service Co of NM,2,1,0.5,False
18747,56336,2005-01-01,Arizona Public Service Co,2,1,0.5,False
18677,56311,2005-01-01,TXU Electric Delivery Company,3,2,0.666667,False
18678,56311,2006-01-01,TXU Electric Delivery Company,3,2,0.666667,False
18676,56311,2004-01-01,"TXU Energy Trading Co., LP",3,1,0.333333,False
18513,56307,2005-01-01,Dairyland Power Coop,3,2,0.666667,False
18514,56307,2006-01-01,Dairyland Power Coop,3,2,0.666667,False
18512,56307,2004-01-01,Dairyland Power Cooperative,3,1,0.333333,False


### Calculate the ratio
total number of entities with a consistent value / total number of entities with service area information?

In [112]:
n_plants_with_consistent_values = len(plants_with_consistent_service_area.drop_duplicates(subset=["plant_id_eia"]))

n_plants_with_values = len(service_area_df.drop_duplicates(subset=['plant_id_eia']))

ratio = n_plants_with_consistent_values / n_plants_with_values
print(ratio)

0.8161016949152542


## Is `service_area` static?

In [120]:
sample = ["TXU Energy Trading Co., LP", "TXU Electric Delivery Company", "TXU Electric Delivery Company"]

In [123]:
import itertools

list(itertools.combinations(sample, 2))

[('TXU Energy Trading Co., LP', 'TXU Electric Delivery Company'),
 ('TXU Energy Trading Co., LP', 'TXU Electric Delivery Company'),
 ('TXU Electric Delivery Company', 'TXU Electric Delivery Company')]

In [128]:
from difflib import SequenceMatcher

def similar(a, b):
    return SequenceMatcher(None, a, b).ratio()

def group_similarity(grp):
    similarities = []
    if len(grp) == 1:
        return 1
    else:
        for a,b in itertools.combinations(grp, 2):
            similarities.append(similar(a, b))
        return pd.Series(similarities).mean()

average_service_area_similarities = service_area_df.groupby("plant_id_eia")['service_area'].apply(group_similarity)

In [137]:
average_service_area_similarities = average_service_area_similarities.reset_index().rename(columns={"service_area": "average_similarity"})

In [149]:
average_service_area_similarities.average_similarity.describe()

count    3540.000000
mean        0.948972
std         0.115329
min         0.177778
25%         0.989116
50%         1.000000
75%         1.000000
max         1.000000
Name: average_similarity, dtype: float64

75% of plants have an average service area similarity of 98%. Given this it seems strange our etl process throws an error due to its inconsistency. 

### Inspect plants with the least similar service areas

In [147]:
service_area_df_with_similarities = service_area_df.merge(average_service_area_similarities, on="plant_id_eia", how="left", validate="m:1")

service_area_df_with_similarities.sort_values(by="average_similarity", ascending=True).head(5)

Unnamed: 0,plant_id_eia,report_date,service_area,entity_occurences,record_occurences,service_area_consistent_rate,service_area_consistent,average_similarity
1923,50219,2006-01-01,PG&E Energy Services,2,1,0.5,False,0.177778
1914,50219,2001-01-01,Pacific Gas & Electric Co,2,1,0.5,False,0.177778
1913,50218,2006-01-01,PG&E Energy Services,2,1,0.5,False,0.177778
1892,50218,2001-01-01,Pacific Gas & Electric Co,2,1,0.5,False,0.177778
1787,56107,2005-01-01,Tennessee Valley Authority,2,1,0.5,False,0.32


It looks like the plants with the least similar service_area fields refer to very similar service areas in reality. 

## play with `strictness` so `ratio > 0.9`

In [196]:
# default is 0.7. Higher means less strict.
strictness = 0.65


adj_strictness_service_area_df = service_area_df.copy()

adj_strictness_service_area_df['service_area_consistent'] = adj_strictness_service_area_df.service_area_consistent_rate > strictness

In [197]:
n_plants = len(adj_strictness_service_area_df.drop_duplicates(subset="plant_id_eia"))

n_consistent_plants = len(adj_strictness_service_area_df[adj_strictness_service_area_df['service_area_consistent']].drop_duplicates(subset="plant_id_eia"))

print(n_consistent_plants)
print(n_plants)
print(n_consistent_plants / n_plants)

3389
3540
0.9573446327683616


When strictness is > 0.66 the ratio is 0.816. When the strictness is <= 0.66, the ratio is 0.95.

This means there are a large number of plants where 2/3s of the service areas are the same and 1/3 are different.

In [207]:
import numpy as np

plant_ids_with_twothirds_consistency = adj_strictness_service_area_df[np.isclose(adj_strictness_service_area_df["service_area_consistent_rate"], 0.666666)].plant_id_eia

In [211]:
adj_strictness_service_area_df[adj_strictness_service_area_df.plant_id_eia.isin(plant_ids_with_twothirds_consistency)].sort_values(by="plant_id_eia").head(40)

Unnamed: 0,plant_id_eia,report_date,service_area,entity_occurences,record_occurences,service_area_consistent_rate,service_area_consistent
6722,594,2003-01-01,Delmarva Power & Light Company,6,4,0.666667,True
6725,594,2006-01-01,Delmarva Power & Light Company,6,4,0.666667,True
6724,594,2005-01-01,Delmarva Power & Light Company,6,4,0.666667,True
6723,594,2004-01-01,Delmarva Power & Light Company,6,4,0.666667,True
6720,594,2001-01-01,Conectiv,6,1,0.166667,False
6721,594,2002-01-01,Conectiv Energy Services Inc,6,1,0.166667,False
13731,1564,2003-01-01,Delmarva Power & Light Company,6,4,0.666667,True
13732,1564,2004-01-01,Delmarva Power & Light Company,6,4,0.666667,True
13733,1564,2005-01-01,Delmarva Power & Light Company,6,4,0.666667,True
13734,1564,2006-01-01,Delmarva Power & Light Company,6,4,0.666667,True


### Examples
- **plant_id_eia: 594, 1564:**"In 1998, Delmarva Power acquired Atlantic Energy, which owned Atlantic City Electric in New Jersey, for 968 million. The merged utility company became known as Conectiv Power Delivery. Conectiv Power Delivery was acquired by the Potomac Electric Power Company in 2002 for 5.4 billion, which resulted in Pepco Holdings being created as a holding company that owned both utility companies.[6] In 2005, Pepco Holdings brought back the Delmarva Power and Atlantic City Electric names in place of Conectiv Power Delivery.[7]" - https://en.wikipedia.org/wiki/Delmarva_Power
- **plant-id-eia: 1570:** The Potomac Edison Company" was a subsidiary of Allegheny Electric Coop. https://en.wikipedia.org/wiki/Allegheny_Energy
- **plant_id_eia: 1590, 1590** In 1999 Boston Edison CO was merged with Cambridge Electric Light Company, Commonwealth Electric Company, and NSTAR Gas Company to form NSTAR.[4] After subsequent mergers, what remains of the Boston Edison Company is now part of Eversource Energy.[4] https://en.wikipedia.org/wiki/Boston_Edison_Company


Seems like some of these change because of acquisitions and rebrandings. This does seems pretty rare given the vast majorty of plants have very consistent service areas. **How do we handle ownership changes of plants?** 

## Why was the consistency higher before adding 2001 - 2003? 

In [None]:
aaa

In [174]:
adj_strictness_service_area_df.drop_duplicates(subset='plant_id_eia').entity_occurences.describe()

count    3540.000000
mean        5.374011
std         1.452343
min         1.000000
25%         6.000000
50%         6.000000
75%         6.000000
max         6.000000
Name: entity_occurences, dtype: float64

In [173]:
adj_strictness_service_area_df.drop_duplicates(subset='plant_id_eia').record_occurences.describe()

count    3540.000000
mean        4.289831
std         2.125388
min         1.000000
25%         2.000000
50%         6.000000
75%         6.000000
max         6.000000
Name: record_occurences, dtype: float64

In [184]:
import numpy as np

adj_strictness_service_area_df[np.isclose(adj_strictness_service_area_df["service_area_consistent_rate"], 0.667)]

Unnamed: 0,plant_id_eia,report_date,service_area,entity_occurences,record_occurences,service_area_consistent_rate,service_area_consistent


In [199]:
adj_strictness_service_area_df.query("service_area_consistent_rate < 0.67 & service_area_consistent_rate > 0.65").sort_values(by="service_area_consistent_rate", ascending=False).drop_duplicates(.shape

(1966, 7)

In [34]:
from pudl.transform.eia import _compile_all_entity_records

compiled_plants = _compile_all_entity_records("plants", eia_transformed_dfs)



Unnamed: 0,ash_impoundment,ash_impoundment_lined,ash_impoundment_status,balancing_authority_code_eia,balancing_authority_name_eia,city,county,datum,energy_storage,ferc_cogen_docket_no,ferc_cogen_status,ferc_exempt_wholesale_generator,ferc_exempt_wholesale_generator_docket_no,ferc_small_power_producer,ferc_small_power_producer_docket_no,grid_voltage_2_kv,grid_voltage_3_kv,grid_voltage_kv,iso_rto_code,latitude,liquefied_natural_gas_storage,longitude,natural_gas_local_distribution_company,natural_gas_pipeline_name_1,natural_gas_pipeline_name_2,natural_gas_pipeline_name_3,natural_gas_storage,nerc_region,net_metering,pipeline_notes,plant_id_eia,plant_name_eia,primary_purpose_naics_id,regulatory_status_code,report_date,sector_id,sector_name,service_area,state,street_address,table,transmission_distribution_owner_id,transmission_distribution_owner_name,transmission_distribution_owner_state,utility_id_eia,water_source,zip_code
0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,10,,,,2001-01-01,,,,,,ownership_eia860,,,,195,,
1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,10,,,,2001-01-01,,,,,,ownership_eia860,,,,195,,
2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,10,,,,2002-01-01,,,,,,ownership_eia860,,,,195,,
3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,10,,,,2002-01-01,,,,,,ownership_eia860,,,,195,,
4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,10,,,,2003-01-01,,,,,,ownership_eia860,,,,195,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5424867,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,60926,,,,2019-01-01,,,,,,fuel_receipts_costs_eia923,,,,,,
5424868,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,60926,,,,2019-01-01,,,,,,fuel_receipts_costs_eia923,,,,,,
5424869,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,61035,,,,2019-01-01,,,,,,fuel_receipts_costs_eia923,,,,,,
5424870,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,61643,,,,2019-01-01,,,,,,fuel_receipts_costs_eia923,,,,,,


In [39]:
compiled_plants['table'].value_counts()

generation_fuel_eia923          2401225
boiler_fuel_eia923              1214200
fuel_receipts_costs_eia923       536118
generation_eia923                513862
generators_eia860                459931
plants_eia860                    144699
boiler_generator_assn_eia860      80356
ownership_eia860                  74481
Name: table, dtype: int64

In [41]:
compiled_plants.groupby("table").count()

Unnamed: 0_level_0,ash_impoundment,ash_impoundment_lined,ash_impoundment_status,balancing_authority_code_eia,balancing_authority_name_eia,city,county,datum,energy_storage,ferc_cogen_docket_no,ferc_cogen_status,ferc_exempt_wholesale_generator,ferc_exempt_wholesale_generator_docket_no,ferc_small_power_producer,ferc_small_power_producer_docket_no,grid_voltage_2_kv,grid_voltage_3_kv,grid_voltage_kv,iso_rto_code,latitude,liquefied_natural_gas_storage,longitude,natural_gas_local_distribution_company,natural_gas_pipeline_name_1,natural_gas_pipeline_name_2,natural_gas_pipeline_name_3,natural_gas_storage,nerc_region,net_metering,pipeline_notes,plant_id_eia,plant_name_eia,primary_purpose_naics_id,regulatory_status_code,report_date,sector_id,sector_name,service_area,state,street_address,transmission_distribution_owner_id,transmission_distribution_owner_name,transmission_distribution_owner_state,utility_id_eia,water_source,zip_code
table,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1
boiler_fuel_eia923,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1214200,0,0,0,1214200,0,0,0,0,0,0,0,0,0,0,0
boiler_generator_assn_eia860,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,80356,0,0,0,80356,0,0,0,0,0,0,0,0,80356,0,0
fuel_receipts_costs_eia923,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,536118,0,0,0,536118,0,0,0,0,0,0,0,0,0,0,0
generation_eia923,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,513862,0,0,0,513862,0,0,0,0,0,0,0,0,0,0,0
generation_fuel_eia923,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2401225,0,0,0,2401225,0,0,0,0,0,0,0,0,0,0,0
generators_eia860,0,0,0,27900,0,0,300916,0,0,9180,95926,96591,9512,96226,9048,0,0,0,0,29957,0,29954,0,0,0,0,0,0,0,0,459931,403785,0,0,459931,266943,300365,0,403775,0,0,0,0,459931,0,0
ownership_eia860,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,74481,58246,0,0,74481,0,0,0,47349,0,0,0,0,74481,0,0
plants_eia860,43223,50451,2572,65859,65977,108012,144245,5561,40528,7022,109103,109091,5086,109144,22330,1685,346,80685,12990,75217,20172,75215,5257,13749,790,163,24246,143961,1685,746,144699,144699,144668,115438,144699,96136,96136,19024,144689,106560,105641,93877,103247,144699,85893,144011


In [46]:
compiled_plants.groupby("table").count()["service_area"]

table
boiler_fuel_eia923                  0
boiler_generator_assn_eia860        0
fuel_receipts_costs_eia923          0
generation_eia923                   0
generation_fuel_eia923              0
generators_eia860                   0
ownership_eia860                    0
plants_eia860                   19024
Name: service_area, dtype: int64

In [51]:
compiled_plants[~compiled_plants['service_area'].isna()].report_date.value_counts()

2006-01-01    3379
2005-01-01    3244
2004-01-01    3176
2003-01-01    3137
2002-01-01    3099
2001-01-01    2989
Name: report_date, dtype: int64

In [27]:
col_dfs["service_area"].sort_values(by="service_area_consistent_rate").head(15)

Unnamed: 0,plant_id_eia,report_date,service_area,entity_occurences,record_occurences,service_area_consistent_rate,service_area_consistent
9402,55569,2001-01-01,Exelon Energy Co,6,1,0.166667,False
4952,3170,2002-01-01,PECO Energy Co.,6,1,0.166667,False
4940,3168,2002-01-01,PECO Energy Co.,6,1,0.166667,False
4934,3166,2002-01-01,PECO Energy Co.,6,1,0.166667,False
4928,3163,2002-01-01,PECO Energy Co.,6,1,0.166667,False
4922,3162,2002-01-01,PECO Energy Co.,6,1,0.166667,False
15607,55377,2002-01-01,"Allegheny Electric Coop, Inc",6,1,0.166667,False
15605,55196,2002-01-01,"Allegheny Electric Coop, Inc",6,1,0.166667,False
4910,3160,2002-01-01,PECO Energy Co.,6,1,0.166667,False
15576,3179,2001-01-01,Allegheny Energy Supply Co LLC,6,1,0.166667,False


## Create a locally cached datastore

In [14]:
ds = pudl.workspace.datastore.Datastore(local_cache_path=Path(pudl_settings["data_dir"]))

# EIA-860

## Extract just the EIA-860

In [15]:
%%time
eia860_extractor = pudl.extract.eia860.Extractor(ds)
eia860_raw_dfs = eia860_extractor.extract(year=eia860_years)

Extracting eia860 spreadsheet data.

boiler_generator_assn
Columns for boiler_generator_assn are off: should be 4 but got 8
Unmapped raw columns: {'report_year', 'steam_plant_type', 'utility_name', 'plant_name'}

generator
Columns for generator are off: should be 67 but got 70
Unmapped raw columns: {'report_year', 'fercdock', 'ferccogen'}

generator_existing

generator_proposed

generator_retired

ownership

plant
Columns for plant are off: should be 47 but got 49
Unmapped raw columns: {'report_year', 'ferc_exempt_wholesale_generator_docket_number'}

utility
CPU times: user 51.1 s, sys: 956 ms, total: 52.1 s
Wall time: 52.8 s


In [16]:
eia860_raw_dfs.keys()

dict_keys(['boiler_generator_assn', 'generator', 'generator_existing', 'generator_proposed', 'generator_retired', 'ownership', 'plant', 'utility'])

In [17]:
# for table_name, df in eia860_raw_dfs.items():
#     print(table_name)
#     print(df.info())
#     print()

# # eia860_raw_dfs["ownership"].info()

In [18]:
generators_df = eia860_raw_dfs['generator']

generators_df.report_year.value_counts()

2001.0    16029
Name: report_year, dtype: int64

## Transform just the EIA-860

In [19]:
%%time
eia860_transformed_dfs = pudl.transform.eia860.transform(
    eia860_raw_dfs, eia860_tables=eia860_tables)

Transforming raw EIA 860 DataFrames for ownership_eia860 concatenated across all years.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7595 entries, 0 to 7594
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   fraction_owned           7595 non-null   float64       
 1   generator_id             7595 non-null   object        
 2   operational_status_code  4792 non-null   object        
 3   owner_city               4531 non-null   object        
 4   owner_name               7595 non-null   object        
 5   owner_state              4511 non-null   string        
 6   owner_street_address     4415 non-null   object        
 7   owner_utility_id_eia     7595 non-null   Int64         
 8   owner_zip_code           4523 non-null   object        
 9   plant_id_eia             7595 non-null   Int64         
 10  plant_name_eia           4792 non-null   object        
 11  state 

In [23]:
eia860_transformed_dfs.keys()

dict_keys(['ownership_eia860', 'generators_eia860', 'plants_eia860', 'boiler_generator_assn_eia860', 'utilities_eia860'])

In [262]:
eia860_transformed_dfs.keys()

dict_keys(['ownership_eia860', 'generators_eia860', 'plants_eia860', 'boiler_generator_assn_eia860', 'utilities_eia860'])

In [263]:
eia860_transformed_dfs['generators_eia860'].info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 67678 entries, 0 to 35093
Data columns (total 106 columns):
 #    Column                                     Dtype         
---   ------                                     -----         
 0    associated_combined_heat_power             object        
 1    bypass_heat_recovery                       object        
 2    capacity_mw                                float64       
 3    carbon_capture                             object        
 4    cofire_fuels                               object        
 5    county                                     object        
 6    data_source                                object        
 7    deliver_power_transgrid                    object        
 8    distributed_generation                     object        
 9    duct_burners                               object        
 10   energy_source_1_transport_1                object        
 11   energy_source_1_transport_2                object   

# What is transmode1?

In [264]:
generators = eia860_transformed_dfs['generators_eia860']

In [267]:
generators.report_date.value_counts()

2019-01-01    29009
2004-01-01    19493
2003-01-01    19176
Name: report_date, dtype: int64

In [288]:
generators.groupby(generators.transmode1.isna()).report_date.value_counts()

transmode1  report_date
False       2003-01-01     13972
True        2019-01-01     29009
            2004-01-01     19493
            2003-01-01      5204
Name: report_date, dtype: int64

Confirmed mutligen is only in 2003.

## unit_id_eia

In [278]:
generators.groupby(generators.unit_id_eia.isna()).report_date.value_counts()

unit_id_eia  report_date
False        2019-01-01      2210
             2004-01-01       307
True         2019-01-01     26799
             2004-01-01     19186
             2003-01-01     19176
Name: report_date, dtype: int64

In [281]:
gen04 = generators.query("report_date == '2004-01-01'")

gen04[~gen04.unit_id_eia.isna()][["generator_id", "unit_id_eia"]]

Unnamed: 0,generator_id,unit_id_eia
2240,4,4.0
17944,1,B151
17945,2,B151
17946,3,B151
17962,1,B152
...,...,...
33970,TG-2,F601
33971,TG-3,F601
34120,GEN1,F301
34121,GEN2,F301


In [284]:
gen03 = generators.query("report_date == '2003-01-01'")

gen03[~gen03.multigen.isna()][["generator_id", "multigen"]]

Unnamed: 0,generator_id,multigen
1602,1,1.0
1603,2,2.0
588,1,B151
589,2,B151
590,3,B151
...,...,...
16739,TG-2,F601
16740,TG-3,F601
16894,GEN1,F301
16895,GEN2,F301


In [286]:
gen03.multigen.value_counts().head(10)

A094    27
A096    23
A092    18
A103    16
A110    14
C789    13
C052    12
D004    12
B148    10
F901     8
Name: multigen, dtype: int64

In [287]:
gen04.unit_id_eia.value_counts().head(10)

C789    13
D004    12
G281    12
C052    12
B148    10
B061     8
F901     8
D016     8
D002     6
F401     5
Name: unit_id_eia, dtype: int64

# EIA-923

## Extract just the EIA-923

In [None]:
%%time
eia923_extractor = pudl.extract.eia923.Extractor(ds)
eia923_raw_dfs = eia923_extractor.extract(year=eia923_years)

## Transform just the EIA-923

In [None]:
%%time
eia923_transformed_dfs = pudl.transform.eia923.transform(
    eia923_raw_dfs, eia923_tables=eia923_tables)

# Combined EIA Data

## Merge the EIA-860 and EIA-923 Dataframe Dictionaries

In [None]:
%%time
eia_transformed_dfs = eia923_transformed_dfs.copy()
eia_transformed_dfs.update(eia860_transformed_dfs.copy())

## Set all column data types

In [None]:
%%time
eia_transformed_dfs = pudl.helpers.convert_dfs_dict_dtypes(
    eia_transformed_dfs, 'eia')

## Run the entity resolution process

In [None]:
entities_dfs, eia_transformed_dfs = pudl.transform.eia.transform(
    eia_transformed_dfs,
    eia860_years=eia860_years,
    eia923_years=eia923_years,
)

In [None]:
list(eia_transformed_dfs.keys())