In [None]:
# Load IPython's autoreload extension and switch it to mode 3
# (see the IPython autoreload documentation for the exact mode semantics).
%load_ext autoreload
%autoreload 3

In [None]:
import pandas as pd
import pudl
import sqlalchemy as sa
from pathlib import Path
import zipfile

In [None]:
# Build a SQLAlchemy engine from the 'pudl_db' entry of PUDL's default workspace
# settings, then wrap it in a PudlTabl object; both are reused by the cells below.
pudl_engine = sa.create_engine(pudl.workspace.setup.get_defaults()['pudl_db'])
pudl_out = pudl.output.pudltabl.PudlTabl(pudl_engine)

### Generate Plant Parts List

In [None]:
# if you have it, read in pickled dataframe
# NOTE(review): a later cell reassigns plant_parts_eia from pudl_out.plant_parts_eia(),
# so on a top-to-bottom run this loaded frame is replaced -- presumably only one of
# the two cells is meant to be run; confirm.
# Only unpickle files you produced yourself: unpickling can execute arbitrary code.
plant_parts_eia = pd.read_pickle("plant_parts_eia_distinct.pkl.gz")

In [None]:
# Make the plant parts list (PPL) distinct for Panda.
# This was adapted from the RMI repo.
# Takes as input a non-distinct PPL (i.e. one that still includes non-true granularities).
def get_plant_parts_distinct(plant_parts_eia):
    """Filter the EIA plant parts list down to its unique granularities.

    A ``plant_id_report_year_util_id`` key column is added (the existing
    ``plant_id_report_year`` joined by "_" to the stringified
    ``utility_id_pudl``) and ``installation_year`` is cast to float. Only rows
    flagged ``true_gran`` and not flagged ``ownership_dupe`` are returned, so
    the matching model is not offered several records carrying duplicate data.

    Args:
        plant_parts_eia: plant parts list with ``plant_id_report_year``,
            ``utility_id_pudl``, ``installation_year``, ``true_gran`` and
            ``ownership_dupe`` columns. It is not modified.

    Returns:
        The filtered DataFrame, including the new key column.
    """
    util_id_as_str = plant_parts_eia.utility_id_pudl.map(str)
    keyed = plant_parts_eia.copy()
    keyed["plant_id_report_year_util_id"] = (
        plant_parts_eia.plant_id_report_year + "_" + util_id_as_str
    )
    keyed = keyed.astype({"installation_year": "float"})
    # keep true granularities whose ownership is not a duplicate
    is_distinct = keyed["true_gran"] & ~keyed["ownership_dupe"]
    return keyed[is_distinct]

In [None]:
# Pull the EIA plant parts list from the PUDL output object.
# NOTE(review): this overwrites any plant_parts_eia loaded from the pickle in the earlier cell.
plant_parts_eia = pudl_out.plant_parts_eia()

In [None]:
# Patch: a bug in the PPL generation left one duplicated record, so keep only
# the first row for every index label before deriving the distinct list.
# NOTE(review): plant_parts_eia_distinct is not referenced again in the visible cells.
plant_parts_eia = plant_parts_eia.loc[
    lambda ppl: ~ppl.index.duplicated(keep="first")
]
plant_parts_eia_distinct = get_plant_parts_distinct(plant_parts_eia)

In [None]:
# It's no longer necessary to remove columns to save on memory,
# but these columns seemed non-essential for Panda matching.
ppl_cols_to_remove = {
    'appro_part_label',
    'appro_record_id_eia',
    'operational_status',
    'operational_status_pudl',
    'ownership_dupe',
    'retirement_date',
    'planned_retirement_date',
    'true_gran',
    'ownership',
    'fraction_owned',
    'record_count'
}
# Walk the existing columns instead of round-tripping through a set: set iteration
# order for strings varies between interpreter runs, which made the column order of
# the frame (and of every CSV/pickle written from it) non-reproducible.
ppl_cols_to_keep = [
    col for col in plant_parts_eia.columns if col not in ppl_cols_to_remove
]
plant_parts_eia = plant_parts_eia[ppl_cols_to_keep]

### Generate FERC side
- Currently this function is taken from the RMI repo: `connect_ferc1_to_eia.InputManager.get_all_ferc1`
- Could add `pudl_rmi` to environment and import this

In [None]:
def get_ferc_plants(pudl_out):
    """Assemble the FERC1 plant table used as the FERC side of the matching.

    ``pudl_out.plants_all_ferc1()`` is left-merged with the fuel columns of
    ``pudl_out.fbp_ferc1()``, run through ``pudl.helpers.convert_cols_dtypes``
    for "ferc1", given a few derived columns, renamed, and indexed by
    ``record_id_ferc1``.

    Args:
        pudl_out: object exposing ``plants_all_ferc1()`` and ``fbp_ferc1()``.

    Returns:
        DataFrame indexed by ``record_id_ferc1``.
    """
    fuel_cols = [
        "report_year",
        "utility_id_ferc1",
        "plant_name_ferc1",
        "utility_id_pudl",
        "fuel_cost",
        "fuel_mmbtu",
        "primary_fuel_by_mmbtu",
    ]
    join_keys = [
        "report_year",
        "utility_id_ferc1",
        "utility_id_pudl",
        "plant_name_ferc1",
    ]
    new_names = {
        "record_id": "record_id_ferc1",
        "opex_plants": "opex_plant",
        "fuel_cost": "total_fuel_cost",
        "fuel_mmbtu": "total_mmbtu",
        "opex_fuel_per_mwh": "fuel_cost_per_mwh",
        "primary_fuel_by_mmbtu": "fuel_type_code_pudl",
    }

    all_plants = pudl_out.plants_all_ferc1()
    fuel_by_plant = pudl_out.fbp_ferc1()[fuel_cols]
    typed = pudl.helpers.convert_cols_dtypes(
        all_plants.merge(fuel_by_plant, on=join_keys, how="left"), "ferc1"
    )

    # composite keys: "<plant_id_pudl>_<report_year>" and the same plus "_<utility_id_pudl>"
    plant_year = typed.plant_id_pudl.map(str) + "_" + typed.report_year.map(str)
    plant_year_util = plant_year + "_" + typed.utility_id_pudl.map(str)

    with_derived = typed.assign(
        # float dtype is needed for the comparison vectors
        installation_year=typed.installation_year.astype("float"),
        plant_id_report_year=plant_year,
        plant_id_report_year_util_id=plant_year_util,
        fuel_cost_per_mmbtu=typed.fuel_cost / typed.fuel_mmbtu,
        heat_rate_mmbtu_mwh=typed.fuel_mmbtu / typed.net_generation_mwh,
    )
    return with_derived.rename(columns=new_names).set_index("record_id_ferc1")

In [None]:
# Build the FERC1 plant table (indexed by record_id_ferc1) from the PUDL output object.
ferc_df = get_ferc_plants(pudl_out)

In [None]:
# Restrict the FERC table to the columns that are useful for matching with EIA.
ferc_df = ferc_df.loc[
    :,
    [
        'report_year',
        'utility_id_pudl',
        'utility_name_ferc1',
        'plant_id_pudl',
        'plant_name_ferc1',
        'capacity_factor',
        'capacity_mw',
        'construction_type',
        'construction_year',
        'installation_year',
        'net_generation_mwh',
        'fuel_cost_per_mwh',
        'plant_capability_mw',
        'plant_type',
        'fuel_cost_per_mmbtu',
        'fuel_type',
        'total_fuel_cost',
        'total_mmbtu',
        'fuel_type_code_pudl',
        'plant_id_report_year',
        'plant_id_report_year_util_id',
        'heat_rate_mmbtu_mwh',
    ],
]

In [None]:
# Save the column-restricted FERC table as a pickle (path is relative to the working directory).
ferc_df.to_pickle("full_ferc.pkl")

### Add on utility name to EIA side

If the latest version of the plant parts list is being used, then construction year and installation year should already be included.

In [None]:
# currently df is intended to be the distinct plant parts list
def add_utility_name(df, pudl_engine):
    """Add a ``utility_name_eia`` column looked up from the ``utilities_eia`` table.

    Rows with a null ``utility_id_eia`` are set aside before the merge and
    re-attached afterwards, so they end up with a null utility name.

    Args:
        df: DataFrame with a ``utility_id_eia`` column (currently intended to
            be the distinct plant parts list). Its index must be unique,
            because the result is reindexed to it.
        pudl_engine: connection handed to ``pd.read_sql`` to read the
            ``utilities_eia`` table.

    Returns:
        A new DataFrame holding the rows of ``df`` in their original order,
        with ``utility_name_eia`` added.

    Raises:
        pandas.errors.MergeError: if ``utility_id_eia`` is not unique in
            ``utilities_eia`` (enforced by ``validate="m:1"``).
    """
    # join on utility_name_eia
    eia_util = pd.read_sql("utilities_eia", pudl_engine)
    eia_util = eia_util.set_index('utility_id_eia')['utility_name_eia']
    missing_util_id = df.utility_id_eia.isnull()
    non_null_df = df[~missing_util_id].merge(
        eia_util,
        how="left",
        left_on='utility_id_eia',
        right_index=True,
        validate="m:1",
    )
    df_util = pd.concat([non_null_df, df[missing_util_id]])
    # The concat pushes the null-utility rows to the bottom. Return the
    # reindexed frame (the original computed it but then returned the
    # unordered one) so callers get the rows back in their original order.
    return df_util.reindex(df.index)

In [None]:
# Attach utility_name_eia to the plant parts list via the utilities_eia table.
plant_parts_eia = add_utility_name(plant_parts_eia, pudl_engine)

### Separate the plant parts list by year

Panda currently has a memory issue, so the inputs are broken out by year.

In [None]:
def separate_ppl_by_year(ppl_distinct, output_dir):
    """Write one ``right.csv`` per report year underneath ``output_dir``.

    For each distinct value of ``ppl_distinct.report_year`` the directory
    ``<output_dir>/ferc_eia_<year>`` is created if it does not exist yet, and
    the rows of that year are written to ``right.csv`` inside it.

    Args:
        ppl_distinct: DataFrame with a ``report_year`` column.
        output_dir: base directory for the per-year folders.
    """
    base_dir = Path(output_dir)
    years = ppl_distinct.report_year.unique()
    for year in years:
        rows_for_year = ppl_distinct.loc[ppl_distinct.report_year == year]
        (base_dir / f"ferc_eia_{year}").mkdir(parents=True, exist_ok=True)
        rows_for_year.to_csv(f"{output_dir}/ferc_eia_{year}/right.csv")

### Prep inputs for just one plant part

In [None]:
# Plant part to prepare inputs for; used below to filter plant_parts_eia.plant_part
# and to name the single-year output zip.
part = "plant"

In [None]:
# Keep only the plant parts list records whose plant_part equals the chosen part.
plant_part_df = plant_parts_eia[plant_parts_eia.plant_part == part]

In [None]:
# When the PPL is broken up into individual plant parts, some columns are almost
# fully null (which ones depends on the part). Drop these columns from both the
# FERC and the EIA side.
def drop_null_cols(eia_df, ferc_df, threshold=.1):
    """Drop columns that are mostly null on the EIA side from both frames.

    The share of null values is computed per column of ``eia_df`` only. Every
    column whose share is at or above ``threshold`` is removed from ``eia_df``
    and, where a column of the same name exists, from ``ferc_df`` as well.
    The remaining columns keep their original order in both outputs.

    Args:
        eia_df: EIA-side DataFrame the null shares are measured on.
        ferc_df: FERC-side DataFrame to prune with the same column names.
        threshold: minimum null fraction (0-1) at which a column is dropped.

    Returns:
        Tuple ``(eia_df, ferc_df)`` restricted to the surviving columns.
    """
    percent_null = eia_df.isnull().mean()
    # shown so the notebook user can see which columns fall above the threshold
    print(percent_null)
    cols_to_drop = set(percent_null[percent_null >= threshold].index)
    # Filter the existing column lists rather than using set differences: set
    # iteration order varies between runs, which made the output column order
    # non-deterministic.
    eia_cols_to_keep = [col for col in eia_df.columns if col not in cols_to_drop]
    ferc_cols_to_keep = [col for col in ferc_df.columns if col not in cols_to_drop]
    return eia_df[eia_cols_to_keep], ferc_df[ferc_cols_to_keep]

In [None]:
# Drop columns that are at least 80% null on the EIA side from both the EIA and FERC frames.
small_part_df, small_ferc_df = drop_null_cols(plant_part_df, ferc_df, threshold=.8)

In [None]:
# plant_part is redundant once the frame is filtered to a single part, so drop it;
# then a small patch: installation_year is a float right now (unclear why), so
# cast it to the nullable integer dtype.
small_part_df = (
    small_part_df
    .drop(columns=["plant_part"])
    .astype({"installation_year": "Int64"})
)

In [None]:
# Not sure where this belongs yet: give the name columns identical labels on the
# EIA and FERC sides, which might help Panda line them up.
plant_part_df = plant_part_df.rename(
    columns=dict(plant_name_eia="plant_name", utility_name_eia="utility_name")
)
ferc_df = ferc_df.rename(
    columns=dict(plant_name_ferc1="plant_name", utility_name_ferc1="utility_name")
)

In [None]:
# Remove the ID columns from the EIA side.
plant_part_df = plant_part_df.drop(
    columns=["plant_part_id_eia", "unit_id_pudl", "utility_id_pudl", "plant_id_pudl"]
)

In [None]:
# Remove the PUDL utility ID column from the FERC side.
ferc_df = ferc_df.drop(columns=["utility_id_pudl"])

In [None]:
# NOTE(review): zip_dfs_for_panda is defined in a later cell of this notebook, so on a
# fresh kernel run top-to-bottom this call raises NameError; run the defining cell first.
# Zip the FERC and EIA records for report years 2019-2020 into one Panda input archive.
years = [2019, 2020]
zip_dfs_for_panda(
    ferc_df[ferc_df.report_year.isin(years)], 
    plant_part_df[plant_part_df.report_year.isin(years)], "19_20_plant")

In [None]:
# Save the current EIA plant parts list and the renamed/trimmed FERC table as pickles
# (paths are relative to the working directory).
plant_parts_eia.to_pickle("full_eia_plant_parts_clean.pkl")
ferc_df.to_pickle("full_ferc_clean.pkl")

In [None]:
# Display the FERC columns that remain after the selection, rename and drop steps above.
ferc_df.columns

### Zip up FERC and EIA to be ready for Panda import

In [None]:
def zip_dfs_for_panda(ferc_df, eia_df, zip_name):
    """Write the FERC and EIA inputs as CSVs into ``panda_inputs/<zip_name>.zip``.

    The archive contains ``left.csv`` (FERC side) and ``right.csv`` (EIA
    side). The target directory is created when missing; an existing archive
    of the same name is overwritten.

    Args:
        ferc_df: FERC-side data; anything ``pd.DataFrame`` accepts.
        eia_df: EIA-side data; anything ``pd.DataFrame`` accepts.
        zip_name: archive file name without the ``.zip`` extension.
    """
    zip_path = Path("panda_inputs") / f"{zip_name}.zip"
    # ZipFile does not create missing parent directories, so a fresh checkout
    # without panda_inputs/ used to fail with FileNotFoundError.
    zip_path.parent.mkdir(parents=True, exist_ok=True)
    with zipfile.ZipFile(zip_path, "w") as csv_zip:
        csv_zip.writestr("left.csv", pd.DataFrame(ferc_df).to_csv())
        csv_zip.writestr("right.csv", pd.DataFrame(eia_df).to_csv())

In [None]:
# temp: put this here, need to take out a bunch of cols to get under memory limit
eia_drop_cols = [
    "capacity_eoy_mw",
    "energy_source_code_1",
    "ferc_acct_name",
    "generator_id",
    "operating_year",
    "plant_id_eia",
    "plant_name_new",
    "plant_part_id_eia",
    "report_date",
    "utility_id_eia"
]
ferc_drop_cols = [
    "construction_type",
    "plant_capability_mw",
    "total_cost_of_plant"
]
# errors="ignore": some of these columns may already be gone by this point.
# drop_null_cols removes mostly-null columns (which ones depends on the part), and
# "total_cost_of_plant" is not among the columns ferc_df was restricted to earlier,
# so a strict drop raises KeyError.
small_part_df = small_part_df.drop(columns=eia_drop_cols, errors="ignore")
small_ferc_df = small_ferc_df.drop(columns=ferc_drop_cols, errors="ignore")

In [None]:
# Narrow both sides down to the 2020 report year.
smaller_part_df = small_part_df.loc[small_part_df.report_year.eq(2020)]
smaller_ferc_df = small_ferc_df.loc[small_ferc_df.report_year.eq(2020)]

In [None]:
# Write the 2020-only FERC (left.csv) and EIA (right.csv) inputs to panda_inputs/2020_<part>.zip.
zip_dfs_for_panda(smaller_ferc_df, smaller_part_df, f"2020_{part}")

### Look at full records for training data matches

In [None]:
# Load the FERC1-EIA training labels; the next cell reads its record_id_eia,
# record_id_ferc1 and notes columns.
training_labels = pd.read_csv("train_ferc1_eia.csv")

In [None]:
# For every labelled pair, pull in the full EIA record (looked up by record_id_eia)
# and the full FERC record (looked up by record_id_ferc1). FERC columns whose names
# clash with EIA ones get a "_ferc" suffix.
full_records = (
    training_labels[["record_id_eia", "record_id_ferc1", "notes"]]
    .join(plant_parts_eia, on="record_id_eia")
    .join(ferc_df, on="record_id_ferc1", rsuffix="_ferc")
)

In [None]:
# Order the columns alphabetically so each EIA column sits next to its "_ferc" counterpart.
full_records = full_records.reindex(sorted(full_records.columns), axis=1)

In [None]:
# Display the joined training records.
full_records

In [None]:
# Side-by-side EIA and FERC capacity for the matches whose EIA record is a "plant_gen" part.
full_records[full_records.plant_part == "plant_gen"][["capacity_mw", "capacity_mw_ferc"]]

In [None]:
# Show the matches whose generator_id is not null.
full_records[~(full_records.generator_id.isnull())]

In [None]:
# Absolute gap between the EIA and FERC capacities of each labelled match.
cap_diff = (full_records["capacity_mw"] - full_records["capacity_mw_ferc"]).abs()

In [None]:
# Summary statistics of the absolute capacity difference.
cap_diff.describe()

In [None]:
# Same summary restricted to differences below 1000 (presumably MW -- confirm),
# i.e. with the largest gaps left out.
cap_diff[cap_diff < 1000].describe()

In [None]:
# Count how often each (plant_type, technology_description) combination occurs
# among the matches. plant_type comes from the FERC table; technology_description
# is presumably an EIA column -- confirm.
full_records[["plant_type", "technology_description"]].value_counts()

In [None]:
# Count how often each (plant_type, prime_mover_code) combination occurs among the matches.
full_records[["plant_type", "prime_mover_code"]].value_counts()

In [None]:
# Count how often each (plant_type, fuel_type_code_pudl) combination occurs among the matches.
full_records[["plant_type", "fuel_type_code_pudl"]].value_counts()