# Notebook Setup

In [None]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules
# before each cell runs, so edits to local packages are picked up.
%load_ext autoreload
%autoreload 2

In [None]:
# Standard libraries
import logging
import os
import pathlib
import sys

# 3rd party libraries
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import pandas as pd
import seaborn as sns
import sqlalchemy as sa

# Local libraries
import pudl

In [None]:
# Apply seaborn's default theme first; the rcParams overrides below are set
# afterwards so they are not reset by it.
sns.set()
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Default figure size and resolution for every plot in this notebook.
mpl.rcParams['figure.figsize'] = (10,4)
mpl.rcParams['figure.dpi'] = 150
# Let pandas display up to 100 columns / rows before truncating a frame.
pd.options.display.max_columns = 100
pd.options.display.max_rows = 100

In [None]:
# Emit log records on stdout as the bare message text so they show up in
# the cell output.
formatter = logging.Formatter('%(message)s')
handler = logging.StreamHandler(stream=sys.stdout)
handler.setFormatter(formatter)

# Configure the root logger: INFO and above, with the stdout handler as the
# only handler (assignment replaces any handlers already attached).
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.handlers = [handler]

# Define Functions & Constants

## Notebook Constants

In [None]:
# Prime mover codes mapped to descriptive snake_case labels.
PRIME_MOVER_CODE_MAP = dict(
    BA="battery_storage",
    BT="binary_cycle_turbine",
    CA="combined_cycle_steam_part",
    CC="combined_cycle_total_unit",
    CE="compressed_air_storage",
    CP="concentrated_solar_storage",
    CS="combined_cycle_single_shaft",
    CT="combined_cycle_combustion_turbine",
    ES="other_energy_storage",
    FC="fuel_cell",
    FW="flywheel_storage",
    GT="gas_combustion_turbine",
    HA="hydrokinetic_axial_flow_turbine",
    HB="hydrokinetic_wave_buoy",
    HK="other_hydrokinetic",
    HY="hydraulic_turbine",
    IC="internal_combustion_engine",
    PS="pumped_hydraulic_storage",
    OT="other",
    ST="steam_turbine",
    PV="photovoltaic",
    WT="onshore_wind_turbine",
    WS="offshore_wind_turbine",
)

# The subset of prime movers this notebook treats as fossil-fueled, with the
# same labels as in PRIME_MOVER_CODE_MAP.
FOSSIL_PRIME_MOVERS = {
    code: PRIME_MOVER_CODE_MAP[code]
    for code in ("CT", "CS", "CA", "CC", "GT", "IC", "ST")
}

# Generator columns selected when inspecting individual records.
GENS_COLS = [
    # Record identifiers
    "report_date", "plant_id_eia", "plant_name_eia", "unit_id_pudl", "generator_id",
    # Size and technology
    "capacity_mw", "prime_mover_code", "prime_mover",
    # Fuels
    "energy_source_code_1", "energy_source_code_2", "fuel_type_code_pudl",
    # Descriptive attributes
    "technology_description", "associated_combined_heat_power",
]

# Inclusive bounds on report_date used as the defaults for prep_gens().
START_DATE = "2009-01-01"
END_DATE = "2019-01-01"


## `prep_gens()`

In [None]:
def prep_gens(
    pudl_out,
    start_date=START_DATE,
    end_date=END_DATE,
    pm_code_map=FOSSIL_PRIME_MOVERS,
):
    """
    Preprocessing function to look at generators by prime mover and year.

    Args:
        pudl_out: Object providing ``gens_eia860()`` and ``gen_eia923()``
            methods that return DataFrames.
        start_date: Earliest ``report_date`` to keep (inclusive).
        end_date: Latest ``report_date`` to keep (inclusive).
        pm_code_map (dict): Prime mover codes mapped to descriptive labels.
            Only generators whose ``prime_mover_code`` is one of the keys are
            kept. The mapping is only read, never modified.

    Returns:
        pandas.DataFrame: The selected ``gens_eia860`` records with two added
        columns: ``prime_mover`` (the label from ``pm_code_map``) and
        ``net_generation_mwh`` (the generator's summed annual generation from
        ``gen_eia923``; missing where no generation record matches).

    Raises:
        pandas.errors.MergeError: If the generator records or the annual
            generation totals are not unique per
            (report_date, plant_id_eia, generator_id).
    """
    # Columns identifying one generator in one year; used as the join keys.
    id_cols = ["report_date", "plant_id_eia", "generator_id"]

    gens_eia860 = pudl_out.gens_eia860()
    # Generator selection criteria
    mask = (
        (gens_eia860.report_date >= start_date)
        & (gens_eia860.report_date <= end_date)
        & (gens_eia860.prime_mover_code.isin(list(pm_code_map)))
    )
    working_gens = (
        gens_eia860
        .loc[mask]
        .assign(prime_mover=lambda x: x.prime_mover_code.map(pm_code_map))
    )
    # Calculate the annual generation by generator. "YS" (year start) is the
    # non-deprecated spelling of the "AS" offset alias, so each total is
    # labelled with January 1st of its year.
    net_gen_by_gen = (
        pudl_out.gen_eia923()
        .set_index("report_date")
        .groupby([pd.Grouper(freq="YS"), "plant_id_eia", "generator_id"])
        .net_generation_mwh.sum()
        .reset_index()
    )
    # Merge annual generation by generator into the working DF. The join keys
    # are spelled out so an unrelated column shared by both frames can never
    # silently become part of the join.
    return pd.merge(
        working_gens,
        net_gen_by_gen,
        on=id_cols,
        how="left",
        validate="1:1",
    )

## `unit_gen_coverage()`

In [None]:
def unit_gen_coverage(prepped_gens):
    """
    Characterize generator-level PUDL Unit ID coverage by year.

    For each year and prime mover, calculate:

    * number & fraction of generator_id values that have a unit_id_pudl
    * sum and fraction of overall capacity (MW) that has a unit_id_pudl
    * sum and fraction of overall generation (MWh) associated with a
      unit_id_pudl in the generation_eia923 table

    Args:
        prepped_gens (pandas.DataFrame): Generator records with the columns
            report_date, prime_mover, unit_id_pudl, generator_id, capacity_mw
            and net_generation_mwh. The input is not modified.

    Returns:
        pandas.DataFrame: Indexed by (report_date, prime_mover). Columns are
        two-level: for each of num_gens, capacity_mw and net_generation_mwh
        there is a ``False`` and a ``True`` column (without / with a Unit
        ID), plus ``<metric>_fraction`` columns holding the share of each
        metric that has a Unit ID. Both the ``False`` and ``True`` columns
        are always present, filled with 0 when no record falls in that class.
    """
    metrics = ["num_gens", "capacity_mw", "net_generation_mwh"]
    # A boolean column indicating whether a record has a PUDL Unit ID
    working_gens = prepped_gens.assign(
        has_unit_id=lambda x: x.unit_id_pudl.notna()
    )

    def tot_frac(df, col):
        # Share of `col` in the True class relative to the True + False total.
        return df[(col, True)] / df[col].sum(axis="columns")

    return (
        working_gens.groupby(["report_date", "prime_mover", "has_unit_id"])
        .agg(
            num_gens=pd.NamedAgg(column="generator_id", aggfunc="size"),
            capacity_mw=pd.NamedAgg(column="capacity_mw", aggfunc="sum"),
            net_generation_mwh=pd.NamedAgg(column="net_generation_mwh", aggfunc="sum"),
        )
        .unstack(fill_value=0)
        # unstack() only creates columns for classes that occur in the data.
        # Guarantee both classes exist so the fraction calculation cannot hit
        # a missing (metric, True) column when nothing has a Unit ID.
        .reindex(
            columns=pd.MultiIndex.from_product(
                [metrics, [False, True]], names=[None, "has_unit_id"]
            ),
            fill_value=0,
        )
        .assign(
            num_gens_fraction=lambda x: tot_frac(x, "num_gens"),
            capacity_mw_fraction=lambda x: tot_frac(x, "capacity_mw"),
            net_generation_mwh_fraction=lambda x: tot_frac(x, "net_generation_mwh"),
        )
    )

## `chp_prevalence()`

In [None]:
def chp_prevalence(gens_df):
    """
    Assess the prevalence of Combined Heat & Power in Generators.

    Break down generators by year and prime mover, and within each group
    calculate the proportion and total quantity associated with CHP per

    * number of generators
    * capacity (MW)
    * generation (MWh) in the generation_eia923 table

    Records whose CHP flag is missing are counted as *not* CHP.

    Args:
        gens_df (pandas.DataFrame): Generator records with the columns
            report_date, prime_mover, associated_combined_heat_power,
            generator_id, capacity_mw and net_generation_mwh. The input is
            not modified.

    Returns:
        pandas.DataFrame: Indexed by (report_date, prime_mover). Columns are
        two-level: for each of num_gens, capacity_mw and net_generation_mwh
        there is a ``False`` and a ``True`` column (not CHP / CHP), plus
        ``<metric>_fraction`` columns holding the CHP share of each metric.
        Both the ``False`` and ``True`` columns are always present, filled
        with 0 when no record falls in that class.
    """
    metrics = ["num_gens", "capacity_mw", "net_generation_mwh"]
    # A boolean column indicating whether a record is associated with CHP.
    # A plain astype(bool) would turn a missing NaN flag into True (and fails
    # on nullable booleans containing NA), so go through the nullable boolean
    # dtype and explicitly treat missing as False. The column keeps its
    # original name because it becomes a level name in the returned frame.
    gens_working = gens_df.assign(
        ass_chap=lambda x: (
            x.associated_combined_heat_power
            .astype("boolean")
            .fillna(False)
            .astype(bool)
        )
    )

    def tot_frac(df, col):
        # Share of `col` in the True class relative to the True + False total.
        return df[(col, True)] / df[col].sum(axis="columns")

    return (
        gens_working.groupby(["report_date", "prime_mover", "ass_chap"])
        .agg(
            num_gens=pd.NamedAgg(column="generator_id", aggfunc="size"),
            capacity_mw=pd.NamedAgg(column="capacity_mw", aggfunc="sum"),
            net_generation_mwh=pd.NamedAgg(column="net_generation_mwh", aggfunc="sum"),
        )
        .unstack(fill_value=0)
        # unstack() only creates columns for classes that occur in the data.
        # Guarantee both classes exist so the fraction calculation cannot hit
        # a missing (metric, True) column when no generator does CHP.
        .reindex(
            columns=pd.MultiIndex.from_product(
                [metrics, [False, True]], names=[None, "ass_chap"]
            ),
            fill_value=0,
        )
        .assign(
            num_gens_fraction=lambda x: tot_frac(x, "num_gens"),
            capacity_mw_fraction=lambda x: tot_frac(x, "capacity_mw"),
            net_generation_mwh_fraction=lambda x: tot_frac(x, "net_generation_mwh"),
        )
    )

# Assess Generators

## Create PUDL output object

In [None]:
# Load the default PUDL workspace settings and show them for reference.
pudl_settings = pudl.workspace.setup.get_defaults()
display(pudl_settings)

# SQLAlchemy engines for the FERC Form 1 and PUDL databases named in the
# settings.
ferc1_engine = sa.create_engine(pudl_settings["ferc1_db"])
pudl_engine = sa.create_engine(pudl_settings["pudl_db"])

# Read the EIA API key from the environment; raises KeyError if it is unset.
API_KEY_EIA = os.environ["API_KEY_EIA"]

# Output object used by the rest of the notebook to build tables.
pudl_out = pudl.output.pudltabl.PudlTabl(pudl_engine=pudl_engine, freq="MS")

## Examine Existing Unit ID Coverage
Only a fraction of all fossil generation is associated with PUDL Unit IDs, which are integral to our most granular heat rate determination. We need to understand how good the coverage is, and why the generators / generation we're missing aren't covered. There are at least 3 ways we can evaluate this coverage at the generator level, based on information in the `generators_eia860` table:
* by generator ID
* by capacity (MW)
* by generation (MWh)

These can be broken down by:
* Time (look at annual quantities and how they've changed)
* Prime mover code (useful since it's in both generation and generation_fuel tables)
* Whether the generators appear in the `generation_eia923` table
* Whether the generators are associated with CHP
  * Are there plants where all generators do CHP? Would be useful for benchmarking heat rate impacts.
* Primary fuel (not totally well defined, since there are many mixed-fuel generators)

Values to report on those breakdowns:
* Absolute totals (# of IDs, MW, or MWh)
* Fraction of reported totals

Can also look at just the population of generators which report in the `generation_eia923` table each year, since those are the only ones where we will really have generator level electricity output and boiler level fuel inputs that can be directly linked with the `unit_id_pudl` if it exists. Many of the generators without IDs will likely be generators without boilers. Assigning them Unit IDs will be useful insofar as they still have fuel inputs which are reported (probably only in `generation_fuel_eia923`) and so we can assess both their inputs and outputs. Or insofar as we are using the `unit_id_pudl` values to do aggregations on tables of generators, which are sometimes linked to each other even if they don't have boilers involved in that linkage.

We want an assessment that gets at the above information, and which can be run on both existing and potential new ID assignments.

## Summarize Generator Unit Coverage

In [None]:
# Select the fossil generators reported within the study period, then
# summarize how much of them is covered by PUDL Unit IDs.
prepped_gens = prep_gens(
    pudl_out, start_date=START_DATE, end_date=END_DATE, pm_code_map=FOSSIL_PRIME_MOVERS
)
unit_summary = unit_gen_coverage(prepped_gens)

## Display Results

### Absolute Unit Coverage (N, MW, MWh)

In [None]:
# One line chart per metric, showing the quantity that has a PUDL Unit ID
# (the True column) over time, colored by prime mover.
for metric in ("num_gens", "capacity_mw", "net_generation_mwh"):
    sns.relplot(x="report_date", y=(metric, True), data=unit_summary, hue="prime_mover", kind="line")
plt.show();

### Proportional Unit Coverage

In [None]:
# One line chart per metric, showing the fraction that has a PUDL Unit ID
# over time, colored by prime mover.
for fraction_col in ("num_gens_fraction", "capacity_mw_fraction", "net_generation_mwh_fraction"):
    sns.relplot(x="report_date", y=fraction_col, data=unit_summary, hue="prime_mover", kind="line")
plt.show();

## CHP Prevalence
* What fraction of generators, capacity, and generation are associated with generators that also do CHP?

In [None]:
# Summarize CHP prevalence for the same generators used in the Unit ID
# coverage analysis; the bare name on the last line displays the table.
chp_summary = chp_prevalence(prepped_gens)
chp_summary

In [None]:
# Show the generator records whose CHP flag is missing. `gens_df` only exists
# as a parameter name inside chp_prevalence(); the frame available at notebook
# scope is `prepped_gens`.
prepped_gens.loc[prepped_gens.associated_combined_heat_power.isna(), GENS_COLS]

# Improve PUDL Unit ID Coverage

## Identify new types of Unit IDs
* Plants with only a single generator (of any kind)
* Combined Cycle plants w/ a single pair of combustion + steam turbines
* Any single-shaft Combined Cycle generator (always independent?)
* Agglomerations of all combined-cycle generators lacking a PUDL ID within a plant

## Assign and label new Unit IDs

## Assess New Unit ID Impacts on coverage
* Run the same assessment functions as we did above

In [None]:
# Always raises AssertionError. Presumably a deliberate stop so that running
# all cells halts before the unfinished sections below -- TODO confirm.
assert False

# Assess Existing Heat Rates
* We need some way to compare different ways of calculating heat rates and choose between them.
* We want to check both for their correctness, and their completeness.
* Apparent correctness will depend on the type of generator / unit type. Need to define different expectations.

In [None]:
%%time
# Build the unit-level heat rate table (timed by the cell magic above, which
# must stay on the first line of the cell) and print its column summary.
hr_by_unit = pudl_out.hr_by_unit()
hr_by_unit.info()

## Compile Net Generation
* **By generator**
  * `net_generation_mwh` available from `generation_eia923` table
  * Associated with `plant_id_eia` and `generator_id` columns directly.
  * Can be associated with `unit_id_pudl` if the generator is part of an identified unit
  * Can be associated with a `technology_description` and `prime_mover_code` based on the `generators_eia860` table.
  * Can be associated with a list of energy sources based on `energy_source_N` in `generators_eia860` table.
* **By plant-prime-fuel**
  * `net_generation_mwh` is available from `generation_fuel_eia923`

## Compile Fuel Consumption
* **By boiler**
  * `total_heat_content_mmbtu` and `fuel_type_code_pudl` are available by `plant_id_eia` and `boiler_id` in `boiler_fuel_eia923`
  * This value can be associated with a `unit_id_pudl` based on the BGA table.
  * The heat consumed in here includes fuels both for electricity and steam (direct heat) outputs.
* **By plant-prime-fuel**
  * Available in `generation_fuel_eia923`, broken down separately for CHP vs. electricity.

## Identify Combined Cycle Units
* Combined Cycle turbines show up in the generators table, but have no "boiler" so they don't end up in the boiler-generator-association
* This means they don't get assigned `unit_id_pudl` values and are often lost.
* However, they are identifiable based on `technology_description` in the generators table, and so can be associated with a plant.
* Within a given plant, it's possible to combine all the natural gas that goes into a 

## Questions:
* Do all of the generators that show up in the generation table end up getting PUDL Unit IDs assigned?
* Where are the heat inputs being reported for combined cycle units? Do they really exist after 2015? What is still missing?
* Triage units / generators into: Easy, Hard, and Impossible. Work on Hard ones until diminishing returns. Assign impossible and too-hard ones the median values.

## Tables of Interest:
* `generation_eia923`
* `generation_fuel_eia923`
* `boiler_generator_assn_eia860`
* `generators_eia860`
* `generators_entity_eia`
* `boiler_fuel_eia923`

In [None]:
# Generator attributes by report date from the generators_eia860 table.
# The last selected column must not be followed by a comma, otherwise the
# statement fails to parse.
gens_eia860_sql = """
SELECT report_date,
       plant_id_eia,
       generator_id,
       capacity_mw,
       energy_source_code_1,
       energy_source_code_2,
       energy_source_code_3,
       energy_source_code_4,
       energy_source_code_5,
       energy_source_code_6,
       fuel_type_code_pudl,
       technology_description
FROM generators_eia860
"""

# Generator attributes from the generators_entity_eia table. The CHP column
# is spelled `associated_combined_heat_power`, matching the column name used
# by the rest of this notebook, and the last selected column must not be
# followed by a comma.
# NOTE(review): confirm that generators_entity_eia actually carries a
# report_date column before running this query.
gens_entity_sql = """
SELECT report_date,
       plant_id_eia,
       generator_id,
       prime_mover_code,
       bypass_heat_recovery,
       associated_combined_heat_power
FROM generators_entity_eia
"""

# Net generation reported per generator (generation_eia923).
gen_eia923_sql = """
SELECT report_date,
       plant_id_eia,
       generator_id,
       net_generation_mwh
FROM generation_eia923
"""

# Fuel consumption and net generation by plant, prime mover and fuel type
# (generation_fuel_eia923).
gf_eia923_sql = """
SELECT report_date,
       plant_id_eia,
       nuclear_unit_id,
       fuel_type,
       fuel_type_code_pudl,
       prime_mover_code,
       fuel_consumed_mmbtu,
       fuel_consumed_for_electricity_mmbtu,
       net_generation_mwh
FROM generation_fuel_eia923
"""

# Boiler-generator associations, including the PUDL Unit ID linking them.
bga_sql = """
SELECT report_date,
       plant_id_eia,
       unit_id_pudl,
       generator_id,
       boiler_id
FROM boiler_generator_assn_eia860
"""

# Fuel consumed per boiler (boiler_fuel_eia923).
bf_eia923_sql = """
SELECT report_date,
       plant_id_eia,
       boiler_id,
       fuel_type_code,
       fuel_type_code_pudl,
       fuel_consumed_units,
       fuel_mmbtu_per_unit
FROM boiler_fuel_eia923
"""


## Compile Fuel Consumption
  * Reported by boiler in `boiler_fuel_eia923` and so can be associated 

# Preliminary Data Wrangling
Once all of the data is loaded and looks like it's in good shape, do any initial wrangling that's specific to this particular analysis. This should mostly make use of the higher level functions which were defined above. If this step takes a while, don't be shy about producing `logging` outputs.

# Data Analysis and Visualization
* Now that you've got the required data in a usable form, you can tell the story of your analysis through a mix of visualizations, and further data wrangling steps.
* This narrative should be readable, with figures that have titles, legends, and labeled axes as appropriate so others can understand what you're showing them.
* The code should be concise and make use of the parameters and functions which you've defined above when possible. Functions should contain comprehensible chunks of work that make sense as one step in the story of the analysis.

In [None]:
# Plot the distribution of coal heat rates, passing max_val=20.
# NOTE(review): `finite_distplot` and `mcoe_coal` are not defined anywhere in
# the visible notebook -- presumably left over from a template; confirm they
# exist before running this cell.
coal_ax = finite_distplot(mcoe_coal, "heat_rate_mmbtu_mwh", max_val=20)
plt.title("Coal heat rate distribution");

In [None]:
# Plot the distribution of gas heat rates, passing max_val=20.
# NOTE(review): `finite_distplot` and `mcoe_gas` are not defined anywhere in
# the visible notebook -- presumably left over from a template; confirm they
# exist before running this cell.
gas_ax = finite_distplot(mcoe_gas, "heat_rate_mmbtu_mwh", max_val=20)
plt.title("Gas heat rate distribution");