# Notebook Preamble

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# Standard libraries
import logging
import os
import pathlib
import sys

# 3rd party libraries
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import pandas as pd
import seaborn as sns
import sqlalchemy as sa

# Local libraries
import pudl
import pudl.constants as pc

In [3]:
# Apply seaborn's default theme to every matplotlib figure drawn below.
sns.set()
# Render figures inline in the notebook output.
%matplotlib inline
# Default figure size and resolution for all plots in this notebook.
mpl.rcParams['figure.figsize'] = (10,4)
mpl.rcParams['figure.dpi'] = 100
# Let pandas display up to 100 columns / rows before truncating a dataframe.
pd.options.display.max_columns = 100
pd.options.display.max_rows = 100

In [4]:
# Emit bare log messages (no timestamp or level prefix) on stdout so they
# appear in the cell output.
formatter = logging.Formatter('%(message)s')
handler = logging.StreamHandler(stream=sys.stdout)
handler.setFormatter(formatter)

# Configure the root logger. Its handler list is replaced rather than
# appended to, so re-running this cell does not duplicate every message.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.handlers = [handler]

# Function Definitions

## `prep_gens()`

In [5]:
def prep_gens(pudl_out, prime_mover_codes):
    """
    Preprocessing function to look at generators by prime mover and year.

    Selects the generator records whose ``prime_mover_code`` is one of
    ``prime_mover_codes`` and attaches each generator's annual net
    generation, summed from the generation records.

    Args:
        pudl_out: Object exposing ``gens_eia860()`` and ``gen_eia923()``
            methods that return dataframes.
        prime_mover_codes: Collection of prime mover codes to keep.

    Returns:
        pandas.DataFrame: The selected generator records plus a
        ``net_generation_mwh`` column. The value is NaN for generators with
        no matching generation records in that year.

    Raises:
        pandas.errors.MergeError: If the generator/year keys are not unique
            on both sides of the merge (``validate="1:1"``).
    """
    # Columns identifying one generator in one reporting year.
    id_cols = ["report_date", "plant_id_eia", "generator_id"]

    gens_eia860 = pudl_out.gens_eia860()
    # Generator selection criteria
    working_gens = gens_eia860.loc[gens_eia860.prime_mover_code.isin(prime_mover_codes)]
    # Calculate the annual generation by generator. "YS" (year start) is the
    # supported spelling of the deprecated "AS" alias; both label each bin
    # with January 1st of the year.
    # NOTE(review): the merge below therefore only matches generator records
    # whose report_date is also January 1st -- confirm for gens_eia860().
    net_gen_by_gen = (
        pudl_out.gen_eia923()
        .set_index("report_date")
        .groupby([pd.Grouper(freq="YS"), "plant_id_eia", "generator_id"])
        .net_generation_mwh.sum()
        .to_frame()
        .reset_index()
    )
    # Merge annual generation by generator into the working DF. The keys are
    # spelled out so that an unrelated shared column can never silently
    # become part of the join.
    return pd.merge(
        working_gens,
        net_gen_by_gen,
        on=id_cols,
        how="left",
        validate="1:1",
    )

## `unit_gen_coverage()`

In [6]:
def unit_gen_coverage(prepped_gens):
    """
    Characterize generator-level PUDL Unit ID coverage by year.

    For each year and prime mover code present in the input, calculate:

    * number & fraction of generator records that have a unit_id_pudl
    * sum and fraction of overall capacity (MW) that has a unit_id_pudl
    * sum and fraction of overall generation (MWh) associated with a
      unit_id_pudl

    Args:
        prepped_gens: Dataframe with ``report_date``, ``prime_mover_code``,
            ``generator_id``, ``unit_id_pudl``, ``capacity_mw`` and
            ``net_generation_mwh`` columns. It is not modified.

    Returns:
        pandas.DataFrame: Indexed by (report_date, prime_mover_code). The
        columns are a two-level index of (metric, has_unit_id) holding the
        totals with and without a Unit ID, plus one ``<metric>_fraction``
        column per metric. A fraction is NaN where the group total is zero.
    """
    metrics = ["num_gens", "capacity_mw", "net_generation_mwh"]
    # Always expose both a False and a True column for every metric. Without
    # this, input in which no record (or every record) has a Unit ID yields
    # only one of the two columns and the fraction lookup raises KeyError.
    both_flags = pd.MultiIndex.from_product(
        [metrics, [False, True]], names=[None, "has_unit_id"]
    )

    working_gens = prepped_gens.copy()
    # A boolean column indicating whether a record has a PUDL Unit ID
    working_gens["has_unit_id"] = working_gens.unit_id_pudl.notna()

    def tot_frac(df, col):
        # Share of the group total that belongs to records with a Unit ID.
        return df.loc[:, (col, True)] / df[col].sum(axis="columns")

    return (
        working_gens.groupby(["report_date", "prime_mover_code", "has_unit_id"])
        .agg(
            num_gens=pd.NamedAgg(column="generator_id", aggfunc="size"),
            capacity_mw=pd.NamedAgg(column="capacity_mw", aggfunc="sum"),
            net_generation_mwh=pd.NamedAgg(column="net_generation_mwh", aggfunc="sum"),
        )
        # Pivot has_unit_id into the columns; absent combinations count as 0.
        .unstack(fill_value=0)
        .reindex(columns=both_flags, fill_value=0)
        .assign(
            num_gens_fraction=lambda x: tot_frac(x, "num_gens"),
            capacity_mw_fraction=lambda x: tot_frac(x, "capacity_mw"),
            net_generation_mwh_fraction=lambda x: tot_frac(x, "net_generation_mwh"),
        )
    )

## `chp_prevalence()`

In [7]:
def chp_prevalence(gens_df):
    """
    Assess the prevalence of Combined Heat & Power in Generators.

    Break down generators by year and prime mover, and within each group
    calculate the proportion and total quantity associated with CHP per

    * number of generators
    * capacity (MW)
    * generation (MWh)

    Records whose CHP flag is missing are counted as *not* associated with
    CHP, so they still contribute to each group's total.

    Args:
        gens_df: Dataframe with ``report_date``, ``prime_mover_code``,
            ``generator_id``, ``associated_combined_heat_power``,
            ``capacity_mw`` and ``net_generation_mwh`` columns. It is not
            modified.

    Returns:
        pandas.DataFrame: Indexed by (report_date, prime_mover_code). The
        columns are a two-level index of (metric, ass_chap) holding the
        totals with and without CHP, plus one ``<metric>_fraction`` column
        per metric. A fraction is NaN where the group total is zero.
    """
    metrics = ["num_gens", "capacity_mw", "net_generation_mwh"]
    # Always expose both a False and a True column for every metric. Without
    # this, input with no CHP generators (or only CHP generators) yields only
    # one of the two columns and the fraction lookup raises KeyError.
    both_flags = pd.MultiIndex.from_product(
        [metrics, [False, True]], names=[None, "ass_chap"]
    )

    gens_working = gens_df.copy()
    # A boolean column indicating whether a record is associated with CHP.
    # Going through the nullable "boolean" dtype keeps missing values missing
    # until they are explicitly filled: a plain astype(bool) turns NaN into
    # True and raises outright on nullable columns that contain NA.
    gens_working["ass_chap"] = (
        gens_working.associated_combined_heat_power
        .astype("boolean")
        .fillna(False)
        .astype(bool)
    )

    def tot_frac(df, col):
        # Share of the group total that belongs to CHP records.
        return df.loc[:, (col, True)] / df[col].sum(axis="columns")

    return (
        gens_working.groupby(["report_date", "prime_mover_code", "ass_chap"])
        .agg(
            num_gens=pd.NamedAgg(column="generator_id", aggfunc="size"),
            capacity_mw=pd.NamedAgg(column="capacity_mw", aggfunc="sum"),
            net_generation_mwh=pd.NamedAgg(column="net_generation_mwh", aggfunc="sum"),
        )
        # Pivot the CHP flag into the columns; absent combinations count as 0.
        .unstack(fill_value=0)
        .reindex(columns=both_flags, fill_value=0)
        .assign(
            num_gens_fraction=lambda x: tot_frac(x, "num_gens"),
            capacity_mw_fraction=lambda x: tot_frac(x, "capacity_mw"),
            net_generation_mwh_fraction=lambda x: tot_frac(x, "net_generation_mwh"),
        )
    )

## `plot_unit_ids()`

In [8]:
def plot_unit_ids(df, pm_codes):
    """
    Plot how many generator records have a Unit ID, by report date.

    Draws one line per prime mover code on the current matplotlib figure,
    adds a legend and y-axis label, and shows the figure.

    Args:
        df: Dataframe with ``prime_mover_code``, ``report_date`` and
            ``unit_id_pudl`` columns.
        pm_codes: Iterable of prime mover codes; one line is drawn for each.
    """
    for code in pm_codes:
        # count() ignores nulls, so this is the number of records per report
        # date that actually carry a Unit ID.
        unit_id_counts = (
            df[df.prime_mover_code == code]
            .groupby("report_date")["unit_id_pudl"]
            .count()
        )
        # Line width and marker size are numbers; passing them as strings only
        # worked because matplotlib coerces them with float().
        plt.plot(unit_id_counts, label=code, linewidth=2, markersize=4, marker="o")
    plt.legend(loc="upper left")
    plt.ylabel("Generator Records with Unit IDs")
    plt.show()

# Pull data & set constants

## Notebook Constants

In [9]:
# Prime mover codes treated as thermal generators in this notebook.
THERMAL_PRIME_MOVERS = [
    "CT",
    "CS",
    "CA",
    "CC",
    "GT",
    "IC",
    "ST",
]

# Generator table columns of interest. Entries that are commented out are
# currently excluded; uncomment one to include it at that position.
GENS_COLS = [
    "report_date",
    "plant_id_eia",
    # "plant_name_eia",
    "unit_id_pudl",
    "bga_source",
    "generator_id",
    # "capacity_mw",
    "prime_mover_code",
    # "energy_source_code_1",
    # "energy_source_code_2",
    "fuel_type_code_pudl",
    # "technology_description",
    # "associated_combined_heat_power",
]


## Create PUDL output object

In [113]:

from pudl.workspace.setup import PudlPaths

# SQLAlchemy engines for the FERC Form 1 and PUDL databases. The connection
# strings come from PudlPaths (presumably SQLite URLs -- verify).
# TODO(janrous): provide property for accessing ferc db?
ferc1_engine = sa.create_engine(PudlPaths().sqlite_db_uri("ferc1"))
pudl_engine = sa.create_engine(PudlPaths().pudl_db)

# Raises KeyError if API_KEY_EIA is not set in the environment.
# NOTE(review): neither API_KEY_EIA nor ferc1_engine is used by any cell
# visible in this notebook -- confirm they are still needed.
API_KEY_EIA = os.environ["API_KEY_EIA"]

# Output object whose methods (gens_eia860, gen_eia923, gf_eia923,
# hr_by_unit) supply the dataframes used in the cells below.
pudl_out = pudl.output.pudltabl.PudlTabl(pudl_engine)

{'pudl_in': '/home/zane/code/catalyst/pudl-work',
 'data_dir': '/home/zane/code/catalyst/pudl-work/data',
 'settings_dir': '/home/zane/code/catalyst/pudl-work/settings',
 'pudl_out': '/home/zane/code/catalyst/pudl-work',
 'sqlite_dir': '/home/zane/code/catalyst/pudl-work/sqlite',
 'parquet_dir': '/home/zane/code/catalyst/pudl-work/parquet',
 'datapkg_dir': '/home/zane/code/catalyst/pudl-work/datapkg',
 'ferc1_db': 'sqlite:////home/zane/code/catalyst/pudl-work/sqlite/ferc1.sqlite',
 'pudl_db': 'sqlite:////home/zane/code/catalyst/pudl-work/sqlite/pudl.sqlite',
 'censusdp1tract_db': 'sqlite:////home/zane/code/catalyst/pudl-work/sqlite/censusdp1tract.sqlite'}

# Remaining PUDL Unit ID questions:
* What is the output table describing these units going to look like?
* Should it be several different well normalized tables defining different kinds of Unit IDs?
* Should it be a single un-normalized table?
* Should we back/forward fill the technology descriptions and prime mover codes? Esp. in older years?
* Should we fill in pseudo-boiler IDs for the units that we've created, like the latter years of CCNG plants do?
* Is every boiler that we know of (in the boiler entity table) mapped to generators in the BGA table? Or are there some orphaned, unassociated boilers?
* Should we make this more extensive Unit ID assignment process optional in the generators_eia860 output table?

In [117]:
%%time
# Start from a fresh output object, then pull the generators table with
# unit_ids=True (presumably this switches on the extended Unit ID
# assignment reported in the log output of this cell -- confirm).
pudl_out = pudl.output.pudltabl.PudlTabl(pudl_engine)
gens_df = pudl_out.gens_eia860(unit_ids=True)

Selected 172517 ['CC', 'CS', 'GT', 'IC'] records lacking Unit IDs from 403834 records overall. 
Selected 1891 ['ST'] records lacking Unit IDs from 403834 records overall. 
Selected 3836 ST records lacking Unit IDs burning coal from 403834 records overall.
Selected 1299 ST records lacking Unit IDs burning oil from 403834 records overall.
Selected 4966 ST records lacking Unit IDs burning gas from 403834 records overall.
Selected 2256 ST records lacking Unit IDs burning waste from 403834 records overall.
CPU times: user 2min 23s, sys: 5.85 s, total: 2min 28s
Wall time: 2min 29s


In [13]:
# Always raises AssertionError -- presumably a deliberate stop so that
# "Run All" halts here before the unfinished cells below; confirm.
assert False

AssertionError: 

# Net Generation Allocation

In [None]:
# Columns of interest from the generation_fuel_eia923 data. The fuel column
# is spelled the way every other cell and query in this notebook spells it
# (fuel_consumed_for_electricity_mmbtu).
gf_cols = [
    "plant_id_eia",
    "report_date",
    "energy_source_code",
    "prime_mover_code",
    "net_generation_mwh",
    "fuel_consumed_for_electricity_mmbtu",
]

# Columns of interest from the generators data, including all six
# energy source code slots.
gens_cols = [
    "plant_id_eia",
    "generator_id",
    "report_date",
    "energy_source_code_1",
    "energy_source_code_2",
    "energy_source_code_3",
    "energy_source_code_4",
    "energy_source_code_5",
    "energy_source_code_6",
    "capacity_mw",
    "prime_mover_code",
]

# Cull fully reported units
* In some plants, all of the generators report all of their fuel consumption and electricity generation in the `generation_eia923` and `boiler_fuel_eia923` tables. If they also don't have CHP, our current heat rate calculation should fully accommodate these plants already.
* In other plants, all of the electricity generation and fuel consumption being reported in association with a particular type of prime mover (but maybe not all prime movers) are fully covered by the more granular boiler / generator reporting, and so those plant-prime combinations can be removed from the heat rate calculations based on the `generation_fuel_eia923` table data. So long as there's no CHP. This is a more general case of the bullet above, so maybe we should just do this as the first cut.
* To identify these cases, we need to aggregate net generation and fuel consumption on the basis of `unit_id_pudl` and identify cases in which all generators of any prime mover type that is involved have been included in that aggregation, on an annual basis. In these cases the net generation and fuel consumption associated with those prime mover types can be safely removed from the `generation_fuel_eia923` table, leaving only fuel and electricity that hasn't been accounted for, or is only partially accounted for.

# Assess New Unit ID coverage
* Run the same assessment functions as we did above
* Per generator
* Per MW installed
* Per MWh of net generation

# Heat rates by Plant-Prime
* The `generation_fuel_eia923` table breaks down net generation & fuel consumption by plant and prime mover.
* Can one calculate realistic heat rates on the basis of plant-prime? Or do they always need to have units?
* Calculate the distribution of plant-prime heat rates and plot them to see what they look like.
* This strategy probably won't work, in which case we'll need to do some kind of grouping into pseudo-units.

In [None]:
# Pull the table returned by pudl_out.gf_eia923() (presumably the
# generation_fuel_eia923 data discussed above -- confirm).
gen_fuel = pudl_out.gf_eia923()

In [None]:
# Display how many records were loaded.
len(gen_fuel)

In [None]:
# Always raises AssertionError -- presumably a deliberate stop so that
# "Run All" halts here; confirm.
assert False

# Combined Heat and Power
* What fraction of generators, capacity, and generation are associated with generators that also do CHP?

In [None]:
gf_eia923 = pudl_out.gf_eia923()
# Share of each record's total fuel that was used to generate electricity.
# A zero denominator yields NaN (0/0) or inf rather than an error.
gf_eia923["fuel_ratio"] = (
    gf_eia923.fuel_consumed_for_electricity_mmbtu / gf_eia923.fuel_consumed_mmbtu
)
# Flag records where essentially all fuel went to electricity. The array is
# assigned positionally: wrapping it in pd.Series would give it a fresh
# 0..n-1 index, and the label-aligned assignment would then silently produce
# NaN wherever gf_eia923's own index is anything else.
gf_eia923["no_chp"] = np.isclose(gf_eia923.fuel_ratio, 1.0)
plt.hist(gf_eia923.fuel_ratio, bins=20)
plt.yscale("log")
plt.xlabel("Fraction of fuel used for electricity")
plt.ylabel("Number of Records (log scale)");

In [None]:
# Annual totals: fuel burned for electricity vs. all fuel burned.
fuel_cols = ["fuel_consumed_for_electricity_mmbtu", "fuel_consumed_mmbtu"]
annual_fuel = gf_eia923.groupby("report_date")[fuel_cols].sum()
annual_fuel.plot()
plt.ylim(0, 5e9)
plt.ylabel("Fuel Consumed [MMBTU]");

In [None]:
# Log-log scatter of total fuel vs. fuel used for electricity, one very
# faint point per record so that dense regions show up as dark areas.
axis_limits = (1e3, 1e8)
plt.figure(figsize=(10, 10))
plt.scatter(
    x=gf_eia923.fuel_consumed_mmbtu,
    y=gf_eia923.fuel_consumed_for_electricity_mmbtu,
    s=1,
    alpha=0.01,
    color="black",
)
plt.xscale("log")
plt.yscale("log")
plt.xlim(*axis_limits)
plt.ylim(*axis_limits)
plt.xlabel("Total Fuel Consumed [MMBTU]")
plt.ylabel("Fuel Consumed for Electricity [MMBTU]");

In [None]:
# No earlier cell defines prepped_gens, so build it here: the thermal
# generator records with annual net generation attached, which is the input
# chp_prevalence() needs.
# NOTE(review): THERMAL_PRIME_MOVERS is assumed to be the intended selection.
prepped_gens = prep_gens(pudl_out, THERMAL_PRIME_MOVERS)
chp_summary = chp_prevalence(prepped_gens)
chp_summary

# Assess Existing Heat Rates
* We need some way to compare different ways of calculating heat rates and choose between them.
* We want to check both for their correctness, and their completeness.
* Apparent correctness will depend on the type of generator / unit type. Need to define different expectations.

In [None]:
%%time
# Pull the heat-rate-by-unit table from the output object and print its
# summary (presumably a dataframe, so columns, dtypes and non-null counts).
hr_by_unit = pudl_out.hr_by_unit()
hr_by_unit.info()

## Compile Net Generation
* **By generator**
  * `net_generation_mwh` available from `generation_eia923` table
  * Associated with `plant_id_eia` and `generator_id` columns directly.
  * Can be associated with `unit_id_pudl` if the generator is part of an identified unit
  * Can be associated with a `technology_description` and `prime_mover_code` based on the `generators_eia860` table.
  * Can be associated with a list of energy sources based on `energy_source_N` in `generators_eia860` table.
* **By plant-prime-fuel**
  * `net_generation_mwh` is available from `generation_fuel_eia923`

## Compile Fuel Consumption
* **By boiler**
  * `total_heat_content_mmbtu` and `fuel_type_code_pudl` are available by `plant_id_eia` and `boiler_id` in `boiler_fuel_eia923`
  * This value can be associated with a `unit_id_pudl` based on the BGA table.
  * The heat consumed in here includes fuels both for electricity and steam (direct heat) outputs.
* **By plant-prime-fuel**
  * Available in `generation_fuel` and broken down separately for CHP vs. electricity.

## Identify Combined Cycle Units
* Combined Cycle turbines show up in the generators table, but have no "boiler" so they don't end up in the boiler-generator-association
* This means they don't get assigned `unit_id_pudl` values and are often lost.
* However, they are identifiable based on `technology_description` in the generators table, and so can be associated with a plant.
* Within a given plant, it's possible to combine all the natural gas that goes into a 

## Questions:
* Do all of the generators that show up in the generation table end up getting PUDL Unit IDs assigned?
* Where are the heat inputs being reported for combined cycle units? Do they really exist after 2015? What is still missing?
* Triage units / generators into: Easy, Hard, and Impossible. Work on Hard ones until diminishing returns. Assign impossible and too-hard ones the median values.

## Tables of Interest:
* `generation_eia923`
* `generation_fuel_eia923`
* `boiler_generator_assn_eia860`
* `generators_eia860`
* `generators_entity_eia`
* `boiler_fuel_eia923`

In [None]:
# Raw SQL for pulling the tables of interest straight from the PUDL database.
# Each query selects a fixed list of columns from a single table. A select
# list must not end in a comma before FROM -- that is a SQL syntax error.

# Annually reported generator attributes.
gens_eia860_sql = """
SELECT report_date,
       plant_id_eia,
       generator_id,
       capacity_mw,
       energy_source_code_1,
       energy_source_code_2,
       energy_source_code_3,
       energy_source_code_4,
       energy_source_code_5,
       energy_source_code_6,
       fuel_type_code_pudl,
       technology_description
FROM generators_eia860
"""

# Generator entity attributes. The CHP flag uses the same column name as the
# rest of this notebook (associated_combined_heat_power).
# NOTE(review): confirm that generators_entity_eia really has a report_date
# column; if it does not, this query fails at execution time.
gens_entity_sql = """
SELECT report_date,
       plant_id_eia,
       generator_id,
       prime_mover_code,
       bypass_heat_recovery,
       associated_combined_heat_power
FROM generators_entity_eia
"""

# Net generation reported per generator.
gen_eia923_sql = """
SELECT report_date,
       plant_id_eia,
       generator_id,
       net_generation_mwh
FROM generation_eia923
"""

# Net generation and fuel consumption reported per plant, prime mover & fuel.
gf_eia923_sql = """
SELECT report_date,
       plant_id_eia,
       nuclear_unit_id,
       fuel_type,
       fuel_type_code_pudl,
       prime_mover_code,
       fuel_consumed_mmbtu,
       fuel_consumed_for_electricity_mmbtu,
       net_generation_mwh
FROM generation_fuel_eia923
"""

# Boiler-generator associations, including the PUDL Unit ID.
bga_sql = """
SELECT report_date,
       plant_id_eia,
       unit_id_pudl,
       generator_id,
       boiler_id
FROM boiler_generator_assn_eia860
"""

# Fuel consumption reported per boiler.
bf_eia923_sql = """
SELECT report_date,
       plant_id_eia,
       boiler_id,
       fuel_type_code,
       fuel_type_code_pudl,
       fuel_consumed_units,
       fuel_mmbtu_per_unit
FROM boiler_fuel_eia923
"""


## Compile Fuel Consumption
  * Reported by boiler in `boiler_fuel_eia923` and so can be associated 

# Preliminary Data Wrangling
Once all of the data is loaded and looks like it's in good shape, do any initial wrangling that's specific to this particular analysis. This should mostly make use of the higher level functions which were defined above. If this step takes a while, don't be shy about producing `logging` outputs.

# Data Analysis and Visualization
* Now that you've got the required data in a usable form, you can tell the story of your analysis through a mix of visualizations, and further data wrangling steps.
* This narrative should be readable, with figures that have titles, legends, and labeled axes as appropriate so others can understand what you're showing them.
* The code should be concise and make use of the parameters and functions which you've defined above when possible. Functions should contain comprehensible chunks of work that make sense as one step in the story of the analysis.

In [None]:
# NOTE(review): finite_distplot and mcoe_coal are not defined in any cell
# visible in this notebook, so this cell raises NameError as written.
# Presumably left over from a notebook template -- define them or drop it.
coal_ax = finite_distplot(mcoe_coal, "heat_rate_mmbtu_mwh", max_val=20)
plt.title("Coal heat rate distribution");

In [None]:
# NOTE(review): finite_distplot and mcoe_gas are not defined in any cell
# visible in this notebook, so this cell raises NameError as written.
# Presumably left over from a notebook template -- define them or drop it.
gas_ax = finite_distplot(mcoe_gas, "heat_rate_mmbtu_mwh", max_val=20)
plt.title("Gas heat rate distribution");