## Recreating the Master Unit List

#### setup/imports

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import pudl
import pudl.constants as pc
import pudl.extract.ferc1
import sqlalchemy as sa
import logging
import sys
import copy
import pathlib

from copy import deepcopy

In [3]:
import seaborn as sns

import matplotlib.pyplot as plt
import matplotlib as mpl
%matplotlib inline
mpl.style.use('dark_background')

In [4]:
import pudl_rmi.make_plant_parts_eia
from pudl_rmi.make_plant_parts_eia import *
pd.options.display.max_columns = None

In [5]:
# Send every record at INFO level or above to stdout as a bare message,
# replacing whatever handlers the root logger already had.
handler = logging.StreamHandler(stream=sys.stdout)
formatter = logging.Formatter('%(message)s')
handler.setFormatter(formatter)

logger = logging.getLogger()
logger.handlers = [handler]
logger.setLevel(logging.INFO)

#### Defining a table-grabbing object

In [6]:
# Look up the default PUDL workspace settings and open a SQLAlchemy engine on
# the database URL stored under their 'pudl_db' key.
pudl_settings = pudl.workspace.setup.get_defaults()
pudl_engine = sa.create_engine(pudl_settings['pudl_db'])

In [7]:
# PUDL output object shared by every table builder below.
# NOTE(review): 'AS' looks like the pandas year-start frequency alias, i.e.
# annual outputs -- confirm against the PudlTabl documentation.
pudl_out = pudl.output.pudltabl.PudlTabl(
    pudl_engine,
    freq='AS',
    fill_net_gen=True,
    fill_fuel_cost=True,
    roll_fuel_cost=True,
)

In [8]:
# Generator-table builder on top of the PUDL output object; it is handed to the
# granularity labeler and the plant-parts compiler in the next cell.
# NOTE(review): presumably this produces the "mega generator table" mentioned
# in the log output further down -- confirm in pudl_rmi.make_plant_parts_eia.
gens_maker = MakeMegaGenTbl(pudl_out)



In [9]:
# The granularity labeler is built from the generator-table builder, and the
# plant-parts compiler takes the output object plus both helper objects.
grans_labeler = LabelTrueGranularities(gens_maker)
parts_compiler = MakePlantParts(pudl_out, gens_maker, grans_labeler)

In [10]:
%%time
# Compile the full plant-parts table; the cell is timed because this is the
# slow step of the notebook.
# A warning inside this step fires when some generators have no utility ID.
# That has been addressed on the PUDL side by requiring utility IDs to be
# present in the annual generators table.
plant_parts_df = parts_compiler.execute()

Generating the mega generator table with ownership.
Allocating net generation from the generation_fuel_eia923 to the generator level instead of using the less complete generation_eia923 table.
Removing 3465 generators that retired mid-year out of 373857
No records found with fuel-only records. This is expected.
Ratio calc types: 
   All gens w/in generation table:  71405#, 1.2e+07 MW
   Some gens w/in generation table: 2584#, 1.6e+05 MW
   No gens w/in generation table:   330520#, 1.5e+07 MW
   GF table records have no PM:     0#




1.174% of records have are partially off from their 'IDX_PM_FUEL' group
gen v fuel table net gen diff:      42.2%
new v fuel table net gen diff:      99.6%
new v fuel table fuel (mmbtu) diff: 99.5%
6.63% of generator records are more that 5% off from the net generation table
filling in fuel cost NaNs EIA APIs monthly state averages
filling in fuel cost NaNs with rolling averages
Labeled 18.45% of generators as non-operative.
true grans found for plant: 137896
true grans found for plant_unit: 140612
true grans found for plant_prime_mover: 37318
true grans found for plant_technology: 14922
true grans found for plant_prime_fuel: 1979
true grans found for plant_ferc_acct: 7585
true grans found for plant_gen: 277473
begin aggregation for: plant
begin aggregation for: plant_unit
begin aggregation for: plant_prime_mover
begin aggregation for: plant_technology
begin aggregation for: plant_prime_fuel
begin aggregation for: plant_ferc_acct
begin aggregation for: plant_gen
CPU times: user 20min 3

In [11]:
# Check the aggregated plant-parts values against the generator-level table;
# the output below prints one result per plant part and data column.
# NOTE(review): gens_maker.execute() is called again here -- confirm its result
# is cached, otherwise this rebuilds the whole generator table.
test_run_aggregations(plant_parts_df=plant_parts_df, plant_gen_df=gens_maker.execute())

Begining tests for plant:
  Results for total_fuel_cost: [True]
  Results for net_generation_mwh: [True]
  Results for capacity_mw: [True]
  Results for capacity_mw_eoy: [True]
  Results for total_mmbtu: [True]
Begining tests for plant_unit:
  Results for total_fuel_cost: [True]
  Results for net_generation_mwh: [True]
  Results for capacity_mw: [True]
  Results for capacity_mw_eoy: [True]
  Results for total_mmbtu: [True]
Begining tests for plant_prime_mover:
  Results for total_fuel_cost: [True]
  Results for net_generation_mwh: [True]
  Results for capacity_mw: [True]
  Results for capacity_mw_eoy: [True]
  Results for total_mmbtu: [True]
Begining tests for plant_technology:
  Results for total_fuel_cost: [True]
  Results for net_generation_mwh: [True]
  Results for capacity_mw: [True]
  Results for capacity_mw_eoy: [True]
  Results for total_mmbtu: [True]
Begining tests for plant_prime_fuel:
  Results for total_fuel_cost: [True]
  Results for net_generation_mwh: [True]
  Results fo

In [17]:
# Load the previously saved master unit list and show the ratio of the new row
# count to the old one (1.0 means the same number of records).
# NOTE(review): the next cell overwrites this same file, so the comparison is
# only meaningful when this cell runs before it.
file_path_mul = pathlib.Path.cwd().parent.joinpath('outputs', 'master_unit_list.pkl.gz')
plant_parts_df_old = pd.read_pickle(file_path_mul)
len(plant_parts_df) / len(plant_parts_df_old)

1.0

In [18]:
# Save the freshly compiled table as the new master unit list, replacing any
# existing file at ../outputs/master_unit_list.pkl.gz.
file_path_mul = pathlib.Path.cwd().parent.joinpath('outputs', 'master_unit_list.pkl.gz')
plant_parts_df.to_pickle(file_path_mul)

In [19]:
# Show the compiled plant-parts table; pandas elides the middle rows in the
# rendered output, so only the first and last records appear.
plant_parts_df

Unnamed: 0_level_0,plant_id_eia,report_date,plant_part,generator_id,unit_id_pudl,prime_mover_code,energy_source_code_1,technology_description,ferc_acct_name,utility_id_eia,true_gran,appro_part_label,appro_record_id_eia,capacity_factor,capacity_mw,capacity_mw_eoy,fraction_owned,fuel_cost_per_mmbtu,fuel_cost_per_mwh,fuel_type_code_pudl,heat_rate_mmbtu_mwh,installation_year,net_generation_mwh,operational_status,operational_status_pudl,ownership,ownership_dupe,planned_retirement_date,plant_id_pudl,plant_name_eia,plant_name_new,record_count,retirement_date,total_fuel_cost,total_mmbtu,utility_id_pudl,report_year,plant_id_report_year
record_id_eia,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1
63_2004_plant_owned_213_bu,63,2004-01-01,plant,,,IC,DFO,Petroleum Liquids,Other,213,True,plant,63_2004_plant_owned_213_BU,,8.5,0.0,1.0,,,oil,,,,,BU,owned,True,NaT,826,Gold Creek,Gold Creek,1,NaT,,,19,2004,826_2004
64_2004_plant_owned_213_bu,64,2004-01-01,plant,,,,DFO,Petroleum Liquids,Other,213,True,plant,64_2004_plant_owned_213_BU,,57.5,0.0,1.0,,,oil,,,,,BU,owned,True,NaT,325,Lemon Creek,Lemon Creek,1,NaT,,,19,2004,325_2004
936_2004_plant_owned_3037_bu,936,2004-01-01,plant,,,IC,DFO,Petroleum Liquids,Other,3037,True,plant,936_2004_plant_owned_3037_BU,,12.5,0.0,1.0,,,oil,,,,,BU,owned,True,NaT,1625,Carlyle,Carlyle,1,NaT,,,942,2004,1625_2004
1160_2004_plant_owned_11581_bu,1160,2004-01-01,plant,,,IC,DFO,Petroleum Liquids,Other,11581,True,plant,1160_2004_plant_owned_11581_BU,,1.0,0.0,1.0,,,oil,,,,,BU,owned,True,NaT,1691,Manning,Manning,1,NaT,,,1087,2004,1691_2004
1363_2004_plant_owned_11249_bu,1363,2004-01-01,plant,11,,GT,NG,Natural Gas Fired Combustion Turbine,Other,11249,True,plant,1363_2004_plant_owned_11249_BU,,16.3,0.0,1.0,,,gas,,,,,BU,owned,True,NaT,98,Cane Run,Cane Run,1,NaT,,,169,2004,98_2004
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61257_zv204_2019_plant_gen_total_60865,61257,2019-01-01,plant_gen,ZV204,,PV,SUN,Solar Photovoltaic,Other,60865,False,plant,61257_2019_plant_total_60865,0.223069,4.9,4.9,1.0,,,solar,,2017,9575.0,existing,operating,total,False,NaT,11122,"ZV Solar 2, LLC","ZV Solar 2, LLC ZV204",1,NaT,,,5833,2019,11122_2019
60549_zv3_2019_plant_gen_total_61119,60549,2019-01-01,plant_gen,ZV3,,PV,SUN,Solar Photovoltaic,Other,61119,False,plant,60549_2019_plant_total_61119,0.213790,5.0,5.0,1.0,,,solar,,2016,9364.0,existing,operating,total,False,NaT,9727,"ZV Solar 3, LLC","ZV Solar 3, LLC ZV3",1,NaT,,,5830,2019,9727_2019
60220_zwed1_2019_plant_gen_total_60003,60220,2019-01-01,plant_gen,ZWED1,,IC,OBG,Other Waste Biomass,Other,60003,True,plant_gen,60220_ZWED1_2019_plant_gen_total_60003,0.277112,0.8,0.8,1.0,,,gas,,2015,1942.0,existing,operating,total,False,NaT,7565,Zero Waste Energy Development Co LLC,Zero Waste Energy Development Co LLC ZWED1,2,NaT,,,3836,2019,7565_2019
60220_zwed2_2019_plant_gen_total_60003,60220,2019-01-01,plant_gen,ZWED2,,IC,OBG,Other Waste Biomass,Other,60003,True,plant_gen,60220_ZWED2_2019_plant_gen_total_60003,0.277112,0.8,0.8,1.0,,,gas,,2015,1942.0,existing,operating,total,False,NaT,7565,Zero Waste Energy Development Co LLC,Zero Waste Energy Development Co LLC ZWED2,2,NaT,,,3836,2019,7565_2019


In [14]:
# Split the plant-parts table into one frame per plant part, twice over:
#   compiled_plant_parts      -> every record of that part
#   compiled_plant_parts_true -> only the records flagged by `true_gran`
true_parts_df = plant_parts_df[plant_parts_df.true_gran]
compiled_plant_parts = {}
compiled_plant_parts_true = {}
for part in PLANT_PARTS:
    part_df = plant_parts_df[plant_parts_df['plant_part'] == part]
    compiled_plant_parts[part] = part_df
    compiled_plant_parts_true[part] = true_parts_df[true_parts_df['plant_part'] == part]

In [15]:
def plot_plant_vs_agg(compiled_plant_parts, field, xy_limits, scale):
    """
    Scatter the plant-level value of ``field`` against every other plant part.

    For each plant part in ``compiled_plant_parts`` other than ``'plant'``,
    the plant records are merged with that part's records on plant ID, report
    date, utility ID and ownership. One scatter plot is drawn per part, with
    the plant value on the x axis and the part value on the y axis.

    For the additive fields (``capacity_mw``, ``net_generation_mwh`` and
    ``total_mmbtu``) the merged records are first checked: a part record
    should never exceed the plant record it was merged with. Plants that have
    any negative generator-level value of ``field`` are exempt from the check.

    Args:
        compiled_plant_parts (dict): plant part name -> DataFrame of that
            part's records. Must contain the keys ``'plant'`` and
            ``'plant_gen'``.
        field (str): name of the data column to compare.
        xy_limits (dict): field name -> ``(lower, upper)`` limits applied to
            both axes.
        scale (str): axis scale handed to ``set_xscale``/``set_yscale``,
            e.g. ``"log"``.

    Raises:
        AssertionError: if an additive field is larger in a part record than
            in the plant record it was merged with.
    """
    additive_fields = ['capacity_mw', 'net_generation_mwh', 'total_mmbtu']
    id_cols = ['plant_id_eia', 'report_date', 'utility_id_eia', 'ownership']
    field_plant = field + '_plant'

    gens = compiled_plant_parts['plant_gen']
    # Plants with a negative generator value can legitimately have a part
    # that is larger than the plant total, so they are left out of the check.
    plants_with_negatives = gens[gens[field] < 0].plant_id_eia.unique()

    for plant_gran, df in compiled_plant_parts.items():
        if plant_gran == 'plant':
            continue
        field_gran = field + '_' + plant_gran
        try:
            # NOTE(review): if several plant records share the same id_cols
            # (e.g. one per operational status), this merge pairs parts with
            # the wrong plant record -- confirm the keys are unique per plant.
            merge_df = (
                compiled_plant_parts['plant'][id_cols + [field]]
                .merge(
                    df[id_cols + ['generator_id'] + [field]],
                    on=id_cols,
                    suffixes=('_plant', f'_{plant_gran}'),
                )
            )
            if field in additive_fields:
                baddies = (
                    merge_df[
                        (merge_df[field_plant] < merge_df[field_gran])
                        & ~(merge_df.plant_id_eia.isin(plants_with_negatives))
                    ]
                    .set_index(id_cols + ['generator_id'])
                    [[field_plant, field_gran]])
                if not baddies.empty:
                    raise AssertionError(f"{plant_gran}/{field} found some baddies {len(baddies)}\n {baddies}")
            # Look the limits up before creating the figure so that a missing
            # entry does not leave a half-finished plot behind.
            limits = xy_limits[field]
            lower, upper = limits[0], limits[1]
            fig, ax = plt.subplots(ncols=1, nrows=1, figsize=(5, 5))
            ax.scatter(merge_df[field_plant],
                       merge_df[field_gran],
                       color='aquamarine', alpha=0.1, label=field)
            ax.set_ylim(lower, upper)
            ax.set_xlim(lower, upper)
            ax.set_xscale(scale)
            ax.set_yscale(scale)
            ax.set_ylabel(f'{plant_gran} {field}')
            ax.set_xlabel(f'Plant {field}')
            ax.set_title(f"Plant vs {plant_gran}: {field}")
        except KeyError as err:
            # A missing column or limit still skips this part, but the skip is
            # now reported instead of being swallowed silently.
            logging.getLogger(__name__).warning(
                "Skipping plot of %s for %s, missing key: %s",
                field, plant_gran, err)

In [16]:
# Data columns compared between the plant records and every other plant part.
fields_to_plot = [
    # Simple quantities
    'capacity_mw',
    'net_generation_mwh',
    'total_mmbtu',
    # Derived values ('capacity_factor' is currently disabled)
    'heat_rate_mmbtu_mwh',
    'fuel_cost_per_mwh',
    'fuel_cost_per_mmbtu',
    'total_fuel_cost',
]

# (lower, upper) axis limits per data column, used for both the x and y axis.
xy_limits = {
    # Simple quantities
    'capacity_mw':         (1e0, 1e4),
    'net_generation_mwh':  (1e3, 1e8),
    'total_mmbtu':         (1e4, 1e9),
    # Derived values
    'capacity_factor':     (0, 1.0),
    'heat_rate_mmbtu_mwh': (6, 16),
    'fuel_cost_per_mwh':   (10, 80),
    'fuel_cost_per_mmbtu': (1e0, 1e1),
    'total_fuel_cost':     (1e7, 1e9),
}

for field in fields_to_plot:
    plot_plant_vs_agg(
        compiled_plant_parts,
        field=field,
        xy_limits=xy_limits,
        scale="log",
    )

AssertionError: plant_gen/capacity_mw found some baddies 34997
                                                                 capacity_mw_plant  \
plant_id_eia report_date utility_id_eia ownership generator_id                      
63           2004-01-01  213            owned     IC5                         1.6   
1363         2004-01-01  11249          owned     4                          16.3   
                                                  5                          16.3   
                                                  6                          16.3   
1366         2004-01-01  11249          owned     13                         48.6   
...                                                                           ...   
63000        2019-01-01  61012          total     WALDN                       4.0   
63673        2019-01-01  63392          total     EU-12                       1.3   
6061         2019-01-01  17568          total     MOR1                      400.0   
61346        2019-01-01  60982          total     PV1                         0.0   
                                                  PV2                         0.0   

                                                                capacity_mw_plant_gen  
plant_id_eia report_date utility_id_eia ownership generator_id                         
63           2004-01-01  213            owned     IC5                            3.50  
1363         2004-01-01  11249          owned     4                            163.20  
                                                  5                            209.40  
                                                  6                            272.00  
1366         2004-01-01  11249          owned     13                            94.34  
...                                                                               ...  
63000        2019-01-01  61012          total     WALDN                          5.00  
63673        2019-01-01  63392          total     EU-12                          3.00  
6061         2019-01-01  17568          total     MOR1                         550.00  
61346        2019-01-01  60982          total     PV1                          250.00  
                                                  PV2                          250.00  

[34997 rows x 2 columns]

In [None]:
def plot_gens_vs(compiled_plant_parts,part_name, data_col, weight_col, x_range):
    """
    Overlay histograms of ``data_col`` for generators and for one plant part.

    Records whose ``data_col`` equals zero are dropped from both frames
    before plotting. The generator histogram is drawn first (purple) and the
    plant-part histogram on top of it (aquamarine), both with 100 bins over
    ``x_range``. The figure is shown immediately.

    Args:
        compiled_plant_parts (dict): plant part name -> DataFrame of that
            part's records. Must contain ``'plant_gen'`` and ``part_name``.
        part_name (str): plant part to compare against the generators.
        data_col (str): column whose distribution is plotted.
        weight_col (str or None): column used as histogram weights; a falsy
            value gives unweighted histograms.
        x_range (tuple): ``(min, max)`` range of the histogram bins.
    """
    all_gens = compiled_plant_parts['plant_gen']
    gen_df = all_gens[all_gens[data_col] != 0]
    all_parts = compiled_plant_parts[part_name]
    part_df = all_parts[all_parts[data_col] != 0]

    weights_gen = gen_df[weight_col] if weight_col else None
    weights_part = part_df[weight_col] if weight_col else None

    # Binning shared by both histograms so that they are directly comparable.
    shared_bins = {'range': x_range, 'bins': 100}

    plt.hist(gen_df[data_col],
             weights=weights_gen,
             color="purple",
             label="Generators",
             **shared_bins)
    plt.hist(part_df[data_col],
             weights=weights_part,
             color="aquamarine",
             label=f'{part_name}',
             **shared_bins)

    plt.title(f'Gens vs. {part_name}: {data_col}')
    plt.xlabel(data_col)
    plt.ylabel(None)
    plt.legend()
    plt.show()

In [None]:
# Histogram x-axis range (min, max) for each data column plotted below.
x_ranges = dict(
    capacity_mw=(0, 400),
    net_generation_mwh=(0, 2_500_000),
    fuel_cost_per_mmbtu=(0, 5),
    fuel_cost_per_mwh=(0, 100),
    total_fuel_cost=(0, 200_000_000),
)

In [None]:
# Capacity-weighted histograms of net generation: generators vs. each part.
for part_name in compiled_plant_parts:
    data_col = 'net_generation_mwh'
    weight_col = 'capacity_mw'
    plot_gens_vs(
        compiled_plant_parts,
        part_name=part_name,
        data_col=data_col,
        weight_col=weight_col,
        x_range=x_ranges[data_col],
    )

In [None]:
# Capacity-weighted histograms of total fuel cost: generators vs. each part.
for part_name in compiled_plant_parts:
    data_col = 'total_fuel_cost'
    weight_col = 'capacity_mw'
    plot_gens_vs(
        compiled_plant_parts,
        part_name=part_name,
        data_col=data_col,
        weight_col=weight_col,
        x_range=x_ranges[data_col],
    )

In [None]:
# Capacity-weighted histograms of fuel cost per MWh: generators vs. each part.
for part_name in compiled_plant_parts:
    data_col = 'fuel_cost_per_mwh'
    weight_col = 'capacity_mw'
    plot_gens_vs(
        compiled_plant_parts,
        part_name=part_name,
        data_col=data_col,
        weight_col=weight_col,
        x_range=x_ranges[data_col],
    )

In [None]:
# Capacity-weighted histograms of fuel cost per MMBtu: generators vs. each part.
for part_name in compiled_plant_parts:
    data_col = 'fuel_cost_per_mmbtu'
    weight_col = 'capacity_mw'
    plot_gens_vs(
        compiled_plant_parts,
        part_name=part_name,
        data_col=data_col,
        weight_col=weight_col,
        x_range=x_ranges[data_col],
    )

## Playing with the compiled outputs 

In [None]:
# Count the non-null values of every column for each plant part, treating a
# zero in the main data columns as missing, and add a row-wise total.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
null_zeros = {0: np.nan}
zero_as_null_cols = [
    'net_generation_mwh',
    'capacity_factor',
    'fuel_cost_per_mmbtu',
    'fuel_cost_per_mwh',
    'capacity_mw',
]
# Start from an empty frame indexed by the plant table's column names; each
# plant part then contributes one column of counts.
count_df = pd.DataFrame(index=list(compiled_plant_parts['plant'].columns))
for k, cpp_df in compiled_plant_parts.items():
    cpp_df = cpp_df.replace({col: null_zeros for col in zero_as_null_cols})
    # To restrict the counts to one year, filter cpp_df first, e.g.
    # cpp_df[cpp_df['report_date'].dt.year == 2018].
    count_df = count_df.merge(
        pd.DataFrame(cpp_df.count(), columns=[k]),
        right_index=True, left_index=True)
count_df = count_df.merge(
    pd.DataFrame(count_df.sum(axis=1), columns=['total']),
    right_index=True, left_index=True)
count_df