## Code for Comparison

In [None]:
 # Fragment kept for comparison: this is the 'plant_technology' entry of the
 # `plant_granularities` dict. It is one key/value pair of a larger dict
 # literal, so it is not runnable on its own.
 'plant_technology': {
        # columns that identify one record at this granularity
        'id_cols': ['plant_id_eia', 'generator_id','technology_description'],
        # tables to aggregate, keyed by table name
        'ag_tables': {
            'generators_eia860': {
                'denorm_table': None,
                'denorm_cols': None,
                # column -> aggregation function
                'ag_cols': {
                    'capacity_mw': pudl.helpers.sum_na,
                },
            },
             'generation_eia923': {
                # merged with generators_eia860 on denorm_cols before aggregating
                'denorm_table': 'generators_eia860',
                'denorm_cols': ['plant_id_eia','generator_id', 'report_date'],
                'ag_cols': {
                    'net_generation_mwh': pudl.helpers.sum_na,
                },
            },
        },
    },

## Recreating the Master Unit List

#### setup/imports

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import pudl
import pudl.constants as pc
import pudl.extract.ferc1
import sqlalchemy as sa
import logging
import sys

In [3]:
from copy import deepcopy

In [4]:
# Send every record at INFO or above to stdout as the bare message text.
# The handler list is assigned outright (rather than appended to), so any
# handlers already attached to the root logger are replaced and re-running
# this cell does not stack up duplicate handlers.
formatter = logging.Formatter('%(message)s')
handler = logging.StreamHandler(stream=sys.stdout)
handler.setFormatter(formatter)

logger = logging.getLogger()
logger.handlers = [handler]
logger.setLevel(logging.INFO)

#### defining a table grabbing object

In [5]:
class compile_tables(object):
    """
    Grab tables as dataframes and cache them for reuse.

    A table is looked up by name, first in the database's table metadata
    and, failing that, as a method of the same name on a PudlTabl output
    object. Each table is fetched once and then served from ``self._dfs``.
    """

    def __init__(self, pudl_engine, freq=None, start_date=None, end_date=None):
        """
        Initialize a table compiler.

        Args:
            pudl_engine: a connection engine for the pudl database.
            freq (str): frequency passed through to the PudlTabl output
                object.
            start_date: earliest ``report_date`` to pull from database
                tables. Anything ``pd.to_datetime`` accepts. Defaults to
                January 1st of the first eia923 working year.
            end_date: latest ``report_date`` to pull from database tables.
                Anything ``pd.to_datetime`` accepts. Defaults to December
                31st of the last eia923 working year.

        Raises:
            AssertionError: if ``pudl_engine`` is falsy.
        """
        # Validate the engine before doing any other work with it.
        if not pudl_engine:
            raise AssertionError(
                'compile_tables object needs a pudl_engine')
        self.pudl_engine = pudl_engine
        self.freq = freq

        if start_date is None:
            self.start_date = pd.to_datetime(
                '{}-01-01'.format(min(pc.working_years['eia923'])))
        else:
            # Make sure it's a date... and not a string.
            self.start_date = pd.to_datetime(start_date)

        if end_date is None:
            self.end_date = pd.to_datetime(
                '{}-12-31'.format(max(pc.working_years['eia923'])))
        else:
            # Make sure it's a date... and not a string.
            self.end_date = pd.to_datetime(end_date)

        # grabbing the metadata object for the sqlite db
        self.pt = pudl.output.pudltabl.get_table_meta(self.pudl_engine)

        # NOTE(review): start_date/end_date are not forwarded here, so
        # tables that come from the output object are not restricted to
        # the same date range as the database tables -- confirm intent.
        self.pudl_out = pudl.output.pudltabl.PudlTabl(
            pudl_engine=pudl_engine, freq=self.freq)

        # cache of already-grabbed dataframes, keyed by table name
        self._dfs = {
            'generation_fuel_eia923': None,
            'fuel_receipts_costs_eia923': None,
            'generators_eia860': None,
            'boiler_generator_assn_eia860': None,
            'generation_eia923': None,

            'fuel_cost': None,
        }

    def grab_the_table(self, table):
        """
        Return the dataframe for a table, fetching it on first use.

        Args:
            table (str): the name of a database table or of a method on
                the PudlTabl output object. ``None`` is passed through.

        Returns:
            pandas.DataFrame (or whatever the output method returns), or
            None if ``table`` is None.

        Raises:
            AttributeError: if ``table`` is neither in the table metadata
                nor a method of the output object.
        """
        if table is None:
            return None

        # .get() so that tables which were not pre-registered in the cache
        # can still be grabbed instead of raising a KeyError here.
        cached = self._dfs.get(table)
        if cached is not None:
            return cached

        # Only the metadata lookup is guarded: a KeyError raised while
        # actually reading the table should surface, not be mistaken for
        # "this table is not in the database".
        try:
            # use this object's own metadata, not a notebook-level global
            tbl = self.pt[table]
        except KeyError:
            # if it is not a database table, it is an output function.
            # getattr turns the string of the table into an attribute
            # of the object, so this runs the output function
            print(f'   grabbing {table} from the output object')
            self._dfs[table] = getattr(self.pudl_out, table)()
        else:
            print(f'   grabbing {table} from the sqlite db')
            select = sa.sql.select([tbl, ])
            if self.start_date is not None:
                select = select.where(
                    tbl.c.report_date >= self.start_date)
            if self.end_date is not None:
                select = select.where(
                    tbl.c.report_date <= self.end_date)
            self._dfs[table] = pd.read_sql(
                select, self.pudl_engine,
                parse_dates=['report_date'], index_col=['id'])
        return self._dfs[table]

In [6]:
def grab_denormalize_table(table, denorm_table=None, denorm_cols=None,
                           indicator=False, compiler=None):
    """
    Grab and denormalize the table.

    Grab the table that you want, and merge it with another table based
    on the 'denorm_cols'.

    Args:
        table (string): a table name
        denorm_table (string): the name of the table you want to merge in.
            If falsy, the table is returned without any merge.
        denorm_cols (list): the columns to use to merge the tables
        indicator (bool): True or False for whether or not you want to
            include an indicator column in your merge that notes where
            each row came from.
        compiler: the object whose ``grab_the_table`` method is used to
            fetch both tables. Defaults to the notebook-level
            ``table_compiler``.
    Returns:
        pandas.Dataframe
    """
    if compiler is None:
        # Resolved at call time, so the notebook-level compiler only needs
        # to exist by the time this function is called.
        compiler = table_compiler
    table_df = compiler.grab_the_table(table)
    if denorm_table:
        logger.info(f'   denormalizing {table}')
        # denormalize the plant granularity
        table_df = table_df.merge(compiler.grab_the_table(denorm_table),
                                  on=denorm_cols,
                                  how='outer',
                                  indicator=indicator)
    return table_df

In [11]:
def _ag_table(ag_cols, denorm_table=None, denorm_cols=None):
    """
    Build one entry of an 'ag_tables' dictionary.

    Args:
        ag_cols (dict): column name -> aggregation to apply to that column.
        denorm_table (string): table to merge in before aggregating, if any.
        denorm_cols (list): columns to merge on, if any.
    Returns:
        dict with the keys 'denorm_table', 'denorm_cols' and 'ag_cols'.
    """
    return {
        'denorm_table': denorm_table,
        'denorm_cols': denorm_cols,
        'ag_cols': ag_cols,
    }


_BGA = 'boiler_generator_assn_eia860'
_sum_na = pudl.helpers.sum_na

# For each plant granularity: the columns that identify a record
# ('id_cols') and the tables to aggregate to that granularity
# ('ag_tables'). The order of the tables is the order they are compiled in.
plant_granularities = {
    # NOTE(review): 'generator_id' is part of the id_cols here, so records
    # are grouped per generator as well as per technology -- confirm.
    'plant_technology': {
        'id_cols': ['plant_id_eia', 'generator_id', 'technology_description'],
        'ag_tables': {
            'generators_eia860': _ag_table({'capacity_mw': _sum_na}),
            'generation_eia923': _ag_table(
                {'net_generation_mwh': _sum_na},
                denorm_table='generators_eia860',
                denorm_cols=['plant_id_eia', 'generator_id', 'report_date'],
            ),
        },
    },
    'plant': {
        'id_cols': ['plant_id_eia'],
        'ag_tables': {
            'fuel_cost': _ag_table({'fuel_cost_per_mwh': 'mean'}),
            'generation_fuel_eia923': _ag_table({
                'fuel_consumed_mmbtu': _sum_na,
                'net_generation_mwh': _sum_na,
            }),
            # NOTE(review): this sums a per-mmbtu cost -- confirm that a
            # sum (rather than some average) is what is wanted.
            'fuel_receipts_costs_eia923': _ag_table(
                {'fuel_cost_per_mmbtu': _sum_na}),
            'generators_eia860': _ag_table({'capacity_mw': _sum_na}),
        },
    },
    'plant_unit': {
        'id_cols': ['plant_id_eia', 'unit_id_pudl'],
        # unit_id_pudl are associated with plant_id's and
        # plant_id's/generator_id's
        'ag_tables': {
            'generators_eia860': _ag_table(
                {'capacity_mw': _sum_na},
                denorm_table=_BGA,
                denorm_cols=['plant_id_eia', 'generator_id', 'report_date'],
            ),
            # NOTE(review): merged on plant and date only (no generator),
            # so presumably each plant-level record is repeated for every
            # unit of the plant -- verify against the compiled output.
            'generation_fuel_eia923': _ag_table(
                {
                    'fuel_consumed_mmbtu': _sum_na,
                    'net_generation_mwh': _sum_na,
                },
                denorm_table=_BGA,
                denorm_cols=['plant_id_eia', 'report_date'],
            ),
            # NOTE(review): summed here but averaged ('mean') at the
            # 'plant' granularity -- confirm which is intended.
            'fuel_cost': _ag_table(
                {'fuel_cost_per_mwh': _sum_na},
                denorm_table=_BGA,
                denorm_cols=['plant_id_eia', 'generator_id', 'report_date'],
            ),
        },
    },
    # no tables are aggregated at this granularity yet
    'plant_prime_fuel': {
        'id_cols': ['plant_id_eia', 'energy_source_code_1'],
        'ag_tables': {},
    },
}

In [12]:
# Default PUDL workspace settings; the "pudl_db" entry is what the
# SQLAlchemy engine is created from.
pudl_settings = pudl.workspace.setup.get_defaults()
pudl_engine = sa.create_engine(pudl_settings["pudl_db"])
# Table metadata for that database (looked up by table name elsewhere in
# this notebook).
pt = pudl.output.pudltabl.get_table_meta(pudl_engine)

In [13]:
# The single table compiler shared by the rest of the notebook;
# grab_denormalize_table looks it up by this module-level name, so this
# cell must run before that function is called.
table_compiler = compile_tables(pudl_engine, freq='AS')

In [14]:
def _aggregate_table(table, table_details, group_cols):
    """
    Grab one table, denormalize it and aggregate it to a granularity.

    Args:
        table (string): a table name
        table_details (dict): the 'denorm_table', 'denorm_cols' and
            'ag_cols' for this table.
        group_cols (list): the columns to group by.
    Returns:
        pandas.DataFrame with the group columns back as regular columns.
    """
    denormalized = grab_denormalize_table(
        table,
        denorm_table=table_details['denorm_table'],
        denorm_cols=table_details['denorm_cols'])
    # this runs whatever function we've defined in the ag_cols dictionary
    aggregated = denormalized.groupby(group_cols).agg(table_details['ag_cols'])
    # the group columns come out of the agg as the index, so turn them
    # back into columns to be able to merge on them
    return aggregated.reset_index()


# one compiled dataframe per plant granularity
compiled_dfs = {}
for plant_gran, gran_details in plant_granularities.items():
    logger.info(f'compiling data for {plant_gran}')
    cols_to_grab = gran_details['id_cols'] + ['report_date']
    # start from an empty frame that only has the merge keys
    compiled = pd.DataFrame(columns=cols_to_grab)
    for table, table_details in gran_details['ag_tables'].items():
        logger.info(f'   aggregating {table}')
        newly_aggregated = _aggregate_table(table, table_details, cols_to_grab)
        # merge what has been compiled so far into the new table
        compiled = newly_aggregated.merge(
            compiled, how='outer', on=cols_to_grab)
    # add the df into a dictionary of dfs
    compiled_dfs[plant_gran] = compiled

compiling data for plant_technology
   aggregating generators_eia860
   grabbing generators_eia860 from the sqlite db
   aggregating generation_eia923
   grabbing generation_eia923 from the sqlite db
   denormalizing generation_eia923
compiling data for plant
   aggregating fuel_cost
   grabbing fuel_cost from the output object
   aggregating generation_fuel_eia923


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


   grabbing generation_fuel_eia923 from the sqlite db
   aggregating fuel_receipts_costs_eia923
   grabbing fuel_receipts_costs_eia923 from the sqlite db
   aggregating generators_eia860
compiling data for plant_unit
   aggregating generators_eia860
   denormalizing generators_eia860
   grabbing boiler_generator_assn_eia860 from the sqlite db
   aggregating generation_fuel_eia923
   denormalizing generation_fuel_eia923
   aggregating fuel_cost
   denormalizing fuel_cost
compiling data for plant_prime_fuel


## Playing with the table compiler

In [15]:
# grab a single table by name; it was already grabbed while compiling, so
# this comes back from the compiler's cache
table_compiler.grab_the_table('generators_eia860')

Unnamed: 0_level_0,plant_id_eia,generator_id,report_date,operational_status_code,operational_status,ownership_code,capacity_mw,summer_capacity_mw,winter_capacity_mw,energy_source_code_1,...,switch_oil_gas,nameplate_power_factor,minimum_load_mw,uprate_derate_during_year,uprate_derate_completed_date,current_planned_operating_date,summer_estimated_capability_mw,winter_estimated_capability_mw,operating_switch,retirement_date
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,61959,COLS,2017-01-01,T,proposed,S,19.6,19.6,19.6,SUN,...,False,,,False,,2018-12-01,,,,
2,61958,BAP1,2017-01-01,U,proposed,S,1.5,1.5,1.5,SUN,...,False,,,False,,2018-10-01,,,,
3,61957,PW,2017-01-01,T,proposed,S,243.0,220.0,220.0,WND,...,False,,,False,,2018-12-01,,,,
4,61956,TBESS,2017-01-01,T,proposed,S,4.0,4.0,4.0,MWH,...,False,,,False,,2018-10-01,,,,
5,61956,GRDMT,2017-01-01,T,proposed,S,4.4,4.4,4.4,SUN,...,False,,,False,,2018-09-01,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
176935,3,5,2009-01-01,,,,,,,,...,,,,,,,,,,
176936,3,4,2009-01-01,,,,,,,,...,,,,,,,,,,
176937,3,3,2009-01-01,,,,,,,,...,,,,,,,,,,
176938,3,2,2009-01-01,,,,,,,,...,,,,,,,,,,


In [16]:
# merge generation_eia923 with the boiler generator association table.
# indicator=True adds a `_merge` column that notes where each row came from.
grab_denormalize_table('generation_eia923',
                       denorm_table='boiler_generator_assn_eia860',
                       denorm_cols=['plant_id_eia','generator_id', 'report_date'],
                       indicator=True)

   denormalizing generation_eia923


Unnamed: 0,plant_id_eia,generator_id,report_date,net_generation_mwh,boiler_id,unit_id_eia,unit_id_pudl,bga_source,_merge
0,3,1,2009-01-01,39699.00,,,,,left_only
1,3,1,2009-02-01,5594.00,,,,,left_only
2,3,1,2009-03-01,13015.00,,,,,left_only
3,3,1,2009-04-01,15858.00,,,,,left_only
4,3,1,2009-05-01,68232.00,,,,,left_only
...,...,...,...,...,...,...,...,...,...
427816,61838,ST2,2017-08-01,,,,,,left_only
427817,61838,ST2,2017-09-01,,,,,,left_only
427818,61838,ST2,2017-10-01,,,,,,left_only
427819,61838,ST2,2017-11-01,,,,,,left_only


## Trying the groupby/aggregation

In [17]:
# You can change these inputs and then run the following cell to see
# the aggregated dataframe that they produce.
table = 'generation_fuel_eia923'
denorm_table = 'boiler_generator_assn_eia860'
denorm_cols = ['plant_id_eia', 'report_date']
# the columns to group by: the id_cols + report_date
cols_to_grab = ['plant_id_eia', 'report_date']
# column -> aggregation function
ag_cols = {'fuel_consumed_mmbtu': pudl.helpers.sum_na,
           'net_generation_mwh': pudl.helpers.sum_na,}

In [18]:
# Denormalize, group and aggregate using the inputs from the cell above.
# NOTE(review): the recorded run of this cell ended in a KeyboardInterrupt.
# Presumably the outer merge on only plant_id_eia and report_date is
# many-to-many and therefore very large -- confirm before re-running.
(grab_denormalize_table(
    table,
    denorm_table=denorm_table,
    denorm_cols=denorm_cols).
 groupby(cols_to_grab).
 agg(ag_cols).reset_index())

   denormalizing generation_fuel_eia923


KeyboardInterrupt: 

## Playing with the compiled outputs 

In [19]:
# printing out the keys of the dictionary so you can see
# which plant granularities have been compiled
compiled_dfs.keys()

dict_keys(['plant_technology', 'plant', 'plant_unit', 'plant_prime_fuel'])

In [20]:
# pull out the compiled dataframe for the plant_unit granularity
plant_unit = compiled_dfs['plant_unit']

In [21]:
# if you want to look at an individual plant
plant_unit[plant_unit['plant_id_eia'] == 3]

Unnamed: 0,plant_id_eia,unit_id_pudl,report_date,fuel_cost_per_mwh,fuel_consumed_mmbtu,net_generation_mwh,capacity_mw
0,3,1,2011-01-01,49.066847,9320870.0,1073820.0,153.1
1,3,1,2012-01-01,53.815039,7284879.0,938031.0,153.1
2,3,1,2013-01-01,,9714519.0,1126106.0,153.1
3,3,1,2014-01-01,,,,153.1
4,3,1,2015-01-01,-132.311474,5937773.0,803834.0,153.1
5,3,1,2016-01-01,,8821365.0,1123279.0,153.1
6,3,1,2017-01-01,91.082285,6228557.0,841633.0,153.1
7,3,2,2011-01-01,51.295507,9320870.0,1073820.0,153.1
8,3,2,2012-01-01,56.845119,7284879.0,938031.0,153.1
9,3,2,2013-01-01,,9714519.0,1126106.0,153.1


In [22]:
# selecting on two criteria (plant_id_eia and report_date); each condition
# needs its own parentheses when combined with &
plant_unit[(plant_unit['plant_id_eia'] == 3) & (plant_unit['report_date'] == '2017-01-01')]

Unnamed: 0,plant_id_eia,unit_id_pudl,report_date,fuel_cost_per_mwh,fuel_consumed_mmbtu,net_generation_mwh,capacity_mw
6,3,1,2017-01-01,91.082285,6228557.0,841633.0,153.1
13,3,2,2017-01-01,82.77007,6228557.0,841633.0,153.1
26,3,4,2017-01-01,31.541425,6228557.0,841633.0,403.7
33,3,5,2017-01-01,28.684894,6228557.0,841633.0,788.8
40,3,6,2017-01-01,281.494647,74742684.0,10099596.0,2141.6
47,3,7,2017-01-01,277.633262,74742684.0,10099596.0,2141.6


In [23]:
# you can see the records where a field (here capacity_mw) is empty
plant_unit[plant_unit['capacity_mw'].isnull()]

Unnamed: 0,plant_id_eia,unit_id_pudl,report_date,fuel_cost_per_mwh,fuel_consumed_mmbtu,net_generation_mwh,capacity_mw
2537,1010,6,2012-01-01,,1202096.0,103137.005,
13809,54529,1,2012-01-01,,503600.0,20823.0,
13810,54529,1,2013-01-01,,606836.0,22051.0,
13811,54529,1,2014-01-01,,509947.0,14292.0,
13812,54529,1,2015-01-01,,697085.0,21636.0,
16659,58215,2,2013-01-01,,4237879.0,606807.005,
16702,58574,1,2013-01-01,,0.0,0.0,
16703,58574,1,2014-01-01,,3908.0,253.0,
16704,58574,1,2015-01-01,,0.0,0.0,


In [24]:
# you can see the records where a field (here capacity_mw) is not empty
plant_unit[plant_unit['capacity_mw'].notnull()]

Unnamed: 0,plant_id_eia,unit_id_pudl,report_date,fuel_cost_per_mwh,fuel_consumed_mmbtu,net_generation_mwh,capacity_mw
0,3,1,2011-01-01,49.066847,9320870.0,1073820.001,153.1
1,3,1,2012-01-01,53.815039,7284879.0,938030.999,153.1
2,3,1,2013-01-01,,9714519.0,1126105.996,153.1
3,3,1,2014-01-01,,,,153.1
4,3,1,2015-01-01,-132.311474,5937773.0,803834.000,153.1
...,...,...,...,...,...,...,...
16778,60100,1,2017-01-01,,0.0,0.000,11.5
16779,60122,1,2017-01-01,,,,4921.2
16780,60340,1,2017-01-01,,,,54.5
16781,60768,1,2016-01-01,,,,2476.0
