In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import pudl
import pudl.constants as pc
import pudl.extract.ferc1
import sqlalchemy as sa
import logging
import sys
%matplotlib inline

In [3]:
from copy import deepcopy

In [4]:
# Send every log record to stdout as the bare message text so that progress
# lines show up in the notebook's cell output.
formatter = logging.Formatter('%(message)s')
handler = logging.StreamHandler(stream=sys.stdout)
handler.setFormatter(formatter)

# Configure the root logger: replace whatever handlers it already had with
# the single stdout handler and let INFO-level records through.
logger = logging.getLogger()
logger.handlers = [handler]
logger.setLevel(logging.INFO)

In [6]:
# No explicit reporting window is chosen at the notebook level.
start_date, end_date = None, None

# Connect to the database named by the "pudl_db" entry of the default
# workspace settings.
pudl_settings = pudl.workspace.setup.get_defaults()
pudl_engine = sa.create_engine(pudl_settings["pudl_db"])

# Table metadata for that database, looked up by table name further down.
pt = pudl.output.pudltabl.get_table_meta(pudl_engine)

In [342]:
class compile_tables(object):
    """
    Lazily fetch PUDL tables as DataFrames and cache them by name.

    A table is read either straight from the database (when its name is a
    key of the table metadata stored in ``self.pt``) or by calling the
    same-named method of the ``PudlTabl`` output object in ``self.pudl_out``.
    Each fetched DataFrame is kept in ``self._dfs`` so it is only built once.
    """

    def __init__(self, pudl_engine, freq=None, start_date=None, end_date=None):
        """
        Initialize a table compiler.

        Args:
            pudl_engine: SQLAlchemy engine connected to the PUDL database.
            freq: passed through unchanged to ``PudlTabl``.
            start_date: earliest ``report_date`` to keep; anything accepted
                by ``pd.to_datetime``. Defaults to January 1st of the
                smallest year in ``pc.working_years['eia923']``.
            end_date: latest ``report_date`` to keep; anything accepted by
                ``pd.to_datetime``. Defaults to December 31st of the largest
                year in ``pc.working_years['eia923']``.

        Raises:
            AssertionError: if ``pudl_engine`` is falsy.
        """
        # Validate the engine before it is stored or handed to anything.
        if not pudl_engine:
            raise AssertionError('PudlTabl object needs a pudl_engine')
        self.pudl_engine = pudl_engine
        self.freq = freq

        if start_date is None:
            self.start_date = pd.to_datetime(
                '{}-01-01'.format(min(pc.working_years['eia923'])))
        else:
            # Make sure it's a date... and not a string.
            self.start_date = pd.to_datetime(start_date)

        if end_date is None:
            self.end_date = pd.to_datetime(
                '{}-12-31'.format(max(pc.working_years['eia923'])))
        else:
            # Make sure it's a date... and not a string.
            self.end_date = pd.to_datetime(end_date)

        # Grab the metadata object for the sqlite db.
        self.pt = pudl.output.pudltabl.get_table_meta(self.pudl_engine)

        # Hand the same date window to the output object so that tables
        # built by it cover the same period as tables read from the db.
        self.pudl_out = pudl.output.pudltabl.PudlTabl(
            pudl_engine=self.pudl_engine,
            freq=self.freq,
            start_date=self.start_date,
            end_date=self.end_date,
        )

        # Cache of fetched tables; None means "not fetched yet".
        self._dfs = {
            'generation_fuel_eia923': None,
            'fuel_receipts_costs_eia923': None,
            'generators_eia860': None,
            'boiler_generator_assn_eia860': None,

            'fuel_cost': None,
        }

    def grab_the_table(self, table):
        """
        Return the DataFrame for ``table``, fetching and caching it if needed.

        Args:
            table: name of a database table or of a method on the output
                object. ``None`` is accepted and yields ``None``.

        Returns:
            The cached or newly fetched table, or ``None`` when ``table`` is
            ``None``.

        Raises:
            AttributeError: if ``table`` is neither a key of ``self.pt`` nor
                an attribute of ``self.pudl_out``.
        """
        if table is None:
            return None

        # .get() lets names that were not pre-registered in _dfs be fetched
        # (and then cached) instead of failing with a KeyError.
        cached = self._dfs.get(table)
        if cached is not None:
            return cached

        # Only the metadata lookup is guarded: a KeyError raised while
        # actually reading the table must not be mistaken for "this is not
        # a database table".
        try:
            tbl = self.pt[table]
        except KeyError:
            tbl = None

        if tbl is None:
            # getattr turns the string of the table into an attribute
            # of the object, so this runs the output function
            print(f'   grabbing {table} from the output object')
            self._dfs[table] = getattr(self.pudl_out, table)()
        else:
            print(f'   grabbing {table} from the sqlite db')
            select = sa.sql.select([tbl, ])
            if self.start_date is not None:
                select = select.where(
                    tbl.c.report_date >= self.start_date)
            if self.end_date is not None:
                select = select.where(
                    tbl.c.report_date <= self.end_date)
            self._dfs[table] = pd.read_sql(
                select,
                self.pudl_engine,
                parse_dates=['report_date'],
                index_col=['id'],
            )
        return self._dfs[table]

In [347]:
# Aggregation recipe for every plant granularity that gets compiled.
#
# Top-level keys name a granularity. For each one:
#   'id_cols'   - columns that, together with 'report_date', are used as the
#                 groupby keys when the tables are aggregated.
#   'ag_tables' - tables to aggregate, keyed by table name. For each table:
#       'denorm_table' - table to merge in before aggregating (None = none)
#       'denorm_cols'  - columns to merge on (None when nothing is merged)
#       'ag_cols'      - {column: aggregation} mapping handed to .agg()
plant_granularities = {
    'plant': {
        'id_cols': ['plant_id_eia'],
        'ag_tables': {
            'fuel_cost': {
                'denorm_table': None,
                'denorm_cols': None,
                'ag_cols': {
                    'fuel_cost_per_mwh': 'mean',
                },
            },
            'generation_fuel_eia923': {
                'denorm_table': None,
                'denorm_cols': None,
                'ag_cols': {
                    'fuel_consumed_mmbtu': pudl.helpers.sum_na,
                    'net_generation_mwh': pudl.helpers.sum_na,
                },
            },
            'fuel_receipts_costs_eia923': {
                'denorm_table': None,
                'denorm_cols': None,
                'ag_cols': {
                    # NOTE(review): this is a per-mmbtu value being summed;
                    # confirm a sum (rather than an average) is intended.
                    'fuel_cost_per_mmbtu': pudl.helpers.sum_na,
                },
            },
            'generators_eia860': {
                'denorm_table': None,
                'denorm_cols': None,
                'ag_cols': {
                    'capacity_mw': pudl.helpers.sum_na,
                },
            },
        },
    },
    'plant_unit': {
        # unit_id_pudl are associated with plant_id's and
        # plant_id's/generator_id's
        'id_cols': ['plant_id_eia', 'unit_id_pudl'],
        'ag_tables': {
            'generators_eia860': {
                'denorm_table': 'boiler_generator_assn_eia860',
                'denorm_cols': ['plant_id_eia', 'generator_id', 'report_date'],
                'ag_cols': {
                    'capacity_mw': pudl.helpers.sum_na,
                },
            },
            'generation_fuel_eia923': {
                'denorm_table': 'boiler_generator_assn_eia860',
                # NOTE(review): generator_id is not part of these merge keys,
                # so if the association table has several rows per plant and
                # date each generation row is repeated for all of them --
                # verify the per-unit sums are not inflated.
                'denorm_cols': ['plant_id_eia', 'report_date'],
                'ag_cols': {
                    'fuel_consumed_mmbtu': pudl.helpers.sum_na,
                    'net_generation_mwh': pudl.helpers.sum_na,
                },
            },
            'fuel_cost': {
                # NOTE(review): an earlier comment here said the fuel cost
                # table already has unit_id_pudl in it, yet it is merged with
                # the association table anyway -- confirm which is right.
                # NOTE(review): the plant granularity takes the 'mean' of
                # fuel_cost_per_mwh while this one sums it -- confirm.
                'denorm_table': 'boiler_generator_assn_eia860',
                'denorm_cols': ['plant_id_eia', 'generator_id', 'report_date'],
                'ag_cols': {
                    'fuel_cost_per_mwh': pudl.helpers.sum_na,
                },
            },
        },
    },
    'plant_prime_fuel': {
        'id_cols': ['plant_id_eia', 'energy_source_code_1'],
        # Nothing is aggregated at this granularity yet.
        'ag_tables': {},
    },
}

In [316]:
def grab_denormalize_table(table, denorm_table=None, denorm_cols=None,
                           compiler=None):
    """
    Fetch a table and optionally merge a second table into it.

    Args:
        table: name of the table to fetch.
        denorm_table: name of the table to merge in. When falsy, ``table``
            is returned as fetched.
        denorm_cols: column name(s) to merge on; passed as ``on=`` to
            ``DataFrame.merge``.
        compiler: object with a ``grab_the_table(name)`` method used to
            fetch both tables. Defaults to the notebook-level
            ``table_compiler``, so existing calls keep working.

    Returns:
        The fetched table, outer-merged with ``denorm_table`` when one is
        given.
    """
    if compiler is None:
        # Resolved at call time, so the notebook-level compiler only has to
        # exist by the time this function is actually called.
        compiler = table_compiler

    table_df = compiler.grab_the_table(table)
    if denorm_table:
        logger.info(f'denormalizing {table}')
        denorm_df = compiler.grab_the_table(denorm_table)
        # Outer merge: rows without a match on either side are kept.
        table_df = table_df.merge(denorm_df, on=denorm_cols, how='outer')
    return table_df

In [348]:
# Shared compiler instance used by grab_denormalize_table; the 'AS' freq is
# forwarded to the output object.
table_compiler = compile_tables(pudl_engine=pudl_engine, freq='AS')

In [None]:
# Build one aggregated DataFrame per plant granularity and collect them in
# compiled_dfs, keyed by the granularity name.
compiled_dfs = {}
for plant_gran, gran_spec in plant_granularities.items():
    logger.info(f'compiling data for {plant_gran}')
    # Every table is grouped and merged on the id columns plus the date.
    cols_to_grab = gran_spec['id_cols'] + ['report_date']
    # Start from an empty frame that only has the key columns.
    all_the_stuff = pd.DataFrame(columns=cols_to_grab)
    for table, table_details in gran_spec['ag_tables'].items():
        logger.info(f'   aggregating {table}')
        # grab the table (merged with its denorm table when one is set)
        denormalized = grab_denormalize_table(
            table,
            denorm_table=table_details['denorm_table'],
            denorm_cols=table_details['denorm_cols'],
        )
        aggregated = (
            denormalized
            .groupby(cols_to_grab)
            .agg(table_details['ag_cols'])
            .reset_index()
        )
        # merge the new table into the compiled df
        all_the_stuff = aggregated.merge(
            all_the_stuff, how='outer', on=cols_to_grab)
    # add the df into a dictionary of dfs
    compiled_dfs[plant_gran] = all_the_stuff

compiling data for plant
   aggregating fuel_cost
   grabbing fuel_cost from the output object
   aggregating generation_fuel_eia923
   grabbing generation_fuel_eia923 from the sqlite db


In [345]:
# Spot-check: fetch the fuel_cost table through the compiler. As the last
# expression of the cell, the returned DataFrame is rendered as the output.
table_compiler.grab_the_table('fuel_cost')

   grabbing fuel_cost from the output object


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


Unnamed: 0,report_date,plant_id_eia,plant_id_pudl,plant_name,utility_id_eia,utility_id_pudl,utility_name,generator_id,net_generation_mwh,fuel_type_code_pudl,fuel_type_count,fuel_cost_per_mmbtu,heat_rate_mmbtu_mwh,fuel_cost_per_mwh
0,2011-01-01,3,32,Barry,195,18,Alabama Power Co,1,312130.0,coal,2,4.295619,11.422532,49.066847
1,2011-01-01,3,32,Barry,195,18,Alabama Power Co,2,191475.0,coal,2,4.295619,11.941354,51.295507
2,2011-01-01,3,32,Barry,195,18,Alabama Power Co,3,710069.0,coal,2,4.295619,10.583450,45.462469
3,2011-01-01,3,32,Barry,195,18,Alabama Power Co,4,1175685.0,coal,2,4.295619,10.180722,43.732504
4,2011-01-01,3,32,Barry,195,18,Alabama Power Co,5,2264413.0,coal,2,4.295619,10.181889,43.737514
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22679,2017-01-01,60100,7582,Dinuba Energy,59849,1488,Dinuba Energy,G-1,0.0,waste,1,,,
22680,2017-01-01,60122,10751,Colorado Bend II,6035,1692,Exelon Power,CT7,,gas,1,,,
22681,2017-01-01,60122,10751,Colorado Bend II,6035,1692,Exelon Power,CT8,,gas,1,,,
22682,2017-01-01,60122,10751,Colorado Bend II,6035,1692,Exelon Power,STG9,,gas,1,,,


In [None]:
# Show the 'ag_tables' spec of whichever granularity `plant_gran` is
# currently bound to. `plant_gran` is the loop variable of the compile loop,
# so this cell only works after that loop has run in the same kernel.
plant_granularities[plant_gran]['ag_tables']