# Energy Innovation MCOE Compilation

### Contents
- <a href=#setup>Setup</a>
- <a href=#key_inputs>Key Inputs</a>
- <a href=#ass_func>Assisting Functions</a>
    * <a href=#p1_func>Part 1 Functions</a>
    * <a href=#p2_func>Part 2 Functions</a>
- <a href=#data_out>Data Outputs</a>
    * <a href=#part1>Part 1: Basic Plant & Unit Information</a>
    * <a href=#part2>Part 2: Cost Data</a>
    * Part 3: Emissions & Public Health Data

-------------

## <a id='setup'>Setup</a>

In [34]:
# Enable IPython's autoreload extension (mode 2) so edits to imported modules are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [35]:
import pudl
import pandas as pd
import sqlalchemy as sa
import sys
import logging
import pathlib
import json
import datetime

In [37]:
# Basic setup for logging: the root logger emits INFO-and-above messages to stdout,
# formatted as the bare message text.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
handler = logging.StreamHandler(stream=sys.stdout)
formatter = logging.Formatter('%(message)s')
handler.setFormatter(formatter)
# Replace (rather than append to) the handler list so re-running this cell does not
# produce duplicated log lines.
logger.handlers = [handler]
# Show every column when a DataFrame is displayed.
pd.options.display.max_columns = None

In [38]:
# pudl_settings is a dictionary that includes the paths to several key pudl directories
pudl_settings = pudl.workspace.setup.get_defaults()
# SQLAlchemy engine built from the "pudl_db" entry of the settings dictionary.
pudl_engine = sa.create_engine(pudl_settings["pudl_db"])

In [39]:
# the creation of the pudl_out object
# this will compile output tables like mcoe - if you want to restart/wipe the saved elements rerun this cell
# NOTE(review): freq='AS' presumably requests annual (year-start) aggregation -- confirm against PudlTabl docs.
pudl_out = pudl.output.pudltabl.PudlTabl(pudl_engine, freq='AS', rolling=True)

In [79]:
# Grab key working tables
# `mcoe` is used by the EIA validation cells near the end of the notebook; the Part 1 and
# Part 2 entry points call pudl_out.mcoe() themselves.
mcoe = pudl_out.mcoe() # The original function is in pudl.analysis.mcoe.mcoe()
ferc1_steam = pudl_out.plants_steam_ferc1()
ferc1_fuel = pudl_out.fuel_ferc1()

--------------

## <a id='key_inputs'>Key Inputs</a>
    

These values are used to group the data by pre-specified columns

In [43]:
# Column groupings shared by the functions below:
#   *_index_cols  -> groupby / merge keys at plant, plant+fuel, and unit granularity
#   merge_cols_*  -> descriptive columns merged back onto the aggregated result
#   eia_sum_cols / ferc_sum_cols -> numeric columns summed within each group
#   eia_pct_cols  -> columns passed as sum_cols for the plant-level Part 1 table
input_dict = {'plant_index_cols': ['plant_id_pudl', 'report_year'],
              'fuel_index_cols': ['plant_id_pudl', 'fuel_type_code_pudl', 'report_year'],
              'unit_index_cols': ['plant_id_pudl','plant_id_eia', 'unit_id_pudl', 'fuel_type_code_pudl', 'report_year'],
              'merge_cols_qual': ['state', 'city', 'latitude', 'longitude'],
              'merge_cols_simple': ['fuel_type_code_pudl'],
              'eia_sum_cols': ['total_fuel_cost','net_generation_mwh','capacity_mw'],
              'eia_pct_cols': ['net_generation_mwh','capacity_mw'],
              'ferc_sum_cols': ['capex_total','opex_fuel','opex_production_total']}

In [44]:
# Maps each column to be weight-averaged to the column used as its weight
# (generator age weighted by capacity, heat rate weighted by net generation).
eia_wa_col_dict = {'generator_age_years':'capacity_mw','heat_rate_mmbtu_mwh':'net_generation_mwh'}

In [45]:
# Broad fuel categories iterated over when splitting FERC Form 1 costs by fuel.
fuel_types = ['coal','gas','oil','waste']

------------

## <a id='ass_func'>Assisting Functions</a>

In [84]:
def date_to_year(df):
    """Convert report_date to report_year for MCOE table.

    Args:
        df (pandas.DataFrame): table with a datetime-like ``report_date`` column.

    Returns:
        pandas.DataFrame: a new frame with a ``report_year`` column (taken from
        ``report_date.dt.year``) and with ``report_date`` removed. The input
        frame is left untouched.
    """
    # Build the result with assign() instead of item assignment so the caller's
    # frame (which may be a table shared with other cells) is not modified as a
    # side effect of the conversion.
    return (
        df.assign(report_year=lambda x: x['report_date'].dt.year)
        .drop('report_date', axis=1)
    )

In [85]:
def add_generator_age(df):
    """Add column for generator age.

    Args:
        df (pandas.DataFrame): table with an ``operating_date`` column (anything
            ``pandas.to_datetime`` can parse) and a ``report_year`` column.

    Returns:
        pandas.DataFrame: a new frame in which ``operating_date`` is datetime
        typed and ``generator_age_years`` equals ``report_year`` minus the year
        of ``operating_date``. The input frame is left untouched.
    """
    # Vectorised conversion; missing dates become NaT and therefore yield a
    # missing age instead of raising.
    operating_date = pd.to_datetime(df['operating_date'])

    # Age is a difference of calendar years, not an exact elapsed duration.
    return df.assign(
        operating_date=operating_date,
        generator_age_years=df['report_year'] - operating_date.dt.year,
    )

**<a id='p1_func'>Part 1 Functions</a>** - Mostly neutral and applicable across EIA and FERC datasets. Their primary purpose is to use groupby and merge functions to reorient and grab subsets of data. Some of them are used in Part 2 as well.

In [46]:
def test_segment(df):
    """Grab small portion of the find database that help to visualize whether the aggregation was done correctly."""
    df = df.loc[df['plant_id_pudl']==32].sort_values('report_year',ascending=False)
    
    return df

In [47]:
# NEW --- takes multiple arguments
def weighted_average(df, wa_col_dict, by_cols):
    """Generate a weighted average for multiple columns at once.

    Args:
        df (pandas.DataFrame): source table; it is not modified.
        wa_col_dict (dict): maps each data column to the column used as its weight.
        by_cols (list): columns defining the groups.

    Returns:
        pandas.DataFrame: the ``by_cols`` of every input row (not de-duplicated)
        plus one ``weighted_ave_<data column>`` column per entry of
        ``wa_col_dict`` holding the group's weighted average.
    """
    merge_df = df[by_cols]
    for data, weight in wa_col_dict.items():
        # Work on a private frame so no scratch columns are ever added to (or
        # left behind on) the caller's table.
        scratch = df[by_cols].assign(
            _data_times_weight=df[data] * df[weight],
            # Zero the weight wherever the data value is missing so such rows
            # do not inflate the denominator.
            _weight_where_notnull=df[weight] * pd.notnull(df[data]),
        )
        grouped = scratch.groupby(by_cols)
        result = grouped['_data_times_weight'].sum() / grouped['_weight_where_notnull'].sum()
        result = result.to_frame(name='weighted_ave_'+data).reset_index()
        merge_df = pd.merge(merge_df, result, on=by_cols, how='outer')

    return merge_df

In [112]:
def regroup_data(df, index_cols, merge_cols=None, wa_col_dict=None, sum_cols=None, drop_calcs=False, count_col=False):
    """Regroup data either by plant or unit and run aggregation calculations.

    Args:
        df (pandas.DataFrame): source table; it is only read here.
        index_cols (list): columns defining the groups (plant, plant+fuel, unit, ...).
        merge_cols (list, optional): descriptive columns to carry onto the
            result. ``None`` (the default) means no extra columns.
        wa_col_dict (dict, optional): data column -> weight column, forwarded to
            ``weighted_average``.
        sum_cols (list, optional): columns to sum within each group.
        drop_calcs (bool): if True, return only ``index_cols + merge_cols``.
        count_col (bool): currently unused; an earlier attempt to add a per-group
            row count disturbed the column names and was not completed.

    Returns:
        pandas.DataFrame: distinct rows of group keys, requested aggregates and
        merge columns, with a fresh 0..n-1 index.
    """
    merge_cols = [] if merge_cols is None else list(merge_cols)

    # Each intermediate table is reduced to distinct rows *before* merging.
    # Merging the raw (one row per generator) key frames against each other
    # multiplies rows within every group and only collapses again at the final
    # drop_duplicates(); de-duplicating first yields the same distinct rows
    # without the blow-up.
    distinct_keys = df[index_cols].drop_duplicates()
    sum_df = distinct_keys
    wa_df = distinct_keys

    # Find sum of other relevant fields per unit/plant
    if sum_cols is not None:
        sum_df = df.groupby(index_cols, as_index=False)[sum_cols].sum()
    # Find weighted average of generator ages (based on capacity) and heat rates (based on net gen) per unit/plant
    if wa_col_dict is not None:
        wa_df = weighted_average(df, wa_col_dict, index_cols).drop_duplicates()

    # Merge sum and weighted average tables
    calc_df = pd.merge(sum_df, wa_df, on=index_cols, how='outer')
    # Merge conglomerate table with final 'merge_cols'
    qual_df = df[index_cols + merge_cols].drop_duplicates()
    result_df = pd.merge(calc_df, qual_df, on=index_cols, how='left')

    # Conditional to comply with EI contract request to have plant-level data bare.
    if drop_calcs:
        result_df = result_df[index_cols + merge_cols]

    return result_df.drop_duplicates().reset_index(drop=True)

In [49]:
def year_selector(df, start_year, end_year):
    """Define the range of dates represented in final dataframe.

    Args:
        df (pandas.DataFrame): table with a ``report_year`` column.
        start_year (int or None): first year to keep (inclusive); ``None``
            leaves the range open at the start.
        end_year (int or None): last year to keep (inclusive); ``None`` leaves
            the range open at the end.

    Returns:
        pandas.DataFrame: the rows of ``df`` whose ``report_year`` lies in the
        requested range.
    """
    years = df['report_year']
    keep = pd.Series(True, index=df.index)
    # Each bound is applied only when supplied, so a caller may pass just one of
    # them instead of hitting a TypeError on ``None + 1``.
    if start_year is not None:
        keep = keep & (years >= start_year)
    if end_year is not None:
        keep = keep & (years <= end_year)

    return df.loc[keep]

In [140]:
def part1_main(pudl_out, level, start_yr=None, end_yr=None):
    """Create final data output for Part 1.

    Args:
        pudl_out: object exposing an ``mcoe()`` method that returns the
            generator-level table.
        level (str): ``'plant'`` or ``'unit'``.
        start_yr (int, optional): first report year to keep (inclusive).
        end_yr (int, optional): last report year to keep (inclusive).

    Returns:
        pandas.DataFrame: the plant-level table (keys and descriptive columns
        only) or the unit-level table (keys, sums, weighted averages and
        descriptive columns).

    Raises:
        ValueError: if ``level`` is neither ``'plant'`` nor ``'unit'``.
    """
    # Validate before doing any work so a typo fails immediately with a clear
    # message rather than as an unbound-variable error at the end.
    if level not in ('plant', 'unit'):
        raise ValueError(f"level must be 'plant' or 'unit', got {level!r}")

    # Prep mcoe table data
    df = pudl_out.mcoe()
    df = add_generator_age(date_to_year(df))

    if level == 'plant':
        level_df = regroup_data(df, input_dict['plant_index_cols'],
                                merge_cols=input_dict['merge_cols_qual'],
                                wa_col_dict=eia_wa_col_dict,
                                sum_cols=input_dict['eia_pct_cols'],
                                drop_calcs=True)
    else:
        level_df = regroup_data(df, input_dict['unit_index_cols'],
                                merge_cols=input_dict['merge_cols_qual'],
                                wa_col_dict=eia_wa_col_dict,
                                sum_cols=input_dict['eia_sum_cols'])

    if start_yr is not None or end_yr is not None:
        # Fill a missing bound from the data so that supplying only one of the
        # two years works instead of failing or being silently ignored.
        years = level_df['report_year']
        first = years.min() if start_yr is None else start_yr
        last = years.max() if end_yr is None else end_yr
        # min()/max() are NaN on an empty table; there is nothing to filter then.
        if pd.notnull(first) and pd.notnull(last):
            level_df = year_selector(level_df, int(first), int(last))
    return level_df

**<a id='p2_func'>Part 2 Functions</a>** - Primarily used to transform EIA923 and FERC Form 1 data so that they are compatible with one another. We use EIA923 data broken down by plant and fuel type to inform the FERC Form 1 data disaggregation in the same manner. In other words, we calculate the percent that each fuel type contributes to a given plant-level statistic (in this case capacity, net generation, or cost) for the EIA data and use those fuel percentages across statistics to disaggregate FERC Form 1 fixed and operating cost data in a similar manner. We use the combined information from EIA923 and FERC Form 1 to calculate an mcoe value for each fuel type within each plant for any given report year.

In [115]:
def eia_pct_df_maker(df, col):
    """Pivot per-fuel percentages so each fuel type becomes its own column.

    Reads the ``'pct_' + col`` column of ``df``, pivots it with the plant index
    columns as rows and ``fuel_type_code_pudl`` as columns, and renames the
    coal / gas / oil / waste columns to ``pct_<col>_<fuel>``.
    """
    pct_name = 'pct_' + col
    wide = df.pivot_table(values=pct_name,
                          index=input_dict['plant_index_cols'],
                          columns='fuel_type_code_pudl')
    wide = wide.reset_index()

    # Only these four fuel columns are renamed; any other column is left as is.
    fuel_renames = {fuel: pct_name + '_' + fuel
                    for fuel in ('coal', 'gas', 'oil', 'waste')}
    return wide.rename(columns=fuel_renames)

In [116]:
def calc_eia_fuel_percentages(df, pct_col1, pct_col2):
    """Compute each fuel's share of two plant-level columns (likely capacity and net gen).

    For both input columns a ``pct_<col>`` column is added to ``df`` (in place)
    as ``<col> / <col>_plant_level``. Each share is then pivoted to one column
    per fuel type and the two wide tables are outer-merged on the plant index
    columns.
    """
    targets = (pct_col1, pct_col2)

    # Share of the plant-level total contributed by each fuel row.
    for name in targets:
        df['pct_' + name] = df[name] / df[name + '_plant_level']

    # One wide table per target column (makes the FERC1 calculations easier).
    first_wide, second_wide = [eia_pct_df_maker(df, name) for name in targets]

    return pd.merge(first_wide, second_wide,
                    on=input_dict['plant_index_cols'], how='outer')

In [137]:
def prep_eia_data(df):
    """Sum the EIA mcoe columns for every plant / fuel type / report year group."""
    return regroup_data(df,
                        index_cols=input_dict['fuel_index_cols'],
                        sum_cols=input_dict['eia_sum_cols'])

In [138]:
def eia_fuel_pcts(df):
    """Build the plant-level fuel percentage table used to split FERC Form 1 data.

    Combines per-fuel sums with plant-wide totals and returns, for net
    generation and capacity, the fraction contributed by each fuel type.
    """
    # Sums per plant, fuel type and year.
    by_fuel = prep_eia_data(df)

    # Sums per plant and year (all fuels combined); the fuel type column is
    # carried along so the totals can be joined back onto the per-fuel rows.
    plant_totals = regroup_data(df,
                                input_dict['plant_index_cols'],
                                merge_cols=input_dict['merge_cols_simple'],
                                sum_cols=input_dict['eia_sum_cols'])

    # Suffix the plant-wide sums to tell them apart from the per-fuel sums.
    total_names = {name: name + '_plant_level'
                   for name in ('total_fuel_cost', 'net_generation_mwh', 'capacity_mw')}
    plant_totals = plant_totals.rename(columns=total_names)

    # Each per-fuel row now also shows its plant total.
    combined = pd.merge(by_fuel, plant_totals,
                        on=input_dict['fuel_index_cols'], how='left')

    # Share of net generation and capacity attributable to each fuel type.
    return calc_eia_fuel_percentages(combined, 'net_generation_mwh', 'capacity_mw')

In [129]:
def ferc1_plant_level_prep(df):
    """Ready FERC Form 1 data for merging with the EIA-923 fuel percent breakdown.

    Sums the FERC cost columns per plant and year, derives the non-fuel
    operating expense, and suffixes the cost columns with ``_ferc1``.
    """
    # NOTE: no per-plant record count is produced; regroup_data() does not emit
    # a 'count' column, so the 'count' rename below currently matches nothing.
    plant_costs = regroup_data(df,
                               input_dict['plant_index_cols'],
                               sum_cols=input_dict['ferc_sum_cols'])

    # Non-fuel opex = total production opex minus fuel opex.
    production = plant_costs['opex_production_total']
    plant_costs['opex_nofuel_ferc1'] = production - plant_costs['opex_fuel']

    # Mark the remaining cost columns as originating from FERC Form 1.
    ferc_names = {name: name + '_ferc1'
                  for name in ('count', 'capex_total', 'opex_fuel', 'opex_production_total')}
    return plant_costs.rename(columns=ferc_names)

In [130]:
def ferc_cost_pct_breakdown(df):
    """Split FERC Form 1 plant costs across fuels using the EIA fuel shares.

    For every entry of ``fuel_types`` two columns are added to ``df`` in place:
    ``capex_<fuel>`` (total capex times the fuel's capacity share) and
    ``opex_nofuel_<fuel>`` (non-fuel opex times the fuel's net generation share).
    """
    for fuel in fuel_types:
        capacity_share = df['pct_capacity_mw_' + fuel]
        df['capex_' + fuel] = df['capex_total_ferc1'] * capacity_share

        generation_share = df['pct_net_generation_mwh_' + fuel]
        df['opex_nofuel_' + fuel] = df['opex_nofuel_ferc1'] * generation_share
    return df

In [131]:
def cost_subtable_maker(df, cost):
    """Reshape the per-fuel columns of one cost type into long (plant, year, fuel) form.

    Selects ``<cost>_coal/gas/oil/waste`` together with the plant index
    columns, melts them so the fuel becomes a ``fuel_type_code_pudl`` column and
    the amount a ``<cost>`` column, and drops rows where the amount is missing.
    """
    id_cols = input_dict['plant_index_cols']
    fuels = ['coal', 'gas', 'oil', 'waste']

    # Keep only the keys and this cost's per-fuel columns, named by bare fuel.
    per_fuel = df[id_cols + [cost + '_' + fuel for fuel in fuels]]
    per_fuel = per_fuel.rename(columns={cost + '_' + fuel: fuel for fuel in fuels})

    long_names = {'value': cost, 'variable': 'fuel_type_code_pudl'}
    long_df = pd.melt(per_fuel, id_cols).rename(columns=long_names)

    return long_df.dropna(subset=[cost])

In [132]:
def merge_ferc_with_eia_pcts(eia_pct_df, ferc_df):
    """Produce a FERC Form 1 cost table broken down by plant, year and fuel type.

    The EIA fuel shares are outer-merged with the plant-level FERC costs, the
    costs are apportioned to each fuel, and the resulting capex and non-fuel
    opex tables are outer-merged on the plant / fuel / year keys.
    """
    plant_keys = input_dict['plant_index_cols']
    fuel_keys = input_dict['fuel_index_cols']

    # Attach the fuel shares to the FERC1 costs and apportion the costs.
    apportioned = ferc_cost_pct_breakdown(
        pd.merge(eia_pct_df, ferc_df, on=plant_keys, how='outer'))

    # Long-format table for each cost type.
    capex_long = cost_subtable_maker(apportioned, 'capex')
    opex_long = cost_subtable_maker(apportioned, 'opex_nofuel')

    return pd.merge(capex_long, opex_long, on=fuel_keys, how='outer')

In [141]:
def merge_ferc_eia_mcoe(eia_fuel_df, ferc_fuel_df):
    """Merge the per-plant, per-fuel EIA and FERC tables and add an mcoe column.

    The tables are outer-merged on the plant / fuel / year keys, the value
    columns are renamed to show their source form, ``mcoe`` is computed, and
    the columns are returned in a fixed order.
    """
    source_labels = {
        'total_fuel_cost': 'fuel_cost_eia923',
        'net_generation_mwh': 'net_generation_mwh_eia923',
        'capacity_mw': 'capacity_mw_eia860',
        'capex': 'capex_ferc1',
        'opex_nofuel': 'opex_nofuel_ferc1',
    }
    merged = pd.merge(eia_fuel_df, ferc_fuel_df,
                      on=input_dict['fuel_index_cols'], how='outer')
    merged = merged.rename(columns=source_labels)

    # mcoe = (fuel cost + non-fuel opex + capex * capacity) / net generation
    # NOTE(review): capex_ferc1 is derived from a total capex figure, so
    # multiplying it by capacity (MW) looks dimensionally inconsistent --
    # confirm the intended formula before relying on these values.
    operating_costs = merged['fuel_cost_eia923'] + merged['opex_nofuel_ferc1']
    capex_term = merged['capex_ferc1'] * merged['capacity_mw_eia860']
    merged = merged.assign(
        mcoe=(operating_costs + capex_term) / merged['net_generation_mwh_eia923'])

    column_order = ['plant_id_pudl', 'fuel_type_code_pudl', 'report_year',
                    'fuel_cost_eia923', 'net_generation_mwh_eia923',
                    'capacity_mw_eia860', 'capex_ferc1', 'opex_nofuel_ferc1',
                    'mcoe']
    return merged[column_order]

In [139]:
def part2_main(pudl_out, start_yr=None, end_yr=None):
    """Create final Part 2 data output.

    Args:
        pudl_out: object exposing ``mcoe()`` and ``plants_steam_ferc1()``.
        start_yr (int, optional): first report year to keep (inclusive).
        end_yr (int, optional): last report year to keep (inclusive).

    Returns:
        pandas.DataFrame: EIA and FERC cost data by plant, fuel type and report
        year, including the ``mcoe`` column.
    """
    eia_raw = pudl_out.mcoe()
    ferc_raw = pudl_out.plants_steam_ferc1()

    # All groupings below key on 'report_year'. part1_main() derives that column
    # from 'report_date' via date_to_year(); do the same here when it is absent
    # so this function does not depend on another cell having run first.
    if 'report_year' not in eia_raw.columns:
        eia_raw = date_to_year(eia_raw)

    # Fuel shares come from eia_fuel_pcts() (the helper defined in this notebook).
    ferc_prep = merge_ferc_with_eia_pcts(eia_fuel_pcts(eia_raw), ferc1_plant_level_prep(ferc_raw))
    eia_prep = prep_eia_data(eia_raw)

    mcoe_df = merge_ferc_eia_mcoe(eia_prep, ferc_prep)

    if start_yr is not None or end_yr is not None:
        # Fill a missing bound from the data so that supplying only one of the
        # two years works instead of failing or being silently ignored.
        years = mcoe_df['report_year']
        first = years.min() if start_yr is None else start_yr
        last = years.max() if end_yr is None else end_yr
        # min()/max() are NaN on an empty table; there is nothing to filter then.
        if pd.notnull(first) and pd.notnull(last):
            mcoe_df = year_selector(mcoe_df, int(first), int(last))
    return mcoe_df

-----------

## <a id='data_out'>Data Outputs</a>

#### <a id='part1'>Part 1: Plant & Unit Level Data</a>
EIA generator-level data aggregated by either plant or unit and subdivided by broad fuel type (coal, gas, oil, waste). Generator age calculated by weighted average (capacity as weight) and heat rate calculated by weighted average (net generation as weight). Capacity and net generation calculated by summing generator-level data.

**Plant Level**

In [87]:
# Build the plant-level Part 1 table and preview the rows test_segment() selects.
plant_df = part1_main(pudl_out, 'plant')
test_segment(plant_df)

Unnamed: 0,plant_id_pudl,report_year,state,city,latitude,longitude
813,32,2017,AL,Bucks,31.0069,-88.0103
749,32,2016,AL,Bucks,31.0069,-88.0103
649,32,2015,AL,Bucks,31.0069,-88.0103
648,32,2013,AL,Bucks,31.0069,-88.0103
623,32,2012,AL,Bucks,31.0069,-88.0103
598,32,2011,AL,Bucks,31.0069,-88.0103


**Unit Level**

In [88]:
# Build the unit-level Part 1 table and preview the 2017 rows test_segment() selects.
unit_level_df = part1_main(pudl_out, 'unit')
test_segment(unit_level_df).query('report_year==2017')

Unnamed: 0,plant_id_pudl,plant_id_eia,unit_id_pudl,fuel_type_code_pudl,report_year,total_fuel_cost,net_generation_mwh,capacity_mw,weighted_ave_generator_age_years,weighted_ave_heat_rate_mmbtu_mwh,state,city,latitude,longitude
342,32,3,7.0,gas,2017,97585150.0,4217873.0,535.4,17.0,6.917677,AL,Bucks,31.0069,-88.0103
278,32,3,1.0,gas,2017,657705.2,7221.0,153.1,63.0,27.23353,AL,Bucks,31.0069,-88.0103
315,32,3,6.0,gas,2017,98502010.0,4199100.0,535.4,17.0,7.013889,AL,Bucks,31.0069,-88.0103
282,32,3,2.0,gas,2017,620610.0,7498.0,153.1,63.0,24.748185,AL,Bucks,31.0069,-88.0103
296,32,3,5.0,coal,2017,77744900.0,2710308.0,788.8,46.0,9.881649,AL,Bucks,31.0069,-88.0103
291,32,3,4.0,coal,2017,22790380.0,722554.0,403.7,48.0,10.865694,AL,Bucks,31.0069,-88.0103


#### <a id='part2'>Part 2: Cost Data</a>

Cost and generation data from EIA and FERC subdivided by plant and broad fuel type.

MCOE Variables & Origins:
- Fuel cost = **EIA**: *total_fuel_cost*
- MW Capacity = **EIA**: *capacity_mw*
- Net MWh Generated = **EIA**: *net_generation_mwh*
- Variable O&M = **FERC**: *(opex_production_total) - (opex_fuel)*
- Fixed O&M = **FERC**: *capex_total*

**MCOE**

In [136]:
# Build the Part 2 cost table (all years) and preview the rows test_segment() selects.
mcoe_data = part2_main(pudl_out)
test_segment(mcoe_data)

Unnamed: 0,plant_id_pudl,fuel_type_code_pudl,report_year,fuel_cost_eia923,net_generation_mwh_eia923,capacity_mw_eia860,capex_ferc1,opex_nofuel_ferc1,mcoe
97,32,coal,2017,100535300.0,3432862.0,1192.5,947881800.0,31608100.0,329311.564142
100,32,gas,2017,197365500.0,8431692.0,1377.0,1094535000.0,77634870.0,178783.800261
96,32,coal,2016,145946800.0,4357881.0,1192.5,1025313000.0,32030670.0,280609.766978
99,32,gas,2016,168962500.0,8401360.0,1070.8,920675400.0,61750480.0,117372.651362
95,32,coal,2015,152946300.0,4605304.0,1464.5,1018691000.0,40326390.0,323988.778939
98,32,gas,2015,151115900.0,6783671.0,1223.9,851332500.0,59401280.0,153627.191411
94,32,coal,2013,79820380.0,1675131.0,403.7,1712071000.0,70039520.0,412691.875258
93,32,coal,2012,256313700.0,5319147.0,1770.7,1712178000.0,70617160.0,570031.210571
92,32,coal,2011,207874000.0,4653772.0,1770.7,1679821000.0,84803140.0,639213.044735


-----------------
-----------

##### **Plants With Significantly Different Heat Rates** <font color=grey>*test*</font> 
Using unit-level data from Part 1, find out whether there are any major discrepancies in heat rate for a given plant.

In [44]:
# Sum generator heat rate by plant and fuel type
# NOTE(review): `unit_level_df_all_years` is not defined anywhere in the visible notebook, and
# the Part 1 unit-level table produced by part1_main() carries `report_year` and
# `weighted_ave_heat_rate_mmbtu_mwh` rather than the `report_date` and
# `weighted_ave_heat_rate_mwh` columns used here -- this cell looks stale; confirm the
# intended input before re-running.
# NOTE(review): this adds the unit heat rates together; confirm a sum (not an average) is
# the intended plant-level comparison value.
plant_heat_rate = unit_level_df_all_years.groupby([
    'plant_id_pudl','fuel_type_code_pudl','report_date'],as_index=False)[
        'weighted_ave_heat_rate_mwh'].sum().rename(columns={
            'weighted_ave_heat_rate_mwh':'plant_heat_rate'})

# Merge back with unit-level data
plant_hr_merge = pd.merge(unit_level_df_all_years, plant_heat_rate, on=['plant_id_pudl','fuel_type_code_pudl','report_date'], how='outer')

In [45]:
# Keep only the identifying columns plus the unit and plant heat rates
plant_hr_merge = plant_hr_merge[['plant_id_pudl','unit_id_pudl','fuel_type_code_pudl',
                                 'report_date','weighted_ave_heat_rate_mwh','plant_heat_rate']]

# TODO: Add new column for calculation of whether its significantly different or not

In [46]:
# Preview the 2017 heat rates for plant 32.
# NOTE(review): `datetime` is already imported in the Setup cell; this repeated import is redundant.
import datetime
plant_hr_merge.loc[(plant_hr_merge['plant_id_pudl']==32)&(plant_hr_merge['report_date']==datetime.datetime(2017,1,1))]

Unnamed: 0,plant_id_pudl,unit_id_pudl,fuel_type_code_pudl,report_date,weighted_ave_heat_rate_mwh,plant_heat_rate
181,32,1.0,gas,2017-01-01,27.23353,65.913281
182,32,2.0,gas,2017-01-01,24.748185,65.913281
183,32,6.0,gas,2017-01-01,7.013889,65.913281
184,32,7.0,gas,2017-01-01,6.917677,65.913281
194,32,4.0,coal,2017-01-01,10.865694,20.747343
195,32,5.0,coal,2017-01-01,9.881649,20.747343


#### Data Validation FERC vs. EIA

In [77]:
# Snatched from ferc1-eia923-comparison notebook
# FERC1 data merge 

# NOTE(review): `fuel_ferc1` is assigned here but not used below in this cell -- `nf` is
# built from a fresh pudl_out.fuel_ferc1() call.
fuel_ferc1 = pudl_out.fuel_ferc1()#[[
    #'report_year',
    #'plant_id_pudl',
    #'fuel_type_code_pudl',
    #'fuel_consumed_mmbtu',
    #'fuel_consumed_total_cost',
    #'fuel_cost_per_mmbtu'
#]]
steam_ferc1 = pudl_out.plants_steam_ferc1()#[[
    #'report_year',
    #'plant_id_pudl',
    #'capacity_mw',
    #'net_generation_mwh'
#]]

# Fuel table reshaped by pudl.transform.ferc1.fuel_by_plant_ferc1().
nf = pudl.transform.ferc1.fuel_by_plant_ferc1(pudl_out.fuel_ferc1())

# Columns shared by the fuel and steam tables, used as the join key.
key_cols = [
    'report_year',
    'utility_id_ferc1',
    'plant_name_ferc1',
]
# Inner-join fuel and steam records, derive a heat rate (fuel_mmbtu / net_generation_mwh),
# then attach the utility and plant identifiers from the steam table.
ferc1_plants = (
    pd.merge(nf, steam_ferc1, on=key_cols, how='inner').
    assign(heat_rate_mmbtu_mwh=lambda x: x.fuel_mmbtu / x.net_generation_mwh).
    merge(steam_ferc1[key_cols+['utility_id_pudl', 'utility_name_ferc1','plant_id_pudl', 'plant_id_ferc1']]))
    #query(f'report_year >= {start_year}')

In [109]:
# Narrow the merged FERC1 table to the columns compared against EIA below.
ferc_small = ferc1_plants[[
    'report_year',
    'utility_id_ferc1',
    'plant_name_ferc1',
    'primary_fuel_by_mmbtu',
    'plant_id_pudl',
    'capacity_mw',
    'net_generation_mwh',
    'opex_fuel',
    'fuel_cost']]

In [87]:
#ferc1_plants.columns.to_list()

In [110]:
# Example of difficult data: all 2016 FERC1 records that share plant_id_pudl 123.

ferc_small.loc[(ferc_small['plant_id_pudl']==123) & (ferc_small['report_year']==2016)]

Unnamed: 0,report_year,utility_id_ferc1,plant_name_ferc1,primary_fuel_by_mmbtu,plant_id_pudl,capacity_mw,net_generation_mwh,opex_fuel,fuel_cost
8488,2016,89,columbia 1,coal,123,112.6,463964.0,12648598.0,12363170.0
8513,2016,89,columbia 2,coal,123,112.4,624504.0,16205510.0,15919290.0
8538,2016,89,columbia total,coal,123,225.0,1088468.0,28854108.0,28282270.0
18056,2016,194,columbia 1 (all),coal,123,556.0,2221726.967,59148521.0,59149470.0
18059,2016,194,columbia 1 (wpl),coal,123,256.9,1069401.614,28674248.0,28672640.0
18062,2016,194,columbia 2 (all),coal,123,556.0,2755172.549,70857110.0,70854110.0
18065,2016,194,columbia 2 (wpl),coal,123,256.9,1264637.996,32529438.0,32528110.0
18750,2016,195,columbia 1 & 2,coal,123,335.2,1577770.0,42492965.0,42496010.0


In [None]:
#ferc1_steam_count = ferc1_steam.groupby(
#    ['plant_id_pudl','report_year']).size().reset_index(name='count')

#ferc = ferc_small.groupby(['plant_id_pudl','primary_fuel_by_mmbtu','report_year']).size().reset_index(name='count')
#ferc.sort_values('count',ascending=False)
#123

In [62]:
# Outer-join the FERC Form 1 fuel and steam tables loaded in Setup on plant and year.
# (The previous names `ferc_fuel` / `ferc_steam` are not defined in this notebook; the
# Setup cell assigns these tables to `ferc1_fuel` / `ferc1_steam`.)
ferc1_merge = pd.merge(ferc1_fuel, ferc1_steam, on=['plant_id_pudl','report_year'], how='outer')

In [105]:
# Generator-level EIA columns used for comparison with the FERC1 records above.
eia_subset = mcoe[[
    'plant_id_pudl',
    'unit_id_pudl',
    'generator_id',
    'fuel_type_code_pudl',
    'report_date',
    'total_mmbtu',
    'capacity_mw',
    'net_generation_mwh',
    'heat_rate_mmbtu_mwh',
]].drop_duplicates()

#eia_by_plant = eia_subset.groupby(['plant_id_pudl','report_year'])
# Add report_year (the year of report_date) so rows can be filtered like the FERC1 data.
eia_subset = eia_subset.assign(report_year=lambda x: x.report_date.dt.year)

In [108]:
# EIA generator records for plant_id_pudl 123 in 2015.
eia_subset.loc[(eia_subset['plant_id_pudl']==123)&(eia_subset['report_year']==2015)]

Unnamed: 0,plant_id_pudl,unit_id_pudl,generator_id,fuel_type_code_pudl,report_date,total_mmbtu,capacity_mw,net_generation_mwh,heat_rate_mmbtu_mwh,report_year
104278,123,1.0,1,coal,2015-01-01,27465940.0,556.0,2528128.0,10.864143,2015
104279,123,2.0,2,coal,2015-01-01,24833700.0,556.0,2331530.0,10.651244,2015
