In [1]:
import sys
import os
import numpy as np
import pandas as pd
import sqlalchemy as sa
sys.path.append(os.path.abspath(os.path.join('..','..','..')))
from pudl import pudl, ferc1, eia923, settings, constants, analysis
from pudl import models, models_ferc1, models_eia923
from pudl import clean_eia923, clean_ferc1, clean_pudl
import matplotlib.pyplot as plt
import matplotlib as mpl
pd.options.mode.chained_assignment = None
%matplotlib inline
plt.style.use('ggplot')
mpl.rcParams['figure.figsize'] = (10,6)
mpl.rcParams['figure.dpi'] = 150
pd.options.display.max_columns = 56
pd.options.display.max_rows = 100

In [2]:
pudl_engine  = pudl.connect_db()

### Pulling the required tables

In [3]:
# Bring in pudl IDs for all EIA plants
pudl_ids = g9 = analysis.simple_select('plants_eia', pudl_engine)

In [4]:
# Convert the fuel_receipts_costs_eia923 table into a dataframe
frc9 = analysis.simple_select('fuel_receipts_costs_eia923', pudl_engine)

In [5]:
frc9['fuel_cost'] = (frc9['fuel_quantity'] * frc9['average_heat_content'] * frc9['fuel_cost_per_mmbtu'])
frc9['mmbtu'] =  (frc9['fuel_quantity'] * frc9['average_heat_content'])

In [6]:
# Identify the primary fuel for each plant; later assign all fuel costs to that category, so secondary fuels aren't lost
frc9 = frc9.set_index(pd.DatetimeIndex(frc9['report_date']))
frc_df = analysis.primary_fuel_frc_eia923(frc9, fuel_thresh = .99)
frc_df_coal = frc_df[frc_df['primary_fuel']=='Coal']
len(frc_df_coal.plant_id.unique())

402

In [7]:
plants = frc_df_coal.plant_id.unique()

In [8]:
frc9 = frc9[frc9['plant_id'].isin(plants)]

In [9]:
# change energy source column to coal
frc9['energy_source'] = 'Coal'

In [10]:
# Get yearly fuel cost by plant_id, year and energy_source
frc9_summed = analysis.yearly_sum_eia(frc9 , 'fuel_cost', columns=['plant_id','report_date','energy_source']) #toggle
frc9_summed = frc9_summed.reset_index()
# frc9_summed = frc9_summed.rename(columns={'primary_fuel': 'energy_source'})

In [11]:
frc9_summed.head()

Unnamed: 0,plant_id,report_date,energy_source,fuel_cost
0,8,2009,Coal,219201700.0
1,8,2010,Coal,263307700.0
2,8,2011,Coal,213258300.0
3,8,2012,Coal,150118800.0
4,8,2013,Coal,124774900.0


In [12]:
frc9_mmbtu_summed = analysis.yearly_sum_eia(frc9 , 'mmbtu', columns=['plant_id','report_date','energy_source']) #toggle
# frc9_mmbtu_summed = frc9_mmbtu_summed.rename(columns={'primary_fuel': 'energy_source'})
frc9_mmbtu_summed = frc9_mmbtu_summed.reset_index()
frc9_mmbtu_summed.head(2)

Unnamed: 0,plant_id,report_date,energy_source,mmbtu
0,8,2009,Coal,56915370.0
1,8,2010,Coal,66582900.0


In [13]:
frc9_summed = frc9_mmbtu_summed.merge(frc9_summed)
frc9_summed['fuel_cost_per_mmbtu_average'] = (frc9_summed.fuel_cost/frc9_summed.mmbtu)
frc9_summed.head()

Unnamed: 0,plant_id,report_date,energy_source,mmbtu,fuel_cost,fuel_cost_per_mmbtu_average
0,8,2009,Coal,56915370.0,219201700.0,3.851362
1,8,2010,Coal,66582900.0,263307700.0,3.954585
2,8,2011,Coal,51515140.0,213258300.0,4.13972
3,8,2012,Coal,36594670.0,150118800.0,4.102205
4,8,2013,Coal,30617010.0,124774900.0,4.075345


In [14]:
# Convert the generation_eia923 table into a dataframe
g9 = analysis.simple_select('generation_eia923', pudl_engine)
#subset the g9 data frame to only coal plants
g9 = g9[g9['plant_id'].isin(plants)]
# Get yearly net generation by plant_id, year and generator_id
g9_summed = analysis.yearly_sum_eia(g9, 'net_generation_mwh')
g9_summed.reset_index(inplace=True)

In [15]:
# Convert the generators_eia860 table into a dataframe
g8 = analysis.simple_select('generators_eia860', pudl_engine)

In [16]:
# In order to use the energy source associated with each generator
g8_es = g8[['plant_id','generator_id', 'energy_source_1']]
g8_es = g8_es.rename(columns={'energy_source_1': 'energy_source'})
g8_es.drop_duplicates(['plant_id','generator_id'], inplace=True)
g8_es['energy_source'] = 'Coal'

#subset the g8 data frame to only coal plants
g8 = g8[g8['plant_id'].isin(plants)]
g8_es = g8_es[g8_es['plant_id'].isin(plants)]

In [17]:
# In order to use the capacity of each generator
g8 = g8[['plant_id','plant_name', 'operator_name', 'state', 'generator_id','nameplate_capacity_mw','summer_capacity_mw','winter_capacity_mw','year']]
g8.rename(columns={'year': 'report_date'},inplace=True)

# # Cheating to duplicate 2015 EIA860 capacity factor data for 2016:
g8_2016 = g8.loc[g8['report_date'] == 2015]
g8_2016.report_date.replace([2015], [2016], inplace=True)
g8 = g8_2016.append(g8)

In [18]:
# see which non-fuel coals exist for these coal plants
non_coal_fuels = g8_es.loc[(g8_es['energy_source']!='BIT') & (g8_es['energy_source']!='SUB') & (g8_es['energy_source']!='LIG')]

In [19]:
# Look at the variety of non-coal fuels used in coal plants
non_coal_fuels.energy_source.unique()

array(['Coal'], dtype=object)

In [20]:
# re-assign all energy source values to coal
g8['energy_source'] = 'Coal'

In [21]:
# Convert the boiler_fuel_eia923 table into a dataframe
bf9 = analysis.simple_select('boiler_fuel_eia923', pudl_engine)
bf9['fuel_consumed_mmbtu'] = bf9['fuel_qty_consumed']*bf9['fuel_mmbtu_per_unit']

In [22]:
#subset the bf9 data frame to only coal plants
bf9 = bf9[bf9['plant_id'].isin(plants)]

# re-assign all energy source values to coal
bf9['energy_source'] = 'Coal'

In [23]:
# Get yearly fuel consumed by plant_id, year and boiler_id
bf9_summed = analysis.yearly_sum_eia(bf9 , 'fuel_consumed_mmbtu', columns=['plant_id','report_date','boiler_id'])
bf9_summed.reset_index(inplace=True)
# Get yearly fuel consumed by plant_id, year and boiler_id
bf9_plant_summed = analysis.yearly_sum_eia(bf9 , 'fuel_consumed_mmbtu', columns=['plant_id','report_date'])
bf9_plant_summed.reset_index(inplace=True)

In [24]:
# Convert the boiler_generator_assn_eia860 table into a dataframe
bga8 = analysis.simple_select('boiler_generator_assn_eia860', pudl_engine)
bga8.drop(['id','operator_id'],axis=1, inplace=True)
bga8.drop_duplicates(['plant_id','boiler_id','generator_id'],inplace=True)

In [25]:
# get the primary fuel for each plant (to be used to combine plants that have unassociated generators)
gf9 = analysis.simple_select('generation_fuel_eia923', pudl_engine)
# primary_fuel = analysis.primary_fuel_gf_eia923(gf9).rename(columns={'year':'report_date'})

In [26]:
#subset the bf9 data frame to only coal plants
gf9 = gf9[gf9['plant_id'].isin(plants)]

## Generating Capacity Factor

In [27]:
capacity_factor = analysis.capacity_factor(g9_summed,g8)

In [28]:
capacity_factor.head(2)

Unnamed: 0,plant_id,report_date,generator_id,net_generation_mwh,plant_name,operator_name,state,nameplate_capacity_mw,summer_capacity_mw,winter_capacity_mw,energy_source,capacity_factor
0,8,2011,10,3348756.0,Gorgas,Alabama Power Co,AL,788.8,703.0,703.0,Coal,0.484632
1,8,2011,6,240813.0,Gorgas,Alabama Power Co,AL,125.0,103.0,103.0,Coal,0.219921


## Generating Heat Rate

In [29]:
# heat_rate = analysis.heat_rate(bga8, g9_summed, bf9_summed, bf9_plant_summed, pudl_engine)

In [30]:
# Calculate heat rate here (instead of with function) to retain 
# some of the underlying fields used for calculations
gens_with_bga = analysis.gens_with_bga(bga8, g9_summed)

In [31]:
# heat_rate(bga8, g9_summed, bf9_summed, bf9_plant_summed, pudl_engine):
#     """
#     Generate hate rates for all EIA generators.
#     """
# This section pulls the unassociated generators
gens = gens_with_bga
# Get a list of generators from plants with unassociated plants
# gens_unassn_plants = gens[gens['plant_assn'] == False
gens_unassn_plants = gens[gens['complete_assn'] == False]

# Sum the yearly net generation for these plants
gup_gb = gens_unassn_plants.groupby(by=['plant_id', 'report_date'])
gens_unassn_plants_summed = gup_gb.agg({'net_generation_mwh': np.sum})
gens_unassn_plants_summed.reset_index(inplace=True)

# Pull in mmbtu
unassn_plants = gens_unassn_plants_summed.merge(
    bf9_plant_summed, on=['plant_id', 'report_date'])
# calculate heat rate by plant
unassn_plants['heat_rate_mmbtu_mwh'] = \
    unassn_plants['fuel_consumed_mmbtu'] / \
    unassn_plants['net_generation_mwh']

# Merge these plant level heat heat rates with the unassociated generators
# Assign heat rates to generators across the plants with unassociated
# generators
heat_rate_unassn = gens_unassn_plants.merge(unassn_plants[[
                                            'plant_id',
                                            'report_date',
                                            'heat_rate_mmbtu_mwh']],
                                            on=['plant_id',
                                                'report_date'],
                                            how='left')
heat_rate_unassn.drop(
    ['boiler_id', 'boiler_generator_assn'], axis=1, inplace=True)

# This section generates heat rate from the generators of
# the plants that have any generators that are included in
# the boiler generator association table (860)
generation_w_boilers = g9_summed.merge(
    bga8, how='left', on=['plant_id', 'generator_id'])

# get net generation per boiler
gb1 = generation_w_boilers.groupby(
    by=['plant_id', 'report_date', 'boiler_id'])
generation_w_boilers_summed = gb1.agg({'net_generation_mwh': np.sum})
generation_w_boilers_summed.reset_index(inplace=True)
generation_w_boilers_summed.rename(
    columns={'net_generation_mwh': 'net_generation_mwh_boiler'},
    inplace=True)

# get the generation per boiler/generator combo
gb2 = generation_w_boilers.groupby(
    by=['plant_id', 'report_date', 'boiler_id', 'generator_id'])
generation_w_bg_summed = gb2.agg({'net_generation_mwh': np.sum})
generation_w_bg_summed.reset_index(inplace=True)
generation_w_bg_summed.rename(
    columns={'net_generation_mwh': 'net_generation_mwh_boiler_gen'},
    inplace=True)

# squish them together
generation_w_boilers_summed = \
    generation_w_boilers_summed.merge(generation_w_bg_summed,
                                      how='left',
                                      on=['plant_id',
                                          'report_date',
                                          'boiler_id'])

bg = bf9_summed.merge(bga8, how='left', on=['plant_id', 'boiler_id'])
bg = bg.merge(generation_w_boilers_summed, how='left', on=[
              'plant_id', 'report_date', 'boiler_id', 'generator_id'])

# Use the proportion of the generation of each generator to allot mmBTU
bg['proportion_of_gen_by_boil_gen'] = \
    bg['net_generation_mwh_boiler_gen'] / bg['net_generation_mwh_boiler']
bg['fuel_consumed_mmbtu_per_gen'] = \
    bg['proportion_of_gen_by_boil_gen'] * bg['fuel_consumed_mmbtu']

# Get yearly fuel_consumed_mmbtu by plant_id, year and generator_id
bg_gb = bg.groupby(by=['plant_id',
                       'report_date',
                       'generator_id', 
                       'net_generation_mwh_boiler_gen',
                       'net_generation_mwh_boiler',
                       'proportion_of_gen_by_boil_gen'], as_index=False)
bg_summed = bg_gb.agg({'fuel_consumed_mmbtu_per_gen': np.sum})
bg_summed.reset_index(inplace=True)

# Calculate heat rate
heat_rate = bg_summed.merge(g9_summed, how='left', on=[
                            'plant_id', 'report_date', 'generator_id'])
heat_rate['heat_rate_mmbtu_mwh'] = \
    heat_rate['fuel_consumed_mmbtu_per_gen'] / \
    heat_rate['net_generation_mwh_boiler_gen']

# Importing the plant association tag to filter out the
# generators that are a part of plants that aren't in the bga table
heat_rate = heat_rate.merge(gens[['plant_id',
                                  'report_date',
                                  'generator_id',
                                  'complete_assn',
                                  'plant_assn']],
                            on=['plant_id',
                                'report_date',
                                'generator_id'])
heat_rate_assn = heat_rate[heat_rate['complete_assn'] == True]

# Append heat rates for associated and unassociated
heat_rate_all = heat_rate_assn.append(heat_rate_unassn)
heat_rate_all.sort_values(
    by=['plant_id', 'report_date', 'generator_id'], inplace=True)

In [32]:
#CALCULATIONS FROM INSIDE OF FUEL_COST FUNCTION:

net_gen = g9_summed.merge(g8_es, how='left', on=[
                              'plant_id', 'generator_id'])

# Merge this net_gen table with frc9_summed to have
# fuel_cost_per_mmbtu_total associated with generators

fuel_cost_per_mmbtu = net_gen.merge(frc9_summed,
                                    how='outer',
                                    on=['plant_id',
                                        'report_date',
                                        'energy_source'])

## Generating Fuel Cost

In [33]:
fuel_cost = analysis.fuel_cost(g9_summed,g8_es,frc9_summed,heat_rate)

In [34]:
fuel_cost.head(2)

Unnamed: 0,plant_id,report_date,generator_id,net_generation_mwh,energy_source,mmbtu,fuel_cost,fuel_cost_per_mmbtu_average,heat_rate_mmbtu_mwh,fuel_cost_per_mwh
0,8,2009,10,3933248.0,Coal,56915370.0,219201700.0,3.851362,9.814333,37.798547
1,8,2009,6,98792.0,Coal,56915370.0,219201700.0,3.851362,12.920373,49.761033


### Merge fields together

In [35]:
mcoe_by_generator = fuel_cost[['plant_id',
                               'report_date',
                               'generator_id',
                               'energy_source',                               'mmbtu',
                               'fuel_cost',
                               'fuel_cost_per_mmbtu_average',
                               'fuel_cost_per_mwh']].merge(heat_rate,
                                                           on=['plant_id','report_date','generator_id'])

In [36]:
mcoe_by_generator = mcoe_by_generator.merge(capacity_factor.drop('net_generation_mwh', axis=1),
                                            on=['plant_id','report_date','generator_id'])

In [37]:
mcoe_by_generator = mcoe_by_generator.merge(bf9_plant_summed, on=['plant_id','report_date'])

In [38]:
mcoe_by_generator.head(2)

Unnamed: 0,plant_id,report_date,generator_id,energy_source_x,mmbtu,fuel_cost,fuel_cost_per_mmbtu_average,fuel_cost_per_mwh,index,net_generation_mwh_boiler_gen,net_generation_mwh_boiler,proportion_of_gen_by_boil_gen,fuel_consumed_mmbtu_per_gen,net_generation_mwh,heat_rate_mmbtu_mwh,complete_assn,plant_assn,plant_name,operator_name,state,nameplate_capacity_mw,summer_capacity_mw,winter_capacity_mw,energy_source_y,capacity_factor,fuel_consumed_mmbtu
0,8,2011,10,Coal,51515140.0,213258300.0,4.13972,40.567746,10,3348756.0,3348756.0,1.0,32816580.0,3348756.0,9.799634,True,True,Gorgas,Alabama Power Co,AL,788.8,703.0,703.0,Coal,0.484632,50694000.0
1,8,2011,6,Coal,51515140.0,213258300.0,4.13972,51.88191,11,240813.0,240813.0,1.0,3018039.0,240813.0,12.532709,True,True,Gorgas,Alabama Power Co,AL,125.0,103.0,103.0,Coal,0.219921,50694000.0


### Filter to include only plants >= 10 MW

In [39]:
# Now, let's grab only those plants larger than 10MW
mcoe_by_generator = mcoe_by_generator[mcoe_by_generator['nameplate_capacity_mw']>=10]

### Add PUDL plant IDs

In [40]:
mcoe_by_generator = mcoe_by_generator.merge(pudl_ids[['plant_id','plant_id_pudl']], on=['plant_id'])

In [41]:
mcoe_by_generator.rename(columns={'plant_id': 'plant_id_eia'}, inplace=True)

## Aggregate at fuel cost and net gen at plant level for comparison with FERC

In [42]:
eia_fuel = analysis.yearly_sum_eia(frc9 , 'fuel_cost', columns=['plant_id','report_date'])
fuel_summed = eia_fuel.reset_index()
fuel_summed.rename(columns={'fuel_cost': 'fuel_cost_annual_eia'}, inplace=True)
fuel_summed = fuel_summed.reset_index()

In [43]:
eia_netgen = mcoe_by_generator.groupby(by=['plant_id_eia','report_date'])
netgen_summed = eia_netgen.agg({'net_generation_mwh':np.sum})
netgen_summed.rename(columns={'net_generation_mwh': 'net_generation_mwh_plant_eia'}, inplace=True)
netgen_summed = netgen_summed.reset_index()

In [44]:
mcoe_by_generator2 = mcoe_by_generator.merge(fuel_summed, how="left", left_on=['plant_id_eia', 'report_date'], right_on=['plant_id', 'report_date'])

In [45]:
mcoe_by_generator3 = mcoe_by_generator2.merge(netgen_summed, how="left", left_on=['plant_id_eia', 'report_date'], right_on=['plant_id_eia', 'report_date'])

In [46]:
mcoe_by_generator3.rename(columns={'energy_source_x': 'energy_source'},
    inplace=True)

#mcoe_by_generator3.drop('energy_source_y', axis=1, inplace=True)

In [47]:
mcoe_by_generator = mcoe_by_generator3

In [48]:
mcoe_by_generator3.head()

Unnamed: 0,plant_id_eia,report_date,generator_id,energy_source,mmbtu,fuel_cost,fuel_cost_per_mmbtu_average,fuel_cost_per_mwh,index_x,net_generation_mwh_boiler_gen,net_generation_mwh_boiler,proportion_of_gen_by_boil_gen,fuel_consumed_mmbtu_per_gen,net_generation_mwh,heat_rate_mmbtu_mwh,complete_assn,plant_assn,plant_name,operator_name,state,nameplate_capacity_mw,summer_capacity_mw,winter_capacity_mw,energy_source_y,capacity_factor,fuel_consumed_mmbtu,plant_id_pudl,index_y,plant_id,fuel_cost_annual_eia,net_generation_mwh_plant_eia
0,8,2011,10,Coal,51515140.0,213258300.0,4.13972,40.567746,10,3348756.0,3348756.0,1.0,32816580.0,3348756.0,9.799634,True,True,Gorgas,Alabama Power Co,AL,788.8,703.0,703.0,Coal,0.484632,50694000.0,227,2.0,8.0,213258300.0,4936430.0
1,8,2011,6,Coal,51515140.0,213258300.0,4.13972,51.88191,11,240813.0,240813.0,1.0,3018039.0,240813.0,12.532709,True,True,Gorgas,Alabama Power Co,AL,125.0,103.0,103.0,Coal,0.219921,50694000.0,227,2.0,8.0,213258300.0,4936430.0
2,8,2011,7,Coal,51515140.0,213258300.0,4.13972,47.555815,12,273543.0,273543.0,1.0,3142376.0,273543.0,11.487687,True,True,Gorgas,Alabama Power Co,AL,125.0,104.0,104.0,Coal,0.249811,50694000.0,227,2.0,8.0,213258300.0,4936430.0
3,8,2011,8,Coal,51515140.0,213258300.0,4.13972,45.240677,13,516932.0,516932.0,1.0,5649259.0,516932.0,10.928438,True,True,Gorgas,Alabama Power Co,AL,187.5,161.0,161.0,Coal,0.314723,50694000.0,227,2.0,8.0,213258300.0,4936430.0
4,8,2011,9,Coal,51515140.0,213258300.0,4.13972,45.146258,14,556386.0,556386.0,1.0,6067740.0,556386.0,10.90563,True,True,Gorgas,Alabama Power Co,AL,190.4,170.0,170.0,Coal,0.333584,50694000.0,227,2.0,8.0,213258300.0,4936430.0


## Export the data frame

In [49]:
# rearrange columns
mcoe_by_generator = mcoe_by_generator3[[
    'plant_id_eia', 
    'plant_id_pudl',
    'plant_name',
    'operator_name', 
    'state', 
    'report_date', 
    'generator_id', 
    'energy_source',
    'mmbtu',
    'fuel_cost',
    'fuel_cost_per_mmbtu_average',
    'fuel_consumed_mmbtu',
    'net_generation_mwh_boiler_gen',
    'net_generation_mwh_boiler',
    'proportion_of_gen_by_boil_gen',
    'fuel_consumed_mmbtu_per_gen',
    'fuel_cost_annual_eia',
    'heat_rate_mmbtu_mwh', 
    'plant_assn', 
    'net_generation_mwh', 
    'net_generation_mwh_plant_eia',
    'fuel_cost_per_mwh',
    'nameplate_capacity_mw', 
    'summer_capacity_mw',
    'winter_capacity_mw', 
    'capacity_factor']]

# Pull in FERC data & identify plants with simple

In [50]:
simple_ferc = analysis.simple_ferc1_plant_ids(pudl_engine)

In [51]:
# convert simple ferc list into a pandas series for import to analysis.ferc_expenses() function
simple_ferc = simple_ferc.iloc[:,0]

In [52]:
pd.options.display.max_rows = 999
type(simple_ferc)

pandas.core.series.Series

In [53]:
fp = analysis.get_steam_ferc1_df(pudl_engine)

In [54]:
# ferc_expenses returns:
# ferc1_expns_corr: A dictionary of expense categories
#             and their correlations to the plant's net electricity generation.
# steam_df: a dataframe with all the operating expenses broken out for each simple FERC PUDL plant.

In [55]:
ferc_expenses = analysis.ferc_expenses(pudl_engine, simple_ferc)
ferc_expenses_df = ferc_expenses[1]

In [56]:
ferc_expenses_df.head()

Unnamed: 0,report_year,respondent_id,util_id_pudl,respondent_name,plant_id_pudl,plant_name,total_capacity_mw,year_constructed,year_installed,peak_demand_mw,water_limited_mw,not_water_limited_mw,plant_hours,net_generation_mwh,expns_operations,expns_fuel,expns_coolants,expns_steam,expns_steam_other,expns_transfer,expns_electric,expns_misc_power,expns_rents,expns_allowances,expns_engineering,expns_structures,expns_boiler,expns_plants,expns_misc_steam,expns_production_total,expns_per_mwh,expns_total_nonfuel_production,expns_total_nonproduction
4,2004,2,18,ALABAMA POWER COMPANY,227,Gorgas,1416.7,1929.0,1972.0,1233.0,0.0,1227.0,8784.0,7902681.0,4983970.0,138135187.0,0.0,2761582.0,0.0,0.0,1652067.0,9085207.0,0.0,4642951.0,2487427.0,3081447.0,16596930.0,3410896.0,588617.0,187426281.0,23.7,37423489.0,11867605.0
5,2004,2,18,ALABAMA POWER COMPANY,204,Gadsden,138.0,1949.0,1949.0,744.0,0.0,130.0,8692.0,506590.0,729463.0,13491128.0,0.0,885481.0,0.0,0.0,555542.0,1411726.0,9370.0,516904.0,367710.0,426909.0,1811727.0,400661.0,145879.0,20752500.0,41.0,5304214.0,1957158.0
8,2004,2,18,ALABAMA POWER COMPANY,293,Joseph M. Farley,1776.5,1977.0,1981.0,1488.0,0.0,1669.0,7789.0,13147985.0,51974819.0,60809385.0,680728.0,13526928.0,0.0,0.0,938119.0,22897542.0,871581.0,0.0,6927010.0,2502635.0,33326524.0,5633532.0,5408993.0,205497796.0,15.6,84814171.0,59874240.0
16,2004,2,18,ALABAMA POWER COMPANY,617,Washington County,122.58,1999.0,1999.0,122.0,0.0,123.0,744.0,713978.0,299747.0,49371051.0,0.0,663982.0,0.0,0.0,0.0,860112.0,0.0,0.0,262236.0,61030.0,0.0,2750855.0,62190.0,54331203.0,76.1,4598215.0,361937.0
18,2004,2,18,ALABAMA POWER COMPANY,580,Theodore,273.87,2000.0,2000.0,232.0,0.0,274.0,0.0,1290634.0,524021.0,69968406.0,0.0,730404.0,0.0,0.0,0.0,340009.0,0.0,0.0,168965.0,312938.0,0.0,1012093.0,48431.0,73105267.0,56.6,2564409.0,572452.0


In [57]:
# create a correlation table to export
corr = pd.DataFrame.from_dict(ferc_expenses[0], orient='index')
corr.index.names = ['expense_variable']
corr = corr.reset_index()
corr = corr.rename(columns={0: 'correlation_with_net_generation_mwh_plant'})

In [58]:
# create a ferc_expenses data frame to merge with EIA data
ferc_expenses_df = ferc_expenses[1]

In [59]:
ferc_expenses_df.rename(columns={'net_generation_mwh': 'net_generation_mwh_plant',
                                 'plant_name': 'plant_name_ferc',
                                 'respondent_id': 'respondent_id_ferc',
                                 'respondent_name': 'respondent_name_ferc',
                                 'total_capacity_mw': 'total_capacity_mw_plant'}, inplace=True)

In [60]:
mcoe_by_generator.rename(columns={'net_generation_mwh': 'net_generation_mwh_generator',
                                  'plant_name': 'plant_name_eia',
                                  'operator_name': 'operator_name_eia',
                                  'nameplate_capacity_mw': 'nameplate_capacity_mw_generator',
                                  'summer_capacity_mw': 'summer_capacity_mw_generator',
                                  'winter_capacity_mw': 'winter_capacity_mw_generator'}, inplace=True)

In [61]:
# Left merge keeps all rows of mcoe_by_generator and duplicates values of ferc_expenses_df 
# for each combination of plant_id_pudl and report_date
merged = mcoe_by_generator.merge(ferc_expenses_df, how="left", left_on=['plant_id_pudl', 'report_date'], right_on=['plant_id_pudl', 'report_year'])

In [62]:
# remove the non-simple plants by dropping rows where plant_name_ferc = NA
merged = merged.dropna(subset=['plant_name_ferc'])

In [63]:
# rearrange columns
merged = merged[[
    'plant_id_pudl', 'plant_id_eia', 'plant_name_eia',
    'operator_name_eia', 'state', 'report_date', 
    'generator_id', 
    'energy_source', 'mmbtu',
    'fuel_cost',
    'fuel_cost_per_mmbtu_average',
    'fuel_consumed_mmbtu', 
    'net_generation_mwh_boiler_gen',
    'net_generation_mwh_boiler',
    'proportion_of_gen_by_boil_gen',
    'fuel_consumed_mmbtu_per_gen', 'fuel_cost_annual_eia',
    'heat_rate_mmbtu_mwh', 'plant_assn', 
    'net_generation_mwh_generator', 'net_generation_mwh_plant_eia',
    'fuel_cost_per_mwh', 'nameplate_capacity_mw_generator', 
    'summer_capacity_mw_generator', 'winter_capacity_mw_generator', 'capacity_factor',
    'report_year', 'respondent_id_ferc', 'util_id_pudl', 
    'respondent_name_ferc', 'plant_name_ferc',
    'total_capacity_mw_plant', 'year_constructed', 'year_installed',
    'peak_demand_mw', 'water_limited_mw', 'not_water_limited_mw',
    'plant_hours', 'net_generation_mwh_plant', 'expns_operations',
    'expns_fuel', 'expns_coolants', 'expns_steam', 'expns_steam_other',
    'expns_transfer', 'expns_electric', 'expns_misc_power', 'expns_rents',
    'expns_allowances', 'expns_engineering', 'expns_structures',
    'expns_boiler', 'expns_plants', 'expns_misc_steam',
    'expns_production_total', 'expns_per_mwh',
    'expns_total_nonfuel_production', 'expns_total_nonproduction']]

In [72]:
# number of plants in this data set
len(merged.plant_id_pudl.unique())

82

In [66]:
mcoe_annotations = pd.read_csv('mcoe_field_annotations_detailed.csv')
mcoe_notes = pd.read_csv('mcoe_notes.csv')

In [67]:
# to export all the data
xlsx_writer = pd.ExcelWriter('coal_plants_detailed_MCOE_by_generator_2011-2016.xlsx')
merged.to_excel(xlsx_writer, sheet_name='MCOE by Generator', index=False, na_rep='NA')
corr.to_excel(xlsx_writer, sheet_name='Expense Correlations', index=False, na_rep='NA')
mcoe_annotations.to_excel(xlsx_writer, sheet_name='MCOE Annotations', index=False)
mcoe_notes.to_excel(xlsx_writer, sheet_name='MCOE Notes', index=False)

xlsx_writer.save()

In [70]:
# to export all the data
# xlsx_writer = pd.ExcelWriter('test_primary_fuel_cost_attribution.xlsx')
# xlsx_writer = pd.ExcelWriter('test_plant_category_cost_attribution.xlsx')
# xlsx_writer = pd.ExcelWriter('coal_plants_NSPC&MPI_detailed_MCOE_by_generator_2011-2016.xlsx')

# test.to_excel(xlsx_writer, sheet_name='MCOE by Generator', index=False, na_rep='NA')
# corr.to_excel(xlsx_writer, sheet_name='Expense Correlations', index=False, na_rep='NA')
# mcoe_annotations.to_excel(xlsx_writer, sheet_name='MCOE Annotations', index=False)
# mcoe_notes.to_excel(xlsx_writer, sheet_name='MCOE Notes', index=False)

# xlsx_writer.save()

In [None]:
merged.head(3)

In [69]:
merged2 = merged.dropna(subset=['net_generation_mwh_plant_eia', 'net_generation_mwh_plant'])

In [None]:
fuels = ['gas','oil','coal']
fig, (ax) = plt.subplots(ncols=1, nrows=1)
fig.set_figwidth(10)
fig.set_figheight(10)
fig.set_dpi(150)
ax.loglog()
plt.grid(b=True)
R2 = np.corrcoef(merged2.net_generation_mwh_plant_eia, merged2.net_generation_mwh_plant)[0,1]**2
ax.scatter(merged2.net_generation_mwh_plant_eia, merged2.net_generation_mwh_plant, s=10, color='blue')
ax.legend(loc='upper left')
plt.title("Annual net generation by plant, EIA 923 vs. FERC Form 1 ($r^2$={:.2f})".format(R2))
plt.xlabel("Net generation by PUDL plant from EIA923 (MWh)")
plt.ylabel("Net generation by PUDL plant from FERC1 (MWh)")
plt.show();

In [None]:
merged3 = merged.dropna(subset=['fuel_cost_annual_eia', 'expns_fuel'])

In [None]:
fuels = ['gas','oil','coal']
fig, (ax) = plt.subplots(ncols=1, nrows=1)
fig.set_figwidth(10)
fig.set_figheight(10)
fig.set_dpi(150)
ax.loglog()
plt.grid(b=True)
R2 = np.corrcoef(merged3.fuel_cost_annual_eia, merged3.expns_fuel)[0,1]**2
ax.scatter(merged3.fuel_cost_annual_eia, merged3.expns_fuel, s=10, color='green')
ax.legend(loc='upper left')
plt.title("Fuel cost per year, EIA 923 vs. FERC Form 1, ($r^2$={:.2f})".format(R2))
plt.xlabel("Fuel cost per year, by PUDL plant from EIA923 ($)")
plt.ylabel("Fuel cost per year, by PUDL plant from FERC1 ($)")
plt.show();

In [None]:
plt.hist(merged.net_generation_mwh_plant_eia, range=(0,500000), bins=100, alpha=0.5, color='green', label="EIA")
plt.hist(merged.net_generation_mwh_plant, range=(0,500000), bins=100, alpha=0.5, color='blue', label='FERC')
plt.xlabel("Total net generation per plant (MWh)")
plt.ylabel("Number of records")
plt.title("Net generation (MWh)")
plt.legend()

In [None]:
#quick look at spread in fuel_cost_per_mwh for these plants
plt.hist(merged.fuel_cost_per_mwh, range=(0,200), bins=200, alpha=0.5, color='green')
ferc_fuel_cost_per_mwh = merged.expns_fuel / merged.net_generation_mwh_plant
plt.hist(ferc_fuel_cost_per_mwh, range=(0,200), bins=200, alpha=0.5, color='blue')
plt.xlabel("Fuel Cost ($/mwh)")
plt.ylabel("Number of records")
plt.title("Cost of fuel")


## A few plots

In [None]:
# Take a look at heat rates from EIA
plt.hist(heat_rate.heat_rate_mmbtu_mwh, range=[0,50],bins=100,weights=heat_rate.net_generation_mwh, alpha=0.5)
#heat_rate['heat_rate_mmbtu_mwh'].plot(kind='hist', bins=100, range=[-20,70])
plt.xlabel('Generator heat rate (MMBtu/MWh)')
plt.ylabel("Generator records (weighted by net generation)")
plt.title("Heat rates")
plt.show()

In [None]:
#quick look at spread in fuel_cost_per_mmbtu
plt.hist(mcoe_by_generator.fuel_cost_per_mmbtu_average, range=(-5,50), bins=200, alpha=0.5, color='green')
plt.xlabel("Fuel Cost ($/mmBTU)")
plt.ylabel("Number of generator records")
plt.title("Cost of fuel")
plt.legend()

In [None]:
#quick look at capacity factors
plt.hist(mcoe_by_generator.capacity_factor, range=(0,1.5), bins=200, alpha=0.5, color='blue')
plt.xlabel("Capacity factor (Net generation/Nameplate capacity)")
plt.ylabel("Number of generator records")
plt.title("Capacity factors")
plt.legend()

In [None]:
#quick look at spread in fuel_cost_per_mwh for these plants
plt.hist(merged.fuel_cost_per_mwh, range=(0,200), bins=200, alpha=0.5, color='green')
plt.xlabel("Fuel Cost ($/mwh)")
plt.ylabel("Number of generator records")
plt.title("Cost of fuel")
plt.legend()