In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import sys
import os
import numpy as np
import pandas as pd
import sqlalchemy as sa
sys.path.append(os.path.abspath(os.path.join('..','..','..')))
from pudl import pudl, ferc1, eia923, settings, constants, analysis
from pudl import models, models_ferc1, models_eia923, outputs
from pudl import clean_eia923, clean_ferc1, clean_pudl, mcoe
import matplotlib.pyplot as plt
import matplotlib as mpl
pd.options.mode.chained_assignment = None
plt.style.use('ggplot')
mpl.rcParams['figure.figsize'] = (10,6)
mpl.rcParams['figure.dpi'] = 150
pd.options.display.max_columns = 56
pd.options.display.max_rows = 100

In [4]:
pudl_engine  = pudl.connect_db()

# Pulling the required tables

In [5]:
g9_summed = mcoe.generation_pull_eia923(pudl_engine)
g8, g8_es = mcoe.generators_pull_eia860(pudl_engine)
frc9_summed, frc9_summed_plant = mcoe.fuel_reciepts_costs_pull_eia923(pudl_engine)
bga8 = mcoe.boiler_generator_pull_eia860(pudl_engine)
bf9_summed, bf9_plant_summed = mcoe.boiler_fuel_pull_eia923(pudl_engine)

### Calculate a generator's proportion of plant capacity

In [None]:
# capacity_eia = analysis.capacity_proportion_eia923(g8, id_col='plant_id_eia')
# capacity_pudl = analysis.capacity_proportion_eia923(g8, id_col='plant_id_pudl')

## Generating Capacity Factor

In [11]:
capacity_factor = analysis.capacity_factor(g9_summed,g8, id_col='plant_id_eia')

In [12]:
capacity_factor.head(2)

Unnamed: 0,report_year,plant_id_eia,plant_id_pudl,generator_id,net_generation_mwh,plant_name,operator_name,state,nameplate_capacity_mw,summer_capacity_mw,winter_capacity_mw,capacity_factor
0,2011,3,32,1,312130.0,Barry,Alabama Power Co,AL,153.1,138.0,138.0,0.232732
1,2011,3,32,2,191475.0,Barry,Alabama Power Co,AL,153.1,137.0,137.0,0.142769


## Generating Heat Rate

In [16]:
g9_summed.head(2)

Unnamed: 0,report_year,plant_id_eia,plant_id_pudl,generator_id,net_generation_mwh
0,2009,3,32,1,221908.0
1,2009,3,32,2,394031.0


In [22]:
heat_rate = mcoe.heat_rate(bga8, g9_summed, bf9_summed, bf9_plant_summed, pudl_engine, id_col='plant_id_eia')

In [23]:
heat_rate.head()

Unnamed: 0,complete_assn,fuel_consumed_mmbtu_per_gen,generator_id,heat_rate_mmbtu_mwh,net_generation_mwh,plant_assn,plant_id_eia,plant_id_pudl,report_year
0,True,2282135.0,1,10.284149,221908.0,True,3,32,2009
1,True,4047126.0,2,10.271086,394031.0,True,3,32,2009
2,True,13065990.0,3,10.157073,1286393.0,True,3,32,2009
3,True,16160730.0,4,9.935606,1626547.0,True,3,32,2009
4,True,44709100.0,5,9.906513,4513101.0,True,3,32,2009


## Calculating proportions of generation & capacity

In [None]:
# proportion_gen_eia = analysis.generator_proportion_eia923(g9, id_col='plant_id_eia')
# proportion_gen_pudl = analysis.generator_proportion_eia923(g9, id_col='plant_id_pudl')

In [None]:
# convert Y-M-D to year
# proportion_gen_eia.report_date = proportion_gen_eia.report_date.dt.year
# proportion_gen_pudl.report_date = proportion_gen_pudl.report_date.dt.year

In [None]:
# testplant = proportion[proportion['plant_id_pudl']==217]
# testplant = proportion[proportion['plant_id_eia']==7343]
# testplant = proportion[proportion['plant_id_eia']==1091]

In [None]:
# g9[g9["plant_id_eia"]==1091] #this is plant_id_pudl 217
# g9[g9["plant_id_eia"]==7343] #this is also plant_id_pudl 217

## Generating Fuel Cost

In [26]:
fuel_cost = mcoe.fuel_cost(g8_es, g9_summed, frc9_summed, frc9_summed_plant, heat_rate)

In [30]:
fuel_cost.sample(10)

Unnamed: 0,energy_source,energy_source_cons,energy_source_count,fuel_cost_es,fuel_cost_per_mmbtu_average,fuel_cost_per_mwh,fuel_cost_plant,generator_id,heat_rate_mmbtu_mwh,mmbtu_es,mmbtu_plant,nameplate_capacity_mw,net_generation_mwh,operator_name,plant_id_eia,plant_id_pudl,plant_name,report_year,state
13646,BLQ,biomass_solid,1.0,,,,,TG1,29.015609,,,64.0,420186.0,Weyerhaeuser Co,50184,3549,Weyerhaeuser Columbus MS,2016,MS
11948,MSW,solid_renewable,1.0,,,,,WFI1,16.499929,,,53.3,321533.0,Wheelabrator Environmental Systems,54746,4248,Wheelabrator Falls,2015,PA
11035,SUB,coal,1.0,,2.509452,27.227871,176036200.0,1,10.850125,,70149250.0,900.0,2779145.0,Entergy Arkansas Inc,6641,280,Independence Steam Electric Station,2015,AR
5501,SUB,coal,2.0,38904360.0,2.061973,25.311509,,1,12.275381,18867540.0,,540.0,1672822.0,Grand River Dam Authority,165,1355,GREC,2016,OK
10587,SUB,coal,1.0,,,,,6,21.787314,,,6.4,6120.952,Hibbing Public Utilities Comm,1979,1938,Hibbing,2015,MN
1240,BIT,coal,1.0,,,,,TG22,36.243376,,36840060.0,15.4,84261.879,Eastman Chemical Co-TN Ops,50481,3695,Tennessee Eastman Operations,2009,TN
6601,BIT,coal,1.0,,,,,1,10.607907,,73956540.0,576.0,2546110.0,Allegheny Energy Supply Co LLC,3179,1145,Hatfields Ferry Power Station,2013,PA
337,RFO,oil,1.0,,,,,1,25.521792,,2483199.0,50.0,4195.0,FPL Energy Wyman LLC,1507,654,William F Wyman,2009,ME
3255,SUB,coal,2.0,52575740.0,2.838123,42.919818,,8,15.122607,18524830.0,,156.3,860941.0,Consumers Energy Co,1720,666,J C Weadock,2013,MI
1698,BIT,coal,1.0,,4.117757,49.214837,221908900.0,3,11.951855,,53890730.0,122.5,388721.0,Georgia Power Co,728,658,Yates,2010,GA


### Merge fields together

In [69]:
# Merge relevant fuel cost fields with heat rate
mcoe_by_generator = fuel_cost[['plant_id_eia',
                               'plant_id_pudl',
                               'report_year',
                               'generator_id',
                               'energy_source',
                               'energy_source_cons',
                               'mmbtu_es',
                               'mmbtu_plant',
                               'fuel_cost_es',
                               'fuel_cost_plant',
                               'fuel_cost_per_mmbtu_average',
                               'fuel_cost_per_mwh']].merge(heat_rate,
                                                           on=['plant_id_eia',
                                                               'plant_id_pudl',
                                                               'report_year',
                                                               'generator_id'])

# Merge in capacity facotr
mcoe_by_generator = mcoe_by_generator.drop('net_generation_mwh', 
                                           axis=1).merge(capacity_factor,
                                                         on=['plant_id_eia',
                                                             'plant_id_pudl',
                                                             'report_year',
                                                             'generator_id'])

# Merge in plant level fuel consumption from bf9_plant_summed, rename field
mcoe_by_generator = mcoe_by_generator.merge(bf9_plant_summed,
                                            on=['plant_id_eia',
                                                'plant_id_pudl',
                                                'report_year'])
mcoe_by_generator.rename(columns={'fuel_consumed_mmbtu':'fuel_consumed_mmbtu_plant'},
                         inplace=True)

### Filter to include only plants >= 10 MW

In [70]:
# Now, let's grab only those plants larger than 10MW
mcoe_by_generator = mcoe_by_generator[mcoe_by_generator['nameplate_capacity_mw']>=10]

## Aggregate at fuel cost and net gen at plant level for comparison with FERC

In [71]:
frc9 = analysis.simple_select('fuel_receipts_costs_eia923', pudl_engine)
frc9['fuel_cost'] = (frc9['fuel_quantity'] *
                     frc9['average_heat_content'] *
                     frc9['fuel_cost_per_mmbtu'])

In [72]:
fuel_summed = analysis.yearly_sum_eia(frc9, 'fuel_cost', columns=['plant_id_pudl',
                                           'report_year'])
fuel_summed = fuel_summed.reset_index()
fuel_summed.rename(columns={'fuel_cost': 'fuel_cost_annual_eia'}, inplace=True)
fuel_summed = fuel_summed.reset_index()

In [73]:
eia_netgen = mcoe_by_generator.groupby(by=['plant_id_eia', 'plant_id_pudl','report_year'])
netgen_summed = eia_netgen.agg({'net_generation_mwh':np.sum})
netgen_summed.rename(columns={'net_generation_mwh': 'net_generation_mwh_plant_eia'}, inplace=True)
netgen_summed = netgen_summed.reset_index()

In [74]:
mcoe_by_generator = mcoe_by_generator.merge(fuel_summed, how="left", left_on=['plant_id_pudl', 'report_year'], 
                                             right_on=['plant_id_pudl', 'report_year'])

In [75]:
mcoe_by_generator = mcoe_by_generator.merge(netgen_summed, how="left", on=['plant_id_eia', 'plant_id_pudl', 'report_year'])

In [76]:
mcoe_by_generator.head(2)

Unnamed: 0,plant_id_eia,plant_id_pudl,report_year,generator_id,energy_source,energy_source_cons,mmbtu_es,mmbtu_plant,fuel_cost_es,fuel_cost_plant,fuel_cost_per_mmbtu_average,fuel_cost_per_mwh,complete_assn,fuel_consumed_mmbtu_per_gen,heat_rate_mmbtu_mwh,plant_assn,net_generation_mwh,plant_name,operator_name,state,nameplate_capacity_mw,summer_capacity_mw,winter_capacity_mw,capacity_factor,fuel_consumed_mmbtu_plant,index,fuel_cost_annual_eia,net_generation_mwh_plant_eia
0,7,204,2011,1,BIT,coal,,1508074.416,,5985504.0,3.968971,63.964246,True,2976913.306,16.116077,True,184717.0,Gadsden,Alabama Power Co,AL,69.0,64.0,64.0,0.3056,3632852.861,960.0,5985504.0,230200.0
1,7,204,2011,2,BIT,coal,,1508074.416,,5985504.0,3.968971,57.239085,True,655939.555,14.421642,True,45483.0,Gadsden,Alabama Power Co,AL,69.0,66.0,66.0,0.075248,3632852.861,960.0,5985504.0,230200.0


In [64]:
mcoe_by_generator = mcoe_by_generator3

## Export the data frame

In [None]:
# rearrange columns
mcoe_by_generator = mcoe_by_generator3[[
    'plant_id_eia', 
    'plant_id_pudl',
    'plant_name',
    'operator_name', 
    'state', 
    'report_year', 
    'generator_id', 
    'energy_source',
    'mmbtu_es',
    'mmbtu_plant',
    'fuel_cost_es',
    'fuel_cost_plant',
    'fuel_cost_per_mmbtu_average',
    'fuel_consumed_mmbtu',
    'fuel_consumed_mmbtu_per_gen',
    'fuel_cost_annual_eia',
    'heat_rate_mmbtu_mwh', 
    'plant_assn', 
    'net_generation_mwh', 
    'net_generation_mwh_plant_eia',
    'fuel_cost_per_mwh',
    'nameplate_capacity_mw', 
    'capacity_factor']]

In [None]:
mcoe_by_generator.head(2)

# Pull in FERC data & identify plants with simple

In [None]:
simple_ferc = analysis.simple_ferc1_plant_ids(pudl_engine)

In [None]:
# convert simple ferc list into a pandas series for import to analysis.ferc_expenses() function
simple_ferc = simple_ferc.iloc[:,0]

In [None]:
pd.options.display.max_rows = 999

In [None]:
fp = outputs.plants_steam_ferc1(pudl_engine)
# fp = analysis.get_steam_ferc1_df(pudl_engine)

In [None]:
# ferc_expenses returns:
# ferc1_expns_corr: A dictionary of expense categories
#             and their correlations to the plant's net electricity generation.
# steam_df: a dataframe with all the operating expenses broken out for each simple FERC PUDL plant.

In [None]:
ferc_expenses = analysis.ferc_expenses(pudl_engine, simple_ferc)
ferc_expenses_df = ferc_expenses[1]

In [None]:
ferc_expenses_df.head()

In [None]:
# create a correlation table to export
corr = pd.DataFrame.from_dict(ferc_expenses[0], orient='index')
corr.index.names = ['expense_variable']
corr = corr.reset_index()
corr = corr.rename(columns={0: 'correlation_with_net_generation_mwh_plant'})

In [None]:
# create a ferc_expenses data frame to merge with EIA data
ferc_expenses_df = ferc_expenses[1]

In [None]:
ferc_expenses_df.rename(columns={'net_generation_mwh': 'net_generation_mwh_plant',
                                 'plant_name': 'plant_name_ferc',
                                 'respondent_id': 'respondent_id_ferc',
                                 'respondent_name': 'respondent_name_ferc',
                                 'total_capacity_mw': 'total_capacity_mw_plant'}, inplace=True)

In [None]:
mcoe_by_generator.rename(columns={'net_generation_mwh': 'net_generation_mwh_generator',
                                  'plant_name': 'plant_name_eia',
                                  'operator_name': 'operator_name_eia',
                                  'nameplate_capacity_mw': 'nameplate_capacity_mw_generator',
                                  'summer_capacity_mw': 'summer_capacity_mw_generator',
                                  'winter_capacity_mw': 'winter_capacity_mw_generator'}, inplace=True)

In [None]:
mcoe_by_generator.head(2)

In [None]:
# mcoe_by_generator.drop('plant_id_pudl', axis=1, inplace=True)

In [None]:
# Left merge keeps all rows of mcoe_by_generator and duplicates values of ferc_expenses_df 
# for each combination of plant_id_pudl and report_date
merged = mcoe_by_generator.merge(ferc_expenses_df, how="left", left_on=['plant_id_pudl', 'report_year'], right_on=['plant_id_pudl', 'report_year'])

In [None]:
# remove the non-simple plants by dropping rows where plant_name_ferc = NA
merged = merged.dropna(subset=['plant_name_ferc'])

In [None]:
merged.head()

In [None]:
# rearrange columns
merged = merged[[
    'plant_id_pudl', 'plant_id_eia', 'plant_name_eia',
    'operator_name_eia', 'state', 'report_year', 
    'generator_id', 
    'energy_source', 'mmbtu_es', 'mmbtu_plant',
    'fuel_cost_es', 'fuel_cost_plant',
    'fuel_cost_per_mmbtu_average',
    'fuel_consumed_mmbtu', 
    'fuel_consumed_mmbtu_per_gen', 'fuel_cost_annual_eia',
    'heat_rate_mmbtu_mwh', 'plant_assn', 
    'net_generation_mwh_generator', 'net_generation_mwh_plant_eia',
    'fuel_cost_per_mwh', 'nameplate_capacity_mw_generator', 
    'capacity_factor',
    'report_year', 'respondent_id_ferc', 'util_id_pudl', 
    'respondent_name_ferc', 'plant_name_ferc',
    'total_capacity_mw_plant', 'year_constructed', 'year_installed',
    'peak_demand_mw', 'water_limited_mw', 'not_water_limited_mw',
    'plant_hours', 'net_generation_mwh_plant', 'expns_operations',
    'expns_fuel', 'expns_coolants', 'expns_steam', 'expns_steam_other',
    'expns_transfer', 'expns_electric', 'expns_misc_power', 'expns_rents',
    'expns_allowances', 'expns_engineering', 'expns_structures',
    'expns_boiler', 'expns_plants', 'expns_misc_steam',
    'expns_production_total', 'expns_per_mwh',
    'expns_total_nonfuel_production', 'expns_total_nonproduction']]

In [None]:
# number of plants in this data set
len(merged.plant_id_pudl.unique())

In [None]:
mcoe_annotations = pd.read_csv('mcoe_field_annotations_detailed.csv')
mcoe_notes = pd.read_csv('mcoe_notes.csv')

In [None]:
# # to export all the data
# xlsx_writer = pd.ExcelWriter('coal_plants_detailed_MCOE_by_generator_2011-2016.xlsx')
# merged.to_excel(xlsx_writer, sheet_name='MCOE by Generator', index=False, na_rep='NA')
# corr.to_excel(xlsx_writer, sheet_name='Expense Correlations', index=False, na_rep='NA')
# mcoe_annotations.to_excel(xlsx_writer, sheet_name='MCOE Annotations', index=False)
# mcoe_notes.to_excel(xlsx_writer, sheet_name='MCOE Notes', index=False)

# xlsx_writer.save()

In [None]:
# to export all the data
# xlsx_writer = pd.ExcelWriter('test_primary_fuel_cost_attribution.xlsx')
# xlsx_writer = pd.ExcelWriter('test_plant_category_cost_attribution.xlsx')
# xlsx_writer = pd.ExcelWriter('coal_plants_NSPC&MPI_detailed_MCOE_by_generator_2011-2016.xlsx')

# test.to_excel(xlsx_writer, sheet_name='MCOE by Generator', index=False, na_rep='NA')
# corr.to_excel(xlsx_writer, sheet_name='Expense Correlations', index=False, na_rep='NA')
# mcoe_annotations.to_excel(xlsx_writer, sheet_name='MCOE Annotations', index=False)
# mcoe_notes.to_excel(xlsx_writer, sheet_name='MCOE Notes', index=False)

# xlsx_writer.save()

In [None]:
merged.head(3)

In [None]:
merged2 = merged.dropna(subset=['net_generation_mwh_plant_eia', 'net_generation_mwh_plant'])

In [None]:
fuels = ['gas','oil','coal']
fig, (ax) = plt.subplots(ncols=1, nrows=1)
fig.set_figwidth(10)
fig.set_figheight(10)
fig.set_dpi(150)
ax.loglog()
plt.grid(b=True)
R2 = np.corrcoef(merged2.net_generation_mwh_plant_eia, merged2.net_generation_mwh_plant)[0,1]**2
ax.scatter(merged2.net_generation_mwh_plant_eia, merged2.net_generation_mwh_plant, s=10, color='blue')
ax.legend(loc='upper left')
plt.title("Annual net generation by plant, EIA 923 vs. FERC Form 1 ($r^2$={:.2f})".format(R2))
plt.xlabel("Net generation by PUDL plant from EIA923 (MWh)")
plt.ylabel("Net generation by PUDL plant from FERC1 (MWh)")
plt.show();

In [None]:
merged3 = merged.dropna(subset=['fuel_cost_annual_eia', 'expns_fuel'])

In [None]:
fuels = ['gas','oil','coal']
fig, (ax) = plt.subplots(ncols=1, nrows=1)
fig.set_figwidth(10)
fig.set_figheight(10)
fig.set_dpi(150)
ax.loglog()
plt.grid(b=True)
R2 = np.corrcoef(merged3.fuel_cost_annual_eia, merged3.expns_fuel)[0,1]**2
ax.scatter(merged3.fuel_cost_annual_eia, merged3.expns_fuel, s=10, color='green')
ax.legend(loc='upper left')
plt.title("Fuel cost per year, EIA 923 vs. FERC Form 1, ($r^2$={:.2f})".format(R2))
plt.xlabel("Fuel cost per year, by PUDL plant from EIA923 ($)")
plt.ylabel("Fuel cost per year, by PUDL plant from FERC1 ($)")
plt.show();

In [None]:
plt.hist(merged.net_generation_mwh_plant_eia, range=(0,500000), bins=100, alpha=0.5, color='green', label="EIA")
plt.hist(merged.net_generation_mwh_plant, range=(0,500000), bins=100, alpha=0.5, color='blue', label='FERC')
plt.xlabel("Total net generation per plant (MWh)")
plt.ylabel("Number of records")
plt.title("Net generation (MWh)")
plt.legend()

In [None]:
#quick look at spread in fuel_cost_per_mwh for these plants
plt.hist(merged.fuel_cost_per_mwh, range=(0,200), bins=200, alpha=0.5, color='green')
ferc_fuel_cost_per_mwh = merged.expns_fuel / merged.net_generation_mwh_plant
plt.hist(ferc_fuel_cost_per_mwh, range=(0,200), bins=200, alpha=0.5, color='blue')
plt.xlabel("Fuel Cost ($/mwh)")
plt.ylabel("Number of records")
plt.title("Cost of fuel")


## A few plots

In [None]:
# Take a look at heat rates from EIA
plt.hist(heat_rate.heat_rate_mmbtu_mwh, range=[0,50],bins=100,weights=heat_rate.net_generation_mwh, alpha=0.5)
#heat_rate['heat_rate_mmbtu_mwh'].plot(kind='hist', bins=100, range=[-20,70])
plt.xlabel('Generator heat rate (MMBtu/MWh)')
plt.ylabel("Generator records (weighted by net generation)")
plt.title("Heat rates")
plt.show()

In [None]:
#quick look at spread in fuel_cost_per_mmbtu
plt.hist(mcoe_by_generator.fuel_cost_per_mmbtu_average, range=(-5,50), bins=200, alpha=0.5, color='green')
plt.xlabel("Fuel Cost ($/mmBTU)")
plt.ylabel("Number of generator records")
plt.title("Cost of fuel")
plt.legend()

In [None]:
#quick look at capacity factors
plt.hist(mcoe_by_generator.capacity_factor, range=(0,1.5), bins=200, alpha=0.5, color='blue')
plt.xlabel("Capacity factor (Net generation/Nameplate capacity)")
plt.ylabel("Number of generator records")
plt.title("Capacity factors")
plt.legend()

In [None]:
#quick look at spread in fuel_cost_per_mwh for these plants
plt.hist(merged.fuel_cost_per_mwh, range=(0,200), bins=200, alpha=0.5, color='green')
plt.xlabel("Fuel Cost ($/mwh)")
plt.ylabel("Number of generator records")
plt.title("Cost of fuel")
plt.legend()