In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import sys
import os
import numpy as np
import pandas as pd
import sqlalchemy as sa
sys.path.append(os.path.abspath(os.path.join('..','..','..')))
from pudl import pudl, ferc1, eia923, settings, constants, analysis
from pudl import models, models_ferc1, models_eia923, outputs
from pudl import clean_eia923, clean_ferc1, clean_pudl, mcoe
import matplotlib.pyplot as plt
import matplotlib as mpl
# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
pd.options.mode.chained_assignment = None
# Plot defaults applied to every figure drawn below.
plt.style.use('ggplot')
mpl.rcParams['figure.figsize'] = (10,6)
mpl.rcParams['figure.dpi'] = 150
# Allow up to 56 columns and 100 rows when a DataFrame is displayed.
pd.options.display.max_columns = 56
pd.options.display.max_rows = 100

In [2]:
pudl_engine  = pudl.db_connect_pudl()

# Pulling the required tables

In [3]:
# Pull every input used below from the PUDL database in one place.
# Several of these calls are repeated in the per-table sections further down.
# EIA923 generation.
g9_summed = mcoe.generation_pull_eia923(pudl_engine)
# EIA860 generators: returns a pair; g8_es is later passed to mcoe.fuel_cost.
g8, g8_es = mcoe.generators_pull_eia860(pudl_engine)
# EIA923 fuel receipts & costs: returns a pair, both later passed to mcoe.fuel_cost.
# NOTE(review): "reciepts" is spelled as in the call; presumably it matches the function name in mcoe - verify.
frc9_summed, frc9_summed_plant = mcoe.fuel_reciepts_costs_pull_eia923(pudl_engine)
# EIA860 boiler-generator association.
bga8 = mcoe.boiler_generator_pull_eia860(pudl_engine)
# EIA923 boiler fuel: returns a pair, both later passed to mcoe.heat_rate.
bf9_summed, bf9_plant_summed = mcoe.boiler_fuel_pull_eia923(pudl_engine)

## Forcing the energy source column to coal is problematic & needs to be undone

In [None]:
# change energy source column to coal
# frc9['energy_source'] = 'Coal'

In [None]:
# Get yearly fuel cost by plant_id, year and energy_source
# frc9_summed = analysis.yearly_sum_eia(frc9 , 'fuel_cost', columns=['plant_id_eia', 'plant_id_pudl','report_year','energy_source_cons']) #toggle
# frc9_summed = frc9_summed.reset_index()
# frc9_summed = frc9_summed.rename(columns={'primary_fuel': 'energy_source'})

In [None]:
frc9_summed.head()

In [None]:
# frc9_mmbtu_summed = analysis.yearly_sum_eia(frc9 , 'mmbtu', columns=['plant_id_eia', 'plant_id_pudl','report_year','energy_source_cons']) #toggle
# # frc9_mmbtu_summed = frc9_mmbtu_summed.rename(columns={'primary_fuel': 'energy_source'})
# frc9_mmbtu_summed = frc9_mmbtu_summed.reset_index()
# frc9_mmbtu_summed.head(2)

In [None]:
# frc9_summed = frc9_mmbtu_summed.merge(frc9_summed)
# frc9_summed['fuel_cost_per_mmbtu_average'] = (frc9_summed.fuel_cost/frc9_summed.mmbtu)
# frc9_summed.head()

### Generation EIA923

In [None]:
# Convert the generation_eia923 table into a dataframe
g9_summed = mcoe.generation_pull_eia923(pudl_engine)

## ONLY COAL PLANTS:
#subset the g9 data frame to only coal plants
# # g9 = g9[g9['plant_id_eia'].isin(plants)]

# # Get yearly net generation by plant_id, year and generator_id
# g9_summed = analysis.yearly_sum_eia(g9, 'net_generation_mwh', columns=['report_year','plant_id_eia', 'plant_id_pudl', 'generator_id'])
# g9_summed.reset_index(inplace=True)

In [None]:
g9_summed.head()

In [None]:
g9_summed[g9_summed["plant_id_pudl"]==217].head()

### Generators EIA860

In [None]:
# Convert the generators_eia860 table into a dataframe
g8, g8_es = mcoe.generators_pull_eia860(pudl_engine)
# g8 = analysis.simple_select('generators_eia860', pudl_engine)

In [None]:
# In order to use the energy source associated with each generator
#subset the g8 data frame to only coal plants

# g8_es['energy_source'] = 'Coal'
# g8 = g8[g8['plant_id_eia'].isin(plants)]
# g8_es = g8_es[g8_es['plant_id_eia'].isin(plants)]

In [None]:
g8.head()

In [None]:
# In order to use the capacity of each generator
# g8 = g8[['plant_id_eia','plant_id_pudl','report_year','plant_name', 'operator_name', 'state', 'generator_id','nameplate_capacity_mw','summer_capacity_mw','winter_capacity_mw']]


In [None]:
# re-assign all energy source values to coal
# g8['energy_source'] = 'Coal'

### Calculate a generator's proportion of plant capacity

In [None]:
# capacity_eia = analysis.capacity_proportion_eia923(g8, id_col='plant_id_eia')
# capacity_pudl = analysis.capacity_proportion_eia923(g8, id_col='plant_id_pudl')

### Boiler Fuel EIA923

In [None]:
# Convert the boiler_fuel_eia923 table into a dataframe
bf9 = mcoe.boiler_fuel_pull_eia923(pudl_engine)

In [None]:
# #subset the bf9 data frame to only coal plants
# bf9 = bf9[bf9['plant_id_eia'].isin(plants)]

# # re-assign all energy source values to coal
# bf9['energy_source'] = 'Coal'

In [None]:
# mcoe.boiler_fuel_pull_eia923 returns a pair (the pull cell near the top unpacks it the same way).
# First element: yearly fuel consumed by plant_id, year and boiler_id
bf9_summed = bf9[0]
# Second element: the plant-level counterpart - presumably summed by plant_id and year only; verify in mcoe
bf9_plant_summed = bf9[1]

### Boiler Generator Association EIA860

In [None]:
# Convert the boiler_generator_assn_eia860 table into a dataframe
bga8 = mcoe.boiler_generator_pull_eia860(pudl_engine)

In [None]:
# get the primary fuel for each plant (to be used to combine plants that have unassociated generators)
gf9 = analysis.simple_select('generation_fuel_eia923', pudl_engine)
# primary_fuel = analysis.primary_fuel_gf_eia923(gf9).rename(columns={'year':'report_date'})

In [None]:
#subset the gf9 data frame to only coal plants
# gf9 = gf9[gf9['plant_id_eia'].isin(plants)]

In [None]:
g9_summed.head(2)

In [None]:
g8.head(2)

In [None]:
g9_summed.head()

## Generating Capacity Factor

In [None]:
capacity_factor = analysis.capacity_factor(g9_summed,g8, id_col='plant_id_eia')

In [None]:
capacity_factor.head(2)

In [None]:
testplant = capacity_factor[capacity_factor['plant_id_pudl']==217]
# testplant = capacity_factor[capacity_factor['plant_id_eia']==7343]
# testplant = capacity_factor[capacity_factor['plant_id_eia']==1091]

In [None]:
testplant.head(8)

## Generating Heat Rate

In [None]:
# Calculate heat rate here (instead of with function) to retain 
# some of the underlying fields used for calculations

In [None]:
g9_summed.head(2)

In [None]:
# Build gens_with_bga from the boiler-generator association and EIA923 generation, keyed on plant_id_pudl.
# NOTE(review): gens_with_bga is not referenced by any later cell visible in this notebook.
gens_with_bga = mcoe.gens_with_bga(bga8, g9_summed, id_col='plant_id_pudl')

In [None]:
g8.head(5)

In [5]:
heat_rate = mcoe.heat_rate(bga8, g9_summed, bf9_summed, bf9_plant_summed, pudl_engine, id_col='plant_id_eia')

In [None]:
heat_rate.head()

In [6]:
fuel_cost = mcoe.fuel_cost(g8_es, g9_summed, frc9_summed, frc9_summed_plant, heat_rate)

In [None]:
fuel_cost.head(2)

## Calculating proportions of generation & capacity

In [None]:
# proportion_gen_eia = analysis.generator_proportion_eia923(g9, id_col='plant_id_eia')
# proportion_gen_pudl = analysis.generator_proportion_eia923(g9, id_col='plant_id_pudl')

In [None]:
# convert Y-M-D to year
# proportion_gen_eia.report_date = proportion_gen_eia.report_date.dt.year
# proportion_gen_pudl.report_date = proportion_gen_pudl.report_date.dt.year

In [None]:
# testplant = proportion[proportion['plant_id_pudl']==217]
# testplant = proportion[proportion['plant_id_eia']==7343]
# testplant = proportion[proportion['plant_id_eia']==1091]

In [None]:
# g9[g9["plant_id_eia"]==1091] #this is plant_id_pudl 217
# g9[g9["plant_id_eia"]==7343] #this is also plant_id_pudl 217

## Generating Fuel Cost

In [None]:
# Same call, with the same arguments, as the fuel_cost cell in the heat-rate section above.
fuel_cost = mcoe.fuel_cost(g8_es, g9_summed, frc9_summed, frc9_summed_plant, heat_rate)

In [None]:
# fuel_cost.plant_id_pudl.astype(int)
heat_rate.head(2)

In [None]:
fuel_cost.head(2)

### Merge fields together

In [None]:
# Keep only the fuel-cost fields needed downstream, then attach every heat_rate
# column to them generator-by-generator (default inner join on the four keys).
mcoe_by_generator = pd.merge(
    fuel_cost.loc[:, [
        'plant_id_eia',
        'plant_id_pudl',
        'report_year',
        'generator_id',
        'energy_source',
        'mmbtu_es',
        'mmbtu_plant',
        'fuel_cost_es',
        'fuel_cost_plant',
        'fuel_cost_per_mmbtu_average',
        'fuel_cost_per_mwh',
    ]],
    heat_rate,
    on=['plant_id_eia', 'plant_id_pudl', 'report_year', 'generator_id'],
)

In [None]:
mcoe_by_generator.head(2)

In [None]:
# Attach the capacity-factor columns. net_generation_mwh is dropped from the
# right-hand frame first so it is not brought in a second time by the merge.
mcoe_by_generator = pd.merge(
    mcoe_by_generator,
    capacity_factor.drop(labels='net_generation_mwh', axis='columns'),
    on=['plant_id_eia', 'plant_id_pudl', 'report_year', 'generator_id'],
)


In [None]:
mcoe_by_generator.head(2)

In [None]:
mcoe_by_generator = mcoe_by_generator.merge(bf9_plant_summed, on=['plant_id_eia', 'plant_id_pudl','report_year'])

In [None]:
mcoe_by_generator.head()

In [None]:
fuel_cost.head(5)

### Filter to include only plants >= 10 MW

In [None]:
# Keep only generator records whose nameplate capacity is at least 10 MW
# (rows with a missing capacity fail the comparison and are dropped too).
mcoe_by_generator = mcoe_by_generator.loc[mcoe_by_generator['nameplate_capacity_mw'].ge(10)]

## Aggregate fuel cost and net generation at the plant level for comparison with FERC

In [None]:
# Load the raw EIA923 fuel receipts/costs table and derive a total cost per record:
# quantity x heat content x cost per mmbtu.
frc9 = analysis.simple_select('fuel_receipts_costs_eia923', pudl_engine)
frc9['fuel_cost'] = (
    frc9['fuel_quantity']
    .mul(frc9['average_heat_content'])
    .mul(frc9['fuel_cost_per_mmbtu'])
)

In [None]:
# Annual EIA923 fuel cost per PUDL plant, used for the plant-level comparison with FERC.
# reset_index() is applied exactly once: the original cell called it a second time,
# which on the already-flat frame only added a spurious 'index' column that then
# leaked into every frame merged with fuel_summed.
# Assumes yearly_sum_eia returns a frame indexed by the grouping columns, as the
# original reset_index() call implies.
fuel_summed = (
    analysis.yearly_sum_eia(frc9, 'fuel_cost', columns=['plant_id_pudl', 'report_year'])
    .reset_index()
    .rename(columns={'fuel_cost': 'fuel_cost_annual_eia'})
)

In [None]:
fuel_summed.head(2)

In [None]:
# Sum generator-level net generation up to one row per (EIA plant, PUDL plant, year).
eia_netgen = mcoe_by_generator.groupby(by=['plant_id_eia', 'plant_id_pudl', 'report_year'])
# 'sum' (string) rather than np.sum: pandas >= 2.1 warns that passing the NumPy
# callable is deprecated; the string selects the same groupby sum in every version.
netgen_summed = (
    eia_netgen.agg({'net_generation_mwh': 'sum'})
    .rename(columns={'net_generation_mwh': 'net_generation_mwh_plant_eia'})
    .reset_index()
)

In [None]:
# Left join: every generator record is kept and gains its plant's annual EIA fuel
# cost (NaN where fuel_summed has no row for that plant and year).
mcoe_by_generator2 = pd.merge(
    mcoe_by_generator,
    fuel_summed,
    how="left",
    on=['plant_id_pudl', 'report_year'],
)

In [None]:
mcoe_by_generator2.head(2)

In [None]:
mcoe_by_generator3 = mcoe_by_generator2.merge(netgen_summed, how="left", on=['plant_id_eia', 'plant_id_pudl', 'report_year'])

In [None]:
mcoe_by_generator3.head(2)

In [None]:
# Carry the fully merged frame forward under the original name.
# The export cell below derives mcoe_by_generator from mcoe_by_generator3 again.
mcoe_by_generator = mcoe_by_generator3

## Export the data frame

In [None]:
# Put the columns in export order. Any column of mcoe_by_generator3 that is not
# listed here is dropped from mcoe_by_generator.
mcoe_by_generator = mcoe_by_generator3.loc[:, [
    # identification
    'plant_id_eia', 'plant_id_pudl', 'plant_name', 'operator_name', 'state',
    'report_year', 'generator_id',
    # fuel and cost
    'energy_source', 'mmbtu_es', 'mmbtu_plant', 'fuel_cost_es', 'fuel_cost_plant',
    'fuel_cost_per_mmbtu_average', 'fuel_consumed_mmbtu',
    'fuel_consumed_mmbtu_per_gen', 'fuel_cost_annual_eia',
    # heat rate, generation and capacity
    'heat_rate_mmbtu_mwh', 'plant_assn', 'net_generation_mwh',
    'net_generation_mwh_plant_eia', 'fuel_cost_per_mwh',
    'nameplate_capacity_mw', 'capacity_factor',
]]

In [None]:
mcoe_by_generator.head(2)

# Pull in FERC data & identify simple FERC plants

In [None]:
simple_ferc = analysis.simple_ferc1_plant_ids(pudl_engine)

In [None]:
# analysis.ferc_expenses() is handed the simple FERC plant ids as a pandas Series,
# so reduce the frame to its first column.
# The isinstance guard makes the cell safe to re-run: once simple_ferc is already a
# Series, the original two-axis .iloc[:, 0] would raise an indexing error.
if isinstance(simple_ferc, pd.DataFrame):
    simple_ferc = simple_ferc.iloc[:, 0]

In [None]:
pd.options.display.max_rows = 999

In [None]:
fp = outputs.plants_steam_ferc1(pudl_engine)
# fp = analysis.get_steam_ferc1_df(pudl_engine)

In [None]:
# ferc_expenses returns:
# ferc1_expns_corr: A dictionary of expense categories
#             and their correlations to the plant's net electricity generation.
# steam_df: a dataframe with all the operating expenses broken out for each simple FERC PUDL plant.

In [None]:
ferc_expenses = analysis.ferc_expenses(pudl_engine, simple_ferc)
ferc_expenses_df = ferc_expenses[1]

In [None]:
ferc_expenses_df.head()

In [None]:
# Turn the {expense category: correlation} dict (first element of ferc_expenses)
# into a two-column table ready for export.
corr = (
    pd.DataFrame.from_dict(ferc_expenses[0], orient='index')
    .rename_axis('expense_variable')
    .reset_index()
    .rename(columns={0: 'correlation_with_net_generation_mwh_plant'})
)

In [None]:
# create a ferc_expenses data frame to merge with EIA data
ferc_expenses_df = ferc_expenses[1]

In [None]:
# Give the FERC columns source-specific names before merging with the EIA frame,
# which also has 'plant_name' and 'net_generation_mwh' columns.
# rename() returns a new frame here instead of mutating in place: the original
# inplace=True also renamed the columns of ferc_expenses[1] (the same object),
# so re-running the earlier cells that read it showed already-renamed data.
ferc_expenses_df = ferc_expenses_df.rename(columns={
    'net_generation_mwh': 'net_generation_mwh_plant',
    'plant_name': 'plant_name_ferc',
    'respondent_id': 'respondent_id_ferc',
    'respondent_name': 'respondent_name_ferc',
    'total_capacity_mw': 'total_capacity_mw_plant',
})

In [None]:
# Mark the EIA-side columns as EIA / generator-level so they stay distinguishable
# from the FERC plant-level columns after the merge. Keys that are not present in
# the frame are ignored by rename().
mcoe_by_generator = mcoe_by_generator.rename(columns={
    'net_generation_mwh': 'net_generation_mwh_generator',
    'plant_name': 'plant_name_eia',
    'operator_name': 'operator_name_eia',
    'nameplate_capacity_mw': 'nameplate_capacity_mw_generator',
    'summer_capacity_mw': 'summer_capacity_mw_generator',
    'winter_capacity_mw': 'winter_capacity_mw_generator',
})

In [None]:
mcoe_by_generator.head(2)

In [None]:
# mcoe_by_generator.drop('plant_id_pudl', axis=1, inplace=True)

In [None]:
# Left merge: every row of mcoe_by_generator is kept, and the matching
# ferc_expenses_df values are repeated on each generator row that shares the
# same plant_id_pudl and report_year.
merged = pd.merge(
    mcoe_by_generator,
    ferc_expenses_df,
    how="left",
    on=['plant_id_pudl', 'report_year'],
)

In [None]:
# Rows with no FERC plant name found no match in ferc_expenses_df, i.e. they are
# not simple plants; keep only the rows that did match.
merged = merged[merged['plant_name_ferc'].notnull()]

In [None]:
merged.head()

In [None]:
# Put the columns in export order: EIA generator-level fields first, FERC
# plant-level fields after.
# 'report_year' is listed once only. The merge above joins on it, so a single
# column exists; the original list named it twice, which selected it twice and
# produced a frame with a duplicated column label.
merged = merged[[
    # EIA / generator level
    'plant_id_pudl', 'plant_id_eia', 'plant_name_eia',
    'operator_name_eia', 'state', 'report_year',
    'generator_id',
    'energy_source', 'mmbtu_es', 'mmbtu_plant',
    'fuel_cost_es', 'fuel_cost_plant',
    'fuel_cost_per_mmbtu_average',
    'fuel_consumed_mmbtu',
    'fuel_consumed_mmbtu_per_gen', 'fuel_cost_annual_eia',
    'heat_rate_mmbtu_mwh', 'plant_assn',
    'net_generation_mwh_generator', 'net_generation_mwh_plant_eia',
    'fuel_cost_per_mwh', 'nameplate_capacity_mw_generator',
    'capacity_factor',
    # FERC / plant level
    'respondent_id_ferc', 'util_id_pudl',
    'respondent_name_ferc', 'plant_name_ferc',
    'total_capacity_mw_plant', 'year_constructed', 'year_installed',
    'peak_demand_mw', 'water_limited_mw', 'not_water_limited_mw',
    'plant_hours', 'net_generation_mwh_plant', 'expns_operations',
    'expns_fuel', 'expns_coolants', 'expns_steam', 'expns_steam_other',
    'expns_transfer', 'expns_electric', 'expns_misc_power', 'expns_rents',
    'expns_allowances', 'expns_engineering', 'expns_structures',
    'expns_boiler', 'expns_plants', 'expns_misc_steam',
    'expns_production_total', 'expns_per_mwh',
    'expns_total_nonfuel_production', 'expns_total_nonproduction']]

In [None]:
# number of plants in this data set
len(merged.plant_id_pudl.unique())

In [None]:
mcoe_annotations = pd.read_csv('mcoe_field_annotations_detailed.csv')
mcoe_notes = pd.read_csv('mcoe_notes.csv')

In [None]:
# # to export all the data
# xlsx_writer = pd.ExcelWriter('coal_plants_detailed_MCOE_by_generator_2011-2016.xlsx')
# merged.to_excel(xlsx_writer, sheet_name='MCOE by Generator', index=False, na_rep='NA')
# corr.to_excel(xlsx_writer, sheet_name='Expense Correlations', index=False, na_rep='NA')
# mcoe_annotations.to_excel(xlsx_writer, sheet_name='MCOE Annotations', index=False)
# mcoe_notes.to_excel(xlsx_writer, sheet_name='MCOE Notes', index=False)

# xlsx_writer.save()

In [None]:
# to export all the data
# xlsx_writer = pd.ExcelWriter('test_primary_fuel_cost_attribution.xlsx')
# xlsx_writer = pd.ExcelWriter('test_plant_category_cost_attribution.xlsx')
# xlsx_writer = pd.ExcelWriter('coal_plants_NSPC&MPI_detailed_MCOE_by_generator_2011-2016.xlsx')

# test.to_excel(xlsx_writer, sheet_name='MCOE by Generator', index=False, na_rep='NA')
# corr.to_excel(xlsx_writer, sheet_name='Expense Correlations', index=False, na_rep='NA')
# mcoe_annotations.to_excel(xlsx_writer, sheet_name='MCOE Annotations', index=False)
# mcoe_notes.to_excel(xlsx_writer, sheet_name='MCOE Notes', index=False)

# xlsx_writer.save()

In [None]:
merged.head(3)

In [None]:
merged2 = merged.dropna(subset=['net_generation_mwh_plant_eia', 'net_generation_mwh_plant'])

In [None]:
# Log-log scatter of annual plant net generation: EIA 923 (x) against FERC Form 1 (y),
# with r^2 of the raw (not log-transformed) values shown in the title.
# NOTE(review): merged2 appears to hold one row per generator with the plant totals
# repeated, so multi-generator plants contribute several identical points to the
# plot and to r^2 - confirm this is intended.
fig, ax = plt.subplots(ncols=1, nrows=1, figsize=(10, 10), dpi=150)
ax.loglog()
# Positional argument: the `b=` keyword of grid() was renamed to `visible=` in
# Matplotlib 3.5 and later removed, so `grid(b=True)` fails on current releases.
ax.grid(True)
R2 = np.corrcoef(merged2.net_generation_mwh_plant_eia, merged2.net_generation_mwh_plant)[0,1]**2
ax.scatter(merged2.net_generation_mwh_plant_eia, merged2.net_generation_mwh_plant, s=10, color='blue')
# No legend: there is a single unlabelled series, so legend() had nothing to show
# and only emitted a "no artists with labels" warning.
ax.set_title("Annual net generation by plant, EIA 923 vs. FERC Form 1 ($r^2$={:.2f})".format(R2))
ax.set_xlabel("Net generation by PUDL plant from EIA923 (MWh)")
ax.set_ylabel("Net generation by PUDL plant from FERC1 (MWh)")
plt.show()

In [None]:
merged3 = merged.dropna(subset=['fuel_cost_annual_eia', 'expns_fuel'])

In [None]:
# Log-log scatter of annual plant fuel cost: EIA 923 (x) against FERC Form 1 (y),
# with r^2 of the raw (not log-transformed) values shown in the title.
# NOTE(review): merged3 appears to hold one row per generator with the plant totals
# repeated, so multi-generator plants contribute several identical points to the
# plot and to r^2 - confirm this is intended.
fig, ax = plt.subplots(ncols=1, nrows=1, figsize=(10, 10), dpi=150)
ax.loglog()
# Positional argument: the `b=` keyword of grid() was renamed to `visible=` in
# Matplotlib 3.5 and later removed, so `grid(b=True)` fails on current releases.
ax.grid(True)
R2 = np.corrcoef(merged3.fuel_cost_annual_eia, merged3.expns_fuel)[0,1]**2
ax.scatter(merged3.fuel_cost_annual_eia, merged3.expns_fuel, s=10, color='green')
# No legend: there is a single unlabelled series, so legend() had nothing to show
# and only emitted a "no artists with labels" warning.
ax.set_title("Fuel cost per year, EIA 923 vs. FERC Form 1, ($r^2$={:.2f})".format(R2))
ax.set_xlabel("Fuel cost per year, by PUDL plant from EIA923 ($)")
ax.set_ylabel("Fuel cost per year, by PUDL plant from FERC1 ($)")
plt.show()

In [None]:
# Overlaid histograms of plant-level net generation as reported to EIA (green) and
# FERC (blue), restricted to 0-500,000 MWh in 100 bins.
# NOTE(review): merged looks like it has one row per generator with the plant totals
# repeated, so a plant is presumably counted once per generator - confirm that
# "Number of records" is meant to be generator records.
plt.hist(merged.net_generation_mwh_plant_eia, range=(0,500000), bins=100, alpha=0.5, color='green', label="EIA")
plt.hist(merged.net_generation_mwh_plant, range=(0,500000), bins=100, alpha=0.5, color='blue', label='FERC')
plt.xlabel("Total net generation per plant (MWh)")
plt.ylabel("Number of records")
plt.title("Net generation (MWh)")
plt.legend()

In [None]:
# Quick look at the spread in fuel cost per MWh for these plants: the EIA-derived
# generator value (green) against FERC fuel expense / FERC plant net generation (blue).
# Both series are labelled and a legend is drawn so the two overlaid histograms
# can be told apart, matching the net-generation histogram above.
plt.hist(merged.fuel_cost_per_mwh, range=(0,200), bins=200, alpha=0.5, color='green', label='EIA')
ferc_fuel_cost_per_mwh = merged.expns_fuel / merged.net_generation_mwh_plant
plt.hist(ferc_fuel_cost_per_mwh, range=(0,200), bins=200, alpha=0.5, color='blue', label='FERC')
plt.xlabel("Fuel Cost ($/mwh)")
plt.ylabel("Number of records")
plt.title("Cost of fuel")
plt.legend();


## A few plots

In [None]:
# Distribution of EIA generator heat rates between 0 and 50 MMBtu/MWh, with each
# generator record weighted by its net generation.
plt.hist(
    heat_rate['heat_rate_mmbtu_mwh'],
    bins=100,
    range=(0, 50),
    weights=heat_rate['net_generation_mwh'],
    alpha=0.5,
)
#heat_rate['heat_rate_mmbtu_mwh'].plot(kind='hist', bins=100, range=[-20,70])
plt.xlabel('Generator heat rate (MMBtu/MWh)')
plt.ylabel("Generator records (weighted by net generation)")
plt.title("Heat rates")
plt.show()

In [None]:
# Quick look at the spread in average fuel cost per mmBTU across generator records.
plt.hist(mcoe_by_generator.fuel_cost_per_mmbtu_average, range=(-5,50), bins=200, alpha=0.5, color='green')
plt.xlabel("Fuel Cost ($/mmBTU)")
plt.ylabel("Number of generator records")
plt.title("Cost of fuel")
# show() instead of legend(): the single series has no label, so legend() drew
# nothing and only emitted a "no artists with labels" warning.
plt.show()

In [None]:
# Quick look at the distribution of generator capacity factors (0 to 1.5).
plt.hist(mcoe_by_generator.capacity_factor, range=(0,1.5), bins=200, alpha=0.5, color='blue')
plt.xlabel("Capacity factor (Net generation/Nameplate capacity)")
plt.ylabel("Number of generator records")
plt.title("Capacity factors")
# show() instead of legend(): the single series has no label, so legend() drew
# nothing and only emitted a "no artists with labels" warning.
plt.show()

In [None]:
# Quick look at the spread in EIA-derived fuel cost per MWh for the plants in merged.
plt.hist(merged.fuel_cost_per_mwh, range=(0,200), bins=200, alpha=0.5, color='green')
plt.xlabel("Fuel Cost ($/mwh)")
plt.ylabel("Number of generator records")
plt.title("Cost of fuel")
# show() instead of legend(): the single series has no label, so legend() drew
# nothing and only emitted a "no artists with labels" warning.
plt.show()