In [1]:
import sys
import os
import numpy as np
import pandas as pd
import sqlalchemy as sa
from pudl import pudl, ferc1, eia923, settings, constants, analysis
from pudl import models, models_ferc1, models_eia923
from pudl import clean_eia923, clean_ferc1, clean_pudl, mcoe
import matplotlib.pyplot as plt
import matplotlib as mpl
pd.options.mode.chained_assignment = None
%matplotlib inline
plt.style.use('ggplot')
mpl.rcParams['figure.figsize'] = (10,6)
mpl.rcParams['figure.dpi'] = 150
pd.options.display.max_columns = 56

In [2]:
# Database connection handle passed to every pull/analysis helper below.
pudl_engine = pudl.connect_db()

### Pulling the required tables

In [3]:
# Pull the inputs for the capacity factor, heat rate and fuel cost steps below.
# What each helper returns is defined in pudl.mcoe and is not shown here; the
# notes are inferred from the helper names only.
# NOTE(review): presumably summed EIA 923 generation -- confirm in pudl.mcoe.
g9_summed = mcoe.generation_pull_eia923(pudl_engine)
# Two EIA 860 generator frames; g8 feeds capacity_factor, g8_es feeds fuel_cost.
g8, g8_es = mcoe.generators_pull_eia860(pudl_engine)
# The "reciepts" spelling is the helper's actual name in pudl.mcoe; do not correct it here.
frc9_summed, frc9_summed_plant = mcoe.fuel_reciepts_costs_pull_eia923(pudl_engine)
bga8 = mcoe.boiler_generator_pull_eia860(pudl_engine)
bf9_summed, bf9_plant_summed = mcoe.boiler_fuel_pull_eia923(pudl_engine)

## Generating Capacity Factor

In [4]:
# Capacity factor per generator, from the generation and generator tables pulled above.
capacity_factor = analysis.capacity_factor(g9_summed, g8)

## Generating Heat Rate

In [5]:
# Heat rates from the boiler-generator associations, generation and boiler fuel tables.
heat_rate = mcoe.heat_rate(
    bga8,
    g9_summed,
    bf9_summed,
    bf9_plant_summed,
    pudl_engine,
)

## Generating Fuel Cost

In [6]:
# Fuel cost, built from the fuel receipts tables and the heat rates computed above.
fuel_cost = mcoe.fuel_cost(
    g8_es,
    g9_summed,
    frc9_summed,
    frc9_summed_plant,
    heat_rate,
)

### Merge fields together

In [7]:
# Combine the per-generator fuel costs with the heat rates. Only the listed
# fuel-cost columns are carried over; rows are matched on plant, year and
# generator with the default inner join, so generators missing from either
# table are dropped.
mcoe_by_generator = pd.merge(
    fuel_cost[[
        'plant_id_eia',
        'plant_id_pudl',
        'report_year',
        'generator_id',
        'energy_source',
        'fuel_cost_per_mmbtu_average',
        'fuel_cost_per_mwh',
    ]],
    heat_rate,
    on=['plant_id_eia', 'plant_id_pudl', 'report_year', 'generator_id'],
)

In [9]:
# Add capacity factors. net_generation_mwh is dropped from capacity_factor
# first because the heat-rate merge already supplied that column; keeping both
# would produce _x/_y duplicates.
mcoe_by_generator = mcoe_by_generator.merge(
    capacity_factor.drop(['net_generation_mwh'], axis=1),
    on=[
        'plant_id_eia',
        'plant_id_pudl',
        'report_year',
        'generator_id',
    ],
)

### Filter to include only generators with nameplate capacity >= 10 MW

In [11]:
# Keep only the generator records whose nameplate capacity is at least 10 MW
# (the comparison is inclusive, so exactly 10 MW is retained).
mcoe_by_generator = mcoe_by_generator.loc[
    lambda gens: gens['nameplate_capacity_mw'] >= 10
]

In [12]:
# Preview the first rows of the filtered generator-level table.
mcoe_by_generator.head()

Unnamed: 0,plant_id_eia,plant_id_pudl,report_year,generator_id,energy_source,fuel_cost_per_mmbtu_average,fuel_cost_per_mwh,complete_assn,fuel_consumed_mmbtu_per_gen,heat_rate_mmbtu_mwh,net_generation_mwh,plant_assn,plant_name,operator_name,state,nameplate_capacity_mw,summer_capacity_mw,winter_capacity_mw,capacity_factor
0,7,204,2011,1,BIT,3.968971,63.964246,True,2976913.0,16.116077,184717.0,True,Gadsden,Alabama Power Co,AL,69.0,64.0,64.0,0.3056
1,7,204,2011,2,BIT,3.968971,57.239085,True,655939.6,14.421642,45483.0,True,Gadsden,Alabama Power Co,AL,69.0,66.0,66.0,0.075248
2,8,227,2011,10,BIT,4.13972,40.567746,True,32816580.0,9.799634,3348756.0,True,Gorgas,Alabama Power Co,AL,788.8,703.0,703.0,0.484632
3,8,227,2011,6,BIT,4.13972,51.88191,True,3018039.0,12.532709,240813.0,True,Gorgas,Alabama Power Co,AL,125.0,103.0,103.0,0.219921
4,8,227,2011,7,BIT,4.13972,47.555815,True,3142376.0,11.487687,273543.0,True,Gorgas,Alabama Power Co,AL,125.0,104.0,104.0,0.249811


## Aggregate fuel cost and net generation at the plant level for comparison with FERC

In [19]:
# Load the fuel_receipts_costs_eia923 table for the plant-level fuel cost sum.
frc9 = analysis.simple_select(
    'fuel_receipts_costs_eia923',
    pudl_engine,
)

In [13]:
# Annual fuel cost per EIA plant, summed from the EIA 923 fuel receipts table.
# NOTE(review): assumes yearly_sum_eia returns a frame indexed by the grouping
# columns -- confirm in pudl.analysis.
eia_fuel = analysis.yearly_sum_eia(frc9, 'fuel_cost', columns=['plant_id_eia', 'report_year'])
# Reset the index exactly once to get a flat frame. A second reset_index()
# would only add a spurious 'index' column of old row numbers, which would
# then leak into the merge with mcoe_by_generator.
fuel_summed = (
    eia_fuel
    .reset_index()
    .rename(columns={'fuel_cost': 'fuel_cost_annual_eia'})
)

NameError: name 'frc9' is not defined

In [15]:
# Plant-level annual net generation, summed over the generator records.
# The totals only cover rows still present in mcoe_by_generator, i.e. after the
# >= 10 MW filter applied earlier.
eia_netgen = mcoe_by_generator.groupby(by=['plant_id_eia', 'report_year'])
netgen_summed = (
    eia_netgen
    .agg({'net_generation_mwh': np.sum})
    .rename(columns={'net_generation_mwh': 'net_generation_mwh_plant_eia'})
    .reset_index()
)

In [None]:
# Attach the annual plant-level EIA fuel cost to every generator record.
# Both frames are keyed on plant_id_eia / report_year: fuel_summed was grouped
# on exactly those columns, and mcoe_by_generator carries report_year (it has
# no report_date column, and fuel_summed has no plant_id column).
mcoe_by_generator2 = mcoe_by_generator.merge(fuel_summed, how="left", on=['plant_id_eia', 'report_year'])

In [None]:
# Attach the plant-level net generation total to every generator record.
# netgen_summed was grouped on plant_id_eia / report_year, so those are the
# join keys; neither frame has a report_date column.
mcoe_by_generator3 = mcoe_by_generator2.merge(netgen_summed, how="left", on=['plant_id_eia', 'report_year'])

In [None]:
#mcoe_by_generator3.drop('heat_rate_mmbtu_mwh_x', axis=1, inplace=True)
#mcoe_by_generator3.drop('heat_rate_mmbtu_mwh_y', axis=1, inplace=True)

In [None]:
# Continue under the original name: mcoe_by_generator now refers to the same
# frame as mcoe_by_generator3 (plant-level fuel cost and net generation merged in).
mcoe_by_generator = mcoe_by_generator3

## Export the data frame

In [None]:
# Rearrange columns into the export order (columns not listed are dropped).
# The time key is report_year: every upstream merge is keyed on it, and the
# generator table has no report_date column, so selecting 'report_date' would
# raise a KeyError.
mcoe_by_generator = mcoe_by_generator3[[
    'plant_id_eia',
    'plant_id_pudl',
    'plant_name',
    'operator_name',
    'state',
    'report_year',
    'generator_id',
    'energy_source',
    'fuel_cost_per_mmbtu_average',
    'fuel_consumed_mmbtu_per_gen',
    'fuel_cost_annual_eia',
    'heat_rate_mmbtu_mwh',
    'plant_assn',
    'net_generation_mwh',
    'net_generation_mwh_plant_eia',
    'fuel_cost_per_mwh',
    'nameplate_capacity_mw',
    'summer_capacity_mw',
    'winter_capacity_mw',
    'capacity_factor']]

# Pull in FERC data & identify the simple FERC plants

In [None]:
# Fetch the FERC plant IDs that pudl.analysis classifies as "simple".
# The classification rule lives in analysis.simple_ferc1_plant_ids and is not shown here.
simple_ferc = analysis.simple_ferc1_plant_ids(pudl_engine)

In [None]:
# Convert the simple FERC plant list into a pandas Series (its first column)
# for use as input to analysis.ferc_expenses().
# The isinstance guard keeps this cell re-runnable: once simple_ferc is already
# a Series, a second .iloc[:, 0] would raise an indexing error.
if isinstance(simple_ferc, pd.DataFrame):
    simple_ferc = simple_ferc.iloc[:, 0]

In [None]:
# analysis.ferc_expenses() returns two items, indexed below as [0] and [1]:
# [0] ferc1_expns_corr: a dictionary of expense categories
#             and their correlations to the plant's net electricity generation.
# [1] steam_df: a dataframe with all the operating expenses broken out for each simple FERC PUDL plant.

In [None]:
# Compute FERC expenses for the simple plants. The result is indexable:
# item 0 is used further down to build the correlation table, item 1 is the
# expenses DataFrame.
ferc_expenses = analysis.ferc_expenses(pudl_engine, simple_ferc)
ferc_expenses_df = ferc_expenses[1]

In [None]:
# Build the correlation table for export: one row per expense variable, with
# its correlation to plant net generation.
corr = pd.DataFrame.from_dict(ferc_expenses[0], orient='index').rename(
    columns={0: 'correlation_with_net_generation_mwh_plant'}
)
# Name the index so that reset_index() turns it into the 'expense_variable' column.
corr.index.name = 'expense_variable'
corr = corr.reset_index()

In [None]:
# create a ferc_expenses data frame to merge with EIA data
# (repeats the binding made right after analysis.ferc_expenses() was called;
# ferc_expenses_df is the same object as ferc_expenses[1], so in-place edits to
# one are visible through the other)
ferc_expenses_df = ferc_expenses[1]

In [None]:
# Tag the FERC columns so they stay distinguishable from the EIA columns after
# the merge. Done in place, so ferc_expenses[1] sees the new names too.
ferc_expenses_df.rename(
    inplace=True,
    columns={
        # identification columns get a _ferc suffix
        'plant_name': 'plant_name_ferc',
        'respondent_id': 'respondent_id_ferc',
        'respondent_name': 'respondent_name_ferc',
        # plant-level quantities get a _plant suffix
        'net_generation_mwh': 'net_generation_mwh_plant',
        'total_capacity_mw': 'total_capacity_mw_plant',
    },
)

In [None]:
# Tag the EIA columns ahead of the merge with the FERC expenses: names get an
# _eia suffix, generator-level quantities a _generator suffix. Done in place.
mcoe_by_generator.rename(
    inplace=True,
    columns={
        'plant_name': 'plant_name_eia',
        'operator_name': 'operator_name_eia',
        'net_generation_mwh': 'net_generation_mwh_generator',
        'nameplate_capacity_mw': 'nameplate_capacity_mw_generator',
        'summer_capacity_mw': 'summer_capacity_mw_generator',
        'winter_capacity_mw': 'winter_capacity_mw_generator',
    },
)

In [None]:
# Left merge keeps all rows of mcoe_by_generator and duplicates values of ferc_expenses_df
# for each combination of plant_id_pudl and report_year.
# Both frames carry the year as report_year; the generator table has no
# report_date column, so joining on it would raise a KeyError.
merged = mcoe_by_generator.merge(ferc_expenses_df, how="left", on=['plant_id_pudl', 'report_year'])

In [None]:
# Remove the non-simple plants: rows with no FERC match come out of the left
# merge with plant_name_ferc = NA, so dropping those keeps matched rows only.
merged = merged.dropna(
    axis=0,
    how='any',
    subset=['plant_name_ferc'],
)

In [None]:
# Rearrange columns into the export order: EIA generator-level fields first,
# then the FERC plant-level fields.
# report_year is the only time column (the generator table has no report_date),
# so it takes the slot report_date used to hold and is listed once.
merged = merged[[
    'plant_id_pudl', 'plant_id_eia', 'plant_name_eia',
    'operator_name_eia', 'state', 'report_year',
    'generator_id', 'energy_source', 'fuel_cost_per_mmbtu_average',
    'fuel_consumed_mmbtu_per_gen', 'fuel_cost_annual_eia',
    'heat_rate_mmbtu_mwh', 'plant_assn',
    'net_generation_mwh_generator', 'net_generation_mwh_plant_eia',
    'fuel_cost_per_mwh', 'nameplate_capacity_mw_generator',
    'summer_capacity_mw_generator', 'winter_capacity_mw_generator', 'capacity_factor',
    'respondent_id_ferc', 'util_id_pudl',
    'respondent_name_ferc', 'plant_name_ferc',
    'total_capacity_mw_plant', 'year_constructed', 'year_installed',
    'peak_demand_mw', 'water_limited_mw', 'not_water_limited_mw',
    'plant_hours', 'net_generation_mwh_plant', 'expns_operations',
    'expns_fuel', 'expns_coolants', 'expns_steam', 'expns_steam_other',
    'expns_transfer', 'expns_electric', 'expns_misc_power', 'expns_rents',
    'expns_allowances', 'expns_engineering', 'expns_structures',
    'expns_boiler', 'expns_plants', 'expns_misc_steam',
    'expns_production_total', 'expns_per_mwh',
    'expns_total_nonfuel_production', 'expns_total_nonproduction']]

In [None]:
# Load the hand-written annotation and notes sheets that ship with the export.
# Both paths are relative to the notebook's working directory.
mcoe_annotations, mcoe_notes = (
    pd.read_csv(csv_path)
    for csv_path in ('mcoe_field_annotations.csv', 'mcoe_notes.csv')
)

In [None]:
# Export all the data to one workbook, one sheet per table.
# The context manager writes and closes the file when the block exits, so the
# file handle is released even if a to_excel() call raises. It also replaces
# the explicit ExcelWriter.save() call, which pandas 2.0 removed.
with pd.ExcelWriter('MCOE_by_generator_FERC&EIA_simple_FERC_plants_2011-2016.xlsx') as xlsx_writer:
    merged.to_excel(xlsx_writer, sheet_name='MCOE by Generator', index=False, na_rep='NA')
    corr.to_excel(xlsx_writer, sheet_name='Expense Correlations', index=False, na_rep='NA')
    mcoe_annotations.to_excel(xlsx_writer, sheet_name='MCOE Annotations', index=False)
    mcoe_notes.to_excel(xlsx_writer, sheet_name='MCOE Notes', index=False)

In [None]:
# Keep only rows where both the EIA and the FERC plant-level net generation
# are present, so the two series can be compared point by point.
merged2 = merged.dropna(
    how='any',
    subset=[
        'net_generation_mwh_plant_eia',
        'net_generation_mwh_plant',
    ],
)

In [None]:
# Log-log scatter of plant-level annual net generation, EIA 923 vs. FERC Form 1.
# NOTE(review): merged2 has one row per generator, so a plant with several
# generators contributes several identical points to the scatter and to r^2.
fig, (ax) = plt.subplots(ncols=1, nrows=1, figsize=(10, 10), dpi=150)
ax.loglog()
# Pass the flag positionally: the old `b=` keyword is rejected by current matplotlib.
plt.grid(True)
R2 = np.corrcoef(merged2.net_generation_mwh_plant_eia, merged2.net_generation_mwh_plant)[0,1]**2
ax.scatter(merged2.net_generation_mwh_plant_eia, merged2.net_generation_mwh_plant, s=10, color='blue')
# No legend: there is a single unlabelled series, so a legend call would only emit a warning.
plt.title("Annual net generation by plant, EIA 923 vs. FERC Form 1 ($r^2$={:.2f})".format(R2))
plt.xlabel("Net generation by PUDL plant from EIA923 (MWh)")
plt.ylabel("Net generation by PUDL plant from FERC1 (MWh)")
plt.show();

In [None]:
# Keep only rows where both the EIA annual fuel cost and the FERC fuel expense
# are present.
merged3 = merged.dropna(
    how='any',
    subset=[
        'fuel_cost_annual_eia',
        'expns_fuel',
    ],
)

In [None]:
# Log-log scatter of plant-level annual fuel cost, EIA 923 vs. FERC Form 1.
# NOTE(review): merged3 has one row per generator, so a plant with several
# generators contributes several identical points to the scatter and to r^2.
fig, (ax) = plt.subplots(ncols=1, nrows=1, figsize=(10, 10), dpi=150)
ax.loglog()
# Pass the flag positionally: the old `b=` keyword is rejected by current matplotlib.
plt.grid(True)
R2 = np.corrcoef(merged3.fuel_cost_annual_eia, merged3.expns_fuel)[0,1]**2
ax.scatter(merged3.fuel_cost_annual_eia, merged3.expns_fuel, s=10, color='green')
# No legend: there is a single unlabelled series, so a legend call would only emit a warning.
plt.title("Fuel cost per year, EIA 923 vs. FERC Form 1, ($r^2$={:.2f})".format(R2))
plt.xlabel("Fuel cost per year, by PUDL plant from EIA923 ($)")
plt.ylabel("Fuel cost per year, by PUDL plant from FERC1 ($)")
plt.show();

In [None]:
# Overlaid histograms of plant-level net generation: EIA (green) vs. FERC (blue).
# Values above 500,000 MWh fall outside the plotted range.
fig, ax = plt.subplots()
ax.hist(
    merged.net_generation_mwh_plant_eia,
    range=(0,500000), bins=100, alpha=0.5, color='green', label="EIA",
)
ax.hist(
    merged.net_generation_mwh_plant,
    range=(0,500000), bins=100, alpha=0.5, color='blue', label='FERC',
)
ax.set_xlabel("Total net generation per plant (MWh)")
ax.set_ylabel("Number of records")
ax.set_title("Net generation (MWh)")
ax.legend()

In [None]:
# Quick look at the spread in fuel cost per MWh for these plants, EIA vs. FERC.
# Each series gets a label so that plt.legend() has something to show; the
# colours follow the net generation histogram (green = EIA, blue = FERC).
plt.hist(merged.fuel_cost_per_mwh, range=(0,200), bins=200, alpha=0.5, color='green', label='EIA')
# FERC equivalent: plant fuel expenses divided by plant net generation.
ferc_fuel_cost_per_mwh = merged.expns_fuel / merged.net_generation_mwh_plant
plt.hist(ferc_fuel_cost_per_mwh, range=(0,200), bins=200, alpha=0.5, color='blue', label='FERC')
plt.xlabel("Fuel Cost ($/mwh)")
plt.ylabel("Number of records")
plt.title("Cost of fuel")
plt.legend()

## A few plots

In [None]:
# Distribution of the EIA generator heat rates, with each record weighted by
# its net generation. Heat rates outside 0-50 MMBtu/MWh are not plotted.
fig, ax = plt.subplots()
ax.hist(
    heat_rate.heat_rate_mmbtu_mwh,
    range=[0,50],
    bins=100,
    weights=heat_rate.net_generation_mwh,
    alpha=0.5,
)
ax.set_xlabel('Generator heat rate (MMBtu/MWh)')
ax.set_ylabel("Generator records (weighted by net generation)")
ax.set_title("Heat rates")
plt.show()

In [None]:
# Quick look at the spread in fuel cost per MMBtu across generator records.
plt.hist(mcoe_by_generator.fuel_cost_per_mmbtu_average, range=(-5,50), bins=200, alpha=0.5, color='green')
plt.xlabel("Fuel Cost ($/mmBTU)")
plt.ylabel("Number of generator records")
plt.title("Cost of fuel")
# Single unlabelled series: a legend call would draw nothing and only emit a
# warning, so the figure is simply shown.
plt.show()

In [None]:
# Quick look at the capacity factors across generator records. The range runs
# past 1.0 so that records with a capacity factor above 1 stay visible.
plt.hist(mcoe_by_generator.capacity_factor, range=(0,1.5), bins=200, alpha=0.5, color='blue')
plt.xlabel("Capacity factor (Net generation/Nameplate capacity)")
plt.ylabel("Number of generator records")
plt.title("Capacity factors")
# Single unlabelled series: a legend call would draw nothing and only emit a
# warning, so the figure is simply shown.
plt.show()

In [None]:
# Quick look at the spread in EIA fuel cost per MWh for the generator records
# of the simple FERC plants.
plt.hist(merged.fuel_cost_per_mwh, range=(0,200), bins=200, alpha=0.5, color='green')
plt.xlabel("Fuel Cost ($/mwh)")
plt.ylabel("Number of generator records")
plt.title("Cost of fuel")
# Single unlabelled series: a legend call would draw nothing and only emit a
# warning, so the figure is simply shown.
plt.show()