In [1]:
import sys
import os
import numpy as np
import pandas as pd
import sqlalchemy as sa
sys.path.append(os.path.abspath(os.path.join('..','..','..')))
from pudl import pudl, ferc1, eia923, settings, constants, analysis
from pudl import models, models_ferc1, models_eia923
from pudl import clean_eia923, clean_ferc1, clean_pudl
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
import matplotlib as mpl
pd.options.mode.chained_assignment = None
%matplotlib inline
plt.style.use('ggplot')
mpl.rcParams['figure.figsize'] = (10,6)
mpl.rcParams['figure.dpi'] = 150
pd.options.display.max_columns = 56

In [3]:
pudl_engine  = pudl.db_connect_pudl()

In [4]:
# Convert the generation_eia923 table into a dataframe
g9 = analysis.simple_select('generation_eia923', pudl_engine)

In [5]:
g9.head()

Unnamed: 0,id,plant_id,prime_mover,generator_id,report_date,net_generation_mwh
0,1,3,ST,1,2009-01-01,39699.0
1,2,3,ST,1,2009-02-01,5594.0
2,3,3,ST,1,2009-03-01,13015.0
3,4,3,ST,1,2009-04-01,15858.0
4,5,3,ST,1,2009-05-01,68232.0


In [15]:
# Convert the generation_eia923 table into a dataframe
g9 = analysis.simple_select('generation_eia923', pudl_engine)
# Get yearly net generation by plant_id, year and generator_id
g9_summed = analysis.yearly_sum_eia(g9 , 'net_generation_mwh')
g9_summed.reset_index(inplace=True)
g9_summed.rename(columns={'report_date':'report_year'},inplace=True) #take this away once different select is integrated

In [13]:
g9_summed.head()

Unnamed: 0,plant_id,report_date,generator_id,net_generation_mwh
0,3,2009,1,221908.0
1,3,2009,2,394031.0
2,3,2009,3,1286393.0
3,3,2009,4,1626547.0
4,3,2009,5,4513101.0


In [16]:
# Convert the generators_eia860 table into a dataframe
g8 = analysis.simple_select('generators_eia860', pudl_engine)

In [17]:
g8_es = g8[['plant_id','generator_id', 'energy_source_1','report_year']]
g8_es = g8_es.rename(columns={'energy_source_1': 'energy_source'})
g8_es.drop_duplicates(['plant_id','generator_id','report_year'], inplace=True)

In [18]:
#Create a count of the types of energy sources
g8_es_count = g8_es[['plant_id','energy_source','report_year']].drop_duplicates().groupby(['plant_id','report_year']).count()
g8_es_count.reset_index(inplace=True)
g8_es_count = g8_es_count.rename(columns={'energy_source':'energy_source_count'})
g8_es = g8_es.merge(g8_es_count,how='left',on=['plant_id','report_year'])

In [20]:
# Cheating to duplicate 2011 EIA860 energy srouce for 2010 and 2009:
g8_es_2010 = g8_es.loc[g8_es['report_year'] == 2011].copy()
g8_es_2010.report_year.replace([2011], [2010], inplace=True)
g8_es_2009 = g8_es.loc[g8_es['report_year'] == 2011].copy()
g8_es_2009.report_year.replace([2011], [2009], inplace=True)
g8_es_2016 = g8_es.loc[g8_es['report_year'] == 2015].copy()
g8_es_2016.report_year.replace([2015], [2016], inplace=True)
# Append 2009 and 2010
g8_es = g8_es.append([g8_es_2009, g8_es_2010, g8_es_2016])
#g8_es.reindex()

In [26]:
frc9.energy_source.drop_duplicates()

0          BIT
5           NG
32         DFO
56         SUB
75         LIG
164        RFO
171         OG
240         WO
313        KER
407         PC
556         WC
826         PG
1334        JF
3858       BFG
124145     SGP
283938    None
Name: energy_source, dtype: object

In [27]:
energy_source_map = {'coal': ['ANT', 'BIT', 'LIG', 'SUB', 'WC', 'RC'],
                     'gas': ['BFG','NG','OG','SGP', 'PG', 'SGC'],
                     'oil':['DFO', 'JF', 'KER', 'PC', 'RFO','WO'],
                     'solid_renewable' : ['AB', 'MSW', 'OBS', 'WDS'],
                     'biomass_solid':['OBL', 'SLW', 'BLQ', 'WDL'],
                     'biomass_gas':['LFG','OBG'],
                     'renewable_other':['SUN', 'WND','GEO', 'WAT'],
                     'other': ['NUC', 'PUR', 'WH', 'TDF', 'MWH', 'OTH','SG'] #wtf is SG?
                    }

In [28]:
g8_es['energy_source_cons'] = g8_es.energy_source.copy()

In [29]:
for energy_source_cons in energy_source_map.keys():
    for energy_source in energy_source_map[energy_source_cons]:
        g8_es.loc[g8_es.energy_source==energy_source, 'energy_source_cons'] = energy_source_cons

In [30]:
g8_es.energy_source_cons.drop_duplicates()

0                    coal
5         renewable_other
7                     oil
9                     gas
25        solid_renewable
30                  other
65            biomass_gas
1077        biomass_solid
107114               None
Name: energy_source_cons, dtype: object

In [22]:
# Convert the fuel_receipts_costs_eia923 table into a dataframe
frc9 = analysis.simple_select('fuel_receipts_costs_eia923', pudl_engine)
frc9['fuel_cost'] = (frc9['fuel_quantity'] * frc9['average_heat_content'] * frc9['fuel_cost_per_mmbtu'])
frc9['mmbtu'] =  (frc9['fuel_quantity'] * frc9['average_heat_content'])

In [34]:
for energy_source_cons in energy_source_map.keys():
    for energy_source in energy_source_map[energy_source_cons]:
        frc9.loc[frc9.energy_source==energy_source, 'energy_source_cons'] = energy_source_cons

In [23]:
# Get yearly fuel cost by plant_id, year and energy_source
frc9_summed = analysis.yearly_sum_eia(frc9 , 'fuel_cost', columns=['plant_id','report_date','energy_source'])
frc9_summed = frc9_summed.reset_index()
frc9_mmbtu_summed = analysis.yearly_sum_eia(frc9 , 'mmbtu', columns=['plant_id','report_date','energy_source'])
frc9_mmbtu_summed = frc9_mmbtu_summed.reset_index()
frc9_summed = frc9_mmbtu_summed.merge(frc9_summed)
frc9_summed['fuel_cost_per_mmbtu_average'] = (frc9_summed.fuel_cost/frc9_summed.mmbtu)

In [24]:
# Get yearly fuel cost by plant_id and year
# For use in calculating fuel cost for plants with one main energy soure
frc9_summed_plant = analysis.yearly_sum_eia(frc9 , 'fuel_cost', columns=['plant_id','report_date'])
frc9_summed_plant = frc9_summed_plant.reset_index()
frc9_mmbtu_summed_plant = analysis.yearly_sum_eia(frc9 , 'mmbtu', columns=['plant_id','report_date'])
frc9_mmbtu_summed_plant = frc9_mmbtu_summed_plant.reset_index()
frc9_summed_plant = frc9_mmbtu_summed_plant.merge(frc9_summed_plant)
frc9_summed_plant['fuel_cost_per_mmbtu_average'] = (frc9_summed_plant.fuel_cost/frc9_summed_plant.mmbtu)

Heat Rate Components

In [36]:
# Convert the boiler_generator_assn_eia860 table into a dataframe
bga8 = analysis.simple_select('boiler_generator_assn_eia860', pudl_engine)
bga8.drop(['id','operator_id'],axis=1, inplace=True)
bga8.drop_duplicates(['plant_id','boiler_id','generator_id'],inplace=True)

In [37]:
# Convert the boiler_fuel_eia923 table into a dataframe
bf9 = analysis.simple_select('boiler_fuel_eia923', pudl_engine)
bf9['fuel_consumed_mmbtu'] = bf9['fuel_qty_consumed']*bf9['fuel_mmbtu_per_unit']
# Get yearly fuel consumed by plant_id, year and boiler_id
bf9_summed = analysis.yearly_sum_eia(bf9 , 'fuel_consumed_mmbtu', columns=['plant_id','report_date','boiler_id'])
bf9_summed.reset_index(inplace=True)
# Get yearly fuel consumed by plant_id, year and boiler_id
bf9_plant_summed = analysis.yearly_sum_eia(bf9 , 'fuel_consumed_mmbtu', columns=['plant_id','report_date'])
bf9_plant_summed.reset_index(inplace=True)

In [38]:
heat_rate = analysis.heat_rate(bga8, g9_summed, bf9_summed, bf9_plant_summed, pudl_engine)

KeyError: 'report_date'

Guts inside the fuel cost formula

In [39]:
one_fuel_plants = g8_es[g8_es['energy_source_count'] == 1]

In [40]:
multi_fuel_plants = g8_es[g8_es['energy_source_count'] > 1]

In [41]:
multi_fuel_plants.count()

plant_id               34142
generator_id           34142
energy_source          34140
report_year            34142
energy_source_count    34142
energy_source_cons     34140
dtype: int64

One Fuel Plants

In [None]:
# Merge the generation table with the generator table to include energy_source
net_gen_one_fuel = g9_summed.merge(one_fuel_plants, how='left', on=[
                                   'plant_id', 'generator_id','report_date'])
net_gen_one_fuel.dropna(inplace=True)

In [None]:
net_gen_one_fuel.head()

In [None]:
fuel_cost_per_mmbtu = net_gen_one_fuel.merge(frc9_summed_plant,
                                    how='left',
                                    on=['plant_id',
                                        'report_date'])

In [None]:
frc9_summed_plant[frc9_summed_plant['plant_id'] == 7]

In [None]:
frc9_summed = frc9_summed.rename(columns={'energy_source':'energy_source_fuel'})
one_fuel_plants = one_fuel_plants.rename(columns={'energy_source':'energy_source_plant'})

In [None]:
mmbtu_one_fuel_es = frc9_summed.merge(one_fuel_plants.drop_duplicates(subset=['plant_id','energy_source_plant','report_date']),
                                    how='left',
                                    on=['plant_id',
                                        'report_date'])
# we'll merge with a total plant mmbtu, so we need to rename the per energy_source mmtbu
# we also need to drop all the NaN columns bc they are the multi fuel plants
# we also only need a few columns
mmbtu_one_fuel_es = mmbtu_one_fuel_es.rename(columns={'mmbtu': 'mmbtu_es'})\
                                     .dropna()\
                                     .drop(['fuel_cost',
                                            'fuel_cost_per_mmbtu_average',
                                            'generator_id',
                                            'energy_source_count'],axis=1)

In [None]:
mmbtu_one_fuel_es.head()

In [None]:
mmbtu_one_fuel_plant = frc9_summed_plant.merge(one_fuel_plants.drop_duplicates(subset=['plant_id','energy_source_plant','report_date']),
                                    how='left',
                                    on=['plant_id',
                                        'report_date'])
mmbtu_one_fuel_plant = mmbtu_one_fuel_plant.rename(columns={'mmbtu': 'mmbtu_plant'})\
                                           .dropna()\
                                           .drop(['fuel_cost',
                                                  'fuel_cost_per_mmbtu_average',
                                                  'generator_id',
                                                  'energy_source_count'],axis=1)

In [None]:
mmbtu_one_fuel_plant[mmbtu_one_fuel_plant['energy_source_plant'] == 'LIG']

In [None]:
mmbtu_one_fuel = mmbtu_one_fuel_es.merge(mmbtu_one_fuel_plant,how='left',on=['plant_id','report_date','energy_source_plant'])


In [None]:
mmbtu_one_fuel['mmbtu_propotion_es'] = mmbtu_one_fuel['mmbtu_es']/mmbtu_one_fuel['mmbtu_plant']

In [None]:
mmbtu_one_fuel[mmbtu_one_fuel['plant_id'] == 1379]

In [None]:
mmbtu_one_fuel[mmbtu_one_fuel['energy_source_fuel'] == 'SUB'].count()

In [None]:
mmbtu_one_fuel_bit = mmbtu_one_fuel[(mmbtu_one_fuel['energy_source_plant'] == 'BIT') & (mmbtu_one_fuel['energy_source_fuel'] == 'BIT')]

In [None]:
mmbtu_one_fuel_sub = mmbtu_one_fuel[(mmbtu_one_fuel['energy_source_plant'] == 'SUB') & 
                                    (mmbtu_one_fuel['energy_source_fuel'] == 'SUB')]

In [None]:
mmbtu_one_fuel_lig = mmbtu_one_fuel[(mmbtu_one_fuel['energy_source_plant'] == 'LIG') & 
                                    (mmbtu_one_fuel['energy_source_fuel'] == 'LIG')]

In [None]:
mmbtu_one_fuel_bit[mmbtu_one_fuel_bit['mmbtu_propotion_es'] < .8].drop_duplicates(subset=['plant_id'])

In [None]:
mmbtu_one_fuel_bit.describe()

In [None]:
mmbtu_one_fuel_sub.describe()

In [None]:
mmbtu_one_fuel_lig.describe()

In [None]:
plt.hist(mmbtu_one_fuel_bit.mmbtu_propotion_es, range=(.4,1), bins=100, alpha=0.5, color='green', label="BIT")
plt.hist(mmbtu_one_fuel_sub.mmbtu_propotion_es, range=(.4,1), bins=100, alpha=0.5, color='yellow', label="SUB")
plt.hist(mmbtu_one_fuel_lig.mmbtu_propotion_es, range=(.4,1), bins=100, alpha=0.5, color='blue', label="LIG")
plt.xlabel("Proportion")
plt.ylabel("Number of records")
plt.title("Proportion of Coal Receipts")
plt.legend()

Multi Fuel Plants

In [None]:
net_gen_mutli_fuel = g9_summed.merge(multi_fuel_plants, how='left', on=[
                                     'plant_id', 'generator_id','report_date'])
net_gen_mutli_fuel.dropna(inplace=True)

Problems...

In [None]:
# There are 1,376 generators out of 27,000 that don't have records in the generators table
# A chunk of these appear to be plants that are retired mid-way through a year
# this will be fixed when we pull in the retired tab into the generators table.
# A chunk of these appear to be genertors added after 2016. This should be fiexed
# when we pull in 2016 data.
test = g9_summed.merge(net_gen_one_fuel.append(net_gen_mutli_fuel),how='left')
test[test.energy_source.isnull()].count()

In [None]:
#simple plants

In [None]:
net_gen_one_fuel.head()

In [None]:
frc9_summed

In [None]:
# Merge this net_gen table with frc9_summed to have
# fuel_cost_per_mmbtu_total associated with generators
fuel_cost_per_mmbtu = net_gen_one_fuel.merge(frc9_summed,
                                             how='left',
                                             on=['plant_id',
                                                'report_date'])
        #                                        'energy_source'])

In [None]:
fuel_cost_per_mmbtu[(fuel_cost_per_mmbtu['plant_id'] == 3644) & (fuel_cost_per_mmbtu['report_date'] == 2014)]

In [None]:
fuel_cost = fuel_cost_per_mmbtu.merge(heat_rate[['plant_id',
                                                 'report_date',
                                                 'generator_id',
                                                 'net_generation_mwh',
                                                 'heat_rate_mmbtu_mwh']],
                                      on=['plant_id',
                                          'report_date',
                                          'generator_id',
                                          'net_generation_mwh'])

In [None]:
# Calculate fuel cost per mwh using average fuel cost given year, plant,
# fuel type; divide by generator-specific heat rate
fuel_cost['fuel_cost_per_mwh'] = (fuel_cost['fuel_cost_per_mmbtu_average']
                                  * fuel_cost['heat_rate_mmbtu_mwh'])

In [None]:
fuel_cost[(fuel_cost['plant_id'] == 3644) & (fuel_cost['report_date'] == 2014)]

In [None]:
net_gen = g9_summed.merge(g8_es, how='left', on=[
                          'plant_id', 'generator_id'])
# Merge this net_gen table with frc9_summed to have
# fuel_cost_per_mmbtu_total associated with generators
fuel_cost_per_mmbtu = net_gen.merge(frc9_summed,
                                    how='left',
                                    on=['plant_id',
                                        'report_date',
                                        'energy_source'])

fuel_cost = fuel_cost_per_mmbtu.merge(heat_rate[['plant_id',
                                                 'report_date',
                                                 'generator_id',
                                                 'net_generation_mwh',
                                                 'heat_rate_mmbtu_mwh']],
                                      on=['plant_id',
                                          'report_date',
                                          'generator_id',
                                          'net_generation_mwh'])

# Calculate fuel cost per mwh using average fuel cost given year, plant,
# fuel type; divide by generator-specific heat rate
fuel_cost['fuel_cost_per_mwh'] = (fuel_cost['fuel_cost_per_mmbtu_average']
                                  * fuel_cost['heat_rate_mmbtu_mwh'])

In [None]:
frc9_summed[(frc9_summed['plant_id'] == 3) & (frc9_summed['report_date'] == 2015)]

In [None]:
fuel_cost_per_mwh = net_gen.merge(frc9_summed,how='left',on=['plant_id','report_date','energy_source'])
fuel_cost_per_mwh['fuel_cost_per_mwh'] = fuel_cost_per_mwh['fuel_cost'] / fuel_cost_per_mwh['net_generation_mwh']

In [None]:
fuel_cost_per_mwh[(fuel_cost_per_mwh['plant_id'] == 3) & (fuel_cost_per_mwh['report_date'] == 2015)]