## EIA and FERC Annual Fuel Cost Correlations

In [1]:
import sys
import os
sys.path.append(os.path.abspath(os.path.join('..','..')))
from pudl import pudl, ferc1, eia923, settings, constants
from pudl import models, models_ferc1, models_eia923
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sqlalchemy as sa
from sqlalchemy import and_, tuple_
%matplotlib inline

In [2]:
# Connect to the PUDL database; the returned object is bound to ORM sessions
# and passed to pd.read_sql in the cells below.
pudl_engine = pudl.db_connect_pudl()

## Get a list of all EIA plants' EIA plant_ids and PUDL plant_ids

* Find all the FERC respondent IDs
* Use the list of FERC respondent IDs to look up all the associated PUDL plant IDs
* Find all the EIA plant IDs from the list of FERC respondent IDs
* Find all the PUDL IDs associated with the EIA plants
* Then we can look up all of the PUDL plant_id values associated with that utility_id
* Finally we look up all of the EIA 923 plant_id values associated with those PUDL plant_ids, along with the FERC respondent IDs and plant names (and a list of (respondent ID, plant name) tuples)

In [4]:
# Open an ORM session bound to the PUDL database for the lookups below.
Session = sa.orm.sessionmaker()
Session.configure(bind = pudl_engine)
session = Session()

# All FERC respondents, then the PUDL plant ID of every FERC plant they report.
ferc_respondent_ids = [u.respondent_id for u in session.query(models.UtilityFERC1).all()]
ferc_pudl_plant_ids = [u.plant_id_pudl for u in session.query(models.PlantFERC1).\
                                                    filter(models.PlantFERC1.respondent_id.in_(ferc_respondent_ids))]
# Rebind ferc_respondent_ids to one respondent ID per FERC plant row that
# carries one of those PUDL plant IDs.
ferc_respondent_ids = [u.respondent_id for u in session.query(models.PlantFERC1).\
                                                    filter(models.PlantFERC1.plant_id_pudl.in_(ferc_pudl_plant_ids))]

# EIA 923 plants that share a PUDL plant ID with a FERC plant, and the PUDL
# plant IDs of those EIA plants.
eia_plant_ids = [p.plant_id for p in session.query(models.PlantEIA923).\
                                                    filter(models.PlantEIA923.plant_id_pudl.in_(ferc_pudl_plant_ids))]
shared_pudl_plant_ids = [p.plant_id_pudl for p in session.query(models.PlantEIA923).\
                                                    filter(models.PlantEIA923.plant_id.in_(eia_plant_ids))]

# Fetch the FERC plants behind the shared PUDL IDs once and derive all three
# lists from that single result, so they are guaranteed to line up row for row
# (three separate un-ordered queries give no such guarantee).
shared_ferc_plants = session.query(models.PlantFERC1).\
                                                     filter(models.PlantFERC1.\
                                                            plant_id_pudl.in_(shared_pudl_plant_ids)).all()

shared_rids_names = [(p.respondent_id, p.plant_name) for p in shared_ferc_plants]
ferc_plant_rids = [rid for rid, _ in shared_rids_names]
ferc_plant_names = [name for _, name in shared_rids_names]

ProgrammingError: (psycopg2.ProgrammingError) relation "utilities_ferc1" does not exist
LINE 2: FROM utilities_ferc1
             ^
 [SQL: 'SELECT utilities_ferc1.respondent_id AS utilities_ferc1_respondent_id, utilities_ferc1.respondent_name AS utilities_ferc1_respondent_name, utilities_ferc1.util_id_pudl AS utilities_ferc1_util_id_pudl \nFROM utilities_ferc1']

We'll connect to the database and pull the necessary fuel quantity and cost information from the EIA 923 fuel receipts and costs table, as well as the PUDL ID and the plant name from the EIA923 plants table.

In [9]:
# A session is created here as in the other query cells; the read itself goes
# through pudl_engine directly.
Session = sa.orm.sessionmaker(bind=pudl_engine)
session = Session()

pudl_tables = models.PUDLBase.metadata.tables
frc_table = pudl_tables['fuel_receipts_costs_eia923']
plants_eia923_tbl = pudl_tables['plants_eia923']

# Fuel deliveries for the EIA plants found above, joined to the plants table
# to pick up each plant's name and PUDL plant ID.
frc_columns = [
    frc_table.c.plant_id,
    plants_eia923_tbl.c.plant_name,
    plants_eia923_tbl.c.plant_id_pudl,
    frc_table.c.fuel_quantity,
    frc_table.c.average_heat_content,
    frc_table.c.report_date,
    frc_table.c.fuel_cost,
]
frc_select = (
    sa.sql.select(frc_columns)
    .where(frc_table.c.plant_id.in_(eia_plant_ids))
    .where(frc_table.c.plant_id == plants_eia923_tbl.c.plant_id)
)

frc_df = pd.read_sql(frc_select, pudl_engine)

We'll make a handful of adjustments to the dataframe, calculating the total cost of each delivery and making the report date easier to work with.

In [10]:
# Total cost of each delivery: unit cost x quantity x heat content x .01.
# NOTE(review): the .01 factor presumably converts cents to dollars — confirm units.
cost_times_quantity = frc_df['fuel_cost'] * frc_df['fuel_quantity']
frc_df['total_fuel_cost'] = cost_times_quantity * frc_df['average_heat_content'] * .01

# Parse the report date once, derive the year from it, and index the frame by date.
report_dates = pd.to_datetime(frc_df['report_date'])
frc_df['report_date'] = report_dates
frc_df['year'] = report_dates.dt.year
frc_df.index = frc_df['report_date']

In [11]:
# Display the receipts table with the derived cost, date and year columns.
frc_df

Unnamed: 0_level_0,plant_id,plant_name,plant_id_pudl,fuel_quantity,average_heat_content,report_date,fuel_cost,total_fuel_cost,year
report_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2009-01-01,3,Barry,32,120393.0,24.000,2009-01-01,631.1,1.823521e+07,2009
2009-01-01,3,Barry,32,199388.0,23.000,2009-01-01,350.3,1.606449e+07,2009
2009-01-01,3,Barry,32,43105.0,22.785,2009-01-01,355.7,3.493498e+06,2009
2009-01-01,3,Barry,32,9458.0,23.790,2009-01-01,498.0,1.120529e+06,2009
2009-01-01,3,Barry,32,9094.0,24.000,2009-01-01,629.0,1.372830e+06,2009
2009-01-01,3,Barry,32,1902799.0,1.036,2009-01-01,680.9,1.342258e+07,2009
2009-01-01,3,Barry,32,28469.0,1.045,2009-01-01,568.0,1.689806e+05,2009
2009-01-01,7,Gadsden,203,21205.0,24.908,2009-01-01,397.6,2.100020e+06,2009
2009-01-01,7,Gadsden,203,3189.0,1.014,2009-01-01,638.1,2.063390e+04,2009
2009-01-01,7,Gadsden,203,11.0,1.009,2009-01-01,612.1,6.793698e+01,2009


In [None]:
# Total delivered fuel cost per PUDL plant (rows) and year (columns).
# The cost column is selected before summing so that the text and date columns
# of frc_df are never passed to the aggregation.
test_df = frc_df.groupby(['plant_id_pudl','year'])['total_fuel_cost'].sum().unstack()
test_df

Grouping data by PUDL ID and year, we can calculate the total annual fuel cost for each plant using the EIA 923 data.

In [None]:
# Broadcast each plant-year's summed delivery cost back onto every delivery row.
# The 'sum' alias replaces the Python builtin `sum` callable, which pandas only
# accepts by silently remapping it to its own groupby sum.
frc_df['annual_fuel_cost'] = frc_df.groupby(['plant_id_pudl','year'])['total_fuel_cost'].transform('sum')

In [12]:
# Display the receipts table again to inspect it after the annual-cost step.
frc_df

Unnamed: 0_level_0,plant_id,plant_name,plant_id_pudl,fuel_quantity,average_heat_content,report_date,fuel_cost,total_fuel_cost,year
report_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2009-01-01,3,Barry,32,120393.0,24.000,2009-01-01,631.1,1.823521e+07,2009
2009-01-01,3,Barry,32,199388.0,23.000,2009-01-01,350.3,1.606449e+07,2009
2009-01-01,3,Barry,32,43105.0,22.785,2009-01-01,355.7,3.493498e+06,2009
2009-01-01,3,Barry,32,9458.0,23.790,2009-01-01,498.0,1.120529e+06,2009
2009-01-01,3,Barry,32,9094.0,24.000,2009-01-01,629.0,1.372830e+06,2009
2009-01-01,3,Barry,32,1902799.0,1.036,2009-01-01,680.9,1.342258e+07,2009
2009-01-01,3,Barry,32,28469.0,1.045,2009-01-01,568.0,1.689806e+05,2009
2009-01-01,7,Gadsden,203,21205.0,24.908,2009-01-01,397.6,2.100020e+06,2009
2009-01-01,7,Gadsden,203,3189.0,1.014,2009-01-01,638.1,2.063390e+04,2009
2009-01-01,7,Gadsden,203,11.0,1.009,2009-01-01,612.1,6.793698e+01,2009


We'll drop duplicates, leaving one entry per plant, per year.

In [None]:
# Keep the first delivery row of each plant-year. The per-delivery columns of
# the surviving row are arbitrary; only the plant-year level values are meaningful.
frc_df = frc_df.drop_duplicates(subset=['plant_id_pudl', 'year'], keep='first')

We'll do a similar exercise with FERC Form 1, pulling fuel expense data from the Steam table.

In [None]:
Session = sa.orm.sessionmaker()
Session.configure(bind = pudl_engine)
session = Session()

steam_table = models.PUDLBase.metadata.tables['plants_steam_ferc1']
plants_ferc1_tbl = models.PUDLBase.metadata.tables['plants_ferc1']

# Steam plant records joined to plants_ferc1 to pick up the PUDL plant ID.
# The join matches on BOTH respondent_id and plant_name (as the fuel_ferc1
# query in fuel_type_assigner does): matching on plant_name alone pairs a
# steam record with every same-named plant of any other respondent, which
# duplicates rows and can attach the wrong plant_id_pudl.
steam_select = sa.sql.select([steam_table.c.respondent_id,
                            steam_table.c.plant_name,
                            steam_table.c.total_capacity_mw,
                            steam_table.c.report_year,
                            steam_table.c.expns_fuel,
                            plants_ferc1_tbl.c.plant_id_pudl]).\
                            where(steam_table.c.respondent_id == plants_ferc1_tbl.c.respondent_id).\
                            where(steam_table.c.plant_name == plants_ferc1_tbl.c.plant_name)

steam_df = pd.read_sql(steam_select, pudl_engine)

We'll generate a list of tuples containing the identifying information for a FERC plant, the respondent ID and plant name.

In [None]:
# Tag each steam record with its (respondent_id, plant_name) pair.
steam_df['rid_name'] = list(zip(steam_df.respondent_id,steam_df.plant_name))
# Keep only plants shared with EIA, one record per plant and report year.
# drop_duplicates is chained (not inplace) so it never operates in place on a
# filtered slice, which triggers SettingWithCopyWarning.
steam_df = steam_df[steam_df.rid_name.isin(shared_rids_names)].\
    drop_duplicates(subset=['rid_name','report_year'])

There are a few things we can do now. One thing would be to group fuel expenses by respondent_id and name, which will give us an idea of the fuel expenses per plant each year as reported by each utility individually.

In [None]:
# Fuel expense and capacity per PUDL plant, respondent and plant name (rows),
# with one column per report year. The two numeric columns are selected before
# summing so that non-numeric columns (e.g. the rid_name tuples) are never aggregated.
rid_df = steam_df.groupby(['plant_id_pudl','respondent_id','plant_name','report_year'])\
[['expns_fuel','total_capacity_mw']].sum().unstack()
rid_df

Alternatively we can group by pudl_id and year, which will give us an idea of the fuel expenses per PUDL plant - one or more FERC plants co-located - each year. Since there may be some duplication in the co-located data, where multiple FERC respondents are reporting the same fuel expenses, it might be useful to break out the data by respondent_id and plant name and then compare that broken out data to the EIA fuel expense data.

In [None]:
# Fuel expense and capacity per PUDL plant (rows), one column per report year.
# The numeric columns are selected before summing so that text and tuple
# columns are never passed to the aggregation.
pudl_id_df = steam_df.groupby(['plant_id_pudl','report_year'])[['expns_fuel','total_capacity_mw']].sum().unstack()
pudl_id_df

Merge the two dataframes.

In [None]:
# Attach the per-PUDL-plant totals to the per-respondent table by joining on
# the index. NOTE(review): both frames carry the same (column, year) labels, so
# pandas has to disambiguate them — confirm the resulting column names.
m = rid_df.merge(pudl_id_df,how='left',left_index=True,right_index=True)
m

Or, the original way we did it. We'll then group again by year and by PUDL ID to get an annual sum of fuel expenses.

In [None]:
# Broadcast each PUDL plant-year's summed FERC fuel expense onto every record.
# The 'sum' alias replaces the Python builtin `sum` callable, which pandas only
# accepts by silently remapping it to its own groupby sum.
steam_df['ferc_fuel_sums'] = steam_df.groupby(['plant_id_pudl','report_year'])['expns_fuel'].transform('sum')

A left merge of the FERC Form 1 records onto the EIA 923 records, on PUDL ID and year, leaves some records with no EIA 923 total fuel cost. We keep only records where both the FERC fuel expense and the EIA fuel cost are positive, and drop any remaining NaN values.

In [None]:
# Left-join the FERC steam records to the EIA annual costs on PUDL plant and year.
merged_df = steam_df.merge(frc_df,how="left",left_on=["plant_id_pudl","report_year"],right_on=['plant_id_pudl','year'])
# Keep plant-years with a positive fuel cost from both sources. dropna is
# chained (not inplace) so it never mutates a filtered slice in place, which
# triggers SettingWithCopyWarning.
merged_df = merged_df[(merged_df.ferc_fuel_sums > 0) & (merged_df.annual_fuel_cost > 0)].\
    dropna(subset=['annual_fuel_cost','ferc_fuel_sums'])

We're now in a position to plot the data and see how well the FERC and EIA annual fuel costs correlate.

In [None]:
# Scatter of FERC annual fuel expenses against EIA annual delivered fuel cost.
plt.figure(figsize=(14,10))
plt.scatter(merged_df.ferc_fuel_sums,merged_df.annual_fuel_cost)
plt.xlabel('FERC annual fuel expenses (dollars)')
plt.ylabel('EIA fuel receipts and costs, annual cost of fuel delivered (dollars)')
# Title spelling corrected ("comparsion" -> "comparison").
plt.title('FERC and EIA: Annual fuel cost comparison')

In [None]:
# Pairwise correlations between the numeric columns only. merged_df also holds
# text, tuple and date columns, which corr() cannot correlate.
merged_df.select_dtypes(include=[np.number]).corr()

In [None]:
# Display the merged FERC / EIA table.
merged_df

We have a dataframe with PUDL IDs 'plant_id_pudl' and years 'year'.

In [None]:
def _dominant_fuel_type(mmbtu_by_fuel, total_mmbtu, fuel_labels, threshold):
    """Pick the plant type whose fuel share of total heat content exceeds a threshold.

    Args:
        mmbtu_by_fuel: pandas Series of summed mmBTU, indexed by fuel name.
        total_mmbtu: total mmBTU over all records of the plant-year.
        fuel_labels: ordered (fuel name, plant type) pairs; the first pair
            whose share exceeds the threshold wins.
        threshold: minimum share, in percent.

    Returns:
        The winning plant type label, or 'NA' when no fuel exceeds the threshold.
    """
    for fuel_name, plant_type in fuel_labels:
        if fuel_name in mmbtu_by_fuel.index:
            share = mmbtu_by_fuel[fuel_name] / total_mmbtu * 100
        else:
            # A fuel with no records for this plant-year contributes nothing.
            share = 0
        if share > threshold:
            return plant_type
    return 'NA'


def fuel_type_assigner(plant_id_pudls, years, threshold):
    """Classify plant-years as coal, gas or oil from EIA 923 and FERC Form 1 data.

    For each (PUDL plant ID, year) pair that appears in the FERC Form 1 fuel
    data, the share of heat content per fuel is computed separately from EIA
    923 fuel receipts (mmBTU delivered) and from FERC Form 1 (mmBTU burned).
    A source labels the plant 'coal', 'gas' or 'oil' when that fuel's share
    exceeds `threshold` (checked in that order), otherwise 'NA'. The combined
    label is the shared label when both sources agree, otherwise 'NA'.

    Reads from the database through the notebook-level `pudl_engine`.

    Args:
        plant_id_pudls: iterable of PUDL plant IDs.
        years: iterable of report years, paired element-wise with plant_id_pudls.
        threshold: minimum share of total heat content, in percent.

    Returns:
        A DataFrame with columns plant_id_pudl, report_year, eia_plant_type,
        ferc_plant_type and combined_plant_type, one row per input pair found
        in the FERC data. It is empty (columns only) when no pair matches; the
        previous implementation raised an unbound-variable error in that case.
    """
    frc_table = models.PUDLBase.metadata.tables['fuel_receipts_costs_eia923']
    plants_eia923_tbl = models.PUDLBase.metadata.tables['plants_eia923']

    frc_select = sa.sql.select([frc_table.c.plant_id,
                            plants_eia923_tbl.c.plant_name,
                            plants_eia923_tbl.c.plant_id_pudl,
                            frc_table.c.report_date,
                            frc_table.c.fuel_group,
                            frc_table.c.fuel_quantity,
                            frc_table.c.average_heat_content,
                            frc_table.c.fuel_cost]).\
                            where(frc_table.c.plant_id == plants_eia923_tbl.c.plant_id)

    f1_fuel_table = models.PUDLBase.metadata.tables['fuel_ferc1']
    plants_ferc1_tbl = models.PUDLBase.metadata.tables['plants_ferc1']

    f1_select = sa.sql.select([plants_ferc1_tbl.c.plant_name,
                            plants_ferc1_tbl.c.plant_id_pudl,
                            f1_fuel_table.c.report_year,
                            f1_fuel_table.c.fuel,
                            f1_fuel_table.c.fuel_qty_burned,
                            f1_fuel_table.c.fuel_avg_mmbtu_per_unit,
                            f1_fuel_table.c.fuel_cost_per_unit_burned]).\
                            where(f1_fuel_table.c.respondent_id == plants_ferc1_tbl.c.respondent_id).\
                            where(f1_fuel_table.c.plant_name == plants_ferc1_tbl.c.plant_name)

    frc_df = pd.read_sql(frc_select, pudl_engine)
    f1_df = pd.read_sql(f1_select, pudl_engine)

    # Heat content per record, computed once for the whole table.
    frc_df['report_date'] = pd.to_datetime(frc_df['report_date'])
    frc_df['year'] = frc_df['report_date'].dt.year
    frc_df['mmbtu_delivered'] = frc_df['fuel_quantity'] * frc_df['average_heat_content']
    f1_df['mmbtu_burned'] = f1_df['fuel_qty_burned'] * f1_df['fuel_avg_mmbtu_per_unit']

    # Built once so the per-pair membership test does not rescan the FERC table.
    ferc_plant_years = set(zip(f1_df.plant_id_pudl, f1_df.report_year))

    # (fuel name as stored in the source, plant type label), in priority order.
    eia_fuel_labels = (('Coal', 'coal'), ('Natural Gas', 'gas'), ('Petroleum', 'oil'))
    ferc_fuel_labels = (('coal', 'coal'), ('gas', 'gas'), ('oil', 'oil'))

    records = []
    for plant_id_pudl, year in zip(plant_id_pudls, years):
        if (plant_id_pudl, year) not in ferc_plant_years:
            continue

        eia_rows = frc_df[(frc_df.plant_id_pudl == plant_id_pudl) & (frc_df.year == year)]
        eia_plant_type = _dominant_fuel_type(
            eia_rows.groupby('fuel_group')['mmbtu_delivered'].sum(),
            eia_rows['mmbtu_delivered'].sum(),
            eia_fuel_labels, threshold)

        ferc_rows = f1_df[(f1_df.plant_id_pudl == plant_id_pudl) & (f1_df.report_year == year)]
        ferc_plant_type = _dominant_fuel_type(
            ferc_rows.groupby('fuel')['mmbtu_burned'].sum(),
            ferc_rows['mmbtu_burned'].sum(),
            ferc_fuel_labels, threshold)

        # The sources agree only when they carry the same label ('NA' == 'NA' stays 'NA').
        if eia_plant_type == ferc_plant_type:
            combined_plant_type = eia_plant_type
        else:
            combined_plant_type = 'NA'

        records.append((plant_id_pudl, year, eia_plant_type,
                        ferc_plant_type, combined_plant_type))

    # Build the result once, after the loop, so it exists even with no matches.
    return pd.DataFrame(records, columns=['plant_id_pudl', 'report_year', 'eia_plant_type',
                                          'ferc_plant_type', 'combined_plant_type'])

In [None]:
def revised_fuel_type_assigner(threshold):
    """Classify every plant-year as coal, gas or oil using vectorised group sums.

    Heat content is summed per PUDL plant, year and fuel for both EIA 923 fuel
    receipts (mmBTU delivered) and FERC Form 1 (mmBTU burned). Each source
    labels a plant-year 'coal', 'gas' or 'oil' when that fuel's share of the
    total exceeds `threshold` (checked in that order), otherwise 'NA'.

    Reads from the database through the notebook-level `pudl_engine`.

    Args:
        threshold: minimum share of total heat content, in percent.

    Returns:
        A DataFrame with one row per EIA plant-year, left-joined to the FERC
        labels, with columns plant_id_pudl, year, eia_fuel_type, report_year,
        ferc_fuel_type and combined. `combined` holds the shared label where
        both sources agree on coal, gas or oil and is left missing otherwise.
    """
    frc_table = models.PUDLBase.metadata.tables['fuel_receipts_costs_eia923']
    plants_eia923_tbl = models.PUDLBase.metadata.tables['plants_eia923']

    frc_select = sa.sql.select([frc_table.c.plant_id,
                            plants_eia923_tbl.c.plant_name,
                            plants_eia923_tbl.c.plant_id_pudl,
                            frc_table.c.report_date,
                            frc_table.c.fuel_group,
                            frc_table.c.fuel_quantity,
                            frc_table.c.average_heat_content,
                            frc_table.c.fuel_cost]).\
                            where(frc_table.c.plant_id == plants_eia923_tbl.c.plant_id)

    f1_fuel_table = models.PUDLBase.metadata.tables['fuel_ferc1']
    plants_ferc1_tbl = models.PUDLBase.metadata.tables['plants_ferc1']

    f1_select = sa.sql.select([plants_ferc1_tbl.c.plant_name,
                            plants_ferc1_tbl.c.plant_id_pudl,
                            f1_fuel_table.c.report_year,
                            f1_fuel_table.c.fuel,
                            f1_fuel_table.c.fuel_qty_burned,
                            f1_fuel_table.c.fuel_avg_mmbtu_per_unit,
                            f1_fuel_table.c.fuel_cost_per_unit_burned]).\
                            where(f1_fuel_table.c.respondent_id == plants_ferc1_tbl.c.respondent_id).\
                            where(f1_fuel_table.c.plant_name == plants_ferc1_tbl.c.plant_name)

    frc_df = pd.read_sql(frc_select, pudl_engine)
    f1_df = pd.read_sql(f1_select, pudl_engine)

    frc_df['report_date'] = pd.to_datetime(frc_df['report_date'])
    frc_df['year'] = frc_df['report_date'].dt.year
    frc_df['mmbtu_delivered'] = frc_df['fuel_quantity'] * frc_df['average_heat_content']

    # EIA: mmBTU per plant-year (rows) and fuel group (columns). The column is
    # selected before summing so text and date columns are never aggregated.
    group_1 = frc_df.groupby(['plant_id_pudl','year','fuel_group'])['mmbtu_delivered'].sum().unstack()
    group_1['eia_mmbtu_sum'] = group_1.sum(axis=1)
    group_1['eia_coal_percent'] = group_1['Coal'] / group_1['eia_mmbtu_sum'] * 100
    group_1['eia_gas_percent'] = group_1['Natural Gas'] / group_1['eia_mmbtu_sum'] * 100
    group_1['eia_oil_percent'] = group_1['Petroleum'] / group_1['eia_mmbtu_sum'] * 100

    # First fuel over the threshold wins, in the order coal, gas, oil. A fuel
    # missing for a plant-year is NaN after unstack and never passes the test.
    group_1['eia_fuel_type'] = np.where(group_1['eia_coal_percent'] > threshold, 'coal',\
                                         (np.where(group_1['eia_gas_percent'] > threshold, 'gas',\
                                            (np.where(group_1['eia_oil_percent'] > threshold, 'oil','NA')))))
    eia_fuel_types = pd.DataFrame(group_1['eia_fuel_type']).reset_index()

    # FERC: same computation on mmBTU burned; the column is assigned directly
    # instead of being built as a separate frame and merged back on the index.
    f1_df['mmbtu_burned'] = f1_df['fuel_qty_burned'] * f1_df['fuel_avg_mmbtu_per_unit']

    group_2 = f1_df.groupby(['plant_id_pudl','report_year','fuel'])['mmbtu_burned'].sum().unstack()
    group_2['ferc_mmbtu_sum'] = group_2.sum(axis=1)
    group_2['ferc_coal_percent'] = group_2['coal'] / group_2['ferc_mmbtu_sum'] * 100
    group_2['ferc_gas_percent'] = group_2['gas'] / group_2['ferc_mmbtu_sum'] * 100
    group_2['ferc_oil_percent'] = group_2['oil'] / group_2['ferc_mmbtu_sum'] * 100

    group_2['ferc_fuel_type'] = np.where(group_2['ferc_coal_percent'] > threshold, 'coal',\
                                         (np.where(group_2['ferc_gas_percent'] > threshold, 'gas',\
                                            (np.where(group_2['ferc_oil_percent'] > threshold, 'oil','NA')))))

    ferc_fuel_types = pd.DataFrame(group_2['ferc_fuel_type']).reset_index()

    # Keep every EIA plant-year; FERC columns are missing where FERC has no match.
    ft_df = eia_fuel_types.merge(ferc_fuel_types,how='left',left_on=['plant_id_pudl','year'],\
                         right_on=['plant_id_pudl','report_year'])
    # 'combined' is only filled where both sources agree on a fuel.
    ft_df.loc[(ft_df['eia_fuel_type'] == 'coal') & (ft_df['ferc_fuel_type'] == 'coal'), 'combined'] = 'coal'
    ft_df.loc[(ft_df['eia_fuel_type'] == 'gas') & (ft_df['ferc_fuel_type'] == 'gas'), 'combined'] = 'gas'
    ft_df.loc[(ft_df['eia_fuel_type'] == 'oil') & (ft_df['ferc_fuel_type'] == 'oil'), 'combined'] = 'oil'

    return ft_df

In [None]:
# Classify every plant-year, passing 67 as the percent-of-heat-content threshold.
ft_df = revised_fuel_type_assigner(67)
# drop_duplicates() returns a new frame that is only displayed here; ft_df itself is not modified.
ft_df.drop_duplicates()

In [None]:
# A session is created as in the other query cells; the read goes through pudl_engine.
Session = sa.orm.sessionmaker(bind=pudl_engine)
session = Session()

# ID and name of every row in the utilities table.
utility_info = models.PUDLBase.metadata.tables['utilities']
utility_select = sa.sql.select([utility_info.c.id, utility_info.c.name])

utility_df = pd.read_sql(utility_select, pudl_engine)
utility_df

In [None]:
# A session is created as in the other query cells; the read goes through pudl_engine.
Session = sa.orm.sessionmaker(bind=pudl_engine)
session = Session()

# (utility_id, plant_id) pairs from the utility-plant association table.
plant_assn = models.PUDLBase.metadata.tables['util_plant_assn']
plant_assn_select = sa.sql.select([plant_assn.c.utility_id, plant_assn.c.plant_id])

plant_assn_df = pd.read_sql(plant_assn_select, pudl_engine)
plant_assn_df

In [None]:
Session = sa.orm.sessionmaker()
Session.configure(bind = pudl_engine)
session = Session()

plant_info = models.PUDLBase.metadata.tables['plant_info_eia923']
plants_eia923_tbl = models.PUDLBase.metadata.tables['plants_eia923']

# EIA nameplate capacity per plant, with the plant's name and PUDL plant ID.
eia_capacity_select = sa.sql.select([plant_info.c.nameplate_capacity_mw,
                                    plant_info.c.plant_id,
                                    plants_eia923_tbl.c.plant_name,
                                    plants_eia923_tbl.c.plant_id_pudl,
                                    ]).\
                                        where(plant_info.c.plant_id == plants_eia923_tbl.c.plant_id)

eia_capacity = pd.read_sql(eia_capacity_select, pudl_engine)


steam_table = models.PUDLBase.metadata.tables['plants_steam_ferc1']
plants_ferc1_tbl = models.PUDLBase.metadata.tables['plants_ferc1']

# FERC steam plant capacity joined to plants_ferc1 for the PUDL plant ID.
# The join matches on BOTH respondent_id and plant_name: matching on
# plant_name alone pairs a steam record with every same-named plant of any
# other respondent, duplicating rows and mis-assigning plant_id_pudl.
steam_select = sa.sql.select([steam_table.c.respondent_id,
                            steam_table.c.plant_name,
                            steam_table.c.total_capacity_mw,
                            steam_table.c.report_year,
                            plants_ferc1_tbl.c.plant_id_pudl]).\
                            where(steam_table.c.respondent_id == plants_ferc1_tbl.c.respondent_id).\
                            where(steam_table.c.plant_name == plants_ferc1_tbl.c.plant_name)

ferc_capacity = pd.read_sql(steam_select,pudl_engine)

# The session created at the top of this cell is reused; it was previously
# created a second time here.
plant_assn = models.PUDLBase.metadata.tables['util_plant_assn']

plant_assn_select = sa.sql.select([plant_assn.c.utility_id,
                                 plant_assn.c.plant_id])

plant_assn_df = pd.read_sql(plant_assn_select, pudl_engine)


# Displayed only: the merged frame is not assigned in this cell.
ferc_capacity.merge(plant_assn_df,how='left',left_on=['plant_id_pudl'],right_on=['plant_id'])

In [None]:
# The single FERC report year compared against EIA nameplate capacity.
CAPACITY_YEAR = 2011

Session = sa.orm.sessionmaker()
Session.configure(bind = pudl_engine)
session = Session()

plant_info = models.PUDLBase.metadata.tables['plant_info_eia923']
plants_eia923_tbl = models.PUDLBase.metadata.tables['plants_eia923']

# EIA nameplate capacity per plant, with the plant's name and PUDL plant ID.
eia_capacity_select = sa.sql.select([plant_info.c.nameplate_capacity_mw,
                                    plant_info.c.plant_id,
                                    plants_eia923_tbl.c.plant_name,
                                    plants_eia923_tbl.c.plant_id_pudl,
                                    ]).\
                                        where(plant_info.c.plant_id == plants_eia923_tbl.c.plant_id)

eia_capacity = pd.read_sql(eia_capacity_select, pudl_engine)


steam_table = models.PUDLBase.metadata.tables['plants_steam_ferc1']
plants_ferc1_tbl = models.PUDLBase.metadata.tables['plants_ferc1']

# FERC steam plant capacity joined to plants_ferc1 for the PUDL plant ID.
# The join matches on BOTH respondent_id and plant_name: matching on
# plant_name alone pairs a steam record with every same-named plant of any
# other respondent, duplicating rows and mis-assigning plant_id_pudl.
steam_select = sa.sql.select([steam_table.c.respondent_id,
                            steam_table.c.plant_name,
                            steam_table.c.total_capacity_mw,
                            steam_table.c.report_year,
                            plants_ferc1_tbl.c.plant_id_pudl]).\
                            where(steam_table.c.respondent_id == plants_ferc1_tbl.c.respondent_id).\
                            where(steam_table.c.plant_name == plants_ferc1_tbl.c.plant_name)

ferc_capacity = pd.read_sql(steam_select,pudl_engine)

plant_assn = models.PUDLBase.metadata.tables['util_plant_assn']

plant_assn_select = sa.sql.select([plant_assn.c.utility_id,
                                 plant_assn.c.plant_id])

plant_assn_df = pd.read_sql(plant_assn_select, pudl_engine)


ferc_capacity = ferc_capacity.merge(plant_assn_df,how='left',left_on=['plant_id_pudl'],right_on=['plant_id'])

# Capacity per utility, PUDL plant and FERC plant name for CAPACITY_YEAR.
# The year is selected by name after unstacking, which keeps plants with no
# record for that year (as NaN) and does not depend on exactly which other
# years are present, unlike dropping a hard-coded list of years.
ferc_plant_caps = ferc_capacity.groupby(['utility_id','plant_id_pudl','plant_name','report_year'])['total_capacity_mw'].sum()
ferc_plant_caps = ferc_plant_caps.unstack()[[CAPACITY_YEAR]]
ferc_plant_caps.columns = ['ferc_plant_cap']
ferc_plant_caps = ferc_plant_caps.reset_index(['utility_id','plant_name'])

# Capacity per utility and PUDL plant (all FERC plant names combined) for CAPACITY_YEAR.
ferc_tot_cap = ferc_capacity.groupby(['utility_id','plant_id_pudl','report_year'])['total_capacity_mw'].sum()
ferc_tot_cap = ferc_tot_cap.unstack()[[CAPACITY_YEAR]]
ferc_tot_cap.columns = ['ferc_tot_cap']
ferc_tot_cap = ferc_tot_cap.reset_index(['utility_id'])

# The EIA capacity column keeps its original name, nameplate_capacity_mw: the
# rename that used to sit here discarded its result and so never took effect.
eia_capacity = eia_capacity.set_index(['plant_id_pudl'])

# NOTE(review): both frames are indexed by plant_id_pudl only and both carry a
# utility_id column, so this join pairs rows across utilities that share a
# PUDL plant — confirm that is intended.
cap_1 = ferc_plant_caps.merge(ferc_tot_cap,right_index=True,left_index=True)

cap_df = cap_1.merge(eia_capacity,how='left',left_index=True,right_index=True)
cap_df['total_cap_ratio'] = cap_df['ferc_tot_cap'] / cap_df['nameplate_capacity_mw']
cap_df

In [None]:
# Rebuild the FERC capacity table with plant_id_pudl as an ordinary column.
cap_1 = ferc_plant_caps.merge(ferc_tot_cap,right_index=True,left_index=True).reset_index()

# Move plant_id_pudl out of the EIA index only if it is not already a column,
# so re-running this cell does not keep stacking up leftover index columns.
if 'plant_id_pudl' not in eia_capacity.columns:
    eia_capacity = eia_capacity.reset_index()

cap_df = cap_1.merge(eia_capacity,how='left',on=['plant_id_pudl'])
cap_df['total_cap_ratio'] = cap_df['ferc_tot_cap'] / cap_df['nameplate_capacity_mw']

In [None]:
# Distribution of the FERC / EIA capacity ratio, limited to ratios between 0 and 10.
# `uplt` was a typo for `plt` and raised NameError.
plt.figure(figsize=(10,10))
plt.hist(cap_df.total_cap_ratio,bins=100,range=(0,10))
plt.xlabel('FERC / EIA capacities ratio, grouped by PUDL Utility and Plant ID')
plt.ylabel('Occurrences')
plt.title('FERC EIA PUDL ID capacities ratio distribution')
plt.show()

In [None]:
# FERC Form 1 fuel records joined to plants_ferc1 (on respondent and plant
# name) to pick up each plant's PUDL plant ID.
f1_fuel_table = models.PUDLBase.metadata.tables['fuel_ferc1']

f1_select = (
    sa.sql.select([
        plants_ferc1_tbl.c.plant_name,
        plants_ferc1_tbl.c.plant_id_pudl,
        f1_fuel_table.c.report_year,
        f1_fuel_table.c.fuel,
        f1_fuel_table.c.fuel_qty_burned,
        f1_fuel_table.c.fuel_avg_mmbtu_per_unit,
        f1_fuel_table.c.fuel_cost_per_unit_burned,
    ])
    .where(f1_fuel_table.c.respondent_id == plants_ferc1_tbl.c.respondent_id)
    .where(f1_fuel_table.c.plant_name == plants_ferc1_tbl.c.plant_name)
)

f1_df = pd.read_sql(f1_select, pudl_engine)

# Spot check on one plant-year: PUDL plant 28 in report year 2015.
is_selected = (f1_df.plant_id_pudl == 28) & (f1_df.report_year == 2015)
ferc_selected_plant = f1_df[is_selected]

# True when no FERC record carries a PUDL plant ID found in the selection.
not f1_df.plant_id_pudl.isin(ferc_selected_plant.plant_id_pudl).any()

In [None]:
# Display the EIA capacity table.
eia_capacity

In [None]:
# PUDL plant IDs and report years of every merged record, paired element-wise.
plant_id_list = list(merged_df.plant_id_pudl)
years_list = list(merged_df.report_year)

# Pass the lists built above straight to the classifier. The previous version
# sliced a `plant_year` frame that is never defined in this notebook, using a
# hard-coded row count, and rebound `m` (the merged frame from an earlier cell).
plants = plant_id_list
years = years_list

ft_df = fuel_type_assigner(plants,years,67)

In [None]:
# Attach the fuel type labels to the merged FERC / EIA records. The
# de-duplicated frame is assigned back: drop_duplicates() returns a new frame,
# so calling it without assignment left the duplicates in place.
types_merged_df = merged_df.merge(ft_df,how='left',on=['report_year','plant_id_pudl'])
types_merged_df = types_merged_df.drop_duplicates()
# Subsets by agreed fuel type, using the 'combined' column produced by
# revised_fuel_type_assigner.
coal = types_merged_df[types_merged_df['combined'] == 'coal']
gas = types_merged_df[types_merged_df['combined'] == 'gas']
oil = types_merged_df[types_merged_df['combined'] == 'oil']

In [None]:
#initial version - rework to match initial function?
# Same as the cell above, but for the 'combined_plant_type' column produced by
# fuel_type_assigner. The de-duplicated frame is assigned back:
# drop_duplicates() returns a new frame, so calling it without assignment left
# the duplicates in place.
types_merged_df = merged_df.merge(ft_df,how='left',on=['report_year','plant_id_pudl'])
types_merged_df = types_merged_df.drop_duplicates()
coal = types_merged_df[types_merged_df['combined_plant_type'] == 'coal']
gas = types_merged_df[types_merged_df['combined_plant_type'] == 'gas']
oil = types_merged_df[types_merged_df['combined_plant_type'] == 'oil']

In [None]:
# Ratio of FERC annual fuel expense to EIA annual fuel cost for every record,
# followed by removal of fully duplicated rows.
types_merged_df = types_merged_df.assign(
    expense_ratio=types_merged_df['ferc_fuel_sums'] / types_merged_df['annual_fuel_cost']
).drop_duplicates()

In [None]:
# Number of rows whose FERC / EIA expense ratio lies strictly between 0.9 and 1.1.
len(types_merged_df[(types_merged_df['expense_ratio'] > .9) & (types_merged_df['expense_ratio'] < 1.1)])

In [None]:
# Distribution of the FERC / EIA fuel expense ratio, limited to ratios between 0 and 5.
fig, ax = plt.subplots(figsize=(10,10))
ax.hist(types_merged_df.expense_ratio, bins=100, range=(0,5))
ax.set_xlabel('FERC / EIA ratio')
ax.set_ylabel('Occurrences')
ax.set_title('FERC EIA fuel expense ratio distribution')
plt.show()

In [None]:
def expense_ratio(df, tolerance):
    """Count the rows whose 'expense_ratio' lies strictly within 1 +/- tolerance.

    Args:
        df: DataFrame with an 'expense_ratio' column.
        tolerance: half-width of the accepted band around a ratio of 1.

    Returns:
        The number of rows with 1 - tolerance < expense_ratio < 1 + tolerance.
    """
    lower_bound = 1 - tolerance
    upper_bound = 1 + tolerance
    ratios = df['expense_ratio']
    within_band = (ratios > lower_bound) & (ratios < upper_bound)
    return len(df[within_band])

In [None]:
# Number of rows whose FERC / EIA expense ratio lies strictly between 0.5 and 1.5.
expense_ratio(types_merged_df,.50)

In [None]:
# Total number of rows, for comparison with the counts above.
len(types_merged_df)

In [None]:
# Share of rows whose FERC / EIA expense ratio is within 50% of 1, computed
# from the data instead of from counts typed in by hand, which go stale
# whenever the data change.
expense_ratio(types_merged_df, .50) / len(types_merged_df)

In [None]:
# Records where FERC reports more than 110% of the EIA fuel cost, largest
# ratio first. The sort is chained into the assignment: sort_values() returns
# a new frame, so calling it without assignment had no effect.
ferc_high = types_merged_df[(types_merged_df['expense_ratio'] > 1.10)].\
    sort_values(by='expense_ratio',ascending=False)
# Number of those records per agreed fuel type.
ferc_high.combined.value_counts()

In [None]:
# Keep ferc_high ordered by ratio, largest first. sort_values() returns a new
# frame, so its result has to be assigned to take effect.
ferc_high = ferc_high.sort_values(by='expense_ratio',ascending=False)
# Number of distinct PUDL plants among the high-ratio records.
ferc_high.plant_id_pudl.nunique()

In [None]:
# Records where FERC reports less than 90% of the EIA fuel cost, and the
# number of distinct PUDL plants among them.
ferc_low = types_merged_df.loc[types_merged_df['expense_ratio'] < .9]
ferc_low['plant_id_pudl'].nunique()

In [None]:
# Coal records reported by FERC respondent 145.
# NOTE(review): the name suggests Public Service Co. of Colorado — confirm the respondent ID.
psco = coal[coal['respondent_id'] == 145]

In [None]:
# Coal records whose plant_name_x is 'Craig' (the _x suffix marks the left-hand
# frame of a merge in which both sides had a plant_name column).
coal[coal['plant_name_x'] == 'Craig']

In [None]:
# FERC vs EIA annual fuel cost, one colour per agreed fuel type.
plt.figure(figsize=(14,10))
plt.scatter(coal.ferc_fuel_sums,coal.annual_fuel_cost,label='coal',c='black')
plt.scatter(gas.ferc_fuel_sums,gas.annual_fuel_cost,label='gas',c='orange')
plt.scatter(oil.ferc_fuel_sums,oil.annual_fuel_cost,label='oil',c='red')
plt.xlabel('FERC annual fuel expenses (dollars)')
plt.ylabel('EIA fuel receipts and costs, annual cost of fuel delivered (dollars)')
# Title spelling corrected ("comparsion" -> "comparison").
plt.title('FERC and EIA: Annual fuel cost comparison')
plt.legend()

In [None]:
# Coal plants only: FERC vs EIA annual fuel cost with a least-squares line.
plt.figure(figsize=(14,10))
plt.scatter(coal.ferc_fuel_sums,coal.annual_fuel_cost,label='coal',c='black')
# Degree-1 polynomial fit of EIA cost against FERC expense.
slope, intercept = np.polyfit(coal.ferc_fuel_sums, coal.annual_fuel_cost, 1)
coal_trend = intercept + (slope * coal.ferc_fuel_sums)
plt.plot(coal.ferc_fuel_sums, coal_trend, color='red', linestyle='--', label='linear fit')
plt.xlabel('FERC annual fuel expenses (dollars)')
plt.ylabel('EIA fuel receipts and costs, annual cost of fuel delivered (dollars)')
# Title spelling corrected ("comparsion" -> "comparison").
plt.title('FERC and EIA: Annual fuel cost comparison')
plt.legend()
# Label the fit parameters so the figure is readable on its own; the bare
# numbers shown before gave no hint of which was which.
plt.annotate('slope = {:.3f}'.format(slope),(0.05, 0.9), xycoords='axes fraction')
plt.annotate('intercept = {:.3g}'.format(intercept),(0.05, 0.8), xycoords='axes fraction')