# Create a Technology Type Column for FERC Steam Table

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# Standard libraries
import logging
import sys
import os
import pathlib

# 3rd party libraries
import geopandas as gpd
import dask.dataframe as dd
from dask.distributed import Client
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import pandas as pd
import seaborn as sns
import sqlalchemy as sa
#import pickle

# Local libraries
import pudl
from pudl.analysis.fill_ferc1_fuel_gaps import *

In [3]:
# Enable viewing of logging outputs
logger=logging.getLogger()
logger.setLevel(logging.INFO)
handler = logging.StreamHandler(stream=sys.stdout)
formatter = logging.Formatter('%(message)s')
handler.setFormatter(formatter)
logger.handlers = [handler]

In [4]:
# Display settings
sns.set()
%matplotlib inline
mpl.rcParams['figure.dpi'] = 75
pd.options.display.max_columns = 100
pd.options.display.max_rows = 100

In [5]:
# Establish connection to pudl database
pudl_settings = pudl.workspace.setup.get_defaults()
pudl_engine = sa.create_engine(pudl_settings['pudl_db'])
pudl_out = pudl.output.pudltabl.PudlTabl(
    pudl_engine=pudl_engine,
    freq='AS'
)

In [6]:
value_cols_no_cap = [
    'net_generation_mwh','avg_num_employees',
    'capex_land', 'capex_equipment', 'capex_structures', 'capex_total', 'asset_retirement_cost',
    'opex_operations', 'opex_fuel', 'opex_coolants', 'opex_steam', 'opex_steam_other', 'opex_transfer', 
    'opex_electric', 'opex_misc_power', 'opex_rents', 'opex_allowances', 'opex_engineering', 'opex_structures', 
    'opex_boiler', 'opex_plants', 'opex_misc_steam', 'opex_production_total'
]

value_cols = value_cols_no_cap + ['capacity_mw']

test_view = ['report_year', 'utility_name_ferc1', 'plant_name_ferc1', 'plant_id_pudl', 
             'plant_id_ferc1', 'primary_fuel', 'plant_type', 
             'tech_desc', 'tech_desc_flag','record_id', 'capacity_mw']

total_view = test_view + ['total_type']

ferc_merge_cols = ['report_year', 'utility_id_ferc1', 'plant_name_ferc1']
eia_merge_cols = ['report_date', 'plant_id_pudl', 'generator_id']

In [7]:
# with open('/Users/aesharpe/Desktop/ferc1_transformed2.pickle', 'rb') as handle:
#     ferc1_transformed_dfs_test = pickle.load(handle)

# with open('/Users/aesharpe/Desktop/steam_w_eia.pkl', 'rb') as handle:
#     steam_w_eia = pickle.load(handle)

In [8]:
# # Load the tables you'll need with some basic alterations
# steam2 = ferc1_transformed_dfs_test['plants_steam_ferc1'].copy()
# glue_dicts = pudl.glue.ferc1_eia.glue(ferc1=True) # For steam you'll have to mimic the glue process to get plant_id_pudl and add a column for primary fuel
# steam2 = pd.merge(steam2, glue_dicts['plants_ferc1'], on=['plant_name_ferc1', 'utility_id_ferc1'], how='left')
# steam2 = pd.merge(steam2, glue_dicts['utilities_ferc1'][['utility_id_ferc1', 'utility_name_ferc1']], on=['utility_id_ferc1'], how='left')

In [53]:
steam = pudl_out.plants_steam_ferc1().assign(primary_fuel=np.nan)

fbp = pudl_out.fbp_ferc1()
fbp_small = fbp[ferc_merge_cols + ['primary_fuel_by_mmbtu']] #, 'primary_fuel_by_cost']]
eia = pudl_out.gens_eia860()#.assign(report_year=lambda x: x.report_date.dt.year)
small_plants = pudl_out.plants_small_ferc1()#.dropna(subset=['plant_name_ferc1'])
eia_one_plant = eia[eia['plant_name_eia'].notna()].drop_duplicates(subset='plant_name_eia').reset_index()
small_plants_one_plant = small_plants[small_plants['plant_name_ferc1'].notna()].drop_duplicates(subset='plant_name_ferc1').reset_index()

### Test Small Generators Table

In [10]:
from fuzzywuzzy import process, fuzz

In [11]:
plant_name_eia = []
similarity = []
for i in small_plants_one_plant.plant_name_ferc1:
        ratio = process.extract( i, eia_one_plant.plant_name_eia, limit=1)
        plant_name_eia.append(ratio[0][0])
        similarity.append(ratio[0][1])
small_plants_one_plant['plant_name_eia'] = pd.Series(plant_name_eia)
small_plants_one_plant['similarity'] = pd.Series(similarity)

### Run Module Code

In [107]:
test_ff = impute_tech_desc(steam, eia)
# 14487

merging single-tech EIA technology_description with FERC
25506 / 28533
backfilling EIA technology_description by year if no new units installed
[16]
[110]
[197]
[163]
[1210]
[1149]
[2509]
[2079]
[2078]
[2077]
[2076]
[3008]
[2768]
[2510]
[1554]
[1836]
[1836]
[1830]
[1832]
[2533]
[1613]
[2512]
[7620]
[2562]
[8551]
[528]
[612]
[ 246 1141]


ValueError: can only convert an array of size 1 to a Python scalar

In [110]:
test2[test2['plant_id_ferc1']==120][test_view]
# 246 1141

Unnamed: 0,report_year,utility_name_ferc1,plant_name_ferc1,plant_id_pudl,plant_id_ferc1,primary_fuel,plant_type,tech_desc,tech_desc_flag,record_id,capacity_mw
1350,1994.0,"Entergy Arkansas, Inc.",hamilton moses,246.0,120.0,gas,steam,,,f1_steam_1994_12_8_0_2,138.0
1351,1995.0,"Entergy Arkansas, Inc.",hamilton moses,246.0,120.0,gas,steam,,,f1_steam_1995_12_8_0_2,138.0
1352,1996.0,"Entergy Arkansas, Inc.",hamilton moses,246.0,120.0,gas,steam,,,f1_steam_1996_12_8_0_2,138.0
1353,1997.0,"Entergy Arkansas, Inc.",hamilton moses,246.0,120.0,gas,steam,,,f1_steam_1997_12_8_0_2,138.0
1354,1998.0,"Entergy Arkansas, Inc.",hamilton moses,246.0,120.0,gas,steam,,,f1_steam_1998_12_8_2_1,138.0
1355,1999.0,"Entergy Arkansas, Inc.",hamilton moses,246.0,120.0,gas,steam,,,f1_steam_1999_12_8_1_1,138.0
21254,2000.0,"Entergy Arkansas, Inc.",hamiliton moses,1141.0,120.0,gas,steam,natural_gas_steam_turbine,backfill from eia year,f1_steam_2000_12_8_1_1,138.0
21255,2001.0,"Entergy Arkansas, Inc.",hamiliton moses,1141.0,120.0,gas,steam,natural_gas_steam_turbine,backfill from eia year,f1_steam_2001_12_8_1_1,144.0
21256,2002.0,"Entergy Arkansas, Inc.",hamiliton moses,1141.0,120.0,gas,steam,natural_gas_steam_turbine,backfill from eia year,f1_steam_2002_12_8_1_1,138.0
21257,2003.0,"Entergy Arkansas, Inc.",hamiliton moses,1141.0,120.0,gas,steam,natural_gas_steam_turbine,backfill from eia year,f1_steam_2003_12_8_1_1,138.0


In [55]:
print('unlabled plants:', len(test[test['tech_desc'].isna()]['plant_id_pudl'].unique()))

unlabled plants: 749


In [56]:
test2 = impute_fuel_type(test, pudl_out)

filling fuels with obvious names
26511 / 28533
filling in primary fuel by mmbtu
3444 / 28533
filling in eia plants with one reported fuel
2211 / 28544
filling in primary fuel by cost
1958 / 28544
filling in raw ferc1 fuels
1806 / 28544
filling in ferc plants with one fuel
1069 / 28544
filling in pudl plants with one fuel
970 / 28544
filling in manually mapped fuels
885 / 28544
front and backfilling values with the same ferc1 id
660 / 28544
flipping single fuel outliers for plant_id_ferc1
660 / 28544
flipping multiple fuel outliers
660 / 28544


### Test accuracy of fuel + plant type

In [57]:
test2 = test2.assign(
    tech_desc=lambda x: x.tech_desc.replace({' ': '_'}, regex=True),
    primary_fuel=lambda x: x.primary_fuel.replace({'oil': 'petroleum'}, regex=True),
    similar_plant=np.nan,
    similar_fuel=np.nan
)

no_null_plant = test2['plant_type'].notna()
no_null_fuel = test2['primary_fuel'].notna()
no_null_tech = test2['tech_desc'].notna()

mini_plant = (
    test2[no_null_plant & no_null_tech].copy()
    .assign(similar_plant=lambda x: x.apply(lambda x: x.plant_type in x.tech_desc, axis=1))
)

mini_fuel = (
    test2[no_null_fuel & no_null_tech].copy()
    .assign(similar_fuel=lambda x: x.apply(lambda x: x.primary_fuel in x.tech_desc, axis=1))
)

test2.update(mini_plant, overwrite=True)
test2.update(mini_fuel, overwrite=True)

#test2.loc[(test2['similar_plant'].notna()) & (test2['similar_fuel'].notna()), 'similar'] =  
test2['similar'] = test2['similar_plant'] & test2['similar_fuel']

In [58]:
test3 = test2[no_null_plant & no_null_fuel & no_null_tech].copy()
print(len(test3))
print(len(test3[test3['similar']]))

13994
10941


In [24]:
n1 = test3['similar_fuel'] == False
n2 = test3['similar_plant'] == False

test3[n1 & n2][test_view].plant_id_pudl.astype('int').unique()

array([  242,  1283,   364,  2260,  2261,   283,    41,   554,   197,
         476,   647,  1136,  1158,    15,  1150,   549,  2628,   518,
         489,  5933, 14388, 13443,   122,  1526,     5,   472,  2134])

In [102]:
#test2[test2['plant_id_pudl']==242][test_view + ['installation_year']].sort_values(['report_year'])

In [None]:
# VIRGINIA and CON ED are problematic

# SUMMARY: some just need to be flipped, some are the wrong plant id, some are weird because there 
#          are sub-units that aren't reported in EIA, some are weird because they are waste energy,
#          some

# Problems:
# - 2133 ninemile - has a bad plant_id
# - 241 lee cc - should be coal pre 2012 but issues with ferc id and installation year... :/
# - 1282 Waterside - not a lot of information on what this is or when it stopped...
# - 275 hudson avenue - just need some gas fuels to get flipped
# - 363 JC McNeil - biomass, good
# - 2259 hunterstown - con-ed petro unclear...capacities don't match what I found online at alll
# - 2260 Mountain - another weird con-ed petro plant....no idea
# - 282 Huron - outliers must be flipped
# - 41 beebee - wrong plant id
# - 533 Sheepskin? - wrong plant id
# - 196 four corners - outliers must be flipped
# - 475 putnam - wrong plant id
# - 646 wilmarth - waste
# - 1537 south meadow - flip outliers
# - 648 wilson - flip outliers
# - 1135 grand tower energy center - wonkyness with backfill by year
# - 147 darbytown - flip outliers
# - 1157 Joppa - confusing subunits
# - 15 Altavista - went from coal to biomass in 2013
# - 1149 hopewell there is another hopewell in eia (3335) but the capacity from 1149 matches....:/
# - 548 southampton - went from coal to biomass in 2013
# - 45 bellmeade - some of it's right some not
# - 176 elizabeth river - plant type just wrong
# - 1132 gibson city - some weird outlier fuels and plant types
# - 2628 heart mountain / bird city wrong plant id
# - 517 rothschild - waste
# - 488 remington - some solar plants thrown in at the end! need to fix
# - 5930 - airport - seems that the primary fuel is right (rather than petro from eia)
# - 14385 river valley - MAYBE wrong plant id? unless they are co-located...
# - 13440 Okeechobee - MAYBE wrong plant id? unless they are co-located...
# - 121 coit - unknown valeus causing a problem
# - 1525 devon - flip outliers
# - 5 74th street - flip outliers
# - 471 parr gt - flip outliers

In [205]:
# Problems:
# - 2297 --> gap between ferc years and eia years so can't use the construction type col to see whether there was a new unit added or not...
#            Can use this as a backfill IF the operating_date from eia matches the closest one from FERC
# - 1656 --> False is the result of a NA in the construction_year field which should probably be ignored?

#ferc_eia_tech_desc[ferc_eia_tech_desc['plant_id_pudl']==190]

In [63]:
test2['tech_type'] = test2['primary_fuel'] + '_' + test2['plant_type']

In [93]:
ty = test2['tech_desc'].unique().tolist()

In [96]:
#test2[test2['tech_desc'] == 'all_other'][test_view + ['primary_fuel_flag']]
#test2[test2['plant_id_pudl']==1179][test_view + ['primary_fuel_flag']]

In [83]:
#ss = eia[(eia['plant_name_eia'].notna()) & (eia['plant_name_eia'].str.contains('elanese'))].head(20)
#ss[ss['plant_id_pudl']==1089]

In [94]:
#test2['tech_type'].unique().tolist()
#pfm[pfm['tech_type']=='waste_steam']
#steam[steam['plant_id_pudl']==12299]
[print(x,':', len(test2[test2['tech_desc']==x])) for x in ty]

conventional_steam_coal : 5267
nan : 0
nuclear : 1699
petroleum_liquids : 1919
natural_gas_fired_combustion_turbine : 2991
natural_gas_steam_turbine : 765
natural_gas_fired_combined_cycle : 870
wood/wood_waste_biomass : 84
geothermal : 31
conventional_hydroelectric : 11
all_other : 18
municipal_solid_waste : 51
wind : 48
natural_gas_internal_combustion_engine : 30
onshore_wind_turbine : 122
solar : 57
solar_photovoltaic : 88
landfill_gas : 4


[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [62]:
len(test2[(test2['tech_desc'].isna()) & (test2['tech_type'].isna())].plant_id_pudl.unique())

125

In [49]:
test2['tech_desc'].unique().tolist()

['conventional_steam_coal',
 nan,
 'nuclear',
 'petroleum_liquids',
 'natural_gas_fired_combustion_turbine',
 'natural_gas_steam_turbine',
 'natural_gas_fired_combined_cycle',
 'wood/wood_waste_biomass',
 'geothermal',
 'conventional_hydroelectric',
 'all_other',
 'municipal_solid_waste',
 'wind',
 'natural_gas_internal_combustion_engine',
 'onshore_wind_turbine',
 'solar',
 'solar_photovoltaic',
 'landfill_gas']

In [330]:
eia2 = eia.dropna(subset=['plant_name_eia'])
eia[eia['plant_id_pudl']==108]
fuel[fuel['plant_name_ferc1'].str.contains('ickasaw')]
steam[steam['plant_name_ferc1'].str.contains('esbitt')]

Unnamed: 0,utility_id_ferc1,report_year,plant_name_ferc1,plant_type,construction_type,construction_year,installation_year,capacity_mw,peak_demand_mw,plant_hours_connected_while_generating,plant_capability_mw,not_water_limited_capacity_mw,water_limited_capacity_mw,avg_num_employees,capex_land,capex_structures,capex_equipment,capex_total,opex_operations,opex_fuel,opex_coolants,opex_steam,opex_steam_other,opex_transfer,opex_electric,opex_misc_power,opex_rents,opex_allowances,opex_engineering,opex_structures,opex_boiler,opex_plants,opex_misc_steam,opex_production_total,asset_retirement_cost,record_id,capex_per_mw,opex_per_mwh,net_generation_mwh,plant_id_ferc1,plant_id_pudl,utility_name_ferc1,primary_fuel
20010,22,2010,nesbitt unit 1,steam,outdoor,1975,1975,445.5,419.0,7390.0,,422.0,,116.0,1810392.0,12221268.0,62669019.0,76700679.0,79812.0,56107431.0,,296778.0,,,298638.0,308731.0,,-622.0,315549.0,463012.0,987616.0,291272.0,262827.0,59411044.0,,f1_steam_2010_12_22_0_2,172167.6,73.4,809148.0,1803,413,Cleco Power LLC,
20996,22,2011,nesbitt unit 1,steam,outdoor,1975,1975,445.5,389.0,6590.0,,422.0,,125.0,1810392.0,14121823.0,63883179.0,79815394.0,66536.0,40730854.0,,319809.0,,,298650.0,350214.0,,-128.0,264271.0,99556.0,559016.0,315596.0,271004.0,43275378.0,,f1_steam_2011_12_22_0_2,179159.1,61.3,705984.0,1803,413,Cleco Power LLC,
22259,22,2012,nesbitt unit 1,steam,outdoor,1975,1975,445.5,417.0,5014.0,,422.0,,164.0,1810392.0,14084102.0,63967922.0,79862416.0,42966.0,21756926.0,,404378.0,,,330555.0,403712.0,,1485.0,232794.0,115899.0,1185429.0,640312.0,359676.0,25474132.0,,f1_steam_2012_12_22_0_2,179264.7,40.9,623504.0,1803,413,Cleco Power LLC,
23367,22,2013,nesbitt unit 1,steam,outdoor,1975,1975,445.5,427.0,5425.0,,421.0,,165.0,1810392.0,14128466.0,66057404.0,81996262.0,51718.0,28632475.0,,335261.0,,,326277.0,263334.0,,203.0,263683.0,448658.0,1259531.0,3346071.0,547589.0,35474800.0,,f1_steam_2013_12_22_0_2,184054.5,59.0,601032.0,1803,413,Cleco Power LLC,
24223,22,2014,nesbitt unit 1,steam,outdoor,1975,1975,445.5,427.0,1171.0,,427.0,,164.0,1810392.0,14248420.0,66800124.0,82858936.0,67493.0,9698412.0,,341808.0,,,327778.0,194618.0,,-130.0,239066.0,237076.0,995596.0,296884.0,184361.0,12582962.0,,f1_steam_2014_12_22_0_2,185990.9,75.4,166791.0,1803,413,Cleco Power LLC,
24627,22,2015,nesbitt unit 1,steam,outdoor,1975,1975,445.5,420.0,1068.0,,419.0,,162.0,1810392.0,14472075.0,69837872.0,86120339.0,73568.0,5797476.0,,464936.0,,,373329.0,152554.0,,102.0,248220.0,178491.0,1285238.0,375532.0,153581.0,9103027.0,,f1_steam_2015_12_22_0_2,193311.6,52.2,174290.0,1803,413,Cleco Power LLC,
25657,22,2016,nesbitt unit 1,steam,outdoor,1975,1975,445.5,420.0,3004.0,,421.0,,162.0,1810392.0,15127260.0,70841512.0,87779164.0,94053.0,16406374.0,,385888.0,,,364543.0,373300.0,,-4.0,304066.0,257127.0,1370567.0,626980.0,227647.0,20410541.0,,f1_steam_2016_12_22_0_2,197035.2,39.4,518662.0,1803,413,Cleco Power LLC,
26475,22,2017,nesbitt unit 1,steam,outdoor,1975,1975,445.5,424.0,2300.0,,422.0,,163.0,1810392.0,16047921.0,73000191.0,90858504.0,108470.0,13474739.0,,394589.0,,,451747.0,155949.0,,,304033.0,177740.0,629811.0,209553.0,259975.0,16166606.0,,f1_steam_2017_12_22_0_2,203947.3,46.1,350577.0,1803,413,Cleco Power LLC,
27523,22,2018,nesbitt unit 1,steam,outdoor,1975,1975,445.5,417.0,1456.0,,416.0,,160.0,1810392.0,16366967.0,74657529.0,92834888.0,433282.0,9231472.0,,393096.0,,,386543.0,404221.0,,,138673.0,133655.0,978459.0,224561.0,219460.0,12543422.0,,f1_steam_2018_12_22_0_2,208383.6,52.7,237973.0,1803,413,Cleco Power LLC,
29244,22,2019,nesbitt unit 1,steam,outdoor,1975,1975,445.5,424.0,3510.0,,424.0,,161.0,2279491.0,17114841.0,76997172.0,96420985.0,155292.0,15426822.0,,410606.0,,,437831.0,364183.0,,-3.0,323624.0,103239.0,1152622.0,279446.0,373315.0,19026977.0,29481.0,f1_steam_2019_12_22_0_2,216433.2,35.8,531456.0,1803,413,Cleco Power LLC,


In [294]:
# Make the plant type and primary fuel columns more compatible for comparison
test = (
    f4.copy()
    .assign(
        plant_type = lambda x: x.plant_type.replace('_', ' ', regex=True),
        primary_fuel = lambda x: x.primary_fuel.str.lower(),
        same_tech = lambda x: x.same_tech.str.lower())
)

In [295]:
test2 = test.dropna(subset=['plant_type', 'same_tech']).copy()
test2['similar'] = test2.apply(lambda x: x.plant_type in x.same_tech, axis=1)

In [302]:
len(test2[test2['similar']==False])

3242

In [97]:
#test2[test2['similar']==False]