# Playing with the FERC small generators table

In [1]:
# Reload imported modules automatically before each cell runs, so edits to
# local library code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [26]:
# Standard libraries
import logging
import sys
import os
import pathlib
import random

# 3rd party libraries
import geopandas as gpd
import dask.dataframe as dd
from dask.distributed import Client
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import pandas as pd
import seaborn as sns
import sqlalchemy as sa
#import pickle

# Local libraries
import pudl
from pudl.analysis.fill_ferc1_fuel_gaps import *
from pudl.analysis.flag_ferc1_totals import *

# Enable viewing of logging outputs: send INFO-and-above records from the
# root logger to stdout so they show up in the cell output.
logger=logging.getLogger()
logger.setLevel(logging.INFO)
handler = logging.StreamHandler(stream=sys.stdout)
# Emit only the message text (no timestamp or level prefix).
formatter = logging.Formatter('%(message)s')
handler.setFormatter(formatter)
# Assign rather than append, so re-running this cell does not stack duplicate handlers.
logger.handlers = [handler]

# Display settings
# Apply seaborn's default plot styling.
sns.set()
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
mpl.rcParams['figure.dpi'] = 75
# Show up to 100 columns / rows before pandas truncates a displayed frame.
pd.options.display.max_columns = 100
pd.options.display.max_rows = 100

In [3]:
# Establish connection to pudl database
# The settings dict supplies the connection strings for both databases used below.
pudl_settings = pudl.workspace.setup.get_defaults()
pudl_engine = sa.create_engine(pudl_settings['pudl_db'])
# NOTE(review): 'AS' looks like the pandas annual-start frequency alias --
# confirm how PudlTabl interprets it.
pudl_out = pudl.output.pudltabl.PudlTabl(
    pudl_engine=pudl_engine,
    freq='AS'
)

# Separate engine for the 'ferc1_db' database; used below to read the raw table.
ferc_engine = sa.create_engine(pudl_settings['ferc1_db'])

In [4]:
# Raw "f1_gnrt_plant" table, read directly through the FERC engine.
small_plants_raw = pd.read_sql("f1_gnrt_plant", ferc_engine)
# Small-plants table as produced by the PUDL output object; the dropna on
# plant_name_ferc1 is intentionally disabled (commented out) here.
small_plants = pudl_out.plants_small_ferc1()#.dropna(subset=['plant_name_ferc1'])

#### Use Zane's transformations without dropping the NA rows...

In [5]:
# Run the raw table through the PUDL transform function to see whether the
# header rows get dropped -- they don't.

# The transform takes a dict of raw frames keyed by table name, plus a dict
# for the transformed output.
fake_dict = {'plants_small_ferc1': small_plants_raw}
new_dict = {}
small_plants_dict = pudl.transform.ferc1.plants_small(fake_dict, new_dict)
small_plants_out = small_plants_dict['plants_small_ferc1']

# Drop rows with no plant name, because they can't be used as either a
# header or a plant record.
small_plants_out = small_plants_out.dropna(subset=['plant_name_original'])

  warn(msg)


In [239]:
# View random utility groups! (thanks trenton)
def view_random_utility(df, seed=None):
    """Return all rows belonging to one randomly selected utility.

    Args:
        df: DataFrame with a ``utility_id_ferc1`` column.
        seed: Optional seed. When given, the choice is made with a private
            ``random.Random(seed)`` so the same utility is returned on every
            call; when ``None`` the module-level ``random`` state is used,
            exactly as before.

    Returns:
        The rows of ``df`` (with a fresh 0..n-1 index) for the selected
        utility. An empty frame is returned when ``df`` has no utility ids,
        instead of ``random.choice`` raising on an empty list.
    """
    df = df.reset_index(drop=True)
    # Rows with a missing utility id cannot be selected (groupby dropped them too).
    utility_ids = sorted(df['utility_id_ferc1'].dropna().unique().tolist())
    if not utility_ids:
        return df.iloc[0:0]
    chooser = random.Random(seed) if seed is not None else random
    random_utility = chooser.choice(utility_ids)
    return df[df['utility_id_ferc1'] == random_utility]

#### Try to flag some headers...

In [368]:
# Value columns: when every one of these is NaN for a row, the row is either
# a header row or carries nothing useful.
nan_cols = [
    'net_generation_mwh',
    'total_cost_of_plant',
    'capex_per_mw',
    'opex_total',
    'opex_fuel',
    'opex_maintenance',
    'fuel_cost_per_mmbtu',
]

# Substrings that mark a candidate header row as a useful header. Includes
# misspelled variants such as 'combustine', which shows up in the reported names.
# NOTE(review): these are matched as plain substrings, so short entries like
# 'ice' also hit names such as 'license' / 'service'.
# TODO: make sure this is comprehensive and weed out bad ones
header_strings = [
    'hydro',
    'hyrdo',
    'internal',
    'ice',
    'wind',
    'solar',
    'gas',
    'diesel',
    'steam',
    'other',
    'combustion',
    'fuel cell',
    'hydraulic',
    'waste',
    'combustine',
    'landfill',
    'photovoltaic',
    'nuclear',
    'oil',
    'renewable',
]

In [196]:
# Not using RN
# See if all entries have headers! 
def does_it_have_headers(util_group, nan_cols):
    """Label one utility's records by whether they appear to contain header rows.

    A candidate header row is a row in which every column listed in
    ``nan_cols`` is NA. If a utility uses headers, there should be at least
    one per reported year, so the label is:

    * ``'no_headers'``   -- no all-NA rows at all;
    * ``'good_headers'`` -- at least as many all-NA rows as report years;
    * ``'bad_headers'``  -- some all-NA rows, but fewer than report years.

    Args:
        util_group: DataFrame for a single utility; must have a
            ``report_year`` column.
        nan_cols: Names of the value columns to test. Names that are not
            columns of ``util_group`` are ignored (``DataFrame.filter``).

    Returns:
        A copy of ``util_group`` with a constant ``header_type`` column added.
        The input frame is left untouched, which keeps the function safe to
        use inside ``groupby.apply``.
    """
    num_report_years = len(util_group['report_year'].unique())
    num_nan_rows = int(util_group.filter(nan_cols).isna().all(axis=1).sum())

    if num_nan_rows == 0:
        header_type = 'no_headers'
    elif num_nan_rows >= num_report_years:
        header_type = 'good_headers'
    else:
        header_type = 'bad_headers'

    return util_group.assign(header_type=header_type)

In [369]:
# Remove utilities whose value columns are NaN in every row (nothing usable is
# reported for them), then tag each remaining utility with a header_type label.
sph = (
    small_plants_out
    # `not` rather than `~`: bitwise inversion of a plain bool yields -2, which is truthy.
    .groupby('utility_id_ferc1').filter(lambda x: not x[nan_cols].isna().all().all())
    # group_keys=False keeps the original flat index on newer pandas versions too.
    .groupby('utility_id_ferc1', group_keys=False)
    .apply(lambda x: does_it_have_headers(x, nan_cols)) # not using rn
)

# A row is a candidate header when every value column is NaN.
sph['is_header'] = sph.filter(nan_cols).isna().all(axis=1)

In [377]:
# Boolean masks over sph for inspecting candidate headers: rows whose plant
# name contains any of the header keywords, and rows flagged as headers.
is_good_header = sph['plant_name_original'].str.contains('|'.join(header_strings))
is_header = sph['is_header']
#sph[is_header & is_good_header].plant_name_original.unique()

In [379]:
#sph[sph['plant_name_original'].str.contains('|'.join(header_strings))].plant_name_original.unique()

array(['hydro', 'internal combustion', 'hydro electric',
       'total hydro (small plants)', 'hydro plants',
       'license project: 2069', 'a-internal combustion unit',
       'b-combustion turbine unit', 'medway license #2666',
       'milford license #2534', 'veazie license #2403',
       'ellsworth license #2727', 'howland license #2721',
       'stillwater license #2712', 'orono license #2710',
       '(plant not yet in service)', 'blackstone (steam)',
       'internal combustion:', 'marshall hydro',
       'ferc licensed project no. 2380', 'internal combustion auxiliary',
       'eagle pass hydro', 'diesel plant', 'valley diesel',
       'ferc licensed project number', 'lynch diesel', 'hydro:',
       'steam heating plant', 'internal combustion peaking units',
       'diesel', 'salmon diesel', 'gas turbine', 'steam',
       'wichita (diesel)', 'east hampton diesel', 'montauk diesel',
       'hydraulic', 'cummins diesel #3', 'cummins diesel #4',
       'cummins diesel #5', 'cumm

#### Assign headers to the groups...

In [352]:
# Pull out two utilities (ids 74 and 137) as small, hand-checkable samples.
# Every slice is an explicit copy so later column assignments on the samples
# never write through to (or warn about) sph.
testA = sph[sph['utility_id_ferc1'] == 74].copy()
testA2 = testA.head(15).copy()

testB = sph[sph['utility_id_ferc1'] == 137].copy()
testB2 = testB.head(18).copy()

In [365]:
def assign_headers_all(df, keywords=None):
    """Attach to every row the name of the header row it sits under.

    Rows flagged ``is_header`` whose ``plant_name_original`` contains none of
    the keywords are discarded. Each surviving header row then starts a new
    block, and a block is further split by ``utility_id_ferc1`` and
    ``report_year`` so a header never carries over into another utility or
    year.

    Args:
        df: DataFrame with ``utility_id_ferc1``, ``report_year``,
            ``plant_name_original``, ``plant_name_ferc1`` and a boolean
            ``is_header`` column.
        keywords: Substrings that identify a usable header. Defaults to the
            notebook-level ``header_strings`` list.

    Returns:
        A new DataFrame (``df`` is not modified) without the discarded rows
        and with a ``header`` column inserted at position 3. Rows that have
        no header above them within their utility/year get NaN rather than
        raising, and an existing ``header`` column is replaced so the call
        can be repeated.
    """
    if keywords is None:
        keywords = header_strings
    pattern = '|'.join(keywords)

    flagged = df['is_header'].astype(bool)
    # na=False: a missing plant name can never be a recognised header, and it
    # keeps the mask strictly boolean so it can be combined safely.
    recognised = df['plant_name_original'].str.contains(pattern, na=False)

    # Keep ordinary rows and recognised headers; drop the other flagged rows.
    kept = df[~flagged | recognised].drop(columns='header', errors='ignore').copy()

    kept_is_header = kept['is_header'].astype(bool)
    # Running count of headers: every row shares a number with the header above it.
    header_block = kept_is_header.cumsum()
    # Only header rows contribute a name; 'first' skips the NaN placeholders.
    header_names = kept['plant_name_ferc1'].where(kept_is_header)
    header = header_names.groupby(
        [kept['utility_id_ferc1'], kept['report_year'], header_block]
    ).transform('first')

    kept.insert(min(3, kept.shape[1]), 'header', header)
    return kept

In [348]:
def assign_header(df):
    """Add a ``header`` column holding the name of the group's header row.

    Args:
        df: One header group; needs a boolean ``is_header`` column and a
            ``plant_name_ferc1`` column.

    Returns:
        A copy of ``df`` with a constant ``header`` column. The value is the
        ``plant_name_ferc1`` of the single row flagged ``is_header``, or NaN
        when the group has no header row. The input is not modified.

    Raises:
        ValueError: If more than one row in the group is flagged as a header.
    """
    header_names = df.loc[df['is_header'], 'plant_name_ferc1']
    if len(header_names) > 1:
        raise ValueError('expected at most one header row per group')
    # Groups that start before any header has appeared have nothing to inherit.
    header = header_names.iloc[0] if len(header_names) == 1 else float('nan')
    return df.assign(header=header)

# header_groups only exists as a local inside assign_headers_all, so build it
# here to keep this cell runnable on a fresh kernel: drop flagged rows whose
# name has no header keyword, then group by utility, year and header block.
# NOTE(review): testB2 is assumed to be the sample behind the saved output
# below (utility 137) -- confirm.
_header_sample = testB2[
    ~(testB2['is_header']
      & ~testB2['plant_name_original'].str.contains('|'.join(header_strings), na=False))
]
header_groups = _header_sample.groupby(
    ['utility_id_ferc1', 'report_year', _header_sample['is_header'].cumsum()]
)
header_groups.apply(lambda x: assign_header(x))

Unnamed: 0,utility_id_ferc1,report_year,plant_name_original,construction_year,capacity_mw,peak_demand_mw,net_generation_mwh,total_cost_of_plant,capex_per_mw,opex_total,opex_fuel,opex_maintenance,fuel_type,fuel_cost_per_mmbtu,plant_name_ferc1,plant_type,ferc_license_id,record_id,header_type,is_header,header
350,137,1994,combustine turbine,,0.0,0.0,,,,,,,,,combustine turbine,,,f1_gnrt_plant_1994_12_137_0_1,good_headers,True,combustine turbine
352,137,1994,niles,1972.0,3.9,5.2,409200.0,609095.0,156178.0,4502.0,37415.0,7098.0,oil,4.49,niles,,,f1_gnrt_plant_1994_12_137_0_3,good_headers,False,combustine turbine
353,137,1994,edgewater,1973.0,8.1,6.2,392900.0,974880.0,12356.0,7722.0,40053.0,3591.0,oil,4.53,edgewater,,,f1_gnrt_plant_1994_12_137_0_4,good_headers,False,combustine turbine
354,137,1994,internal combustion,,0.0,0.0,,,,,,,,,internal combustion,,,f1_gnrt_plant_1994_12_137_0_7,good_headers,True,internal combustion
356,137,1994,w. h. sammis,1972.0,1.8,1.9,304400.0,225530.0,125294.0,9837.0,13443.0,7648.0,oil,3.72,w. h. sammis,,,f1_gnrt_plant_1994_12_137_0_9,good_headers,False,internal combustion
357,137,1994,r. e. burger,1972.0,1.1,1.2,259900.0,165242.0,150220.0,1943.0,11362.0,6815.0,oil,4.17,r. e. burger,,,f1_gnrt_plant_1994_12_137_0_10,good_headers,False,internal combustion
358,137,1994,new castle,1968.0,2.0,2.4,274000.0,255551.0,127776.0,1618.0,12196.0,7956.0,oil,4.12,new castle,,,f1_gnrt_plant_1994_12_137_0_11,good_headers,False,internal combustion
1729,137,1995,gas turbine,,0.0,0.0,,,,,,,,,gas turbine,,,f1_gnrt_plant_1995_12_137_0_1,good_headers,True,gas turbine
1731,137,1995,niles,1972.0,3.9,4.0,331600.0,607953.0,155885.0,4703.0,30479.0,921.0,oil,4.16,niles,,,f1_gnrt_plant_1995_12_137_0_3,good_headers,False,gas turbine
1732,137,1995,edgewater,1973.0,8.1,6.0,433100.0,947333.0,116955.0,176.0,37545.0,11768.0,oil,5.81,edgewater,,,f1_gnrt_plant_1995_12_137_0_4,good_headers,False,gas turbine


In [373]:
# Spot-check the flagged rows for one randomly chosen utility.
view_random_utility(sph)

Unnamed: 0,utility_id_ferc1,report_year,plant_name_original,construction_year,capacity_mw,peak_demand_mw,net_generation_mwh,total_cost_of_plant,capex_per_mw,opex_total,opex_fuel,opex_maintenance,fuel_type,fuel_cost_per_mmbtu,plant_name_ferc1,plant_type,ferc_license_id,record_id,header_type,is_header
155,85,1994,east hampton diesel,1962.0,6.0,6.0,1131.0,1544323.0,,,,,,,east hampton diesel,,,f1_gnrt_plant_1994_12_85_0_2,bad_headers,False
156,85,1994,montauk diesel,1961.0,6.0,7.0,722.0,1344676.0,224113.0,75243.0,19637.0,28957.0,oil,5.63,montauk diesel,,,f1_gnrt_plant_1994_12_85_0_7,bad_headers,False
1731,85,1995,east hampton diesel,1962.0,6.0,6.0,1053.0,936028.0,,,,,,,east hampton diesel,,,f1_gnrt_plant_1995_12_85_0_2,bad_headers,False
1732,85,1995,montauk diesel,1961.0,6.0,6.0,522.0,1751559.0,291927.0,40900.0,32451.0,39695.0,oil,,montauk diesel,,,f1_gnrt_plant_1995_12_85_0_7,bad_headers,False
2291,85,1996,east hampton diesel,1962.0,6.0,6.0,421000.0,1421852.0,236975.0,,,,,,east hampton diesel,,,f1_gnrt_plant_1996_12_85_0_2,bad_headers,False
2292,85,1996,montauk diesel,1961.0,6.0,6.0,342000.0,1350755.0,225126.0,48964.0,23698.0,49272.0,oil,5.23,montauk diesel,,,f1_gnrt_plant_1996_12_85_0_4,bad_headers,False
3427,85,1997,east hampton diesel,1962.0,6.0,6.0,421000.0,1421852.0,236975.0,,,,,,east hampton diesel,,,f1_gnrt_plant_1997_12_85_0_2,bad_headers,False
3428,85,1997,montauk diesel,1961.0,6.0,6.0,342000.0,1350755.0,225126.0,48964.0,23698.0,49272.0,oil,5.23,montauk diesel,,,f1_gnrt_plant_1997_12_85_0_4,bad_headers,False
4258,85,1998,east hampton diesel,1962.0,6.0,265.0,463000.0,1403560.0,5296.0,,,,,,east hampton diesel,,,f1_gnrt_plant_1998_12_85_0_2,bad_headers,False
4259,85,1998,with east hampton gas turbine page 403-b,,0.0,0.0,,,,,,,,,with east hampton gas turbine page 403-b,,,f1_gnrt_plant_1998_12_85_0_4,bad_headers,True
