# Playing with the FERC small generators table

To Do: 
* [ ] extract headers from names col
* [ ] label totals fromnames col
* [x] remove other extraneous values

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# Standard libraries
import logging
import sys
import os
import pathlib
import random

# 3rd party libraries
import geopandas as gpd
import dask.dataframe as dd
from dask.distributed import Client
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import pandas as pd
import seaborn as sns
import sqlalchemy as sa
#import pickle

# Local libraries
import pudl
from pudl.analysis.fill_ferc1_fuel_gaps import *
from pudl.analysis.flag_ferc1_totals import *

# Enable viewing of logging outputs
logger=logging.getLogger()
logger.setLevel(logging.INFO)
handler = logging.StreamHandler(stream=sys.stdout)
formatter = logging.Formatter('%(message)s')
handler.setFormatter(formatter)
logger.handlers = [handler]

# Display settings
sns.set()
%matplotlib inline
mpl.rcParams['figure.dpi'] = 75
pd.options.display.max_columns = 100
pd.options.display.max_rows = 100

In [3]:
# Establish connection to pudl database
pudl_settings = pudl.workspace.setup.get_defaults()
pudl_engine = sa.create_engine(pudl_settings['pudl_db'])
pudl_out = pudl.output.pudltabl.PudlTabl(
    pudl_engine=pudl_engine,
    freq='AS'
)

ferc_engine = sa.create_engine(pudl_settings['ferc1_db'])

In [4]:
small_plants_raw = pd.read_sql("f1_gnrt_plant", ferc_engine)
small_plants = pudl_out.plants_small_ferc1()#.dropna(subset=['plant_name_ferc1'])

In [None]:
# View random utility groups! (thanks trenton)
def view_random_utility(df):
    df = df.reset_index(drop=True)
    util_groups = df.groupby(['utility_id_ferc1'])
    random_utility = random.choice(list(util_groups.groups.keys()))
    return df.iloc[util_groups.groups[random_utility]]

#### Use Zane's transformations without dropping the NA rows...

In [680]:
# This is to run the raw df through the transform function to see if the header get dropped -- they dont

fake_dict = {'plants_small_ferc1': small_plants_raw}
new_dict = {}
small_plants_dict = pudl.transform.ferc1.plants_small(fake_dict, new_dict)
small_plants_out = small_plants_dict['plants_small_ferc1']

# drop rows with no plant name because we can't use that
small_plants_out = small_plants_out.dropna(subset=['plant_name_original'])

  warn(msg)


#### Clean it up a bit...

In [901]:
# If these columns are nan, we can assume it is either a header row or isn't useful
nan_cols = ['construction_year', 'net_generation_mwh', 'total_cost_of_plant', 'capex_per_mw', 'opex_total', 
            'opex_fuel', 'opex_maintenance', 'fuel_cost_per_mmbtu']

# Remove utilities with all NAN rows
spc = (
    small_plants_out
    .groupby('utility_id_ferc1').filter(lambda x: ~x[nan_cols].isna().all().all())
)

# Remove rows with ---------- or '' for names
spc = spc[~spc['plant_name_original'].str.contains('------')]
spc = spc[~spc['plant_name_original'].isin(['', 'none', 'na', 'n/a', 'not applicable'])]
spc = spc.reset_index(drop=True)

#### Flag total rows...

In [879]:
spc.insert(3, 'is_total', False)
spc.loc[spc['plant_name_original'].str.contains('total'), 'is_total'] = True

#### Flag possible headers...

In [880]:
# If a potential header column has these strings, it's probably a useful header
header_strings = ['hydro', 'hyrdo', 'internal', 'wind', 'solar', 'gas', 'diesel', 'diesal', 'steam', 'other',
                  'combustion', 'combustine', 'fuel cell', 'hydraulic', 'waste', 'landfill', 'photovoltaic', 
                  'nuclear', 'oil', 'renewable', 'facilities']

exclude = ['license', '#', '\*', 'pg', 'solargenix', '\@']

exceptions = ['hydro plants: licensed proj. no.', 'hydro license no.', 'hydro: license no.', 'hydro plants: licensed proj no.']

In [881]:
# Add some new columns
spc.insert(3, 'is_header', False)
spc.insert(3, 'header_type', np.nan)

# Label possible header rows
spc.loc[spc.filter(nan_cols).isna().all(1), 'is_header'] = True

# Label good header rows
is_header = spc['is_header']
is_good_header = spc['plant_name_original'].str.contains('|'.join(header_strings))
not_bad = ~spc['plant_name_original'].str.contains('|'.join(exclude))


spc.loc[is_header & is_good_header & not_bad, 'header_type'] = 'good_header'
spc.loc[spc['plant_name_original'].isin(exceptions), 'header_type'] = 'good_header'

In [882]:
# Not using RN
# See if all entries have headers! 
def does_it_have_headers(util_group, nan_cols):
    """See if there are header rows in each utility group.
    
    If there are headers, ideally there should be at least one per year. Headers can be characterized
    as rows where all of the obvious value columns are NA (depicted as a list called nan_cols). So long
    as every utility has more of these NA rows than years reported, we can assume that they have header
    rows.
    """
    num_report_years = len(util_group.report_year.unique())
    num_nan_rows = len(util_group[util_group.filter(nan_cols).isna().all(1)])
    
    if num_nan_rows == 0:
        output = 'no_headers'
    elif num_nan_rows > (num_report_years - 1):
        output = 'good_headers'
    else:
        output = 'bad_headers'
    util_group.insert(3, 'header_type', output)
    
    return util_group

#### Find header clumps and flag them...

In [883]:
from pudl.analysis.fill_ferc1_fuel_gaps import * 

def get_header_clumps_all(df):
    util_groups = df.groupby(['utility_id_ferc1', 'report_year'])
    
    def get_header_clumps(util_year_group):
        # Create mini groups that count pockets of true and false
        group, header_count = create_groups(util_year_group, 'is_header')
        max_idx_val = header_count.index.max()
        max_df_val = util_year_group.index.max()
        # Create a list of the index values of clumps of headers
        idx_list = list(header_count[
            (header_count['fuel']) & (header_count['val_count'] > 1)].index)
        # If the last row is not a clump (i.e. there is just one value) but it is a header (i.e. has nan values)
        # then also include it in the index values to be flagged because it might be a one-liner note
        last_row = header_count.tail(1)
        if (last_row['fuel'].item()) & (last_row['val_count'].item()==1):
            idx_list = idx_list + list(last_row.index)
        # If there are any clumped/end headers:
        if idx_list:
            for idx in idx_list:
                idx_range = group.groups[idx+1]
                # Check to see if last clump bit is not a header...
                is_middle_clump = group.groups[idx+1].max() < max_df_val
                is_good_header = util_year_group.loc[
                    util_year_group.index.isin(group.groups[idx+1])].tail(1)['plant_name_original'].str.contains('|'.join(header_strings)).all()  #.isin(header_strings).all()
                # If the clump is in the middle and the last row looks like a header, then drop it from the idx range
                if is_middle_clump & is_good_header:
                    idx_range = [x for x in idx_range if x != idx_range.max()]
                # Label the clump as a clump
                util_year_group.loc[
                    util_year_group.index.isin(idx_range), 'header_type'] = 'clump'
        return util_year_group
    
    return util_groups.apply(lambda x: get_header_clumps(x))

In [884]:
header_clumps = get_header_clumps_all(spc)

#### Assign headers to the groups...

In [None]:
# wierd ones: 
# - utility id 3 year 1995/6
# - utility id 120 year 2011
# - utility id 29 year 2002
# - utility id 224 has rows accidentally split into two...
# 120 has a problem where something gets wrongly categorized as internal combusition
# 155 problem two clumps one is header one just happens to have no values which makes downstream rows badly categorized!

In [791]:
# Group by utility and year
testA = header_clumps[header_clumps['utility_id_ferc1']==74].copy()
testA2 = testA.head(15)

testB = header_clumps[header_clumps['utility_id_ferc1']==137].copy()
testB2 = testB.head(18)

In [937]:
def assign_headers_all(df):
    # Start by dropping the clumps
    df = df.drop(df[(df['is_header']) & (df['header_type']=='clump')].index).reset_index(drop=True)
    
    # Turn good headers into booleans
    df['is_good_header'] = False
    df.loc[df['header_type']=='good_header', 'is_good_header'] = True

    # Now make groups based on the year and whether there is a header
    header_groups = df.groupby(['utility_id_ferc1', 'report_year', df['is_good_header'].cumsum()])

    # Now assign that header to the rows that follow and remove the row
    def assign_header(util_year_group):
        header = util_year_group[util_year_group['is_good_header']]['plant_name_ferc1'].item()
        util_year_group.insert(3, 'header', header)
        
        return df

    out = header_groups.apply(lambda x: assign_header(x))
    
    return out

In [903]:
# List of things that don't make the cut as headers
list(header_clumps[(header_clumps['is_header']) & (header_clumps['header_type'].isna())].head(30).plant_name_original.unique())

['ferc licensed project number',
 'lanai city plant',
 'miki basin plant',
 'molokai',
 'cummins diesel #6',
 'pumping station:',
 'mammoth pool',
 'main gate house:',
 "u.s. gov't. lock #7",
 'fuel storage tanks',
 'welding training school',
 'environmental',
 'main gatehouse:',
 'santa catalina island',
 '* mammoth pool',
 'leased plants',
 'refineries:',
 'pumping plant',
 'reservoirs']

In [904]:
# List of acceptable headers
li = list(header_clumps[header_clumps['header_type']=='good_header']['plant_name_original'].unique())
li.sort()
li

['additional hydro plant',
 'albany steam hydro expense',
 'auke bay internal combustion',
 'auke bay internal combustion:',
 'black start diesel units',
 'black start diesel units:',
 'combustine turbine',
 'combustion turbine',
 'diesal',
 'diesel',
 'diesel plant',
 'diesel plants',
 'diesel plants:',
 'diesel:',
 'dinner lake gas',
 'fuel cell',
 'fuel cell:',
 'gas plant',
 'gas turbine',
 'gas turbine plant',
 'gas turbine plants',
 'gas turbine:',
 'gas turbines',
 'gas turbines:',
 'gold creek internal combustion:',
 'hydraulic',
 'hydraulic (1):',
 'hydraulic:',
 'hydro',
 'hydro -',
 'hydro electric',
 'hydro lic project no.',
 'hydro license no.',
 'hydro plant:',
 'hydro plants',
 'hydro plants:',
 'hydro plants: licensed proj no.',
 'hydro plants: licensed proj. no.',
 'hydro-electric',
 'hydro:',
 'hydro: lic project no.',
 'hydro: lic. project no.',
 'hydro: license no.',
 'hydroelectric',
 'hydroelectric generating plants',
 'hydroelectric generating plants :',
 'hydroe

In [900]:
view_random_utility(header_clumps)
header_clumps[header_clumps['plant_name_original'].str.contains('not applicable')]

Unnamed: 0,utility_id_ferc1,report_year,plant_name_original,header_type,is_header,is_total,construction_year,capacity_mw,peak_demand_mw,net_generation_mwh,total_cost_of_plant,capex_per_mw,opex_total,opex_fuel,opex_maintenance,fuel_type,fuel_cost_per_mmbtu,plant_name_ferc1,plant_type,ferc_license_id,record_id
70,20,1994,not applicable,clump,True,False,,0.0,0.0,,,,,,,,,not applicable,,,f1_gnrt_plant_1994_12_20_0_1
454,182,1994,not applicable,clump,True,False,,0.0,0.0,,,,,,,,,not applicable,,,f1_gnrt_plant_1994_12_182_0_1
549,203,1994,not applicable,clump,True,False,,0.0,0.0,,,,,,,,,not applicable,,,f1_gnrt_plant_1994_12_203_0_1
807,32,1994,note: columns (d) and (l) are not applicable.,clump,True,False,,0.0,0.0,,,,,,,,,note: columns (d) and (l) are not applicable.,,,f1_gnrt_plant_1994_12_32_0_11
950,32,1995,note: columns (d) and (l) are not applicable,clump,True,False,,0.0,0.0,,,,,,,,,note: columns (d) and (l) are not applicable,,,f1_gnrt_plant_1995_12_32_0_12
1299,182,1995,not applicable,clump,True,False,,0.0,0.0,,,,,,,,,not applicable,,,f1_gnrt_plant_1995_12_182_0_1
1642,203,1995,not applicable,clump,True,False,,0.0,0.0,,,,,,,,,not applicable,,,f1_gnrt_plant_1995_12_203_0_1
2244,203,1996,not applicable,clump,True,False,,,,,,,,,,,,not applicable,,,f1_gnrt_plant_1996_12_203_0_1
2863,203,1997,not applicable,clump,True,False,,,,,,,,,,,,not applicable,,,f1_gnrt_plant_1997_12_203_0_1
3327,182,1997,not applicable,clump,True,False,,,,,,,,,,,,not applicable,,,f1_gnrt_plant_1997_12_182_0_1


In [943]:
#with_headers = assign_headers_all(header_clumps)

def assign_header(util_year_group):
    header = util_year_group[util_year_group['is_header']]['plant_name_ferc1'].item()
    util_year_group.insert(3, 'header', header)
    util_year_group

    return df

df = header_clumps.copy()

df['is_good_header'] = False
df.loc[df['header_type']=='good_header', 'is_good_header'] = True
df.groupby(['utility_id_ferc1', 'report_year', df['is_header'].cumsum()])['is_good_header'].agg(list).reset_index()

Unnamed: 0,utility_id_ferc1,report_year,is_header,is_good_header
0,2,2017,3667,"[True, False, False]"
1,2,2018,3833,"[True, False, False]"
2,2,2019,3988,"[True, False, False]"
3,3,1994,123,[False]
4,3,1994,124,"[True, False, False, False, False, False]"
...,...,...,...,...
4681,432,2019,4032,"[False, False]"
4682,522,2017,3741,[False]
4683,522,2018,3894,[False]
4684,522,2019,4040,"[False, False]"


In [938]:
assign_headers_all(header_clumps)

ValueError: can only convert an array of size 1 to a Python scalar