In [None]:
#

## Setup

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# Standard libraries
import logging
import sys
import os
import pathlib
import random

# 3rd party libraries
import geopandas as gpd
import dask.dataframe as dd
from dask.distributed import Client
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report
import sqlalchemy as sa
#import pickle

# Local libraries
import pudl
#from pudl.analysis.fill_ferc1_fuel_gaps import *
#from pudl.analysis.flag_ferc1_totals import *

# Enable viewing of logging outputs
# Configure the root logger so that INFO-and-above messages are written to stdout.
logger=logging.getLogger()
logger.setLevel(logging.INFO)
handler = logging.StreamHandler(stream=sys.stdout)
# Emit only the message text (no timestamp, level, or logger name prefix).
formatter = logging.Formatter('%(message)s')
handler.setFormatter(formatter)
# Replace the handler list instead of appending to it, so re-running this cell
# does not attach a second handler and duplicate every log line.
logger.handlers = [handler]

# Display settings
sns.set()
%matplotlib inline
mpl.rcParams['figure.dpi'] = 75
pd.options.display.max_columns = 100
pd.options.display.max_rows = 100

In [3]:
# Establish connection to pudl database
# Connection strings are read from the default PUDL workspace settings.
pudl_settings = pudl.workspace.setup.get_defaults()
pudl_engine = sa.create_engine(pudl_settings['pudl_db'])
# NOTE(review): freq='AS' presumably requests annual (year-start) aggregation of
# the output tables -- confirm against the PudlTabl documentation.
pudl_out = pudl.output.pudltabl.PudlTabl(
    pudl_engine=pudl_engine,
    freq='AS'
)

# Second engine, for the database registered under the 'ferc1_db' setting. It is
# used to read raw tables directly with pd.read_sql.
ferc_engine = sa.create_engine(pudl_settings['ferc1_db'])

In [4]:
# Raw table, read in full from the ferc_engine database.
small_plants_raw = pd.read_sql("f1_gnrt_plant", ferc_engine)
# Small plants table as produced by the PudlTabl output object.
small_plants = pudl_out.plants_small_ferc1()#.dropna(subset=['plant_name_ferc1'])

In [5]:
# Here we create a fake raw dfs dictionary containing just the small plants df so that we
# can run it through the existing PUDL transform function for this table.
fake_dict = {'plants_small_ferc1': small_plants_raw}
new_dict = {}
small_plants_dict = pudl.transform.ferc1.plants_small(fake_dict, new_dict)
small_plants_out = small_plants_dict['plants_small_ferc1']

# Drop rows with no plant name because we can't use them. The index is not reset here,
# so the remaining rows keep the labels they had in the transform output.
small_plants_out = small_plants_out.dropna(subset=['plant_name_ferc1'])

  warn(msg)


## Constants

In [6]:
# If these columns are all NaN for a row, we can assume the row is either a header row
# or isn't useful
nan_cols = ['construction_year', 'net_generation_mwh', 'total_cost_of_plant', 'capex_per_mw', 'opex_total',
            'opex_fuel', 'opex_maintenance', 'fuel_cost_per_mmbtu']

# If a potential header row contains one of these strings, it's probably a useful header.
# Common misspellings are listed on purpose. The list is joined with '|' and used as a
# regular expression, so every entry must be a valid regex fragment.
header_strings = ['hydro', 'hyrdo', 'internal', 'wind', 'solar', 'gas', 'diesel', 'diesal',
                  'steam', 'other', 'combustion', 'combustine', 'fuel cell', 'hydraulic',
                  'waste', 'landfill', 'photovoltaic', 'nuclear', 'oil', 'renewable',
                  'facilities', 'combined cycle']

# If a potential header contains one of these strings, it is not a header...
# These are also joined with '|' into a regular expression, so the regex metacharacter
# '*' is escaped. Raw strings are used so the backslashes reach the regex engine without
# relying on Python passing unknown escape sequences through (which is deprecated).
exclude = ['#', r'\*', 'pg', 'solargenix', 'solargennix', r'\@', 'rockton', 'albany steam']

# ...unless the name is exactly one of these strings
exceptions = ['hydro plants: licensed proj. no.', 'hydro license no.',
              'hydro: license no.', 'hydro plants: licensed proj no.']

# What we will rename the headers to once we remove them as rows.
# Maps new label -> substrings that identify a header of that type.
# NOTE(review): 'combustine turbine' is listed in zane_header_labels but not here, even
# though 'combustine' is accepted by header_strings -- confirm whether it should be added.
new_header_labels = {
    'hydroelectric': ['hydro', 'hyrdo'],
    'internal combustion': ['internal', 'interal', 'international combustion'],
    'combustion turbine': ['combustion turbine'],
    'combined cycle': ['combined cycle'],
    'gas turbine': ['gas'],
    'petroleum liquids': ['oil', 'diesel', 'diesal'],
    'solar': ['solar', 'photovoltaic'],
    'wind': ['wind'],
    'geothermal': ['geothermal'],
    'waste': ['waste', 'landfill'],
    'steam': ['steam'],
    'nuclear': ['nuclear'],
    'fuel_cell': ['fuel cell']
}

# Header names that match the ones Zane used in his manual mapping (so we can
# compare processes)
zane_header_labels = {
    'solar_pv': ['solar', 'photovoltaic'],
    'wind': ['wind'],
    'hydro': ['hydro', 'hyrdo'],
    'internal_combustion': ['internal', 'interal', 'international combustion'],
    'combustion_turbine': ['combustion turbine', 'combustine turbine'],
    'combined_cycle': ['combined cycle'],
    'diesel_turbine': ['oil', 'diesel', 'diesal'],
    'gas_turbine': ['gas'],
    'geothermal': ['geothermal'],
    'waste_heat': ['waste', 'landfill'],
    'steam_heat': ['steam'],
    'nuclear': ['nuclear'],
    'fuel_cell': ['fuel cell']
}

## Clean Data

First let's remove some **obviously bad rows**. That includes: 
* Utilities that have reported NA values for all `nan_cols` for ALL PLANTS in a given year
* Rows consisting only of three or more dashes: `---`
* Rows with obvious NA plant names: `'', 'none', 'na', 'n/a', 'not applicable'`

In [7]:
# Remove utilities with all NAN rows because these won't contain anything meaningful
# spc = small_plants_clean
# A utility is kept unless every value in every nan_cols column is NA for all of its rows.
# `not` is used rather than `~`: `~` is only a logical NOT for numpy booleans, while on a
# plain Python bool it returns -1/-2, which are both truthy and would keep every group.
# NOTE(review): groups are formed per utility across all report years, not per
# utility-and-year -- confirm that this is the intended granularity.
spc = (
    small_plants_out
    .groupby('utility_id_ferc1')
    .filter(lambda x: not x[nan_cols].isna().all().all())
)
# Show what was removed: rows present in only one of the two frames
print('REMOVED NAN VALUES: \n')
pd.concat([small_plants_out,spc]).drop_duplicates(keep=False)[nan_cols].info()

REMOVED NAN VALUES: 

<class 'pandas.core.frame.DataFrame'>
Int64Index: 233 entries, 0 to 19552
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   construction_year    0 non-null      float64
 1   net_generation_mwh   0 non-null      float64
 2   total_cost_of_plant  0 non-null      float64
 3   capex_per_mw         0 non-null      float64
 4   opex_total           0 non-null      float64
 5   opex_fuel            0 non-null      float64
 6   opex_maintenance     0 non-null      float64
 7   fuel_cost_per_mmbtu  0 non-null      float64
dtypes: float64(8)
memory usage: 16.4 KB


In [8]:
# Drop rows whose plant name contains a run of dashes, then rows whose plant name is an
# empty string or an obvious NA placeholder.
dash_name = spc['plant_name_ferc1'].str.contains('---')
spc2 = spc[~dash_name].copy()

placeholder_name = spc2['plant_name_ferc1'].isin(['', 'none', 'na', 'n/a', 'not applicable'])
spc3 = spc2[~placeholder_name].reset_index(drop=True)

# Show what was removed: names of the rows that appear in spc but not in spc3
removed_name_counts = (
    pd.concat([spc, spc3])
    .drop_duplicates(keep=False)
    .plant_name_ferc1
    .value_counts()
)
print('REMOVED NAN NAMES:\n', removed_name_counts, '\n')

REMOVED NAN NAMES:
                                        183
none                                    81
------------------                      25
not applicable                          22
-------------------                     16
na                                       8
n/a                                      7
-----                                    3
-----------                              3
--------------------                     2
------------------------                 1
-------------------------                1
------------                             1
----------------                         1
-------------                            1
---------------------------------        1
-----------------------------------      1
Name: plant_name_ferc1, dtype: int64 



In [9]:
# Show total rows removed by the cleaning steps so far (difference in length between the
# transform output and spc3) along with the number of rows remaining
print(f'TOTAL ROWS REMOVED: {len(small_plants_out) - len(spc3)} rows. Current row total: {len(spc3)}')

TOTAL ROWS REMOVED: 590 rows. Current row total: 18969


Now let's find **clumps of notes** that can be removed. The notes are similar to the headers in that they don't contain any useful information in certain columns. At first, we'll lump them all together under `possible_header == True`, and then we'll tease out the ones that are note clumps vs. actual headers.

In [10]:
# Flag possible header rows: rows where every column in nan_cols is NA carry no plant
# data, so they may be headers, notes, or junk. `.filter(items=...)` silently skips any
# nan_cols that are missing from the frame.
is_possible_header = spc3.filter(items=nan_cols).isna().all(axis=1)

# Add the helper columns. Any existing copies are dropped first so that re-running this
# cell starts from a clean slate (a bare `insert` raises ValueError when the column
# already exists). header_type is created with object dtype because it will hold strings.
spc3 = spc3.drop(columns=['possible_header', 'header_type'], errors='ignore')
spc3.insert(3, 'possible_header', is_possible_header)
spc3.insert(3, 'header_type', pd.Series(np.nan, index=spc3.index, dtype='object'))

# Label good header rows (based on whether they contain key strings)
possible_header = spc3['possible_header']
good_header = spc3['plant_name_ferc1'].str.contains('|'.join(header_strings))
not_bad = ~spc3['plant_name_ferc1'].str.contains('|'.join(exclude))

spc3.loc[possible_header & good_header & not_bad, 'header_type'] = 'good_header'
# Names that exactly match a listed exception are good headers regardless of the checks above
spc3.loc[spc3['plant_name_ferc1'].isin(exceptions), 'header_type'] = 'good_header'

In [11]:
def create_groups(group, group_col):
    """Split a DataFrame into runs of consecutive, equal values in one column.

    Rows are grouped so that each group is a maximal run of adjacent rows that
    share the same value in ``group_col``. When used with ``possible_header``,
    each run is a clump of adjacent rows that are all possible headers (all
    values in the specified nan_cols are NA) or all not.

    Ex: If you pass in a df whose possible_header values are True, False, False,
    True, True, the summary DataFrame will contain
    ``{'header': [True, False, True], 'val_count': [1, 2, 2]}``.

    Args:
        group (pandas.DataFrame): The rows to split into runs (e.g. the rows of
            one utility and report year), in the order they should be scanned.
        group_col (str): The name of the column whose consecutive equal values
            define the runs.

    Returns:
        tuple: ``(header_groups, header_groups_df)`` where ``header_groups`` is
        the pandas GroupBy object over the runs (keys are run numbers starting
        at 1) and ``header_groups_df`` is a DataFrame with one row per run,
        indexed from 0, with columns ``header`` (the first ``group_col`` value
        of the run) and ``val_count`` (the number of non-NA ``group_col``
        values in the run). Row ``i`` of ``header_groups_df`` therefore
        describes the run stored under key ``i + 1`` in ``header_groups``.

    """
    values = group[group_col]

    # A new run starts wherever the value differs from the previous row's value. The
    # first row always starts a run because shift() leaves an NA in front of it. The
    # cumulative sum of these "run starts here" flags numbers the runs 1, 2, 3, ...
    run_id = (values.shift() != values).cumsum()
    header_groups = group.groupby(run_id, as_index=False)

    # Summarize each run by its (single) group_col value and its length.
    header_groups_df = header_groups.agg(
        header=(group_col, 'first'),
        val_count=(group_col, 'count'),
    )

    return header_groups, header_groups_df

In [12]:
def get_header_clumps_all(df):
    """Label clumps of consecutive possible-header rows as ``'clump'``.

    FERC has lots of note rows that are not headers but are also not useful for analysis.
    This function looks at rows flagged as possible headers (based on NAN values) and checks
    whether several of them appear in a row. A header row is (usually) a row with NAN values
    followed by rows without NAN values, so when more than one is clumped together they are
    likely either notes or not helpful. A single possible-header row at the very end of a
    utility/year group is treated the same way, because there are no value rows below it
    for it to be a header of.

    Sometimes a note clump ends with a meaningful header. For clumps that are not at the end
    of the group, the last row of the clump is left unlabeled if its plant name matches one
    of ``header_strings``. There is one known exception: a header that is followed by a plant
    with no reported values. There is no workaround for that case, so that header and plant
    are both labeled as a clump.

    Nothing is removed here; rows are only labeled, and the input frame is not modified.

    Args:
        df (pandas.DataFrame): Must contain the columns ``utility_id_ferc1``,
            ``report_year``, ``plant_name_ferc1``, ``possible_header`` and ``header_type``.

    Returns:
        pandas.DataFrame: The rows of ``df`` with ``header_type`` set to ``'clump'`` for
        every row identified as part of a note clump.

    """
    # The regex is the same for every clump, so build it once.
    header_pattern = '|'.join(header_strings)

    def get_header_clumps(util_year_group):
        """Label the note clumps within a single utility/year group."""
        # Work on a copy: pandas does not support mutating the object passed to apply().
        labeled = util_year_group.copy()

        # Collapse the group into runs of adjacent, equal possible_header values.
        # Ex: True, True, True, False, True, False, False becomes
        # True, False, True, False with a row count for each run.
        runs, run_summary = create_groups(labeled, 'possible_header')

        # Largest index label in the group; used to tell whether a clump sits at the end.
        max_df_val = labeled.index.max()

        # A run is only considered a clump if it is a possible-header run longer than 1 row.
        clump_positions = list(run_summary[
            (run_summary['header']) & (run_summary['val_count'] > 1)].index)

        # A single possible-header row at the very bottom is also flagged: it might be a
        # one-line note, and it can't be a useful header because nothing follows it.
        last_run = run_summary.tail(1)
        if last_run['header'].item() and last_run['val_count'].item() == 1:
            clump_positions = clump_positions + list(last_run.index)

        for pos in clump_positions:
            # run_summary is indexed from 0 while the run keys from create_groups start
            # at 1, hence the +1.
            clump_idx = runs.groups[pos + 1]

            # Sometimes a clump of notes is FOLLOWED by a useful header. Only clumps in
            # the middle of the group need checking; a row at the very end has no value
            # rows below it and so cannot be a meaningful header.
            is_middle_clump = clump_idx.max() < max_df_val
            is_good_header = (
                labeled.loc[labeled.index.isin(clump_idx)]
                .tail(1)['plant_name_ferc1']
                .str.contains(header_pattern, na=False)
                .all()
            )
            # "Unclump" the trailing row when it looks like a valid header.
            if is_middle_clump and is_good_header:
                clump_idx = [x for x in clump_idx if x != clump_idx.max()]

            labeled.loc[labeled.index.isin(clump_idx), 'header_type'] = 'clump'

        return labeled

    # group_keys=False keeps the original (flat) index on the result instead of
    # prepending the utility/year keys, which newer pandas versions do by default.
    return (
        df.groupby(['utility_id_ferc1', 'report_year'], group_keys=False)
        .apply(get_header_clumps)
    )

In [13]:
# Label note clumps (header_type == 'clump') within each utility / report year group
spc4 = get_header_clumps_all(spc3)

In [183]:
# EXAMPLE OF HOW SOME NOTE CLUMPS CONTAIN FERC LICENSE ID INFORMATION
# NOTE(review): `header_clumps` is not defined in any cell above this one, so this cell
# relies on state created elsewhere -- confirm where it comes from before running the
# notebook top to bottom.
# NOTE: only the last expression of a cell is displayed, so the result of the first
# line below (all rows labeled 'clump') is computed and then discarded.
header_clumps[header_clumps['header_type']=='clump']
header_clumps[(header_clumps['utility_id_ferc1'] == 115) & (header_clumps['report_year']==1994)]

Unnamed: 0,utility_id_ferc1,report_year,plant_name_ferc1,header_type,possible_header,construction_year,capacity_mw,peak_demand_mw,net_generation_mwh,total_cost_of_plant,capex_per_mw,opex_total,opex_fuel,opex_maintenance,fuel_type,fuel_cost_per_mmbtu,plant_name_clean,plant_type,ferc_license_id,record_id
0,115,1994,hydro,good_header,True,,0.0,0.0,,,,,,,,,hydro,,,f1_gnrt_plant_1994_12_115_0_1
1,115,1994,seneca falls (a),,False,1917.0,8.0,6.0,17695840.0,2342818.0,293.0,45573.0,,33766.0,,,seneca falls (a),,,f1_gnrt_plant_1994_12_115_0_2
2,115,1994,rainbow falls (c),,False,1926.0,2.64,2.8,17108000.0,2666266.0,1010.0,57909.0,,40082.0,,,rainbow falls (c),,,f1_gnrt_plant_1994_12_115_0_3
3,115,1994,cadyville (b),,False,1922.0,5.53,4.9,26108906.0,15002778.0,2713.0,55345.0,,140692.0,,,cadyville (b),,,f1_gnrt_plant_1994_12_115_0_4
4,115,1994,waterloo (a),,False,1915.0,1.92,1.3,5027300.0,1103365.0,575.0,71829.0,,13091.0,,,waterloo (a),,,f1_gnrt_plant_1994_12_115_0_5
5,115,1994,keuka (d),,False,1928.0,2.0,2.0,2079000.0,1198285.0,599.0,35313.0,,56379.0,,,keuka (d),,,f1_gnrt_plant_1994_12_115_0_6
6,115,1994,mill,,False,,1943.0,6.1,5.0,23683943.0,13908293.0,2299.0,49015.0,,77509,,mill,,,f1_gnrt_plant_1994_12_115_0_7
7,115,1994,internal combustion,good_header,True,,0.0,0.0,,,,,,,,,internal combustion,,,f1_gnrt_plant_1994_12_115_0_9
8,115,1994,milliken,,False,1967.0,5.5,5.6,254820.0,446541.0,81.0,31044.0,17404.0,6994.0,oil,5.48,milliken,,,f1_gnrt_plant_1994_12_115_0_10
9,115,1994,harris lake,,False,1967.0,1.75,1.3,73728.0,266708.0,152.0,14301.0,11428.0,52608.0,oil,6.58,harris lake,,,f1_gnrt_plant_1994_12_115_0_11


In [113]:
import random

def get_rand_group(groups=None):
    """Return the rows of one randomly chosen group.

    Args:
        groups (pandas GroupBy, optional): The grouped object to sample from. If
            omitted, the notebook-level ``util_groups`` variable is used, which
            must then already exist in the notebook namespace.

    Returns:
        The object returned by ``groups.get_group`` for a key picked uniformly
        at random from ``groups.groups``.

    """
    if groups is None:
        # Backward-compatible fallback to the global this helper originally relied on.
        groups = util_groups
    random_set = random.choice(list(groups.groups.keys()))
    return groups.get_group(random_set)