# Clean up the small generators FERC table

**TO DO:** 
* [ ] Add fuel type for all obvious name drops (and account for outliers)
* [ ] Create ferc id column
* [ ] Reconsile fuel_type with plant_type columns
* [ ] 

## Setup

In [1]:
%load_ext autoreload
%autoreload 2

In [143]:
# Standard libraries
import logging
import sys
import os
import pathlib
import random

# 3rd party libraries
import geopandas as gpd
import dask.dataframe as dd
from dask.distributed import Client
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report
import sqlalchemy as sa
import re

# Local libraries
import pudl
from pudl.analysis.clean_up_ferc1 import *

# Enable viewing of logging outputs
logger=logging.getLogger()
logger.setLevel(logging.INFO)
handler = logging.StreamHandler(stream=sys.stdout)
formatter = logging.Formatter('%(message)s')
handler.setFormatter(formatter)
logger.handlers = [handler]

In [144]:
# Establish connection to pudl database
pudl_settings = pudl.workspace.setup.get_defaults()
pudl_engine = sa.create_engine(pudl_settings['pudl_db'])
pudl_out = pudl.output.pudltabl.PudlTabl(
    pudl_engine=pudl_engine,
    freq='AS'
)

ferc_engine = sa.create_engine(pudl_settings['ferc1_db'])

In [146]:
small_plants_raw = pd.read_sql("f1_gnrt_plant", ferc_engine)
small_plants = pudl_out.plants_small_ferc1()

In [147]:
# Here we create a fake raw dfs dictionary with just the small plants df to run it through
# Zane's existing transform feature.
fake_dict = {'plants_small_ferc1': small_plants_raw}
new_dict = {}
small_plants_dict = pudl.transform.ferc1.plants_small(fake_dict, new_dict)
small_plants_out = small_plants_dict['plants_small_ferc1']

# drop rows with no plant name because we can't use that
small_plants_out = small_plants_out.dropna(subset=['plant_name_ferc1'])

  warn(msg)


## Constants

In [6]:
# If these columns are nan, we can assume it is either a header row or isn't useful
nan_cols = ['construction_year', 'net_generation_mwh', 'total_cost_of_plant', 'capex_per_mw', 'opex_total', 
            'opex_fuel', 'opex_maintenance', 'fuel_cost_per_mmbtu']

# If a potential header column has these strings, it's probably a useful header
header_strings = ['hydro', 'hyrdo', 'internal', 'wind', 'solar', 'gas', 'diesel', 'diesal', 
                  'steam', 'other', 'combustion', 'combustine', 'fuel cell', 'hydraulic', 
                  'waste', 'landfill', 'photovoltaic', 'nuclear', 'oil', 'renewable', 
                  'facilities', 'combined cycle']

# If a potential header has these strings, it is not a header...
exclude = ['#', '\*', 'pg', 'solargenix', 'solargennix', '\@', 'rockton', 'albany steam']

# ...unless it also has one of these strings
exceptions = ['hydro plants: licensed proj. no.', 'hydro license no.', 
              'hydro: license no.', 'hydro plants: licensed proj no.']

# plants with two fuel names
two_fuel_names_dict = {'las vegas solar': 'solar_pv', 'solar centaur': 'gas_turbine'}

# What we will rename the headers once we remove them as rows 
new_header_labels = {
    'hydroelectric': ['hydro', 'hyrdo'],
    'internal combustion': ['internal', 'interal', 'international combustion'],
    'combustion turbine': ['combustion turbine'],
    'combined cycle': ['combined cycle'],
    'gas turbine': ['gas'],
    'petroleum liquids': ['oil', 'diesel', 'diesal'],
    'solar': ['solar', 'photovoltaic'],
    'wind': ['wind'],
    'geothermal': ['geothermal'],
    'waste': ['waste', 'landfill'],
    'steam': ['steam'],
    'nuclear': ['nuclear'],
    'fuel_cell': ['fuel cell'],
    'other': ['other'],
    'renewables': ['renewables'],
}

# Header names that match the one's that zane used in his manual mapping (so we can 
# compare processes)
zane_header_labels = {
    'solar_pv': ['solar', 'photovoltaic'],
    'wind': ['wind'],
    'hydro': ['hydro', 'hyrdo'],
    'internal_combustion': ['internal', 'interal', 'international combustion', ],
    'combustion_turbine': ['combustion turbine', 'combustine turbine'],
    'combined_cycle': ['combined cycle'],
    'diesel_turbine': ['oil', 'diesel', 'diesal'],
    'gas_turbine': ['gas'],
    'geothermal': ['geothermal'],
    'waste_heat': ['waste', 'landfill'],
    'steam_heat': ['steam'],
    'nuclear': ['nuclear'],
    'fuel_cell': ['fuel cell'],
}

def expand_dict(dic):
    d = {}
    for k, lst in dic.items():    
        for i in range(len(lst)):
            d[lst[i]] = k
            #new_d = d.copy()
            #l.append(new_d)
    return d

import random

def get_rand_group(groups):
    random_set = random.choice(list(groups.groups.keys()))
    return groups.get_group(random_set)

## TEST MODULE 

In [112]:
%%time
test1 = remove_bad_rows(small_plants_out, show_removed=False)

Removing rows where an entire utility has reported NA in key columns
Removing rows with three or more dashes for plant name
Removing rows with NA for plant name
CPU times: user 278 ms, sys: 11.1 ms, total: 289 ms
Wall time: 334 ms


In [113]:
%%time
test2 = label_row_type(test1)

Labeling header rows
Labeling total rows
Labeling notes rows
CPU times: user 14.6 s, sys: 144 ms, total: 14.7 s
Wall time: 15.9 s


In [114]:
# TAKE A PEEK
util_groups = test2.groupby(['utility_id_ferc1', 'report_year'])
get_rand_group(util_groups)[['utility_id_ferc1', 'report_year', 'plant_name_ferc1', 'row_type', 'capacity_mw', 'net_generation_mwh', 'construction_year']]

Unnamed: 0,utility_id_ferc1,report_year,plant_name_ferc1,row_type,capacity_mw,net_generation_mwh,construction_year
5504,132,2000,hydro:,header,0.0,,
5505,132,2000,hoot lake,,1.0,3020960.0,1914.0
5506,132,2000,wright,,0.4,2937467.0,1922.0
5507,132,2000,pisgah,,0.52,5326362.0,1917.0
5508,132,2000,dayton hollow,,0.97,7713186.0,1909.0
5509,132,2000,taplin gorge,,0.56,4090800.0,1925.0
5510,132,2000,bemidji,,0.74,1624836.0,
5511,132,2000,internal combustion:,header,0.0,,
5512,132,2000,fergus control center,,1.83,23865.0,1995.0


In [115]:
%%time
test3 = improve_fuel_type(test2)

Mapping header fuels to relevant rows
Getting fuel type from plant name
CPU times: user 99.3 ms, sys: 7.81 ms, total: 107 ms
Wall time: 116 ms


In [117]:
# TAKE A PEEK
util_groups = test3.groupby(['utility_id_ferc1', 'report_year'])
get_rand_group(util_groups)[['utility_id_ferc1', 'report_year', 'plant_name_ferc1', 'row_type', 'fuel_type', 'construction_year']]

Unnamed: 0,utility_id_ferc1,report_year,plant_name_ferc1,row_type,fuel_type,construction_year
15707,121,2015,hydro,header,hydro,
15708,121,2015,apple river,,hydro,1901.0
15709,121,2015,cedar falls,,hydro,1910.0
15710,121,2015,menomonie,,hydro,1958.0
15711,121,2015,riverdale,,hydro,1905.0
15712,121,2015,trego,,hydro,1926.0
15713,121,2015,big falls,,hydro,1922.0
15714,121,2015,hayward,,hydro,1910.0
15715,121,2015,ladysmith,,hydro,1941.0
15716,121,2015,saxon falls,,hydro,1912.0


In [118]:
%%time
test4 = extract_ferc_license(test3)

Extracting FERC license from plant name
CPU times: user 102 ms, sys: 5.67 ms, total: 108 ms
Wall time: 114 ms


In [14]:
# TAKE A PEEK
util_groups = test4.groupby(['utility_id_ferc1', 'report_year'])
get_rand_group(util_groups)[['utility_id_ferc1', 'report_year', 'plant_name_ferc1', 'row_type', 'header_clean', 'fuel_type', 'ferc_license', 'ferc_license_manual']]

Unnamed: 0,utility_id_ferc1,report_year,plant_name_ferc1,row_type,header_clean,fuel_type,ferc_license,ferc_license_manual
17073,88,2017,brown solar,,,,,


In [119]:
%%time
test5 = associate_notes_with_values(test4)

Mapping notes and ferc license from footnotes
CPU times: user 3.29 s, sys: 30.2 ms, total: 3.32 s
Wall time: 3.44 s


In [102]:
# TAKE A PEEK
# weird: 
# 193	2001	
# not captured: 
# 61	2002	
util_groups = test5.groupby(['utility_id_ferc1', 'report_year'])
util_groups_with_notes = (
    util_groups.filter(lambda x: x['row_type'].str.contains('note').any())
    .groupby(['utility_id_ferc1', 'report_year']))
get_rand_group(util_groups_with_notes)[['utility_id_ferc1', 'report_year', 'plant_name_ferc1', 'row_type', 'ferc_license', 'ferc_license_manual', 'notes']]

Unnamed: 0,utility_id_ferc1,report_year,plant_name_ferc1,row_type,ferc_license,ferc_license_manual,notes
7121,61,2002,hydro,header,,,
7122,61,2002,middlesex station # 2,,,,
7123,61,2002,marshfield station # 6,,,,
7124,61,2002,vergennes station # 9 c,,,,
7125,61,2002,"w, danville station # 15",,,,
7126,61,2002,gorge station # 18,,,,
7127,61,2002,essex station # 19 b,,,,
7128,61,2002,waterbury station # 22 a,,,,
7129,61,2002,deforge station # 1 d,,,,
7130,61,2002,diesel,header,,,


In [137]:
print("PERCENT MANUAL LICENSES COVERED BY PROG:")
manual_lic = test5[test5['ferc_license_manual'].notna()]
prog_lic_of_manual = manual_lic[manual_lic['ferc_license'].notna()]
print(f'{round(len(prog_lic_of_manual) / len(manual_lic) * 100)}%')
display(manual_lic[manual_lic['ferc_license'].isna()].sample(5))

print("\nPERCENT MANUAL LICENSES SAME AS PROG:")
same_lic = prog_lic_of_manual[prog_lic_of_manual['ferc_license_manual']==prog_lic_of_manual['ferc_license']]
print(f'{round(len(prog_lic_of_manual) / len(same_lic) * 100)}%')
display(prog_lic_of_manual[prog_lic_of_manual['ferc_license_manual']!=prog_lic_of_manual['ferc_license']])

print('\nNUMBER OF MANUAL LICENSE VS PROG LICENSES:')
prog_lic = test5[test5['ferc_license'].notna()]
print(f'{len(manual_lic)} vs. {len(prog_lic)}')

PERCENT MANUAL LICENSES COVERED BY PROG:
90%


Unnamed: 0,utility_id_ferc1,report_year,plant_name_ferc1,row_type,construction_year,capacity_mw,peak_demand_mw,net_generation_mwh,total_cost_of_plant,capex_per_mw,opex_total,opex_fuel,opex_maintenance,fuel_type,fuel_cost_per_mmbtu,plant_name_clean,plant_type,ferc_license_manual,record_id,ferc_license,notes
12808,25,2011,middlebury lower,,1917.0,2.25,1.7,11225.0,3019446.0,1341976.0,110910.0,,41458.0,hydro,,middlebury lower,hydro,2737,f1_gnrt_plant_2011_12_25_0_13,,
9019,61,2005,waterbury station # 22 a,,1953.0,5.52,0.0,11271.0,1363016.0,246923.0,47784.0,,59541.0,hydro,,waterbury station,hydro,2090,f1_gnrt_plant_2005_12_61_0_8,,
10988,25,2008,cavendish,,1907.0,1.44,1.7,6666.0,1783040.0,1238222.0,57243.0,,114344.0,hydro,,cavendish,hydro,2489,f1_gnrt_plant_2008_12_25_0_10,,
15309,133,2014,potter valley ferc no.77,,1910.0,9.46,9.2,11286.0,35286843.0,3730110.0,3100876.0,,912103.0,hydro,,potter valley,hydro,77,f1_gnrt_plant_2014_12_133_0_17,,
9018,61,2005,essex station # 19 b,,1917.0,7.2,0.0,40934.0,8395411.0,749590.0,94441.0,,229487.0,hydro,,essex station hydro,hydro,2531,f1_gnrt_plant_2005_12_61_0_7,,



PERCENT MANUAL LICENSES SAME AS PROG:
100%


Unnamed: 0,utility_id_ferc1,report_year,plant_name_ferc1,row_type,construction_year,capacity_mw,peak_demand_mw,net_generation_mwh,total_cost_of_plant,capex_per_mw,opex_total,opex_fuel,opex_maintenance,fuel_type,fuel_cost_per_mmbtu,plant_name_clean,plant_type,ferc_license_manual,record_id,ferc_license,notes
9776,151,2006,rochester station #160 (note 3),,1916.0,0.34,0.0,,254356.0,748106.0,,,,,,rochester station #160,hydro,2596,f1_gnrt_plant_2006_12_151_0_5,160,
10472,151,2007,rochester station #160 (note 3),,1916.0,0.34,0.0,,,,,,,,,rochester station #160,hydro,2596,f1_gnrt_plant_2007_12_151_0_5,160,



NUMBER OF MANUAL LICENSE VS PROG LICENSES:
1842 vs. 4754


## 1) Remove Bad Data

First lets remove some **obviously bad rows**. That includes: 
* Utilities that have reported NA values for all `nan_cols` for ALL PLANTS in a given year
* Rows just comprised or three or more dashes: `---`
* Rows with obvious NA plant names: `'', 'none', 'na', 'n/a', 'not applicable'`

In [12]:
remove_bad_rows(small_plants_out, show_removed=True)

Removing rows where an entire utility has reported NA in key columns
Removing rows with three or more dashes for plant name
Removing rows with NA for plant name

 REMOVED NAN VALUES: 

<class 'pandas.core.frame.DataFrame'>
Int64Index: 234 entries, 0 to 20243
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   construction_year    0 non-null      float64
 1   net_generation_mwh   0 non-null      float64
 2   total_cost_of_plant  0 non-null      float64
 3   capex_per_mw         0 non-null      float64
 4   opex_total           0 non-null      float64
 5   opex_fuel            0 non-null      float64
 6   opex_maintenance     0 non-null      float64
 7   fuel_cost_per_mmbtu  0 non-null      float64
dtypes: float64(8)
memory usage: 16.5 KB
None 


 REMOVED DASH NAMES: 

 plant_name_ferc1                   
------------------                     25
-------------------                    16
-----           

Unnamed: 0,utility_id_ferc1,report_year,plant_name_ferc1,construction_year,capacity_mw,peak_demand_mw,net_generation_mwh,total_cost_of_plant,capex_per_mw,opex_total,opex_fuel,opex_maintenance,fuel_type,fuel_cost_per_mmbtu,plant_name_clean,plant_type,ferc_license_id,record_id
1,115,1994,hydro,,0.00,0.0,,,,,,,,,hydro,,,f1_gnrt_plant_1994_12_115_0_1
2,115,1994,seneca falls (a),1917.0,8.00,6.0,17695840.0,2342818.0,293.0,45573.0,,33766.0,,,seneca falls (a),,,f1_gnrt_plant_1994_12_115_0_2
3,115,1994,rainbow falls (c),1926.0,2.64,2.8,17108000.0,2666266.0,1010.0,57909.0,,40082.0,,,rainbow falls (c),,,f1_gnrt_plant_1994_12_115_0_3
4,115,1994,cadyville (b),1922.0,5.53,4.9,26108906.0,15002778.0,2713.0,55345.0,,140692.0,,,cadyville (b),,,f1_gnrt_plant_1994_12_115_0_4
5,115,1994,waterloo (a),1915.0,1.92,1.3,5027300.0,1103365.0,575.0,71829.0,,13091.0,,,waterloo (a),,,f1_gnrt_plant_1994_12_115_0_5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20432,193,2020,pine - 2486 (4),1922.0,3.20,4.2,23517900.0,7081132.0,2212854.0,78338.0,,163434.0,,,pine - 2486 (4),,,f1_gnrt_plant_2020_12_193_0_14
20433,193,2020,solar,,0.00,0.0,,,,,,,,,solar,,,f1_gnrt_plant_2020_12_193_0_16
20434,193,2020,solar now,2019.0,11.00,6.4,8325000.0,17649831.0,1604530.0,236831.0,,,,,solar now,,,f1_gnrt_plant_2020_12_193_0_17
20435,193,2020,other,,0.00,0.0,,,,,,,,,other,,,f1_gnrt_plant_2020_12_193_0_19


In [950]:
# Remove utilities with all NAN rows because these won't contain anything meaningful
# spc = small_plants_clean
spc = (
    small_plants_out
    .groupby('utility_id_ferc1').filter(lambda x: ~x[nan_cols].isna().all().all())
)
# Show what was removed
print('REMOVED NAN VALUES: \n')
pd.concat([small_plants_out,spc]).drop_duplicates(keep=False)[nan_cols].info()

REMOVED NAN VALUES: 

<class 'pandas.core.frame.DataFrame'>
Int64Index: 234 entries, 0 to 20243
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   construction_year    0 non-null      float64
 1   net_generation_mwh   0 non-null      float64
 2   total_cost_of_plant  0 non-null      float64
 3   capex_per_mw         0 non-null      float64
 4   opex_total           0 non-null      float64
 5   opex_fuel            0 non-null      float64
 6   opex_maintenance     0 non-null      float64
 7   fuel_cost_per_mmbtu  0 non-null      float64
dtypes: float64(8)
memory usage: 16.5 KB


In [966]:
bb = spc[spc['plant_name_ferc1'].str.contains('---')]
spc.drop(bb.index)

Unnamed: 0,utility_id_ferc1,report_year,plant_name_ferc1,construction_year,capacity_mw,peak_demand_mw,net_generation_mwh,total_cost_of_plant,capex_per_mw,opex_total,opex_fuel,opex_maintenance,fuel_type,fuel_cost_per_mmbtu,plant_name_clean,plant_type,ferc_license_id,record_id
1,115,1994,hydro,,0.00,0.0,,,,,,,,,hydro,,,f1_gnrt_plant_1994_12_115_0_1
2,115,1994,seneca falls (a),1917.0,8.00,6.0,17695840.0,2342818.0,293.0,45573.0,,33766.0,,,seneca falls (a),,,f1_gnrt_plant_1994_12_115_0_2
3,115,1994,rainbow falls (c),1926.0,2.64,2.8,17108000.0,2666266.0,1010.0,57909.0,,40082.0,,,rainbow falls (c),,,f1_gnrt_plant_1994_12_115_0_3
4,115,1994,cadyville (b),1922.0,5.53,4.9,26108906.0,15002778.0,2713.0,55345.0,,140692.0,,,cadyville (b),,,f1_gnrt_plant_1994_12_115_0_4
5,115,1994,waterloo (a),1915.0,1.92,1.3,5027300.0,1103365.0,575.0,71829.0,,13091.0,,,waterloo (a),,,f1_gnrt_plant_1994_12_115_0_5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20432,193,2020,pine - 2486 (4),1922.0,3.20,4.2,23517900.0,7081132.0,2212854.0,78338.0,,163434.0,,,pine - 2486 (4),,,f1_gnrt_plant_2020_12_193_0_14
20433,193,2020,solar,,0.00,0.0,,,,,,,,,solar,,,f1_gnrt_plant_2020_12_193_0_16
20434,193,2020,solar now,2019.0,11.00,6.4,8325000.0,17649831.0,1604530.0,236831.0,,,,,solar now,,,f1_gnrt_plant_2020_12_193_0_17
20435,193,2020,other,,0.00,0.0,,,,,,,,,other,,,f1_gnrt_plant_2020_12_193_0_19


In [968]:
test = spc[~spc['plant_name_ferc1'].str.contains('---')]
test.loc[:, 'plant_name_ferc1'] = 'hi'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


In [953]:
# Remove rows with --- or '' for names
aa = spc[~spc['plant_name_ferc1'].str.contains('---')]
spc1 = aa[~aa['plant_name_ferc1'].isin(['', 'none', 'na', 'n/a', 'not applicable'])].reset_index(drop=True)

# Show what was removed
print('REMOVED NAN NAMES:\n', pd.concat([spc, spc1]).drop_duplicates(keep=False).plant_name_ferc1.value_counts(), '\n')

REMOVED NAN NAMES:
                                        186
none                                    81
------------------                      25
not applicable                          23
-------------------                     16
na                                       8
n/a                                      7
-----                                    3
-----------                              3
--------------------                     2
------------------------                 1
-------------------------                1
------------                             1
----------------                         1
-------------                            1
---------------------------------        1
-----------------------------------      1
Name: plant_name_ferc1, dtype: int64 



In [26]:
# Show total rows removed
print(f'TOTAL ROWS REMOVED: {len(small_plants_out) - len(spc1)} rows. Current row total: {len(spc1)}')

TOTAL ROWS REMOVED: 595 rows. Current row total: 19842


## 2/3) Label Row Types: `header, note, total`

Instead of removing rows, now we're going to flag them as: 
* `header`: critical rows are NA and contains predefined header keywords
* `total`: critical rows are not NA and contains the word total
* `note`: critical rows are NA and there are multiple in a row for a given utility report year

We want to flag rather than get rid of these rows, because they contain useful information. For example, headers are often fuel types and notes sometimes contain ferc license ids.

Example of a note containing a FERC license:
* `spc4[spc4['row_type']=='clump']`
* `spc4[(spc4['utility_id_ferc1'] == 115) & (spc4['report_year']==1994)]`

In [27]:
# Add some new helper columns
spc2 = spc1.copy()
spc2.insert(3, 'possible_header', False)
spc2.insert(3, 'row_type', np.nan)

Notes are similar to the headers in that they don't contain any useful information in certain columns. At first, we'll lump them all together under `possible_header == True`, and then we'll tease out the ones that are note clumps vs actual headers.

In [28]:
# Label possible header rows (based on the nan cols specified above)
spc2.loc[spc2.filter(nan_cols).isna().all(1), 'possible_header'] = True

# Label good header rows (based on whether they contain key strings)
possible_header = spc2['possible_header']
good_header = spc2['plant_name_ferc1'].str.contains('|'.join(header_strings))
not_bad = ~spc2['plant_name_ferc1'].str.contains('|'.join(exclude))

spc2.loc[possible_header & good_header & not_bad, 'row_type'] = 'header'
spc2.loc[spc2['plant_name_ferc1'].isin(exceptions), 'row_type'] = 'header'

# Label total rows
spc2.loc[spc2['plant_name_ferc1'].str.contains('total'), 'row_type'] = 'total'

# What you'll see when you look at the values listed as headers and totals is
# that many of them are actually notes. These next kernals will help identify
# and label those clumps accordingly.
spc2[spc2['row_type']=='total'].plant_name_ferc1.value_counts()
spc2[spc2['row_type']=='header'].plant_name_ferc1.value_counts()

internal combustion                           297
hydro                                         237
internal combustion:                          185
hydro:                                        171
hydro plants:                                 111
                                             ... 
with e. hampton gas turbine on page 403.2.      1
other-leased:                                   1
other-steam expenses:                           1
gas turbines:                                   1
wind turbine plants:                            1
Name: plant_name_ferc1, Length: 143, dtype: int64

In [29]:
def create_groups(group, group_col):
    """Count groups of headers in a given utiltiy group.

    This function takes a utility group and regroups it by of rows where 
    possible_header = True (i.e.: all values in the specified nan_cols are NA)
    vs. False. Rows where possible_header = True can be bad data, headers, or notes. 
    The result is a DataFrame that contains one row per clump of similar adjecent
    possible_header values with columns val_col depicting the number of rows per
    possible_header clump.

    Ex: If you pass in a df with the possible_header values: True, False False, True, True,
    the header_groups output df will look like this: {'header':[True, False, True], 'val_col:
    [1, 2, 2]}.

    Args:
        group (pandas.DataFrameGroupBy): A groupby object that you'd like to condense by group_col.
        group_col (str): The name of the column you'd like to make sub groups from.

    Returns:
        pandas.DataFrame: A condensed version of that dataframe input grouped by
            breaks in fuel type over the years.

    """
    # Make groups based on consecutive sections where the group_col is alike.
    header_groups = group.groupby((group[f'{group_col}'].shift() !=
                        group[f'{group_col}']).cumsum(), as_index=False)
    
    # Identify the first (and only) group_col value for each group and count how many
    # rows are in each group.
    header_groups_df = header_groups.agg(header=(f'{group_col}', 'first'), val_count=(f'{group_col}', 'count'))

    return header_groups, header_groups_df

In [30]:
def get_header_clumps_all(df):
    """
    Remove clumps of consecutive rows flagged as possible headers.
    
    FERC has lots of note rows that are not headers but are also not useful for analysis.
    This function looks for rows flagged as possible headers (based on NAN values) and checks to
    see if there are multiple in a row. A header row is (usually) defined as a row with NAN values
    followed by rows without NAN values, so when there are more than one clumped together they are
    likely either notes or not helpful.
    
    Sometimes note clumps will end with a meaningful header. This function also checks for this and will
    unclump any headers at the bottom of clumps. There is one exception to this case which is a header that 
    is followed by a plant that had no values reported... Unfortunately I haven't built a work around,
    but hopefully there aren't very many of these. Currently, that header and plant will be categorized
    as clumps and removed.
    
    """
    util_groups = df.groupby(['utility_id_ferc1', 'report_year'])
    
    def get_header_clumps(util_year_group):
        
        # Create mini groups that count pockets of true and false for each utility and year
        # create_groups() is a function from the fill_ferc1_fuel_gaps module-- basically what
        # it does is create a df where each row represents a clump of adjecent, equal values for
        # a given column. Ex: a column of True, True, True, False, True, False, False, will
        # appear as True, False, True, False with value counts for each
        group, header_count = create_groups(util_year_group, 'possible_header')
        
        # These are used later to enable exceptions
        max_idx_val = header_count.index.max()
        max_df_val = util_year_group.index.max()
        
        # Create a list of the index values that comprise each of the header clumps
        # It's only considered a clump if it is greater than 1.
        idx_list = list(header_count[
            (header_count['header']) & (header_count['val_count'] > 1)].index)
        
        # If the last row is not a clump (i.e. there is just one value) but it is a header (i.e. has nan values)
        # then also include it in the index values to be flagged because it might be a one-liner note. And
        # because it is at the bottom there is no chance it can actually be a useful header because there are
        # no value rows below it.
        last_row = header_count.tail(1)
        if (last_row['header'].item()) & (last_row['val_count'].item()==1):
            idx_list = idx_list + list(last_row.index)
        # If there are any clumped/end headers:
        if idx_list:
            for idx in idx_list:
                # Check to see if last clump bit is not a header... sometimes you might find a clump of
                # notes FOLLOWED by a useful header. This next bit will check the last row in each of
                # the identified clumps and "unclump" it if it looks like a valid header. We only need
                # to check clumps that fall in the middle because, as previously mentioned, the last row
                # cannot contain any meaningful header information because there are no values below it.
                idx_range = group.groups[idx+1]
                is_middle_clump = group.groups[idx+1].max() < max_df_val
                is_good_header = util_year_group.loc[
                    util_year_group.index.isin(group.groups[idx+1])].tail(1)['plant_name_ferc1'].str.contains('|'.join(header_strings)).all()  #.isin(header_strings).all()
                # If the clump is in the middle and the last row looks like a header, then drop it from the idx range
                if is_middle_clump & is_good_header:
                    idx_range = [x for x in idx_range if x != idx_range.max()]
                # Label the clump as a clump
                util_year_group.loc[
                    util_year_group.index.isin(idx_range), 'row_type'] = 'note'
        return util_year_group
    
    return util_groups.apply(lambda x: get_header_clumps(x))

In [31]:
spc3 = get_header_clumps_all(spc2)

In [32]:
# Take a look at the rows that were labeled as notes. Looks like notes to me!
# We label rather than remove them because it's possible the might contain useful information
# such as ferc license number.

print('CLUMP PLANT NAMES: \n\n', spc3[spc3['row_type']=='note'].plant_name_ferc1.unique())
print('NON CLUMP OR HEADER PLANT NAMES: \n\n', spc3[~spc3['row_type'].isin(['note', 'header'])].plant_name_ferc1.unique().tolist())

CLUMP PLANT NAMES: 

 ['(a) project #2438' '(b) project #2738' '(c) project #2835'
 '(d) project #2852' 'license project: 2069'
 '(1) applicable to atlantic city' 'electrics share of jointly-'
 'owned facility.' 'note: generation provided in column (e)'
 'is in kilowatt-hours.' 'a-internal combustion unit'
 'b-combustion turbine unit' 'note: amounts per above are not'
 'included on pages 402, 403, and 403a' '(plant not yet in service)'
 'item 2. the generating unit is leased. the'
 'cost of plant represents the annual rental cost.'
 'ferc licensed project no. 2380' 'generator - hutsonville'
 '(1) operated on an emergency basis only.' 'a project # 2090'
 'b project # 2531' 'c project # 2674' 'd project # 2879'
 'cummins diesel #3' 'cummins diesel #4' 'cummins diesel #5'
 'internal combustion:' "yellowstone nat'l park:"
 '*(1) ferc licensed project no. 2582'
 '*(2) ferc licensed project no. 2584'
 '*(3) ferc licensed project no. 2596' '*(4) water for power'
 'other production' 'santa cat

In [34]:
# Now, if you take a look at the total and header rows they are a lot cleaner because we got rid of the notes!
clean_totals = spc3[spc3['row_type']=='total'].plant_name_ferc1.value_counts() # print if you want to see
clean_headers = spc3[spc3['row_type']=='header'].plant_name_ferc1.value_counts() # print if you want to see

# There are some recurring TOTAL rows that contain the plant name "(amounts are for the total of..."
# If you look here, however, only two of them contain any useful information (i.e., they're notes not totals)
num_cols = [x for x in spc3.select_dtypes(include=['float', 'Int64']).columns.tolist() if x not in ['utility_id_ferc1', 'report_year', 'ferc_license_id']]
test = spc3[spc3['plant_name_ferc1'].str.contains('amounts are for')]
print("'AMOUNTS ARE FOR'...ROWS WITH NON ZERO/NA VALUES")
display(test[test[num_cols] > 0].dropna(how='all')) # print this out if you want to see for yourself

# Lets take that one row with helpful information and copy it to the one above (where it should be)
print('\nTOTAL ROW CONTAINS INFORMATION THAT SHOULD BE IN THE ROW ABOVE')
display(spc3.iloc[8293:8295]) # the info from this total row belongs above!
spc3.loc[[8293],num_cols] = spc3.loc[8294][num_cols].values # move the information up
print('\nMOVED TOTAL VALUES TO TOTAL ROW')
display(spc3.iloc[8293:8295]) # look at this to see that both rows now have the same numeric information

# Now we can confidently call all rows with "(amounts are for the total of..." NOTE rather than TOTAL
spc3.loc[spc3['plant_name_ferc1'].str.contains('amounts are for'), 'row_type'] = 'note'

'AMOUNTS ARE FOR'...ROWS WITH NON ZERO/NA VALUES


Unnamed: 0,utility_id_ferc1,report_year,plant_name_ferc1,row_type,possible_header,construction_year,capacity_mw,peak_demand_mw,net_generation_mwh,total_cost_of_plant,capex_per_mw,opex_total,opex_fuel,opex_maintenance,fuel_type,fuel_cost_per_mmbtu,plant_name_clean,plant_type,ferc_license_id,record_id
8290,,,,,,1942.0,,,,,,,,,,,,,,
8294,,,,,,2004.0,160.5,160.5,704000.0,156897178.0,977553.0,136296.0,,,,,,,,



TOTAL ROW CONTAINS INFORMATION THAT SHOULD BE IN THE ROW ABOVE


Unnamed: 0,utility_id_ferc1,report_year,plant_name_ferc1,row_type,possible_header,construction_year,capacity_mw,peak_demand_mw,net_generation_mwh,total_cost_of_plant,capex_per_mw,opex_total,opex_fuel,opex_maintenance,fuel_type,fuel_cost_per_mmbtu,plant_name_clean,plant_type,ferc_license_id,record_id
8293,210,2004,intrepid wind farm (107 units @ 1.5 mw each),,True,,0.0,0.0,,,,,,,,,intrepid wind farm (107 units @ 1.5 mw each),,,f1_gnrt_plant_2004_12_210_0_7
8294,210,2004,(amounts are for the total of all 107 units),total,False,2004.0,160.5,160.5,704000.0,156897178.0,977553.0,136296.0,,,,,intrepid wind farm,wind,,f1_gnrt_plant_2004_12_210_0_8



MOVED TOTAL VALUES TO TOTAL ROW


Unnamed: 0,utility_id_ferc1,report_year,plant_name_ferc1,row_type,possible_header,construction_year,capacity_mw,peak_demand_mw,net_generation_mwh,total_cost_of_plant,capex_per_mw,opex_total,opex_fuel,opex_maintenance,fuel_type,fuel_cost_per_mmbtu,plant_name_clean,plant_type,ferc_license_id,record_id
8293,210,2004,intrepid wind farm (107 units @ 1.5 mw each),,True,2004.0,160.5,160.5,704000.0,156897178.0,977553.0,136296.0,,,,,intrepid wind farm (107 units @ 1.5 mw each),,,f1_gnrt_plant_2004_12_210_0_7
8294,210,2004,(amounts are for the total of all 107 units),total,False,2004.0,160.5,160.5,704000.0,156897178.0,977553.0,136296.0,,,,,intrepid wind farm,wind,,f1_gnrt_plant_2004_12_210_0_8


In [35]:
# There are some rows that looks like they could be headers or totals, but this shows that
# they all have information in their respective rows are are therefore correctly designated as TOTALS.
spc3[(spc3['plant_name_ferc1'].str.contains('total')) & (spc3['plant_name_ferc1'].str.contains(':'))]

Unnamed: 0,utility_id_ferc1,report_year,plant_name_ferc1,row_type,possible_header,construction_year,capacity_mw,peak_demand_mw,net_generation_mwh,total_cost_of_plant,capex_per_mw,opex_total,opex_fuel,opex_maintenance,fuel_type,fuel_cost_per_mmbtu,plant_name_clean,plant_type,ferc_license_id,record_id
13267,161,2012,total hydro:,total,False,,74.3,74.0,198391500.0,164383888.0,61522031.0,8425540.0,,3346929.0,,,total hydro:,,,f1_gnrt_plant_2012_12_161_0_36
13285,161,2012,total solar photovoltaic:,total,False,,21.0,21.0,32142750.0,108241238.0,68033111.0,3987417.0,,,,,total solar photovoltaic:,solar_pv,,f1_gnrt_plant_2012_12_161_1_7
16267,193,2016,total:,total,False,,395.3,0.0,1025813000.0,844115518.0,49347843.0,3830812.0,,10384670.0,,,total:,,,f1_gnrt_plant_2016_12_193_0_20
17052,193,2017,total:,total,False,,398.1,0.0,1089549000.0,839018896.0,60062594.0,4499639.0,,9175033.0,,,total:,,,f1_gnrt_plant_2017_12_193_0_21
18098,193,2018,total:,total,False,,587.2,0.0,934953000.0,856253120.0,61444114.0,5582861.0,,9102533.0,,,total:,,,f1_gnrt_plant_2018_12_193_0_20


## 4) Apply Header Fuel Type to Relevant Rows

In [39]:
spc4 = spc3.copy()

# Clean header names
spc4['header_clean'] = np.nan
d = expand_dict(zane_header_labels)

# Map cleaned header names onto df in a new column
spc4.loc[spc4['row_type']=='header', 'header_clean'] = (
    spc4['plant_name_ferc1'].str.extract(fr"({'|'.join(d.keys())})", expand=False).map(d))

# See what wasn't cleaned up
spc4[(spc4['row_type']=='header') & (spc4['header_clean'].isna())].plant_name_ferc1.value_counts()

other                                           25
hydraulic:                                      15
other:                                           8
lewiston canal facilities:                       7
hydraulic                                        6
renewables:                                      5
other general ops. supervision & engineering     3
other production:                                2
hydraulic (1):                                   1
other-leased:                                    1
renewables                                       1
Name: plant_name_ferc1, dtype: int64

In [40]:
# Make groups based on utility, year, and header
header_groups = spc4.groupby(['utility_id_ferc1', 'report_year', (spc4['row_type']=='header').cumsum()])

# Forward fill based on headers
spc4['fuel_type'] = np.nan
spc4.loc[spc4['row_type']!='note', 'fuel_type'] = header_groups.header_clean.ffill()

In [440]:
## COME UP WITH A WAY TO CHECK FUEL TYPE OVER TIME MAYBE BY PLANT ID?

In [41]:
# 161	1994
# 29	2000 # forked up....

# Use this to see how well it worked!
util_groups = spc4.groupby(['utility_id_ferc1', 'report_year'])
get_rand_group(util_groups)[['plant_name_ferc1', 'row_type', 'header_clean', 'fuel_type', 'construction_year']]

Unnamed: 0,plant_name_ferc1,row_type,header_clean,fuel_type,construction_year
7328,steam,header,steam_heat,steam_heat,
7329,"kauai, hawaii:",,,steam_heat,1969.0
7330,internal combustion,header,internal_combustion,internal_combustion,
7331,"kauai, hawaii",,,internal_combustion,1964.0
7332,"kauai, hawaii",,,internal_combustion,1991.0
7333,"newport, vermont:",,,internal_combustion,1948.0
7334,turbine,,,internal_combustion,
7335,"valencia, az",,,internal_combustion,1989.0
7336,hydro,header,hydro,hydro,
7337,"charleston, vermont",,,hydro,1928.0


## 5) Fill-in Obvioius Fuel Types (just hydro for now)

In [42]:
spc5 = spc4.copy()

# Check for non-labeled hydro in name
non_labeled_hydro = spc5[(spc5['fuel_type']!='hydro') & (spc5['row_type']!='note') & (spc5['plant_name_ferc1'].str.contains('hydro'))].plant_name_ferc1.value_counts()

# Fill in hydro
not_note = spc5['row_type'] != 'note'
contains_hydro = spc5['plant_name_ferc1'].str.contains('hydro')
spc5.loc[not_note & contains_hydro, 'fuel_type'] = 'hydro'

In [43]:
# THERE ARE STILL LOTS OF KINKS TO WORK OUT SO JUST DOING HYDRO FOR NOW

# # label all "obvious" plant types
# fuel_dict = expand_dict(zane_header_labels)
# not_note = spc5['row_type'] != 'note'
# in_keys = spc5['plant_name_ferc1'].str.contains('|'.join(fuel_dict.keys()), regex=True)
# in_two_fuel_keys = spc5['plant_name_ferc1'].str.contains('|'.join(two_fuel_names_dict.keys()), regex=True)
# not_in_exceptions = ~spc5['plant_name_ferc1'].str.contains('windsor rd|gaston|sc-etwind')

# # replace obvious
# spc5.loc[not_note & in_keys & not_in_exceptions, 'fuel_type'] = (
#     spc5['plant_name_ferc1'].str.extract(fr"({'|'.join(fuel_dict.keys())})", expand=False).map(fuel_dict))

# # replace outliers
# spc5.loc[not_note & in_two_fuel_keys, 'fuel_type'] = (
#     spc5['plant_name_ferc1'].str.extract(fr"({'|'.join(two_fuel_names_dict.keys())})", expand=False).map(two_fuel_names_dict))

In [46]:
# Take a look
util_groups = spc5.groupby(['utility_id_ferc1', 'report_year'])
get_rand_group(util_groups)[['plant_name_ferc1', 'row_type', 'header_clean', 'fuel_type', 'construction_year']]

Unnamed: 0,plant_name_ferc1,row_type,header_clean,fuel_type,construction_year
15936,solar plants,header,solar_pv,solar_pv,
15937,newman pv system,,,solar_pv,2009.0
15938,rio grande pv system,,,solar_pv,2009.0
15939,wrangler cpv system,,,solar_pv,2011.0
15940,stanton pv system,,,solar_pv,2012.0
15941,el paso community college pv system,,,solar_pv,2012.0
15942,van horn pv system,,,solar_pv,2013.0
15943,total solar,total,,solar_pv,


## 6) Associate Notes and Add FERC ID Column

In [673]:
def extract_ferc_lic_from_name(df):

    """Extract FERC license number from plant_name_ferc1 and make new column.
    
    Many of FERC license numbers are embedded in the plant_name_ferc1 field, whether
    thats a note row or an actual plant name. This function extracts those license 
    numbers and puts them in a new column.
    
    """
    # Extract all numbers greater than 2 digits from plant_name_ferc1 and put then in a new column as integers.
    # Rename manually collected FERC id column to reflect that.
    df = (
        df.assign(
            ferc_license=lambda x: x.plant_name_ferc1.str.extract('(\d{3,})').astype('float').astype('Int64'),
            ferc_license_id=lambda x: x.ferc_license_id.astype('Int64'))
        .rename(columns={'ferc_license_id': 'ferc_license_manual'}))
    
    
    # Not all of these 3+ digit numbers are FERC licenses. Some are dates, dollar amounts, page numbers, or numbers
    # of wind turbines. These next distinctions help to weed out the non-licesnse values and keep the good ones.
    obvious_license = df.plant_name_ferc1.str.contains('no\.|license|ferc|project', regex=True)
    not_license = df.plant_name_ferc1.str.contains('page|pg|\$|wind|nonutility|units|surrendered', regex=True)
    exceptions = df.plant_name_ferc1.str.contains('tomahawk|otter rapids|wausau|alexander|hooksett|north umpqua', regex=True)
    year_vs_num = (df['ferc_license'] > 1900) & (df['ferc_license'] < 2050)
    not_hydro = ~df.plant_type.isin(['hydro', np.nan]) # figure this one out.....
    extracted_license = df.ferc_license.notna()
    
    # Replace all the non-license numbers with nan
    df.loc[extracted_license & not_hydro, 'ferc_license'] = pd.NA # figure this one out.....
    extracted_license = df.ferc_license.notna() # reset
    df.loc[extracted_license & not_license, 'ferc_license'] = pd.NA
    extracted_license = df.ferc_license.notna() #reset
    df.loc[extracted_license & year_vs_num & ~obvious_license & ~exceptions, 'ferc_license'] = pd.NA

    df['ferc_license'] = df.ferc_license.astype('Int64') # figure out how not to do this twice....
    
    return df

In [826]:
spc6 = extract_ferc_lic_from_name(spc5)

Now we have to get the license numbers that appear in notes! These values are a few lines below the actual row they apply to.

In [41]:
def associate_footnotes_and_license_with_values(group):
    """The """
    
    footnote_pattern = r'(\(\d?[a-z]?[A-Z]?\))'
    regular_row = group['row_type'].isna()
    has_note = group['row_type']=='note'
    has_footnote = group.plant_name_ferc1.str.contains(footnote_pattern)
    
    # Create footnote column
    
    
    # Shorten execution time by only looking at groups with discernable footnotes
    if group.footnote.any():
        print('found')
        # Create new footnote column
        group.loc[:, 'footnote'] = group.plant_name_ferc1.str.extract(footnote_pattern, expand=False)

        # Make a df that combines notes and ferc license with the same footnote value.
        footnote_df = (
            group[has_note]
            .groupby('footnote')
            .agg({'plant_name_ferc1': ', '.join,
                  'ferc_license': 'first'})
            .rename(columns={'plant_name_ferc1': 'notes'}))

        # Map these new licnese and note values onto the original df
        updated_ferc_license_col = group.footnote.map(footnote_df['ferc_license'])
        notes_col = group.footnote.map(footnote_df['notes'])
        group.ferc_license.update(updated_ferc_license_col)
        group['notes'] = notes_col

    return group

In [929]:
%%time
spc6['notes'] = pd.NA
groups = spc6.groupby(['report_year', 'utility_id_ferc1'])
spc6 = groups.apply(lambda x: associate_footnotes_and_license_with_values(x))

  has_footnote = group.plant_name_ferc1.str.contains(footnote_pattern)


CPU times: user 3.89 s, sys: 34.1 ms, total: 3.92 s
Wall time: 3.95 s


In [931]:
# 57	2020	
test_groups = spc6.groupby(['utility_id_ferc1', 'report_year'])
test = get_rand_group(test_groups)[['utility_id_ferc1', 'report_year', 'plant_name_ferc1', 'row_type', 'ferc_license', 'notes', 'footnote', 'note']]

In [932]:
spc6

Unnamed: 0,utility_id_ferc1,report_year,plant_name_ferc1,row_type,possible_header,construction_year,capacity_mw,peak_demand_mw,net_generation_mwh,total_cost_of_plant,capex_per_mw,opex_total,opex_fuel,opex_maintenance,fuel_type,fuel_cost_per_mmbtu,plant_name_clean,plant_type,ferc_license_manual,record_id,header_clean,ferc_license,notes,footnote,note
0,115,1994,hydro,header,True,,0.00,0.0,,,,,,,hydro,,hydro,,,f1_gnrt_plant_1994_12_115_0_1,hydro,,,,
1,115,1994,seneca falls (a),,False,1917.0,8.00,6.0,17695840.0,2342818.0,293.0,45573.0,,33766.0,hydro,,seneca falls (a),,,f1_gnrt_plant_1994_12_115_0_2,,2438,(a) project #2438,(a),(a) project #2438
2,115,1994,rainbow falls (c),,False,1926.0,2.64,2.8,17108000.0,2666266.0,1010.0,57909.0,,40082.0,hydro,,rainbow falls (c),,,f1_gnrt_plant_1994_12_115_0_3,,2835,(c) project #2835,(c),(c) project #2835
3,115,1994,cadyville (b),,False,1922.0,5.53,4.9,26108906.0,15002778.0,2713.0,55345.0,,140692.0,hydro,,cadyville (b),,,f1_gnrt_plant_1994_12_115_0_4,,2738,(b) project #2738,(b),(b) project #2738
4,115,1994,waterloo (a),,False,1915.0,1.92,1.3,5027300.0,1103365.0,575.0,71829.0,,13091.0,hydro,,waterloo (a),,,f1_gnrt_plant_1994_12_115_0_5,,2438,(a) project #2438,(a),(a) project #2438
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19837,193,2020,pine - 2486 (4),,False,1922.0,3.20,4.2,23517900.0,7081132.0,2212854.0,78338.0,,163434.0,hydro,,pine - 2486 (4),,,f1_gnrt_plant_2020_12_193_0_14,,2486,,(4),
19838,193,2020,solar,header,True,,0.00,0.0,,,,,,,solar_pv,,solar,,,f1_gnrt_plant_2020_12_193_0_16,solar_pv,,,,
19839,193,2020,solar now,,False,2019.0,11.00,6.4,8325000.0,17649831.0,1604530.0,236831.0,,,solar_pv,,solar now,,,f1_gnrt_plant_2020_12_193_0_17,,,,,
19840,193,2020,other,header,True,,0.00,0.0,,,,,,,,,other,,,f1_gnrt_plant_2020_12_193_0_19,,,,,
