# Test the FERC analysis module

Whew! This new analysis module is a bit much, so I created this notebook to explain what's going on, document known quirks, etc.

## Setup

In [9]:
# Load the autoreload extension and switch it to mode 2 so that edits to
# imported modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [10]:
# Standard libraries
import logging
import sys
import os
import pathlib
import random

# 3rd party libraries
import geopandas as gpd
import dask.dataframe as dd
from dask.distributed import Client
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report
import sqlalchemy as sa
import re

# Local libraries
import pudl
from pudl.analysis.clean_up_ferc1 import *

# Show log messages in the cell outputs: everything at INFO level or above is
# written to stdout as the bare message text.
formatter = logging.Formatter('%(message)s')
handler = logging.StreamHandler(stream=sys.stdout)
handler.setFormatter(formatter)

logger = logging.getLogger()
logger.setLevel(logging.INFO)
# Replace (rather than append to) whatever handlers the root logger had.
logger.handlers = [handler]

In [11]:
# Look up the default PUDL workspace settings; they hold the database URLs.
pudl_settings = pudl.workspace.setup.get_defaults()

# One SQLAlchemy engine per database: the PUDL DB and the FERC1 DB.
pudl_engine = sa.create_engine(pudl_settings['pudl_db'])
ferc_engine = sa.create_engine(pudl_settings['ferc1_db'])

# PUDL output object built on top of the PUDL engine.
pudl_out = pudl.output.pudltabl.PudlTabl(pudl_engine=pudl_engine, freq='AS')

In [12]:
# Recreate the transform process for the small plants table.
small_plants_raw = pd.read_sql("f1_gnrt_plant", ferc_engine)
small_plants = pudl_out.plants_small_ferc1()

# Zane's existing transform function expects a dictionary of raw dataframes
# and a dictionary to collect the transformed ones, so hand it a fake raw
# dictionary that contains only the small plants table.
fake_dict = {'plants_small_ferc1': small_plants_raw}
new_dict = {}
small_plants_dict = pudl.transform.ferc1.plants_small(fake_dict, new_dict)

# Pull out the transformed table and drop the rows without a plant name,
# because we can't use those.
small_plants_out = (
    small_plants_dict['plants_small_ferc1']
    .dropna(subset=['plant_name_ferc1'])
)

  warn(msg)


In [14]:
# Helper function

def get_rand_group(groups):
    """Return the rows of one randomly selected group.

    Args:
        groups: a pandas groupby object (anything exposing ``.groups`` and
            ``.get_group``).

    Returns:
        The result of ``groups.get_group`` for a group key picked with
        ``random.choice``.
    """
    group_keys = list(groups.groups.keys())
    chosen_key = random.choice(group_keys)
    return groups.get_group(chosen_key)

In [32]:
# Columns shared by the preview tables in the cells below.
com_cols = [
    'record_id',
    'utility_id_ferc1',
    'report_year',
    'plant_name_ferc1',
    'row_type',
    'construction_year',
    'capacity_mw',
    'net_generation_mwh',
]

## Test Module

In [16]:
# Run full cleaning
%%time
test_full = clean_small_gens(small_plants_out)

CLEANING SMALL GENS TABLE...
Removing rows where an entire utility has reported NA in key columns
Removing rows with three or more dashes for plant name
Removing rows with NA for plant name
Labeling header rows
Labeling total rows
Labeling notes rows
Mapping header fuels to relevant rows
Getting fuel type from plant name
Extracting FERC license from plant name
Mapping notes and ferc license from footnotes


In [82]:
# Run the individual cleaning steps one at a time, keeping every intermediate
# result (test1 ... test5) so that the cells below can inspect each stage.
# NOTE: no %%time here -- as a cell magic it only works on the first line of a cell.
test1 = remove_bad_rows(small_plants_out, show_removed=True)
test2 = label_row_type(test1)
# TODO: come up with a way to check fuel type over time, maybe by plant id?
test3 = improve_plant_type(test2)
test4 = extract_ferc_license(test3)
test5 = associate_notes_with_values(test4)

UsageError: Line magic function `%%time` not found.


### Take a closer look at each of the steps...

#### LABELING ROW TYPE

This step identifies the "type" of row as either a total, note, header, or NA (regular row). 
Pay attention to the `row_type` column.

In [85]:
# Preview one randomly chosen utility/year chunk of the row-type labeling
# output. Re-run this cell to draw a different utility/year group.
group_keys = ['utility_id_ferc1', 'report_year']
util_groups = test2.groupby(group_keys)

random_chunk = get_rand_group(util_groups)
random_chunk[com_cols]

Unnamed: 0,record_id,utility_id_ferc1,report_year,plant_name_ferc1,row_type,construction_year,capacity_mw,net_generation_mwh
4820,f1_gnrt_plant_1999_12_95_0_1,95,1999,gas turbine,header,,0.0,
4821,f1_gnrt_plant_1999_12_95_0_2,95,1999,williston,,1953.0,7.8,75700.0


#### EXTRACTING PLANT TYPE FROM HEADERS AND PLANT NAME

This step extracts the plant type from the header rows as well as the plant name (when it's not a header) and puts it in the `plant_type_2` column (for comparison with Zane's `plant_type` column).

Known issues (listed as `utility_id_ferc1`, `report_year`):
* 161, 1994
* 29, 2000
* 170, 1996
* 44, 2003

In [87]:
print('\nCOMPARE PLANT TYPE FROM HEADERS WITH PLANT TYPE:')

# Existing plant type vs. the one derived in this analysis.
orig_type = test5['plant_type']
header_type = test5['plant_type_2']
both_known = header_type.notna() & orig_type.notna()

# Rows where the two columns agree, where both are filled in but disagree,
# and where only the new column has a value.
same_plt_type = test5[orig_type == header_type]
diff_plt_type = test5[both_known & (header_type != orig_type)]
new_plt_type = test5[orig_type.isna() & header_type.notna()]

print(
    f'''
    same plant type: {len(same_plt_type)}
    diff plant type: {len(diff_plt_type)} 
    new plant type: {len(new_plt_type)}''')

print("\nShow records where plant type differs")
plant_type_cols = com_cols + ['plant_type', 'plant_type_2']
display(diff_plt_type[plant_type_cols])


COMPARE PLANT TYPE FROM HEADERS WITH PLANT TYPE:

    same plant type: 4547
    diff plant type: 497 
    new plant type: 8787

Show records where plant type differs


Unnamed: 0,record_id,utility_id_ferc1,report_year,plant_name_ferc1,row_type,construction_year,capacity_mw,net_generation_mwh,plant_type,plant_type_2
8189,f1_gnrt_plant_2004_12_25_0_2,25,2004,st. albans,,1950.0,2.6,43.0,internal_combustion,diesel_turbine
8222,f1_gnrt_plant_2004_12_44_0_10,44,2004,* connors creek,,1971.0,5.5,-21.0,internal_combustion,steam_heat
8223,f1_gnrt_plant_2004_12_44_0_11,44,2004,*harbor beach,,1967.0,4.0,-45.0,internal_combustion,steam_heat
8224,f1_gnrt_plant_2004_12_44_0_12,44,2004,*st. clair,,1970.0,5.5,-654.0,internal_combustion,steam_heat
8263,f1_gnrt_plant_2004_12_61_0_13,61,2004,vergennes station # 9,,1963.0,4.0,606.0,internal_combustion,diesel_turbine
...,...,...,...,...,...,...,...,...,...,...
16713,f1_gnrt_plant_2016_12_281_0_5,281,2016,total diesel,total,,10.0,-182490.0,internal_combustion,diesel_turbine
16762,f1_gnrt_plant_2016_12_73_0_14,73,2016,twin branch,,2016.0,2.6,1388.0,hydro,solar_pv
16871,f1_gnrt_plant_2016_12_115_0_10,115,2016,auburn gas turbine,,2000.0,7.3,245.0,gas_turbine,internal_combustion
16968,f1_gnrt_plant_2016_12_161_1_24,161,2016,uc santa barbara fuel cell,,2012.0,0.2,1648683.0,fuel_cell,solar_pv


In [95]:
# Preview one randomly chosen utility/year chunk, restricted to the chunks
# that contain at least one labeled row (a non-null row_type).
# Re-run this cell to draw a different utility/year group.
group_keys = ['utility_id_ferc1', 'report_year']
util_groups = test3.groupby(group_keys)

rows_from_labeled_chunks = util_groups.filter(
    lambda chunk: chunk['row_type'].notna().any())
util_groups_with_row_types = rows_from_labeled_chunks.groupby(group_keys)

preview_cols = com_cols + ['plant_type', 'plant_type_2']
get_rand_group(util_groups_with_row_types)[preview_cols]

Unnamed: 0,record_id,utility_id_ferc1,report_year,plant_name_ferc1,row_type,construction_year,capacity_mw,net_generation_mwh,plant_type,plant_type_2
3055,f1_gnrt_plant_1997_12_69_0_1,69,1997,diesel plants:,header,,,,,diesel_turbine
3056,f1_gnrt_plant_1997_12_69_0_3,69,1997,centerville,,1963.0,6.0,-38000.0,,diesel_turbine
3057,f1_gnrt_plant_1997_12_69_0_4,69,1997,ames,,1960.0,2.0,2500.0,,diesel_turbine
3058,f1_gnrt_plant_1997_12_69_0_5,69,1997,marshalltown,,1941.0,5.0,-142020.0,,diesel_turbine
3059,f1_gnrt_plant_1997_12_69_0_7,69,1997,total diesel....................................,total,,13.0,-177520.0,,diesel_turbine
3060,f1_gnrt_plant_1997_12_69_0_13,69,1997,hydro plants:,header,,,,,hydro
3061,f1_gnrt_plant_1997_12_69_0_15,69,1997,anamosa,,1990.0,0.24,1050240.0,,hydro
3062,f1_gnrt_plant_1997_12_69_0_16,69,1997,maquoketa,,1924.0,1.36,5465000.0,,hydro
3063,f1_gnrt_plant_1997_12_69_0_17,69,1997,iowa falls,,1926.0,0.65,1548453.0,,hydro
3064,f1_gnrt_plant_1997_12_69_0_19,69,1997,total hydro.....................................,total,,2.25,8063693.0,,hydro


#### EXTRACTING FERC LICENSES AND ASSOCIATING NOTES WITH PROPER ROWS

This step extracts FERC licenses from the plant name and from notes, and associates both the FERC license and the note with the relevant row when possible. Pay attention to the `ferc_license`, `ferc_license_manual`, and `notes` columns.

Known issues (listed as `utility_id_ferc1`, `report_year`):
* 193, 2001
* 61, 2002

In [78]:
# Compare the manually mapped FERC licenses with the programmatically extracted ones.

# Rows with a manual license, and the subset of those that also got a
# programmatic license.
manual_lic = test5[test5['ferc_license_manual'].notna()]
prog_lic_of_manual = manual_lic[manual_lic['ferc_license'].notna()]
print(f'{round(len(prog_lic_of_manual) / len(manual_lic) * 100)}% of FERC licenses that were manually mapped are caught by the program')
print('Sample of manual licenses not grabbed by program')
display(manual_lic[manual_lic['ferc_license'].isna()].sample(2))

# Of the rows that have both licenses, how many agree?
same_lic = prog_lic_of_manual[prog_lic_of_manual['ferc_license_manual']==prog_lic_of_manual['ferc_license']]
# The agreeing rows are the numerator and the rows with both licenses the
# denominator. (The ratio used to be inverted, which reported >= 100% and
# divided by zero when no license matched.)
print(f'\n{round(len(same_lic) / len(prog_lic_of_manual) * 100)}% of manual licenses are the same as the programatically mapped ones!')
print('Cases where manual license differs from programatic license excluding NA values')
display(prog_lic_of_manual[prog_lic_of_manual['ferc_license_manual']!=prog_lic_of_manual['ferc_license']])

# Overall counts of manual vs. programmatic licenses.
prog_lic = test5[test5['ferc_license'].notna()]
print(f'\nThere are {len(manual_lic)} manually mapped licenses vs. {len(prog_lic)} programatically mapped licenses')

90% of FERC licenses that were manually mapped are caught by the program
Sample of manual licenses not grabbed by program


Unnamed: 0,utility_id_ferc1,report_year,plant_name_ferc1,row_type,construction_year,capacity_mw,peak_demand_mw,net_generation_mwh,total_cost_of_plant,capex_per_mw,...,opex_maintenance,fuel_type,fuel_cost_per_mmbtu,plant_name_clean,plant_type,ferc_license_manual,record_id,plant_type_2,ferc_license,notes
12726,61,2011,waterbury station # 22 a,,1953.0,5.52,0.0,24075.0,2359550.0,427455.0,...,109688.0,,,waterbury station,hydro,2090,f1_gnrt_plant_2011_12_61_0_8,hydro,,
9598,17,2006,marshall hydro,,1910.0,5.0,4.0,5357000.0,12177532.0,2435506.0,...,115743.0,,,marshall hydro,hydro,2380,f1_gnrt_plant_2006_12_17_0_1,hydro,,



100% of manual licenses are the same as the programatically mapped ones!
Cases where manual license differs from programatic license excluding NA values


Unnamed: 0,utility_id_ferc1,report_year,plant_name_ferc1,row_type,construction_year,capacity_mw,peak_demand_mw,net_generation_mwh,total_cost_of_plant,capex_per_mw,...,opex_maintenance,fuel_type,fuel_cost_per_mmbtu,plant_name_clean,plant_type,ferc_license_manual,record_id,plant_type_2,ferc_license,notes
9776,151,2006,rochester station #160 (note 3),,1916.0,0.34,0.0,,254356.0,748106.0,...,,hydro,,rochester station #160,hydro,2596,f1_gnrt_plant_2006_12_151_0_5,,160,
10472,151,2007,rochester station #160 (note 3),,1916.0,0.34,0.0,,,,...,,hydro,,rochester station #160,hydro,2596,f1_gnrt_plant_2007_12_151_0_5,,160,



There are 1842 manually mapped licenses vs. 4754 programatically mapped licenses


In [81]:
# Preview randomly chosen utility/year chunks of the final output: one chunk
# that contains a FERC license and one that contains a note.
# Re-run this cell to draw different utility/year groups.
group_keys = ['utility_id_ferc1', 'report_year']
license_cols = com_cols + ['ferc_license', 'ferc_license_manual', 'notes']

util_groups = test5.groupby(group_keys)

# Keep only the chunks with at least one license / at least one note, then
# regroup the surviving rows so a random chunk can be drawn from them.
rows_with_licenses = util_groups.filter(
    lambda chunk: chunk['ferc_license'].notna().any())
util_groups_with_licenses = rows_with_licenses.groupby(group_keys)

rows_with_notes = util_groups.filter(
    lambda chunk: chunk['notes'].notna().any())
util_groups_with_notes = rows_with_notes.groupby(group_keys)

# Show chunks with ferc licenses
print("HIGHLIGHT FERC LICENSES")
display(get_rand_group(util_groups_with_licenses)[license_cols])

# Show chunks with notes
print("HIGHLIGHT NOTES")
display(get_rand_group(util_groups_with_notes)[license_cols])

HIGHLIGHT FERC LICENSES


Unnamed: 0,record_id,utility_id_ferc1,report_year,plant_name_ferc1,row_type,construction_year,capacity_mw,net_generation_mwh,ferc_license,ferc_license_manual,notes
8528,f1_gnrt_plant_2004_12_195_0_1,195,2004,hydro: lic project no.,header,,0.0,,,,
8529,f1_gnrt_plant_2004_12_195_0_2,195,2004,caldron falls 2525,,1924.0,6.4,14872.0,2525.0,2525.0,
8530,f1_gnrt_plant_2004_12_195_0_3,195,2004,high falls 2595,,1910.0,7.0,18520.0,2595.0,2595.0,
8531,f1_gnrt_plant_2004_12_195_0_4,195,2004,johnson falls 2522,,1923.0,3.52,11715.0,2522.0,2522.0,
8532,f1_gnrt_plant_2004_12_195_0_5,195,2004,sandstone rapids 2546,,1925.0,3.84,12557.0,2546.0,2546.0,
8533,f1_gnrt_plant_2004_12_195_0_6,195,2004,potato rapids 2560,,1921.0,1.38,4335.0,2560.0,2560.0,
8534,f1_gnrt_plant_2004_12_195_0_7,195,2004,peshtigo 2581,,1920.0,0.59,3405.0,2581.0,2581.0,
8535,f1_gnrt_plant_2004_12_195_0_8,195,2004,otter rapids 1957,,1907.0,0.5,1589.0,1957.0,1957.0,
8536,f1_gnrt_plant_2004_12_195_0_9,195,2004,hat rapids *,,1905.0,1.7,7723.0,,,
8537,f1_gnrt_plant_2004_12_195_0_10,195,2004,tomahawk 1940,,1937.0,2.6,12276.0,1940.0,1940.0,


HIGHLIGHT NOTES


Unnamed: 0,record_id,utility_id_ferc1,report_year,plant_name_ferc1,row_type,construction_year,capacity_mw,net_generation_mwh,ferc_license,ferc_license_manual,notes
3490,f1_gnrt_plant_1997_12_211_0_1,211,1997,black creek hydroelectric project (1),,1994.0,3.7,10474200.0,6221,,(1) ferc project #6221
3491,f1_gnrt_plant_1997_12_211_0_46,211,1997,(1) ferc project #6221,note,,,,6221,,
