# **Setting Up RMI Depreciation Flags for FERC 1 Data**

### Setup

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# Standard libraries
import logging
import sys
import os
import pathlib

# 3rd party libraries
import geopandas as gpd
import dask.dataframe as dd
from dask.distributed import Client
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import pandas as pd
import seaborn as sns
import sqlalchemy as sa

# Local libraries
import pudl

In [3]:
# Enable viewing of logging outputs
logger=logging.getLogger()
logger.setLevel(logging.INFO)
handler = logging.StreamHandler(stream=sys.stdout)
formatter = logging.Formatter('%(message)s')
handler.setFormatter(formatter)
logger.handlers = [handler]

In [4]:
# Display settings
sns.set()
%matplotlib inline
mpl.rcParams['figure.dpi'] = 150
pd.options.display.max_columns = 100
pd.options.display.max_rows = 10

In [5]:
# Establish connection to pudl database
pudl_settings = pudl.workspace.setup.get_defaults()
pudl_engine = sa.create_engine(pudl_settings['pudl_db'])
pudl_out = pudl.output.pudltabl.PudlTabl(
    pudl_engine=pudl_engine,
    freq='AS'
)

In [179]:
# testing
def mini_df(df, pid, year):
    df1 =  df[df['plant_id_pudl']==pid] 
    
    if 'report_year' in df.columns:
        return df1[df1['report_year']==year]
    elif 'report_date' in df.columns:
        return df1[df1['report_date'].dt.year==year]

## **Step 1:** Create a better technology type

### **Step 1.1:** Merge steam table with fbp table
PUDL already merges the steam and fuel tables in the fbp (fuel_by_plant) table. This table classifies each plant's fuel type based on fuel cost and fuel consumed by each plant by each fuel type. The fuel type with the highest percentage (according to a given threshold) is deemed the "primary" fuel source.

Here, we merge the fbp table with the steam to get a new table that includes the primary fuel types and the steam fields excluded from the fbp table.

In [197]:
# Generate the tables we'll be working with
fbp = pudl_out.fbp_ferc1()
steam = pudl_out.plants_steam_ferc1()
plants = pudl_out.plants_eia860()
gens = pudl_out.gens_eia860()
net_gen = pudl_out.gen_eia923()

# Specify some column groupings for later
ferc_id_cols = ['report_year', 'utility_id_pudl', 'plant_name_ferc1']
plant_id_cols = ['report_year', 'utility_id_pudl', 'plant_id_pudl']
gen_id_cols = ['report_date', 'plant_id_pudl', 'generator_id']

In [215]:
# Merge fbp and steam tables
fbp_sample = fbp[ferc_id_cols + ['primary_fuel_by_mmbtu', 'primary_fuel_by_cost']]
steam_better_fuel = pd.merge(steam, fbp_sample, on=ferc_id_cols, how='left').drop_duplicates()

In [199]:
print('rows with no fuel / total rows')
print(len(steam_better_fuel[steam_better_fuel['primary_fuel_by_mmbtu'].isna()]), '/', len(steam_better_fuel))

rows with no fuel / total rows
7581 / 29270


In [217]:
# Extend primary plant fuel type designation to all rows in a given plant-year group
steam_better_fuel['primary_plant_fuel_type'] = steam_better_fuel['primary_fuel_by_mmbtu'].astype('string')
steam_better_fuel['primary_plant_fuel_type'] = (
    steam_better_fuel
    .groupby(['report_year', 'plant_id_pudl'])['primary_plant_fuel_type']
    .transform(lambda x: x.max())
    .replace([''], pd.NA)
)

In [10]:
print('rows with no fuel / total rows')
print(len(steam_better_fuel[steam_better_fuel['primary_plant_fuel_type'].isna()]), '/', len(steam_better_fuel))

rows with no fuel / total rows
5655 / 29270


### **Step 1.2:** Merge steam with EIA

In [243]:
# Combine generation from 860 with generation from 923 to get fuel type and net generation
net_gen_fuel = pd.merge(gens, net_gen[gen_id_cols+['net_generation_mwh']], on=gen_id_cols, how='outer')

In [137]:
# Aggregate gen by plant and get primary plant fuel type based on net generation

# Get plant net generation sums
net_gen_fuel['plant_net_gen'] = (
    net_gen_fuel.groupby(['report_date', 'plant_id_pudl'])['net_generation_mwh'].transform('sum')
)

# Calculate the portion of net generation that each row is responsible for
net_gen_fuel['gen_pct'] = (
    net_gen_fuel['net_generation_mwh'] / net_gen_fuel['plant_net_gen'] * 100
)

# Calculate the sum portion of net generation attributable to each fuel type 
net_gen_fuel_sum = (
    net_gen_fuel.groupby(['report_date', 'plant_id_pudl', 'fuel_type_code_pudl'])['gen_pct'].sum().reset_index()
)

# Keep the fuel type rows with the maximum portion for each plant
net_gen_fuel_sum = (
    net_gen_fuel_sum.loc[net_gen_fuel_sum.groupby(['report_date', 'plant_id_pudl'])['gen_pct'].idxmax()]
)

# Only keep the rows where the maximum portion is greater than 50
net_gen_fuel_50 = (
    net_gen_fuel_sum[net_gen_fuel_sum['gen_pct']>50].copy()
    .rename(columns={'fuel_type_code_pudl': 'primary_fuel_type_eia'})
    .assign(report_year=lambda x: x.report_date.dt.year)
    .drop(['report_date', 'gen_pct'], axis=1)
)

In [233]:
# Combine steam table with EIA plant primary fuel type table
steam_better_fuel_eia = pd.merge(steam_better_fuel, net_gen_fuel_50, on=['report_year', 'plant_id_pudl'], how='left')

# Add EIA primary fuels to the primary_plant_fuel_type column where null
steam_better_fuel_eia['primary_plant_fuel_type'] = steam_better_fuel_eia['primary_plant_fuel_type'].fillna(steam_better_fuel_eia['primary_fuel_type_eia'])

In [242]:
print('rows with no fuel / total rows')
print(len(steam_better_fuel_eia[steam_better_fuel_eia['primary_plant_fuel_type'].isna()]), '/', len(steam_better_fuel_eia))

rows with no fuel / total rows
5158 / 29270


In [186]:
# Cases where EIA and FERC fuel type don't match up perfectly 
test = steam_better_fuel[['report_year', 'plant_id_pudl', 'primary_fuel_by_mmbtu']]
tt = pd.merge(test, net_gen_fuel_50, on=['report_year', 'plant_id_pudl'], how='outer')
dd = tt[(tt['primary_fuel_type_eia'].notna()) & (tt['primary_fuel_by_mmbtu'].notna())].copy()
dd['diff'] = dd['primary_fuel_type_eia'] != dd['primary_fuel_by_mmbtu'].copy()
dl = dd[dd['diff']==True]

#### Things to do still: 
- historic back-filling
- check fuel table transform to see if any values are dropped / changed
- add old eia923 years
- look at Jon's detailed list of changes to fill these

In [46]:
#list(no_fuel_df['plant_id_pudl'].unique())

In [50]:
#no_fuel_df[no_fuel_df['plant_id_pudl']==530]

In [69]:
test = ferc_eia_fuel[ferc_eia_fuel['plant_id_pudl']==530]
#test[test['primary_fuel_by_mmbtu'].notna()]


## **Step 2:** Create flags for rows that represent TOTALS
Create flag columns for the steam table that notes which rows should be included when aggregating different data fields for each utility. These fields are: capacity, generation, number of employees, original cost, operation expenses.


The plan is to develop a flag to note whether a line is the whole plant or the owned portion

**Flags:**
- utility owned total
- unit total
- plant total
- combustion turbine extra

### Functions

In [15]:
def flag_totals(steam_table):
    """Preliminarily mark the rows with total indicated in the plant name."""
    regex1 = r'(?i)tot[a-z]*' # Find anything that would resembles TOTAL in the plant name
    regex2 = '100%' # Find all names with 100% in them
    regex3 = 'ttl ' # the space at the end is important
    regex4 = r'\(all' # find any names with '(all' in them
    
    steam_table['total'] = steam_table['plant_name_ferc1'].str.contains('|'.join([regex1, regex2, regex3, regex4]))
    ser_no_octo = steam_table[steam_table['plant_name_ferc1'].str.contains('octotillo')].total == False
    steam_table['total'].update(ser_no_octo) # remove octotillio from total = True because it accidentally fits in the 'tot' regex
    
    return steam_table

In [16]:
def flag_plant_totals(df, col_name):
    
    def is_plant_total(row):
        if 'total plant' in row:
            return 'plant total'
        elif 'plant total' in row:
            return 'plant total'
        elif 'total plt' in row:
            return 'plant total'
        elif 'ttl plt' in row:
            return 'plant total'
        elif 'tot. plt.' in row:
            return 'plant total'
        elif '100%' in row:
            return 'plant total'
        else:
            return None
    
    df[col_name] = df.apply(lambda x: is_plant_total(x.plant_name_ferc1), axis=1)
    
    return df

In [17]:
def backfill_years_by_capacity(df, col_name, replace, replace_with):
    """Backfill rows based on capacity."""
    
    only_totals_df = df[df[col_name]==replace_with]
    plant_groups = only_totals_df.groupby('plant_id_pudl')
    capacity_dict = plant_groups['capacity_mw'].apply(lambda x: [x for x in list(x.unique()) if x !=0]).to_dict() # no zeros
    plants_with_totals_list = list(capacity_dict.keys())
    
    for plant_id in plants_with_totals_list:
        for capacity in capacity_dict[plant_id]:
            one_plant_df = df[df['plant_id_pudl']==plant_id].copy()
            cap_match_df = one_plant_df[one_plant_df['capacity_mw'].isin(capacity_dict[plant_id])]
            series_update = cap_match_df[col_name].replace({replace: replace_with})
            df[col_name].update(series_update) 
            
    return df

In [18]:
def categorize_bad_rows(df, f_list): # could probably make this faster...
    """Flag bad rows."""
    for fix_dict in f_list:
        for year in fix_dict['years']:
            df.loc[df['record_id']==f"f1_steam_{year}{fix_dict['id_suffix']}", 'total_type'] = fix_dict['total_type']
    
    return df

In [19]:
# Fix-a-dic
# need to append ferc1_steam_YEAR_id_suffix
# categories:
# - plant total
# - unit total
# - utility owned total
# - combustion turbine extra

fix_list = [
    # Rockport AEP
    {'id_suffix': '_12_1_0_3', 'total_type': 'utility owned total', 'years': range(1994,2020)}, #pudl id 530
    # Rockport IMP
    {'id_suffix': '_12_73_1_3', 'total_type': 'utility owned total', 'years': range(1994,1997)}, #pudl id 530
    {'id_suffix': '_12_73_0_3', 'total_type': 'utiltiy owned total', 'years': range(1997,2020)}, # pudl id 530
    # Amos APC
    {'id_suffix': '_12_6_0_3', 'total_type': 'plant total', 'years': range(1994,2002)}, #pudl id 16
    # Conesville 4 - Columbus Southern Power Company then Ohio Power Company then AEP
    {'id_suffix': '_12_31_0_3', 'total_type': 'unit total', 'years': range(1994,2011)}, # pudl id 128
    {'id_suffix': '_12_127_4_1', 'total_type': 'unit total', 'years': range(2011,2014)}, # pudl id 128
    {'id_suffix': '_12_452_1_2', 'total_type': 'unit total', 'years': range(2014,2015)}, # pudl id 128
    # Conesville 4 - Duke 
    {'id_suffix': '_12_27_1_3', 'total_type': 'unit total', 'years': range(1994,2003)}, # pudl if 128  # was plant total
    # Belle River - DTE
    {'id_suffix': '_12_44_0_1', 'total_type': 'utility owned total', 'years': range(1994, 2020)}, # pudl id 44  # also plant total, doesn't add up first year
    # Mitchell - Kentucky Power
    {'id_suffix': '_12_81_0_3', 'total_type': 'plant total', 'years': range(2014,2020)}, # pudl id 382  
    # Mitchell - AEP then Wheeling Power
    {'id_suffix': '_12_452_3_3', 'total_type': 'plant total', 'years': range(2014,2015)}, # pudl id 382
    {'id_suffix': '_12_192_0_2', 'total_type': 'plant total', 'years': range(2015,2020)}, # pudl id 382
    # Iatan 1 - Kansas City Power and Light
    {'id_suffix': '_12_79_1_1', 'total_type': 'unit total', 'years': range(2010,2020)}, # pudl id 295  # was plant total
    # Iatan 2 - Kansas Ciry Power and Light
    {'id_suffix': '_12_79_1_3', 'total_type': 'unit total', 'years': range(2010,2020)}, # pudl id 295  # was plant total
    # La Cygne - Kansas  
    #{'id_suffix': '_12_80_0_3', 'total_type': 'plant total', 'years': range(1994,2010)}, # pudl id 336  # very weird, nums don't add up
    # Jeffrey - Kansas Gas and Electric
    {'id_suffix': '_12_80_1_1', 'total_type': 'plant total', 'years': range(1994,1995)}, # pudl id 307
    {'id_suffix': '_12_80_1_3', 'total_type': 'plant total', 'years': range(1995,2002)}, # pudl id 307
    {'id_suffix': '_12_80_1_1', 'total_type': 'plant total', 'years': range(2002,2010)},
    # Jeffrey - Westar Energy
    {'id_suffix': '_12_191_1_4', 'total_type': 'plant total', 'years': range(1994,1995)}, # pudl id 307
    {'id_suffix': '_12_191_1_5', 'total_type': 'plant total', 'years': range(1995,2005)}, # pudl id 307
    {'id_suffix': '_12_191_1_3', 'total_type': 'plant total', 'years': range(2005,2006)}, # pudl id 307
    {'id_suffix': '_12_191_1_5', 'total_type': 'plant total', 'years': range(2006,2010)}, # pudl id 307
    # JM Stuart - Duke Energy
    {'id_suffix': '_12_27_1_1', 'total_type': 'plant total', 'years': range(1994,2003)}, # pudl id 288
    # JM Stuart - Dayton Power and Light
    {'id_suffix': '_12_42_2_1', 'total_type': 'plant total', 'years': range(1994,1998)}, # pudl id 288
    {'id_suffix': '_12_42_1_1', 'total_type': 'plant total', 'years': range(1998,2001)}, # pudl id 288
    # Valley - Wisconsin Power and Electric
    {'id_suffix': '_12_193_0_3', 'total_type': 'utility owned total', 'years': range(1994,1996)}, # pudl id 603  # could also be plant total
    # Pt. Wash - Wisconsin Electric Power
    {'id_suffix': '_12_193_1_4', 'total_type': 'utility owned total', 'years': range(1994,1996)}, # pudl id 470  # other weird value -- see below
    # Pt. Wash (gas) - Wisconsin Electric Power
    {'id_suffix': '_12_193_7_1', 'total_type': 'combustion turbine extra', 'years': range(1994,1996)}, # pudl id 470
    {'id_suffix': '_12_193_1_3', 'total_type': 'combustion turbine extra', 'years': range(1996,2004)}, # pudl id 470  # 2004 might not be right -- confusing
    
    {'id_suffix': '_12_193_3_4', 'total_type': 'utility owned total', 'years': range(1994,1996)}, # pudl id 469  # also plant total
    
    {'id_suffix': '_12_193_4_4', 'total_type': 'utility owned total', 'years': range(1994,1996)}, # pudl id 542  # also plant total
    {'id_suffix': '_12_193_0_4', 'total_type': 'utility owned total', 'years': range(1996,2008)}, # has 1 unit only here on...
    {'id_suffix': '_12_193_1_3', 'total_type': 'utility owned total', 'years': range(2008,2015)}, # pudl id 542  # technically becomes just one row in 2010
    
    {'id_suffix': '_12_193_5_2', 'total_type': 'utility owned total', 'years': range(1994,1996)}, # pudl id 1216 # also plant total
    {'id_suffix': '_12_193_0_5', 'total_type': 'utility owned total', 'years': range(1996,2008)}, # pudl id 1216 # has 1 unit only
    
    {'id_suffix': '_12_193_5_5', 'total_type': 'utility owned total', 'years': range(1994,1996)}, # pudl id 458 # also plant total
    
    {'id_suffix': '_12_193_6_5', 'total_type': 'utility owned total', 'years': range(1994,1996)}, # pudl id 216  # also plant total
    
    {'id_suffix': '12_193_8_4', 'total_type': 'utility owned total', 'years': range(1994,1996)}, # pudl id 127  # also plant total
    
    {'id_suffix': '_12_194_0_4', 'total_type': 'unit total', 'years': range(1994,2012)}, # pudl id 123  # was labeled plant total b/c of 100%
    {'id_suffix': '_12_194_0_1', 'total_type': 'unit total', 'years': range(2012,)}, # pudl id 123
    
    {'id_suffix': '_12_194_0_5', 'total_type': 'unit total', 'years': range(1994,2012)}, # pudl id 123  # was labeled plant total b/c of 100%
    {'id_suffix': '_12_194_0_2', 'total_type': 'unit total', 'years': range(2012,)}, # pudl id 123
    
    {'id_suffix': '_12_194_2_3', 'total_type': 'utility owned total', 'years': range(1994,1998)}, # pudl id 123 
    {'id_suffix': '_12_194_1_4', 'total_type': 'utility owned total', 'years': range(1998,2012)}, # pudl id 123  # continuation of before 2_3 to 1_4
    {'id_suffix': '_12_194_1_1', 'total_type': 'utility owned total', 'years': range(2012,2014)}, # pudl id 123 
    
    {'id_suffix': '_12_89_1_5', 'total_type': 'utility owned total', 'years': range(1994,1998)},# pudl id 123
    {'id_suffix': '_12_89_1_3', 'total_type': 'utility owned total', 'years': range(1998,2002)}, # pudl id 123  # continuation of before 1_5 to 1_3
    {'id_suffix': '_12_89_1_5', 'total_type': 'utility owned total', 'years': range(2002,2007)}, # pudl id 123  # record id went back from 1_3 to 1_5
    {'id_suffix': '_12_89_0_4', 'total_type': 'utility owned total', 'years': range(2007,2008)}, # pudl id 123  # 1_5 to 0_4
    {'id_suffix': '_12_89_0_5', 'total_type': 'utility owned total', 'years': range(2008,2009)}, # pudl id 123  # 0_4 to 0_5
    {'id_suffix': '_12_89_1_4', 'total_type': 'utility owned total', 'years': range(2010,2012)}, # pudl id 123  # 0_5 to 1_4
    {'id_suffix': '_12_89_0_5', 'total_type': 'utility owned total', 'years': range(2012,2014)}, # pudl id 123
    {'id_suffix': '_12_89_1_4', 'total_type': 'utility owned total', 'years': range(2014,2015)}, # pudl id 123
    {'id_suffix': '_12_89_1_5', 'total_type': 'utility owned total', 'years': range(2015,2016)}, # pudl id 123
    {'id_suffix': '_12_89_0_5', 'total_type': 'utility owned total', 'years': range(2016,2019)}, # pudl id 123
    {'id_suffix': '_12_89_0_4', 'total_type': 'utility owned total', 'years': range(2019,2020)}, # pudl id 123
    
    {'id_suffix': '_12_194_2_5', 'total_type': 'unit total', 'years': range(1994,1998)}, # pudl id 171  # was plant total
    {'id_suffix': '_12_194_2_1', 'total_type': 'unit total', 'years': range(1998,2012)}, # pudl id 171
    {'id_suffix': '_12_194_1_3', 'total_type': 'unit total', 'years': range(2012,2014)}, # pudl id 171
    {'id_suffix': '_12_194_1_1', 'total_type': 'unit total', 'years': range(2014,2016)}, # pudl id 171
    {'id_suffix': '_12_194_0_5', 'total_type': 'unit total', 'years': range(2016,2019)}, # pudl id 171
    
    {'id_suffix': '_12_194_3_4', 'total_type': 'unit total', 'years': range(1994,1998)}, # pudl id 171  # contains unit-1, was plant total
    {'id_suffix': '_12_194_2_3', 'total_type': 'unit total', 'years': range(1998,2012)}, # pudl id 171
    {'id_suffix': '_12_194_1_5', 'total_type': None, 'years': range(2012,2014)}, # pudl id 171 -- was 2_3 but now 1_5 and owned by one utility   
    
    {'id_suffix': '_12_134_0_5', 'total_type': 'utility owned total', 'years': range(1994,2001)}, # pudl id 281  # also plant total
    {'id_suffix': '_12_134_1_5', 'total_type': 'utility owned total', 'years': range(2001,2016)},
    {'id_suffix': '_12_134_1_4', 'total_type': 'utility owned total', 'years': range(2016,2020)},
    
    {'id_suffix': '_12_138_0_5', 'total_type': 'utility owned total', 'years': range(1994,)}, # pudl id 2281  # also plant total
    
    {'id_suffix': '_12_138_2_3', 'total_type': 'combustion turbine extra', 'years': range(1994,1997)}, # pudl id 2281  # pause for 2 years
    {'id_suffix': '_12_138_4_1', 'total_type': 'combustion turbine extra', 'years': range(1999,2000)}, # pudl id 2281
    {'id_suffix': '_12_138_2_3', 'total_type': 'combustion turbine extra', 'years': range(2000,2001)}, # pudl id 2281

    {'id_suffix': '_12_195_1_5', 'total_type': 'utility owned total', 'years': range(1994,2008)}, # pudl id 503  # was plant total, is also technically plant total...
    {'id_suffix': '_12_195_1_3', 'total_type': 'utility owned total', 'years': range(2008,2011)}, # pudl id 503
    {'id_suffix': '_12_195_1_1', 'total_type': 'utility owned total', 'years': range(2011,2019)}, # pudl id 503
   
    {'id_suffix': '_12_195_3_4', 'total_type': 'utility owned total', 'years': range(1994,2011)}, # pudl id 473  # was plant total, might also be plant total
    {'id_suffix': '_12_195_2_3', 'total_type': 'utility owned total', 'years': range(2011,2018)}, # pudl id 473
    {'id_suffix': '_12_195_2_2', 'total_type': 'utility owned total', 'years': range(2018,2019)}, # pudl id 473 # but there is wierdness with the w31 and w32
    
    {'id_suffix': '_12_195_2_5', 'total_type': 'unit total', 'years': range(2008,2011)}, # pudl id 473
    {'id_suffix': '_12_195_2_1', 'total_type': 'unit total', 'years': range(2011,2018)}, # pudl id 473
    
    {'id_suffix': '_12_195_3_5', 'total_type': 'plant total', 'years': range(1994,2006)}, # pudl id 1166
    
    {'id_suffix': '_12_195_5_5', 'total_type': 'unit total', 'years': range(2004,2006)}, # pudl id 343
    {'id_suffix': '_12_195_5_3', 'total_type': 'unit total', 'years': range(2006,2011)}, # pudl id 343
    {'id_suffix': '_12_195_3_3', 'total_type': 'plant total', 'years': range(2018,2019)}, # pudl id 343

    {'id_suffix': '_12_57_5_1', 'total_type': 'utility owned total', 'years': range(1994,1995)}, # pudl id 661  # was plant total  # doesn't add up
    {'id_suffix': '_12_57_4_1', 'total_type': 'utility owned total', 'years': range(1995,1998)}, # pudl id 661  # skips a year
    {'id_suffix': '_12_57_4_1', 'total_type': 'utility owned total', 'years': range(1999,2009)}, # pudl id 661

    {'id_suffix': '_12_57_5_3', 'total_type': 'utiltiy owned total', 'years': range(1994,1995)}, # pudl id 257  # was plant total
    {'id_suffix': '_12_57_5_1', 'total_type': 'utility owned total', 'years': range(1995,2009)}, # pudl id 257  # doesn't add up

    {'id_suffix': '_12_193_9_4', 'total_type': 'utility owned total', 'years': range(1995,1996)}, # pudl id 443

    {'id_suffix': '_12_281_0_2', 'total_type': 'utility owned total', 'years': range(2002,2007)}, # pudl id 1110  # was plant total and maybe is

    {'id_suffix': '_12_89_2_5', 'total_type': 'utility owned total', 'years': range(2011,2019)},
    {'id_suffix': '_12_89_1_2', 'total_type': 'utility owned total', 'years': range(2019,2020)},
]

# pudl id 336 unclear which is the total
# pudl id 652 is fishy and kind of seems like a duplicate?
# pudl id 40 needs some attention....
# pudl id 410 unsure role of cge
# pudl id 167 unsure role of cge
# pudl id 316 unsure role of cge
# pudl id 611 unsure role of cge
# pudl id 470 in 2008 two totals?
# pudl id 363 gets confusing around 2008 
# pudl id 281 unt 2 in year 1999 might get double counted
# pudl id 1209 components don't add up
# pudl id 503 pulliam-common? with capcity 0 and in ~2004 pulliam 31 shows up
# pudl id 473 has "communal" row as well and in ~1997 w31, w32
# pudl id 661 is confusing which values are which
# pudl id 529 doesn't add up
# pudl id 610 confused by what this 100% ownership thing is...
# pudl id 90 confusing
# pudl id 183 confusing total value in 2011

In [20]:
def compare_totals(flag_df, comp_col):
    """Sum non-total utility-plant reported values and compare to reported totals when applicable.
    
    This function takes in a DataFrame and calculates the sum value for both the plant and utility-plant
    groups on an annual basis and compares them with any reported totals. First, this function calculates
    group totals by excluding any rows flagged as totals or extraneous in the total_types row (i.e.:
    total_type.isna()). Then, it separates the DataFrame into two small DataFrames representing all
    the utility owned total rows and all of the plant total rows reported and flagged. Next, it
    compares the calculated group total against the reported group total and outputs a True/False
    boolean. The boolean flags from these small table comparisons are then subsumed into the larger 
    DataBase under the column names: utility_owned_total_flag and plant_total_flag.
    
    These columns will show users where there might be a reporting discrepancy and allows them to
    choose which value they might want to reply on for further calculation. This flag is not yet
    incorporated into the aggregation function below, but it could be!
    
    """
    def sum_no_totals(df, col):
        no_totals_df = df.loc[df['total_type'].isna()]
        return no_totals_df[col].sum()

    # Groupby utility-plant and plant
    plant_util_groups = flag_df.groupby(['report_year', 'utility_id_pudl', 'plant_id_pudl'])
    plant_groups = flag_df.groupby(['report_year', 'plant_id_pudl'])

    # Get the sum of the values in each group that are not totals
    plant_util_total_series = plant_util_groups.apply(lambda x: sum_no_totals(x, comp_col))
    plant_total_series = plant_groups.apply(lambda x: sum_no_totals(x, comp_col))

    # Take that series of sums, reset the index, and give the sum column an informative name
    util_plant_df = pd.DataFrame(plant_util_total_series).reset_index().rename(columns={0:'plant_util_total'})
    plant_df = pd.DataFrame(plant_total_series).reset_index().rename(columns={0:'plant_total'})

    # Merge the two sum columns together on utility-plant
    df = pd.merge(plant_df, util_plant_df, on=['report_year', 'plant_id_pudl'], how='outer')
    comp_totals = pd.merge(flag_df, df, on=['report_year', 'utility_id_pudl', 'plant_id_pudl'], how='outer')
    
    # Get slices of the dataframe that represent the utiltiy owned total rows and the plant total rows to compare with calculated totals
    util_ot = comp_totals[comp_totals['total_type']=='utility owned total'].copy()
    plant_ot = comp_totals[comp_totals['total_type']=='plant total'].copy()

    # For each dataframe, flag whether the sum of the components (calculated above) equals the reported total
    # Could make this a little more flexible (+/- 1)
    util_ot['utility_owned_total_flag'] = np.where(util_ot['capacity_mw'] == util_ot['plant_util_total'], True, False)
    plant_ot['plant_total_flag'] = np.where(plant_ot['capacity_mw'] == plant_ot['plant_total'], True, False)
    
    # Create blank columns in the original dataframe
    comp_totals[f'{comp_col}_utility_owned_total_flag'] = None
    comp_totals[f'{comp_col}_plant_total_flag'] = None

    # Update those blank columns so that the True/False values from the prior calculations are added to the correct column
    comp_totals[f'{comp_col}_utility_owned_total_flag'].update(util_ot['utility_owned_total_flag'])
    comp_totals[f'{comp_col}_plant_total_flag'].update(plant_ot['plant_total_flag'])

    return comp_totals

### **Step 2.1:** Add a simple totals flag to the steam table

In [21]:
# Create copy of the steam table with fuel types merged in
steam_test = ferc_eia_fuel.copy()

flagged_steam = (
    steam_test
    .pipe(flag_totals)
    .pipe(backfill_years_by_capacity, col_name='total', replace=False, replace_with=True)
    .pipe(flag_plant_totals, col_name='total_type')
    .pipe(backfill_years_by_capacity, col_name='total_type', replace=None, replace_with='plant total')
    .pipe(categorize_bad_rows, f_list=fix_list)
    .drop('total', axis=1)
)

In [78]:
# test
test = flagged_steam[flagged_steam['total_type'].notna()]
test2 = test[test['primary_fuel_by_mmbtu'].notna()]
tot = list(test2['plant_id_pudl'].unique())

print(len(pl))
print(len([x for x in pl if x in tot]))

1069
22


### **Step 2.2:** Check reported totals against sum of available components
For utility-owned plant portions and entire plants

In [22]:
# Compare the reported totals with the sum of the reported components
flagged_steam_total_comp = compare_totals(flagged_steam, 'capacity_mw')

In [23]:
# Just show the relevant columns
flag_total_narrow = flagged_steam_total_comp[['report_year', 'utility_id_pudl', 'utility_name_ferc1', 'plant_id_pudl', 'plant_name_ferc1',
                                              'total_type', 'capacity_mw', 'plant_total', 'plant_util_total',
                                              'capacity_mw_utility_owned_total_flag', 
                                              'capacity_mw_plant_total_flag']].copy()

In [24]:
# Mis-matching utility owned totals
uot = flag_total_narrow[flag_total_narrow['total_type']=='utility owned total']
uot_bad = uot[uot['capacity_mw_utility_owned_total_flag']==False]
print(int((len(uot_bad)/len(uot)*100)),'% of reported utility owned total values differ from the sum of their reported components')
uot_bad['plant_id_pudl'].unique()

53 % of reported utility owned total values differ from the sum of their reported components


array([  44,  542, 1216,  281,  503,  473,  661,  257, 1110,  123])

In [25]:
# Mis-matching plant totals
pt = flag_total_narrow[flag_total_narrow['total_type']=='plant total']
pt_bad = pt[pt['capacity_mw_plant_total_flag']==False]
print(int(len(pt_bad)/len(pt)*100),'% of reported utility owned total values differ from the sum of their reported components')
pt_bad['plant_id_pudl'].unique()

64 % of reported utility owned total values differ from the sum of their reported components


array([  16,  450,  288, 1087,  295,  307,  652, 1665,  123, 1209, 1166,
        529,  610,  473,  343,  336])

### **Step 2.3:** Custom aggregation based on the presense of nulls and/or totals rows
Most of the time the total rows are excluded. Sometimes, however, they provide valuable information we might want to use in the aggregation of certain columns.

The fields we'd like to aggregate on are: 
- capacity_mw
- net_generation_mwh
- avg_num_employees
- **original cost:** capex_land, capex_equipment, capex_structures, capex_total, asset_retirement_cost
- **operational expenses:** opex_operations, opex_fuel, opex_coolants, opex_steam, opex_steam_other, opex_transfer, opex_electric, opex_misc_power, opex_rents, opex_allowances, opex_engineering, opex_structures, opex_boiler, opex_plants, opex_misc_steam, opex_production_total.

In [26]:
def col_aggregator(flag_df, agg_col):
    """
    Remove total rows from aggregation; use when there are nulls present in non-total rows. 
    
    This function looks at a table grouped by year, utility, and plant and determines whether to use any of the
    information from the total rows in an aggregation based on the column agg_col specified as a parameter.
    If there are no total rows, this function simply takes the sum of each group. If there are totals rows, 
    this function first looks to see if there is a utility owned total reported and then looks to see if there is
    a plant total reported.
    
    If the aggregation must resort to using a total row, a flag is created and returned alongside the aggregated value.
    These values will later get split apart and set as seperate columns (in the build_col_agg_df function).
    
    Args: 
        df (pandas.DataFrame): A flagged version of the cleaned ferc1_steam table with the column name 'total_type'
            specifying wither it is a utility owned total, unit total, or plant total.
        agg_col (str): The name of the column you'd like to aggregate by.
    
    """
    if flag_df.loc[flag_df['total_type'].isna()][agg_col].notna().all(): 
            flag = None
            agg_value = flag_df.loc[flag_df['total_type'].isna()][agg_col].sum()
            return [agg_value, flag]
    else:
        if flag_df['total_type'].str.contains('utility owned total').any() & flag_df.loc[flag_df['total_type']=='utility owned total'][agg_col].notna().all():
            flag = 'used utility owned total'
            agg_value = flag_df.loc[flag_df['total_type']=='utility owned total'][agg_col].unique()[0]
            return [agg_value, flag]
        elif flag_df['total_type'].str.contains('plant total').any() & flag_df.loc[flag_df['total_type']=='plant total'][agg_col].notna().all():
            flag = 'used plant total pertains to more than one utility'
            agg_value = flag_df.loc[flag_df['total_type']=='plant total'][agg_col].unique()[0]
            return [agg_value, flag]
        elif flag_df.loc[flag_df['total_type'].isna()][agg_col].isna().all():
            flag = None
            agg_value = np.nan
            return [agg_value, flag]
        else:
            flag = 'aggregated with some null values'
            agg_value = flag_df.loc[flag_df['total_type'].isna()][agg_col].sum()
            return [agg_value, flag]

In [27]:
def build_col_agg_df(flag_df, agg_col):
    """Sort by field level.
    
    This function creates a mini aggregated dataframe based on a column specified in the parameters. 
    It runs the col_aggregator function so that the aggregations exclude total values unless there are
    gaps in the subcomponents--in which case it will first try and use a reported utility total and 
    then a plant total. The col_aggregator function returns both the aggregated value and flag to
    indicate whether it was aggregated based on a value from one of the total rows. This function turns
    the value and the flag (returned as a list within one column) to seperate columns.
    
    These column-based data aggregations can later be merged to form one large data aggregation column.
    
    Args: 
        df (pandas.DataFrame): A DataFrame....
        agg_level (str): The level of aggregation you'd like (plant or utility)
        agg_col (str): The name of the column you'd like to aggregate by.
    Returns:
    
    """

    plant_util_group = flag_df.groupby(['report_year', 'utility_id_pudl', 'plant_id_pudl'])
    col_agg_series = plant_util_group.apply(lambda x: col_aggregator(x, agg_col))
    col_agg_df = pd.DataFrame(col_agg_series).reset_index()
    col_agg_df[[agg_col, f'{agg_col}_flag']] = pd.DataFrame(col_agg_df[0].tolist(), index=col_agg_df.index)
    col_agg_df = col_agg_df.drop(columns=[0])
    
    return col_agg_df

In [28]:
# SMALL SCALE AGGREGATOR TEST

test = fixed_flags[fixed_flags['plant_id_pudl']==8470]
#test = test[['report_year', 'utility_id_pudl', 'utility_name_ferc1', 'plant_id_pudl', 'plant_name_ferc1', 'capacity_mw', 'plant_type', 'total', 'total_type', 'record_id']]
# test = test[['report_year', 'utility_name_ferc1', 'utility_id_pudl', 'plant_id_pudl', 'plant_name_ferc1', 'plant_type', 
#              'total_type', 'capacity_mw', 'net_generation_mwh', 'avg_num_employees', 'capex_total', 
#              'capex_land', 'capex_equipment', 'capex_structures', 'asset_retirement_cost', 'opex_operations',
#              'opex_fuel', 'opex_coolants', 'opex_steam', 'opex_steam_other', 'opex_transfer', 'opex_electric',
#              'opex_misc_power', 'opex_rents', 'opex_allowances', 'opex_engineering', 'opex_structures', 
#              'opex_boiler', 'opex_plants', 'opex_misc_steam', 'opex_production_total']]
test2 = test[['report_year', 'utility_name_ferc1', 'utility_id_pudl', 'plant_id_pudl', 'plant_name_ferc1', 'plant_type', 'capacity_mw',
              'total_type', 'avg_num_employees', 'capex_total', 'opex_production_total']]
df = test2[(test2['report_year'] == 2004)]
df
#build_col_agg_df(df, 'avg_num_employees')

NameError: name 'fixed_flags' is not defined

In [29]:
# Run this to get the full DF!
build_col_agg_df(flagged_steam_total_comp, 'avg_num_employees')

Unnamed: 0,report_year,utility_id_pudl,plant_id_pudl,avg_num_employees,avg_num_employees_flag
0,1994,7,530,462.0,used plant total pertains to more than one uti...
1,1994,14,434,123.0,
2,1994,14,1243,1925.0,
3,1994,14,2507,51.0,
4,1994,14,2561,83.0,
...,...,...,...,...,...
21873,2019,3514,1326,,
21874,2019,3514,2570,,
21875,2019,5514,434,,
21876,2019,5515,11431,5.0,


### **Step 2.3:** Combine custom aggregations and group by utility

In [699]:
# decide what to do with the flags during this groupby

#### Things to do still: 
- if use a value from plant total, make it apply to the other utilities that also have a stake in that plant (avg_num_employees is a good example)
- if there is a value within 1 of a labeled value in the same plant group, it's probably a total
- add more values to the ones that get flagged
- check my flagged values against jon's
- combine aggregated column dfs into one big df? (but maybe not that necessary/hard
- change the way that flags work
- figure out what to do with unit totals (whether it's enough to just leave them marked) 
- figure out what to do with gas plants and other weird extra values
- aggregate by utility! That's what the end goal is afterall

### Compare with Jon's CSV

In [501]:
# read in Jon's CSV
jon_df = pd.read_csv('/Users/aesharpe/Desktop/Work/Catalyst_Coop/RMI/Depreciation/f1_steam_flagged_with_plant_id.csv')

In [503]:
jon_df = jon_df[['record_id', 'Flag', 'plant_name', 'report_year', 'plant_id_ferc1']]
jon_df['plant_name'] = jon_df.plant_name.str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  jon_df['plant_name'] = jon_df.plant_name.str.lower()


In [496]:
steam_test = flagged_steam[['record_id','report_year', 'utility_id_ferc1', 'utility_id_pudl', 'utility_name_ferc1',
                 'plant_id_pudl', 'plant_id_ferc1', 'plant_name_ferc1', 'avg_num_employees', 
                 'capacity_mw', 'net_generation_mwh', 'opex_production_total', 'capex_total',
                 'total_type']]

In [506]:
#jon_df['rec'] = jon_df.report_year.map(str) + '-' + jon_df.plant_name.map(str)
#steam3['rec'] = steam3.report_year.map(str) + '-' + steam3.plant_name_ferc1.map(str)

In [558]:
m = missed_flags[missed_flags['plant_id_pudl']==8469]
#m[m['Flag']!= 'k']
#steam_test[(steam_test['plant_id_pudl']==8470) & (steam_test['report_year']==1995)]
m

Unnamed: 0,record_id,report_year_x,utility_id_ferc1,utility_id_pudl,utility_name_ferc1,plant_id_pudl,plant_id_ferc1_x,plant_name_ferc1,avg_num_employees,capacity_mw,net_generation_mwh,opex_production_total,capex_total,total_type,Flag,plant_name,report_year_y,plant_id_ferc1_y
211,f1_steam_1995_12_2_1_5,1995.0,2.0,18.0,ALABAMA POWER COMPANY,8469.0,10.0,miss. power,,0.0,1372059.0,4330092.0,54581072.0,,d,miss. power,1995.0,
212,f1_steam_1996_12_2_1_5,1996.0,2.0,18.0,ALABAMA POWER COMPANY,8469.0,10.0,miss. power,,0.0,1457593.0,4798298.0,54135037.0,,d,miss. power,1996.0,
213,f1_steam_1997_12_2_1_5,1997.0,2.0,18.0,ALABAMA POWER COMPANY,8469.0,10.0,miss. power,,0.0,1334215.0,7809343.0,56433280.0,,d,miss. power,1997.0,
214,f1_steam_1998_12_2_1_5,1998.0,2.0,18.0,ALABAMA POWER COMPANY,8469.0,10.0,miss. power,,0.0,1557828.0,3853739.0,56644997.0,,d,miss. power,1998.0,
215,f1_steam_1999_12_2_1_5,1999.0,2.0,18.0,ALABAMA POWER COMPANY,8469.0,10.0,miss. power,,0.0,1384959.0,6198693.0,58460443.0,,d,miss. power,1999.0,
216,f1_steam_2000_12_2_1_5,2000.0,2.0,18.0,ALABAMA POWER COMPANY,8469.0,10.0,miss. power,,0.0,1502988.0,5450401.0,60638800.0,,d,miss. power,2000.0,
217,f1_steam_2001_12_2_1_5,2001.0,2.0,18.0,ALABAMA POWER COMPANY,8469.0,10.0,miss. power,,0.0,1523622.0,4827498.0,60910914.0,,d,miss. power,2001.0,
218,f1_steam_2002_12_2_1_5,2002.0,2.0,18.0,ALABAMA POWER COMPANY,8469.0,10.0,miss. power,,0.0,1408944.0,5940801.0,63376590.0,,d,miss. power,2002.0,
219,f1_steam_2003_12_2_1_5,2003.0,2.0,18.0,ALABAMA POWER COMPANY,8469.0,10.0,miss. power,,0.0,1421419.0,5813232.0,66367561.0,,d,miss. power,2003.0,
220,f1_steam_2004_12_2_1_5,2004.0,2.0,18.0,ALABAMA POWER COMPANY,8469.0,10.0,miss. power,,0.0,1417039.0,6211069.0,68855873.0,,d,miss. power,2004.0,


In [557]:
#dd = pd.merge(jon_df, steam3, on='rec', how='inner')
merge_df = pd.merge(steam_test, jon_df, on='record_id', how='outer')
#print(len(merge_df))
missed_flags = merge_df[(merge_df['total_type'].isna()) & (merge_df['Flag'].notna())]
pl = list(missed_flags['plant_id_pudl'].unique())
len(pl)
pl = pl[:155]
pl2 = [int(p) for p in pl]
pl2

[287,
 249,
 8470,
 8469,
 11580,
 11579,
 1848,
 288,
 316,
 8536,
 1251,
 65,
 336,
 295,
 307,
 11839,
 11861,
 11869,
 469,
 542,
 1216,
 216,
 127,
 1080,
 525,
 556,
 55,
 281,
 8580,
 12217,
 123,
 1032,
 471,
 1083,
 623,
 121,
 12064,
 11826,
 2281,
 305,
 503,
 473,
 1166,
 644,
 12130,
 11558,
 603,
 470,
 458,
 443,
 11537,
 11536,
 40,
 410,
 167,
 611,
 11787,
 278,
 122,
 12087,
 12175,
 12090,
 12089,
 652,
 11987,
 444,
 12209,
 11813,
 90,
 11591,
 11753,
 511,
 8625,
 11985,
 450,
 12091,
 12092,
 12329,
 545,
 8577,
 11812,
 8890,
 8975,
 9208,
 490,
 8467,
 1019,
 15382,
 11640,
 15383,
 15385,
 15384,
 15386,
 662,
 4542,
 12066,
 12065,
 15389,
 8578,
 12336,
 12125,
 12129,
 12335,
 1236,
 9136,
 12149,
 12093,
 8729,
 11815,
 9356,
 9157,
 12083,
 12082,
 1167,
 9388,
 9369,
 12345,
 9217,
 9218,
 8530,
 11996,
 11851,
 8841,
 12127,
 8579,
 12353,
 12344,
 357,
 8,
 334,
 9216,
 9403,
 8468,
 9415,
 348,
 381,
 8466,
 422,
 421,
 8532,
 497,
 8542,
 258,
 183,

In [931]:
dd['dup'] = dd['rec'].duplicated()

In [932]:
dups = dd[dd['dup']==True]
dup_recs = list(dups['rec'].unique())

In [933]:
tt = dd[dd['rec'].str.contains('|'.join(dup_recs))]

  return func(self, *args, **kwargs)


In [944]:
test = tt[(tt['Flag'].notna()) & (tt['total']==False)]
test[['plant_id_pudl', 'report_year_x']].drop_duplicates()

Unnamed: 0,plant_id_pudl,report_year_x
31,582,1994
94,316,1994
101,167,1994
177,599,1994
223,295,1994
...,...,...
37404,65,2019
37633,357,2019
37686,278,2019
37769,8470,2019


In [880]:
flags = jon_df[jon_df['Flag'].notna()]
flags

Unnamed: 0,Flag,plant_name,report_year,rec
2,x,sterling,1994,1994-sterling
3,x,grand tower,1994,1994-grand tower
10,d,laredo,1994,1994-laredo
11,k,lon c. hill,1994,1994-lon c. hill
12,d,victoria,1994,1994-victoria
...,...,...,...,...
28932,d,columbia 2,2019,2019-columbia 2
28933,k,columbia total,2019,2019-columbia total
28934,d,elm road 1,2019,2019-elm road 1
28935,d,elm road 2,2019,2019-elm road 2


In [882]:
#steam3[steam3['total']==True]