# Depreciation Flags

### Setup

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
# Standard libraries
import logging
import sys
import os
import pathlib

# 3rd party libraries
import geopandas as gpd
import dask.dataframe as dd
from dask.distributed import Client
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import pandas as pd
import seaborn as sns
import sqlalchemy as sa

# Local libraries
import pudl

In [4]:
# Enable viewing of logging outputs
logger=logging.getLogger()
logger.setLevel(logging.INFO)
handler = logging.StreamHandler(stream=sys.stdout)
formatter = logging.Formatter('%(message)s')
handler.setFormatter(formatter)
logger.handlers = [handler]

In [5]:
# Display settings
sns.set()
%matplotlib inline
mpl.rcParams['figure.dpi'] = 150
pd.options.display.max_columns = 100
pd.options.display.max_rows = 10

In [6]:
# Establish connection to pudl database
pudl_settings = pudl.workspace.setup.get_defaults()
pudl_engine = sa.create_engine(pudl_settings['pudl_db'])
pudl_out = pudl.output.pudltabl.PudlTabl(
    pudl_engine=pudl_engine,
    freq='AS'
)

## Step 1: combine steam + fuel and create a better technology type

**merge steam with fbp table** 
(steam and fuel tables have already been combined in the fbp table. Now, we have to merge that table with steam to get the steam fields with the fbp fuel type)

In [7]:
fbp = pudl_out.fbp_ferc1()
steam = pudl_out.plants_steam_ferc1()
plants = pudl_out.plants_eia860()
gens = pudl_out.gens_eia860()
net_gen = pudl_out.gen_eia923()

ferc_id_cols = ['report_year', 'utility_id_pudl', 'plant_name_ferc1']
plant_id_cols = ['report_year', 'utility_id_pudl', 'plant_id_pudl']
gen_id_cols = ['report_date', 'plant_id_pudl', 'generator_id']

In [8]:
# Merge fbp and steam tables
fbp_sample = fbp[ferc_id_cols + ['primary_fuel_by_mmbtu', 'primary_fuel_by_cost']]
steam_better_fuel = pd.merge(steam, fbp_sample, on=ferc_id_cols, how='left').drop_duplicates()

Show rows that need to be filled in

In [9]:
print('total rows:', len(steam_better_fuel))
# Find rows that still have no fuel type in the better steam table
print('rows without fuel type:', len(steam_better_fuel[steam_better_fuel['primary_fuel_by_mmbtu'].isna()]))

# Find plants that still have no fuel type in the better steam table
no_fuel = steam_better_fuel[steam_better_fuel['primary_fuel_by_mmbtu'].isna()]
no_fuel_list = list(no_fuel['plant_id_pudl'].unique())
print('plants with no fuel type:', len(no_fuel_list))

# Find plants with no fuel type that do have a fuel type in EIA
gens2 = gens[(gens['plant_id_pudl'].notna()) & (gens['fuel_type_code_pudl'].notna())].copy()
gens2['report_year'] = gens2['report_date'].dt.year
gen_plants = list(gens2['plant_id_pudl'].unique()).copy()
print('plants EIA could possibly cover:', len([plant for plant in no_fuel_list if plant in gen_plants]))

# Find plants and years with no fuel typ ethat do have a fuel type in EIA
id_df = (
    gens2[['report_year', 'plant_id_pudl']].drop_duplicates()
    .append(no_fuel[['report_year', 'plant_id_pudl']].drop_duplicates())
).copy()
id_df['dup'] = id_df.duplicated().copy()
print('rows EIA could possibly cover:', len(id_df[id_df['dup']==True].drop_duplicates()))

total rows: 29270
rows without fuel type: 7581
plants with no fuel type: 1069
plants EIA could possibly cover: 737
rows EIA could possibly cover: 2562


**Merge with EIA** (because there are still 6233 rows without a fuel type and some can be filled in by EIA values)

In [10]:
# Combine generation from 860 with generation from 923 to get fuel type and net generation
net_gen_fuel = pd.merge(gens, net_gen[gen_id_cols+['net_generation_mwh']], on=gen_id_cols, how='outer')

NOTE: because 923 only goes back to 2009, some will inevitably be left out :(

In [11]:
# Aggregate gen by plant and get fuel type based on net generation
net_gen_fuel['plant_net_gen'] = (
    net_gen_fuel.groupby(['report_date', 'plant_id_pudl'])['net_generation_mwh'].transform('sum')
)

net_gen_fuel['gen_pct'] = (
    net_gen_fuel['net_generation_mwh'] / net_gen_fuel['plant_net_gen'] * 100
)

# only keep rows where the gen_pct value is the highest
idx = net_gen_fuel.groupby(['report_date', 'plant_id_pudl'])['gen_pct'].transform(max) == net_gen_fuel['gen_pct']
plant_net_gen_fuel = net_gen_fuel[idx].drop_duplicates(['report_date', 'plant_id_pudl', 'gen_pct'])

In [12]:
# Make plant level eia fuel types ready to merge with steam table
eia_fuel = plant_net_gen_fuel[['report_date', 'plant_id_pudl', 'utility_id_pudl', 'fuel_type_code_pudl']].copy()
eia_fuel['report_year'] = eia_fuel['report_date'].dt.year.copy()

In [13]:
# Merge eia and steam tables
ferc_eia_fuel = pd.merge(steam_better_fuel, eia_fuel, on=plant_id_cols, how='left')
ferc_eia_fuel = ferc_eia_fuel.rename(columns={'fuel_type_code_pudl': 'fuel_type_code_pudl_net_gen'})

In [14]:
# Number of rows that still don't have a fuel type:
no_fuel_df = ferc_eia_fuel[(ferc_eia_fuel['fuel_type_code_pudl_net_gen'].isna()) & (ferc_eia_fuel['primary_fuel_by_mmbtu'].isna()) & (ferc_eia_fuel['primary_fuel_by_cost'].isna())]
print(len(no_fuel_df), 'out of', len(ferc_eia_fuel), 'rows still dont have a fuel type')
print(round((len(no_fuel_df)/len(ferc_eia_fuel)*100), 1),'%')

6816 out of 29270 rows still dont have a fuel type
23.3 %


In [15]:
print(len(no_fuel_df))
print(len(no_fuel_df[no_fuel_df['report_year']<2009]))

6816
5012


#### Might be able to do some historic backfilling too

## **Step 2:** Create flags for rows that represent TOTALS
Create flag columns for the steam table that notes which rows should be included when aggregating different data fields for each utility. These fields are: capacity, generation, number of employees, original cost, operation expenses.


The plan is to develop a flag to note whether a line is the whole plant or the owned portion

**Flags:**
- utility owned total
- unit total
- plant total
- extra gas

In [129]:
def flag_totals(steam_table):
    """Preliminarily mark the rows with total indicated in the plant name."""
    regex1 = r'(?i)tot[a-z]*' # Find anything that would resembles TOTAL in the plant name
    regex2 = '100%' # Find all names with 100% in them
    regex3 = 'ttl ' # the space at the end is important
    regex4 = r'\(all' # find any names with '(all' in them
    
    steam_table['total'] = steam_table['plant_name_ferc1'].str.contains('|'.join([regex1, regex2, regex3, regex4]))
    ser_no_octo = steam_table[steam_table['plant_name_ferc1'].str.contains('octotillo')].total == False
    steam_table['total'].update(ser_no_octo) # remove octotillio from total = True because it accidentally fits in the 'tot' regex
    
    return steam_table

In [87]:
def flag_plant_totals(df, col_name):
    
    def is_plant_total(row):
        if 'total plant' in row:
            return 'plant total'
        elif 'plant total' in row:
            return 'plant total'
        elif 'total plt' in row:
            return 'plant total'
        elif 'ttl plt' in row:
            return 'plant total'
        elif 'tot. plt.' in row:
            return 'plant total'
        elif '100%' in row:
            return 'plant total'
        else:
            return None
    
    df[col_name] = df.apply(lambda x: is_plant_total(x.plant_name_ferc1), axis=1)
    
    return df

In [88]:
def backfill_years_by_capacity(df, col_name, replace, replace_with):
    """Backfill rows based on capacity."""
    
    only_totals_df = df[df[col_name]==replace_with]
    plant_groups = only_totals_df.groupby('plant_id_pudl')
    capacity_dict = plant_groups['capacity_mw'].apply(lambda x: [x for x in list(x.unique()) if x !=0]).to_dict() # no zeros
    plants_with_totals_list = list(capacity_dict.keys())
    
    for plant_id in plants_with_totals_list:
        for capacity in capacity_dict[plant_id]:
            one_plant_df = df[df['plant_id_pudl']==plant_id].copy()
            cap_match_df = one_plant_df[one_plant_df['capacity_mw'].isin(capacity_dict[plant_id])]
            series_update = cap_match_df[col_name].replace({replace: replace_with})
            df[col_name].update(series_update) 
            
    return df

In [137]:
def categorize_bad_rows(df, f_list): # could probably make this faster...
    """Flag bad rows."""
    for fix_dict in f_list:
        for year in fix_dict['years']:
            df.loc[df['record_id']==f"f1_steam_{year}{fix_dict['id_suffix']}", 'total_type'] = fix_dict['total_type']
    
    return df

In [None]:
# Fix-a-dic
# need to append ferc1_steam_YEAR_id_suffix
# categories:
# - plant total
# - unit total
# - utility owned total
# - combustion turbine extra

fix_list = [
    # Rockport AEP
    {'id_suffix': '_12_1_0_3', 'total_type': 'utility owned total', 'years': range(1994,2020)}, #pudl id 530
    # Rockport IMP
    {'id_suffix': '_12_73_1_3', 'total_type': 'utility owned total', 'years': range(1994,1997)}, #pudl id 530
    {'id_suffix': '_12_73_0_3', 'total_type': 'utiltiy owned total', 'years': range(1997,2020)}, # pudl id 530
    # Amos APC
    {'id_suffix': '_12_6_0_3', 'total_type': 'plant total', 'years': range(1994,2002)}, #pudl id 16
    # Conesville 4 - Columbus Southern Power Company then Ohio Power Company then AEP
    {'id_suffix': '_12_31_0_3', 'total_type': 'unit total', 'years': range(1994,2011)}, # pudl id 128
    {'id_suffix': '_12_127_4_1', 'total_type': 'unit total', 'years': range(2011,2014)}, # pudl id 128
    {'id_suffix': '_12_452_1_2', 'total_type': 'unit total', 'years': range(2014,2015)}, # pudl id 128
    # Conesville 4 - Duke 
    {'id_suffix': '_12_27_1_3', 'total_type': 'unit total', 'years': range(1994,2003)}, # pudl if 128  # was plant total
    # Belle River - DTE
    {'id_suffix': '_12_44_0_1', 'total_type': 'utility owned total', 'years': range(1994, 2020)}, # pudl id 44  # also plant total, doesn't add up first year
    # Mitchell - Kentucky Power
    {'id_suffix': '_12_81_0_3', 'total_type': 'plant total', 'years': range(2014,2020)}, # pudl id 382  
    # Mitchell - AEP then Wheeling Power
    {'id_suffix': '_12_452_3_3', 'total_type': 'plant total', 'years': range(2014,2015)}, # pudl id 382
    {'id_suffix': '_12_192_0_2', 'total_type': 'plant total', 'years': range(2015,2020)}, # pudl id 382
    # Iatan 1 - Kansas City Power and Light
    {'id_suffix': '_12_79_1_1', 'total_type': 'unit total', 'years': range(2010,2020)}, # pudl id 295  # was plant total
    # Iatan 2 - Kansas Ciry Power and Light
    {'id_suffix': '_12_79_1_3', 'total_type': 'unit total', 'years': range(2010,2020)}, # pudl id 295  # was plant total
    # La Cygne - Kansas  
    #{'id_suffix': '_12_80_0_3', 'total_type': 'plant total', 'years': range(1994,2010)}, # pudl id 336  # very weird, nums don't add up
    # Jeffrey - Kansas Gas and Electric
    {'id_suffix': '_12_80_1_1', 'total_type': 'plant total', 'years': range(1994,1995)}, # pudl id 307
    {'id_suffix': '_12_80_1_3', 'total_type': 'plant total', 'years': range(1995,2002)}, # pudl id 307
    {'id_suffix': '_12_80_1_1', 'total_type': 'plant total', 'years': range(2002,2010)},
    # Jeffrey - Westar Energy
    {'id_suffix': '_12_191_1_4', 'total_type': 'plant total', 'years': range(1994,1995)}, # pudl id 307
    {'id_suffix': '_12_191_1_5', 'total_type': 'plant total', 'years': range(1995,2005)}, # pudl id 307
    {'id_suffix': '_12_191_1_3', 'total_type': 'plant total', 'years': range(2005,2006)}, # pudl id 307
    {'id_suffix': '_12_191_1_5', 'total_type': 'plant total', 'years': range(2006,2010)}, # pudl id 307
    # JM Stuart - Duke Energy
    {'id_suffix': '_12_27_1_1', 'total_type': 'plant total', 'years': range(1994,2003)}, # pudl id 288
    # JM Stuart - Dayton Power and Light
    {'id_suffix': '_12_42_2_1', 'total_type': 'plant total', 'years': range(1994,1998)}, # pudl id 288
    {'id_suffix': '_12_42_1_1', 'total_type': 'plant total', 'years': range(1998,2001)}, # pudl id 288
    # Valley - Wisconsin Power and Electric
    {'id_suffix': '_12_193_0_3', 'total_type': 'utility owned total', 'years': range(1994,1996)}, # pudl id 603  # could also be plant total
    # Pt. Wash - Wisconsin Electric Power
    {'id_suffix': '_12_193_1_4', 'total_type': 'utility owned total', 'years': range(1994,1996)}, # pudl id 470  # other weird value -- see below
    # Pt. Wash (gas) - Wisconsin Electric Power
    {'id_suffix': '_12_193_7_1', 'total_type': 'combustion turbine extra', 'years': range(1994,1996)}, # pudl id 470
    {'id_suffix': '_12_193_1_3', 'total_type': 'combustion turbine extra', 'years': range(1996,2004)}, # pudl id 470  # 2004 might not be right -- confusing
    
    {'id_suffix': '_12_193_3_4', 'total_type': 'utility owned total', 'years': range(1994,1996)}, # pudl id 469  # also plant total
    
    {'id_suffix': '_12_193_4_4', 'total_type': 'utility owned total', 'years': range(1994,1996)}, # pudl id 542  # also plant total
    {'id_suffix': '_12_193_0_4', 'total_type': 'utility owned total', 'years': range(1996,2008)}, # has 1 unit only here on...
    {'id_suffix': '_12_193_1_3', 'total_type': 'utility owned total', 'years': range(2008,2015)}, # pudl id 542  # technically becomes just one row in 2010
    
    {'id_suffix': '_12_193_5_2', 'total_type': 'utility owned total', 'years': range(1994,1996)}, # pudl id 1216 # also plant total
    {'id_suffix': '_12_193_0_5', 'total_type': 'utility owned total', 'years': range(1996,2008)}, # pudl id 1216 # has 1 unit only
    
    {'id_suffix': '_12_193_5_5', 'total_type': 'utility owned total', 'years': range(1994,1996)}, # pudl id 458 # also plant total
    
    {'id_suffix': '_12_193_6_5', 'total_type': 'utility owned total', 'years': range(1994,1996)}, # pudl id 216  # also plant total
    
    {'id_suffix': '12_193_8_4', 'total_type': 'utility owned total', 'years': range(1994,1996)}, # pudl id 127  # also plant total
    
    {'id_suffix': '_12_194_0_4', 'total_type': 'unit total', 'years': range(1994,2012)}, # pudl id 123  # was labeled plant total b/c of 100%
    {'id_suffix': '_12_194_0_1', 'total_type': 'unit total', 'years': range(2012,)}, # pudl id 123
    
    {'id_suffix': '_12_194_0_5', 'total_type': 'unit total', 'years': range(1994,2012)}, # pudl id 123  # was labeled plant total b/c of 100%
    {'id_suffix': '_12_194_0_2', 'total_type': 'unit total', 'years': range(2012,)}, # pudl id 123
    
    {'id_suffix': '_12_194_2_3', 'total_type': 'utility owned total', 'years': range(1994,1998)}, # pudl id 123 
    {'id_suffix': '_12_194_1_4', 'total_type': 'utility owned total', 'years': range(1998,2012)}, # pudl id 123  # continuation of before 2_3 to 1_4
    {'id_suffix': '_12_194_1_1', 'total_type': 'utility owned total', 'years': range(2012,2014)}, # pudl id 123 
    
    {'id_suffix': '_12_89_1_5', 'total_type': 'utility owned total', 'years': range(1994,1998)},# pudl id 123
    {'id_suffix': '_12_89_1_3', 'total_type': 'utility owned total', 'years': range(1998,2002)}, # pudl id 123  # continuation of before 1_5 to 1_3
    {'id_suffix': '_12_89_1_5', 'total_type': 'utility owned total', 'years': range(2002,2007)}, # pudl id 123  # record id went back from 1_3 to 1_5
    {'id_suffix': '_12_89_0_4', 'total_type': 'utility owned total', 'years': range(2007,2008)}, # pudl id 123  # 1_5 to 0_4
    {'id_suffix': '_12_89_0_5', 'total_type': 'utility owned total', 'years': range(2008,2009)}, # pudl id 123  # 0_4 to 0_5
    {'id_suffix': '_12_89_1_4', 'total_type': 'utility owned total', 'years': range(2010,2012)}, # pudl id 123  # 0_5 to 1_4
    {'id_suffix': '_12_89_0_5', 'total_type': 'utility owned total', 'years': range(2012,2014)}, # pudl id 123
    {'id_suffix': '_12_89_1_4', 'total_type': 'utility owned total', 'years': range(2014,2015)}, # pudl id 123
    {'id_suffix': '_12_89_1_5', 'total_type': 'utility owned total', 'years': range(2015,2016)}, # pudl id 123
    {'id_suffix': '_12_89_0_5', 'total_type': 'utility owned total', 'years': range(2016,2019)}, # pudl id 123
    {'id_suffix': '_12_89_0_4', 'total_type': 'utility owned total', 'years': range(2019,2020)}, # pudl id 123
    
    {'id_suffix': '_12_194_2_5', 'total_type': 'unit total', 'years': range(1994,1998)}, # pudl id 171  # was plant total
    {'id_suffix': '_12_194_2_1', 'total_type': 'unit total', 'years': range(1998,2012)}, # pudl id 171
    {'id_suffix': '_12_194_1_3', 'total_type': 'unit total', 'years': range(2012,2014)}, # pudl id 171
    {'id_suffix': '_12_194_1_1', 'total_type': 'unit total', 'years': range(2014,2016)}, # pudl id 171
    {'id_suffix': '_12_194_0_5', 'total_type': 'unit total', 'years': range(2016,2019)}, # pudl id 171
    
    {'id_suffix': '_12_194_3_4', 'total_type': 'unit total', 'years': range(1994,1998)}, # pudl id 171  # contains unit-1, was plant total
    {'id_suffix': '_12_194_2_3', 'total_type': 'unit total', 'years': range(1998,2012)}, # pudl id 171
    {'id_suffix': '_12_194_1_5', 'total_type': None, 'years': range(2012,2014)}, # pudl id 171 -- was 2_3 but now 1_5 and owned by one utility   
    
    {'id_suffix': '_12_134_0_5', 'total_type': 'utility owned total', 'years': range(1994,2001)}, # pudl id 281  # also plant total
    {'id_suffix': '_12_134_1_5', 'total_type': 'utility owned total', 'years': range(2001,2016)},
    {'id_suffix': '_12_134_1_4', 'total_type': 'utility owned total', 'years': range(2016,2020)},
    
    {'id_suffix': '_12_138_0_5', 'total_type': 'utility owned total', 'years': range(1994,)}, # pudl id 2281  # also plant total
    
    {'id_suffix': '_12_138_2_3', 'total_type': 'combustion turbine extra', 'years': range(1994,1997)}, # pudl id 2281  # pause for 2 years
    {'id_suffix': '_12_138_4_1', 'total_type': 'combustion turbine extra', 'years': range(1999,2000)}, # pudl id 2281
    {'id_suffix': '_12_138_2_3', 'total_type': 'combustion turbine extra', 'years': range(2000,2001)}, # pudl id 2281

    {'id_suffix': '_12_195_1_5', 'total_type': 'utility owned total', 'years': range(1994,2008)}, # pudl id 503  # was plant total, is also technically plant total...
    {'id_suffix': '_12_195_1_3', 'total_type': 'utility owned total', 'years': range(2008,2011)}, # pudl id 503
    {'id_suffix': '_12_195_1_1', 'total_type': 'utility owned total', 'years': range(2011,2019)}, # pudl id 503
   
    {'id_suffix': '_12_195_3_4', 'total_type': 'utility owned total', 'years': range(1994,2011)}, # pudl id 473  # was plant total, might also be plant total
    {'id_suffix': '_12_195_2_3', 'total_type': 'utility owned total', 'years': range(2011,2018)}, # pudl id 473
    {'id_suffix': '_12_195_2_2', 'total_type': 'utility owned total', 'years': range(2018,2019)}, # pudl id 473 # but there is wierdness with the w31 and w32
    
    {'id_suffix': '_12_195_2_5', 'total_type': 'unit total', 'years': range(2008,2011)}, # pudl id 473
    {'id_suffix': '_12_195_2_1', 'total_type': 'unit total', 'years': range(2011,2018)}, # pudl id 473
    
    {'id_suffix': '_12_195_3_5', 'total_type': 'plant total', 'years': range(1994,2006)}, # pudl id 1166
    
    {'id_suffix': '_12_195_5_5', 'total_type': 'unit total', 'years': range(2004,2006)}, # pudl id 343
    {'id_suffix': '_12_195_5_3', 'total_type': 'unit total', 'years': range(2006,2011)}, # pudl id 343
    {'id_suffix': '_12_195_3_3', 'total_type': 'plant total', 'years': range(2018,2019)}, # pudl id 343

    {'id_suffix': '_12_57_5_1', 'total_type': 'utility owned total', 'years': range(1994,1995)}, # pudl id 661  # was plant total  # doesn't add up
    {'id_suffix': '_12_57_4_1', 'total_type': 'utility owned total', 'years': range(1995,1998)}, # pudl id 661  # skips a year
    {'id_suffix': '_12_57_4_1', 'total_type': 'utility owned total', 'years': range(1999,2009)}, # pudl id 661

    {'id_suffix': '_12_57_5_3', 'total_type': 'utiltiy owned total', 'years': range(1994,1995)}, # pudl id 257  # was plant total
    {'id_suffix': '_12_57_5_1', 'total_type': 'utility owned total', 'years': range(1995,2009)}, # pudl id 257  # doesn't add up

    {'id_suffix': '_12_193_9_4', 'total_type': 'utility owned total', 'years': range(1995,1996)}, # pudl id 443

    {'id_suffix': '_12_281_0_2', 'total_type': 'utility owned total', 'years': range(2002,2007)}, # pudl id 1110  # was plant total and maybe is

    {'id_suffix': '_12_89_2_5', 'total_type': 'utility owned total', 'years': range(2011,2019)},
    {'id_suffix': '_12_89_1_2', 'total_type': 'utility owned total', 'years': range(2019,2020)},
]

# pudl id 336 unclear which is the total
# pudl id 652 is fishy and kind of seems like a duplicate?
# pudl id 40 needs some attention....
# pudl id 410 unsure role of cge
# pudl id 167 unsure role of cge
# pudl id 316 unsure role of cge
# pudl id 611 unsure role of cge
# pudl id 470 in 2008 two totals?
# pudl id 363 gets confusing around 2008 
# pudl id 281 unt 2 in year 1999 might get double counted
# pudl id 1209 components don't add up
# pudl id 503 pulliam-common? with capcity 0 and in ~2004 pulliam 31 shows up
# pudl id 473 has "communal" row as well and in ~1997 w31, w32
# pudl id 661 is confusing which values are which
# pudl id 529 doesn't add up
# pudl id 610 confused by what this 100% ownership thing is...
# pudl id 90 confusing
# pudl id 183 confusing total value in 2011

In [138]:
# Create copy of the steam table with fuel types merged in
steam_test = ferc_eia_fuel.copy()

flagged_steam = (
    steam_test
    .pipe(flag_totals)
    .pipe(backfill_years_by_capacity, col_name='total', replace=False, replace_with=True)
    .pipe(flag_plant_totals, col_name='total_type')
    .pipe(backfill_years_by_capacity, col_name='total_type', replace=None, replace_with='plant total')
    .pipe(categorize_bad_rows, f_list=fix_list)
    .drop('total', axis=1)
)

### **Step 2.5:** Triage aggregation based on presense of nulls
Most of the time the total rows are excluded. Sometimes, however, they provide valuable information we might want to use in the aggregation of certain columns.

The fields we'd like to aggregate on are: 
- capacity_mw
- net_generation_mwh
- avg_num_employees
- **original cost:** capex_land, capex_equipment, capex_structures, capex_total, asset_retirement_cost
- **operational expenses:** opex_operations, opex_fuel, opex_coolants, opex_steam, opex_steam_other, opex_transfer, opex_electric, opex_misc_power, opex_rents, opex_allowances, opex_engineering, opex_structures, opex_boiler, opex_plants, opex_misc_steam, opex_production_total.

In [459]:
def col_aggregator(flag_df, agg_col):
    """
    Remove total rows from aggregation; use when there are nulls present in non-total rows. 
    
    This function looks at a table grouped by year, utility, and plant and determines whether to use any of the
    information from the total rows in an aggregation based on the column agg_col specified as a parameter.
    If there are no total rows, this function simply takes the sum of each group. If there are totals rows, 
    this function first looks to see if there is a utility owned total reported and then looks to see if there is
    a plant total reported.
    
    If the aggregation must resort to using a total row, a flag is created and returned alongside the aggregated value.
    These values will later get split apart and set as seperate columns (in the build_col_agg_df function).
    
    Args: 
        df (pandas.DataFrame): A flagged version of the cleaned ferc1_steam table with the column name 'total_type'
            specifying wither it is a utility owned total, unit total, or plant total.
        agg_col (str): The name of the column you'd like to aggregate by.
    
    """
    if flag_df.loc[flag_df['total_type'].isna()][agg_col].notna().all(): 
            flag = None
            agg_value = flag_df.loc[flag_df['total_type'].isna()][agg_col].sum()
            return [agg_value, flag]
    else:
        if flag_df['total_type'].str.contains('utility owned total').any() & flag_df.loc[flag_df['total_type']=='utility owned total'][agg_col].notna().all():
            flag = 'used utility owned total'
            agg_value = flag_df.loc[flag_df['total_type']=='utility owned total'][agg_col].unique()[0]
            return [agg_value, flag]
        elif flag_df['total_type'].str.contains('plant total').any() & flag_df.loc[flag_df['total_type']=='plant total'][agg_col].notna().all():
            flag = 'used plant total pertains to more than one utility'
            agg_value = flag_df.loc[flag_df['total_type']=='plant total'][agg_col].unique()[0]
            return [agg_value, flag]
        elif flag_df.loc[flag_df['total_type'].isna()][agg_col].isna().all():
            flag = None
            agg_value = np.nan
            return [agg_value, flag]
        else:
            flag = 'aggregated with some null values'
            agg_value = flag_df.loc[flag_df['total_type'].isna()][agg_col].sum()
            return [agg_value, flag]

In [460]:
def build_col_agg_df(flag_df, agg_col):
    """Sort by field level.
    
    This function creates a mini aggregated dataframe based on a column specified in the parameters. 
    It runs the col_aggregator function so that the aggregations exclude total values unless there are
    gaps in the subcomponents--in which case it will first try and use a reported utility total and 
    then a plant total. The col_aggregator function returns both the aggregated value and flag to
    indicate whether it was aggregated based on a value from one of the total rows. This function turns
    the value and the flag (returned as a list within one column) to seperate columns.
    
    These column-based data aggregations can later be merged to form one large data aggregation column.
    
    Args: 
        df (pandas.DataFrame): A DataFrame....
        agg_level (str): The level of aggregation you'd like (plant or utility)
        agg_col (str): The name of the column you'd like to aggregate by.
    Returns:
    
    """

    plant_util_group = flag_df.groupby(['report_year', 'utility_id_pudl', 'plant_id_pudl'])
    col_agg_series = plant_util_group.apply(lambda x: col_aggregator(x, agg_col))
    col_agg_df = pd.DataFrame(col_agg_series).reset_index()
    col_agg_df[[agg_col, f'{agg_col}_flag']] = pd.DataFrame(col_agg_df[0].tolist(), index=col_agg_df.index)
    col_agg_df = col_agg_df.drop(columns=[0])
    
    return col_agg_df

In [480]:
# SMALL SCALE AGGREGATOR TEST

test = fixed_flags[fixed_flags['plant_id_pudl']==171]
#test = test[['report_year', 'utility_id_pudl', 'utility_name_ferc1', 'plant_id_pudl', 'plant_name_ferc1', 'capacity_mw', 'plant_type', 'total', 'total_type', 'record_id']]
# test = test[['report_year', 'utility_name_ferc1', 'utility_id_pudl', 'plant_id_pudl', 'plant_name_ferc1', 'plant_type', 
#              'total_type', 'capacity_mw', 'net_generation_mwh', 'avg_num_employees', 'capex_total', 
#              'capex_land', 'capex_equipment', 'capex_structures', 'asset_retirement_cost', 'opex_operations',
#              'opex_fuel', 'opex_coolants', 'opex_steam', 'opex_steam_other', 'opex_transfer', 'opex_electric',
#              'opex_misc_power', 'opex_rents', 'opex_allowances', 'opex_engineering', 'opex_structures', 
#              'opex_boiler', 'opex_plants', 'opex_misc_steam', 'opex_production_total']]
test2 = test[['report_year', 'utility_name_ferc1', 'utility_id_pudl', 'plant_id_pudl', 'plant_name_ferc1', 'plant_type', 'capacity_mw',
              'total_type', 'avg_num_employees', 'capex_total', 'opex_production_total']]
df = test2[(test2['report_year'] < 1996)]
build_col_agg_df(df, 'capacity_mw')

Unnamed: 0,report_year,utility_id_pudl,plant_id_pudl,capacity_mw,capacity_mw_flag
0,1994,363,171,95.0,
1,1994,364,171,576.0,
2,1994,365,171,104.94,
3,1995,363,171,98.0,
4,1995,364,171,570.0,
5,1995,365,171,104.94,


In [None]:
build_col_agg_df(flagged_df, 'capacity_mw')

#### Things to do still: 
- if use a value from plant total make it apply to the other utilities that also have a stake in that plant (avg_num_employees is a good example)
- add another flag to check the subcomponent sum against reported utility owned totals (if there is one)
- if there is a value within 1 of a labeled value in the same plant group, it's probably a total

In [856]:
###### list of plant ids with total values
#list(steam2[steam2['total']==True]['plant_id_pudl'].unique())

In [None]:
##############

In [919]:
# read in Jon's CSV
jon_df = pd.read_csv('/Users/aesharpe/Desktop/Work/Catalyst_Coop/RMI/Depreciation/f1_steam_flagged_with_plant_id.csv')

In [920]:
jon_df = jon_df[['Flag', 'plant_name', 'report_year', 'plant_id_ferc1']]
jon_df['plant_name'] = jon_df.plant_name.str.lower()

In [921]:
steam3 = steam2[['report_year', 'utility_id_ferc1', 'utility_id_pudl', 'utility_name_ferc1',
                 'plant_id_pudl', 'plant_id_ferc1', 'plant_name_ferc1', 'avg_num_employees', 
                 'capacity_mw', 'net_generation_mwh', 'opex_production_total', 'capex_total',
                 'primary_fuel_by_mmbtu', 'primary_fuel_by_cost', 'fuel_type_code_pudl_net_gen', 'total']]

In [929]:
jon_df['rec'] = jon_df.report_year.map(str) + '-' + jon_df.plant_name.map(str)
steam3['rec'] = steam3.report_year.map(str) + '-' + steam3.plant_name_ferc1.map(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  steam3['rec'] = steam3.report_year.map(str) + '-' + steam3.plant_name_ferc1.map(str)


In [930]:
dd = pd.merge(jon_df, steam3, on='rec', how='inner')
len(dd)

38354

In [931]:
dd['dup'] = dd['rec'].duplicated()

In [932]:
dups = dd[dd['dup']==True]
dup_recs = list(dups['rec'].unique())

In [933]:
tt = dd[dd['rec'].str.contains('|'.join(dup_recs))]

  return func(self, *args, **kwargs)


In [944]:
test = tt[(tt['Flag'].notna()) & (tt['total']==False)]
test[['plant_id_pudl', 'report_year_x']].drop_duplicates()

Unnamed: 0,plant_id_pudl,report_year_x
31,582,1994
94,316,1994
101,167,1994
177,599,1994
223,295,1994
...,...,...
37404,65,2019
37633,357,2019
37686,278,2019
37769,8470,2019


In [880]:
flags = jon_df[jon_df['Flag'].notna()]
flags

Unnamed: 0,Flag,plant_name,report_year,rec
2,x,sterling,1994,1994-sterling
3,x,grand tower,1994,1994-grand tower
10,d,laredo,1994,1994-laredo
11,k,lon c. hill,1994,1994-lon c. hill
12,d,victoria,1994,1994-victoria
...,...,...,...,...
28932,d,columbia 2,2019,2019-columbia 2
28933,k,columbia total,2019,2019-columbia total
28934,d,elm road 1,2019,2019-elm road 1
28935,d,elm road 2,2019,2019-elm road 2


In [882]:
#steam3[steam3['total']==True]