From c9a681a5d503b7a24417e4d3386cde3d38dd878c Mon Sep 17 00:00:00 2001 From: Austen Sharpe Date: Thu, 8 Jun 2023 17:14:45 -0400 Subject: [PATCH 01/10] Add a transform main function with spot fixes for certain negative ending_balance values for the utility_plant_summary_ferc1 table --- src/pudl/transform/ferc1.py | 161 ++++++++++++++++++++++++++++++++++++ 1 file changed, 161 insertions(+) diff --git a/src/pudl/transform/ferc1.py b/src/pudl/transform/ferc1.py index bbbaae5ebc..c65f8c584f 100644 --- a/src/pudl/transform/ferc1.py +++ b/src/pudl/transform/ferc1.py @@ -3893,6 +3893,167 @@ class UtilityPlantSummaryFerc1TableTransformer(Ferc1AbstractTableTransformer): table_id: TableIdFerc1 = TableIdFerc1.UTILITY_PLANT_SUMMARY_FERC1 has_unique_record_ids: bool = False + def transform_main(self: Self, df: pd.DataFrame) -> pd.DataFrame: + """Spot fix depreciation_utility_plant_in_service records with bad signs.""" + df = super().transform_main(df) + + primary_keys = [ + "report_year", + "utility_id_ferc1", + "utility_type", + "utility_plant_asset_type", + ] + + # The utility_id_ferc1 211 follows the same pattern for several years + # instead of writing them all out in spot_fix_pks, we'll create a loop that + # generates all of them and then append them to spot_fix_pks later + spot_fix_211 = [] + for year in np.append(2006, range(2009, 2021)): + for utility_type in ["electric", "total"]: + pks = [ + ( + year, + 211, + utility_type, + "accumulated_provision_for_depreciation_amortization_and_depletion_of_plant_utility_detail", + ), + ( + year, + 211, + utility_type, + "amortization_of_other_utility_plant_utility_plant_in_service", + ), + ( + year, + 211, + utility_type, + "depreciation_amortization_and_depletion_utility_plant_in_service", + ), + (year, 211, utility_type, "depreciation_utility_plant_in_service"), + ] + spot_fix_211 = spot_fix_211 + pks + + spot_fix_pks = [ + ( + 2012, + 156, + "total", + "accumulated_provision_for_depreciation_amortization_and_depletion_of_plant_utility_detail", + ), + ( + 2012, + 156, + "total", + "depreciation_amortization_and_depletion_utility_plant_in_service", + ), + (2012, 156, "total", "depreciation_utility_plant_in_service"), + ( + 2012, + 156, + "electric", + "accumulated_provision_for_depreciation_amortization_and_depletion_of_plant_utility_detail", + ), + ( + 2012, + 156, + "electric", + "depreciation_amortization_and_depletion_utility_plant_in_service", + ), + (2012, 156, "electric", "depreciation_utility_plant_in_service"), + ( + 2002, + 170, + "other1", + "accumulated_provision_for_depreciation_amortization_and_depletion_of_plant_utility", + ), + ( + 2013, + 170, + "total", + "accumulated_provision_for_depreciation_amortization_and_depletion_of_plant_utility_detail", + ), + ( + 2013, + 170, + "total", + "amortization_of_other_utility_plant_utility_plant_in_service", + ), + (2013, 170, "total", "amortization_of_plant_acquisition_adjustment"), + ( + 2013, + 170, + "total", + "depreciation_amortization_and_depletion_utility_plant_in_service", + ), + (2013, 170, "total", "depreciation_utility_plant_in_service"), + ( + 2013, + 170, + "electric", + "accumulated_provision_for_depreciation_amortization_and_depletion_of_plant_utility_detail", + ), + ( + 2013, + 170, + "electric", + "amortization_of_other_utility_plant_utility_plant_in_service", + ), + (2013, 170, "electric", "amortization_of_plant_acquisition_adjustment"), + ( + 2013, + 170, + "electric", + "depreciation_amortization_and_depletion_utility_plant_in_service", + ), + (2013, 170, "electric", "depreciation_utility_plant_in_service"), + ( + 2007, + 393, + "electric", + "accumulated_provision_for_depreciation_amortization_and_depletion_of_plant_utility_detail", + ), + ( + 2007, + 393, + "electric", + "depreciation_amortization_and_depletion_utility_plant_in_service", + ), + (2007, 393, "electric", "depreciation_utility_plant_in_service"), + ( + 2007, + 393, + "total", + "accumulated_provision_for_depreciation_amortization_and_depletion_of_plant_utility_detail", + ), + ( + 2007, + 393, + "total", + "depreciation_amortization_and_depletion_utility_plant_in_service", + ), + (2007, 393, "total", "depreciation_utility_plant_in_service"), + ] + + # Combine bespoke fixes with programatically generated spot fixes + spot_fix_pks = spot_fix_pks + spot_fix_211 + + # Create a df out of the primary key of the records you want to fix + df_keys = pd.DataFrame(spot_fix_pks, columns=primary_keys).set_index( + primary_keys + ) + df.set_index(primary_keys, inplace=True) + # Flip the signs for the values in "ending balance" all records in the original + # df that appear in the primary key df + df.loc[df_keys.index, "ending_balance"] = df["ending_balance"] * -1 + # All of these are flipping negative values to positive values, so let's + # make sure that's what happens + if (df.loc[df_keys.index].ending_balance < 0).any(): + raise AssertionError("None of these spot fixes should be negative") + + df = df.reset_index() + + return df + class BalanceSheetLiabilitiesFerc1TableTransformer(Ferc1AbstractTableTransformer): """Transformer class for :ref:`balance_sheet_liabilities_ferc1` table.""" From fad70cde70fe78fdaad83bd6c2d653c4d36ccc92 Mon Sep 17 00:00:00 2001 From: Austen Sharpe Date: Tue, 13 Jun 2023 00:01:03 -0400 Subject: [PATCH 02/10] Add conditionals for spot fixes to accomodate fast tests --- src/pudl/transform/ferc1.py | 87 ++++++++++++++++++++++--------------- 1 file changed, 51 insertions(+), 36 deletions(-) diff --git a/src/pudl/transform/ferc1.py b/src/pudl/transform/ferc1.py index 29ddd39787..ab6affdfa3 100644 --- a/src/pudl/transform/ferc1.py +++ b/src/pudl/transform/ferc1.py @@ -4014,33 +4014,44 @@ def transform_main(self: Self, df: pd.DataFrame) -> pd.DataFrame: "utility_plant_asset_type", ] + # We need to track which years are getting transformed so the spot fixes + # don't fail on the fast tests. + df_years = df.report_year.unique().tolist() + # The utility_id_ferc1 211 follows the same pattern for several years # instead of writing them all out in spot_fix_pks, we'll create a loop that # generates all of them and then append them to spot_fix_pks later spot_fix_211 = [] for year in np.append(2006, range(2009, 2021)): - for utility_type in ["electric", "total"]: - pks = [ - ( - year, - 211, - utility_type, - "accumulated_provision_for_depreciation_amortization_and_depletion_of_plant_utility_detail", - ), - ( - year, - 211, - utility_type, - "amortization_of_other_utility_plant_utility_plant_in_service", - ), - ( - year, - 211, - utility_type, - "depreciation_amortization_and_depletion_utility_plant_in_service", - ), - (year, 211, utility_type, "depreciation_utility_plant_in_service"), - ] + # Measure to account for fast tests where not all years are used + if year in df_years: + for utility_type in ["electric", "total"]: + pks = [ + ( + year, + 211, + utility_type, + "accumulated_provision_for_depreciation_amortization_and_depletion_of_plant_utility_detail", + ), + ( + year, + 211, + utility_type, + "amortization_of_other_utility_plant_utility_plant_in_service", + ), + ( + year, + 211, + utility_type, + "depreciation_amortization_and_depletion_utility_plant_in_service", + ), + ( + year, + 211, + utility_type, + "depreciation_utility_plant_in_service", + ), + ] spot_fix_211 = spot_fix_211 + pks spot_fix_pks = [ @@ -4143,24 +4154,28 @@ def transform_main(self: Self, df: pd.DataFrame) -> pd.DataFrame: ), (2007, 393, "total", "depreciation_utility_plant_in_service"), ] + # Measure to account for fast tests where not all years are used + spot_fix_pks = [x for x in spot_fix_pks if x[0] in df_years] # Combine bespoke fixes with programatically generated spot fixes spot_fix_pks = spot_fix_pks + spot_fix_211 - # Create a df out of the primary key of the records you want to fix - df_keys = pd.DataFrame(spot_fix_pks, columns=primary_keys).set_index( - primary_keys - ) - df.set_index(primary_keys, inplace=True) - # Flip the signs for the values in "ending balance" all records in the original - # df that appear in the primary key df - df.loc[df_keys.index, "ending_balance"] = df["ending_balance"] * -1 - # All of these are flipping negative values to positive values, so let's - # make sure that's what happens - if (df.loc[df_keys.index].ending_balance < 0).any(): - raise AssertionError("None of these spot fixes should be negative") - - df = df.reset_index() + # Measure to account for fast tests where not all years are used + if spot_fix_pks: + # Create a df out of the primary key of the records you want to fix + df_keys = pd.DataFrame(spot_fix_pks, columns=primary_keys).set_index( + primary_keys + ) + df.set_index(primary_keys, inplace=True) + # Flip the signs for the values in "ending balance" all records in the original + # df that appear in the primary key df + df.loc[df_keys.index, "ending_balance"] = df["ending_balance"] * -1 + # All of these are flipping negative values to positive values, so let's + # make sure that's what happens + if (df.loc[df_keys.index].ending_balance < 0).any(): + raise AssertionError("None of these spot fixes should be negative") + + df = df.reset_index() return df From ad5a3b07a66c61db938fbca3c7588852995a7667 Mon Sep 17 00:00:00 2001 From: Austen Sharpe Date: Tue, 13 Jun 2023 14:36:38 -0400 Subject: [PATCH 03/10] Switch accum_provision_DAD rows in utility_plant_summary table Previously we mapped FERC row 14, Accum Prov for Depr, Amort, & Depl, to the XBRL row accumulated_provision_for_depreciation_amortization_and_depletion_of_plant_utility and FERC row 33, Total Accum Prov (equals 14) (22,26,30,31,32), to a unique row accumulated_provision_for_depreciation_amortization_and_depletion_of_plant_utility_detail (note the '_detail' suffix). The former (sans '_detail') maps to an XBRL calculated value while the latter does not. Technically, row 33 should be the calculated value as it notes all the rows that should sum up to it). This commit maps row 33 to accumulated_provision_for_depreciation_amortization_and_depletion_of_plant_utility and row 14 to accumulated_provision_for_depreciation_amortization_and_depletion_of_plant_utility_reported. I added the '_reported' suffix instead of the '_detail' suffix because it's more informative.This commit also replaces all the spot-fix rows that reference these values with the correct value name. This commit also adds one more value to the spot fixer for utility_plant_summary_ferc1 --- src/pudl/transform/ferc1.py | 31 ++++++++++++++++++------------ src/pudl/transform/params/ferc1.py | 4 ++-- 2 files changed, 21 insertions(+), 14 deletions(-) diff --git a/src/pudl/transform/ferc1.py b/src/pudl/transform/ferc1.py index ab6affdfa3..c3129a854d 100644 --- a/src/pudl/transform/ferc1.py +++ b/src/pudl/transform/ferc1.py @@ -4031,7 +4031,7 @@ def transform_main(self: Self, df: pd.DataFrame) -> pd.DataFrame: year, 211, utility_type, - "accumulated_provision_for_depreciation_amortization_and_depletion_of_plant_utility_detail", + "accumulated_provision_for_depreciation_amortization_and_depletion_of_plant_utility", ), ( year, @@ -4059,7 +4059,7 @@ def transform_main(self: Self, df: pd.DataFrame) -> pd.DataFrame: 2012, 156, "total", - "accumulated_provision_for_depreciation_amortization_and_depletion_of_plant_utility_detail", + "accumulated_provision_for_depreciation_amortization_and_depletion_of_plant_utility", ), ( 2012, @@ -4072,7 +4072,7 @@ def transform_main(self: Self, df: pd.DataFrame) -> pd.DataFrame: 2012, 156, "electric", - "accumulated_provision_for_depreciation_amortization_and_depletion_of_plant_utility_detail", + "accumulated_provision_for_depreciation_amortization_and_depletion_of_plant_utility", ), ( 2012, @@ -4085,13 +4085,19 @@ def transform_main(self: Self, df: pd.DataFrame) -> pd.DataFrame: 2002, 170, "other1", - "accumulated_provision_for_depreciation_amortization_and_depletion_of_plant_utility", + "accumulated_provision_for_depreciation_amortization_and_depletion_of_plant_utility_reported", ), + ( + 2002, + 170, + "other1", + "utility_plant_net", + ), # ^^ This is the only record that goes positive to negative ( 2013, 170, "total", - "accumulated_provision_for_depreciation_amortization_and_depletion_of_plant_utility_detail", + "accumulated_provision_for_depreciation_amortization_and_depletion_of_plant_utility", ), ( 2013, @@ -4111,7 +4117,7 @@ def transform_main(self: Self, df: pd.DataFrame) -> pd.DataFrame: 2013, 170, "electric", - "accumulated_provision_for_depreciation_amortization_and_depletion_of_plant_utility_detail", + "accumulated_provision_for_depreciation_amortization_and_depletion_of_plant_utility", ), ( 2013, @@ -4131,7 +4137,7 @@ def transform_main(self: Self, df: pd.DataFrame) -> pd.DataFrame: 2007, 393, "electric", - "accumulated_provision_for_depreciation_amortization_and_depletion_of_plant_utility_detail", + "accumulated_provision_for_depreciation_amortization_and_depletion_of_plant_utility", ), ( 2007, @@ -4144,7 +4150,7 @@ def transform_main(self: Self, df: pd.DataFrame) -> pd.DataFrame: 2007, 393, "total", - "accumulated_provision_for_depreciation_amortization_and_depletion_of_plant_utility_detail", + "accumulated_provision_for_depreciation_amortization_and_depletion_of_plant_utility", ), ( 2007, @@ -4170,10 +4176,11 @@ def transform_main(self: Self, df: pd.DataFrame) -> pd.DataFrame: # Flip the signs for the values in "ending balance" all records in the original # df that appear in the primary key df df.loc[df_keys.index, "ending_balance"] = df["ending_balance"] * -1 - # All of these are flipping negative values to positive values, so let's - # make sure that's what happens - if (df.loc[df_keys.index].ending_balance < 0).any(): - raise AssertionError("None of these spot fixes should be negative") + # All of these are flipping negative values to positive values, except one, + # so let's make sure that's what happens + flipped_values = df.loc[df_keys.index].copy() + if len(flipped_values[flipped_values["ending_balance"] < 0]) > 1: + raise AssertionError("Only one of these spot fixes should be negative") df = df.reset_index() diff --git a/src/pudl/transform/params/ferc1.py b/src/pudl/transform/params/ferc1.py index 5b1de09734..206d13d10c 100644 --- a/src/pudl/transform/params/ferc1.py +++ b/src/pudl/transform/params/ferc1.py @@ -3199,7 +3199,7 @@ "const_wrk_prgrs": "construction_work_in_progress_ending_balance", "acqstn_adjstmnt": "utility_plant_acquisition_adjustment_ending_balance", "tot_utlty_plant": "utility_plant_and_construction_work_in_progress_ending_balance", - "accum_prvsn_dad": "accumulated_provision_for_depreciation_amortization_and_depletion_of_plant_utility_ending_balance", + "accum_prvsn_dad": "accumulated_provision_for_depreciation_amortization_and_depletion_of_plant_utility_reported_ending_balance", "net_utlty_plant": "utility_plant_net_ending_balance", # detail of accum deprish # in service @@ -3219,7 +3219,7 @@ # rest of details of acum deprish "abndn_leases": "abandonment_of_leases_ending_balance", "amrtzplnt_acqstn": "amortization_of_plant_acquisition_adjustment_ending_balance", - "tot_accum_prvsn": "accumulated_provision_for_depreciation_amortization_and_depletion_of_plant_utility_detail_ending_balance", + "tot_accum_prvsn": "accumulated_provision_for_depreciation_amortization_and_depletion_of_plant_utility_ending_balance", } }, "xbrl": { From 2706266c77584f6b27f3b7ecb92cc403108c3544 Mon Sep 17 00:00:00 2001 From: Austen Sharpe Date: Wed, 14 Jun 2023 11:51:34 -0400 Subject: [PATCH 04/10] Add spot fix to plant_in_service table that flips certain negative values for electric_plant_sold values where the overall electric_plant_in_service calculation is wrong --- src/pudl/transform/ferc1.py | 46 +++++++++++++++++++++++++++++++++++-- 1 file changed, 44 insertions(+), 2 deletions(-) diff --git a/src/pudl/transform/ferc1.py b/src/pudl/transform/ferc1.py index c3129a854d..461ff0731a 100644 --- a/src/pudl/transform/ferc1.py +++ b/src/pudl/transform/ferc1.py @@ -1227,7 +1227,7 @@ def transform_end(self, df: pd.DataFrame) -> pd.DataFrame: Checks calculations. Enforces dataframe schema. Checks for empty dataframes and null columns. """ - df = self.reconcile_table_calculations(df).pipe(self.enforce_schema) + df = self.reconcile_table_calculations(df) # .pipe(self.enforce_schema) if df.empty: raise ValueError(f"{self.table_id.value}: Final dataframe is empty!!!") for col in df: @@ -3042,8 +3042,50 @@ def transform_main(self, df: pd.DataFrame) -> pd.DataFrame: """The main table-specific transformations, affecting contents not structure. Annotates and alters data based on information from the XBRL taxonomy metadata. + + Fix instances where electric_plant_sold should be positive. This is not all + instance where electric_plant_sold is negative, rather some of the instances + where electric_plant_sold is negative AND the calculated value for + electric_plant_in_service (the field that uses electric_plant_sold as a + subcomponent) does not match it's calculated value. """ - return super().transform_main(df).pipe(self.apply_sign_conventions) + df = super().transform_main(df).pipe(self.apply_sign_conventions) + + df = df.pipe(self.reconcile_table_calculations) + index_group_1 = ["report_year", "utility_id_ferc1"] + index_group_2 = ["report_year", "utility_id_ferc1", "ferc_account_label"] + # Get the index values for electric_plant_in_service fields that don't match + # their calculations + bad_calcs_pis_index = ( + df[ + (df["ferc_account_label"] == "electric_plant_in_service") + & (df["ending_balance"] != df["calculated_amount"]) + ] + .set_index(index_group_1) + .index + ) + # Use that index to get the whole group of ferc_account_labels associated + # with that bad calculation + bad_calc_groups_df = df.set_index(index_group_1).loc[bad_calcs_pis_index] + # Get the index values for the instances of electric_plant_sold that exist + # within the groups with bad calculations. + bad_calc_eps_index = ( + bad_calc_groups_df[ + bad_calc_groups_df["ferc_account_label"] == "electric_plant_sold" + ] + .reset_index() + .set_index(index_group_2) + .index + ) + df = df.reset_index().set_index(index_group_2) + # Replace the electric_plant_sold ending_balance values identified above with + # their absolute value. Also enforce the schema so we can run the calcs again. + df.loc[bad_calc_eps_index, "ending_balance"] = abs(df["ending_balance"]) + df.loc[bad_calc_eps_index, "starting_balance"] = abs(df["starting_balance"]) + df_fixed = df.reset_index().pipe(self.enforce_schema) + # For now, we have to run the calcs check twice because we need to use the + # calculated_amounts field for this spot fix. + return df_fixed class PlantsSmallFerc1TableTransformer(Ferc1AbstractTableTransformer): From fc2e51000c450872039b05973ec09e5cb2ea31c2 Mon Sep 17 00:00:00 2001 From: Austen Sharpe Date: Wed, 14 Jun 2023 13:52:34 -0400 Subject: [PATCH 05/10] Apply reconcile_table_calculations to a copy of the dataframe in the transform_main function for the plant_in_service table (rather than to the dataframe itself) so we can use the calculations to find the index values for the rows to change in the original dataframe without running the calculations on it (and adding new rows for calculations fixes etc.) This change makes the reconcile_table_calculations function work properly in the transform_end function! Previously it was not calculating the right values after some of the spot fixes. --- src/pudl/transform/ferc1.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/src/pudl/transform/ferc1.py b/src/pudl/transform/ferc1.py index 461ff0731a..84667863c8 100644 --- a/src/pudl/transform/ferc1.py +++ b/src/pudl/transform/ferc1.py @@ -1227,7 +1227,7 @@ def transform_end(self, df: pd.DataFrame) -> pd.DataFrame: Checks calculations. Enforces dataframe schema. Checks for empty dataframes and null columns. """ - df = self.reconcile_table_calculations(df) # .pipe(self.enforce_schema) + df = self.reconcile_table_calculations(df).pipe(self.enforce_schema) if df.empty: raise ValueError(f"{self.table_id.value}: Final dataframe is empty!!!") for col in df: @@ -3050,23 +3050,24 @@ def transform_main(self, df: pd.DataFrame) -> pd.DataFrame: subcomponent) does not match it's calculated value. """ df = super().transform_main(df).pipe(self.apply_sign_conventions) - - df = df.pipe(self.reconcile_table_calculations) + # Create a copy of the df with the calculations so we can apply the fixes to + # the original df and run the calculations later (in transform_end) + df_calc = df.pipe(self.reconcile_table_calculations).copy() index_group_1 = ["report_year", "utility_id_ferc1"] index_group_2 = ["report_year", "utility_id_ferc1", "ferc_account_label"] # Get the index values for electric_plant_in_service fields that don't match # their calculations bad_calcs_pis_index = ( - df[ - (df["ferc_account_label"] == "electric_plant_in_service") - & (df["ending_balance"] != df["calculated_amount"]) + df_calc[ + (df_calc["ferc_account_label"] == "electric_plant_in_service") + & (df_calc["ending_balance"] != df_calc["calculated_amount"]) ] .set_index(index_group_1) .index ) # Use that index to get the whole group of ferc_account_labels associated # with that bad calculation - bad_calc_groups_df = df.set_index(index_group_1).loc[bad_calcs_pis_index] + bad_calc_groups_df = df_calc.set_index(index_group_1).loc[bad_calcs_pis_index] # Get the index values for the instances of electric_plant_sold that exist # within the groups with bad calculations. bad_calc_eps_index = ( @@ -3082,9 +3083,7 @@ def transform_main(self, df: pd.DataFrame) -> pd.DataFrame: # their absolute value. Also enforce the schema so we can run the calcs again. df.loc[bad_calc_eps_index, "ending_balance"] = abs(df["ending_balance"]) df.loc[bad_calc_eps_index, "starting_balance"] = abs(df["starting_balance"]) - df_fixed = df.reset_index().pipe(self.enforce_schema) - # For now, we have to run the calcs check twice because we need to use the - # calculated_amounts field for this spot fix. + df_fixed = df.reset_index() return df_fixed From 8c5650b617ab4b9e5481e6a28695113948d17393 Mon Sep 17 00:00:00 2001 From: Austen Sharpe Date: Thu, 15 Jun 2023 11:57:18 -0400 Subject: [PATCH 06/10] Respond to PR comments: dquote> dquote> Simplify the plant_in_service function so that it just flips all electric_plant_sold values to positive values. dquote> dquote> Update the accomodations for the fast test so that they only happen in one place (years are trunkated from the final list of spot_fix_pks instead of each of the individual lists before the are concatinated). --- src/pudl/transform/ferc1.py | 120 ++++++++++++++---------------------- 1 file changed, 46 insertions(+), 74 deletions(-) diff --git a/src/pudl/transform/ferc1.py b/src/pudl/transform/ferc1.py index 84667863c8..758493b237 100644 --- a/src/pudl/transform/ferc1.py +++ b/src/pudl/transform/ferc1.py @@ -1227,7 +1227,7 @@ def transform_end(self, df: pd.DataFrame) -> pd.DataFrame: Checks calculations. Enforces dataframe schema. Checks for empty dataframes and null columns. """ - df = self.reconcile_table_calculations(df).pipe(self.enforce_schema) + df = self.reconcile_table_calculations(df) # .pipe(self.enforce_schema) if df.empty: raise ValueError(f"{self.table_id.value}: Final dataframe is empty!!!") for col in df: @@ -3050,41 +3050,18 @@ def transform_main(self, df: pd.DataFrame) -> pd.DataFrame: subcomponent) does not match it's calculated value. """ df = super().transform_main(df).pipe(self.apply_sign_conventions) - # Create a copy of the df with the calculations so we can apply the fixes to - # the original df and run the calculations later (in transform_end) - df_calc = df.pipe(self.reconcile_table_calculations).copy() - index_group_1 = ["report_year", "utility_id_ferc1"] - index_group_2 = ["report_year", "utility_id_ferc1", "ferc_account_label"] - # Get the index values for electric_plant_in_service fields that don't match - # their calculations - bad_calcs_pis_index = ( - df_calc[ - (df_calc["ferc_account_label"] == "electric_plant_in_service") - & (df_calc["ending_balance"] != df_calc["calculated_amount"]) - ] - .set_index(index_group_1) - .index + # Make all electric_plant_sold values positive + # This could probably be a FERC transformer class function or in the + # apply_sign_conventions function, but it doesn't seem like the best fit for + # now. + neg_values = (df["ferc_account_label"] == "electric_plant_sold") & ( + df["ending_balance"] < 0 ) - # Use that index to get the whole group of ferc_account_labels associated - # with that bad calculation - bad_calc_groups_df = df_calc.set_index(index_group_1).loc[bad_calcs_pis_index] - # Get the index values for the instances of electric_plant_sold that exist - # within the groups with bad calculations. - bad_calc_eps_index = ( - bad_calc_groups_df[ - bad_calc_groups_df["ferc_account_label"] == "electric_plant_sold" - ] - .reset_index() - .set_index(index_group_2) - .index + df.loc[neg_values, "ending_balance"] = abs(df["ending_balance"]) + logger.info( + f"{self.table_id.value}: Converted {len(df[neg_values])} negative values to positive." ) - df = df.reset_index().set_index(index_group_2) - # Replace the electric_plant_sold ending_balance values identified above with - # their absolute value. Also enforce the schema so we can run the calcs again. - df.loc[bad_calc_eps_index, "ending_balance"] = abs(df["ending_balance"]) - df.loc[bad_calc_eps_index, "starting_balance"] = abs(df["starting_balance"]) - df_fixed = df.reset_index() - return df_fixed + return df class PlantsSmallFerc1TableTransformer(Ferc1AbstractTableTransformer): @@ -4055,45 +4032,39 @@ def transform_main(self: Self, df: pd.DataFrame) -> pd.DataFrame: "utility_plant_asset_type", ] - # We need to track which years are getting transformed so the spot fixes - # don't fail on the fast tests. - df_years = df.report_year.unique().tolist() - # The utility_id_ferc1 211 follows the same pattern for several years # instead of writing them all out in spot_fix_pks, we'll create a loop that # generates all of them and then append them to spot_fix_pks later spot_fix_211 = [] for year in np.append(2006, range(2009, 2021)): - # Measure to account for fast tests where not all years are used - if year in df_years: - for utility_type in ["electric", "total"]: - pks = [ - ( - year, - 211, - utility_type, - "accumulated_provision_for_depreciation_amortization_and_depletion_of_plant_utility", - ), - ( - year, - 211, - utility_type, - "amortization_of_other_utility_plant_utility_plant_in_service", - ), - ( - year, - 211, - utility_type, - "depreciation_amortization_and_depletion_utility_plant_in_service", - ), - ( - year, - 211, - utility_type, - "depreciation_utility_plant_in_service", - ), - ] - spot_fix_211 = spot_fix_211 + pks + for utility_type in ["electric", "total"]: + pks = [ + ( + year, + 211, + utility_type, + "accumulated_provision_for_depreciation_amortization_and_depletion_of_plant_utility", + ), + ( + year, + 211, + utility_type, + "amortization_of_other_utility_plant_utility_plant_in_service", + ), + ( + year, + 211, + utility_type, + "depreciation_amortization_and_depletion_utility_plant_in_service", + ), + ( + year, + 211, + utility_type, + "depreciation_utility_plant_in_service", + ), + ] + spot_fix_211 = spot_fix_211 + pks spot_fix_pks = [ ( @@ -4201,13 +4172,15 @@ def transform_main(self: Self, df: pd.DataFrame) -> pd.DataFrame: ), (2007, 393, "total", "depreciation_utility_plant_in_service"), ] - # Measure to account for fast tests where not all years are used - spot_fix_pks = [x for x in spot_fix_pks if x[0] in df_years] # Combine bespoke fixes with programatically generated spot fixes spot_fix_pks = spot_fix_pks + spot_fix_211 - # Measure to account for fast tests where not all years are used + # Par down spot fixes to account for fast tests where not all years are used + df_years = df.report_year.unique().tolist() + spot_fix_pks = [x for x in spot_fix_pks if x[0] in df_years] + logger.info(f"{self.table_id.value}: Spotfixing {len(spot_fix_pks)} records.") + if spot_fix_pks: # Create a df out of the primary key of the records you want to fix df_keys = pd.DataFrame(spot_fix_pks, columns=primary_keys).set_index( @@ -4219,11 +4192,10 @@ def transform_main(self: Self, df: pd.DataFrame) -> pd.DataFrame: df.loc[df_keys.index, "ending_balance"] = df["ending_balance"] * -1 # All of these are flipping negative values to positive values, except one, # so let's make sure that's what happens - flipped_values = df.loc[df_keys.index].copy() + flipped_values = df.loc[df_keys.index] if len(flipped_values[flipped_values["ending_balance"] < 0]) > 1: raise AssertionError("Only one of these spot fixes should be negative") - - df = df.reset_index() + df.reset_index(inplace=True) return df From 85b8d2119d04f2bf9861ceb273e24ff465eb98ba Mon Sep 17 00:00:00 2001 From: Austen Sharpe Date: Thu, 15 Jun 2023 12:36:34 -0400 Subject: [PATCH 07/10] Uncomment line I was using for testing --- src/pudl/transform/ferc1.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pudl/transform/ferc1.py b/src/pudl/transform/ferc1.py index 758493b237..d6d74ef6eb 100644 --- a/src/pudl/transform/ferc1.py +++ b/src/pudl/transform/ferc1.py @@ -1227,7 +1227,7 @@ def transform_end(self, df: pd.DataFrame) -> pd.DataFrame: Checks calculations. Enforces dataframe schema. Checks for empty dataframes and null columns. """ - df = self.reconcile_table_calculations(df) # .pipe(self.enforce_schema) + df = self.reconcile_table_calculations(df).pipe(self.enforce_schema) if df.empty: raise ValueError(f"{self.table_id.value}: Final dataframe is empty!!!") for col in df: From 9c183069a45dcd5670cd9447988a4c46ceac4b24 Mon Sep 17 00:00:00 2001 From: Austen Sharpe Date: Fri, 16 Jun 2023 13:40:32 -0400 Subject: [PATCH 08/10] Fix indent issue in spot fixer and remove two spot fix rows that were causing trouble --- src/pudl/transform/ferc1.py | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/src/pudl/transform/ferc1.py b/src/pudl/transform/ferc1.py index d6d74ef6eb..66f6589f35 100644 --- a/src/pudl/transform/ferc1.py +++ b/src/pudl/transform/ferc1.py @@ -4064,7 +4064,7 @@ def transform_main(self: Self, df: pd.DataFrame) -> pd.DataFrame: "depreciation_utility_plant_in_service", ), ] - spot_fix_211 = spot_fix_211 + pks + spot_fix_211 = spot_fix_211 + pks spot_fix_pks = [ ( @@ -4093,18 +4093,6 @@ def transform_main(self: Self, df: pd.DataFrame) -> pd.DataFrame: "depreciation_amortization_and_depletion_utility_plant_in_service", ), (2012, 156, "electric", "depreciation_utility_plant_in_service"), - ( - 2002, - 170, - "other1", - "accumulated_provision_for_depreciation_amortization_and_depletion_of_plant_utility_reported", - ), - ( - 2002, - 170, - "other1", - "utility_plant_net", - ), # ^^ This is the only record that goes positive to negative ( 2013, 170, From 6e8c6cfc4f826d707ee40faa52480638719b59d9 Mon Sep 17 00:00:00 2001 From: Austen Sharpe Date: Fri, 16 Jun 2023 13:58:49 -0400 Subject: [PATCH 09/10] Add some docs changes to reflect new code and update assertion error to reflect the fact that now there should only be negative values swapped to positive values --- src/pudl/transform/ferc1.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/src/pudl/transform/ferc1.py b/src/pudl/transform/ferc1.py index 940aae9f16..184085c0ae 100644 --- a/src/pudl/transform/ferc1.py +++ b/src/pudl/transform/ferc1.py @@ -3166,11 +3166,7 @@ def transform_main(self, df: pd.DataFrame) -> pd.DataFrame: Annotates and alters data based on information from the XBRL taxonomy metadata. - Fix instances where electric_plant_sold should be positive. This is not all - instance where electric_plant_sold is negative, rather some of the instances - where electric_plant_sold is negative AND the calculated value for - electric_plant_in_service (the field that uses electric_plant_sold as a - subcomponent) does not match it's calculated value. + Make all electric_plant_sold balances positive. """ df = super().transform_main(df).pipe(self.apply_sign_conventions) # Make all electric_plant_sold values positive @@ -4293,7 +4289,7 @@ def transform_main(self: Self, df: pd.DataFrame) -> pd.DataFrame: logger.info(f"{self.table_id.value}: Spotfixing {len(spot_fix_pks)} records.") if spot_fix_pks: - # Create a df out of the primary key of the records you want to fix + # Create a df of the primary key of the records you want to fix df_keys = pd.DataFrame(spot_fix_pks, columns=primary_keys).set_index( primary_keys ) @@ -4301,11 +4297,11 @@ def transform_main(self: Self, df: pd.DataFrame) -> pd.DataFrame: # Flip the signs for the values in "ending balance" all records in the original # df that appear in the primary key df df.loc[df_keys.index, "ending_balance"] = df["ending_balance"] * -1 - # All of these are flipping negative values to positive values, except one, + # All of these are flipping negative values to positive values, # so let's make sure that's what happens flipped_values = df.loc[df_keys.index] - if len(flipped_values[flipped_values["ending_balance"] < 0]) > 1: - raise AssertionError("Only one of these spot fixes should be negative") + if (flipped_values["ending_balance"] < 0).any(): + raise AssertionError("None of these spot fixes should be negative") df.reset_index(inplace=True) return df From d4b15b8a2c1d752ef87de01f6f16ad0ba33b0b13 Mon Sep 17 00:00:00 2001 From: Austen Sharpe Date: Wed, 21 Jun 2023 13:23:51 -0400 Subject: [PATCH 10/10] Add release notes --- docs/release_notes.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/release_notes.rst b/docs/release_notes.rst index 6bc2998546..3f9b6e33eb 100644 --- a/docs/release_notes.rst +++ b/docs/release_notes.rst @@ -201,6 +201,8 @@ Data Cleaning * Added "correction" records to many FERC Form 1 tables where the reported totals do not match the outcomes of calculations specified in XBRL metadata (even after cleaning up the often incorrect calculation specifications!). See :issue:`2957` and :pr:`2620`. +* Flip the sign of some erroneous negative values in the :ref:`plant_in_service_ferc1` + and :ref:`utility_plant_summary_ferc1` tables. See :issue:`2599`, and :pr:`2647`. Analysis ^^^^^^^^