# Income Statement Explosion

In [None]:
import pandas as pd
import numpy as np
from dagster import AssetKey
import sqlalchemy as sa

import matplotlib.pyplot as plt
import matplotlib as mpl
%matplotlib inline
mpl.style.use('dark_background')
figsize=(12,5)

import pudl
from pudl.etl import defs, default_assets
from pudl.helpers import get_asset_group_keys
from pudl.transform.ferc1 import *

In [None]:
# Fetch the default PUDL workspace settings and create one SQLAlchemy engine
# for each database location stored under the "pudl_db", "ferc1_xbrl_db" and
# "ferc1_db" settings keys.
pudl_settings = pudl.workspace.setup.get_defaults()
pudl_engine = sa.create_engine(pudl_settings['pudl_db'])
ferc1_engine_xbrl = sa.create_engine(pudl_settings["ferc1_xbrl_db"])
ferc1_engine_dbf= sa.create_engine(pudl_settings["ferc1_db"])

In [None]:
# Read the full "electric_operating_revenues_other_300_duration" table from
# the FERC1 XBRL database so it can be displayed for inspection further down.
op_rev_other = pd.read_sql("electric_operating_revenues_other_300_duration" ,ferc1_engine_xbrl)

In [None]:
# Load the "xbrl_metadata_json" asset through the Dagster definitions object.
# Later cells treat it as {table_name: {sub-table key: [column dicts with a
# "name" entry]}}.
xbrl_meta = defs.load_asset_value(AssetKey("xbrl_metadata_json"))

## Convert the metadata & Pull the tables

In [None]:
# Tables that take part in the income statement explosion. The rate-schedule
# table is currently commented out of the list.
income_statement_tables = [
    "income_statement_ferc1",
    "depreciation_amortization_summary_ferc1",
    "electric_operating_expenses_ferc1",
    "electric_operating_revenues_ferc1",
    # "electricity_sales_by_rate_schedule_ferc1"
]
# Name of the dollar-value column in each table. This mapping still carries an
# entry for the rate-schedule table even though it is disabled above.
income_table_dollar_cols = {
    "income_statement_ferc1": "income",
    "depreciation_amortization_summary_ferc1": "depreciation_amortization_value",
    "electric_operating_expenses_ferc1": "expense",
    "electric_operating_revenues_ferc1": "revenue",
    "electricity_sales_by_rate_schedule_ferc1": "sales_revenue",
}
# Alternative loading paths kept for reference (via Dagster assets instead of
# reading straight from the PUDL database):
# get_asset_group_keys("norm_ferc1", default_assets)
# tables = {tbl: defs.load_asset_value(AssetKey(tbl)) for tbl in income_statement_tables}
# Read each table from the PUDL database, keyed by table name.
tables = {tbl: pd.read_sql(tbl, pudl_engine) for tbl in income_statement_tables}
# Convert the raw XBRL metadata for these tables; the result is indexed below
# as meta_converted[table_name][factoid_name]["calcs"].
meta_converted = ExplodeMeta(xbrl_meta).convert_metadata(income_statement_tables)

In [None]:
# make all the calculated tables

# note: there are more WARNINGs about fields from the metadata not
# showing up here because I added all of the columns into meta_converted
# even if they aren't calculated values. This is because in the next stage
# we need to be able to find the factoids that are reported in two
# tables.

# Run check_table_calcs once per table and keep each result keyed by table
# name; a later cell reads these results back out of calc_dfs.
calc_dfs = {}
for table_name in income_statement_tables:
    dollar_value_col = income_table_dollar_cols[table_name]
    table_df = tables[table_name]
    calc_dfs[table_name] = check_table_calcs(
        table_name,
        table_df,
        dollar_value_col,
        meta_converted=meta_converted
    )

## Explode the income statement table

In [None]:
# Give every table the same two column names so the frames can be stacked:
# the table-specific dollar column becomes "dollar_amount" and the column
# named by the transformer's merge_xbrl_metadata.on parameter becomes
# "xbrl_factoid_name".
tables_to_concat = []
for table_name in income_statement_tables:
    dollar_value_col = income_table_dollar_cols[table_name]
    xbrl_factoid_name = (
        FERC1_TFR_CLASSES[table_name]().params.merge_xbrl_metadata.on
    )
    table_df = tables[table_name]
    tables_to_concat.append(
        table_df.rename(
            columns={
                dollar_value_col: "dollar_amount",
                xbrl_factoid_name: "xbrl_factoid_name",
            }
        )
    )

In [None]:
# Stack the renamed tables row-wise into a single frame.
explode_df = pd.concat(tables_to_concat)
# Keep only the converted-metadata entries for the tables in this explosion.
explode_meta = {
    table: table_meta
    for table, table_meta in meta_converted.items()
    if table in income_statement_tables
}

### ID, Verify & Drop Duplicates
- do the values of all of the columns that show up in two tables match?
   - if so drop them
- do all of the calculated values that have their sub-components in this table calculate?
  - if so drop the calculated values and keep the sub-components

In [None]:
# For each field, does its name_original also show up as the name_original of
# a field in another table? We compare on the original name because each table
# has different transforms and therefore different renames.

# Collect the distinct field names across every table in this explosion.
all_fields = {
    field_name
    for table_fields in explode_meta.values()
    for field_name in table_fields
}

In [None]:
# count the instances of the field's OG name
# TODO: the counting logic has not been written yet. The loop body used to be
# only a comment, which is a SyntaxError (a `for` statement needs at least
# one statement in its body), so `pass` keeps this cell runnable until
# the real logic is filled in.
for field in all_fields:
    # explode_meta... something
    pass

### Find/fix any missing columns

In [None]:
# Column names that were flagged as missing and need to be located in the raw
# XBRL metadata.
missing_cols = [
    'demand_charges_revenue_sales_for_resale',
    'other_sales_to_public_authorities_billed',
    'interdepartmental_sales_unbilled',
    'other_charges_revenue_sales_for_resale',
    'small_or_commercial_sales_electric_operating_revenue_unbilled',
    'small_or_commercial_sales_electric_operating_revenue_billed',
    'megawatt_hours_sold_large_or_industrial_unbilled',
    'public_street_and_highway_lighting_unbilled',
    'large_or_industrial_sales_electric_operating_revenue_billed',
    'small_or_commercial_sales',
    'public_street_and_highway_lighting_billed',
    'large_or_industrial_sales',
    'other_sales_to_public_authorities_unbilled',
    'residential_sales_unbilled',
    'interdepartmental_sales_billed',
    'energy_charges_revenue_sales_for_resale',
    'sales_to_railroads_and_railways_unbilled',
    'sales_to_railroads_and_railways_billed',
    'residential_sales_billed',
]
# Build a little dictionary of missing column name -> the top-level xbrl_meta
# key (table name) under which that column is listed. If a name is listed
# under more than one table, the last one visited wins.
missing_col_to_table = {
    column_meta["name"]: meta_table_name
    for (meta_table_name, sub_tables) in xbrl_meta.items()
    for sub_table_columns in sub_tables.values()
    for column_meta in sub_table_columns
    if column_meta["name"] in missing_cols
}

# If a calculated value contains elements from another table, add the table
# name into the calc component.

In [None]:
# Show the missing columns that were not found anywhere in xbrl_meta.
[missing for missing in missing_cols if missing not in missing_col_to_table]

### Clean up helper bits for one table

In [None]:
# Pick a single table to dig into and gather the pieces the cells below use.
table_name = "electric_operating_expenses_ferc1"
table_df = tables[table_name]
# Calculation-check output for this table, without its record_id column.
calced_df = calc_dfs[table_name].drop(columns=["record_id"])
# Name of this table's factoid column, read from its transformer parameters.
xbrl_factoid_name = FERC1_TFR_CLASSES[table_name]().params.merge_xbrl_metadata.on
# Primary key columns of the table according to the PUDL package metadata.
pks = (
    pudl.metadata.classes.Package.from_resource_ids()
    .get_resource(table_name)
    .schema.primary_key
)
# Primary key minus the factoid column, i.e. the columns shared by every
# factoid row that belongs to the same record.
pks_wo_factoid = [col for col in pks if col != xbrl_factoid_name]

In [None]:
# Rows with a non-null abs_diff whose calculated_dollar_amount is not close
# (np.isclose default tolerances) to the reported dollar column.
matches_reported = np.isclose(
    calced_df.calculated_dollar_amount,
    calced_df[income_table_dollar_cols[table_name]],
)
off_df = calced_df[~matches_reported & (calced_df["abs_diff"].notnull())]

# Rank factoids by how many off rows they have; keep the top three and the
# single worst one.
off_counts_by_factoid = off_df[xbrl_factoid_name].value_counts()
worst_calc_names = off_counts_by_factoid.index[0:3]
worst_calc_name = off_counts_by_factoid.index[0]

In [None]:
# Names of every component in the worst calculation, plus the calculated
# factoid itself as the last entry.
worst_calc_components = [calc_comp["name"] for calc_comp in meta_converted[table_name][worst_calc_name]["calcs"]] + [worst_calc_name]
# Add a categorical copy of the factoid column whose categories are the
# components listed above.
# NOTE(review): the keyword `xbrl_factoid_name=` writes to a column literally
# named "xbrl_factoid_name", not to the column whose name is stored in the
# variable xbrl_factoid_name, so the original factoid column is left
# unchanged. Values outside the listed categories become NaN in the new
# column. Confirm this is intended before "fixing" it, because later cells
# compare the original factoid column against all of worst_calc_names.
off_df = off_df.assign(xbrl_factoid_name=lambda x: x[xbrl_factoid_name].astype(pd.CategoricalDtype(categories=worst_calc_components)))

In [None]:
# Overlay one histogram of rel_diff (50 bins over 0 to 2) for each of the
# worst-offending factoids.
for calc in worst_calc_names:
    is_this_calc = off_df[xbrl_factoid_name] == calc
    plt.hist(
        off_df.loc[is_this_calc, "rel_diff"],
        bins=50,
        range=(0, 2),
        label=calc,
    )
plt.legend()
plt.title(f"Relative Diff in table: {table_name}")
plt.show()

In [None]:
dollar_value_col = income_table_dollar_cols[table_name]
# Within the same record (same primary key minus factoid), find rows whose
# reported dollar value exactly equals the abs_diff of the worst calculation.
# NOTE(review): presumably this looks for a single reported factoid that would
# account for the whole discrepancy - confirm. The merge matches float values
# exactly, so near-equal amounts will not pair up.
pd.merge(
    calced_df,
    off_df[
        (off_df[xbrl_factoid_name] ==worst_calc_name)
    ][pks_wo_factoid + ["abs_diff"]].drop_duplicates(),
    left_on=pks_wo_factoid + [dollar_value_col],
    right_on=pks_wo_factoid + ["abs_diff"],
)

In [None]:
# Re-derive the worst calculation and its component names (components first,
# the calculated factoid itself last).
worst_calc_name = worst_calc_names[0]
worst_calc_components = [calc_comp["name"] for calc_comp in meta_converted[table_name][worst_calc_name]["calcs"]] + [worst_calc_name]
# Index the off rows by (factoid, primary key minus factoid) and select the
# worst factoid's rows; what remains is an index of the record keys where that
# calculation is off.
off_worst_index = (
    off_df.set_index([xbrl_factoid_name] + pks_wo_factoid).sort_index().loc[worst_calc_name].index)
# Show the worst calculation and all of its components for those records,
# limited to the first 60 rows.
(
    calced_df.loc[(calced_df[xbrl_factoid_name].isin(worst_calc_components))]
    .sort_values(xbrl_factoid_name).set_index(pks_wo_factoid).sort_index()
    .loc[off_worst_index]
    .head(60)
)

In [None]:
# Display the raw XBRL table read from the FERC1 XBRL database.
op_rev_other

In [None]:
# Display the record keys where the worst calculation is off.
off_worst_index