# Income Statement Explosion

In [None]:
import pandas as pd
import numpy as np
from dagster import AssetKey
import sqlalchemy as sa

import matplotlib.pyplot as plt
import matplotlib as mpl
%matplotlib inline
mpl.style.use('dark_background')
figsize=(12,5)

import pudl
from pudl.etl import defs, default_assets
from pudl.helpers import get_asset_group_keys
from pudl.transform.ferc1 import *

In [None]:
# Resolve the default PUDL workspace settings, then open one SQLAlchemy engine
# per database URL listed in them: the PUDL DB, the FERC1 XBRL DB and the
# FERC1 DBF DB.
pudl_settings = pudl.workspace.setup.get_defaults()
pudl_engine, ferc1_engine_xbrl, ferc1_engine_dbf = (
    sa.create_engine(pudl_settings[db_key])
    for db_key in ("pudl_db", "ferc1_xbrl_db", "ferc1_db")
)

In [None]:
# Load the value of the "xbrl_metadata_json" asset from the PUDL Dagster
# definitions; it is the input to ExplodeMeta further down.
xbrl_meta = defs.load_asset_value(
    AssetKey("xbrl_metadata_json"),
)

In [None]:
# Name of the dollar-value column in each income-statement table, keyed by
# table name.
income_table_dollar_cols = {
    "income_statement_ferc1": "income",
    "depreciation_amortization_summary_ferc1": "depreciation_amortization_value",
    "electric_operating_expenses_ferc1": "expense",
    "electric_operating_revenues_ferc1": "revenue",
}
# The tables to work with are exactly the keys above, in insertion order.
income_statement_tables = list(income_table_dollar_cols)
# Read every income-statement table straight from the PUDL database.
# Alternative kept for reference: the same tables could be loaded as Dagster
# assets with defs.load_asset_value(AssetKey(tbl)), and the asset keys of the
# "norm_ferc1" group listed with get_asset_group_keys("norm_ferc1", default_assets).
tables = {
    name: pd.read_sql(name, pudl_engine)
    for name in income_statement_tables
}
# Run the XBRL metadata through ExplodeMeta for these tables; the result is
# indexed by table name when the calculations are checked.
meta_converted = ExplodeMeta(xbrl_meta).convert_metadata(
    income_statement_tables
)

In [None]:
# Build one calculation-checked dataframe per income-statement table, keyed
# by table name.
calc_dfs = {}
for table_name in income_statement_tables:
    # Gather this table's inputs: its data, its dollar column and the
    # converted metadata entry describing its calculations.
    table_df = tables[table_name]
    dollar_value_col = income_table_dollar_cols[table_name]
    calculated_values = meta_converted[table_name]
    calc_dfs[table_name] = check_table_calcs(
        table_name,
        table_df,
        dollar_value_col,
        calculated_values,
    )
    

### Clean up helper bits for one table

In [None]:
# Pick the single table to inspect in the cells that follow.
table_name = "electric_operating_revenues_ferc1"
table_df = tables[table_name]
# Calculation-checked copy of the table, without the record_id column.
calced_df = calc_dfs[table_name].drop(
    columns=["record_id"],
)
# Column the table's transformer merges the XBRL metadata on.
xbrl_factoid_name = (
    FERC1_TFR_CLASSES[table_name]()
    .params
    .merge_xbrl_metadata
    .on
)
# Primary key of the table as declared in the PUDL metadata package.
pks = pudl.metadata.classes.Package.from_resource_ids().get_resource(
    table_name
).schema.primary_key
# Same primary key minus the factoid column.
pks_wo_factoid = [
    pk_col for pk_col in pks if pk_col != xbrl_factoid_name
]

In [None]:
# True where the reported dollar value agrees (within np.isclose's default
# tolerances) with the value calculated from the sub-components.
reported_matches_calc = np.isclose(
    calced_df.calculated_dollar_amount,
    calced_df[income_table_dollar_cols[table_name]],
)
# Keep only the rows that are off AND have a non-null abs_diff.
off_df = calced_df[~reported_matches_calc & calced_df["abs_diff"].notnull()]

# Count the off rows per factoid once. value_counts() sorts by descending
# frequency, so the head of the index holds the most frequently-off factoids.
off_factoid_counts = off_df[xbrl_factoid_name].value_counts()
worst_calc_names = off_factoid_counts.index[0:3]
# NOTE: raises IndexError when no row is off, exactly as before.
worst_calc_name = off_factoid_counts.index[0]

In [None]:
# Names of the sub-components listed in the converted metadata for the worst
# calculation, followed by the calculated factoid itself.
worst_calc_components = [
    calc_comp["name"] for calc_comp in meta_converted[table_name][worst_calc_name]
] + [worst_calc_name]

# Category order: the worst calculation's components first, then every other
# factoid already present in off_df. Keeping the remaining factoids as
# categories means the conversion never turns an existing value into NaN.
# dict.fromkeys de-duplicates while preserving order (categories must be unique).
factoid_categories = list(
    dict.fromkeys(
        [*worst_calc_components, *off_df[xbrl_factoid_name].dropna().unique()]
    )
)

# The target column's name lives in the variable xbrl_factoid_name, so it has
# to be passed through ** unpacking. Writing assign(xbrl_factoid_name=...)
# would instead add a column literally called "xbrl_factoid_name" and leave
# the real factoid column unconverted.
off_df = off_df.assign(
    **{
        xbrl_factoid_name: lambda x: x[xbrl_factoid_name].astype(
            pd.CategoricalDtype(categories=factoid_categories)
        )
    }
)

In [None]:
# Overlay one histogram of the relative difference per worst-offending
# factoid, restricted to the 0-2 range.
for calc in worst_calc_names:
    plt.hist(
        off_df.loc[off_df[xbrl_factoid_name] == calc, "rel_diff"],
        label=calc,
        range=(0, 2),
        bins=50,
    )
plt.title(f"Relative Diff in table: {table_name}")
plt.legend()
plt.show()

In [None]:
# Use the dollar column that belongs to the table under inspection. A
# hard-coded "expense" only exists in the operating-expenses table and is not
# the column mapped to any other table_name (e.g. "revenue" for the
# operating-revenues table selected above).
dollar_value_col = income_table_dollar_cols[table_name]

# One row per primary-key group (factoid excluded) and abs_diff for the worst
# calculation.
worst_calc_diffs = off_df.loc[
    off_df[xbrl_factoid_name] == worst_calc_name,
    pks_wo_factoid + ["abs_diff"],
].drop_duplicates()

# Find records in the same primary-key group whose reported dollar value
# equals the worst calculation's absolute difference.
# NOTE(review): presumably this surfaces a single record that accounts for the
# whole discrepancy; the match is on exact float equality - confirm that is
# what is wanted.
pd.merge(
    calced_df,
    worst_calc_diffs,
    left_on=pks_wo_factoid + [dollar_value_col],
    right_on=pks_wo_factoid + ["abs_diff"],
)