# Notebook Preamble

## IPython Magic

In [None]:
%load_ext autoreload
%autoreload 3

## Notebook Imports

In [None]:
# 3rd Party Imports:
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import matplotx
import duckdb
import polars as pl
import sqlalchemy as sa
import dagster as dg
from dagster import AssetKey
from pathlib import Path
import os

In [None]:
# PUDL Imports
import pudl
from pudl.etl import defs
from pudl.workspace.setup import PudlPaths

In [None]:
logger = pudl.logging_helpers.get_logger("pudl")
os.environ["PUDL_INPUT"]

## Visualization Settings

In [None]:
%matplotlib inline

In [None]:
matplotlib.rcParams["figure.figsize"] = (10, 6)
matplotlib.rcParams["figure.dpi"] = 150
matplotlib.style.use(matplotx.styles.onedark)

pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 300)
pd.set_option("display.max_colwidth", 1000)

pl.Config.set_tbl_rows(100)
pl.Config.set_tbl_cols(100)
pl.Config.set_fmt_str_lengths(100)

## Set up data access shortcuts

In [None]:
from pudl.helpers import get_parquet_table as get_parquet
import geopandas as gpd

def get_asset(table: str) -> pd.DataFrame | gpd.GeoDataFrame:
    """Load the value of the asset keyed by ``table`` from the PUDL definitions."""
    asset_key = AssetKey(table)
    return defs.load_asset_value(asset_key)

def get_pandas(table: str) -> pd.DataFrame:
    """Read ``$PUDL_OUTPUT/parquet/<table>.parquet`` into a pandas DataFrame.

    The file is read with the pyarrow engine (memory-mapped) and the result is
    passed through ``convert_dtypes()`` before being returned.
    """
    parquet_file = Path(os.environ["PUDL_OUTPUT"]) / f"parquet/{table}.parquet"
    frame = pd.read_parquet(parquet_file, memory_map=True, engine="pyarrow")
    return frame.convert_dtypes()

def get_polars(table: str) -> pl.DataFrame:
    """Read ``$PUDL_OUTPUT/parquet/<table>.parquet`` into a polars DataFrame."""
    output_root = Path(os.environ["PUDL_OUTPUT"])
    return pl.read_parquet(output_root / f"parquet/{table}.parquet")

def get_pyarrow(table: str) -> pd.DataFrame:
    """Read ``$PUDL_OUTPUT/parquet/<table>.parquet`` into pandas with pyarrow dtypes.

    Same source file as the other readers in this cell, but the returned frame
    uses ``dtype_backend="pyarrow"``.
    """
    parquet_file = Path(os.environ["PUDL_OUTPUT"]) / f"parquet/{table}.parquet"
    read_options = {
        "dtype_backend": "pyarrow",
        "memory_map": True,
        "engine": "pyarrow",
    }
    return pd.read_parquet(parquet_file, **read_options)


In [None]:
companies = get_polars("out_sec10k__quarterly_company_information")
filings = get_polars("out_sec10k__quarterly_filings")
parsubs = get_polars("out_sec10k__parents_and_subsidiaries")
name_changes = get_polars("out_sec10k__changelog_company_name")
ex21 = get_polars("core_sec10k__quarterly_exhibit_21_company_ownership")

## What fraction of electricity companies are linked to EIA utilities?

### Identify industry IDs with lots of Utility ID associations
- This is kind of begging the question, but it's not a terrible way to find interesting SICs to look at.
- Unsurprisingly, the two SICs with the largest number of Utility ID associations are the electricity-related ones (`4911` and `4931`).

In [None]:
# SIC codes for which more than half of the company-information records carry
# an EIA utility ID, ordered from the highest matched fraction to the lowest.
electricity_sics = (
    get_polars("out_sec10k__quarterly_company_information")
    .group_by(pl.col("industry_id_sic").alias("sic"))
    .agg(
        pl.col("utility_id_eia")
        .is_not_null()
        .mean()
        .alias("fraction_with_utility_id")
    )
    .sort("fraction_with_utility_id", descending=True)
    .filter(pl.col("fraction_with_utility_id") > 0.5)
    .get_column("sic")
    .to_list()
)
electricity_sics

In [None]:
(
    get_polars("out_sec10k__quarterly_company_information")
    .filter(pl.col("industry_id_sic").is_in(electricity_sics))
    .select(["industry_id_sic", "industry_name_sic"])
    .unique(["industry_id_sic", "industry_name_sic"])
)

In [None]:
util_ids_by_year = get_polars("out_sec10k__quarterly_company_information").filter(
    pl.col("industry_id_sic").is_in(electricity_sics),
).with_columns(
    year=pl.col("report_date").dt.year()
).group_by(["year", "industry_id_sic"]).agg(
    fraction_with_utility_id=pl.col("utility_id_eia").is_not_null().mean(),
).sort("year")

for sic in util_ids_by_year["industry_id_sic"].unique():
    df = util_ids_by_year.filter(pl.col("industry_id_sic") == sic)
    plt.plot(df["year"], df["fraction_with_utility_id"], label=sic)

plt.legend()

In [None]:
# Distinct (company name, EIA utility name, report date) combinations for
# companies in SICs 4911 and 4931.
# NOTE(review): despite the variable name, the filter below keeps rows where
# `utility_id_eia` is NOT null, i.e. companies that DO have an EIA utility
# association. Confirm whether `is_null()` was intended or whether the
# variable should be renamed.
unmatched_companies = (
    get_polars("out_sec10k__quarterly_company_information")
    .filter(
        pl.col("industry_id_sic").is_in(["4911", "4931"]),
        pl.col("utility_id_eia").is_not_null(),
    )
    .select(["company_name", "utility_name_eia", "report_date"])
    .unique(["company_name", "utility_name_eia", "report_date"])
)
# NOTE(review): sampling 30 rows without replacement presumably fails if fewer
# than 30 rows survive the filter -- verify.
unmatched_companies.sample(30).sort("report_date")

In [None]:
(
    get_polars("out_sec10k__parents_and_subsidiaries")[
        "report_date",
        "parent_company_central_index_key",
        "parent_company_utility_id_eia",
        "parent_company_name",
        "subsidiary_company_id_sec10k",
        "subsidiary_company_central_index_key",
        "subsidiary_company_utility_id_eia",
        "subsidiary_company_name",
        "fraction_owned",
    ].filter(
        pl.col("report_date").dt.year().is_in([2001, 2023]),
        pl.col("parent_company_central_index_key").eq("0000072903")
    )
)

In [None]:
electricity_sics = ["4911", "4931"]
sics = ["4911", "4931", "6798", "6189", "1311", "2621", "2834", "4991", "4922", "4961", "2631", "7372", "4924", "3674", "2911"]
companies.filter(
    pl.col("utility_id_eia").is_not_null(),
    pl.col("industry_id_sic").is_in(sics)
).group_by(["industry_name_sic"]).len().sort("len", descending=True)

In [None]:
companies.filter(
    # pl.col("utility_id_eia").is_not_null(),
    pl.col("industry_name_sic").str.contains(".*(electricity|power).*")
).select(["industry_name_sic", "industry_id_sic"]).group_by("industry_id_sic").len().sort("len", descending=True)

In [None]:
assert False

In [None]:
def plot_nullness_by_year(
    df: pd.DataFrame,
    title: str = "Fraction of Non-Null Values by Column-Year",
):
    """Plot a binary heatmap showing which column-years contain any data.

    Cells of ``df`` that are greater than zero are drawn in white ("some
    data"); every other cell, including NaN, is drawn in black ("no data").

    The frame is drawn as-is: each row of ``df`` becomes one heatmap row
    (y-axis, labelled "Column") and each column of ``df`` becomes one heatmap
    column (x-axis, labelled "Year").

    Args:
        df: Numeric frame with one row per table column (the index supplies
            the y tick labels) and one column per year (the column labels
            supply the x tick labels). Presumably each value is the fraction
            of non-null values in that column-year -- confirm against
            whatever produces the frame.
        title: Title for the plot.

    Returns:
        The ``(fig, ax)`` pair of the newly created figure.
    """
    import matplotlib.colors as mcolors

    n_rows, n_cols = df.shape

    # Grow the figure with the number of heatmap rows (0.15 inches each) so the
    # tick labels stay legible, but never go below 4 inches tall.
    fig, ax = plt.subplots(figsize=(10, max(4, n_rows * 0.15)))

    # 1 where there is some data (value > 0), 0 otherwise. NaN > 0 is False, so
    # missing values count as "no data".
    has_data = (df.to_numpy() > 0).astype(int)

    # Two-colour map: 0 -> black, 1 -> white.
    im = ax.imshow(
        has_data,
        cmap=mcolors.ListedColormap(["black", "white"]),
        aspect="auto",
        vmin=0,
        vmax=1,
        interpolation="nearest",
    )

    # Thin gray lines on the cell boundaries act as a grid.
    for x in range(n_cols + 1):
        ax.axvline(x=x - 0.5, color="gray", linewidth=0.5, alpha=0.7)
    for y in range(n_rows + 1):
        ax.axhline(y=y - 0.5, color="gray", linewidth=0.5, alpha=0.7)

    # Tick labels come straight from the frame's column and index labels.
    ax.set_xticks(range(n_cols))
    ax.set_xticklabels(df.columns, rotation=45, ha="right", fontsize=8)
    ax.set_yticks(range(n_rows))
    ax.set_yticklabels(df.index, fontsize=7)

    ax.set_xlabel("Year", fontsize=10)
    ax.set_ylabel("Column", fontsize=10)
    ax.set_title(title, fontsize=11)

    # Attach the colorbar and layout to this figure explicitly rather than to
    # pyplot's implicit "current figure".
    cbar = fig.colorbar(im, ax=ax, ticks=[0, 1])
    cbar.ax.set_yticklabels(["No Data (0)", "Some Data (>0)"], fontsize=8)
    cbar.set_label("Data Availability", fontsize=9)

    # Adjust layout to prevent label cutoff.
    fig.tight_layout()

    return fig, ax

In [None]:
table_name = "out_eia__monthly_generators"
parquet_path = PudlPaths().parquet_path(table_name)
nullness = calculate_nullness_by_year(
    parquet_path,
    date_column="report_date"
)

In [None]:
plot_nullness_by_year(
    nullness,
    title=table_name,
)

In [None]:
assert False

In [None]:
# Find records whose net generation exceeds what `capacity_mw` could produce
# running flat out for a whole year, i.e. an observed capacity factor above 1.
gf = pudl.helpers.get_parquet_table("out_eia923__yearly_generation_fuel_by_generator_energy_source_owner")
# 8784 = 366 * 24, the number of hours in a leap year.
# NOTE(review): using the leap-year value for every year slightly understates
# the capacity factor in 365-day years (8760 hours) -- confirm this is meant as
# a conservative threshold rather than an oversight.
gf["observed_capfac"] = gf["net_generation_mwh"] / (gf["capacity_mw"] * 8784)
inconsistent = gf.query("observed_capfac > 1.0")
# The histogram is limited to capacity factors between 0 and 5.
inconsistent["observed_capfac"].hist(bins=100, range=(0,5))

In [None]:
# NOTE(review): sorting descending and then taking `tail(40)` shows the 40
# SMALLEST capacity factors above 1.0 (plus any NaNs, which sort last); use
# `head(40)` if the largest outliers were meant -- confirm intent.
inconsistent["observed_capfac"].sort_values(ascending=False).tail(40)

In [None]:
from pudl.metadata.classes import Package
pudl_pkg = Package.from_resource_ids()

In [None]:
core_eia923_resources = [res.name for res in pudl_pkg.resources if "core_eia923" in res.name and "coalmine" not in res.name]
plant_ids = [
    set(get_parquet(table, columns=["plant_id_eia"])["plant_id_eia"].unique())
    for table in core_eia923_resources
]
plant_ids = sorted(set.union(*plant_ids))
len(plant_ids)

In [None]:
ba_eia930: pd.DataFrame = get_parquet("out_eia930__hourly_operations")
subs_eia930: pd.DataFrame = get_parquet("out_eia930__hourly_subregion_demand")
pa_ferc714: pd.DataFrame = get_parquet("out_ferc714__hourly_planning_area_demand")

In [None]:
ba_eia930.info(show_counts=True)

In [None]:
def check_imputation(df):
    """Print summary counts for the demand imputation columns of ``df``.

    Reports how many rows have a non-null imputation code, how many reported
    demand values are NA or zero, and how many imputed demand values are NA or
    zero, then displays the number of rows per imputation code, sorted by code.

    Args:
        df: Frame with ``demand_reported_mwh``, ``demand_imputed_pudl_mwh``
            and ``demand_imputed_pudl_mwh_imputation_code`` columns.
    """
    reported = df.demand_reported_mwh
    imputed = df.demand_imputed_pudl_mwh
    codes = df.demand_imputed_pudl_mwh_imputation_code

    n_reported_missing = (reported.isna() | (reported == 0)).sum()
    n_imputed_missing = (imputed.isna() | (imputed == 0)).sum()
    n_flagged = codes.notna().sum()

    print(f"Flagged for imputation: {n_flagged}")
    print(f"Reported NA or 0: {n_reported_missing}")
    print(f"Imputed NA or 0: {n_imputed_missing}")
    display(codes.value_counts().sort_index())

In [None]:
check_imputation(pa_ferc714)

In [None]:
pa_ferc714[pa_ferc714.demand_reported_mwh.notna() & (pa_ferc714.demand_imputed_pudl_mwh.isna())].sample(10)

In [None]:
assert False

In [None]:
rate_base = get_table("out_ferc1__yearly_rate_base")
rate_base.info()

In [None]:
rate_base.sample(20)

In [None]:
assert False

In [None]:
ex21_own = get_table("core_sec10k__quarterly_exhibit_21_company_ownership")
ex21_own.info()

In [None]:
idx_cols = [
    "operator_id_phmsa",
    "report_year",
    "operating_state",
    "commodity",
]
dude = phmsa_ops.set_index(idx_cols).sort_index()

In [None]:
dude[dude.index.duplicated(keep=False)]

In [None]:
assert False

In [None]:
raw_filings = get_table("raw_sec10k__quarterly_filings")
raw_company_info = get_table("raw_sec10k__quarterly_company_information")
raw_ownership = get_table("raw_sec10k__exhibit_21_company_ownership")
raw_parsubs = get_table("raw_sec10k__parents_and_subsidiaries")

core_filings = get_table("core_sec10k__quarterly_filings")
out_filings = get_table("out_sec10k__quarterly_filings")

In [None]:
gpratk_avail_cap = get_table("out_gridpathratoolkit__hourly_available_capacity_factor")

In [None]:
gpratk_avail_cap.head(10)

In [None]:
out_filings["report_date"].value_counts().sort_index()

In [None]:
assert False

In [None]:
from pudl.metadata import PUDL_PACKAGE
from pudl.workspace.setup import PudlPaths
pudl_paths = PudlPaths()
md = PUDL_PACKAGE.to_sql()
pudl_engine = sa.create_engine(pudl_paths.sqlite_db_uri("pudl"))
# md.create_all(pudl_engine)

In [None]:
assert False

In [None]:
new_imputed = pd.read_parquet("s3://pudl.catalyst.coop/nightly/out_ferc714__hourly_planning_area_demand.parquet")
old_imputed = pd.read_parquet("s3://pudl.catalyst.coop/nightly/_out_ferc714__hourly_imputed_demand.parquet")

In [None]:
both_imputed = pd.merge(
    new_imputed.set_index(["respondent_id_ferc714", "datetime_utc"]),
    old_imputed.set_index(["respondent_id_ferc714", "datetime_utc"]),
    left_index=True,
    right_index=True,
    suffixes=("_new", "_old"),
    how="outer",
)
# Reset the index to make `respondent_id_ferc714` a data column
both_imputed = both_imputed.reset_index()
both_imputed.info()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Assign a discrete color to each `respondent_id_ferc714`
unique_ids = both_imputed["respondent_id_ferc714"].unique()
palette = sns.color_palette("tab20", len(unique_ids))
color_map = {rid: palette[i] for i, rid in enumerate(unique_ids)}
colors = both_imputed["respondent_id_ferc714"].map(color_map)

# Create the scatter plot
plt.figure(figsize=(12, 12))
plt.scatter(
    both_imputed["demand_mwh"],
    both_imputed["demand_imputed_pudl_mwh"],
    c=colors,
    s=0.1,
)

# Set both axes to logarithmic scale
plt.xscale("log")
plt.yscale("log")
plt.xlim(1e-1, 1e6)
plt.ylim(1e-1, 1e6)

# Add gridlines
plt.grid(True, which="both", linestyle="--", linewidth=0.5)

# Optionally add labels and a title
plt.xlabel("Old Imputed FERC-714 Planning Area Demand [MWh]")
plt.ylabel("New Imputed FERC-714 Planning Area Demand [MWh]")
plt.title("Log-Log Scatter Plot of Old vs New Imputed Demand")

plt.show()

In [None]:
assert False

In [None]:
out_sub_eia930 = get_table("out_eia930__hourly_subregion_demand")
core_sub_eia930 = get_table("core_eia930__hourly_subregion_demand")

In [None]:
core_sub_eia930

## subregion output EIA-930 notes
- There's no EIA imputation for the subregions, but for naming clarity purposes, do we want to rename `demand_imputed_mwh` so that it will be consistent with the names of the PUDL imputed columns that will exist in other tables?
- It would be nice if the table were sorted by BA Code, subregion code, and time to ensure contiguous time series.
- Looking at `CISO` I see that there are **more** NA values in the imputed column than the reported column. Is that expected? I would have thought we'd fill in the missing values.


In [None]:
out_sub_eia930

In [None]:
df = out_sub_eia930.set_index(["balancing_authority_code_eia", "balancing_authority_subregion_code_eia", "datetime_utc"]).sort_index()

In [None]:
dude = (
    out_sub_eia930
    .groupby(
        [
            "balancing_authority_code_eia",
            "balancing_authority_subregion_code_eia"
        ],
        observed=True
    )[
        [
            "demand_reported_mwh",
            "demand_imputed_mwh"
        ]
    ]
    .count()
    .assign(
        ratio=lambda x: x["demand_imputed_mwh"] / x["demand_reported_mwh"]
    )
)
printme = dude.to_markdown()
print(printme)

In [None]:
start_date = "2019-02-01"
end_date = "2019-02-14"
df.loc[("CISO", "PGAE", slice(start_date, end_date)), ["demand_reported_mwh", "demand_imputed_mwh"]].plot()

In [None]:
assert False

In [None]:
def core_sec10k__quarterly_company_information(
    raw_sec10k__quarterly_company_information: pd.DataFrame,
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Split the raw SEC 10-K company information into one wide table per block.

    The raw table is a long list of key/value facts, each tagged with the
    information block it came from. This cleans the fact names and filenames,
    then pivots each block so that every fact name becomes its own column.

    Args:
        raw_sec10k__quarterly_company_information: Long-format frame with the
            columns ``filename_sec10k``, ``filer_count``, ``report_date``,
            ``company_information_block``, ``company_information_block_count``,
            ``company_information_fact_name`` and
            ``company_information_fact_value``.

    Returns:
        A 5-tuple ``(business_address, company_data, filing_values,
        mail_address, former_company)``. Each frame is indexed by
        ``filename_sec10k``, ``filer_count``, ``report_date`` and
        ``company_information_block_count``. Business address columns carry a
        ``business_`` prefix and mail address columns a ``mail_`` prefix, and
        ``organization_name`` is dropped from the company data.

    Raises:
        KeyError: If the ``company_data`` block has no ``organization_name``
            fact to drop.
    """
    # Strip erroneous "]" characters from company information fact names (the keys in
    # the key-value pairs). This has to be done before the table is re-shaped.
    df = raw_sec10k__quarterly_company_information.assign(
        company_information_fact_name=lambda x: x.company_information_fact_name.str.lstrip("]"),
        filename_sec10k=lambda x: x.filename_sec10k.str.removeprefix("edgar/data/"),
    )

    def pivot_info_block(df: pd.DataFrame, block: str) -> pd.DataFrame:
        """Extract one block of company information and pivot its facts into columns."""
        pivoted = (
            df.loc[df.company_information_block == block]
            .pivot(
                values="company_information_fact_value",
                index=[
                    "filename_sec10k",
                    "filer_count",
                    "report_date",
                    "company_information_block_count",
                ],
                columns="company_information_fact_name",
            )
            .convert_dtypes()
        )
        # Drop the leftover "company_information_fact_name" label on the columns.
        pivoted.columns.name = None
        return pivoted

    # "business_phone" is renamed first so that it does not end up as
    # "business_business_phone" once the prefix is applied below.
    business_address = pivot_info_block(df, "business_address").rename(
        columns={"business_phone": "phone"}
    )
    company_data = pivot_info_block(df, "company_data").drop(columns=["organization_name"])
    filing_values = pivot_info_block(df, "filing_values")
    mail_address = pivot_info_block(df, "mail_address")
    former_company = pivot_info_block(df, "former_company")

    # Add prefixes where needed to ensure that column names are distinct after concatenation.
    business_address.columns = [f"business_{col}" for col in business_address.columns]
    mail_address.columns = [f"mail_{col}" for col in mail_address.columns]
    return business_address, company_data, filing_values, mail_address, former_company

In [None]:
core_info = get_table("core_sec10k__quarterly_company_information")
raw_info = get_table("raw_sec10k__quarterly_company_information")
#file_sec10k = get_table("core_sec10k__quarterly_filings")
#own_sec10k = get_table("core_sec10k__quarterly_exhibit_21_company_ownership")
#out_sec10k = get_table("out_sec10k__parents_and_subsidiaries")
#name_change_sec10k = get_table("core_sec10k__changelog_company_name")

In [None]:
business_address, company_data, filing_values, mail_address, former_company = core_sec10k__quarterly_company_information(raw_info)
company_info = pd.concat(
    [
        company_data,
        business_address,
        mail_address,
    ], axis="columns"
)

## Splitting Company Info

### Company data
- 3 of the 5 tables that can be pulled out of this data seem to pertain to companies -- static or slowly changing dimensions.
- This data comes from the `business_address`, `mail_address` and `company_data` sub-tables.
- If we concatenate these sets of columns together, what natural primary keys do we end up with?
- Note that the `report_date` is not an independent field -- each filename seems to imply a specific report date.
- The relationship between filename and report date seems to be naturally homed in the "master filer" table.
- There's a directly reported CIK in the company data, but also a "master filer" CIK that can be extracted from the filename.
- Having both the master filer and the company CIKs in the same table might be useful, and could indicate whether the company is a subsidiary.
- The filename is mostly for provenance -- where did this data come from.
- If the reported data fields (address, TIN, name, fiscal year, etc.) are truly identical, then we don't lose any information about the companies by deduplicating them.
- We can drop the provenance / index information (filename, company & file information block counts) and then drop duplicates, and have a unique collection of company information.
- In an ideal world this information would not include any CIK + date duplicates but...

### Company naming history
- There's a kind of "sub-table" that's associated with the company data, which describes the history of the company names.
- The whole history seems to be reported in each filing, so there's lots of duplication.
- Alone, this table only has the "former" names and the dates of changes away from those former names.
- Being able to merge in the report date, filename, current company name and CIK from the company table will make this table more legible / useful.
- With all of that information there will be a ton of duplication, but also probably lots of useful ways to deduplicate it in a subsequent step.
- Maybe this is a raw or intermediate table that gets handed off for subsequent processing.

### Filing data
- There's also some filing-level data that's associated with the company data.
- On cursory inspection, it seems like a lot of the duplicate values that show up when trying to create a natural primary key if we concatenate the filing data with the company data are actually due to the filing data.
- If we separate the filing data out we may get a granular filing history that's separate from the less variable company data?
- The same SEC filing number can show up many times because amendments and the filings they amend share the same filing number.
- 


In [None]:
# On what basis is this table unique?

# True!!!
# company_info.index.is_unique

# True!!!
# company_info.reset_index().set_index(["filename_sec10k", "filer_count", "company_information_block_count"]).index.is_unique

# False!!!
# company_info.reset_index().set_index(["filename_sec10k", "company_information_block_count"]).index.is_unique

# False!!!
# company_info.reset_index().set_index(["central_index_key", "report_date"]).index.is_unique

# False!!!
# company_info.reset_index().set_index(["filename_sec10k", "filer_count"]).index.is_unique

dude = company_info.reset_index().set_index(["central_index_key", "report_date"]).sort_index().reset_index()
mask = dude.duplicated(subset=["central_index_key", "report_date"], keep=False)
#dude.loc[mask, ~dude.columns.str.match(f"^(business_|mail_)")].sort_values(["central_index_key", "report_date"]).head(50)
dude.info()

In [None]:
dude.drop(columns=["filename_sec10k", "filer_count", "company_information_block_count"]).drop_duplicates().info()

In [None]:
deduplicated = (
    dude.drop(columns=["filename_sec10k", "filer_count", "company_information_block_count"])
    .drop_duplicates()
    .set_index(["central_index_key", "report_date"])
    .sort_index()
    .reset_index()
)
dupes = deduplicated[deduplicated.duplicated(subset=["central_index_key", "report_date"], keep=False)]

In [None]:
dupes.set_index(["central_index_key", "report_date"]).head(500).tail(50)

In [None]:
business_address.info()

In [None]:
company_data.info()

In [None]:
filing_values.info()

In [None]:
mail_address.info()

In [None]:
# Build a de-duplicated history of company name changes from the
# `former_company` block, keeping the most recently reported copy of each
# (CIK, change date, former name) combination.
dude = (
    former_company.reset_index()
    .assign(
        # `filename_sec10k` has already had its "edgar/data/" prefix removed by
        # core_sec10k__quarterly_company_information(), so the CIK is the FIRST
        # path component, not the third. Stripping the prefix again is a no-op
        # in that case and keeps this working on raw filenames as well.
        central_index_key=lambda x: (
            x.filename_sec10k.str.removeprefix("edgar/data/")
            .str.split("/")
            .str[0]
            .str.zfill(10)
        ),
        name_change_date=lambda x: pd.to_datetime(x.date_of_name_change, format="%Y%m%d")
    )
    # Sort by report date so that keep="last" retains the latest filing.
    .sort_values(["central_index_key", "report_date"])
    .drop(columns=["date_of_name_change"])
    .drop_duplicates(subset=[
        "central_index_key",
        "name_change_date",
        "former_conformed_name",
    ], keep="last")
    .sort_values(["central_index_key", "name_change_date"])
)
# info() prints its summary and returns None, so it is called directly rather
# than wrapped in display(), which would add a stray "None" to the output.
dude.info()
dude.head(50)[
    [
        "central_index_key",
        "name_change_date",
        "former_conformed_name",
        "report_date",
        "filename_sec10k",
        "filer_count",
        "company_information_block_count",
    ]
]

In [None]:
raw_info[(raw_info.filer_count != raw_info.company_information_block_count)]

In [None]:
raw_info[raw_info.filename_sec10k == "edgar/data/105839/0000003673-94-000037.txt"]

In [None]:
assert False

## SEC 10-K Quarterly Filings

The structure of this table generally seems good.

Some field-level comments:

* It would be helpful if the column descriptions for `filing_date` and `report_date` indicated the daily vs. quarterly frequencies or other different meanings that apply in the context of this resource (or maybe the whole SEC 10-K dataset?).
* It seems as if `report_date` can be derived from `filing_date` -- that it's the first day of the quarter in which `filing_date` falls. If that's always true then technically we might not want it in the well normalized `core` tables, and could generate it in the output tables as needed. Is there a good reason to keep it? E.g. sometimes filings are late and pertain to a quarter other than the one it seems like they would?
* It seems as if the values in `central_index_key` need to be padded out to 10 characters with leading zeros in some contexts, e.g. [in this URL](https://data.sec.gov/submissions/CIK0000032377.json) and in the filenames that are listed in the `filename_sec10k` column. Are we stripping leading zeroes in the processing? Or are they coming from the structured SEC data that way? Interestingly any number of leading zeroes is accepted [in the Edgar URLs](https://www.sec.gov/edgar/browse/?CIK=00000000000000000000000000000000000000000000000000000000000000032377). Also interestingly, the field type is already `string`. The CIK values that show up in the company info table as values are also padded to 10 characters with leading zeroes.
* `exhibit_21_version` seems to have a clean format, should we enforce it with a regex?
* Is the `filename_sec10k` value something that's specific to our document store? Or does it reference something out in SEC-land?

In [None]:
file_sec10k.info()

In [None]:
file_sec10k.sample(20)

## SEC 10-K Company Info

* At first glance, it looks like there's a lot of normalization that could be done to this table that would clarify the relationships between the different values & entities.
* The values under `company_information_fact_name` seem like they should be ~20 distinct columns, with the potential for lots of well defined data types for what would then be homogenous values coming out of `company_information_fact_value`. A handful of values appear malformed and could/should be cleaned up (`lstrip("]")`)
* Normalizing this table looks like it would drop the row count from ~8M to ~400K.
* The fact that there are multiple blocks of the same kind of data in some cases seems to suggest that there are other embedded tables that could/should be stripped out and given a relationship to this table.
* The primary key that's defined for this column is basically just all of the columns, which is a bit of a red flag.
* It looks like there are at least 2 layers of normalized tables hiding in here?
* Maybe:
  * A `company` or SEC 10K filer table, with columns drawn from the `company_data`, `business_address` and `mail_address` blocks where the PK might be CIK + report date?
  * A `filing` table drawn from the `filing_values` block that has a FK relationship with the `company` table.
  * A company name changelog table drawn from the `former_company` block that also has a FK relationship with the `company` table.
* I fiddled around with unstacking a bit (and maybe you did too) and it wasn't super simple because of the multiple nested tables, but the information in here seems like it's some of the valuable raw materials that are going to feed into the rest of the process -- e.g. the time evolution of company names and other values that we're going to try and do entity matching on. If this data was never normalized, how are we currently going about extracting structured information from it for use in the document modeling / record linkage now?
* Do we understand why there are multiple companies / filings contained in some files? What's the nature of the associations between the companies that show up in a single file? Does it provide any information about ownership? In the cases where there's more than one company associated with a given file, it seems like sometimes the CIK for each of those companies is different from the CIK referenced in the filename, which seems to correspond to the "master filer" referenced in the `core_sec10k__quarterly_filings` table.

In [None]:
info_sec10k.info()

In [None]:
df: pd.DataFrame = info_sec10k[
    info_sec10k["company_information_fact_name"] == "central_index_key"
]
df.duplicated(subset=["company_information_fact_value", "report_date", "filer_count"]).sum()

In [None]:
info_sec10k.filer_count.value_counts()

In [None]:
info_sec10k["company_information_fact_name"].value_counts()

In [None]:
info_sec10k.head(50)

In [None]:
dude = info_sec10k.set_index(
    [
        "report_date",
        "filename_sec10k",
        "filer_count",
        "company_information_block",
        "company_information_block_count",
        "company_information_fact_name"
    ]
).unstack(level="company_information_fact_name")

In [None]:
dude.head(50)

In [None]:
info_sec10k[info_sec10k.filename_sec10k == "edgar/data/105839/0000003673-94-000037.txt"]

In [None]:
own_sec10k.info()

## SEC 10-K Ownership Information

* We know what file each row came from, but in many cases there are multiple companies referenced in individual 10-K files. How do we decide which one of those companies should be considered the parent? I.e. who owns the `fraction_owned`?
* There seems to be a CIK embedded in the filename. Is that important for linking this data back to the company info table? I guess we can also go indirectly to CIK via the filename listed here and in the filings table.

In [None]:
own_sec10k.sample(10)

## SEC 10-K Parent / Subsidiary Table

* This is a denormalized association table that relates parents to subsidiaries.
* It should probably correspond to a very skinny normalized association table that describes just the associations using IDs, without any of the data columns, but which can be used as the backbone for merging in data from one or more other tables that contain the specific data about the parent and subsidiary companies.
* How many different kinds of companies are there, and how do they relate to each other? Can/Should the parent and subsidiary companies be described using the same data structure?
* Every column in this table should be named such that it is clear whether the attribute is associated with the parent or the subsidiary company, since there are two sets of basically similar information in the table side-by-side, e.g. with a `parent_` and `subsidiary_` prefix.
* Based on the other tables it seems like there might be as many as 3 different kinds of companies we're dealing with, or about which we might have different kinds and amounts of information.
  * The (master filer?) company listed in the `core_sec10k__quarterly_filings` table.
  * The individual companies listed in un-normalized sub-tables of `core_sec10k__quarterly_company_information`.
  * The subsidiary companies in `core_sec10k__quarterly_exhibit_21_company_ownership` about which we may only have a name, or at best a name, location, and fraction of that company that's owned by some parent.
* Maybe the master filers / individual companies can be described using the same structure because that information is ultimately available for all SEC 10-K filers?
* Any company that we're pulling from the filing or company info tables will have a CIK associated with it so we can probably use that to look up (time varying) information about the company somehow.
* The raw company info that we're pulling from Exhibit 21 initially won't be guaranteed to have anything but a name (and a filename / report date), but then we use that to do some entity matching, and create an association table between the Exhibit 21 companies and other SEC 10-K filers.
* There should be an Exhibit 21 company table that has its own primary key that's hopefully eventually associated with one of the real SEC 10-K filer records.
* Because ownership can change over time, with subsidiaries being sold off to other companies, it seems like we have to maintain a sense of time variability in these associations. Otherwise the data won't be useful for answering questions like "In 2020Q2 when this dodgy transaction took place between companies A and B, were they owned by the same parent company?"
* What does it mean if in a given year a subsidiary shows up in Exhibit 21 for a parent company, and in the subsequent year it does **not** show up in Exhibit 21 for that same parent company? It seems like that should imply that it's no longer owned by the parent. Is there a reason why we wouldn't be able to say that?
* Once the entity matching has been done within the SEC 10-K data, connecting the sparse information we have about many subsidiary companies back to fuller information about the SEC 10-K filers, then we have a table of company information we can try to match against the EIA utility data.

### Company Tables

* SEC 10-K filers (lots of columns, PK is probably CIK?)
* Exhibit 21 Subsidiaries (very few columns, PK is probably messy/name-based, or an auto-incrementing pseudo-key)
* EIA Utilities (lots of columns from PUDL)
* Normalized (skinny) association table that links SEC 10-K filers and Exhibit 21 Subsidiaries, based on record linkage.
* Denormalized table that merges in all relevant company-level attributes from both SEC 10-K filers and Exhibit 21 Subsidiaries tables
* Many companies will show up only as one or the other, some as both, so this denormalized table would have a lot of null fields.
* Right now it seems like `company_id_sec10k` is the column that represents the primary key of this merged company table, but it's a messy mix of CIKs and concatenated values from the Exhibit 21 table, which can lead to confusion (we were bitten by a similar pattern in the EIA utility and/or BA IDs in the FERC-714 data).
* This denormalized SEC 10-K table seems like the thing that would have been used in record linkage against the EIA utilities, since it has the most SEC-based company information? In which case there would also potentially be a `utility_id_eia` column for use in connecting the SEC 10-K companies to the EIA utilities.

### SEC 10-K Filer (entity) Table

* `central_index_key`
* Are there any other truly permanent company fields linked to the CIK?

### SEC 10-K Filer (time-varying) company info table

* `central_index_key` (PK)
* `report_date` (PK)
* Any other time-varying, per-company fields that fall out of the normalization of `core_sec10k__quarterly_company_information`

### Exhibit 21 Company (entity) Table

Notionally this would contain any permanent attributes of the companies observed as subsidiaries in Exhibit 21. Since that table is almost unstructured, this one is probably vestigial and doesn't need to exist.

* `company_id_ex21`
* ???

### Exhibit 21 (time-varying) Company Table

All the unique **company** information that's been extracted from Exhibit 21 attachments. Note that this doesn't include the ownership fraction, which ends up in a table that is about the data that's unique to the parent-subsidiary relationship.

* `company_id_ex21` (PK, auto-incrementing integer pseudo-key?)
* `report_date` (data)
* `parent_central_index_key` (data, extracted from `filename_sec10k`?)
* `subsidiary_company_name` (data)
* `subsidiary_company_location` (data)
* `filename_sec10k` (data)

### SEC 10-K Company (Filer and/or Subsidiary) Association Table

Links companies that have been observed as both SEC 10-K filers and Exhibit 21 subsidiaries, based on statistical record linkage. There's no explicit time variability in this table, but the fact that some companies only show up in some years means there's an implicit time dependence, which will show up in other tables with `report_date` columns.

* `company_id_sec10k` (PK)
* `company_id_ex21` (PK)
* `central_index_key` (PK)

### Parent-subsidiary association table

A normalized table that contains just the information that is unique to the association between a parent and subsidiary company pair. This can be used to merge together other pieces of information about the parent and/or subsidiary to create a useful denormalized output table. Each row says "On this date, this parent owned this fraction of this subsidiary."

* `report_date` (PK)
* `parent_company_id_sec10k` (PK)
* `subsidiary_company_id_sec10k` (PK)
* `fraction_owned` (data)

### SEC 10-K Company Output Table

A wide, human-readable denormalized table that merges together all of the relevant information about a company that we have extracted from the SEC 10-K filings, whether that information comes from the core SEC 10-K filings, Exhibit 21 attachments, or EIA Utilities data we have in PUDL.

* `company_id_sec10k` (PK)
* `report_date` (PK)
* `central_index_key` (data)
* `company_id_ex21` (data)
* `utility_id_eia` (data)
* Other data columns to be selected.

In [None]:
# Summarize `out_sec10k`, which must already be defined by an earlier cell.
# NOTE(review): `.info()` presumes a pandas DataFrame — confirm the type.
out_sec10k.info()

In [None]:
# Display a sample of 10 rows from `out_sec10k` for a quick visual check.
# NOTE(review): presumably a random sample from a DataFrame — confirm the type.
out_sec10k.sample(10)

In [None]:
# Fails unconditionally, so executing this cell always raises AssertionError.
# NOTE(review): presumably a deliberate stop so that "Run All" does not
# continue into the cells below — confirm before removing.
assert False

In [None]:
# Load four EIA-930 hourly tables by name with `get_parquet`, the alias for
# `pudl.helpers.get_parquet_table` imported in the setup cells.
# Each table is bound to its own name as a separate, independent assignment.
interchange = get_parquet("core_eia930__hourly_interchange")
gen_by_source = get_parquet("core_eia930__hourly_net_generation_by_energy_source")
operations = get_parquet("core_eia930__hourly_operations")
demand = get_parquet("core_eia930__hourly_subregion_demand")

In [None]:
# Spot-check imputed demand: keep only rows where `demand_imputed_mwh` is
# non-null, keep only the columns whose name contains "demand", then show 30
# randomly sampled rows. Assumes `operations` is a pandas DataFrame.
(
    operations.loc[operations["demand_imputed_mwh"].notna()]
    .filter(like="demand")
    .sample(30)
)

In [None]:
# Fraction of non-null values in each column whose name contains "demand":
# one minus the per-column mean of the null mask (null count / row count).
# Assumes `operations` is a pandas DataFrame.
1 - operations.filter(like="demand").isna().mean()

In [None]:
# Load the SEC 10-K parents-and-subsidiaries output table into `sec10k`.
# NOTE(review): `get_table` is not one of the data-access helpers defined in
# the setup cell (get_asset / get_parquet / get_pandas / get_polars /
# get_pyarrow). Confirm it is defined elsewhere in the notebook, or switch to
# one of those helpers; otherwise this cell raises NameError.
sec10k = get_table("out_sec10k__parents_and_subsidiaries")

In [None]:
# Summarize `sec10k`, which must already be defined by an earlier cell.
# NOTE(review): `.info()` presumes a pandas DataFrame — confirm the type.
sec10k.info()