In [None]:
# Load IPython's autoreload extension and switch it on in mode 3.
%load_ext autoreload
%autoreload 3

In [None]:
# Install matplotx (imported below for its plot style) with the %pip magic.
# NOTE(review): the version is not pinned — consider pinning for reproducibility.
%pip install matplotx

In [None]:
# Install polars (imported below as ``pl``) with the %pip magic.
# NOTE(review): the version is not pinned — consider pinning for reproducibility.
%pip install polars

In [None]:
import matplotlib
import matplotlib.pyplot as plt
import matplotx
import polars as pl
from pathlib import Path
import os

In [None]:
# Select matplotlib's inline backend so figures render in the cell output.
%matplotlib inline

In [None]:
# Figure defaults first, then the matplotx "onedark" style on top of them.
matplotlib.rcParams.update({"figure.figsize": (10, 6), "figure.dpi": 150})
matplotlib.style.use(matplotx.styles.onedark)

# polars table display limits: rows, columns and string length.
(
    pl.Config
    .set_tbl_rows(500)
    .set_tbl_cols(100)
    .set_fmt_str_lengths(100)
)

In [None]:
def get_polars(table: str, release: str = "v2025.9.1") -> pl.DataFrame:
    """Read one table from the ``pudl.catalyst.coop`` S3 bucket into a polars frame.

    Args:
        table: Table name, i.e. the parquet file name without the ``.parquet``
            extension.
        release: Release folder inside the bucket. Defaults to ``"v2025.9.1"``,
            the release this notebook was written against.

    Returns:
        The table's contents as returned by ``pl.read_parquet``.
    """
    url = f"https://s3.us-west-2.amazonaws.com/pudl.catalyst.coop/{release}/{table}.parquet"
    return pl.read_parquet(url)

In [None]:
# Fetch every table used below; targets and table names are listed in the same order.
(
    companies,
    filings,
    parsubs,
    name_changes,
    ex21,
    eia_utilities,
) = (
    get_polars(table_name)
    for table_name in (
        "out_sec10k__quarterly_company_information",
        "out_sec10k__quarterly_filings",
        "out_sec10k__parents_and_subsidiaries",
        "out_sec10k__changelog_company_name",
        "core_sec10k__quarterly_exhibit_21_company_ownership",
        "core_eia__entity_utilities",
    )
)

In [None]:
def _sic_names(role: str, name_alias: str) -> pl.DataFrame:
    """Return the distinct (SIC id, SIC name) pairs on one side of ``parsubs``.

    Args:
        role: Column prefix selecting the side, ``"parent"`` or ``"subsidiary"``.
        name_alias: Name given to the SIC-name column, so that both sides can be
            joined on ``industry_id_sic`` without a column clash.
    """
    id_col = f"{role}_company_industry_id_sic"
    name_col = f"{role}_company_industry_name_sic"
    return (
        parsubs
        .select([id_col, name_col])
        .unique()
        .rename({id_col: "industry_id_sic", name_col: name_alias})
    )


# SIC id -> SIC name lookup built from both the parent and the subsidiary columns.
# Only the final sort is kept: ``unique()`` does not preserve row order by default,
# so sorting any earlier in the chain has no effect on the result.
sic_lookup = (
    _sic_names("parent", "p_industry_name_sic")
    .join(
        _sic_names("subsidiary", "s_industry_name_sic"),
        on="industry_id_sic",
        how="full",
        coalesce=True,
    )
    # Keep rows with fewer than two nulls across (id, parent name, subsidiary name).
    .filter(pl.sum_horizontal(pl.all().is_null()) < 2)
    # Prefer the subsidiary-side name and fall back to the parent-side name.
    .with_columns(
        industry_name_sic=pl.col("s_industry_name_sic").fill_null(pl.col("p_industry_name_sic"))
    )
    .drop(["p_industry_name_sic", "s_industry_name_sic"])
    .unique()
    .sort("industry_id_sic")
)

In [None]:
# Distinct SIC id/name pairs reported in ``companies``.
# De-duplicate first and sort last: ``unique()`` does not preserve row order by
# default, so a sort placed before it does not survive into the displayed frame.
(
    companies
    .select(["industry_id_sic", "industry_name_sic"])
    .unique()
    .sort("industry_id_sic")
)

# Matches between EIA utility ids and SEC 10-K energy companies are sparse

## EIA utilities linked to an SEC filer

In [None]:
# Number of distinct central index keys recorded against each EIA utility id.
_ciks_per_utility = (
    companies
    .filter(pl.col("utility_id_eia").is_not_null())
    .select(["utility_id_eia", "central_index_key"])
    .unique()
    .group_by("utility_id_eia")
    .agg(pl.len().alias("count"))
)

# Left join keeps every EIA utility; ``count`` stays null where nothing matched.
with_cik = eia_utilities.join(_ciks_per_utility, on="utility_id_eia", how="left")

In [None]:
# Number of distinct subsidiary company names recorded against each EIA utility id.
_subsidiaries_per_utility = (
    parsubs
    .filter(pl.col("subsidiary_company_utility_id_eia").is_not_null())
    .select(["subsidiary_company_utility_id_eia", "subsidiary_company_name"])
    .unique()
    .group_by("subsidiary_company_utility_id_eia")
    .agg(pl.len().alias("count"))
)

# Left join keeps every EIA utility; ``count`` stays null where nothing matched.
with_subs = eia_utilities.join(
    _subsidiaries_per_utility,
    left_on="utility_id_eia",
    right_on="subsidiary_company_utility_id_eia",
    how="left",
)

In [None]:
# One row per EIA utility with its filer count, its subsidiary count and flags
# saying which kinds of link exist.
#
# The join key is ``utility_id_eia`` alone. polars does not match null join keys
# by default, so also keying on ``utility_name_eia`` would drop the subsidiary
# count of any utility whose name is null. Only the count column is brought over,
# so no other ``with_subs`` column is duplicated into the result.
# NOTE(review): assumes ``utility_id_eia`` identifies a single row of
# ``eia_utilities`` — confirm against the table's primary key.
eia_stats = (
    with_cik
    .rename({"count": "count_cik"})
    .join(
        with_subs.select(
            "utility_id_eia",
            pl.col("count").alias("count_subsidiary"),
        ),
        on="utility_id_eia",
        how="left",
    )
    .with_columns(
        has_cik=pl.col("count_cik").is_not_null(),
        has_subs=pl.col("count_subsidiary").is_not_null(),
        either=pl.col("count_cik").is_not_null() | pl.col("count_subsidiary").is_not_null(),
        # Constant 1 per row, so summing ``all`` gives the number of rows.
        all=1,
    )
)

In [None]:
# Utilities that have both a filer count and a subsidiary count.
eia_stats.filter(pl.col("has_cik"), pl.col("has_subs"))

In [None]:
# How many utilities have a subsidiary count but no filer count.
eia_stats.filter(pl.col("has_subs"), pl.col("has_cik").not_()).select(["has_subs"]).sum()

In [None]:
# Column totals of the link flags; ``all`` sums to the number of rows.
eia_stats.select(["either", "all", "has_cik", "has_subs"]).sum()

In [None]:
# The same totals divided by the sum of the ``all`` column.
eia_stats.select(["either", "all", "has_cik", "has_subs"]).sum() / eia_stats.get_column("all").sum()

## SEC filers linked to an EIA utility

In [None]:
# For each SIC: the number of ``companies`` rows that carry an EIA utility id, and
# that number as a share of all such rows. Largest share first.
links_with_sic = (
    companies
    .filter(
        pl.col("utility_id_eia").is_not_null(),
        pl.col("industry_id_sic").is_not_null(),
    )
    .select("industry_id_sic")
    .group_by("industry_id_sic")
    .agg(pl.len().alias("links_total"))
    .with_columns(
        (pl.col("links_total") / pl.col("links_total").sum()).alias("fraction_with_sic")
    )
    .sort(by="fraction_with_sic", descending=True)
)
links_with_sic.head(10)

In [None]:
# For each SIC: the share of its ``companies`` rows that carry an EIA utility id,
# and the total number of rows for that SIC. Highest share first.
sics_with_link = (
    companies
    .group_by("industry_id_sic")
    .agg(
        pl.col("utility_id_eia").is_not_null().mean().alias("fraction_with_utility_id"),
        pl.len().alias("sic_total"),
    )
    .sort(by="fraction_with_utility_id", descending=True)
)
sics_with_link.head(10)

In [None]:
# Combine both per-SIC summaries (inner join), then attach the SIC name where the
# lookup has one (left join).
sic_stats = (
    links_with_sic
    .join(sics_with_link, on="industry_id_sic", how="inner")
    .join(sic_lookup, on="industry_id_sic", how="left")
)
sic_stats

### Top 10 SICs by frequency within EIA-matched SEC filers

In [None]:
def make_table(df, n=10):
    """Print the first ``n`` rows of ``df`` as a reStructuredText ``list-table``.

    Args:
        df: Frame providing the columns ``industry_id_sic``,
            ``industry_name_sic``, ``sic_total``, ``links_total``,
            ``fraction_with_utility_id`` and ``fraction_with_sic``. Rows are
            printed in the order given, so sort before calling.
        n: Number of leading rows to print. Defaults to 10.

    Returns:
        None. The table is written to standard output.
    """

    def _percent(fraction):
        # Two significant figures. Re-parsing the rounded text and formatting it
        # with plain "g" stops values that round to 100 printing as "1e+02".
        rounded = float(f"{fraction * 100:.2g}")
        return f"{rounded:g}%"

    print("""
.. list-table::
   :header-rows: 1
   :widths: auto

   * - Standard Industrial Code (SIC)
     - All filings reporting this SIC
     - Matches reporting this SIC
     - Percent of all filings using this SIC
     - Percent of all matches""")

    columns = [
        "industry_id_sic",
        "industry_name_sic",
        "sic_total",
        "links_total",
        "fraction_with_utility_id",
        "fraction_with_sic",
    ]
    for rec in df.head(n).select(columns).to_dicts():
        # Build the label from whichever of code/name is present, so a SIC with
        # no name prints as its bare code instead of "None".
        label_parts = (rec["industry_id_sic"], rec["industry_name_sic"])
        sic = " ".join(str(part) for part in label_parts if part is not None)
        # Fields are read by key rather than by position in the record.
        total = rec["sic_total"]
        links = rec["links_total"]
        pct_of_filings = _percent(rec["fraction_with_utility_id"])
        pct_of_matches = _percent(rec["fraction_with_sic"])
        print(f"""   * - {sic}
     - {total}
     - {links}
     - {pct_of_filings}
     - {pct_of_matches}""")
    

In [None]:
# Order by ``fraction_with_sic``, largest first, then print the leading rows.
make_table(sic_stats.sort(by="fraction_with_sic", descending=True))

### Top 10 SICs by match rate

In [None]:
# Order by ``fraction_with_utility_id``, largest first, then print the leading rows.
make_table(sic_stats.sort(by="fraction_with_utility_id", descending=True))