# Introduction

In [39]:
def s3(table):
    """Build the S3 URL of a table's Parquet file.

    Args:
        table: Table name, used as the file stem.

    Returns:
        The ``s3://`` URL of ``<table>.parquet`` under the ``nightly/`` prefix
        of the ``pudl.catalyst.coop`` bucket.
    """
    nightly_prefix = "s3://pudl.catalyst.coop/nightly"
    return f"{nightly_prefix}/{table}.parquet"

# 1. Warm-up: Find all historical names for a company

In [4]:
import pandas as pd

In [14]:
# Load the company-name changelog, restricted to a single filer.
valaris_names = pd.read_parquet(
    s3("out_sec10k__changelog_company_name"),
    engine="pyarrow",
    dtype_backend="pyarrow",
    # Row filter handed to the reader: keep only this central_index_key.
    # The key is compared as a zero-padded string, not an integer.
    filters=[("central_index_key","=","0000314808")],
    # Only the columns needed to trace the sequence of name changes.
    columns=[
        "central_index_key",
        "name_change_date",
        "company_name_old",
        "company_name_new",
    ],
)
valaris_names

Unnamed: 0,central_index_key,name_change_date,company_name_old,company_name_new
0,314808,1987-10-15,blocker energy corp,energy service company inc
1,314808,1992-07-03,energy service company inc,ensco international inc
2,314808,1995-05-26,ensco international inc,ensco international plc
3,314808,2009-12-23,ensco international plc,ensco rowan plc
4,314808,2019-04-10,ensco rowan plc,valaris plc
5,314808,2019-08-01,valaris plc,valaris ltd


In [37]:
# Collapse every old/new name recorded for the company into one
# |-delimited string per central_index_key and print the result as CSV.
print(
    valaris_names.drop(columns="name_change_date")
    .set_index("central_index_key")
    # put all the names in a single column; stack() adds an unnamed inner
    # index level holding the source column label
    # (company_name_old / company_name_new)
    .stack()
    # discard that inner level so only central_index_key indexes the names
    # (Series.drop(index="level_1") would look for a row *label* called
    # "level_1", which does not exist, rather than removing the level)
    .droplevel(-1)
    .groupby(level="central_index_key")
    # join unique names together, |-delimited
    .agg(lambda x: "|".join(sorted(set(x))))
    # format for output; the aggregated Series is unnamed, so reset_index()
    # labels its values column 0
    .reset_index()
    .rename(columns={0:"names"})
    .to_csv(index=False)
)

central_index_key,names
0000314808,blocker energy corp|energy service company inc|ensco international inc|ensco international plc|ensco rowan plc|valaris ltd|valaris plc



# 2. Leverage Industry Codes

## Use SIC codes to evaluate the coverage of record linkages

In [38]:
import polars as pl

In [67]:
# Read the full quarterly company information table into a polars DataFrame.
company_quarters = pl.read_parquet(
    s3("out_sec10k__quarterly_company_information"),
    # NOTE(review): skip_signature presumably requests unsigned (anonymous)
    # access to the bucket -- confirm against the polars cloud storage docs.
    storage_options=dict(skip_signature="true", region="us-west-2"),
)

#### What industries have the highest representation among record linkages?

For how many quarterly records do we have a link between the SEC filer and an EIA utility ID?

In [68]:
# Keep only the quarterly records that carry an EIA utility ID.
linked_company_quarters = company_quarters.filter(
    pl.col("utility_id_eia").is_not_null()
)
# Number of linked quarterly records.
len(linked_company_quarters)

15178

How many unique companies are found within those records?

In [69]:
# One row per distinct (central_index_key, utility_id_eia) combination.
linked_companies = linked_company_quarters.unique(
    subset=["central_index_key", "utility_id_eia"]
)
len(linked_companies)

529

How many records do we retain if we require a valid industry code?

In [70]:
# Count the linked quarterly records that also have a non-null SIC code.
len(
    linked_company_quarters.filter(
        pl.col("industry_id_sic").is_not_null()
    )
)

15133

How many unique companies do we retain if we require a valid industry code?

In [71]:
# Same de-duplication as for linked_companies, but only over records that
# have a non-null SIC code.
linked_companies_with_sic = linked_company_quarters.filter(
    pl.col("industry_id_sic").is_not_null()
).unique(subset=["central_index_key", "utility_id_eia"])
len(linked_companies_with_sic)

526

Do companies sometimes change their industry code across filings?

I.e., is the number of unique (company, industry) pairs greater than the number of unique companies?

In [72]:
# Count distinct (central_index_key, industry_id_sic) combinations.
# NOTE(review): unlike the neighbouring cells, records with a null
# industry_id_sic are not filtered out here, so a null code presumably
# counts as its own combination -- confirm this is intended.
len(
    linked_company_quarters.unique(
        subset=["central_index_key", "industry_id_sic"]
    )
)

632

Among quarterly records with a link between the filer and an EIA utility company, as well as a valid SIC code, what industries are most commonly seen?

In [73]:
# Per-industry tally of linked quarterly records with a non-null SIC code,
# plus each industry's share of the total, largest share first.
quarterly_links_per_sic = (
    linked_company_quarters
    .filter(pl.col("industry_id_sic").is_not_null())
    .select("industry_id_sic", "industry_name_sic")
    .group_by("industry_id_sic", "industry_name_sic")
    # number of records in each (code, name) group
    .agg(pl.len().alias("links_with_sic"))
    .with_columns(
        # group count divided by the sum of all group counts
        (pl.col("links_with_sic") / pl.col("links_with_sic").sum())
        .alias("fraction_with_sic")
    )
    .sort("fraction_with_sic", descending=True)
)
quarterly_links_per_sic.head(10)

industry_id_sic,industry_name_sic,links_with_sic,fraction_with_sic
str,str,u32,f64
"""4911""","""electric services""",6903,0.456155
"""4931""","""electric & other services comb…",2105,0.1391
"""6798""","""real estate investment trusts""",413,0.027291
"""6189""","""asset-backed securities""",376,0.024846
"""1311""","""crude petroleum & natural gas""",229,0.015132
"""2621""","""paper mills""",185,0.012225
"""2834""","""pharmaceutical preparations""",177,0.011696
"""4991""","""cogeneration services & small …",136,0.008987
"""2631""","""paperboard mills""",124,0.008194
"""4922""","""natural gas transmission""",124,0.008194


Does the distribution over industries change significantly if we only count unique (company, industry) pairs?

In [75]:
# Same per-industry tally as the quarterly version, but each
# (central_index_key, industry_id_sic) combination is counted only once.
unique_links_per_sic = (
    linked_company_quarters
    .filter(pl.col("industry_id_sic").is_not_null())
    # drop multiple instances of the same company
    .unique(subset=["central_index_key", "industry_id_sic"])
    .select("industry_id_sic", "industry_name_sic")
    .group_by("industry_id_sic", "industry_name_sic")
    # number of distinct companies in each (code, name) group
    .agg(pl.len().alias("links_with_sic"))
    .with_columns(
        # group count divided by the sum of all group counts
        (pl.col("links_with_sic") / pl.col("links_with_sic").sum())
        .alias("fraction_with_sic")
    )
    .sort("fraction_with_sic", descending=True)
)
unique_links_per_sic.head(10)

industry_id_sic,industry_name_sic,links_with_sic,fraction_with_sic
str,str,u32,f64
"""4911""","""electric services""",124,0.197452
"""4931""","""electric & other services comb…",49,0.078025
"""6798""","""real estate investment trusts""",19,0.030255
"""6189""","""asset-backed securities""",17,0.02707
"""2621""","""paper mills""",15,0.023885
"""1311""","""crude petroleum & natural gas""",14,0.022293
"""6770""","""blank checks""",11,0.017516
"""2834""","""pharmaceutical preparations""",11,0.017516
"""2631""","""paperboard mills""",10,0.015924
"""2911""","""petroleum refining""",10,0.015924


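lol, "blank checks" -- but otherwise the ranking of the top industries is pretty close to the one over all quarterly links, although electric services make up a much smaller share here (~20% vs. ~46%).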

#### Within industries we most associate with electric utilities, what percent of SEC filers have links to an EIA utility ID?

## SIC codes of interest

## Select a meaningful subset of respondents

### Electricity

### Natural Gas

### Fuel

### Links between companies in different industries

# 3. Leverage Subsidiary Relationships

## Find all historical subsidiaries of a company

## Working with multiple layers of subsidiary nesting