In [None]:
import pandas as pd
import sqlalchemy as sa
import pudl

import recordlinkage as rl
from recordlinkage.compare import Exact, String 

from pudl import logging_helpers
# Notebook-wide logger obtained from PUDL's logging helpers; used by the
# extraction cell to report which table is being read.
logger = pudl.logging_helpers.get_logger(__name__)

# CorpWatch SEC EX-21 download
I downloaded the CSVs from this tarball:
https://archive.org/download/corpwatch_api_data_dumps/corpwatch_api_tables_csv.tar.gz

I then extracted the tarball into a `corpwatch_api_tables_csv` subdirectory of the directory that contains *this* notebook.

# SEC EX-21 Extract
I downloaded the CSVs from this tarball:
https://archive.org/download/corpwatch_api_data_dumps/corpwatch_api_tables_csv.tar.gz

The extracted files live in a `corpwatch_api_tables_csv` subdirectory of the directory that contains *this* notebook.

In [None]:
# Folder (relative to this notebook) holding the extracted CorpWatch files.
directory = "corpwatch_api_tables_csv"
# Tables to grab; each one is read from "<directory>/<table>.csv".
tables_sec21 = [
    "companies",
    "company_relations",
    "company_info",
    "sic_codes",
    "company_locations",
]

# Each table is bound to a notebook-level variable named after it (for example
# ``company_info``), because later cells refer to the frames by those names.
# Despite the .csv extension, the files are read as tab-separated.
for table in tables_sec21:
    logger.info(f"{table}: extracting CSV")
    vars()[table] = (
        pd.read_csv(f"{directory}/{table}.csv", sep="\t")
        .convert_dtypes()
    )

# Input Prep

In [None]:
# Key columns for the EIA utility records.
# NOTE(review): not referenced in the visible cells -- confirm intended use.
idx_eia = [
    "utility_id_eia",
    "report_date",
]
# Columns prep_companies_sec uses to look for duplicated SEC location records.
idx_sec = [
    "cw_id",
    "year",
    "type",
]
# Column names that the SEC frame is renamed onto so that both datasets can be
# compared column-by-column.
shared_cols = [
    "year",
    "utility_name_eia",
    "city",
    "street_address",
    "address_2",
    "zip_code",
]

## `pudl_out` setup

In [None]:
# Look up the PUDL workspace defaults and connect to the database they point at.
pudl_settings = pudl.workspace.setup.get_defaults()
pudl_engine = sa.create_engine(pudl_settings["pudl_db"])

# Pull the EIA-860 utilities output and add an integer ``year`` column derived
# from ``report_date``.
pudl_out = pudl.output.pudltabl.PudlTabl(pudl_engine=pudl_engine)
utils_eia860 = pudl_out.utils_eia860().assign(
    year=lambda utils: utils.report_date.dt.year
)

## SEC setup

In [None]:
def prep_companies_sec(company_info, company_locations):
    # either grab both sectors that have 
    sic_sector_electric = (
        # company_info.loc[company_info.sector_name.str.lower().str.contains("electric"), "sic_sector"].unique()
        [4900]
    )
    company_locations = (
        company_locations.dropna(subset=["cw_id"])
        .assign(year=lambda x: pd.to_datetime(company_locations.date).dt.year).convert_dtypes()
    )

    dupe_address_ratio = (
        len(company_locations[company_locations.duplicated(subset=idx_sec, keep=False)])
        /len(company_locations)
    )
    if dupe_address_ratio > .0001:
        raise AssertionError(
            f"{dupe_address_ratio:.3%} of addresses are duplicates based on {idx_sec}. "
            "More than expected (.01%)."
        )
    companies_sec = (
        company_info
        [company_info.sic_sector.isin(sic_sector_electric)]
        .merge(
            company_locations,on=["cw_id", "year"],
            validate="1:m",
            how="left"
        )
        .rename(columns={
            "company_name": "utility_name_eia",
            "city": "city",
            "street_1": "street_address",
            "street_2": "address_2",
            "postal_code": "zip_code"
        })
    )
    return companies_sec

In [None]:
# ``company_info`` and ``company_locations`` are the frames that the CSV
# extraction loop bound into the notebook namespace, so that cell must run first.
companies_sec = prep_companies_sec(company_info, company_locations)

# Make Features

In [None]:
def make_features(
    companies_sec: pd.DataFrame,
    utils_eia860: pd.DataFrame,
    block_col: str | None = "year",
) -> pd.DataFrame:
    """Generate comparison feature vectors between SEC and EIA records.

    The recordlinkage package helps us create feature vectors. For each column
    compared below, this function produces one feature column whose values lie
    between 0 and 1 and measure how similar the two records of a candidate pair
    are in that column (1 meaning identical, 0 meaning not similar at all).

    For more details see recordlinkage's documentation:
    https://recordlinkage.readthedocs.io/en/latest/ref-compare.html

    Args:
        companies_sec: SEC company records. Must contain ``utility_name_eia``,
            ``city``, ``street_address``, ``address_2`` and ``zip_code``, plus
            ``block_col`` when blocking is used.
        utils_eia860: EIA utility records with the same columns.
        block_col: Column that candidate pairs must agree on. Only record pairs
            with equal values in this column are compared. Default is
            ``year``. If None, features are generated for every possible pair
            of records, which can be very large.

    Returns:
        A dataframe of feature vectors between SEC and EIA, one row per
        candidate pair and one column per compared field.
    """
    # Fuzzy (Jaro-Winkler) similarity for free-text columns, exact match for
    # the zip code.
    string_cols = ["utility_name_eia", "city", "street_address", "address_2"]
    comparisons = [
        String(col, col, label=col, method="jarowinkler") for col in string_cols
    ]
    comparisons.append(Exact("zip_code", "zip_code", label="zip_code"))
    compare_cl = rl.Compare(features=comparisons)

    # Generate the index of all candidate pairs. Blocking on None is not
    # valid, so fall back to the full cross product when no blocking column is
    # given, as the docstring promises.
    indexer = rl.Index()
    if block_col is None:
        indexer.full()
    else:
        indexer.block(block_col)
    feature_index = indexer.index(companies_sec, utils_eia860)

    features = compare_cl.compute(feature_index, companies_sec, utils_eia860)
    return features

In [None]:
%%time
# Build the SEC-to-EIA feature vectors with the default blocking column
# ("year"); the %%time magic above reports how long the cell takes to run.
features = make_features(companies_sec=companies_sec, utils_eia860=utils_eia860)