# FERC to FERC Match

A classifier for identifying FERC plant time series in FERC Form 1 data.

Input: FERC plant record  
Output: The ID of the group of records that it ought to be part of

Training data: A list of lists of unique FERC plant record IDs, where each record ID is the concatenation of report year, respondent ID, supplement number, and row number. It could also be stored as a dataframe in which each column corresponds to one year of data (some of which could be empty). It is not yet clear which structure would be best.

In [None]:
# Load IPython's autoreload extension and enable it in mode 3 so that edits to
# imported project modules are picked up without restarting the kernel
# (see the IPython autoreload docs for the exact semantics of each mode).
%load_ext autoreload
%autoreload 3

In [None]:
import faiss
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, Normalizer, OneHotEncoder

from pudl.etl import defs
from ferc1_eia_match.name_cleaner import CompanyNameCleaner
from ferc1_eia_match.candidate_set_creation import DataframeEmbedder
from ferc1_eia_match import config

## Get input data

In [None]:
# Load the `denorm_plants_steam_ferc1` asset through the PUDL ETL definitions.
# TODO: use denorm_plants_all_ferc1 instead?
plants_df = defs.load_asset_value("denorm_plants_steam_ferc1")

In [None]:
# Load the `denorm_fuel_by_plant_ferc1` asset; its `*_fraction_mmbtu` columns
# are merged onto the plant records below.
fuel_df = defs.load_asset_value("denorm_fuel_by_plant_ferc1")

In [None]:
# Names of every fuel table column that ends in "_fraction_mmbtu".
fuel_cols = [
    col for col in fuel_df.columns if str(col).endswith("_fraction_mmbtu")
]

In [None]:
# Attach the fuel-fraction columns to each plant record. The left join keeps
# every plant row, including those without a matching fuel record.
merge_keys = ["utility_id_ferc1", "plant_name_ferc1", "report_year"]
fuel_fractions = fuel_df[merge_keys + fuel_cols]
plants_df = plants_df.merge(fuel_fractions, on=merge_keys, how="left")

In [None]:
# Utility names use the cleaner's default rule set.
utility_name_cleaner = CompanyNameCleaner()

# Plant names use an explicit rule list (original note: the default rules,
# except that words in parentheses are kept).
plant_name_rules = [
    "replace_amperstand_between_space_by_AND",
    "replace_hyphen_between_spaces_by_single_space",
    "replace_underscore_by_space",
    "replace_underscore_between_spaces_by_single_space",
    "remove_text_puctuation_except_dot",
    "remove_math_symbols",
    "add_space_before_opening_parentheses",
    "add_space_after_closing_parentheses",
    "remove_parentheses",
    "remove_brackets",
    "remove_curly_brackets",
    "enforce_single_space_between_words",
]
plant_name_cleaner = CompanyNameCleaner(cleaning_rules_list=plant_name_rules)

In [None]:
# Clean the utility names first, then the plant names. Each call is given a
# source column name followed by the name for the cleaned output.
plants_df = utility_name_cleaner.get_clean_df(
    plants_df, "utility_name_ferc1", "utility_name"
)
plants_df = plant_name_cleaner.get_clean_df(
    plants_df, "plant_name_ferc1", "plant_name"
)

Probably should adapt the DataframeEmbedder class to function with a "pipeline"-like structure for all these embedding functions.

Vectorize FERC data:  
* plant_name (via TF-IDF, with ngram_min and ngram_max as parameters)
* plant_type (OneHot encoded categorical feature)
* construction_type (OneHot encoded categorical feature)
* capacity_mw (MinMax scaled numerical feature)
* construction_year (OneHot encoded categorical feature)
* utility_id_ferc1 (OneHot encoded categorical feature)
* fuel_fraction_mmbtu (several MinMax scaled numerical columns, which are normalized and treated as a single feature)

Note: Integer nulls are filled with 0 in the PUDL module

In [None]:
# Character n-gram range for the plant-name TF-IDF vectorizer.
ngram_min, ngram_max = 2, 10

# Relative weight of each feature block in the combined embedding.
plant_name_ferc1_wt = 2.0
plant_type_wt = 2.0
construction_type_wt = 1.0
capacity_mw_wt = 1.0
construction_year_wt = 1.0
utility_id_ferc1_wt = 1.0
fuel_fraction_wt = 1.0

In [None]:
# Columns whose nulls are filled with "" and which are cast to categoricals.
str_cols = ["plant_type", "construction_type"]
# Columns whose nulls are filled with 0 before embedding.
num_cols = ["capacity_mw", "construction_year", "utility_id_ferc1", *fuel_cols]

In [None]:
# Fill nulls BEFORE converting to str: `astype(str)` turns a missing value into
# a literal string such as "nan", after which `fillna("")` has nothing left to
# fill. Going through `object` first lets "" be inserted even if a column is
# already categorical and "" is not one of its categories.
plants_df[str_cols] = (
    plants_df[str_cols].astype(object).fillna("").astype(str).astype("category")
)

In [None]:
# Replace missing numeric values with 0 so the scalers/encoders see no nulls.
filled_numeric = plants_df[num_cols].fillna(0)
plants_df[num_cols] = filled_numeric

In [None]:
# Character n-gram TF-IDF over the plant name strings.
# NOTE(review): this reads the raw `plant_name_ferc1` column; the cleaned
# `plant_name` column produced by the plant name cleaner is not used here.
# Confirm which one is intended.
name_vectorizer = TfidfVectorizer(
    analyzer="char", ngram_range=(ngram_min, ngram_max)
)

# Fuel fractions: MinMax scale each column, then normalize each row so the
# group of columns behaves as one feature.
fuel_fraction_pipeline = Pipeline(
    steps=[("scaler", MinMaxScaler()), ("norm", Normalizer())]
)

# (name, transformer, columns) triples; their order fixes the column layout of
# the resulting embedding matrix.
feature_transformers = [
    ("plant_name_ferc1", name_vectorizer, "plant_name_ferc1"),
    ("plant_type", OneHotEncoder(categories="auto"), ["plant_type"]),
    (
        "construction_type",
        OneHotEncoder(categories="auto"),
        ["construction_type"],
    ),
    ("capacity_mw", MinMaxScaler(), ["capacity_mw"]),
    (
        "construction_year",
        OneHotEncoder(categories="auto"),
        ["construction_year"],
    ),
    ("utility_id_ferc1", OneHotEncoder(categories="auto"), ["utility_id_ferc1"]),
    ("fuel_fraction_mmbtu", fuel_fraction_pipeline, fuel_cols),
]

# Multiplier applied to each transformer's output block.
feature_weights = dict(
    plant_name_ferc1=plant_name_ferc1_wt,
    plant_type=plant_type_wt,
    construction_type=construction_type_wt,
    capacity_mw=capacity_mw_wt,
    construction_year=construction_year_wt,
    utility_id_ferc1=utility_id_ferc1_wt,
    fuel_fraction_mmbtu=fuel_fraction_wt,
)

embedder = ColumnTransformer(
    transformers=feature_transformers,
    transformer_weights=feature_weights,
)

In [None]:
# Fit every transformer on the full plant table and embed it in one pass.
# NOTE(review): later cells call `.todense()` on the result, so they assume the
# ColumnTransformer returns a sparse matrix here — confirm.
plants_matrix_sparse = embedder.fit_transform(plants_df)

In [None]:
# Inspect the embedding dimensions (rows, features).
# TODO: maybe want to downsample
plants_matrix_sparse.shape

Run similarity search

In [None]:
# Largest existing FERC plant ID in the table.
plants_df["plant_id_ferc1"].max()

In [None]:
%%time
d = plants_matrix_sparse.shape[1]
plants_matrix = np.float32(plants_matrix_sparse.todense())

In [None]:
# Work on the first 2000 records only. Take an independent, C-contiguous
# float32 ndarray copy: a plain slice is a view, so the in-place
# `faiss.normalize_L2(plants_small)` further down would otherwise also rewrite
# the corresponding rows of `plants_matrix`.
plants_small = np.array(plants_matrix[:2000], dtype=np.float32, order="C")

In [None]:
# Matching first 2000 plant records, copied so new columns can be added freely.
plants_small_df = plants_df.iloc[:2000].copy()

In [None]:
from sklearn.cluster import AgglomerativeClustering

In [None]:
# The PUDL module uses a similarity threshold of 0.75; the clustering below
# needs the equivalent distance threshold instead.
similarity_threshold = 0.75
d_threshold = 1 - similarity_threshold

In [None]:
%%time
# use the distance matrix and pass it into sklearn agglomerative clustering
agg_clustering = AgglomerativeClustering(n_clusters=None,
                                         metric="cosine", # try using "cosine", but didn't seem to work as well
                                         # connectivity=similarities,
                                         linkage="single",
                                         distance_threshold=d_threshold)
labels = agg_clustering.fit_predict(np.asarray(plants_small))

In [None]:
# Store each record's cluster label in an "id" column.
plants_small_df = plants_small_df.assign(id=labels)

In [None]:
# Number of (non-null plant_id_ferc1) records in each cluster...
cluster_sizes = plants_small_df.groupby("id")["plant_id_ferc1"].count()
# ...and how many clusters there are of each size, ordered by size.
cluster_sizes.value_counts().sort_index()

In [None]:
%%time
faiss.normalize_L2(plants_small)
# use the Inner Product Index, which is equivalent to cosine sim for normalized vectors
index = faiss.IndexFlatIP(d)
index.add(plants_small)

In [None]:
%%time
# I feel like this shouldn't take this long, maybe something is messed up with threading
similarities, neighbors = index.search(plants_small, plants_small.shape[0])

In [None]:
# Sanity check: the pairwise similarity matrix should be symmetric.
# The faiss result rows are ordered by score, so first put each row back into
# record order using `neighbors` (each row of which is a permutation of the
# record positions because every record was requested).
record_order = np.argsort(neighbors, axis=1)
aligned_similarities = np.take_along_axis(similarities, record_order, axis=1)
# The scores are float32, so compare with a tolerance above float32 rounding
# error rather than 1e-8.
np.allclose(aligned_similarities, aligned_similarities.T, rtol=0.0, atol=1e-6)

In [None]:
%%time
# this doesn't seem to work
# use the distance matrix and pass it into sklearn agglomerative clustering
agg_clustering = AgglomerativeClustering(n_clusters=None,
                                         metric="cosine", # try using "cosine", but is that a similarity?
                                         connectivity=similarities,
                                         linkage="single",
                                         distance_threshold=d_threshold)
labels = agg_clustering.fit_predict(np.asarray(plants_small))

Graph the results

Deal with orphaned records