# FERC to FERC Match

A classifier for identifying FERC plant time series in FERC Form 1 data.

Input: FERC plant record  
Output: The ID of the group of records that it ought to be part of

Training data: A list of lists of unique FERC plant record IDs (each record ID is the concatenation of report year, respondent ID, supplement number, and row number). It could also be stored as a dataframe where each column is associated with a year of data (some of which could be empty). Not sure what the best structure would be.

In [None]:
%load_ext autoreload
%autoreload 3

In [None]:
import faiss
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from scipy.cluster.hierarchy import dendrogram, fcluster
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, Normalizer, OneHotEncoder
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import pairwise_distances
from sklearn.decomposition import PCA

from pudl.etl import defs
from ferc1_eia_match.name_cleaner import CompanyNameCleaner
from ferc1_eia_match.candidate_set_creation import DataframeEmbedder
from ferc1_eia_match import config
from ferc1_eia_match.metrics import ferc_to_ferc

## Get input data

In [None]:
# use denorm_plants_all_ferc1 instead?
plants_df = defs.load_asset_value("denorm_plants_steam_ferc1")

In [None]:
fuel_df = defs.load_asset_value("denorm_fuel_by_plant_ferc1")

In [None]:
fuel_cols = list(fuel_df.filter(regex=".*_fraction_mmbtu$").columns)

In [None]:
plants_df = plants_df.merge(
    fuel_df[["utility_id_ferc1", "plant_name_ferc1", "report_year"] + fuel_cols],
    on=["utility_id_ferc1", "plant_name_ferc1", "report_year"],
    how="left",
)

In [None]:
utility_name_cleaner = CompanyNameCleaner()
# default rules except keep words in parentheses
plant_name_cleaner = CompanyNameCleaner(
    cleaning_rules_list=[
        "replace_amperstand_between_space_by_AND",
        "replace_hyphen_between_spaces_by_single_space",
        "replace_underscore_by_space",
        "replace_underscore_between_spaces_by_single_space",
        "remove_text_puctuation_except_dot",
        "remove_math_symbols",
        "add_space_before_opening_parentheses",
        "add_space_after_closing_parentheses",
        "remove_parentheses",
        "remove_brackets",
        "remove_curly_brackets",
        "enforce_single_space_between_words",
    ]
)

In [None]:
plants_df = (
    plants_df.pipe(
        utility_name_cleaner.get_clean_df,
        "utility_name_ferc1",
        "utility_name",
    )
    .pipe(plant_name_cleaner.get_clean_df, "plant_name_ferc1", "plant_name")
)

Vectorize FERC data:  
* plant_name (via TF-IDF, with ngram_min and ngram_max as parameters)
* plant_type (OneHot encoded categorical feature)
* construction_type (OneHot encoded categorical feature)
* capacity_mw (MinMax scaled numerical feature)
* construction_year (OneHot encoded categorical feature)
* utility_id_ferc1 (OneHot encoded categorical feature)
* fuel_fraction_mmbtu (several MinMax scaled numerical columns, which are
normalized and treated as a single feature.)

In [None]:
ngram_min=2
ngram_max=10
plant_name_ferc1_wt=2.0
plant_type_wt=2.0
construction_type_wt=1.0
capacity_mw_wt=1.0
construction_year_wt=1.0
utility_id_ferc1_wt=1.0
fuel_fraction_wt=1.0

In [None]:
str_cols = ["plant_type", "construction_type"]
num_cols = ["capacity_mw", "construction_year", "utility_id_ferc1"] + fuel_cols

In [None]:
# Fill missing values BEFORE casting to str: astype(str) turns NaN into the
# literal string "nan", after which fillna("") has nothing left to fill.
# Going through object dtype first lets fillna accept "" even when a column
# is already categorical and "" is not one of its categories.
plants_df[str_cols] = (
    plants_df[str_cols].astype(object).fillna("").astype(str).astype("category")
)

In [None]:
plants_df[num_cols] = plants_df[num_cols].fillna(0)

In [None]:
# Build the feature embedder: each named transformer vectorizes one column
# (or group of columns) and the outputs are concatenated, with each block
# scaled by the matching entry in ``transformer_weights``.
embedder = ColumnTransformer(
    transformers=[
        (
            "plant_name_ferc1",
            # Character n-gram TF-IDF over the plant name string.
            # NOTE(review): the name cleaner above appears to write its output
            # to "plant_name", but this reads the raw "plant_name_ferc1"
            # column -- confirm which column is intended.
            TfidfVectorizer(
                analyzer="char", ngram_range=(ngram_min, ngram_max)
            ),
            # A bare string (not a list) column selector, so the vectorizer
            # receives a 1-D sequence of strings.
            "plant_name_ferc1",
        ),
        (
            "plant_type",
            OneHotEncoder(categories="auto"),
            ["plant_type"],
        ),
        (
            "construction_type",
            OneHotEncoder(categories="auto"),
            ["construction_type"],
        ),
        ("capacity_mw", MinMaxScaler(), ["capacity_mw"]),
        (
            # Years are one-hot encoded, i.e. treated as unordered categories
            # rather than as a numeric scale.
            "construction_year",
            OneHotEncoder(categories="auto"),
            ["construction_year"],
        ),
        (
            "utility_id_ferc1",
            OneHotEncoder(categories="auto"),
            ["utility_id_ferc1"],
        ),
        (
            # The fuel fraction columns are scaled individually and then
            # row-normalized so that together they act as a single feature.
            "fuel_fraction_mmbtu",
            Pipeline(
                [("scaler", MinMaxScaler()), ("norm", Normalizer())]
            ),
            fuel_cols,
        ),
    ],
    # Relative weight applied to each transformer's output block.
    transformer_weights={
        "plant_name_ferc1": plant_name_ferc1_wt,
        "plant_type": plant_type_wt,
        "construction_type": construction_type_wt,
        "capacity_mw": capacity_mw_wt,
        "construction_year": construction_year_wt,
        "utility_id_ferc1": utility_id_ferc1_wt,
        "fuel_fraction_mmbtu": fuel_fraction_wt,
    },
)

In [None]:
plants_matrix_sparse = embedder.fit_transform(plants_df)

In [None]:
plants_matrix_sparse.shape

In [None]:
%%time
d = plants_matrix_sparse.shape[1]
plants_matrix = np.float32(plants_matrix_sparse.todense())

In [None]:
plants_small = plants_matrix[:5000]

In [None]:
plants_small_df = plants_df[:5000].copy()


In [None]:
d = 500

In [None]:
%%time
pca_small = PCA(n_components=d)
plants_small_pca = pca_small.fit_transform(np.asarray(plants_small))

In [None]:
%%time
pca = PCA(n_components=d)
plants_matrix_pca = pca.fit_transform(np.asarray(plants_matrix))

In [None]:
dist_matrix_small = pairwise_distances(plants_small_pca, metric='euclidean')

In [None]:
dist_matrix = pairwise_distances(plants_matrix_pca, metric='euclidean')

In [None]:
dist_matrix_small.shape, dist_matrix.shape

In [None]:
dist_matrix_small.mean(), dist_matrix_small.max(), dist_matrix.mean(), dist_matrix.max()

In [None]:
def penalize_same_report_year(df, dist_matrix, penalty=1000):
    """Add a penalty to the distance between records sharing a report year.

    Row/column ``i`` of ``dist_matrix`` is taken to describe the ``i``-th row
    of ``df`` *by position*, so the dataframe index labels are irrelevant
    (a filtered or re-indexed dataframe works as long as the row order
    matches the matrix).

    Args:
        df: Dataframe with a ``report_year`` column, one row per record.
        dist_matrix: Square ``(len(df), len(df))`` array of pairwise
            distances. It is modified in place; pass a copy to keep the
            original.
        penalty: Value added to every off-diagonal entry whose two records
            have the same report year.

    Returns:
        The same ``dist_matrix`` object, with penalties applied.

    Raises:
        ValueError: If ``dist_matrix`` is not square with one row per record
            in ``df``.
    """
    years = df["report_year"].to_numpy()
    n_records = len(years)
    if dist_matrix.shape != (n_records, n_records):
        raise ValueError(
            f"dist_matrix has shape {dist_matrix.shape}, expected "
            f"({n_records}, {n_records}) to match the number of rows in df."
        )

    # The distance from a record to itself must be left untouched, so stash
    # the diagonal and restore it after the same-year blocks are penalized.
    diagonal = dist_matrix.diagonal().copy()
    for yr in np.unique(years):
        positions = np.flatnonzero(years == yr)
        if positions.size < 2:
            # A lone record has no same-year partner to be pushed away from.
            continue
        # np.ix_ addresses the whole same-year block without materializing
        # an explicit list of index pairs.
        dist_matrix[np.ix_(positions, positions)] += penalty
    np.fill_diagonal(dist_matrix, diagonal)
    return dist_matrix

In [None]:
%%time
dist_matrix_w_penalty_small = penalize_same_report_year(plants_small_df, dist_matrix_small.copy())

In [None]:
%%time
dist_matrix_w_penalty = penalize_same_report_year(plants_df, dist_matrix.copy())

In [None]:
# cosine similarity threshold of .75 is used in the PUDL module, but we need a distance threshold
d_threshold = 1.5

https://scikit-learn.org/stable/auto_examples/cluster/plot_agglomerative_clustering_metrics.html#sphx-glr-auto-examples-cluster-plot-agglomerative-clustering-metrics-py

Maybe cosine distance isn't the best metric here even though it's used in the PUDL module

What are we optimizing for? 
- largest possible clusters so there are few straggler records
- no overlapping report years within a cluster

Options for ensuring report year doesn't overlap:
- calculate a distance matrix and, for all record pairs with the same report year, make the distance bigger. Set metric = 'precomputed' for the model and pass the distance matrix into the clustering model
- add a dimension to the post-PCA vector? not sure how this would work without training data

In [None]:
%%time
# use the distance matrix and pass it into sklearn agglomerative clustering
agg_clustering_small = AgglomerativeClustering(n_clusters=None,
                                         metric="precomputed",
                                         linkage="average",
                                         distance_threshold=d_threshold,
                                         compute_distances=True)
labels_small = agg_clustering_small.fit_predict(np.asarray(dist_matrix_w_penalty_small))

In [None]:
plants_small_df["id"] = labels_small
ferc_to_ferc.measure_matching(plants_small_df, d_threshold, run_tags={"dataset": "small"})

In [None]:
# do records with the same report year end up in the same group despite the penalty?
plants_small_df.groupby(by=["id", "report_year"]).size().value_counts().sort_index()

In [None]:
plants_small_df.groupby(by=["id"]).size().hist(bins=20)
plt.xlabel("Size of cluster")
plt.title("Histogram of new modeled cluster sizes")

In [None]:
plants_small_df.groupby(by=["plant_id_ferc1"]).size().hist(bins=20)
plt.xlabel("Size of cluster")
plt.title("Histogram of plant_id_ferc1 cluster sizes")

In [None]:
plants_small_df.groupby(by=["id"]).size().mean()

In [None]:
plants_df.groupby(by=["plant_id_ferc1"]).size().mean()

In [None]:
plants_small_df.groupby("id").plant_id_ferc1.nunique().value_counts().sort_index()

In [None]:
%%time
# use the distance matrix to pass into agglomerative clustering
agg_clustering = AgglomerativeClustering(n_clusters=None,
                                         metric="precomputed",
                                         linkage="average",
                                         distance_threshold=d_threshold,
                                         compute_distances=True)
labels = agg_clustering.fit_predict(np.asarray(dist_matrix_w_penalty))

In [None]:
labels.max()

In [None]:
plants_labels_df = plants_df.copy()

In [None]:
# Attach the cluster labels from the full-dataset model and log metrics for
# that labeled full dataframe (not the 5000-row sample) under the "full" tag.
plants_labels_df["id"] = labels
ferc_to_ferc.measure_matching(plants_labels_df, d_threshold, run_tags={"dataset": "full"})

In [None]:
# do records with the same report year end up in the same group?
# is there a way to build this restriction into the classifier?
# could add a "distance to the rest of the cluster" metric and choose the record with smaller distance
plants_labels_df.groupby(by=["id", "report_year"]).size().value_counts()

In [None]:
plants_labels_df.groupby(by=["id"]).size().hist(bins=20)

In [None]:
plants_labels_df.groupby(by=["plant_id_ferc1"]).size().hist(bins=20)
plt.xlabel("Size of cluster")
plt.xlim(0, 30)
plt.title("Histogram of plant_id_ferc1 cluster sizes")

In [None]:
plants_labels_df.groupby("id").size().mean()

In [None]:
plants_labels_df.groupby("id").plant_id_ferc1.nunique().value_counts().sort_index().head(10)

Why are there fewer "long time series" clusters in the new records?

In [None]:
matching_cols = ["id", "report_year", "plant_name_ferc1", "plant_type", "construction_type", "capacity_mw", "construction_year", "utility_id_ferc1"] + fuel_cols

In [None]:
plants_labels_df.groupby("plant_id_ferc1").size().sort_values(ascending=False).head(10)

In [None]:
old_id = 605

In [None]:
plants_labels_df[plants_labels_df.plant_id_ferc1 == old_id].id.unique()

In [None]:
plants_labels_df[plants_labels_df.plant_id_ferc1 == old_id][matching_cols]

Dendrogram Plots

In [None]:
def get_linkage_matrix(model, **kwargs):
    """Assemble a scipy-style linkage matrix from a fitted clustering model.

    Reads ``children_``, ``distances_`` and ``labels_`` from ``model``.
    Each output row is ``[child_a, child_b, merge_distance, n_samples]``,
    where ``n_samples`` is the number of original samples under the merged
    node. ``**kwargs`` is accepted but not used.

    Returns:
        Float array with one row per merge and four columns.
    """
    merges = model.children_
    n_leaves = len(model.labels_)

    # node_sizes[k] = number of original samples beneath the k-th merge.
    node_sizes = np.zeros(merges.shape[0])
    for step, pair in enumerate(merges):
        # Ids below n_leaves are single samples; larger ids refer back to
        # the earlier merge numbered (id - n_leaves).
        node_sizes[step] = sum(
            1 if child < n_leaves else node_sizes[child - n_leaves]
            for child in pair
        )

    stacked = np.column_stack([merges, model.distances_, node_sizes])
    return stacked.astype(float)

In [None]:
def plot_dendrogram(model, **kwargs):
    """Draw the dendrogram of a fitted clustering model on the current axes.

    The model is converted to a linkage matrix with ``get_linkage_matrix``
    and every keyword argument is forwarded unchanged to
    ``scipy.cluster.hierarchy.dendrogram``. Nothing is returned.
    """
    dendrogram(get_linkage_matrix(model), **kwargs)

In [None]:
plt.figure(figsize=(5,4))
plt.title("Hierarchical Clustering Dendrogram")
# plot at most the top p levels of the dendrogram (truncate_mode="level")
plot_dendrogram(agg_clustering_small, truncate_mode="level", p=40)
plt.axhline(y=d_threshold, color='r', linestyle='-')
plt.xlabel("Number of points in node (or index of point if no parenthesis).")
plt.ylabel("Distance between clusters")
plt.show()

In [None]:
plt.figure(figsize=(5,4))
plt.title("Hierarchical Clustering Dendrogram")
# plot at most the top p levels of the dendrogram (truncate_mode="level")
plot_dendrogram(agg_clustering_small, truncate_mode="level", p=40)
plt.xlabel("Number of points in node (or index of point if no parenthesis).")
plt.axhline(y=d_threshold, color='r', linestyle='-')
plt.ylabel("Distance between clusters")
plt.ylim(0, 2.5)
plt.show()

In [None]:
plt.figure(figsize=(5,4))
plt.title("Hierarchical Clustering Dendrogram")
# plot at most the top p levels of the dendrogram (truncate_mode="level")
plot_dendrogram(agg_clustering_small, truncate_mode="level", p=40)
plt.xlabel("Number of points in node (or index of point if no parenthesis).")
plt.axhline(y=d_threshold, color='r', linestyle='-')
plt.ylabel("Distance between clusters")
plt.ylim(0, .05)
plt.show()

In [None]:
plt.figure(figsize=(5,4))
plt.title("Hierarchical Clustering Dendrogram")
# plot at most the top p levels of the dendrogram (truncate_mode="level")
plot_dendrogram(agg_clustering, truncate_mode="level", p=20)
plt.axhline(y=d_threshold, color='r', linestyle='-')
plt.ylabel("Distance between clusters")
plt.show()

In [None]:
plt.figure(figsize=(5,4))
plt.title("Hierarchical Clustering Dendrogram")
# plot at most the top p levels of the dendrogram (truncate_mode="level")
plot_dendrogram(agg_clustering, truncate_mode="level", p=20)
# plt.xlabel("Number of points in node (or index of point if no parenthesis).")
plt.axhline(y=d_threshold, color='r', linestyle='-')
plt.ylabel("Distance between clusters")
plt.ylim(0, 2)
plt.show()

In [None]:
plt.figure(figsize=(5,4))
plt.title("Hierarchical Clustering Dendrogram")
# plot at most the top p levels of the dendrogram (truncate_mode="level")
plot_dendrogram(agg_clustering, truncate_mode="level", p=20)
# plt.xlabel("Number of points in node (or index of point if no parenthesis).")
plt.axhline(y=d_threshold, color='r', linestyle='-')
plt.ylabel("Distance between clusters")
plt.ylim(0, .05)
plt.show()

Spot check clusters that get merged right above the distance threshold? Should they be the same cluster?

In [None]:
# couldn't figure out how to use scipy fcluster for this
def get_original_nodes_in_cluster(cluster_n, linkage_matrix, n):
    """List the original sample indices contained in a dendrogram node.

    Args:
        cluster_n: Node id. Ids below ``n`` are original samples; id
            ``n + k`` is the cluster created by row ``k`` of
            ``linkage_matrix``.
        linkage_matrix: Array whose first two columns hold the two child
            node ids merged at each step.
        n: Number of original samples.

    Returns:
        Original sample indices as plain ``int`` values, left subtree before
        right subtree, so the result can be used directly for positional
        indexing (e.g. ``DataFrame.iloc``).
    """
    leaves = []
    # An explicit stack replaces recursion so that long merge chains cannot
    # exceed Python's recursion limit.
    pending = [int(cluster_n)]
    while pending:
        node = pending.pop()
        if node < n:
            leaves.append(node)
            continue
        left, right = linkage_matrix[node - n][:2]
        # The linkage matrix stores child ids as floats; cast them back to
        # int. Push right first so the left child is expanded first.
        pending.append(int(right))
        pending.append(int(left))
    return leaves

In [None]:
test_m = np.array([[  0.        ,   1.        ,   1.        ,   2.        ],
                   [  4.        ,   5.        ,   5.5       ,   3.        ],
                   [  2.        ,   6.        ,  16.66666667,   4.        ],
                   [  3.        ,   7.        , 271.5       ,   5.        ]])

In [None]:
get_original_nodes_in_cluster(7, test_m, n=len(test_m)+1)

In [None]:
agg_clustering.children_.shape, agg_clustering.distances_.shape, len(plants_df)

In [None]:
# Pull the merge tree out of the full-dataset model for spot checking.
dists = agg_clustering.distances_
children = agg_clustering.children_
linkage_matrix = get_linkage_matrix(agg_clustering)
# Columns to show when eyeballing the records of a merged cluster; each
# column is listed once so the displayed frame has no duplicate columns.
matching_cols = [
    "report_year",
    "plant_id_ferc1",
    "utility_id_ferc1",
    "plant_name_ferc1",
    "plant_type",
    "construction_type",
    "capacity_mw",
    "construction_year",
] + fuel_cols

In [None]:
lower = 1.4
upper = 1.5
mask = np.logical_and(dists >= lower, dists <= upper)
indices = np.argwhere(mask)

In [None]:
def get_random_i():
    """Return a random position into the module-level ``indices`` array."""
    n_candidates = len(indices)
    # With a single argument, randint draws from [0, n_candidates).
    return np.random.randint(n_candidates)

In [None]:
i = get_random_i()
c1 = children[indices[i]].squeeze()[0]
c2 = children[indices[i]].squeeze()[1]
c1, c2

In [None]:
f"distance: {linkage_matrix[indices[i]].squeeze()[2]}, n_nodes: {linkage_matrix[indices[i]].squeeze()[3]}"

In [None]:
left_records = get_original_nodes_in_cluster(c1, linkage_matrix, n=len(plants_df))
right_records = get_original_nodes_in_cluster(c2, linkage_matrix, n=len(plants_df))

In [None]:
plants_df.iloc[left_records][matching_cols]

In [None]:
plants_df.iloc[right_records][matching_cols]

Deal with orphaned records