In [None]:
import pandas as pd
import zipfile

In [None]:
# Notebook configuration.
# Report year: selects the per-year training label file and filters the baseline predictions.
year = 2020
# Base name of the zip archive under panda_inputs/ that holds the Panda input tables.
panda_input_name = "full_inputs"
# Suffix of the Panda label CSV under panda_matches/ that is analyzed below.
panda_label_suffix = "19_20_plant"

### Match panda labels to their record IDs

The panda labels use columns `id_l` and `id_r` where `id_l` corresponds
to the `id` column in the FERC data and `id_r` corresponds to the
index in the EIA input data.

In [None]:
# Load the Panda label output selected by `panda_label_suffix`.
panda_label_path = f"panda_matches/panda_label_{panda_label_suffix}.csv"
panda_labels = pd.read_csv(panda_label_path)

In [None]:
# Read the FERC ("left.csv") and EIA ("right.csv") tables from the Panda input archive.
# Context managers close the archive and its member handles once the CSVs are
# parsed, instead of leaving them open for the lifetime of the kernel.
with zipfile.ZipFile(f"panda_inputs/{panda_input_name}.zip") as input_zip:
    with input_zip.open("left.csv") as left_csv:
        ferc_input = pd.read_csv(left_csv)
    with input_zip.open("right.csv") as right_csv:
        eia_input = pd.read_csv(right_csv)

In [None]:
# Attach the FERC and EIA record IDs to every labelled pair. `id_l` / `id_r` are
# looked up against the row index of the corresponding input table.
# NOTE(review): the markdown above says `id_l` corresponds to the FERC `id`
# column, but this lookup uses the index of `ferc_input` — confirm that the two
# are the same.
record_id_cols = ["record_id_ferc1", "record_id_eia"]
panda_labels = (
    # Drop ID columns left over from a previous run so the cell can be
    # re-executed without a "columns overlap" ValueError.
    panda_labels.drop(columns=record_id_cols, errors="ignore")
    .join(ferc_input["record_id_ferc1"], on="id_l")
    .join(eia_input["record_id_eia"], on="id_r")
)

In [None]:
# Keep only the pairs that Panda labelled as a match (label 1), then display them.
is_panda_match = panda_labels["panda_label"] == 1
panda_matches = panda_labels.loc[is_panda_match]
panda_matches

### Find the percentage of labeled training data that Panda finds

Note: the input EIA data to Panda is distinct, meaning that only true gran records are included. I'm not sure if the training labels are exclusive to true gran records.

Next: do a comparison to the complete training data (not just one year), maybe use `plant_id_pudl`?

In [None]:
# Load the labeled FERC1-EIA training pairs.
training_labels = pd.read_csv("train_ferc1_eia.csv")

In [None]:
# Count the Panda matches whose (EIA, FERC1) record ID pair repeats an earlier
# row; a result of 0 means there are no duplicate pairs.
pair_cols = ["record_id_eia", "record_id_ferc1"]
int(panda_matches.duplicated(subset=pair_cols).sum())

In [None]:
# Same duplicate-pair count for the training labels; 0 means every pair is unique.
pair_cols = ["record_id_eia", "record_id_ferc1"]
int(training_labels.duplicated(subset=pair_cols).sum())

In [None]:
def get_training_label_recall(found_matches, training_labels):
    """Return the labeled pairs that also appear in ``found_matches``.

    A pair is identified by its ``record_id_eia`` / ``record_id_ferc1`` values;
    any other columns in the inputs are ignored.

    Args:
        found_matches: DataFrame of predicted matches. Must contain the
            ``record_id_eia`` and ``record_id_ferc1`` columns.
        training_labels: DataFrame of reference pairs with the same two columns.

    Returns:
        DataFrame with the two ID columns, holding each pair that is present in
        both inputs exactly once. Rows (and their index labels) come from
        ``training_labels``.

    Raises:
        KeyError: If either input lacks one of the ID columns.
    """
    id_cols = ["record_id_eia", "record_id_ferc1"]
    # De-duplicate each side first: otherwise a pair repeated within a single
    # input would be flagged by ``duplicated()`` and reported as a shared pair.
    found_pairs = found_matches[id_cols].drop_duplicates()
    label_pairs = training_labels[id_cols].drop_duplicates()
    stacked = pd.concat([found_pairs, label_pairs])
    # With unique pairs on each side, a repeat can only be a label row that
    # follows its counterpart from ``found_matches``.
    return stacked[stacked.duplicated()]

In [None]:
# Pairs from the full `training_labels` set that Panda also marked as a match.
finds = get_training_label_recall(panda_matches, training_labels)

In [None]:
# Load the training labels for the configured report year.
# this is maybe not the complete 2020 training data, need to verify with CG
train_labels_year = pd.read_csv(f"train_ferc1_eia_{year}.csv")

In [None]:
# if looking at just individual plant parts then search for records with a substring
part = "gen"
# `na=False` keeps missing record IDs from producing a NaN mask (which boolean
# indexing rejects), and `regex=False` treats `part` as a literal substring.
has_part = train_labels_year.record_id_eia.str.contains(part, na=False, regex=False)
train_labels_year = train_labels_year[has_part]

In [None]:
# Recall on the year/part-filtered labels: (found, labeled, found / labeled).
# `finds` was built from the full training set, so keep only the finds that are
# also in `train_labels_year`; otherwise the numerator can include pairs that
# are not part of the denominator.
pair_cols = ["record_id_eia", "record_id_ferc1"]
finds_in_year = finds[pair_cols].drop_duplicates().merge(
    train_labels_year[pair_cols].drop_duplicates(), on=pair_cols)
n_found, n_labeled = len(finds_in_year), len(train_labels_year)
# Guard against an empty label set instead of raising ZeroDivisionError.
n_found, n_labeled, (n_found / n_labeled if n_labeled else float("nan"))

In [None]:
# compare to the baseline model matches
# NOTE(review): unpickling can execute arbitrary code — only load a trusted file.
baseline = pd.read_pickle("ferc1_eia_baseline_matches.pkl")

In [None]:
# Keep only the baseline rows whose match type is one of the prediction types.
prediction_types = ["prediction", "correct prediction"]
baseline_preds = baseline[baseline["match_type"].isin(prediction_types)]

In [None]:
# Baseline predictions (not filtered by report year) that Panda also marked as a match.
baseline_finds = get_training_label_recall(panda_matches, baseline_preds)

In [None]:
# Number of baseline predictions reported for the configured year.
in_report_year = baseline_preds["report_year"] == year
base_year = int(in_report_year.sum())

In [None]:
# Share of the configured year's baseline predictions that Panda also found:
# (found, baseline predictions, found / baseline predictions).
# `baseline_finds` is not filtered by report year, so keep only the pairs that
# the baseline predicted for `year` to match the denominator.
pair_cols = ["record_id_eia", "record_id_ferc1"]
baseline_year_pairs = baseline_preds.loc[
    baseline_preds.report_year == year, pair_cols].drop_duplicates()
baseline_finds_year = baseline_finds[pair_cols].drop_duplicates().merge(
    baseline_year_pairs, on=pair_cols)
n_found = len(baseline_finds_year)
# Guard against a year without baseline predictions instead of raising ZeroDivisionError.
n_found, base_year, (n_found / base_year if base_year else float("nan"))

Note that in the next cells it is found that a handful of these training labels aren't included in the candidate set of matches. So this recall percentage is slightly off.

### Look at the matches that Panda didn't find

In [None]:
# Labeled pairs for the selected year/part that Panda did NOT mark as a match.
# An anti-join is used instead of concat + drop_duplicates(keep=False): that
# symmetric difference also kept every pair in `finds` that is absent from
# `train_labels_year` (`finds` is built from the full training set), and it
# dropped labels that are duplicated within `train_labels_year`.
pair_cols = ["record_id_eia", "record_id_ferc1"]
labels_vs_finds = train_labels_year[pair_cols].drop_duplicates().merge(
    finds[pair_cols].drop_duplicates(), on=pair_cols, how="left", indicator=True)
misses = labels_vs_finds.loc[
    labels_vs_finds["_merge"] == "left_only", pair_cols].reset_index(drop=True)

In [None]:
# Display the labeled pairs that Panda missed.
misses

Look at the Panda soft value for these misses.

Since some of the Panda soft labels are NaN, it seems like these records were not part of the candidate set.

In [None]:
# Look up Panda's soft label for every missed pair; pairs that are absent from
# `panda_labels` end up with NaN. Display the result, highest soft label first.
pair_cols = ["record_id_eia", "record_id_ferc1"]
soft_label_by_pair = panda_labels.set_index(pair_cols)[["panda_soft_label"]]
misses_soft_label = misses.join(soft_label_by_pair, on=pair_cols)
misses_soft_label.sort_values(by="panda_soft_label", ascending=False)

Looking at these NaNs:
- One of them (113_4_2020...) is not in the EIA input dataset. Maybe it's not a true gran? It's okay for non true grans to be linked to FERC records. Should we try and get to a place where datasets can be matched without filtering by true gran?
- The other NaN values aren't part of the candidate set it seems. I don't remember off the top of my head why some values aren't included in the candidate set.

In [None]:
# Check whether this record is present in the EIA input data.
# The ID literal previously ended with a stray tab character, so the equality
# test could never match an ID stored without trailing whitespace.
eia_input[eia_input.record_id_eia == "113_4_2020_plant_unit_total_14354"]

In [None]:
# updated recall numbers
# Labeled pairs without a Panda soft label are left out of the denominator.
excluded = int(misses_soft_label.panda_soft_label.isnull().sum())
# `finds` was built from the full training set, so count only the finds that
# are also in `train_labels_year`; otherwise the numerator can include pairs
# that are not part of the denominator.
pair_cols = ["record_id_eia", "record_id_ferc1"]
finds_in_year = finds[pair_cols].drop_duplicates().merge(
    train_labels_year[pair_cols].drop_duplicates(), on=pair_cols)
n_found = len(finds_in_year)
n_candidates = len(train_labels_year) - excluded
# Guard against an empty candidate set instead of raising ZeroDivisionError.
n_found, n_candidates, (n_found / n_candidates if n_candidates else float("nan"))

Looking at the soft labels from Panda:
- These records have a soft label of .5 and the rest have soft labels that are basically 0
- I'm not entirely sure how the soft label is created. My understanding is that it represents the combination of the labeling functions. It doesn't seem to be a softmax. 

In [None]:
# Show the missed pairs whose soft label reaches the small cut-off below.
min_soft_label = .00001
misses_soft_label.loc[misses_soft_label["panda_soft_label"] >= min_soft_label]

### Look at full records

Look at the full EIA and FERC records for these missed pairs.

In [None]:
# Attach the full EIA and FERC input rows to each missed pair. Columns that
# exist on both sides of the second join get `_eia` / `_ferc1` suffixes.
eia_by_record = eia_input.set_index("record_id_eia")
ferc_by_record = ferc_input.set_index("record_id_ferc1")
misses_with_eia = misses_soft_label.join(eia_by_record, on="record_id_eia")
full_records = misses_with_eia.join(
    ferc_by_record, on="record_id_ferc1", lsuffix="_eia", rsuffix="_ferc1")

Or look at the full EIA and FERC records for the matched pairs. How good are the matches?

In [None]:
# Alternative to the cell above: attach the full EIA and FERC input rows to the
# pairs Panda labelled as a match. This overwrites `full_records`.
matched_pairs = panda_labels[panda_labels.panda_label == 1]
eia_by_record = eia_input.set_index("record_id_eia")
ferc_by_record = ferc_input.set_index("record_id_ferc1")
matched_with_eia = matched_pairs.join(eia_by_record, on="record_id_eia")
full_records = matched_with_eia.join(
    ferc_by_record, on="record_id_ferc1", lsuffix="_eia", rsuffix="_ferc1")

In [None]:
# Put the two record ID columns first and order every other column
# alphabetically so the `_eia` / `_ferc1` versions of a field sit side by side,
# then sort the rows by Panda's soft label, highest first.
id_cols = ["record_id_eia", "record_id_ferc1"]
non_id_df = full_records[full_records.columns.difference(id_cols)]
sorted_non_id_df = non_id_df.reindex(sorted(non_id_df.columns), axis=1)

ids_first = pd.concat([full_records[id_cols], sorted_non_id_df], axis=1)
full_records = ids_first.sort_values(by=["panda_soft_label"], ascending=False)

In [None]:
# Drop Panda's `id_r` / `id_l` lookup columns. `errors="ignore"` lets the cell
# be re-run, and lets it work when `full_records` was built from the missed
# pairs, which are assembled from the record ID columns only.
full_records = full_records.drop(columns=["id_r", "id_l"], errors="ignore")

In [None]:
# List the available columns to pick from in the comparisons below.
full_records.columns

Compare the primary string columns.

In [None]:
# Side-by-side view of the name, energy source/technology and fuel type columns.
string_cols = [
    "plant_name_eia",
    "plant_name_ferc1",
    "utility_name_eia",
    "utility_name_ferc1",
    "energy_source_code_1",
    "technology_description",
    "fuel_type_code_pudl_eia",
    "fuel_type_code_pudl_ferc1",
]
full_records[string_cols]

Compare some of the numeric columns.

There are a bunch of NaNs in these columns. This likely is contributing to Panda missing these matches.

In [None]:
# Side-by-side view of the construction and installation year columns.
year_cols = [
    "construction_year_eia",
    "construction_year_ferc1",
    "installation_year_eia",
    "installation_year_ferc1",
]
full_records[year_cols]

In [None]:
# EIA / FERC1 column pairs to compare, one pair per line.
cols = [
    "capacity_factor_eia", "capacity_factor_ferc1",
    "capacity_mw_eia", "capacity_mw_ferc1",
    "fuel_cost_per_mmbtu_eia", "fuel_cost_per_mmbtu_ferc1",
    "fuel_cost_per_mwh_eia", "fuel_cost_per_mwh_ferc1",
    "fuel_type_code_pudl_eia", "fuel_type_code_pudl_ferc1",
    "heat_rate_mmbtu_mwh_eia", "heat_rate_mmbtu_mwh_ferc1",
    "net_generation_mwh_eia", "net_generation_mwh_ferc1",
]
full_records[cols]

### Austen comparisons

In [None]:
# Load the manual Panda vs. FERC1-EIA comparison table.
df = pd.read_csv("manual_panda_ferc1_eia_compare.csv")

In [None]:
# Display the manual comparison table.
df

### Compare PUDL plant IDs and PUDL utility IDs between Panda matches and training records

In [None]:
# Load the distinct EIA plant parts table.
# NOTE(review): unpickling can execute arbitrary code — only load a trusted file.
ppl_distinct = pd.read_pickle("plant_parts_eia_distinct.pkl.gz")

In [None]:
# Load the full cleaned EIA plant parts table.
# NOTE(review): unpickling can execute arbitrary code — only load a trusted file.
eia_full = pd.read_pickle("full_eia_plant_parts_clean.pkl")

In [None]:
# get a full ferc not just the clean version
# NOTE(review): unpickling can execute arbitrary code — only load a trusted file.
ferc_full = pd.read_pickle("full_ferc.pkl")

In [None]:
# Rebuild the set of pairs that Panda labelled as a match (label 1).
is_panda_match = panda_labels["panda_label"] == 1
panda_matches = panda_labels.loc[is_panda_match]

In [None]:
# temporary
# Re-read the FERC input from the 2020 archive, indexed by FERC record ID.
# Context managers close the archive and the member handle once the CSV is
# parsed instead of leaving them open.
# NOTE(review): this rebinds `ferc_input` with a different index than the frame
# loaded at the top of the notebook, so earlier cells that call
# `ferc_input.set_index("record_id_ferc1")` cannot be re-run after this one.
with zipfile.ZipFile("panda_inputs/2020.zip") as input_zip:
    with input_zip.open("left.csv") as left_csv:
        ferc_input = pd.read_csv(left_csv).set_index("record_id_ferc1")

In [None]:
# Attach the EIA plant part columns and the FERC columns to the training labels.
# Columns present on both sides of the second join get `_eia` / `_ferc1` suffixes.
# Assumes `ppl_distinct` is indexed by EIA record ID and `ferc_full` by FERC
# record ID — TODO confirm.
training_with_eia = training_labels.join(ppl_distinct, on="record_id_eia")
full_training_labels = training_with_eia.join(
    ferc_full, on="record_id_ferc1", lsuffix="_eia", rsuffix="_ferc1")

In [None]:
# Attach the EIA plant part columns and the FERC columns to the Panda matches.
# Columns present on both sides of the second join get `_eia` / `_ferc1` suffixes.
# Assumes `ppl_distinct` is indexed by EIA record ID and `ferc_full` by FERC
# record ID — TODO confirm.
matches_with_eia = panda_matches.join(ppl_distinct, on="record_id_eia")
panda_plant_util_matches = matches_with_eia.join(
    ferc_full, on="record_id_ferc1", lsuffix="_eia", rsuffix="_ferc1")

In [None]:
# Display the Panda matches with the joined EIA and FERC columns.
panda_plant_util_matches

Check if the PUDL plant IDs match up

In [None]:
# PUDL plant IDs from the EIA side and the FERC side of each training pair.
full_training_labels[["plant_id_pudl_eia", "plant_id_pudl_ferc1"]]

In [None]:
# PUDL plant IDs from the EIA side and the FERC side of each Panda match.
panda_plant_util_matches[["plant_id_pudl_eia", "plant_id_pudl_ferc1"]]

In [None]:
# Panda matches whose EIA and FERC sides share the same PUDL plant ID.
eia_plant_id = panda_plant_util_matches["plant_id_pudl_eia"]
ferc_plant_id = panda_plant_util_matches["plant_id_pudl_ferc1"]
panda_plant_util_matches[eia_plant_id == ferc_plant_id]

In [None]:
# Panda matches whose EIA and FERC sides have different PUDL plant IDs.
# Note: `!=` is also True when either ID is missing (NaN), so those rows are
# listed here as well.
eia_plant_id = panda_plant_util_matches["plant_id_pudl_eia"]
ferc_plant_id = panda_plant_util_matches["plant_id_pudl_ferc1"]
panda_plant_util_matches[eia_plant_id != ferc_plant_id]

In [None]:
# Mean Panda soft label of the matches whose PUDL plant IDs agree.
eia_plant_id = panda_plant_util_matches["plant_id_pudl_eia"]
ferc_plant_id = panda_plant_util_matches["plant_id_pudl_ferc1"]
same_plant_matches = panda_plant_util_matches[eia_plant_id == ferc_plant_id]
same_plant_matches["panda_soft_label"].mean()

In [None]:
# Mean Panda soft label of the matches whose PUDL plant IDs differ.
# Note: `!=` is also True when either ID is missing (NaN), so those rows are
# part of this average.
eia_plant_id = panda_plant_util_matches["plant_id_pudl_eia"]
ferc_plant_id = panda_plant_util_matches["plant_id_pudl_ferc1"]
different_plant_matches = panda_plant_util_matches[eia_plant_id != ferc_plant_id]
different_plant_matches["panda_soft_label"].mean()

Check if the PUDL plant and utility IDs match up

In [None]:
# PUDL plant and utility IDs from both sides of each Panda match.
pudl_id_cols = [
    "plant_id_pudl_eia",
    "plant_id_pudl_ferc1",
    "utility_id_pudl_eia",
    "utility_id_pudl_ferc1",
]
panda_plant_util_matches[pudl_id_cols]

In [None]:
# Panda matches whose EIA and FERC sides share the same PUDL utility ID.
eia_utility_id = panda_plant_util_matches["utility_id_pudl_eia"]
ferc_utility_id = panda_plant_util_matches["utility_id_pudl_ferc1"]
panda_plant_util_matches[eia_utility_id == ferc_utility_id]

In [None]:
# Mean Panda soft label of the matches whose PUDL utility IDs agree.
eia_utility_id = panda_plant_util_matches["utility_id_pudl_eia"]
ferc_utility_id = panda_plant_util_matches["utility_id_pudl_ferc1"]
same_utility_matches = panda_plant_util_matches[eia_utility_id == ferc_utility_id]
same_utility_matches["panda_soft_label"].mean()

In [None]:
# Mean Panda soft label of the matches whose PUDL utility IDs differ.
# Note: `!=` is also True when either ID is missing (NaN), so those rows are
# part of this average.
eia_utility_id = panda_plant_util_matches["utility_id_pudl_eia"]
ferc_utility_id = panda_plant_util_matches["utility_id_pudl_ferc1"]
different_utility_matches = panda_plant_util_matches[eia_utility_id != ferc_utility_id]
different_utility_matches["panda_soft_label"].mean()

### Do the plant parts aggregate to the correct plant
Do different EIA records of the same plant aggregate to the same FERC record or different?

Not entirely sure what I wanted to achieve here. Come back to this sometime.

In [None]:
# Load the generator-level Panda labels and keep the pairs labelled as a match.
gens = pd.read_csv("panda_matches/panda_label_19_20_gens.csv")
is_gen_match = gens["panda_label"] == 1
gens_matches = gens.loc[is_gen_match]

In [None]:
# Load the plant-level Panda labels and keep the pairs labelled as a match.
plants = pd.read_csv("panda_matches/panda_label_19_20_plant.csv")
is_plant_match = plants["panda_label"] == 1
plants_matches = plants.loc[is_plant_match]

In [None]:
# Reload the full EIA plant parts and full FERC tables (the same pickle files
# that are read earlier in this notebook).
# NOTE(review): unpickling can execute arbitrary code — only load trusted files.
eia_full = pd.read_pickle("full_eia_plant_parts_clean.pkl")
# get a full ferc not just the clean version
ferc_full = pd.read_pickle("full_ferc.pkl")

In [None]:
# Attach the full EIA and FERC columns to the generator-level matches.
# NOTE(review): `gens_matches` comes straight from the label CSV; the earlier
# cells had to join `record_id_eia` / `record_id_ferc1` onto the labels via
# `id_l` / `id_r`, so these key columns may be missing here — confirm.
gens_with_eia = gens_matches.join(eia_full, on="record_id_eia")
full_gens = gens_with_eia.join(
    ferc_full, on="record_id_ferc1", lsuffix="_eia", rsuffix="_ferc1")

In [None]:
# Attach the full EIA and FERC columns to the plant-level matches.
# NOTE(review): `plants_matches` comes straight from the label CSV; the earlier
# cells had to join `record_id_eia` / `record_id_ferc1` onto the labels via
# `id_l` / `id_r`, so these key columns may be missing here — confirm.
plants_with_eia = plants_matches.join(eia_full, on="record_id_eia")
full_plants = plants_with_eia.join(
    ferc_full, on="record_id_ferc1", lsuffix="_eia", rsuffix="_ferc1")

In [None]:
# True when no PUDL plant ID repeats across the matched plant records.
# NOTE(review): if both joined tables carry `plant_id_pudl`, the join suffixes
# rename it to `plant_id_pudl_eia` / `plant_id_pudl_ferc1` (as used earlier in
# this notebook) — confirm that an unsuffixed column exists here.
full_plants.plant_id_pudl.is_unique

In [None]:
# Show every matched plant record that shares its (plant_id_pudl, report_date)
# with another record, ordered by plant ID.
plant_date_cols = ["plant_id_pudl", "report_date"]
shares_plant_date = full_plants.duplicated(subset=plant_date_cols, keep=False)
full_plants[shares_plant_date].sort_values(by="plant_id_pudl")

In [None]:
# Stack the plant-level and generator-level matches into one frame.
plant_gen = pd.concat([full_plants, full_gens])

In [None]:
# not sure what I wanted to do here. Use .agg and do something?
# Groups the stacked matches by plant, report date and FERC record ID and asks
# for value counts within each group.
# NOTE(review): every selected column is also a grouping key, so no column is
# left over for value_counts() to count — confirm this runs and returns what
# was intended.
plant_gen[["plant_id_pudl", "report_date", "record_id_ferc1"]].groupby(["plant_id_pudl", "report_date", "record_id_ferc1"]).value_counts()