In [1]:
import pandas as pd
import zipfile

In [55]:
# Report year: selects the per-year training label file and the baseline
# predictions that are counted further down.
year = 2020
# Base name of the zipped Panda input read from panda_inputs/.
# NOTE(review): this string and the suffix below are not derived from `year`;
# keep all three in sync when switching years.
panda_input_name = "2020_gens"
# Suffix of the Panda label file read from panda_matches/.
panda_label_suffix = "2020_gens_2"

### Match panda labels to their record IDs

The panda labels use columns `id_l` and `id_r` where `id_l` corresponds
to the `id` column in the FERC data and `id_r` corresponds to the
index in the EIA input data.

In [56]:
# load Panda's labels for the candidate pairs (one row per id_l / id_r pair)
panda_labels = pd.read_csv(f"panda_matches/panda_label_{panda_label_suffix}.csv")

In [57]:
# Read the left (FERC) and right (EIA) Panda input tables out of the zipped inputs.
# The context managers close the archive and the member handles as soon as both
# tables are loaded, instead of leaving the zip file open for the life of the kernel.
with zipfile.ZipFile(f"panda_inputs/{panda_input_name}.zip") as input_zip:
    with input_zip.open("left.csv") as ferc_file:
        ferc_input = pd.read_csv(ferc_file)
    with input_zip.open("right.csv") as eia_file:
        eia_input = pd.read_csv(eia_file)

In [58]:
# Attach the FERC and EIA record IDs to each labeled pair. `join(..., on=col)`
# looks the values of `col` up in the other frame's index, so `id_l` / `id_r` are
# used as row labels of `ferc_input` / `eia_input`.
# Record ID columns left over from a previous run of this cell are dropped first;
# without that, re-running the cell raises a column-overlap ValueError.
panda_labels = (
    panda_labels
    .drop(columns=["record_id_ferc1", "record_id_eia"], errors="ignore")
    .join(ferc_input["record_id_ferc1"], on="id_l")
    .join(eia_input["record_id_eia"], on="id_r")
)

In [59]:
# keep only the candidate pairs whose Panda label is 1
panda_matches = panda_labels.loc[panda_labels["panda_label"].eq(1)]
panda_matches

Unnamed: 0,id_l,id_r,panda_label,panda_soft_label,record_id_ferc1,record_id_eia
7275,851,2789,1,0.836556,f1_steam_2020_12_191_0_2,1240_gt1_2020_plant_gen_total_10005
7276,851,2790,1,0.836556,f1_steam_2020_12_191_0_2,1240_gt2_2020_plant_gen_total_10005
8432,1773,4370,1,0.724792,f1_hydro_2020_12_45_1_3,2726_2_2020_plant_gen_total_5416
8433,1773,4371,1,0.724792,f1_hydro_2020_12_45_1_3,2726_3_2020_plant_gen_total_5416
10575,1844,1351,1,0.836556,f1_hydro_2020_12_202_0_2,77_1_2020_plant_gen_total_11824
...,...,...,...,...,...,...
17649,81,11666,1,0.995060,f1_steam_2020_12_7_4_4,55522_ct5_2020_plant_gen_total_803
17650,81,11667,1,0.995060,f1_steam_2020_12_7_4_4,55522_ct6_2020_plant_gen_total_803
17651,81,11668,1,0.995060,f1_steam_2020_12_7_4_4,55522_ct7_2020_plant_gen_total_803
17652,594,14187,1,0.997755,f1_steam_2020_12_403_0_3,57703_01a_2020_plant_gen_total_3461


### Find the percentage of labeled training data that Panda finds

Note: the EIA input data given to Panda is distinct, meaning that only true granularity ("true gran") records are included. I'm not sure whether the training labels are restricted to true gran records.

Next: do a comparison to the complete training data (not just one year), maybe use `plant_id_pudl`?

In [60]:
# load the training label pairs (record_id_eia / record_id_ferc1 columns are used below)
training_labels = pd.read_csv("train_ferc1_eia.csv")

In [61]:
# count the repeated (EIA, FERC) record ID pairs in the panda matches;
# a result of 0 means every pair is unique
int(panda_matches.duplicated(subset=["record_id_eia", "record_id_ferc1"]).sum())

0

In [62]:
# same uniqueness check for the training labels: number of repeated (EIA, FERC) pairs
int(training_labels.duplicated(subset=["record_id_eia", "record_id_ferc1"]).sum())

0

In [44]:
def get_training_label_recall(found_matches, training_labels):
    """Return the (EIA, FERC) record ID pairs that appear in both inputs.

    Despite the name, this returns the shared pairs themselves rather than a
    recall ratio; callers divide its length by the number of labels.

    Args:
        found_matches: DataFrame of proposed matches. Must include the
            ``record_id_eia`` and ``record_id_ferc1`` columns.
        training_labels: DataFrame of reference matches. Must include the
            ``record_id_eia`` and ``record_id_ferc1`` columns.

    Returns:
        DataFrame with only the ``record_id_eia`` and ``record_id_ferc1``
        columns, one row per pair present in both inputs. Rows carry the
        index labels they had in ``training_labels``.
    """
    id_cols = ["record_id_eia", "record_id_ferc1"]
    # De-duplicate each side first so that a repeated pair in the stacked frame
    # can only come from the pair being present in BOTH inputs. Without this, a
    # pair repeated inside a single input would be reported as a find.
    found_pairs = found_matches[id_cols].drop_duplicates()
    label_pairs = training_labels[id_cols].drop_duplicates()
    concat_labels = pd.concat([found_pairs, label_pairs])
    # duplicated() flags the second occurrence, i.e. the row from training_labels
    finds = concat_labels[concat_labels.duplicated()]
    return finds

In [45]:
# record ID pairs that appear in both the Panda matches and the full training labels
finds = get_training_label_recall(panda_matches, training_labels)

In [46]:
# this is maybe not the complete 2020 training data, need to verify with CG
# (the file is selected by the `year` constant set at the top of the notebook)
train_labels_year = pd.read_csv(f"train_ferc1_eia_{year}.csv")

In [47]:
# if looking at just individual plant parts then search for records with a substring
part = "gen"
# regex=False: match `part` literally instead of as a regular expression.
# na=False: a missing record_id_eia counts as "no match" rather than putting a NaN
# into the mask, which boolean indexing rejects.
train_labels_year = train_labels_year[
    train_labels_year.record_id_eia.str.contains(part, regex=False, na=False)
]

In [48]:
# (number of finds, number of training labels for the year, finds / labels)
len(finds), len(train_labels_year), len(finds)/len(train_labels_year)

(2, 9, 0.2222222222222222)

In [17]:
# compare to the baseline model matches
# NOTE(review): unpickling can execute arbitrary code while loading; only read
# this file if it comes from a trusted source.
baseline = pd.read_pickle("ferc1_eia_baseline_matches.pkl")

In [27]:
# keep the baseline rows whose match_type is "prediction" or "correct prediction"
baseline_preds = baseline.loc[
    baseline["match_type"].isin(["prediction", "correct prediction"])
]

In [29]:
# record ID pairs that appear in both the Panda matches and the baseline predictions
# (the helper is reused here with the baseline predictions as the reference set)
baseline_finds = get_training_label_recall(panda_matches, baseline_preds)

In [39]:
# number of baseline predictions whose report_year equals the configured year
base_year = int((baseline_preds["report_year"] == year).sum())

In [41]:
# (pairs shared with the baseline, baseline predictions for the year, their ratio)
len(baseline_finds), base_year, len(baseline_finds)/base_year

(203, 1151, 0.17636837532580366)

Note: the following cells show that a handful of these training labels aren't included in the candidate set of matches, so this recall percentage is slightly off.

### Look at the matches that Panda didn't find

In [52]:
# Training labels for this year that are NOT among the finds (a set difference).
# Concatenating both frames and dropping every duplicated row gives a symmetric
# difference instead: a pair in `finds` that is absent from `train_labels_year`
# would be reported as a miss, and a label repeated in `train_labels_year` would
# vanish entirely. A left merge with an indicator only ever keeps label rows.
id_cols = ["record_id_eia", "record_id_ferc1"]
finds_and_labels = train_labels_year[id_cols].merge(
    finds[id_cols].drop_duplicates(),
    on=id_cols,
    how="left",
    indicator=True,
)
# The right side is de-duplicated, so the merge yields exactly one row per label
# row, in the original order; the mask therefore lines up positionally.
misses = train_labels_year.loc[
    (finds_and_labels["_merge"] == "left_only").to_numpy(), id_cols
]

In [53]:
# display the record ID pairs computed as misses
misses

Unnamed: 0,record_id_eia,record_id_ferc1
3,2322_gt4_2020_plant_gen_total_13407,f1_steam_2020_12_108_1_1
15,56841_1_2020_plant_gen_total_14354,f1_gnrt_plant_2020_12_134_0_35
16,56841_2_2020_plant_gen_total_14354,f1_gnrt_plant_2020_12_134_0_36
17,7082_gt1_2020_plant_gen_total_13407,f1_steam_2020_12_108_2_1
18,7082_gt4_2020_plant_gen_total_13407,f1_steam_2020_12_108_0_2
30,56466_1_2020_plant_gen_total_14354,f1_gnrt_plant_2020_12_134_0_40
31,56466_2_2020_plant_gen_total_14354,f1_gnrt_plant_2020_12_134_0_41


Look at the Panda soft value for these misses.

Since some of the Panda soft labels are NaN, it seems that these records were not part of the candidate set.

In [55]:
# look up the Panda soft label of every missed (EIA, FERC) pair; pairs that are
# absent from panda_labels end up with NaN
pair_cols = ["record_id_eia", "record_id_ferc1"]
soft_label_by_pair = panda_labels.set_index(pair_cols)[["panda_soft_label"]]
misses_soft_label = misses.join(soft_label_by_pair, on=pair_cols)
misses_soft_label.sort_values(by="panda_soft_label", ascending=False)

Unnamed: 0,record_id_eia,record_id_ferc1,panda_soft_label
3,2322_gt4_2020_plant_gen_total_13407,f1_steam_2020_12_108_1_1,
15,56841_1_2020_plant_gen_total_14354,f1_gnrt_plant_2020_12_134_0_35,
16,56841_2_2020_plant_gen_total_14354,f1_gnrt_plant_2020_12_134_0_36,
17,7082_gt1_2020_plant_gen_total_13407,f1_steam_2020_12_108_2_1,
18,7082_gt4_2020_plant_gen_total_13407,f1_steam_2020_12_108_0_2,
30,56466_1_2020_plant_gen_total_14354,f1_gnrt_plant_2020_12_134_0_40,
31,56466_2_2020_plant_gen_total_14354,f1_gnrt_plant_2020_12_134_0_41,


Looking at these NaNs:
- One of them (113_4_2020...) is not in the EIA input dataset. Maybe it's not a true gran? It's okay for non true grans to be linked to FERC records. Should we try and get to a place where datasets can be matched without filtering by true gran?
- The other NaN values don't seem to be part of the candidate set. I don't remember off the top of my head why some records aren't included in the candidate set.

In [32]:
# Look the record up in the EIA input. The ID has to match exactly: the previous
# literal ended in a stray tab character, so the comparison could never succeed
# and the lookup came back empty whether or not the record was present.
eia_input[eia_input.record_id_eia == "113_4_2020_plant_unit_total_14354"]

Unnamed: 0,record_id_eia,plant_id_eia,report_date,plant_part,generator_id,unit_id_pudl,prime_mover_code,energy_source_code_1,technology_description,ferc_acct_name,...,plant_part_id_eia,record_count,retirement_date,total_fuel_cost,total_mmbtu,utility_id_pudl,report_year,plant_id_report_year,plant_id_report_year_util_id,utility_name_eia


In [37]:
# updated recall numbers: labels whose panda_soft_label is null are taken out of
# the denominator
excluded = int(misses_soft_label["panda_soft_label"].isna().sum())
len(finds), len(train_labels_year) - excluded, len(finds) / (len(train_labels_year) - excluded)

(24, 115, 0.20869565217391303)

Looking at the soft labels from Panda:
- These records have a soft label of .5 and the rest have soft labels that are basically 0
- I'm not entirely sure how the soft label is created. My understanding is that it represents the combination of the labeling functions. It doesn't seem to be a softmax. 

In [None]:
# missed pairs whose soft label is at least 1e-5, i.e. not effectively zero
misses_soft_label.loc[misses_soft_label["panda_soft_label"] >= 1e-5]

### Look at full records

Look at the full EIA and FERC records for these missed pairs.

In [60]:
# attach the full EIA record, then the full FERC record, to every missed pair;
# columns present in both get an `_eia` / `_ferc1` suffix
full_records = (
    misses_soft_label
    .join(eia_input.set_index("record_id_eia"), on="record_id_eia")
    .join(
        ferc_input.set_index("record_id_ferc1"),
        on="record_id_ferc1",
        lsuffix="_eia",
        rsuffix="_ferc1",
    )
)

Or look at the full EIA and FERC records for the matched pairs. How good are the matches?

In [63]:
# attach the full EIA record, then the full FERC record, to every pair that Panda
# labeled 1; columns present in both get an `_eia` / `_ferc1` suffix
full_records = (
    panda_labels.loc[panda_labels["panda_label"].eq(1)]
    .join(eia_input.set_index("record_id_eia"), on="record_id_eia")
    .join(
        ferc_input.set_index("record_id_ferc1"),
        on="record_id_ferc1",
        lsuffix="_eia",
        rsuffix="_ferc1",
    )
)

In [64]:
# Put the two record ID columns first and all remaining columns after them in
# sorted order, which makes column-to-column comparison easier; then order the
# rows from the highest Panda soft label to the lowest.
id_cols = ["record_id_eia", "record_id_ferc1"]
non_id_df = full_records[full_records.columns.difference(id_cols)]

full_records = pd.concat(
    [full_records[id_cols], non_id_df.reindex(columns=sorted(non_id_df.columns))],
    axis=1,
).sort_values(by=["panda_soft_label"], ascending=False)

In [67]:
# Drop Panda's id_l / id_r columns now that the record IDs are attached.
# errors="ignore" keeps the cell re-runnable (a second run used to raise KeyError)
# and lets it work when `full_records` was built from the missed pairs, which do
# not appear to carry `id_l` / `id_r`.
full_records = full_records.drop(columns=["id_r", "id_l"], errors="ignore")

In [71]:
# list the columns of the joined records that are available for comparison
full_records.columns

Index(['record_id_eia', 'record_id_ferc1', 'capacity_mw_eia',
       'capacity_mw_ferc1', 'construction_year_eia', 'construction_year_ferc1',
       'fuel_type', 'fuel_type_code_pudl_eia', 'fuel_type_code_pudl_ferc1',
       'installation_year_eia', 'installation_year_ferc1', 'panda_label',
       'panda_soft_label', 'plant_id_pudl_eia', 'plant_id_pudl_ferc1',
       'plant_id_report_year_eia', 'plant_id_report_year_ferc1',
       'plant_id_report_year_util_id_eia',
       'plant_id_report_year_util_id_ferc1', 'plant_name_clean',
       'plant_name_eia', 'plant_name_ferc1', 'plant_type', 'prime_mover_code',
       'report_year_eia', 'report_year_ferc1', 'technology_description',
       'utility_id_pudl_eia', 'utility_id_pudl_ferc1', 'utility_name_eia',
       'utility_name_ferc1'],
      dtype='object')

Compare the primary string columns.

In [107]:
# name, utility, fuel and technology columns of the joined records
full_records.loc[:, [
    "plant_name_eia",
    "plant_name_ferc1",
    "utility_name_eia",
    "utility_name_ferc1",
    "energy_source_code_1",
    "technology_description",
    "fuel_type_code_pudl_eia",
    "fuel_type_code_pudl_ferc1",
]]

Unnamed: 0,plant_name_eia,plant_name_ferc1,utility_name_eia,utility_name_ferc1,energy_source_code_1,technology_description,fuel_type_code_pudl,fuel_type
20412,Cunningham,cunningham gas,Southwestern Public Service Co,Southwestern Public Service Company,NG,Natural Gas Fired Combustion Turbine,gas,
19614,Walters,walters hydro,Duke Energy Progress - (NC),"Duke Energy Progress, Inc.",WAT,Conventional Hydroelectric,hydro,
19562,Belle River,belle river oil pkr,DTE Electric Company,DTE Electric Company,DFO,Petroleum Liquids,oil,
20160,Blewett,blewett hydro,Duke Energy Progress - (NC),"Duke Energy Progress, Inc.",WAT,Conventional Hydroelectric,hydro,
20159,Blewett,blewett hydro,Duke Energy Progress - (NC),"Duke Energy Progress, Inc.",WAT,Conventional Hydroelectric,hydro,
...,...,...,...,...,...,...,...,...
18945,Chisago Community Solar,brown solar,Chisago Community Solar,Kentucky Utilities Company,SUN,Solar Photovoltaic,solar,
16439,Honbushin Solar Blessings Park,legoland solar,Waihonu South LLC,Tampa Electric Company,SUN,Solar Photovoltaic,solar,
18943,Chisago Community Solar,brown solar,Chisago Community Solar,Kentucky Utilities Company,SUN,Solar Photovoltaic,solar,
18946,Chisago Community Solar,brown solar,Chisago Community Solar,Kentucky Utilities Company,SUN,Solar Photovoltaic,solar,


Compare some of the numeric columns.

There are a bunch of NaNs in these columns. This likely is contributing to Panda missing these matches.

In [104]:
# construction and installation years from the EIA and FERC records
full_records.loc[:, [
    "construction_year_eia",
    "construction_year_ferc1",
    "installation_year_eia",
    "installation_year_ferc1",
]]

Unnamed: 0,construction_year_eia,construction_year_ferc1,installation_year_eia,installation_year_ferc1
20412,1998.0,1998.0,1998.0,1998.0
19614,1930.0,1930.0,1930.0,1930.0
19562,1981.0,1981.0,1981.0,1981.0
20160,1912.0,1912.0,1912.0,1912.0
20159,1912.0,1912.0,1912.0,1912.0
...,...,...,...,...
18945,2016.0,2016.0,2016.0,
16439,2016.0,2016.0,2016.0,
18943,2016.0,2016.0,2016.0,
18946,2016.0,2016.0,2016.0,


In [67]:
# EIA / FERC column pairs to compare, kept next to each other
cols = [
    "capacity_factor_eia", "capacity_factor_ferc1",
    "capacity_mw_eia", "capacity_mw_ferc1",
    "fuel_cost_per_mmbtu_eia", "fuel_cost_per_mmbtu_ferc1",
    "fuel_cost_per_mwh_eia", "fuel_cost_per_mwh_ferc1",
    "fuel_type_code_pudl_eia", "fuel_type_code_pudl_ferc1",
    "heat_rate_mmbtu_mwh_eia", "heat_rate_mmbtu_mwh_ferc1",
    "net_generation_mwh_eia", "net_generation_mwh_ferc1",
]
full_records.loc[:, cols]

Unnamed: 0,capacity_factor_eia,capacity_factor_ferc1,capacity_mw_eia,capacity_mw_ferc1,fuel_cost_per_mmbtu_eia,fuel_cost_per_mmbtu_ferc1,fuel_cost_per_mwh_eia,fuel_cost_per_mwh_ferc1,fuel_type_code_pudl_eia,fuel_type_code_pudl_ferc1,heat_rate_mmbtu_mwh_eia,heat_rate_mmbtu_mwh_ferc1,net_generation_mwh_eia,net_generation_mwh_ferc1
68,,,115.8,115.80,,,,,wind,,,,,3.632210e+08
30,,,156.0,156.00,,,,,wind,,,,,4.487080e+08
39,,,122.1,122.10,,,,,wind,,,,,4.278560e+08
59,,,110.4,110.38,,,,,wind,,,,,3.163680e+08
62,,,35.2,35.15,,,,,wind,,,,,1.092720e+08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64,,,15.0,15.00,,,,,solar,,,,,4.008800e+04
2,,0.460336,,414.00,,2.609087,,29.307521,,coal,,11.232844,,1.669474e+06
3,,0.017254,72.4,72.40,,2.028781,,30.106735,gas,gas,,14.842085,,1.094300e+04
25,0.475715,0.477485,1465.6,1465.40,2.659401,2.584960,19.866597,19.776492,gas,gas,7.470327,7.650750,6124268.0,6.129435e+06
