In [2]:
import pandas as pd
import zipfile

In [16]:
# Report year to analyze; selects which panda_inputs/<year>.zip archive is read.
year = 2020
# Number of the Panda label file to load (panda_matches/panda_label_<n>.csv).
panda_label_num = 1

### Match panda labels to their record IDs

The panda labels use columns `id_l` and `id_r` where `id_l` corresponds
to the `id` column in the FERC data and `id_r` corresponds to the
index in the EIA input data.

In [17]:
# Load the Panda label file selected by panda_label_num; each row is one
# candidate pair identified by id_l (FERC side) and id_r (EIA side).
panda_labels = pd.read_csv(f"panda_matches/panda_label_{panda_label_num}.csv")

In [18]:
# Read the FERC ("left") and EIA ("right") input tables from the year's archive.
# Context managers guarantee that the archive and both member handles are
# closed as soon as the frames are loaded, instead of leaking open file handles
# for the lifetime of the kernel.
with zipfile.ZipFile(f"panda_inputs/{year}.zip") as input_zip:
    with input_zip.open("left.csv") as ferc_file:
        ferc_input = pd.read_csv(ferc_file)
    with input_zip.open("right.csv") as eia_file:
        eia_input = pd.read_csv(eia_file)

In [19]:
# Attach the FERC and EIA record IDs to every labeled pair.
# Any ID columns left over from a previous run of this cell are dropped first,
# so re-running the cell does not fail with an overlapping-columns error.
panda_labels = (
    panda_labels
    .drop(columns=["record_id_ferc1", "record_id_eia"], errors="ignore")
    # id_l holds values of the FERC input's `id` column
    .join(ferc_input.set_index("id")["record_id_ferc1"], on="id_l")
    # id_r holds index values of the EIA input
    .join(eia_input["record_id_eia"], on="id_r")
)

In [20]:
panda_labels

Unnamed: 0,id_l,id_r,panda_label,panda_soft_label,record_id_ferc1,record_id_eia
0,27884,23861,-1,8.310147e-15,f1_steam_2020_12_144_1_4,235_1_2020_plant_gen_total_14328
1,27884,23862,-1,8.310147e-15,f1_steam_2020_12_144_1_4,235_2_2020_plant_gen_total_14328
2,27771,32223,-1,8.310147e-15,f1_steam_2020_12_164_1_1,7538_1_2020_plant_gen_total_3046
3,27771,32224,-1,8.310147e-15,f1_steam_2020_12_164_1_1,7538_2_2020_plant_gen_total_3046
4,27771,32225,-1,8.310147e-15,f1_steam_2020_12_164_1_1,7538_3_2020_plant_gen_total_3046
...,...,...,...,...,...,...
20146,28273,17335,1,9.999273e-01,f1_steam_2020_12_44_0_5,6035_1_2020_plant_unit_total_5109
20147,27571,20806,1,1.000000e+00,f1_steam_2020_12_176_1_5,59784_ct_2020_plant_prime_mover_total_24211
20148,27727,2167,-1,2.415847e-14,f1_steam_2020_12_57_1_5,708_2020_plant_total_7140_retired
20149,27718,2161,1,1.000000e+00,f1_steam_2020_12_57_0_1,703_2020_plant_total_7140


In [21]:
# Keep only the pairs whose Panda label equals 1.
is_match = panda_labels["panda_label"].eq(1)
panda_matches = panda_labels.loc[is_match]
panda_matches

Unnamed: 0,id_l,id_r,panda_label,panda_soft_label,record_id_ferc1,record_id_eia
3021,27986,16846,1,1.000000,f1_steam_2020_12_315_0_1,1393_4_2020_plant_unit_total_11241
5031,27512,41911,1,1.000000,f1_steam_2020_12_186_7_4,59913_ct02_2020_plant_gen_total_19876
5032,27512,41912,1,1.000000,f1_steam_2020_12_186_7_4,59913_ct03_2020_plant_gen_total_19876
5262,28203,20148,1,1.000000,f1_steam_2020_12_147_1_2,55210_ca_2020_plant_prime_mover_total_15473
5498,27881,2404,1,1.000000,f1_steam_2020_12_144_1_1,1082_2020_plant_total_5742
...,...,...,...,...,...,...
20145,27720,17919,1,1.000000,f1_steam_2020_12_57_0_3,56150_2_2020_plant_unit_total_7140
20146,28273,17335,1,0.999927,f1_steam_2020_12_44_0_5,6035_1_2020_plant_unit_total_5109
20147,27571,20806,1,1.000000,f1_steam_2020_12_176_1_5,59784_ct_2020_plant_prime_mover_total_24211
20149,27718,2161,1,1.000000,f1_steam_2020_12_57_0_1,703_2020_plant_total_7140


### Find the percentage of labeled training data that Panda finds

Note: the EIA input data given to Panda is distinct, meaning that only true gran (`true_gran`) records are included. I'm not sure whether the training labels are restricted to true gran records.

In [52]:
# Load the training labels; the cells below use its record_id_eia and
# record_id_ferc1 columns.
training_labels = pd.read_csv("train_ferc1_eia.csv")

In [23]:
# Count repeated (EIA, FERC) record ID pairs among the Panda matches;
# a result of 0 means every matched pair is unique.
int(panda_matches.duplicated(subset=["record_id_eia", "record_id_ferc1"]).sum())

0

In [24]:
# Count repeated (EIA, FERC) record ID pairs among the training labels;
# a result of 0 means every labeled pair is unique.
int(training_labels.duplicated(subset=["record_id_eia", "record_id_ferc1"]).sum())

0

In [54]:
def get_training_label_recall(found_matches, training_labels):
    """Return the (EIA, FERC) record ID pairs present in both inputs.

    Both frames must contain ``record_id_eia`` and ``record_id_ferc1``
    columns; any other columns are ignored. Pairs repeated *within* either
    frame are collapsed first, so a pair is only reported when it occurs in
    both ``found_matches`` and ``training_labels``.

    Args:
        found_matches: DataFrame of matched pairs to look up.
        training_labels: DataFrame of labeled pairs to look them up in.

    Returns:
        DataFrame with the two ID columns and one row per shared pair. The
        rows are the ``training_labels`` occurrences, so they carry that
        frame's index labels.
    """
    id_cols = ["record_id_eia", "record_id_ferc1"]
    found_pairs = found_matches[id_cols].drop_duplicates()
    label_pairs = training_labels[id_cols].drop_duplicates()
    # With each side de-duplicated, a pair can only repeat in the stacked frame
    # when it is present on both sides; duplicated() flags the later
    # occurrence, i.e. the row coming from training_labels.
    stacked = pd.concat([found_pairs, label_pairs])
    return stacked[stacked.duplicated()]

In [48]:
# Pairs that Panda labeled as matches and that also appear in the training labels.
finds = get_training_label_recall(panda_matches, training_labels)

In [27]:
# TODO: this may not be the complete set of 2020 training labels; verify with CG.
train_labels_2020 = pd.read_csv("train_ferc1_eia_2020.csv")

In [49]:
# (number of finds, number of 2020 training labels, recall = finds / labels)
# NOTE(review): `finds` was computed against `training_labels`, while the
# denominator counts rows of `train_labels_2020` — confirm that the two files
# contain the same 2020 pairs, otherwise this ratio is not a true recall.
len(finds), len(train_labels_2020), len(finds)/len(train_labels_2020)

(24, 119, 0.20168067226890757)

Note: the following cells show that a handful of these training labels are not included in the candidate set of matches, so the recall computed here is slightly understated. A corrected figure is computed further down.

### Look at the matches that Panda didn't find

In [29]:
# 2020 training pairs that are NOT among the Panda finds.
# An explicit anti-join is used here. The previous concat followed by
# drop_duplicates(keep=False) is a symmetric difference: it would also list any
# find that is absent from train_labels_2020 as a "miss", and it would silently
# drop a label pair that is repeated within train_labels_2020.
id_cols = ["record_id_eia", "record_id_ferc1"]
label_pairs = train_labels_2020[id_cols]
# Kept so that any cell still referring to the stacked frame keeps working.
finds_and_labels = pd.concat([finds, label_pairs])
found_pairs = pd.MultiIndex.from_frame(finds[id_cols])
was_found = pd.MultiIndex.from_frame(label_pairs).isin(found_pairs)
# Boolean masking preserves the index labels of train_labels_2020.
misses = label_pairs[~was_found]

In [30]:
misses

Unnamed: 0,record_id_eia,record_id_ferc1
0,299_2020_plant_total_14354_retired,f1_steam_2020_12_134_3_1
2,113_4_2020_plant_unit_total_14354,f1_steam_2020_12_134_0_1
3,2322_gt4_2020_plant_gen_total_13407,f1_steam_2020_12_108_1_1
4,6076_2020_plant_owned_14354,f1_steam_2020_12_134_0_2
8,6063_2020_plant_total_12341,f1_steam_2020_12_210_1_5
...,...,...
114,62591_2020_plant_total_14354,f1_gnrt_plant_2020_12_134_0_33
115,63053_2020_plant_total_12341,f1_gnrt_plant_2020_12_210_2_13
116,63915_2020_plant_total_14354,f1_gnrt_plant_2020_12_134_0_31
117,63641_2020_plant_total_12341,f1_gnrt_plant_2020_12_210_2_18


Look at the Panda soft value for these misses.

Since there are some NaNs in the Panda soft label, it seems like these records were not part of the candidate set.

In [31]:
# Look up Panda's soft label for every missed pair. Pairs that do not occur in
# panda_labels end up with NaN.
pair_cols = ["record_id_eia", "record_id_ferc1"]
soft_label_by_pair = panda_labels.set_index(pair_cols)[["panda_soft_label"]]
misses_soft_label = misses.join(soft_label_by_pair, on=pair_cols)
misses_soft_label.sort_values(by="panda_soft_label", ascending=False)

Unnamed: 0,record_id_eia,record_id_ferc1,panda_soft_label
68,56842_2020_plant_total_14354,f1_gnrt_plant_2020_12_134_0_44,5.000000e-01
30,56466_1_2020_plant_gen_total_14354,f1_gnrt_plant_2020_12_134_0_40,5.000000e-01
39,56843_1_2020_plant_gen_total_14354,f1_gnrt_plant_2020_12_134_0_45,5.000000e-01
59,56360_2020_plant_total_14354,f1_gnrt_plant_2020_12_134_0_39,5.000000e-01
62,57039_2020_plant_total_14354,f1_gnrt_plant_2020_12_134_0_42,5.000000e-01
...,...,...,...
64,59919_2020_plant_total_13407,f1_gnrt_plant_2020_12_108_0_7,2.574490e-15
2,113_4_2020_plant_unit_total_14354,f1_steam_2020_12_134_0_1,
3,2322_gt4_2020_plant_gen_total_13407,f1_steam_2020_12_108_1_1,
25,55322_2020_plant_total_13407,f1_steam_2020_12_108_2_2,


Looking at these NaNs:
- One of them (113_4_2020...) is not in the EIA input dataset. Maybe it's not a true gran?
- The other records with NaN values do not seem to be part of the candidate set. I don't remember off the top of my head why some records are left out of the candidate set.

In [32]:
# Check whether the missed EIA record exists in the Panda EIA input at all.
# The ID must be compared exactly: the literal previously ended in a stray tab
# character, which made the comparison false for every row regardless of
# whether the record was present.
eia_input[eia_input.record_id_eia == "113_4_2020_plant_unit_total_14354"]

Unnamed: 0,record_id_eia,plant_id_eia,report_date,plant_part,generator_id,unit_id_pudl,prime_mover_code,energy_source_code_1,technology_description,ferc_acct_name,...,plant_part_id_eia,record_count,retirement_date,total_fuel_cost,total_mmbtu,utility_id_pudl,report_year,plant_id_report_year,plant_id_report_year_util_id,utility_name_eia


In [37]:
# Updated recall numbers: training pairs without a Panda soft label are
# removed from the denominator.
has_no_soft_label = misses_soft_label["panda_soft_label"].isna()
excluded = int(has_no_soft_label.sum())
n_scored_labels = len(train_labels_2020) - excluded
len(finds), n_scored_labels, len(finds) / n_scored_labels

(24, 115, 0.20869565217391303)

Looking at the soft labels from Panda:
- These records have a soft label of .5 and the rest have soft labels that are basically 0. So it seems like Panda soft labels are essentially 1, .5, or 0. 
- I'm not entirely sure how the soft label is created. My understanding is that it represents the combination of the labeling functions. It doesn't seem to be a softmax. 

In [46]:
# Missed pairs whose soft label is at least 0.00001, i.e. not effectively zero.
above_zero = misses_soft_label["panda_soft_label"].ge(0.00001)
misses_soft_label.loc[above_zero]

Unnamed: 0,record_id_eia,record_id_ferc1,panda_soft_label
15,56841_1_2020_plant_gen_total_14354,f1_gnrt_plant_2020_12_134_0_35,0.5
16,56841_2_2020_plant_gen_total_14354,f1_gnrt_plant_2020_12_134_0_36,0.5
30,56466_1_2020_plant_gen_total_14354,f1_gnrt_plant_2020_12_134_0_40,0.5
39,56843_1_2020_plant_gen_total_14354,f1_gnrt_plant_2020_12_134_0_45,0.5
40,56843_2_2020_plant_gen_total_14354,f1_gnrt_plant_2020_12_134_0_46,0.5
52,55607_2020_plant_total_14354,f1_gnrt_plant_2020_12_134_0_34,0.5
53,56666_2020_plant_total_14354,f1_gnrt_plant_2020_12_134_0_37,0.5
55,57040_2020_plant_total_14354,f1_gnrt_plant_2020_12_134_0_38,0.5
59,56360_2020_plant_total_14354,f1_gnrt_plant_2020_12_134_0_39,0.5
62,57039_2020_plant_total_14354,f1_gnrt_plant_2020_12_134_0_42,0.5


Look at the full EIA and FERC records for these missed pairs.

In [60]:
# Attach the full EIA row and then the full FERC row to every missed pair.
eia_by_record = eia_input.set_index("record_id_eia")
ferc_by_record = ferc_input.set_index("record_id_ferc1")
misses_with_eia = misses_soft_label.join(eia_by_record, on="record_id_eia")
# Column names present on both sides get an _eia / _ferc1 suffix.
misses_full = misses_with_eia.join(
    ferc_by_record, on="record_id_ferc1", lsuffix="_eia", rsuffix="_ferc1"
)

In [61]:
# Put the two ID columns first and order the remaining columns by name, so
# that the *_eia and *_ferc1 versions of a column sit next to each other.
id_cols = ["record_id_eia", "record_id_ferc1"]
non_id_df = misses_full[misses_full.columns.difference(id_cols)]
ordered_non_id = non_id_df[sorted(non_id_df.columns)]

misses_full = pd.concat([misses_full[id_cols], ordered_non_id], axis=1)
# Highest soft labels first.
misses_full = misses_full.sort_values(by=["panda_soft_label"], ascending=False)

In [62]:
misses_full

Unnamed: 0,record_id_eia,record_id_ferc1,appro_part_label,appro_record_id_eia,capacity_eoy_mw,capacity_factor_eia,capacity_factor_ferc1,capacity_mw_eia,capacity_mw_ferc1,construction_type,...,total_fuel_cost_ferc1,total_mmbtu_eia,total_mmbtu_ferc1,true_gran,unit_id_pudl,utility_id_eia,utility_id_pudl_eia,utility_id_pudl_ferc1,utility_name_eia,utility_name_ferc1
68,56842_2020_plant_total_14354,f1_gnrt_plant_2020_12_134_0_44,plant,56842_2020_plant_total_14354,115.8,,,115.8,115.80,,...,,,,True,,14354.0,246.0,246,PacifiCorp,PacifiCorp
30,56466_1_2020_plant_gen_total_14354,f1_gnrt_plant_2020_12_134_0_40,plant_gen,56466_1_2020_plant_gen_total_14354,156.0,,,156.0,156.00,,...,,,,True,,14354.0,246.0,246,PacifiCorp,PacifiCorp
39,56843_1_2020_plant_gen_total_14354,f1_gnrt_plant_2020_12_134_0_45,plant_gen,56843_1_2020_plant_gen_total_14354,122.1,,,122.1,122.10,,...,,,,True,,14354.0,246.0,246,PacifiCorp,PacifiCorp
59,56360_2020_plant_total_14354,f1_gnrt_plant_2020_12_134_0_39,plant,56360_2020_plant_total_14354,110.4,,,110.4,110.38,,...,,,,True,,14354.0,246.0,246,PacifiCorp,PacifiCorp
62,57039_2020_plant_total_14354,f1_gnrt_plant_2020_12_134_0_42,plant,57039_2020_plant_total_14354,35.2,,,35.2,35.15,,...,,,,True,,14354.0,246.0,246,PacifiCorp,PacifiCorp
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64,59919_2020_plant_total_13407,f1_gnrt_plant_2020_12_108_0_7,plant,59919_2020_plant_total_13407,15.0,,,15.0,15.00,,...,,,,True,,13407.0,204.0,204,Nevada Power Co,"Nevada Power Company, d/b/a NV Energy"
2,113_4_2020_plant_unit_total_14354,f1_steam_2020_12_134_0_1,,,,,0.460336,,414.00,outdoor,...,4.892805e+07,,1.875294e+07,,,,,246,,PacifiCorp
3,2322_gt4_2020_plant_gen_total_13407,f1_steam_2020_12_108_1_1,plant_gen,2322_gt4_2020_plant_gen_total_13407,72.4,,0.017254,72.4,72.40,conventional,...,3.295083e+05,,1.624169e+05,True,,13407.0,204.0,204,Nevada Power Co,"Nevada Power Company, d/b/a NV Energy"
25,55322_2020_plant_total_13407,f1_steam_2020_12_108_2_2,plant,55322_2020_plant_total_13407,1465.6,0.475715,0.477485,1465.6,1465.40,semioutdoor,...,1.212211e+08,45715668.40,4.689478e+07,True,,13407.0,204.0,204,Nevada Power Co,"Nevada Power Company, d/b/a NV Energy"


Compare the primary string columns.

In [63]:
# String columns to compare side by side.
string_cols = [
    "plant_name_eia",
    "plant_name_ferc1",
    "utility_name_eia",
    "utility_name_ferc1",
    "energy_source_code_1",
    "technology_description",
    "fuel_type_code_pudl_eia",
    "fuel_type_code_pudl_ferc1",
]
misses_full[string_cols]

Unnamed: 0,plant_name_eia,plant_name_ferc1,utility_name_eia,utility_name_ferc1,energy_source_code_1,technology_description,fuel_type_code_pudl_eia,fuel_type_code_pudl_ferc1
68,Rolling Hills,rolling hills,PacifiCorp,PacifiCorp,WND,Onshore Wind Turbine,wind,
30,Marengo Wind Plant,marengo,PacifiCorp,PacifiCorp,WND,Onshore Wind Turbine,wind,
39,Seven Mile Hill,seven mile hill,PacifiCorp,PacifiCorp,WND,Onshore Wind Turbine,wind,
59,Leaning Juniper,leaning juniper 1,PacifiCorp,PacifiCorp,WND,Onshore Wind Turbine,wind,
62,McFadden Ridge,mcfadden ridge i,PacifiCorp,PacifiCorp,WND,Onshore Wind Turbine,wind,
...,...,...,...,...,...,...,...,...
64,Nellis Solar PV II,nellis solar,Nevada Power Co,"Nevada Power Company, d/b/a NV Energy",SUN,Solar Photovoltaic,solar,
2,,cholla,,PacifiCorp,,,,coal
3,Clark,clark 4,Nevada Power Co,"Nevada Power Company, d/b/a NV Energy",NG,Natural Gas Fired Combustion Turbine,gas,gas
25,Moapa Energy Facility,lenzie 1 & 2,Nevada Power Co,"Nevada Power Company, d/b/a NV Energy",NG,Natural Gas Fired Combined Cycle,gas,gas


Compare some of the numeric columns.

There are a bunch of NaNs in these columns. This likely is contributing to Panda missing these matches.

In [66]:
# Construction and installation years, EIA next to FERC.
year_cols = [
    "construction_year_eia",
    "construction_year_ferc1",
    "installation_year_eia",
    "installation_year_ferc1",
]
misses_full[year_cols]

Unnamed: 0,construction_year_eia,construction_year_ferc1,installation_year_eia,installation_year_ferc1
68,2009.0,2009.0,2009.0,
30,2007.0,2007.0,2007.0,
39,2008.0,2008.0,2008.0,
59,2006.0,2006.0,2006.0,
62,2009.0,2009.0,2009.0,
...,...,...,...,...
64,2015.0,2015.0,2015.0,
2,,1981.0,,1981.0
3,1973.0,1973.0,1973.0,1973.0
25,2006.0,2005.0,2006.0,2006.0


In [67]:
# Columns listed as EIA / FERC pairs so that the two sides can be compared.
cols = [
    "capacity_factor_eia", "capacity_factor_ferc1",
    "capacity_mw_eia", "capacity_mw_ferc1",
    "fuel_cost_per_mmbtu_eia", "fuel_cost_per_mmbtu_ferc1",
    "fuel_cost_per_mwh_eia", "fuel_cost_per_mwh_ferc1",
    "fuel_type_code_pudl_eia", "fuel_type_code_pudl_ferc1",
    "heat_rate_mmbtu_mwh_eia", "heat_rate_mmbtu_mwh_ferc1",
    "net_generation_mwh_eia", "net_generation_mwh_ferc1",
]
misses_full.loc[:, cols]

Unnamed: 0,capacity_factor_eia,capacity_factor_ferc1,capacity_mw_eia,capacity_mw_ferc1,fuel_cost_per_mmbtu_eia,fuel_cost_per_mmbtu_ferc1,fuel_cost_per_mwh_eia,fuel_cost_per_mwh_ferc1,fuel_type_code_pudl_eia,fuel_type_code_pudl_ferc1,heat_rate_mmbtu_mwh_eia,heat_rate_mmbtu_mwh_ferc1,net_generation_mwh_eia,net_generation_mwh_ferc1
68,,,115.8,115.80,,,,,wind,,,,,3.632210e+08
30,,,156.0,156.00,,,,,wind,,,,,4.487080e+08
39,,,122.1,122.10,,,,,wind,,,,,4.278560e+08
59,,,110.4,110.38,,,,,wind,,,,,3.163680e+08
62,,,35.2,35.15,,,,,wind,,,,,1.092720e+08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64,,,15.0,15.00,,,,,solar,,,,,4.008800e+04
2,,0.460336,,414.00,,2.609087,,29.307521,,coal,,11.232844,,1.669474e+06
3,,0.017254,72.4,72.40,,2.028781,,30.106735,gas,gas,,14.842085,,1.094300e+04
25,0.475715,0.477485,1465.6,1465.40,2.659401,2.584960,19.866597,19.776492,gas,gas,7.470327,7.650750,6124268.0,6.129435e+06
