In [78]:
import pandas as pd
import zipfile

In [80]:
# Year to analyze; selects which archive is read from panda_inputs/{year}.zip.
year = 2020
# Numeric suffix of the Panda label file read from panda_matches/panda_label_{N}.csv.
panda_label_num = 0

### Match panda labels to their record IDs

The panda labels use columns `id_l` and `id_r` where `id_l` corresponds
to the `id` column in the FERC data and `id_r` corresponds to the
index in the EIA input data.

In [81]:
# Load the Panda label file selected by `panda_label_num`.
panda_labels = pd.read_csv(f"panda_matches/panda_label_{panda_label_num}.csv")

In [82]:
# Read both Panda input tables out of the zip archive for `year`.
# The context managers close the archive and each member handle as soon as the
# frames are loaded, instead of leaving them open for the life of the kernel.
with zipfile.ZipFile(f"panda_inputs/{year}.zip") as input_zip:
    with input_zip.open("left.csv") as ferc_file:
        ferc_input = pd.read_csv(ferc_file)  # left.csv is the FERC side
    with input_zip.open("right.csv") as eia_file:
        eia_input = pd.read_csv(eia_file)  # right.csv is the EIA side

In [83]:
# Attach the FERC and EIA record IDs to every Panda label row.
panda_labels = (
    panda_labels
    # Drop ID columns left behind by a previous run of this cell. Without this,
    # re-running the cell raises ValueError because the joined column names
    # overlap with columns already present on `panda_labels`.
    .drop(columns=["record_id_ferc1", "record_id_eia"], errors="ignore")
    # `id_l` holds values of the FERC input's `id` column.
    .join(ferc_input.set_index("id")["record_id_ferc1"], on="id_l")
    # `id_r` holds row labels of the EIA input's index.
    .join(eia_input["record_id_eia"], on="id_r")
)

In [84]:
panda_labels

Unnamed: 0,id_l,id_r,panda_label,panda_soft_label,record_id_ferc1,record_id_eia
0,27884,23861,-1,9.165326e-15,f1_steam_2020_12_144_1_4,235_1_2020_plant_gen_total_14328
1,27884,23862,-1,9.165326e-15,f1_steam_2020_12_144_1_4,235_2_2020_plant_gen_total_14328
2,27771,32223,-1,9.165326e-15,f1_steam_2020_12_164_1_1,7538_1_2020_plant_gen_total_3046
3,27771,32224,-1,9.165326e-15,f1_steam_2020_12_164_1_1,7538_2_2020_plant_gen_total_3046
4,27771,32225,-1,9.165326e-15,f1_steam_2020_12_164_1_1,7538_3_2020_plant_gen_total_3046
...,...,...,...,...,...,...
20146,28273,17335,1,9.983238e-01,f1_steam_2020_12_44_0_5,6035_1_2020_plant_unit_total_5109
20147,27571,20806,1,1.000000e+00,f1_steam_2020_12_176_1_5,59784_ct_2020_plant_prime_mover_total_24211
20148,27727,2167,-1,2.575863e-14,f1_steam_2020_12_57_1_5,708_2020_plant_total_7140_retired
20149,27718,2161,1,9.999991e-01,f1_steam_2020_12_57_0_1,703_2020_plant_total_7140


### Find the percentage of labeled training data that Panda finds

Note: the EIA input data given to Panda is distinct, meaning that only true-granularity (`true_gran`) records are included. I'm not sure whether the training labels are restricted to true-granularity records.

In [85]:
# Load the training labels; their `record_id_eia` / `record_id_ferc1` columns
# are compared against the Panda labels.
training_labels = pd.read_csv("train_ferc1_eia.csv")

In [86]:
# Stack the (EIA, FERC) record ID pairs from the Panda labels on top of the
# pairs from the training labels. Each source is de-duplicated first so that a
# pair is repeated in the result only when it occurs in BOTH sources; otherwise
# a pair listed twice within one file would be flagged by `duplicated()` and
# counted as a "find".
# NOTE(review): `panda_labels` also contains rows with panda_label == -1, so an
# overlap here means "the pair has a Panda label", not "Panda labeled the pair
# as a match" -- confirm that is the intended definition of a find.
concat_labels = pd.concat(
    [
        panda_labels[["record_id_eia", "record_id_ferc1"]].drop_duplicates(),
        training_labels[["record_id_eia", "record_id_ferc1"]].drop_duplicates(),
    ]
)

In [87]:
# A row is a "find" when the same (EIA, FERC) pair already appeared earlier in
# the stacked frame; only the later occurrence of each repeated pair is kept.
seen_before = concat_labels.duplicated(keep="first")
finds = concat_labels.loc[seen_before]

In [88]:
# TODO: this may not be the complete 2020 training data -- verify with CG.
# Its row count is the denominator for the share of labels that Panda finds.
train_labels_2020 = pd.read_csv("train_ferc1_eia_2020.csv")

In [89]:
# Show (number of finds, number of 2020 training labels, share of labels found).
# NOTE(review): `finds` is built against train_ferc1_eia.csv while the
# denominator comes from train_ferc1_eia_2020.csv -- confirm the two files agree
# on the 2020 pairs.
n_found = len(finds)
n_labeled_2020 = len(train_labels_2020)
n_found, n_labeled_2020, n_found / n_labeled_2020

(115, 119, 0.9663865546218487)

### Look at the matches that Panda didn't find

See the note above about the EIA input data containing only true-granularity (`true_gran`) records. That seems to be why one of the missed records isn't in the EIA input data at all.

In [90]:
# Collect the 2020 training pairs that are absent from `finds`.
pair_cols = ["record_id_eia", "record_id_ferc1"]
label_pairs_2020 = train_labels_2020[pair_cols]

# Kept for compatibility: the stacked frame this cell has always defined.
finds_and_labels = pd.concat([finds, label_pairs_2020])

# Left anti-join of the 2020 labels against the found pairs. The previous
# concat + drop_duplicates(keep=False) was a symmetric difference: it dropped a
# missed pair that was listed twice in the 2020 labels, and it reported pairs
# that are in `finds` but not in the 2020 labels as misses.
found_flag = label_pairs_2020.merge(
    finds[pair_cols].drop_duplicates(),  # unique right keys -> one output row per label
    on=pair_cols,
    how="left",
    indicator=True,
)["_merge"]

# merge() resets the index, so apply the mask by position to keep the row
# labels of `train_labels_2020` on the result.
misses = label_pairs_2020[(found_flag == "left_only").to_numpy()]

In [91]:
misses

Unnamed: 0,record_id_eia,record_id_ferc1
2,113_4_2020_plant_unit_total_14354,f1_steam_2020_12_134_0_1
3,2322_gt4_2020_plant_gen_total_13407,f1_steam_2020_12_108_1_1
25,55322_2020_plant_total_13407,f1_steam_2020_12_108_2_2
28,10761_2020_plant_total_13407,f1_steam_2020_12_108_0_1


Look at the Panda soft value for these misses.

Since `panda_soft_label` is NaN for all of these pairs, it seems that these records were not part of Panda's candidate set. Off the top of my head, I don't remember why Panda excludes some records from the candidate set.

In [74]:
# Look up Panda's soft label for each missed pair, keyed on the
# (record_id_eia, record_id_ferc1) pair. Pairs without a row in `panda_labels`
# come back as NaN.
soft_label_by_pair = panda_labels.set_index(["record_id_eia", "record_id_ferc1"])[
    ["panda_soft_label"]
]
misses.join(soft_label_by_pair, on=["record_id_eia", "record_id_ferc1"])

Unnamed: 0,record_id_eia,record_id_ferc1,panda_soft_label
2,113_4_2020_plant_unit_total_14354,f1_steam_2020_12_134_0_1,
3,2322_gt4_2020_plant_gen_total_13407,f1_steam_2020_12_108_1_1,
25,55322_2020_plant_total_13407,f1_steam_2020_12_108_2_2,
28,10761_2020_plant_total_13407,f1_steam_2020_12_108_0_1,


Look at the full EIA and FERC records for these missed pairs.

In [75]:
# Widen each missed pair with its full EIA record, then its full FERC record.
eia_by_record_id = eia_input.set_index("record_id_eia")
ferc_by_record_id = ferc_input.set_index("record_id_ferc1")

misses_with_eia = misses.join(eia_by_record_id, on="record_id_eia")
# Column names present on both sides are disambiguated with _eia / _ferc1.
misses_full = misses_with_eia.join(
    ferc_by_record_id,
    on="record_id_ferc1",
    lsuffix="_eia",
    rsuffix="_ferc1",
)

In [76]:
# Keep the two record ID columns first and order every other column
# alphabetically, so the EIA and FERC versions of a field sit next to each other.
id_cols = ["record_id_eia", "record_id_ferc1"]
non_id_df = misses_full[misses_full.columns.difference(id_cols)]

ordered_cols = id_cols + sorted(non_id_df.columns)
misses_full = misses_full[ordered_cols]

In [77]:
misses_full

Unnamed: 0,record_id_eia,record_id_ferc1,appro_part_label,appro_record_id_eia,capacity_eoy_mw,capacity_factor_eia,capacity_factor_ferc1,capacity_mw_eia,capacity_mw_ferc1,construction_type,...,total_fuel_cost_ferc1,total_mmbtu_eia,total_mmbtu_ferc1,true_gran,unit_id_pudl,utility_id_eia,utility_id_pudl_eia,utility_id_pudl_ferc1,utility_name_eia,utility_name_ferc1
2,113_4_2020_plant_unit_total_14354,f1_steam_2020_12_134_0_1,,,,,0.460336,,414.0,outdoor,...,48928050.0,,18752940.0,,,,,246,,PacifiCorp
3,2322_gt4_2020_plant_gen_total_13407,f1_steam_2020_12_108_1_1,plant_gen,2322_gt4_2020_plant_gen_total_13407,72.4,,0.017254,72.4,72.4,conventional,...,329508.3,,162416.9,True,,13407.0,204.0,204,Nevada Power Co,"Nevada Power Company, d/b/a NV Energy"
25,55322_2020_plant_total_13407,f1_steam_2020_12_108_2_2,plant,55322_2020_plant_total_13407,1465.6,0.475715,0.477485,1465.6,1465.4,semioutdoor,...,121221100.0,45715668.4,46894780.0,True,,13407.0,204.0,204,Nevada Power Co,"Nevada Power Company, d/b/a NV Energy"
28,10761_2020_plant_total_13407,f1_steam_2020_12_108_0_1,plant,10761_2020_plant_total_13407,358.9,0.100082,0.100578,358.9,358.8,outdoor,...,7403684.0,2785333.39,2919426.0,True,,13407.0,204.0,204,Nevada Power Co,"Nevada Power Company, d/b/a NV Energy"


Compare the primary string columns.

In [50]:
# Text fields to eyeball side by side: plant name, utility name, and the
# fuel / technology descriptors.
string_cols = [
    "plant_name_eia",
    "plant_name_ferc1",
    "utility_name_eia",
    "utility_name_ferc1",
    "energy_source_code_1",
    "technology_description",
    "fuel_type_code_pudl_eia",
    "fuel_type_code_pudl_ferc1",
]
misses_full[string_cols]

Unnamed: 0,plant_name_eia,plant_name_ferc1,utility_name_eia,utility_name_ferc1,energy_source_code_1,technology_description,fuel_type_code_pudl_eia,fuel_type_code_pudl_ferc1
2,,cholla,,PacifiCorp,,,,coal
3,Clark,clark 4,Nevada Power Co,"Nevada Power Company, d/b/a NV Energy",NG,Natural Gas Fired Combustion Turbine,gas,gas
25,Moapa Energy Facility,lenzie 1 & 2,Nevada Power Co,"Nevada Power Company, d/b/a NV Energy",NG,Natural Gas Fired Combined Cycle,gas,gas
28,Las Vegas Cogen,lv generation,Nevada Power Co,"Nevada Power Company, d/b/a NV Energy",NG,Natural Gas Fired Combined Cycle,gas,gas


Compare some of the numeric columns.

In [48]:
# Construction and installation years, EIA next to FERC.
year_cols = [
    "construction_year_eia",
    "construction_year_ferc1",
    "installation_year_eia",
    "installation_year_ferc1",
]
misses_full[year_cols]

Unnamed: 0,construction_year_eia,construction_year_ferc1,installation_year_eia,installation_year_ferc1
2,,1981.0,,1981.0
3,1973.0,1973.0,1973.0,1973.0
25,2006.0,2005.0,2006.0,2006.0
28,1994.0,1994.0,2003.0,2002.0


In [51]:
# Operating and cost fields, one EIA / FERC pair per line. The fuel type code
# pair is text rather than numeric; it is shown alongside for context.
cols = [
    "capacity_factor_eia", "capacity_factor_ferc1",
    "capacity_mw_eia", "capacity_mw_ferc1",
    "fuel_cost_per_mmbtu_eia", "fuel_cost_per_mmbtu_ferc1",
    "fuel_cost_per_mwh_eia", "fuel_cost_per_mwh_ferc1",
    "fuel_type_code_pudl_eia", "fuel_type_code_pudl_ferc1",
    "heat_rate_mmbtu_mwh_eia", "heat_rate_mmbtu_mwh_ferc1",
    "net_generation_mwh_eia", "net_generation_mwh_ferc1",
]
misses_full[cols]

Unnamed: 0,capacity_factor_eia,capacity_factor_ferc1,capacity_mw_eia,capacity_mw_ferc1,fuel_cost_per_mmbtu_eia,fuel_cost_per_mmbtu_ferc1,fuel_cost_per_mwh_eia,fuel_cost_per_mwh_ferc1,fuel_type_code_pudl_eia,fuel_type_code_pudl_ferc1,heat_rate_mmbtu_mwh_eia,heat_rate_mmbtu_mwh_ferc1,net_generation_mwh_eia,net_generation_mwh_ferc1
2,,0.460336,,414.0,,2.609087,,29.307521,,coal,,11.232844,,1669474.0
3,,0.017254,72.4,72.4,,2.028781,,30.106735,gas,gas,,14.842085,,10943.0
25,0.475715,0.477485,1465.6,1465.4,2.659401,2.58496,19.866597,19.776492,gas,gas,7.470327,7.65075,6124268.0,6129435.16
28,0.100082,0.100578,358.9,358.8,2.572691,2.536007,22.677536,23.416055,gas,gas,8.814714,9.235021,315516.0,316125.5
