In [1]:
import sys  
sys.path.insert(0, '../')
sys.path.insert(0, '../../')
from panda_backend.AutoLF.PandaAutoFJJoinFunctions import *
import re
from panda_backend.Labeler import Labeler
labeler = Labeler()
def labeling_function(f):
    labeler.save_lf(f)
    return f


#### Write Labeling Functions

In [12]:
labeler.refresh() #refresh labeler whenever update a lf

"""
HEURISTICS 1:
Matching of plant names.
"""
@labeling_function
def plant_name_overlap(row):
    x = row.plant_name_ferc1_l
    y = row.plant_name_eia_r
    w = TokenWeight("uniformWeight").weight(None)
    d = lower_splitBySpace_uniformWeight_containJaccardDistance(x, y, w)
    score = 1 - d

    if x != "nan" and y != "nan" and score > 0.01:
        return 1
    else:
        return 0
        

@labeling_function
def plant_name_ferc1_contains(row):

    def contains_eia_prime_mover_code(row):
        x = lower(row.plant_name_ferc1_l)
        y = lower(row.prime_mover_code_r)
        x = splitBySpace(x)

        if x != "nan" and y != "nan" and y in x:
            return 1
        else:
            return 0

    def contains_eia_fuel_type_code_pudl(row):
        x = lower(row.plant_name_ferc1_l)
        y = lower(row.fuel_type_code_pudl_r)
        x = splitBySpace(x)

        if x != "nan" and y != "nan" and y in x:
            return 1
        else:
            return 0

    def contains_eia_energy_source_code(row):
        x = lower(row.plant_name_ferc1_l)
        y = lower(row.energy_source_code_1_r)
        x = splitBySpace(x)

        if x != "nan" and y != "nan" and y in x:
            return 1
        else:
            return 0

    def contains_eia_technology_description(row):
        x = lower(row.plant_name_ferc1_l)
        y = lower(row.technology_description_r)

        if x != "nan" and y != "nan" and y in x:
            return 1
        else:
            return 0

    scores = sum(
        [
            contains_eia_prime_mover_code(row),
            contains_eia_fuel_type_code_pudl(row),
            contains_eia_energy_source_code(row),
            contains_eia_technology_description(row),
        ]
    )

    if scores >= 2:
        return 1
    else:
        return 0
    
@labeling_function
def utility_name_overlap(row):
    x = row.utility_name_ferc1_l
    y = row.utility_name_eia_r
    w = TokenWeight("uniformWeight").weight(None)
    d = lower_splitBySpace_uniformWeight_containJaccardDistance(x, y, w)
    score = 1 - d
    
    if x == "nan" or y == "nan":
        return 0
    elif score < 0.01:
        return -1
    else:
        return 0
    
"""
HEURISTICS 3:
The `installation_year` and `construction_year` columns are directly reported on the FERC1 
side, and they can be constructed on the EIA side, based on other reported EIA columns 
(the initial dates of generator operation, and their retirement dates). It may be preferable 
to treat installation_year and construction_year as categorical columns, or to have a binary 
cutoff with +/- 1 year of tolerance.
"""
@labeling_function
def installation_year_match(row):
    x = row.installation_year_l
    y = row.installation_year_r
 
    if not math.isnan(x) and not math.isnan(y) and abs(x - y) > 1:
        return -1
    else:
        return 0
    
@labeling_function
def construction_year_match(row):
    x = row.construction_year_l
    y = row.construction_year_r
 
    if not math.isnan(x) and not math.isnan(y) and abs(x - y) > 1:
        return -1
    else:
        return 0
    
"""
HEURISTICS 4:
Numerical matching on `capacity` related attributes.
"""
@labeling_function
def capacity_mw_neg(row):
    x = row.capacity_mw_l
    y = row.capacity_mw_r
    
    threshold = 10.0
    
    if math.isnan(x) or math.isnan(y):
        return 0
    elif abs(x - y) > threshold:
        return -1
    else:
        return 0

@labeling_function
def net_generation_mwh_match(row):
    x = row.net_generation_mwh_l
    y = row.net_generation_mwh_r
    
    threshold = float("1e5")
    
    if math.isnan(x) or math.isnan(y):
        return 0
    elif abs(x - y) > threshold:
        return -1
    else:
        return 0
    
    
@labeling_function
def capacity_factor_match(row):
    x = row.capacity_factor_l
    y = row.capacity_factor_r
    
    threshold = float("5e-4")
    
    if math.isnan(x) or math.isnan(y):
        return 0
    elif abs(x - y) > threshold:
        return -1
    else:
        return 0

@labeling_function
def capacity_positive_match(row):
    x_capacity_mw = row.capacity_mw_l
    y_capacity_mw = row.capacity_mw_r
    x_net_gen = row.net_generation_mwh_l
    y_net_gen = row.net_generation_mwh_r
    x_cap_factor = row.capacity_factor_l
    y_cap_factor = row.capacity_factor_r
    
    capacity_mw_threshold = 0.0
    net_gen_threshold = 0.0
    cap_factor_threshold = float("1e-5")
    
    if math.isnan(x_capacity_mw) or \
        math.isnan(y_capacity_mw) or \
        math.isnan(x_net_gen) or \
        math.isnan(y_net_gen) or \
        math.isnan(x_cap_factor) or \
        math.isnan(y_cap_factor):
        return 0
    elif abs(x_capacity_mw - y_capacity_mw) <= capacity_mw_threshold and \
        abs(x_net_gen - y_net_gen) <= net_gen_threshold and \
        abs(x_cap_factor - y_cap_factor) <= cap_factor_threshold:
        return 1
    else:
        return 0
    
    
"""
HEURISTICS 5:
Numerical matching on `fuel` related attributes.
"""
@labeling_function
def total_fuel_cost_match(row):
    x = row.total_fuel_cost_l
    y = row.total_fuel_cost_r
    
    neg_threshold = float("1e5")
    pos_threshold = 0.0
    
    if math.isnan(x) or math.isnan(y):
        return 0
    elif abs(x - y) > neg_threshold:
        return -1
    elif abs(x - y) <= pos_threshold:
        return 1
    else:
        return 0
    
@labeling_function
def total_mmbtu_match(row):
    x = row.total_mmbtu_l
    y = row.total_mmbtu_r
    
    neg_threshold = float("1e3")
    pos_threshold = 0.0
    
    if math.isnan(x) or math.isnan(y):
        return 0
    elif abs(x - y) > neg_threshold:
        return -1
    elif abs(x - y) <= pos_threshold:
        return 1
    else:
        return 0
    
@labeling_function
def total_fuel_cost_and_mmbtu_positive_match(row):
    x_fuel_cost = row.total_fuel_cost_l
    y_fuel_cost = row.total_fuel_cost_r
    x_mmbtu = row.total_mmbtu_l
    y_mmbtu = row.total_mmbtu_r
    
    fuel_cost_threshold = 0.5
    mmbtu_threshold = 0.5
    
    if math.isnan(x_fuel_cost) or \
        math.isnan(y_fuel_cost) or \
        math.isnan(x_mmbtu) or \
        math.isnan(y_mmbtu):
        return 0
    elif abs(x_fuel_cost - y_fuel_cost) <= fuel_cost_threshold and \
        abs(x_mmbtu - x_mmbtu) <= mmbtu_threshold:
        return 1
    else:
        return 0

@labeling_function
def fuel_cost_per_mmbtu_match(row):
    x = row.fuel_cost_per_mmbtu_l
    y = row.fuel_cost_per_mmbtu_r
    
    threshold = float("1e-3")
    
    if math.isnan(x) or math.isnan(y):
        return 0
    elif abs(x - y) > threshold:
        return -1
    else:
        return 0
    
@labeling_function
def fuel_cost_per_mwh_match(row):
    x = row.fuel_cost_per_mwh_l
    y = row.fuel_cost_per_mwh_r
    
    threshold = float("1e-3")
    
    if math.isnan(x) or math.isnan(y):
        return 0
    elif abs(x - y) > threshold:
        return -1
    else:
        return 0
    

@labeling_function
def fuel_cost_per_mmbtu_and_per_mwh_positive_match(row):
    x_per_mmbtu = row.fuel_cost_per_mmbtu_l
    y_per_mmbtu = row.fuel_cost_per_mmbtu_r
    x_per_mwh = row.fuel_cost_per_mwh_l
    y_per_mwh = row.fuel_cost_per_mwh_r
    
    per_mmbtu_threshold = float("1e-4")
    per_mwh_threshold = float("1e-4")
    
    if math.isnan(x_per_mmbtu) or \
        math.isnan(y_per_mmbtu) or \
        math.isnan(x_per_mwh) or \
        math.isnan(y_per_mwh):
        return 0
    elif abs(x_per_mmbtu - y_per_mmbtu) <= per_mmbtu_threshold and \
        abs(x_per_mwh - y_per_mwh) <= per_mwh_threshold:
        return 1
    else:
        return 0

"""
HEURISTICS 6:
Numerical matching on `heat_rate` related attributes.
"""
@labeling_function
def heat_rate_mmbtu_mwh_match(row):
    x = row.heat_rate_mmbtu_mwh_l
    y = row.heat_rate_mmbtu_mwh_r
    
    neg_threshold = float("1e-2")
    
    if math.isnan(x) or math.isnan(y):
        return 0
    elif abs(x - y) > neg_threshold:
        return -1
    else:
        return 0
    
# @labeling_function
# def report_year_match(row):
#     x = row.report_year_l
#     y = row.report_year_r
    
#     if not math.isnan(x) and not math.isnan(y) and x == y:
#         return 1
#     else:
#         return 0

"""
HEURISTICS 7:
Numerical matching on `fuel_type` related attributes.
"""
@labeling_function
def ferc1_fuel_type_contains_eia_fuel_type_code_pudl(row):
    x = lower(row.fuel_type_l)
    y = lower(row.fuel_type_code_pudl_r)
    x = splitBySpace(x)
    
    if x != "nan" and y != "nan" and y not in x:
        return -1
    else:
        return 0



#### Apply Labeling Functions

In [13]:
LR_pred, err_row = labeler.apply()

[INFO] Applying LF plant_name_ferc1_contains:   2%|▏         | 426/20151 [00:00<00:04, 4256.72it/s]

[INFO] No change has been made to plant_name_overlap , using cached prediction.


[INFO] Applying LF plant_name_ferc1_contains: 100%|██████████| 20151/20151 [00:04<00:00, 4248.29it/s]


[INFO] No change has been made to utility_name_overlap , using cached prediction.
[INFO] No change has been made to installation_year_match , using cached prediction.
[INFO] No change has been made to construction_year_match , using cached prediction.
[INFO] No change has been made to capacity_mw_neg , using cached prediction.
[INFO] No change has been made to net_generation_mwh_match , using cached prediction.
[INFO] No change has been made to capacity_factor_match , using cached prediction.
[INFO] No change has been made to capacity_positive_match , using cached prediction.
[INFO] No change has been made to total_fuel_cost_match , using cached prediction.
[INFO] No change has been made to total_mmbtu_match , using cached prediction.
[INFO] No change has been made to total_fuel_cost_and_mmbtu_positive_match , using cached prediction.
[INFO] No change has been made to fuel_cost_per_mmbtu_match , using cached prediction.
[INFO] No change has been made to fuel_cost_per_mwh_match , using 