In [1]:
import sys  
sys.path.insert(0, '../')
sys.path.insert(0, '../../')
from panda_backend.AutoLF.PandaAutoFJJoinFunctions import *
import re
from panda_backend.Labeler import Labeler
# Shared Labeler instance; functions decorated with @labeling_function are
# handed to it through save_lf.
labeler = Labeler()
def labeling_function(f):
    """Decorator that registers ``f`` with the module-level ``labeler``.

    Passes ``f`` to ``labeler.save_lf`` and returns ``f`` itself, so the
    decorated name still refers to the original function.
    """
    labeler.save_lf(f)
    return f


#### Write Labeling Functions

In [10]:
labeler.refresh()  # refresh the labeler whenever a labeling function (LF) is updated

from collections import defaultdict


"""
HEURISTICS 1:
Matching of plant names.
"""

"""
- play with weights
- play with distance metric
- play with threshold
- same goes for utility name
"""


@labeling_function
def plant_name_overlap(row):
    """Vote 1 when the FERC1 and EIA plant names overlap, otherwise 0.

    The similarity is ``1 - distance``, where the distance is returned by
    ``lower_splitBySpace_uniformWeight_jaccardDistance`` using the weights
    from ``TokenWeight("uniformWeight")``. The vote is 1 only when neither
    name is the missing-value marker "nan" and the similarity exceeds 0.01.
    """
    ferc_name, eia_name = row.plant_name_ferc1_l, row.plant_name_eia_r
    token_weights = TokenWeight("uniformWeight").weight(None)
    similarity = 1 - lower_splitBySpace_uniformWeight_jaccardDistance(
        ferc_name, eia_name, token_weights
    )

    # A missing name on either side means there is nothing to compare.
    if ferc_name == "nan" or eia_name == "nan":
        return 0
    return 1 if similarity > 0.01 else 0


"""
- recombine this into one LF?
- some of these aren't doing anything (energy_source_code_1)
"""


@labeling_function
def contains_eia_prime_mover_code(row):
    """Vote 1 when the EIA prime mover code is a token of the FERC1 plant name.

    Both values are normalised with ``lower``; the plant name is then
    tokenised with ``splitBySpace``. Returns 0 when either value is the
    missing-value marker "nan" or when the code is not among the tokens.
    """
    plant_name = lower(row.plant_name_ferc1_l)
    mover_code = lower(row.prime_mover_code_r)

    # Check for the "nan" marker on the string itself: once the name has been
    # tokenised, comparing the token collection with "nan" never matches, so
    # the guard would be dead code.
    if plant_name == "nan" or mover_code == "nan":
        return 0
    return 1 if mover_code in splitBySpace(plant_name) else 0


@labeling_function
def contains_eia_fuel_type_code_pudl(row):
    """Vote 1 when the EIA ``fuel_type_code_pudl`` is a token of the FERC1 plant name.

    Both values are normalised with ``lower``; the plant name is then
    tokenised with ``splitBySpace``. Returns 0 when either value is the
    missing-value marker "nan" or when the code is not among the tokens.
    """
    plant_name = lower(row.plant_name_ferc1_l)
    fuel_code = lower(row.fuel_type_code_pudl_r)

    # Check for the "nan" marker on the string itself: once the name has been
    # tokenised, comparing the token collection with "nan" never matches, so
    # the guard would be dead code.
    if plant_name == "nan" or fuel_code == "nan":
        return 0
    return 1 if fuel_code in splitBySpace(plant_name) else 0


"""
@labeling_function
def contains_eia_energy_source_code(row):
    x = lower(row.plant_name_ferc1_l)
    y = lower(row.energy_source_code_1_r)
    x = splitBySpace(x)

    if x != "nan" and y != "nan" and y in x:
        return 1
    else:
        return 0
"""


@labeling_function
def contains_eia_technology_description(row):
    """Vote 1 when the EIA technology description occurs inside the FERC1 plant name.

    Both values are normalised with ``lower`` and compared with the ``in``
    operator on the un-tokenised plant name. Returns 0 when either value is
    the missing-value marker "nan" or when the description is not found.
    """
    plant_name = lower(row.plant_name_ferc1_l)
    description = lower(row.technology_description_r)

    either_missing = plant_name == "nan" or description == "nan"
    return 0 if either_missing or description not in plant_name else 1


"""
- see plant name
"""


@labeling_function
def utility_name_overlap(row):
    """Vote 1 when the FERC1 and EIA utility names overlap, otherwise 0.

    The similarity is ``1 - distance``, where the distance is returned by
    ``lower_splitBySpace_uniformWeight_jaccardDistance`` using the weights
    from ``TokenWeight("uniformWeight")``. The vote is 1 only when neither
    name is the missing-value marker "nan" and the similarity exceeds 0.01.
    """
    ferc_utility, eia_utility = row.utility_name_ferc1_l, row.utility_name_eia_r
    token_weights = TokenWeight("uniformWeight").weight(None)
    similarity = 1 - lower_splitBySpace_uniformWeight_jaccardDistance(
        ferc_utility, eia_utility, token_weights
    )

    # A missing name on either side means there is nothing to compare.
    if ferc_utility == "nan" or eia_utility == "nan":
        return 0
    return 1 if similarity > 0.01 else 0


"""
HEURISTICS 3:
The `installation_year` and `construction_year` columns are directly reported on the FERC1 
side, and they can be constructed on the EIA side, based on other reported EIA columns 
(the initial dates of generator operation, and their retirement dates). It may be preferable 
to treat installation_year and construction_year as categorical columns, or to have a binary 
cutoff with +/- 1 year of tolerance.
"""

"""
- combine into one summed LF?
"""

"""
@labeling_function
def year_match(row):
    def installation_year_match(row):
        x = row.installation_year_l
        y = row.installation_year_r

        if not math.isnan(x) and not math.isnan(y) and abs(x - y) > 1:
            return -1
        else:
            return 0

    def construction_year_match(row):
        x = row.construction_year_l
        y = row.construction_year_r

        if not math.isnan(x) and not math.isnan(y) and abs(x - y) > 1:
            return -1
        else:
            return 0

    total = installation_year_match(row) + construction_year_match(row)
    # could just make this a negative only LF and return -1 or 0
    return total + 1
"""


@labeling_function
def installation_year_match(row):
    """Vote 1 when both installation years are present and differ by less than 2.

    Returns 0 when either ``installation_year_l`` or ``installation_year_r``
    is NaN, or when the absolute difference is 2 or more.
    """
    left_year = row.installation_year_l
    right_year = row.installation_year_r

    # A missing year on either side gives no evidence either way.
    if math.isnan(left_year) or math.isnan(right_year):
        return 0
    return 1 if abs(left_year - right_year) < 2 else 0


@labeling_function
def construction_year_match(row):
    """Vote 1 when both construction years are present and differ by less than 2.

    Returns 0 when either ``construction_year_l`` or ``construction_year_r``
    is NaN, or when the absolute difference is 2 or more.
    """
    left_year = row.construction_year_l
    right_year = row.construction_year_r

    # A missing year on either side gives no evidence either way.
    if math.isnan(left_year) or math.isnan(right_year):
        return 0
    return 1 if abs(left_year - right_year) < 2 else 0


"""
HEURISTICS 4:
Numerical matching on `capacity` related attributes.
"""
"""
- play with the threshold here
- should the capacity match be a percentage?
"""


@labeling_function
def capacity_mw_neg(row):
    """Vote -1 when the two capacities differ by more than 10.0, otherwise 0.

    This is a negative-only labeling function: it returns 0 when either
    ``capacity_mw_l`` or ``capacity_mw_r`` is NaN, and also when the
    absolute difference is within the tolerance.
    """
    left_capacity = row.capacity_mw_l
    right_capacity = row.capacity_mw_r
    max_gap = 10.0

    both_present = not (math.isnan(left_capacity) or math.isnan(right_capacity))
    if both_present and abs(left_capacity - right_capacity) > max_gap:
        return -1
    return 0


"""
HEURISTICS 7:
Categorical matching on `fuel_type` related attributes.
"""


@labeling_function
def ferc1_fuel_type_contains_eia_fuel_type_code_pudl(row):
    """Vote 1 when the EIA ``fuel_type_code_pudl`` is a token of the FERC1 ``fuel_type``.

    Both values are normalised with ``lower``; the FERC1 fuel type is then
    tokenised with ``splitBySpace``. Returns 0 when either value is the
    missing-value marker "nan" or when the code is not among the tokens.
    """
    ferc_fuel = lower(row.fuel_type_l)
    eia_code = lower(row.fuel_type_code_pudl_r)

    # Check for the "nan" marker on the string itself: once the fuel type has
    # been tokenised, comparing the token collection with "nan" never
    # matches, so the guard would be dead code.
    if ferc_fuel == "nan" or eia_code == "nan":
        return 0
    return 1 if eia_code in splitBySpace(ferc_fuel) else 0


"""
How is this best expressed? As a 1 and -1 or just as -1?
"""

"""
@labeling_function
def report_year_match(row):
    x = row.report_year_l
    y = row.report_year_r

    if not math.isnan(x) and not math.isnan(y) and x == y:
        return 0
    else:
        return -1
"""


@labeling_function
def fuel_type_code_pudl_match(row):
    """Vote -1 when the two ``fuel_type_code_pudl`` values are present and differ.

    This is a negative-only labeling function: it returns 0 when the codes
    are equal or when either one is the missing-value marker "nan". Both
    values are normalised with ``lower`` before comparison.
    """
    left_code = lower(row.fuel_type_code_pudl_l)
    right_code = lower(row.fuel_type_code_pudl_r)

    # Without both codes there is nothing to contradict.
    if left_code == "nan" or right_code == "nan":
        return 0
    return -1 if right_code != left_code else 0


"""
Could also match to prime_mover_code equal to GT or IC
Thought it made more sense to map to fuel_type_code_pudl
Research this
"""


@labeling_function
def name_contains_peaking(row):
    """Vote 1 when the FERC1 plant name mentions "peaking" and the EIA fuel is oil or gas.

    Both values are normalised with ``lower``. Returns 0 when the plant name
    does not contain "peaking" or when ``fuel_type_code_pudl_r`` is anything
    other than "oil" or "gas" (including the missing-value marker "nan").
    """
    if "peaking" not in lower(row.plant_name_ferc1_l):
        return 0

    eia_fuel = lower(row.fuel_type_code_pudl_r)
    # "nan" is neither of the accepted codes, so it falls through to 0.
    return 1 if eia_fuel in ("oil", "gas") else 0


"""
- sometimes the FERC fuel_type_code_pudl is missing while the FERC fuel_type is present
"""


@labeling_function
def match_fuel_type_code_pudl_to_fuel_type(row):
    """Vote -1 when the FERC1 ``fuel_type`` contradicts the EIA ``fuel_type_code_pudl``.

    The two values are considered compatible when they are equal, or when
    the EIA code is one of the codes listed for the FERC1 fuel type in the
    table below. Returns 0 when they are compatible or when either value is
    the missing-value marker "nan". Both values are normalised with
    ``lower`` before comparison.
    """
    eia_code = lower(row.fuel_type_code_pudl_r)
    ferc_fuel = lower(row.fuel_type_l)

    # FERC1 fuel type -> every EIA code accepted for it.
    # "waste heat/gas" is compatible with three codes. Listing them as
    # repeated keys of a single dict literal keeps only the last entry, so
    # the alternatives are grouped under one key instead.
    compatible_codes = {
        "water": ("hydro",),
        "diesel": ("oil",),
        "waste heat/gas": ("other", "gas", "waste"),
        "natural gas": ("gas",),
    }

    if eia_code == "nan" or ferc_fuel == "nan" or ferc_fuel == eia_code:
        return 0
    # .get() with an empty default: unknown fuel types have no accepted
    # codes, and the lookup does not insert new keys into the table.
    return -1 if eia_code not in compatible_codes.get(ferc_fuel, ()) else 0


"""
- could break these up into separate positive LFs
- make tech descriptions func a string distance measure
- fuel_type_code_pudl make the oil values the same as gas
"""


def match_to_plant_type(row):
    """Return 1 when the FERC1 plant type agrees with any of three EIA attributes.

    The plant type (normalised with ``lower``) is checked against:

    * the EIA technology description: every "_"-separated part of the plant
      type must be among the description's tokens;
    * the EIA prime mover code, via a code -> plant types table;
    * the EIA ``fuel_type_code_pudl``, via a fuel -> plant types table.

    Returns 0 when the plant type is the missing-value marker "nan" or when
    none of the three checks succeeds.

    NOTE(review): unlike the neighbouring functions this one carries no
    ``@labeling_function`` decorator, so it is not passed to the labeler --
    confirm whether that is intentional.
    """
    # Prime mover code -> plant types accepted for it (codes are used as-is).
    plant_types_by_prime_mover = {
        "ST": ["steam", "nuclear", "geothermal", "waste_heat"],
        "GT": ["combustion_turbine", "steam"],
        "HY": [
            "run-of-river",
            "storage",
            "hydro",
            "storage (re-reg)",
            "run-of river",
        ],
        "WT": ["wind"],
        "IC": ["internal_combustion"],
        "PV": ["photovoltaic", "solar_pv"],
        "CT": ["combined_cycle", "combustion_turbine"],
        "CA": ["combustion_turbine", "combined_cycle"],
    }
    # fuel_type_code_pudl -> plant types accepted for it.
    plant_types_by_fuel = {
        "coal": ["steam"],
        "hydro": [
            "hydro",
            "run-of-river",
            "storage",
            "storage (re-reg)",
            "run-of river",
        ],
        "gas": [
            "combustion_turbine",
            "combined_cycle",
            "internal_combustion",
            "steam",
            "waste_heat",
        ],
        "oil": ["combustion_turbine", "internal_combustion"],
        "solar": ["solar", "photovoltaic", "solar_pv"],
        "wind": ["wind"],
        "nuclear": ["nuclear"],
        "waste": ["steam"],
        "other": ["geothermal", "waste_heat"],
    }

    plant_type = lower(row.plant_type_l)
    if plant_type == "nan":
        return 0

    # All three checks are evaluated up front; a single hit is enough.
    type_parts = plant_type.split("_")
    tech_tokens = splitBySpace(lower(row.technology_description_r))
    tech_hit = tech_tokens != "nan" and all(
        part in tech_tokens for part in type_parts
    )

    mover_hit = plant_type in plant_types_by_prime_mover.get(
        row.prime_mover_code_r, []
    )

    fuel_hit = plant_type in plant_types_by_fuel.get(
        lower(row.fuel_type_code_pudl_r), []
    )

    return 1 if (tech_hit or mover_hit or fuel_hit) else 0



#### Apply Labeling Functions

In [11]:
# Apply the registered labeling functions and combine their outputs.
# NOTE(review): presumably LR_pred holds the combined predictions and err_row
# the rows that raised errors -- confirm against Labeler.apply.
LR_pred, err_row = labeler.apply()

[INFO] Applying LF installation_year_match:   4%|▎         | 590/16359 [00:00<00:02, 5896.74it/s]

[INFO] No change has been made to plant_name_overlap , using cached prediction.
[INFO] No change has been made to contains_eia_prime_mover_code , using cached prediction.
[INFO] No change has been made to contains_eia_fuel_type_code_pudl , using cached prediction.
[INFO] No change has been made to contains_eia_technology_description , using cached prediction.
[INFO] No change has been made to utility_name_overlap , using cached prediction.


[INFO] Applying LF installation_year_match: 100%|██████████| 16359/16359 [00:02<00:00, 5906.82it/s]
[INFO] Applying LF construction_year_match: 100%|██████████| 16359/16359 [00:02<00:00, 5847.00it/s]


[INFO] No change has been made to capacity_mw_neg , using cached prediction.
[INFO] No change has been made to ferc1_fuel_type_contains_eia_fuel_type_code_pudl , using cached prediction.
[INFO] No change has been made to fuel_type_code_pudl_match , using cached prediction.
[INFO] No change has been made to name_contains_peaking , using cached prediction.
[INFO] No change has been made to match_fuel_type_code_pudl_to_fuel_type , using cached prediction.
[INFO] Combining LFs with labeling model...
[INFO] Finished combining.
