In [1]:
%reload_ext autoreload
%autoreload 2

# Tell python where to look for modules. 
# Depending on how your jupyter handles working directories, this may not be needed.
import sys
sys.path.append('../../hourly-egrid/')

# Useful high-level external modules.
import numpy as np
import pandas as pd


from src.distribute_eia923 import *
from src.distribute_eia923 import _associate_unconnected_records, _associate_energy_source_only


year = 2020

  from pandas import Int64Index as NumericIndex


In [2]:
pudl_db = 'sqlite:///../data/pudl/pudl_data/sqlite/pudl.sqlite'
pudl_engine = sa.create_engine(pudl_db)


# extract all of the tables from pudl_out early in the process and select
# only the columns we need. this is for speed and clarity.

# gf contains the more complete generation and fuel data at the plant prime mover level
gf = load_data.load_pudl_table("generation_fuel_eia923", year=year).loc[
    :, IDX_PM_FUEL + ["net_generation_mwh", "fuel_consumed_mmbtu"]
].pipe(apply_dtype)
# load the nuclear gf data
gf_nuc = load_data.load_pudl_table("generation_fuel_nuclear_eia923", year=year).loc[
    :, IDX_PM_FUEL + ["net_generation_mwh", "fuel_consumed_mmbtu"]
].pipe(apply_dtype)
# concat the nuclear data with the main gf dataframe
gf = pd.concat([gf,gf_nuc], axis=0)
# remove non-grid connected plants
gf = data_cleaning.remove_non_grid_connected_plants(gf)

# gen contrains more granular generation data at the generator level for a subset of generators
gen = (
    load_data.load_pudl_table("generation_eia923", year=year).loc[:, IDX_GENS + ["net_generation_mwh"]]
    # removes 4 records with NaN generator_id as of pudl v0.5
    .dropna(subset=IDX_GENS)
).pipe(apply_dtype)
# remove non-grid connected plants
gen = data_cleaning.remove_non_grid_connected_plants(gen)

# gens contains a complete list of all generators
gens = load_data.load_pudl_table("generators_eia860", year=year).loc[
    :,
    IDX_GENS
    + [
        "capacity_mw",
        "operational_status",
        "retirement_date",
    ]
    + list(load_data.load_pudl_table("generators_eia860", year=year).filter(like="energy_source_code")),
]
# remove non-grid connected plants
gens = data_cleaning.remove_non_grid_connected_plants(gens)
# get a list of fuel types for later
gen_primary_fuel = gens.copy()[['plant_id_eia','generator_id','energy_source_code_1']]
# add the prime mover code to the gens df from generators entity
gens = gens.merge(load_data.load_pudl_table("generators_entity_eia").loc[:,["plant_id_eia", "generator_id","prime_mover_code"]],
                    how='left', 
                    on=["plant_id_eia", "generator_id"]).pipe(apply_dtype)
# add records for each month of the year
gens = data_cleaning.create_monthly_gens_records(gens, year)
# remove retired generator months
#gens = remove_retired_generators(gens)

# the gen table is missing some generator ids. Let's fill this using the gens table, leaving a missing value for net generation
gen = gen.merge(gens[['plant_id_eia','generator_id','report_date']], how='outer', on=['plant_id_eia','generator_id','report_date'])

Removing 0 plants that are not grid-connected
Removing 0 plants that are not grid-connected
Removing 1 plants that are not grid-connected


In [3]:
bga = load_data.load_pudl_table("boiler_generator_assn_eia860", year=2020)

In [4]:
gen_entity = load_data.load_pudl_table("generators_entity_eia").loc[:,["plant_id_eia", "generator_id","prime_mover_code"]]

In [15]:
gen_entity[gen_entity['plant_id_eia'] == 50410]

Unnamed: 0,plant_id_eia,generator_id,prime_mover_code
17627,50410,7A,CA
17628,50410,7,CT
17629,50410,6A,CA
17630,50410,6,CT
17631,50410,5,ST
35178,50410,#5,ST


In [16]:
gf[gf['plant_id_eia'] == 50410]

Unnamed: 0,plant_id_eia,prime_mover_code,energy_source_code,report_date,net_generation_mwh,fuel_consumed_mmbtu
65606,50410,CA,NG,2020-08-01,0.0,630.0
65607,50410,CA,NG,2020-09-01,0.0,1475.0
65608,50410,CA,NG,2020-10-01,0.0,195.0
65609,50410,CA,NG,2020-11-01,0.0,1964.0
65610,50410,CA,NG,2020-12-01,0.0,7620.0
65611,50410,CT,BIT,2020-07-01,0.0,0.0
65612,50410,CT,BIT,2020-08-01,0.0,0.0
65613,50410,CT,BIT,2020-09-01,0.0,0.0
65614,50410,CT,BIT,2020-10-01,0.0,0.0
65615,50410,CT,BIT,2020-11-01,0.0,0.0


In [17]:
gen[gen['plant_id_eia'] == 50410]

Unnamed: 0,plant_id_eia,generator_id,report_date,net_generation_mwh
24324,50410,6,2020-01-01,
24325,50410,6,2020-02-01,
24326,50410,6,2020-03-01,
24327,50410,6,2020-04-01,
24328,50410,6,2020-05-01,
24329,50410,6,2020-06-01,
24330,50410,6,2020-07-01,0.0
24331,50410,6,2020-08-01,8820.0
24332,50410,6,2020-09-01,7223.0
24333,50410,6,2020-10-01,10159.0


In [14]:
stack_gens = stack_generators(
        gens, cat_col="energy_source_code_num", stacked_col="energy_source_code"
    )

gen_assoc = (
    pd.merge(stack_gens, gen, on=IDX_GENS, how="outer")
    #.pipe(remove_retired_generators)
    .merge(
        gf.groupby(by=IDX_PM_FUEL, as_index=False).sum(min_count=1),
        on=IDX_PM_FUEL,
        suffixes=("_g_tbl", "_gf_tbl"),
        how="outer",
    )
)

gen_assoc = (
        pd.merge(
            gen_assoc,
            gen_assoc.groupby(by=IDX_FUEL)[["capacity_mw", "net_generation_mwh_g_tbl"]]
            .sum(min_count=1)
            .add_suffix("_fuel")
            .reset_index(),
            on=IDX_FUEL,
        )
        .pipe(apply_dtype)
        .pipe(_associate_unconnected_records)
        .pipe(_associate_energy_source_only, gf=gf)
    )

KeyError: "['fuel_consumed_for_electricity_mmbtu'] not in index"

In [15]:
stack_gens[stack_gens['plant_id_eia'] == 61242]

Unnamed: 0,plant_id_eia,generator_id,report_date,capacity_mw,operational_status,retirement_date,prime_mover_code,energy_source_code_num,energy_source_code
21007,10444,LOC1,2020-01-01,80.0,proposed,NaT,PV,energy_source_code_1,SUN
21008,10444,GEN1,2020-01-01,92.0,existing,NaT,ST,energy_source_code_1,SUN
21009,10444,GEN1,2020-01-01,92.0,existing,NaT,ST,energy_source_code_2,NG
57901,10444,LOC1,2020-02-01,80.0,proposed,NaT,PV,energy_source_code_1,SUN
57902,10444,GEN1,2020-02-01,92.0,existing,NaT,ST,energy_source_code_1,SUN
57903,10444,GEN1,2020-02-01,92.0,existing,NaT,ST,energy_source_code_2,NG
94795,10444,LOC1,2020-03-01,80.0,proposed,NaT,PV,energy_source_code_1,SUN
94796,10444,GEN1,2020-03-01,92.0,existing,NaT,ST,energy_source_code_1,SUN
94797,10444,GEN1,2020-03-01,92.0,existing,NaT,ST,energy_source_code_2,NG
131689,10444,LOC1,2020-04-01,80.0,proposed,NaT,PV,energy_source_code_1,SUN


In [32]:
gen_assoc[gen_assoc['plant_id_eia'] == 62562]

Unnamed: 0,plant_id_eia,generator_id,report_date,capacity_mw,operational_status,retirement_date,prime_mover_code,energy_source_code_num,energy_source_code,net_generation_mwh_g_tbl,net_generation_mwh_gf_tbl,fuel_consumed_mmbtu,capacity_mw_fuel,net_generation_mwh_g_tbl_fuel,net_generation_mwh_fuel,fuel_consumed_mmbtu_fuel,capacity_mw_plant,net_generation_mwh_fuel_missing_pm,fuel_consumed_mmbtu_fuel_missing_pm
2826,62562,HILO2,2020-01-01,50.0,proposed,NaT,WT,energy_source_code_1,WND,,,,499.5,,,,549.5,,
2827,62562,HILO,2020-01-01,449.5,proposed,NaT,WT,energy_source_code_1,WND,,,,499.5,,,,549.5,,
2828,62562,BA,2020-01-01,50.0,proposed,NaT,BA,energy_source_code_1,MWH,,,,50.0,,,,549.5,,
39713,62562,HILO2,2020-02-01,50.0,proposed,NaT,WT,energy_source_code_1,WND,,,,499.5,,,,549.5,,
39714,62562,HILO,2020-02-01,449.5,proposed,NaT,WT,energy_source_code_1,WND,,,,499.5,,,,549.5,,
39715,62562,BA,2020-02-01,50.0,proposed,NaT,BA,energy_source_code_1,MWH,,,,50.0,,,,549.5,,
76600,62562,HILO2,2020-03-01,50.0,proposed,NaT,WT,energy_source_code_1,WND,,,,499.5,,,,549.5,,
76601,62562,HILO,2020-03-01,449.5,proposed,NaT,WT,energy_source_code_1,WND,,,,499.5,,,,549.5,,
76602,62562,BA,2020-03-01,50.0,proposed,NaT,BA,energy_source_code_1,MWH,,,,50.0,,,,549.5,,
113487,62562,HILO2,2020-04-01,50.0,proposed,NaT,WT,energy_source_code_1,WND,,,,499.5,,,,549.5,,


In [47]:
stack_gens[stack_gens['plant_id_eia'] == 57846]

Unnamed: 0,plant_id_eia,generator_id,report_date,capacity_mw,operational_status,retirement_date,prime_mover_code,energy_source_code_num,energy_source_code
9523,57846,UNIT2,2020-01-01,1.6,retired,2020-06-01,IC,energy_source_code_1,LFG
9524,57846,UNIT1,2020-01-01,1.6,retired,2020-06-01,IC,energy_source_code_1,LFG
46417,57846,UNIT2,2020-02-01,1.6,retired,2020-06-01,IC,energy_source_code_1,LFG
46418,57846,UNIT1,2020-02-01,1.6,retired,2020-06-01,IC,energy_source_code_1,LFG
83311,57846,UNIT2,2020-03-01,1.6,retired,2020-06-01,IC,energy_source_code_1,LFG
83312,57846,UNIT1,2020-03-01,1.6,retired,2020-06-01,IC,energy_source_code_1,LFG
120205,57846,UNIT2,2020-04-01,1.6,retired,2020-06-01,IC,energy_source_code_1,LFG
120206,57846,UNIT1,2020-04-01,1.6,retired,2020-06-01,IC,energy_source_code_1,LFG
157099,57846,UNIT2,2020-05-01,1.6,retired,2020-06-01,IC,energy_source_code_1,LFG
157100,57846,UNIT1,2020-05-01,1.6,retired,2020-06-01,IC,energy_source_code_1,LFG


In [10]:
year = 2020
pudl_db = 'sqlite:///../data/pudl/pudl_data/sqlite/pudl.sqlite'
pudl_engine = sa.create_engine(pudl_db)

start_date='2020-01-01'
end_date='2020-12-31'

plants_ba = plants_eia860(pudl_engine, start_date=start_date, end_date=end_date)[['plant_id_eia','balancing_authority_code_eia','state']]

plants_ba

Unnamed: 0,plant_id_eia,balancing_authority_code_eia,state
0,1,,AK
1,2,SOCO,AL
2,3,SOCO,AL
3,4,SOCO,AL
5,7,SOCO,AL
...,...,...,...
14435,64876,ISNE,MA
14436,64877,CISO,CA
14437,64878,CISO,CA
14438,64879,CISO,CA


In [2]:
gen_allocated = allocate_gen_fuel_by_gen(year=2020)

        plant_id_eia prime_mover_code energy_source_code report_date  frac  \
3413             377               CA                 NG  2020-04-01   2.0   
3414             377               CT                 NG  2020-04-01   2.0   
3441             377               CA                 NG  2020-11-01   2.0   
3442             377               CT                 NG  2020-11-01   2.0   
3445             377               CA                 NG  2020-12-01   2.0   
...              ...              ...                ...         ...   ...   
104483         58207               CA                 NG  2020-03-01   4.0   
104487         58207               CA                 NG  2020-04-01   4.0   
104495         58207               CA                 NG  2020-06-01   4.0   
104515         58207               CA                 NG  2020-11-01   4.0   
104519         58207               CA                 NG  2020-12-01   4.0   

        net_generation_mwh_g_tbl  frac_fuel  net_generation_mwh

In [6]:
gen_allocated[gen_allocated['plant_id_eia'] == 3].sum()

  gen_allocated[gen_allocated['plant_id_eia'] == 3].sum()


plant_id_eia                                                         360
plant_id_pudl                                                       3840
plant_name_eia         BarryBarryBarryBarryBarryBarryBarryBarryBarryB...
utility_id_eia                                                   23400.0
utility_id_pudl                                                     2160
utility_name_eia       Alabama Power CoAlabama Power CoAlabama Power ...
generator_id           1245A1CTA1CT2A1STA2C1A2C2A2ST1245A1CTA1CT2A1ST...
fuel_consumed_mmbtu                                           82061242.0
net_generation_mwh                                          10499146.011
unit_id_pudl                                                       612.0
dtype: object

In [11]:
year = 2020

IDX_GENS = ["plant_id_eia", "generator_id", "report_date"]
"""Id columns for generators."""

IDX_PM_FUEL = ["plant_id_eia", "prime_mover_code", "energy_source_code", "report_date"]
"""Id columns for plant, prime mover & fuel type records."""

IDX_FUEL = ["report_date", "plant_id_eia", "energy_source_code"]

DATA_COLS = ["net_generation_mwh", "fuel_consumed_mmbtu"]
"""Data columns from generation_fuel_eia923 that are being allocated."""

pudl_db = 'sqlite:///../data/pudl/pudl_data/sqlite/pudl.sqlite'
pudl_engine = sa.create_engine(pudl_db)

# specify the date filter for retrieving data
year_filter = f"report_date >= '{year}-01-01' AND report_date <= '{year}-12-01'"

gens = load_data.load_pudl_table(f"SELECT * FROM generators_eia860 WHERE {year_filter}").loc[
    :,
    IDX_GENS
    + [
        "capacity_mw",
        "operational_status",
        "retirement_date",
    ]
    + list(load_data.load_pudl_table(f"SELECT * FROM generators_eia860 WHERE {year_filter}").filter(like="energy_source_code")),
]
# add the prime mover code to the gens df from generators entity
gens = gens.merge(load_data.load_pudl_table("generators_entity_eia").loc[:,["plant_id_eia", "generator_id","prime_mover_code"]],
                    how='left', 
                    on=["plant_id_eia", "generator_id"]).pipe(apply_dtype)

existing = gens.loc[(gens.operational_status == "existing")]
# keep the gens for each month until they retire, if they have any data to report in that month
retiring = gens.loc[
    (gens.operational_status == "retired")
    & (gens.report_date <= gens.retirement_date)
]

gens = pd.concat([existing, retiring])

In [12]:
gens[gens['plant_id_eia'] == 57846]

Unnamed: 0,plant_id_eia,generator_id,report_date,capacity_mw,operational_status,retirement_date,energy_source_code_1,energy_source_code_2,energy_source_code_3,energy_source_code_4,energy_source_code_5,energy_source_code_6,planned_energy_source_code_1,prime_mover_code
9151,57846,UNIT2,2020-01-01,1.6,retired,2020-06-01,LFG,,,,,,,IC
9152,57846,UNIT1,2020-01-01,1.6,retired,2020-06-01,LFG,,,,,,,IC


In [47]:
geo_in_eia = gens[gens['energy_source_code_1'] == 'GEO'].groupby('plant_id_eia')['prime_mover_code'].unique().astype(str).reset_index()
eia_geo_plants = list(geo_in_eia.plant_id_eia.unique())
geo_in_eia

Unnamed: 0,plant_id_eia,prime_mover_code
0,286,['ST']
1,299,['ST']
2,510,['ST']
3,902,['ST']
4,7368,['ST']
...,...,...
60,60419,['BT']
61,60785,['BT']
62,61912,['BT']
63,63001,['BT']


In [2]:
# update teh geothermal geotypes with new generators
geothermal_geotype = pd.read_csv('../data/egrid/egrid_static_tables/table_geothermal_geotype.csv')
epa_geo_plants = list(geothermal_geotype.plant_id_eia.unique())
geothermal_geotype

Unnamed: 0,plant_id_eia,geotype_code,geotype_description,notes
0,286,S,Steam,
1,299,F,Flash,
2,510,S,Steam,
3,902,S,Steam,
4,7368,S,Steam,
...,...,...,...,...
71,58211,B,Binary,
72,58319,B,Binary,2014 860 Generator File lists one generator fo...
73,58533,B,Binary,
74,58570,B,Binary,2014 860 Generator File lists 2 generators for...


In [5]:
steam_geo = list(geothermal_geotype[geothermal_geotype['geotype_code'] == 'S']['plant_id_eia'])

In [7]:
plants = load_data.load_pudl_table("plants_entity_eia")

In [8]:
plants[plants['plant_id_eia'].isin(steam_geo)]

Unnamed: 0,plant_id_eia,plant_name_eia,balancing_authority_code_eia,balancing_authority_name_eia,city,county,ferc_cogen_status,ferc_exempt_wholesale_generator,ferc_small_power_producer,grid_voltage_kv,...,iso_rto_code,latitude,longitude,primary_purpose_id_naics,sector_name_eia,sector_id_eia,state,street_address,zip_code,timezone
187,286,Geysers Unit 5-20,CISO,California Independent System Operator,Middletown,Sonoma,False,True,False,230.0,...,CAISO,38.777,-122.745,22.0,IPP Non-CHP,2.0,CA,10350 Socrates Mine Road,95461,America/Los_Angeles
375,510,Calpine Geysers-Sonoma,CISO,California Independent System Operator,Middletown,Lake,False,False,True,230.0,...,CAISO,38.7903,-122.7559,22.0,IPP Non-CHP,2.0,CA,10350 Socrates Mine Road,95461,America/Los_Angeles
630,902,Bottle Rock Power,CISO,California Independent System Operator,Cobb,Lake,False,False,False,230.0,...,CAISO,38.8348,-122.7677,22.0,IPP Non-CHP,2.0,CA,7385 High Valley Road,95426,America/Los_Angeles
2815,7368,Geothermal 1,CISO,California Independent System Operator,Middletown,Sonoma,False,False,False,230.0,...,CAISO,38.752,-122.7195,22.0,Electric Utility,1.0,CA,12000 Ridge Road,95461,America/Los_Angeles
2816,7369,Geothermal 2,CISO,California Independent System Operator,Middletown,Sonoma,False,False,False,230.0,...,CAISO,38.7492,-122.7117,22.0,Electric Utility,1.0,CA,12000 Ridge Road,95461,America/Los_Angeles
3422,10199,West Ford Flat,CISO,California Independent System Operator,Middletown,Lake,False,False,True,230.0,...,CAISO,38.7881,-122.7219,22.0,IPP Non-CHP,2.0,CA,10350 Socrates Mine Road,95461,America/Los_Angeles
3573,10469,Bear Canyon,CISO,California Independent System Operator,Middletown,Lake,False,False,True,230.0,...,CAISO,38.762968,-122.6929,22.0,IPP Non-CHP,2.0,CA,10350 Scorates Mine Road,95461,America/Los_Angeles
3871,50066,Calistoga,CISO,California Independent System Operator,Middletown,Lake,False,False,True,230.0,...,CAISO,38.7879,-122.7434,22.0,IPP Non-CHP,2.0,CA,10350 Socrates Mine Road,95461,America/Los_Angeles
4523,52158,Aidlin Geothermal,CISO,California Independent System Operator,Middletown,Sonoma,False,False,True,115.0,...,CAISO,38.8339,-122.881,22.0,IPP Non-CHP,2.0,CA,10350 Socrates Mine Road,95425,America/Los_Angeles


In [50]:
geo_not_in_epa = list(set(eia_geo_plants) - set(epa_geo_plants))

geo_in_eia[geo_in_eia['plant_id_eia'].isin(geo_not_in_epa)]

Unnamed: 0,plant_id_eia,prime_mover_code
59,59382,['ST']
60,60419,['BT']
61,60785,['BT']
62,61912,['BT']
63,63001,['BT']
64,63365,['BT']


In [152]:
def calculate_geothermal_emission_factors():
    """
    Updates the list of geothermal plants provided by EPA using EIA data
    Calculates a weighted average EF for each plant-month based on the fraction 
    of fuel consumed from each type of prime mover (steam, binary, flash)
    """

    # load the eia generation fuel data
    generation_fuel_eia923 = pd.read_sql(f"SELECT * FROM generation_fuel_eia923 WHERE report_date >= '{year}-01-01' AND report_date <= '{year}-12-01'", pudl_engine)

    # create a dataframe of total heat input by prime mover for each geothermal plant
    geo_in_eia = generation_fuel_eia923[generation_fuel_eia923['energy_source_code'] == 'GEO'].groupby(['plant_id_eia','prime_mover_code','report_date']).sum()['fuel_consumed_mmbtu'].reset_index()
    # remove prime movers for which there was no heat input
    geo_in_eia = geo_in_eia[geo_in_eia['fuel_consumed_mmbtu'] > 0]

    # merge in the EPA's assigned Geotype
    geothermal_geotype = pd.read_csv('../data/egrid/egrid_static_tables/table_geothermal_geotype.csv')
    geo_in_eia = geo_in_eia.merge(geothermal_geotype[['plant_id_eia','geotype_code']], how='left', on='plant_id_eia')

    # identify plants with multiple prime mover types
    multi_type_plants = geo_in_eia.groupby(['plant_id_eia','prime_mover_code']).count().reset_index().groupby('plant_id_eia').count()['prime_mover_code']
    multi_type_plants = multi_type_plants[multi_type_plants > 1]
    multi_type_plants = list(multi_type_plants.index)

    #update the geotype codes for plants with multiple types
    # for plants identified as flash steam that also have a binary component, update to binary
    geo_in_eia.loc[(geo_in_eia['plant_id_eia'].isin(multi_type_plants)) & (geo_in_eia['geotype_code'] == 'F') & (geo_in_eia['prime_mover_code'] == 'BT'),'geotype_code'] = 'B'
    # for plants identified as binary that also have a steam component, update to flash (it seems that all other multi-types are F/B combinatioms)
    geo_in_eia.loc[(geo_in_eia['plant_id_eia'].isin(multi_type_plants)) & (geo_in_eia['geotype_code'] == 'B') & (geo_in_eia['prime_mover_code'] == 'ST'),'geotype_code'] = 'F'

    # if EPA assigned a plant as flash or steam, but EIA identified it as binary, re-assign as binary
    geo_in_eia.loc[(geo_in_eia['prime_mover_code'] == 'BT') & (geo_in_eia['geotype_code'].isin(['F','S'])), 'geotype_code'] = 'B'

    # if EPA assigned a plant as binary, but EIA identified it as a steam turbine, re-assign as flash
    # we use flash instead of steam, b/c flash is more common than steam according to EIA
    # Source: https://www.eia.gov/energyexplained/geothermal/geothermal-power-plants.php
    geo_in_eia.loc[(geo_in_eia['prime_mover_code'] == 'ST') & (geo_in_eia['geotype_code'].isin(['B'])), 'geotype_code'] = 'F'

    # where plants are missing a geotype code, assign based on the EIA-identified prime mover
    geo_in_eia.loc[(geo_in_eia['geotype_code'].isna()) & (geo_in_eia['prime_mover_code'] == 'BT'), 'geotype_code'] = 'B'
    geo_in_eia.loc[(geo_in_eia['geotype_code'].isna()) & (geo_in_eia['prime_mover_code'] == 'ST'), 'geotype_code'] = 'F'

    # calculate the fraction of heat input from each prime mover in each month
    fuel_frac = (geo_in_eia.set_index(['plant_id_eia','report_date','geotype_code'])[['fuel_consumed_mmbtu']] / geo_in_eia.groupby(['plant_id_eia','report_date']).sum()).reset_index()
    fuel_frac = fuel_frac.rename(columns={'fuel_consumed_mmbtu': 'fuel_frac'})
    geo_in_eia = geo_in_eia.merge(fuel_frac, how='left', on=['plant_id_eia','report_date','geotype_code'])

    # calculate a weighted average emission factor for each plant

    # load geothermal efs
    geothermal_efs = pd.read_csv('../data/egrid/egrid_static_tables/table_C6_geothermal_emission_factors.csv')[['geotype_code','co2_lb_per_mmbtu']]
    # convert lb to ton
    geothermal_efs['co2_tons_per_mmbtu'] = geothermal_efs['co2_lb_per_mmbtu'] / 2000
    geothermal_efs = geothermal_efs[['geotype_code','co2_tons_per_mmbtu']]
    # merge in the emission factor
    geo_in_eia = geo_in_eia.merge(geothermal_efs, how='left', on='geotype_code')
    # multiply the emission factor by the fraction
    geo_in_eia['co2_tons_per_mmbtu'] = geo_in_eia['fuel_frac'] * geo_in_eia['co2_tons_per_mmbtu']  

    # groupby plant and month to get the weighted emission factor
    geo_in_eia = geo_in_eia.groupby(['plant_id_eia','report_date']).sum()['co2_tons_per_mmbtu'].reset_index()

    # if there are any plants missing from our list, add them back in

    # identify the plants that are in the epa geotype table but not the EIA-derived one
    epa_geo_plants = list(geothermal_geotype.plant_id_eia.unique())
    plants_from_eia = list(geo_in_eia.plant_id_eia.unique())
    missing_plants = list(set(epa_geo_plants) - set(plants_from_eia))

    # create a dataframe with the geotype of all misisng plants
    missing_plants = geothermal_geotype.loc[geothermal_geotype['plant_id_eia'].isin(missing_plants), ['plant_id_eia','geotype_code']]

    # merge in the efs
    missing_plants = missing_plants.merge(geothermal_efs, how='left', on='geotype_code')

    # drop the geotype code
    missing_plants = missing_plants.drop(columns=['geotype_code'])

    # create a record for each month of the year
    missing_plants = create_monthly_gens_records(missing_plants, year)

    # concat the missing plants to the other dataframe
    geo_efs = pd.concat([geo_in_eia, missing_plants], axis=0)

    return geo_efs


In [153]:
geo_efs = calculate_geothermal_emission_factors()
geo_efs

Unnamed: 0,plant_id_eia,report_date,co2_tons_per_mmbtu
0,286,2020-01-01,0.013013
1,286,2020-02-01,0.013013
2,286,2020-03-01,0.013013
3,286,2020-04-01,0.013013
4,286,2020-05-01,0.013013
...,...,...,...
16,57181,2020-12-01 00:00:00,0.000000
17,57477,2020-12-01 00:00:00,0.000000
18,57478,2020-12-01 00:00:00,0.000000
19,57479,2020-12-01 00:00:00,0.000000


# Identify mismatches between 923 tables

In [3]:
import sqlalchemy as sa
# specify the year for analysis
year = 2020

# specify the relative path to the sqllite database, and create an sqalchemy engine
pudl_db = 'sqlite:///../data/pudl/pudl_data/sqlite/pudl.sqlite'
pudl_engine = sa.create_engine(pudl_db)

# load the generation fuel data for the year
generation_eia923 = pd.read_sql(f"SELECT * FROM generation_eia923 WHERE report_date >= '{year}-01-01' AND report_date <= '{year}-12-01'", pudl_engine)
generation_fuel_eia923 = pd.read_sql(f"SELECT * FROM generation_fuel_eia923 WHERE report_date >= '{year}-01-01' AND report_date <= '{year}-12-01'", pudl_engine)
boiler_fuel_eia923 = pd.read_sql(f"SELECT * FROM boiler_fuel_eia923 WHERE report_date >= '{year}-01-01' AND report_date <= '{year}-12-01'", pudl_engine)

In [8]:
gen_agg = generation_eia923.groupby(['plant_id_eia']).sum().reset_index()
gf_agg = generation_fuel_eia923.groupby(['plant_id_eia']).sum()['net_generation_mwh'].reset_index()

In [7]:
generation_eia923[generation_eia923['plant_id_eia'] == 64408]

Unnamed: 0,plant_id_eia,generator_id,report_date,net_generation_mwh
45996,64408,WEG,2020-01-01,
45997,64408,WEG,2020-02-01,
45998,64408,WEG,2020-03-01,
45999,64408,WEG,2020-04-01,
46000,64408,WEG,2020-05-01,
46001,64408,WEG,2020-06-01,
46002,64408,WEG,2020-07-01,
46003,64408,WEG,2020-08-01,
46004,64408,WEG,2020-09-01,
46005,64408,WEG,2020-10-01,


In [6]:
generation_fuel_eia923[generation_fuel_eia923['plant_id_eia'] == 64408]

Unnamed: 0,plant_id_eia,report_date,energy_source_code,fuel_type_code_pudl,fuel_type_code_aer,prime_mover_code,fuel_consumed_units,fuel_consumed_for_electricity_units,fuel_mmbtu_per_unit,fuel_consumed_mmbtu,fuel_consumed_for_electricity_mmbtu,net_generation_mwh
171127,64408,2020-01-01,WDS,waste,WWW,ST,10246.0,7016.0,8.5,87091.0,59633.0,4718.576
171128,64408,2020-02-01,WDS,waste,WWW,ST,9236.0,6324.0,8.5,78506.0,53752.0,4253.165
171129,64408,2020-03-01,WDS,waste,WWW,ST,9432.0,6458.0,8.5,80172.0,54889.0,4343.15
171130,64408,2020-04-01,WDS,waste,WWW,ST,9156.0,6268.0,8.5,77826.0,53282.0,4215.988
171131,64408,2020-05-01,WDS,waste,WWW,ST,8961.0,6135.0,8.5,76169.0,52149.0,4126.361
171132,64408,2020-06-01,WDS,waste,WWW,ST,9378.0,6420.0,8.5,79713.0,54573.0,4318.131
171133,64408,2020-07-01,WDS,waste,WWW,ST,10448.0,7153.0,8.5,88808.0,60804.0,4811.222
171134,64408,2020-08-01,WDS,waste,WWW,ST,10276.0,7035.0,8.5,87346.0,59800.0,4731.747
171135,64408,2020-09-01,WDS,waste,WWW,ST,9050.0,6196.0,8.5,76925.0,52667.0,4167.334
171136,64408,2020-10-01,WDS,waste,WWW,ST,8611.0,5895.0,8.5,73194.0,50110.0,3964.998


In [10]:
compare = gen_agg.merge(gf_agg, how='inner', on=['plant_id_eia'], suffixes=('_g','_gf'))
compare[(compare['net_generation_mwh_g'].round(0) != compare['net_generation_mwh_gf'].round(0)) & (compare['net_generation_mwh_g'] > 0)]

Unnamed: 0,plant_id_eia,net_generation_mwh_g,net_generation_mwh_gf
2,10,1023602.00,1071142.000
3,26,3796813.00,3796945.013
12,117,3573136.00,3604513.000
13,120,246777.00,416760.000
14,126,955452.00,1463470.760
...,...,...,...
1353,58330,1388.07,57653.790
1367,58697,4013694.00,4078353.000
1372,59035,48657.00,48797.140
1373,59073,473149.00,617727.000


In [23]:
boiler_fuel_eia923['fuel_consumed_mmbtu'] = boiler_fuel_eia923['fuel_consumed_units'] * boiler_fuel_eia923['fuel_mmbtu_per_unit']

In [26]:
gf_agg_f = generation_fuel_eia923.groupby(['plant_id_eia','report_date']).sum()['fuel_consumed_mmbtu'].reset_index()
bf_agg = boiler_fuel_eia923.groupby(['plant_id_eia','report_date']).sum()['fuel_consumed_mmbtu'].reset_index()
compare = gf_agg_f.merge(bf_agg, how='inner', on=['plant_id_eia','report_date'], suffixes=('_gf','_bf'))
compare[(compare['fuel_consumed_mmbtu_gf'].round(0) != compare['fuel_consumed_mmbtu_bf'].round(0)) & (compare['fuel_consumed_mmbtu_bf'] > 0)]

Unnamed: 0,plant_id_eia,report_date,fuel_consumed_mmbtu_gf,fuel_consumed_mmbtu_bf
1,3,2020-03-01,5962165.0,5962165.523
2,3,2020-04-01,7602660.0,7602719.481
3,3,2020-05-01,6083966.0,6083965.338
7,3,2020-09-01,8711075.0,8711156.079
8,3,2020-10-01,8808784.0,8808714.970
...,...,...,...,...
15835,63923,2020-12-01,108548.0,1127157.200
15847,63924,2020-12-01,97011.0,1120503.280
15859,63927,2020-12-01,100708.0,1088467.120
15861,64020,2020-06-01,872309.0,872308.500


In [10]:
generation_fuel_eia923[(generation_fuel_eia923['plant_id_eia'] == 3)].sum()

plant_id_eia                                                                         165
report_date                            2020-02-012020-03-012020-04-012020-05-012020-0...
energy_source_code                     NGNGNGNGNGNGNGNGNGNGNGNGNGNGNGNGNGNGNGNGNGNGBI...
fuel_type_code_pudl                    gasgasgasgasgasgasgasgasgasgasgasgasgasgasgasg...
fuel_type_code_aer                     NGNGNGNGNGNGNGNGNGNGNGNGNGNGNGNGNGNGNGNGNGNGCO...
prime_mover_code                       CACACACACACACACACACACACTCTCTCTCTCTCTCTCTCTCTST...
fuel_consumed_units                                                           56086082.0
fuel_consumed_for_electricity_units                                           56086082.0
fuel_mmbtu_per_unit                                                              258.481
fuel_consumed_mmbtu                                                           82061242.0
fuel_consumed_for_electricity_mmbtu                                           82061242.0
net_generation_mwh   

In [10]:
generation_fuel_eia923[(generation_fuel_eia923['plant_id_eia'] == 404) & (generation_fuel_eia923['report_date'] == '2020-01-01')]

Unnamed: 0,plant_id_eia,report_date,energy_source_code,fuel_type_code_pudl,fuel_type_code_aer,prime_mover_code,fuel_consumed_units,fuel_consumed_for_electricity_units,fuel_mmbtu_per_unit,fuel_consumed_mmbtu,fuel_consumed_for_electricity_mmbtu,net_generation_mwh
4098,404,2020-01-01,NG,gas,NG,CA,0.0,0.0,0.0,0.0,0.0,-3.0
4110,404,2020-01-01,OG,gas,OOG,CA,0.0,0.0,0.0,0.0,0.0,0.0
4122,404,2020-01-01,NG,gas,NG,CT,0.0,0.0,0.0,0.0,0.0,0.0
4134,404,2020-01-01,OG,gas,OOG,CT,0.0,0.0,0.0,0.0,0.0,0.0
4146,404,2020-01-01,NG,gas,NG,GT,158421.0,158421.0,1.044,165392.0,165392.0,17618.0
4158,404,2020-01-01,OBG,waste,ORW,GT,0.0,0.0,0.0,0.0,0.0,0.0
4170,404,2020-01-01,NG,gas,NG,ST,26241.0,26241.0,1.044,27396.0,27396.0,2288.0
4182,404,2020-01-01,OBG,waste,ORW,ST,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
import src.load_data as load_data
import sqlalchemy as sa

year = 2020

IDX_GENS = ["plant_id_eia", "generator_id", "report_date"]
"""Id columns for generators."""

IDX_PM_FUEL = ["plant_id_eia", "prime_mover_code", "energy_source_code", "report_date"]
"""Id columns for plant, prime mover & fuel type records."""

IDX_FUEL = ["report_date", "plant_id_eia", "energy_source_code"]

DATA_COLS = ["net_generation_mwh", "fuel_consumed_mmbtu"]
"""Data columns from generation_fuel_eia923 that are being allocated."""

pudl_db = 'sqlite:///../data/pudl/pudl_data/sqlite/pudl.sqlite'
pudl_engine = sa.create_engine(pudl_db)

# specify the date filter for retrieving data
year_filter = f"report_date >= '{year}-01-01' AND report_date <= '{year}-12-01'"

# extract all of the tables from pudl_out early in the process and select
# only the columns we need. this is for speed and clarity.
gf = load_data.load_pudl_table(f"SELECT * FROM generation_fuel_eia923 WHERE {year_filter}").loc[
    :, IDX_PM_FUEL + ["net_generation_mwh", "fuel_consumed_mmbtu"]
].pipe(apply_dtype)
gen = (
    load_data.load_pudl_table(f"SELECT * FROM generation_eia923 WHERE {year_filter}").loc[:, IDX_GENS + ["net_generation_mwh"]]
    # removes 4 records with NaN generator_id as of pudl v0.5
    .dropna(subset=IDX_GENS)
).pipe(apply_dtype)
gens = load_data.load_pudl_table(f"SELECT * FROM generators_eia860 WHERE {year_filter}").loc[
    :,
    IDX_GENS
    + [
        "capacity_mw",
        "operational_status",
        "retirement_date",
    ]
    + list(load_data.load_pudl_table(f"SELECT * FROM generators_eia860 WHERE {year_filter}").filter(like="energy_source_code")),
]
# add the prime mover code to the gens df from generators entity
gens = gens.merge(load_data.load_pudl_table("generators_entity_eia").loc[:,["plant_id_eia", "generator_id","prime_mover_code"]],
                    how='left', 
                    on=["plant_id_eia", "generator_id"]).pipe(apply_dtype)

In [30]:
gf[gf['net_generation_mwh'] < 0]

Unnamed: 0,plant_id_eia,prime_mover_code,energy_source_code,report_date,net_generation_mwh,fuel_consumed_mmbtu
46,3,ST,BIT,2020-02-01,-152.343,1344.0
57,3,ST,NG,2020-02-01,-3391.657,29922.0
95,7,ST,NG,2020-05-01,-397.000,0.0
96,7,ST,NG,2020-06-01,-335.000,0.0
97,7,ST,NG,2020-07-01,-341.000,0.0
...,...,...,...,...,...,...
170950,64295,BA,MWH,2020-12-01,-20.892,0.0
170951,64296,BA,MWH,2020-11-01,-15.424,0.0
170952,64296,BA,MWH,2020-12-01,-14.576,0.0
170953,64297,BA,MWH,2020-12-01,-4.000,0.0


In [None]:
# duplicate the entries for each month
gens_month = gens.copy()

month = 2
while month <= 12:
    # add one month to the copied data each iteration
    gens_month['report_date'] = gens_month['report_date'] + pd.DateOffset(months=1)
    # concat this data to the gens file
    gens = pd.concat([gens, gens_month], axis = 0)
    month += 1

