# Validate EIA-930 data against net generation outputs

In [1]:
import pandas as pd
import numpy as np
import os

import plotly.express as px
import plotly.io as pio
from datetime import datetime
from datetime import timedelta
import json

import requests

In [2]:
year = 2020

In [3]:
# EIA-930 data after timestamp adjustments but no cleaning
raw = pd.read_csv("../data/outputs/2020/eia930/eia930_raw.csv", index_col=0, parse_dates=True)

In [4]:
GEN_ID = "EBA.{}-ALL.NG.H"
path = f"../data/results/{year}/power_sector_data/hourly/us_units/"
cors = {}
percent_difs = {}
annual_gen = {}
for ba_f in os.listdir(path):
    ba = ba_f.replace(".csv", "")
    print(ba, end="...")
    col_name = GEN_ID.format(ba)
    if col_name not in raw.columns: 
        continue
    else:
        dat = pd.read_csv(path+ba_f, parse_dates=["datetime_utc"])
        dat = dat[dat.fuel_category=="total"]
        dat = dat.merge(raw[ col_name], left_on="datetime_utc", right_index=True)
        c = dat[["net_generation_mwh", col_name]].corr().to_numpy()[0,1]
        cors[ba] = c
        difs = (dat[col_name]-dat["net_generation_mwh"])/dat["net_generation_mwh"]
        difs = difs.replace(np.inf, np.nan)
        percent_difs[ba] = difs.median()
        annual_gen[ba] = dat["net_generation_mwh"].sum()

OHMS...FMPP...ERCO...TPWR...AMPL...SOCO...IPCO...PJM...WWA...EEI...AZPS....DS_Store...TEC...DOPD...YAD...IID...HGMA...DEAA...CPLW...SPA...GVL...ORMS...FPL...TAL...SEC...JEA...GCPD...TVA...TIDC...HECO...SCEG...INMS...CPLE...SEPA...AKMS...CHPD...PNM...WAUW...WACM...WALC...NWMT...PACE...AVA...SC...NSB...GWA...HIMS...LGEE...TEPC...ISNE...SRP...GRIS...HST...LDWP...RIMS...FPC...PACW...GRIF...PSEI...AECI...DUK...AVRN...CEA...MISO...AEC...PSCO...SWPP...BANC...NYIS...EPE...NBSO...SCL...BPAT...NEVP...CISO...PGE...

In [120]:
out = pd.DataFrame(data={"Difference as percent of hourly-egrid":percent_difs, "Correlation":cors, "Annual BA generation":annual_gen})
out = out.sort_values("Annual BA generation", ascending=False)
out.to_csv(f"../data/results/{year}/validation_metrics/us_units/compare_930_hourlyegrid.csv")

# Visualize BA of interest

In [121]:
ba = "BPAT"
col_name = GEN_ID.format(ba)
dat = pd.read_csv(path+ba+".csv", parse_dates=["datetime_utc"])
dat = dat[dat.fuel_category=="total"]
dat = dat.merge(raw[ col_name], left_on="datetime_utc", right_index=True)

px.line(dat, x="datetime_utc", y=["net_generation_mwh", col_name])

# ~~Validate against carbon intensity and emissions data~~

Scraping data is too slow (1 hour per BA) and users without a paid API plan won't have enough requests. Instead, re-calculate rates from EIA-930 using Singularity per-fuel emission factors (derived from eGRID) 

* Scrape data from Singularity if not already downloaded
* Compare

Data location: 

`data/downloads/singularity/\<year\>_generated_rate.csv`

`data/downloads/singularity/\<year\>_consumed_rate.csv`

To scrape data, env var SINGULARITY_API_KEY should be set 

In [122]:
# year = 2020
# ba = "ISNE"


In [123]:
# EIA_REGIONS = { # regions for which singularity name is prefixed by EIA.
#     'BPAT',
#     'CISO',
#     'ISNE',
#     'MISO',
#     'NYIS',
#     'PJM',
#     'SWPP',
# }

# def download_singularity_carbon_intensities(ba, year):
#     # TODO GAILIN REMOVE API KEY

#     os.makedirs(f"../data/downloads/singularity/{year}", exist_ok=True)
#     fpath = f"../data/downloads/singularity/{year}/{ba}_rate.csv"
#     if not os.path.exists(fpath): 
#         url = "https://api.singularity.energy/v1/region_events/search"
#         headers = {
#             'X-Api-Key': os.environ['SINGULARITY_API_KEY'],
#         }
#         params = {
#             'event_type': "carbon_intensity",
#             'region': ("EIA." + ba) if ba in EIA_REGIONS else ba
#         }

#         dat = {}

#         # Iterate through weeks in year 
#         start = datetime(year=year, month=1, day=1, hour=0, minute=0)
#         while start.year == year:
#             end = start + timedelta(weeks=1) 
#             # Format params for search 
#             params["start"] = start.strftime("%Y-%m-%dT%H:%MZ")
#             params["end"] = end.strftime("%Y-%m-%dT%H:%MZ")
#             # Show some progress 
#             #print(start.strftime("%Y-%m-%d"), end="...")
#             # Update start 
#             start = end 

#             next = 1
#             last = 1

#             # Get all pages of data for this week
#             while next <= last: 
#                 params['page'] = next
#                 response = requests.request("GET", url, headers=headers, params=params)
#                 if len(response.json()['data']) == 0:
#                     print(f"Found no data for {params['region']}, will not create file")
#                     return
#                 if "next" in response.json()['meta']['pagination']: 
#                     next = response.json()['meta']['pagination']['next']
#                 else: 
#                     next = next + 1
#                 last = response.json()['meta']['pagination']['last']
#                 for d in response.json()['data']:
#                     # Only bother if this is real data, not infilled five minute intervals 
#                     if d['start_date'][-11:-9] != "00":
#                         continue
#                     # Only bother if eGRID source year is 2019: some data is computed using 2018 factors
#                     try:
#                         if "2019" not in d['meta']['generated_emissions_source']:
#                             continue
#                     except:
#                         print(f"Failed with response {d}")
#                         raise Exception

#                     # Parse dat to consistent names based on source 
#                     this_dat = dat.get(d['start_date'], {})
#                     try:
#                         for accounting_type in ["consumed","generated"]:
#                             if f"{accounting_type}_rate" not in d["data"].keys():
#                                 continue
#                             if "eq" in d["meta"]["generated_emissions_source"]: # CO2 equivalent
#                                 this_dat[f"{accounting_type}_eq"] = d["data"][f"{accounting_type}_rate"]
#                             elif "u" in d["meta"]["generated_emissions_source"]: # unadjusted
#                                 this_dat[f"{accounting_type}_u"] = d["data"][f"{accounting_type}_rate"]
#                             else: # normal adjustments
#                                 this_dat[f"{accounting_type}"] = d["data"][f"{accounting_type}_rate"]
#                         # Index by start date
#                         dat[d['start_date']] = this_dat
                        
#                     except:
#                         print(f"Failed on \n {d} \n\n")
#                         raise Exception
         
#         ba_dat = pd.DataFrame(dat).transpose()
#         ba_dat.index = pd.to_datetime(ba_dat.index)
#         ba_dat.to_csv(fpath)
#     else: 
#         print(f"File {fpath} exists")




In [None]:
# for ba in os.listdir("../data/results/2020/power_sector_data/hourly/us_units/"):
#     ba = ba.replace(".csv","")
#     print(ba, end="...")
#     download_singularity_carbon_intensities(ba, 2020)

In [None]:
# for ba in os.listdir("../data/results/2019/power_sector_data/hourly/us_units/"):
#     ba = ba.replace(".csv","")
#     print(ba)
#     download_singularity_carbon_intensities(ba, 2019)

# Alternative from above: calculate real-time-rates from 930 + eGRID

In [125]:
eia930 = pd.read_csv(f"../data/outputs/{year}/eia930/eia930_raw.csv", parse_dates=True, index_col=0)

In [126]:
## Load factors from Singularity API

# Use last year's egrid because that's all we have in real time
# TODO: could expand to other pollutants if we use eGRID download 
url = f"https://api.singularity.energy/v1/emissions/" 
egrid_year = str(year-1) # use last year as eGRID year

headers = {
    'X-Api-Key': os.environ['SINGULARITY_API_KEY'],
}

factors = {}

for adjustment in ["adjusted", "unadjusted"]: 
    adjusted = adjustment == "adjusted"
    key = f"EGRID_{egrid_year}" if adjusted else f"EGRID_u{egrid_year}"
    response = requests.request("GET", url+key, headers=headers)
    factors[adjustment] = json.loads(response.content)["data"]


KeyError: 'SINGULARITY_API_KEY'

In [127]:
## For each BA, use singularity factors to calculate emission rate 
bas_to_calc = [ba.replace(".csv", "") for ba in os.listdir("../data/results/2020/power_sector_data/hourly/us_units/")]

fuel_categories = {
    "coal":"COL",
    "natural_gas":"NG",
    "other":"OTH",
    "hydro":"WAT",
    "wind":"WND",
    "solar":"SUN",
    "nuclear":"NUC",
    "petroleum":"OIL"
}

for ba in bas_to_calc:
    singularity_ba = "EIA." + ba if ba in EIA_REGIONS else ba
    if singularity_ba not in factors[adjustment].keys():
        print(f"missing ba {singularity_ba}")
        continue

    out = pd.DataFrame(index=eia930.index, columns=["adjusted_carbon","unajusted_carbon", "adjusted_rate", "unadjusted_rate"])

    for adjustment in ["adjusted", "unadjusted"]:
        s_fuels = list(factors[adjustment][singularity_ba].keys())
        s_factors = [factors[adjustment][singularity_ba][f]['value'] for f in s_fuels]
        fuels = [fuel_categories[f] for f in s_fuels]
        generation_labels = [f"EBA.{ba}-ALL.NG.{f}.H" for f in fuels]

        out.loc[:,f"{adjustment}_carbon"] = eia930[generation_labels].mul(s_factors, axis='columns').sum(axis='columns')
        out.loc[:,f"{adjustment}_rate"] = out.loc[:,f"{adjustment}_carbon"] / eia930.loc[:,f"EBA.{ba}-ALL.NG.H"]

    os.makedirs(f"../data/outputs/{year}/validation/real_time_rate/", exist_ok=True)
    out.to_csv(f"../data/outputs/{year}/validation/real_time_rate/{ba}.csv")


missing ba OHMS
missing ba AMPL
missing ba EEI
missing ba .DS_Store
missing ba ORMS
missing ba JEA
missing ba HECO
missing ba INMS
missing ba AKMS
missing ba NSB
missing ba HIMS
missing ba GRIS
missing ba RIMS
missing ba CEA
missing ba NBSO


# Rate: correlations and percent differences

Evaluation of rates

In [5]:
gen_path = f"../data/results/{year}/power_sector_data/hourly/us_units/"
consumed_path = f"../data/results/{year}/carbon_accounting/hourly/us_units/"

In [129]:
year = 2020

In [130]:
percent_difs = {}
cors = {}
for ba in os.listdir(f"../data/outputs/{year}/validation/real_time_rate/"):
    if ba == ".DS_Store": # just some os stuff
        continue 
    ba = ba.replace(".csv", "")
    singularity_dat = pd.read_csv(f"../data/outputs/{year}/validation/real_time_rate/{ba}.csv", index_col=0, parse_dates=True)
    # hourly_consumed = pd.read_csv(consumed_path+ba+".csv",
    #     usecols=["datetime_utc", "consumed_co2_rate_lb_per_mwh_for_electricity", "consumed_co2_rate_lb_per_mwh_adjusted"], 
    #     index_col="datetime_utc", parse_dates=True)
    hourly_generated = pd.read_csv(gen_path+ba+".csv", 
        usecols=["datetime_utc", "generated_co2_rate_lb_per_mwh_for_electricity", "generated_co2_rate_lb_per_mwh_for_electricity_adjusted", "co2_mass_lb", "fuel_category"], 
        index_col="datetime_utc", parse_dates=True)
    hourly_generated = hourly_generated.loc[hourly_generated.fuel_category=="total"]
    hourly_generated = hourly_generated.sort_index()
    all_dat = pd.concat([singularity_dat, hourly_generated], axis='columns')
    all_dat = all_dat.sort_index()
    cors[ba] = all_dat[["generated_co2_rate_lb_per_mwh_for_electricity_adjusted", "adjusted_rate"]].corr().to_numpy()[0,1]
    percent_difs[ba] = ((all_dat["adjusted_rate"] - all_dat["generated_co2_rate_lb_per_mwh_for_electricity_adjusted"])/all_dat["generated_co2_rate_lb_per_mwh_for_electricity_adjusted"]).median()



In [131]:
out = pd.DataFrame(data={"Difference as percent of OGE":percent_difs, "Correlation":cors, "Annual BA generation":annual_gen})
out = out.sort_values("Annual BA generation", ascending=False)
out.to_csv(f"../data/results/{year}/validation_metrics/us_units/compare_real_time_rates.csv")

In [132]:
out.head()

Unnamed: 0,Difference as percent of OGE,Correlation,Annual BA generation
PJM,0.00052,0.753461,795430300.0
MISO,-0.050578,0.977785,616931100.0
ERCO,-0.022224,0.989143,410100900.0
SWPP,-0.002332,0.991532,259016500.0
SOCO,-0.048614,0.983606,240677200.0


# Visualize emission rate differences

In [16]:
# For one-off interactive plotting
ba_of_interest = "DEAA"

real_time = pd.read_csv(f"../data/outputs/{year}/validation/real_time_rate/{ba_of_interest}.csv", index_col=0, parse_dates=True)
real_time = real_time["2020-01-01T00:00":]

hourly_consumed = pd.read_csv(consumed_path+ba_of_interest+".csv",
    usecols=["datetime_utc", "consumed_co2_rate_lb_per_mwh_for_electricity", "consumed_co2_rate_lb_per_mwh_for_electricity_adjusted"], 
    index_col="datetime_utc", parse_dates=True)
hourly_generated = pd.read_csv(gen_path+ba_of_interest+".csv", 
    usecols=["datetime_utc", "generated_co2_rate_lb_per_mwh_for_electricity", "generated_co2_rate_lb_per_mwh_for_electricity_adjusted", "co2_mass_lb", "fuel_category"], 
    index_col="datetime_utc", parse_dates=True)

all_dat = pd.concat([real_time, hourly_consumed, hourly_generated.loc[hourly_generated.fuel_category=="total"]], axis='columns')
all_dat = all_dat.sort_index()

fig = px.line(all_dat, x=all_dat.index, y=["generated_co2_rate_lb_per_mwh_for_electricity", "adjusted_rate"], 
    title=f"{ba_of_interest} rate comparison",
    labels={
        "value":"Adjsuted CO2 emission rate (lb/mwh)",
        "index":"Hour"
    })

newnames = {'generated_co2_rate_lb_per_mwh_for_electricity': 'Our data', 'adjusted_rate': 'Real-time data'}
fig.for_each_trace(lambda t: t.update(name = newnames[t.name]))

In [18]:
# What's happening Jun 16? 
to_investigate = pd.read_csv(gen_path+ba_of_interest+".csv", 
    index_col="datetime_utc", parse_dates=True)

In [20]:
to_investigate.loc["2020-06-16T14:00"].to_csv("~/Desktop/plant_")

Unnamed: 0_level_0,fuel_category,datetime_local,net_generation_mwh,fuel_consumed_mmbtu,fuel_consumed_for_electricity_mmbtu,co2_mass_lb,ch4_mass_lb,n2o_mass_lb,co2e_mass_lb,nox_mass_lb,...,generated_n2o_rate_lb_per_mwh_for_electricity,generated_co2e_rate_lb_per_mwh_for_electricity,generated_nox_rate_lb_per_mwh_for_electricity,generated_so2_rate_lb_per_mwh_for_electricity,generated_co2_rate_lb_per_mwh_for_electricity_adjusted,generated_ch4_rate_lb_per_mwh_for_electricity_adjusted,generated_n2o_rate_lb_per_mwh_for_electricity_adjusted,generated_co2e_rate_lb_per_mwh_for_electricity_adjusted,generated_nox_rate_lb_per_mwh_for_electricity_adjusted,generated_so2_rate_lb_per_mwh_for_electricity_adjusted
datetime_utc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-06-16 14:00:00+00:00,natural_gas,2020-06-16 07:00:00-07:00,0.06,171.64,171.64,20403.72,0.38,0.04,20427.81,19.22,...,0.666667,340463.5,320.333333,1.666667,340062.0,6.333333,0.666667,340463.5,320.333333,1.666667
2020-06-16 14:00:00+00:00,total,2020-06-16 07:00:00-07:00,0.06,171.64,171.64,20403.72,0.38,0.04,20427.81,19.22,...,0.666667,340463.5,320.333333,1.666667,340062.0,6.333333,0.666667,340463.5,320.333333,1.666667


In [15]:
# Plot and save all BAs 
for ba_of_interest in os.listdir("../data/outputs/2020/validation/real_time_rate/"):
    ba_of_interest = ba_of_interest.replace(".csv", "")
    if ".DS_" in ba_of_interest:
        continue
    
    real_time = pd.read_csv(f"../data/outputs/{year}/validation/real_time_rate/{ba_of_interest}.csv", index_col=0, parse_dates=True)
    real_time = real_time["2020-01-01T00:00":]

    hourly_consumed = pd.read_csv(consumed_path+ba_of_interest+".csv",
        usecols=["datetime_utc", "consumed_co2_rate_lb_per_mwh_for_electricity", "consumed_co2_rate_lb_per_mwh_for_electricity_adjusted"], 
        index_col="datetime_utc", parse_dates=True)
    hourly_generated = pd.read_csv(gen_path+ba_of_interest+".csv", 
        usecols=["datetime_utc", "generated_co2_rate_lb_per_mwh_for_electricity", "generated_co2_rate_lb_per_mwh_for_electricity_adjusted", "co2_mass_lb", "fuel_category"], 
        index_col="datetime_utc", parse_dates=True)

    all_dat = pd.concat([real_time, hourly_consumed, hourly_generated.loc[hourly_generated.fuel_category=="total"]], axis='columns')
    all_dat = all_dat.sort_index()

    fig = px.line(all_dat, x=all_dat.index, y=["generated_co2_rate_lb_per_mwh_for_electricity", "adjusted_rate"], 
        title=f"{ba_of_interest} rate comparison",
        labels={
            "value":"Adjsuted CO2 emission rate (lb/mwh)",
            "index":"Hour"
        })

    newnames = {'generated_co2_rate_lb_per_mwh_for_electricity': 'Our data', 'adjusted_rate': 'Real-time data'}
    fig.for_each_trace(lambda t: t.update(name = newnames[t.name]))
    pio.write_image(fig, f"../data/outputs/viz/{ba_of_interest}.jpg", width=1000, height=400, scale=3)