# About this notebook

This notebook is for test running the data pipeline and  as a sandbox for testing new functions that we are adding to data pipeline.

In [1]:
# import packages
import numpy as np
import pandas as pd
import argparse
import os

%reload_ext autoreload
%autoreload 2

# # Tell python where to look for modules.
import sys

sys.path.append("../../hourly-egrid/")

# import local modules
import src.data_cleaning as data_cleaning
import src.load_data as load_data
import src.impute_hourly_profiles as impute_hourly_profiles
import src.eia930 as eia930
import src.output_data as output_data

from src.column_checks import get_dtypes, apply_dtypes

year = 2020

# Run the Pipeline

In [2]:
%cd ../src
%run data_pipeline --year 2020

c:\Users\gmill\GitHub\hourly-egrid\src
1. Downloading data
   PUDL data already downloaded
   egrid2019_data.xlsx already downloaded
   egrid2020_data.xlsx already downloaded
   EBA_elec.csv already downloaded
   EBA_raw.csv already downloaded
   BALANCE_2020_Jan_Jun data already downloaded
   BALANCE_2020_Jul_Dec data already downloaded
   INTERCHANGE_2020_Jan_Jun data already downloaded
   INTERCHANGE_2020_Jul_Dec data already downloaded
   epa_eia_crosswalk.csv already downloaded
2. Identifying subplant IDs
   Subplant IDs already created
3. Cleaning EIA-923 data




   Removing 0 plants that are not grid-connected
   Removing 0 plants located in the following states: ['PR']
4. Cleaning CEMS data
   Removing 45 plants that are not grid-connected
   Removing 0 plants located in the following states: ['PR']
   Removing 3 units that only produce steam and do not report to EIA
   Removing 7254921 observations from cems for unit-months where no data reported
5. Loading plant static attributes
   Exporting plant_static_attributes to data/outputs
   Exporting plant_static_attributes to data/results
6. Converting CEMS gross generation to net generation
   Exporting gross_to_net_conversions to data/outputs
   Exporting cems to data/outputs
7. Identifying source for hourly data
8. Scaling partial CEMS data
   Exporting eia923_allocated to data/outputs
   Exporting partial_cems_scaled to data/outputs
9. Cleaning EIA-930 data
   Skipping EIA-930 scraping/cleaning as already completed.


  data.loc[data.ba_code == ba, "datetime_local"] = data.loc[


10. Calculating residual net generation profiles from EIA-930
   Exporting residual_profiles to data/outputs
11. Assigning hourly profile to monthly EIA-923 data
Summary of methods used to estimate missing hourly profiles:
                report_date                                       
profile_method DIBA_average assumed_flat national_average residual
fuel_category                                                     
biomass                   0          529                0        0
coal                      0           40                0      456
geothermal                0          108                0        0
hydro                     0           84                0      564
natural_gas               0           48                0      624
nuclear                   0            0                0      216
other                     0           60                0      456
petroleum                 0          204                0      276
solar                    48            0

AttributeError: Can only use .dt accessor with datetimelike values

In [None]:
%cd ../src
%run data_pipeline --small SMALL --year 2020

# Functions for loading intermediate outputs

In [None]:

# load data from csv
year = 2020
path_prefix = ''

cems = pd.read_csv(f'../data/outputs/{path_prefix}{year}/cems_{year}.csv', dtype=get_dtypes(), parse_dates=['datetime_utc', 'report_date'])
partial_cems_scaled = pd.read_csv(f'../data/outputs/{path_prefix}{year}/partial_cems_scaled_{year}.csv', dtype=get_dtypes(), parse_dates=['datetime_utc', 'report_date'])
eia923_allocated = pd.read_csv(f'../data/outputs/{path_prefix}{year}/eia923_allocated_{year}.csv', dtype=get_dtypes(), parse_dates=['report_date'])
plant_attributes = pd.read_csv(f"../data/outputs/{path_prefix}{year}/plant_static_attributes_{year}.csv")
primary_fuel_table = plant_attributes.drop_duplicates(subset="plant_id_eia")[["plant_id_eia", "plant_primary_fuel"]]
residual_profiles = pd.read_csv(f"../data/outputs/{path_prefix}{year}/residual_profiles_{year}.csv")

# TODO: Add output metrics back in

In [None]:
# output data quality metrics
output_data.output_to_results(
    validation.co2_source_metric(cems, partial_cems_scaled, monthly_eia_data_to_shape),
    "co2_measurement_source",
    "validation_metrics/",
    path_prefix,
)
output_data.output_to_results(
    validation.net_generation_method_metric(
        cems, partial_cems_scaled, monthly_eia_data_to_shape
    ),
    "net_generation_method",
    "validation_metrics/",
    path_prefix,
)

output_data.output_to_results(
    validation.hourly_profile_source_metric(
        cems, partial_cems_scaled, monthly_eia_data_to_shape
    ),
    "hourly_profile_method",
    "validation_metrics/",
    path_prefix,
)

# Run EIA-930 cleaning

In [None]:
from gridemissions.workflows import make_dataset

year=2020

data_folder = "../data/downloads/eia930/"

# if not small, scrape 2 months before start of year for rolling window cleaning
start = f"{year-1}1001T00Z"
# Scrape 1 week if small, else 1 year
end = f"{year}1231T23Z"

make_dataset(
            start,
            end,
            file_name="EBA",
            tmp_folder=data_folder,
            folder_hist=data_folder,
            scrape=False,
            add_ca_fuels=False,
            calc_consumed=False,
        )

# Test Scaled Residuals

- [ ] Issue 1: if there are any hours in the 930 net generation data that drop down close to zero (it seems like it never goes to zero, but instead drops to 1.0... not sure if that's part of the 930 data cleaning process?) the scaling factor that gets calculated is very small, meaning that the CEMS profile essentially goes to zero, which is not what we want.
- [ ] Issue 2: In some cases, the scaled residual is turning out negative, which it should not be
- [ ] Make sure the residual scaling is consistent with new methods

To start:
- load residual profiles and impute profile as new column

TODO:
- [x] Add profiles for non-EIA fuel categories to the residual profile table
- [x] Ensure complete hourly timeseries especially for cems only profiles
- [ ] Maybe rename the residual profiles hourly profiles and add new columns for the imputed values
- [ ] Flag where EIA-930 data is potentially missing so we can use CEMS data
- [ ] When calculating the profile as a percent, if the profile goes negative, we want to calculate based on the absolute value. 





In [9]:
from src.impute_hourly_profiles import *
import plotly.express as px

In [10]:
# load data from csv
year = 2020
path_prefix = ''
cems = pd.read_csv(f'../data/outputs/{path_prefix}{year}/cems_{year}.csv', dtype=get_dtypes(), parse_dates=['report_date', 'datetime_utc'])
partial_cems_scaled = pd.read_csv(f'../data/outputs/{path_prefix}{year}/partial_cems_scaled_{year}.csv', dtype=get_dtypes(), parse_dates=['report_date'])
plant_attributes = pd.read_csv(f"../data/outputs/{path_prefix}{year}/plant_static_attributes_{year}.csv")

# aggregate cems data to subplant level
cems = data_cleaning.aggregate_cems_to_subplant(cems)
# drop data from cems that is now in partial_cems
cems = data_cleaning.filter_unique_cems_data(cems, partial_cems_scaled)

KeyboardInterrupt: 

In [None]:
clean_930_file = "../data/downloads/eia930/EBA_elec.csv"
eia930_data = eia930.load_chalendar_for_pipeline(clean_930_file, year=year)

In [None]:
# Name column same as 930, hourly_profiles.
cems = cems.merge(plant_attributes, how="left", on="plant_id_eia")

cems_agg = aggregate_for_residual(
    cems,
    plant_attributes,
    "datetime_utc",
    "ba_code",
    False,
)

In [38]:
residual_profiles  = impute_hourly_profiles.calculate_residual(
    cems,
    eia930_data,
    plant_attributes,
    year,
    transmission_only=False,
    ba_column_name="ba_code",
)

In [40]:
output_data.output_intermediate_data(
        residual_profiles, "residual_profiles", path_prefix, year
    )

   Exporting residual_profiles to data/outputs


In [45]:
test_data = residual_profiles[(residual_profiles['ba_code'] == 'CISO') & (residual_profiles['fuel_category'] == 'natural_gas')]

In [46]:
px.line(test_data, x='datetime_utc', y=["eia930_profile","cems_profile","residual_profile","scaled_residual_profile"])

# Test Hourly Profiles

In [2]:
# load data from csv
year = 2020
path_prefix = ''

eia923_allocated = pd.read_csv(f'../data/outputs/{path_prefix}{year}/eia923_allocated_{year}.csv', dtype=get_dtypes(), parse_dates=['report_date'])

cems = pd.read_csv(f'../data/outputs/{path_prefix}{year}/cems_{year}.csv', dtype=get_dtypes(), parse_dates=['report_date', 'datetime_utc'])
partial_cems_scaled = pd.read_csv(f'../data/outputs/{path_prefix}{year}/partial_cems_scaled_{year}.csv', dtype=get_dtypes(), parse_dates=['report_date'])
plant_attributes = pd.read_csv(f"../data/outputs/{path_prefix}{year}/plant_static_attributes_{year}.csv")

# aggregate cems data to subplant level
cems = data_cleaning.aggregate_cems_to_subplant(cems)
# drop data from cems that is now in partial_cems
cems = data_cleaning.filter_unique_cems_data(cems, partial_cems_scaled)

In [None]:
# create a separate dataframe containing only the EIA data that is missing from cems
monthly_eia_data_to_shape = eia923_allocated[
    (eia923_allocated["hourly_data_source"] == "eia")
    & ~(eia923_allocated["fuel_consumed_mmbtu"].isna())
]

In [4]:
clean_930_file = "../data/downloads/eia930/EBA_elec.csv"

eia930_data = eia930.load_chalendar_for_pipeline(clean_930_file, year=year)

  data.loc[data.ba_code == ba, "datetime_local"] = data.loc[


In [37]:
residual_profiles = impute_hourly_profiles.calculate_residual(
    cems,
    eia930_data,
    plant_attributes,
    year,
    transmission_only=False,
    ba_column_name="ba_code",
)

In [39]:
output_data.output_intermediate_data(
        residual_profiles, "residual_profiles", path_prefix, year
    )

   Exporting residual_profiles to data/outputs


In [41]:
# 11. Assign hourly profile to monthly data
print("11. Assigning hourly profile to monthly EIA-923 data")

# load profile data and format for use in the pipeline
# TODO: once this is in the pipeline (step 10), may not need to read file
hourly_profiles = impute_hourly_profiles.impute_missing_hourly_profiles(
    monthly_eia_data_to_shape, residual_profiles, plant_attributes, year
)

hourly_profiles = impute_hourly_profiles.add_missing_cems_profiles(hourly_profiles, cems, plant_attributes)

11. Assigning hourly profile to monthly EIA-923 data
Summary of methods used to estimate missing hourly profiles:
                   report_date                              
imputation_method DIBA_average assumed_flat national_average
fuel_category                                               
biomass                      0          529                0
coal                         0           40                0
geothermal                   0          108                0
hydro                        0           84                0
natural_gas                  0           48                0
other                        0           60                0
petroleum                    0          204                0
solar                       48            0               36
waste                        0          156                0
wind                        36            0               72


### Choose hourly profile
Hierarchy of profiles to use:
1. Residual
2. eia-930 profile
3. cems profile
4. imputed profile


Steps:
- specify an hourl profile using hierarchy
- convert profile to percent


In [None]:
# if there are any months that have incomplete cems data, replace the cems profile with na
incomplete_cems = hourly_profiles.loc[hourly_profiles["cems_profile"].isna(),["ba_code","fuel_category", "report_date"]].drop_duplicates()
hourly_profiles = hourly_profiles.merge(incomplete_cems, how="outer", on=["ba_code","fuel_category", "report_date"], indicator="source")
hourly_profiles.loc[(hourly_profiles["source"] == "both"), "cems_profile"] = np.NaN
hourly_profiles = hourly_profiles.drop(columns="source")

In [46]:
hourly_profiles["profile"] = np.NaN
hourly_profiles["profile_method"] = np.NaN
# specify the profile as the best available data
for source_column in ["residual_profile","eia930_profile","cems_profile","imputed_profile"]:
    hourly_profiles.loc[hourly_profiles['profile'].isna() & ~hourly_profiles[source_column].isna(), "profile_method"] = source_column
    hourly_profiles["profile"] = hourly_profiles["profile"].fillna(hourly_profiles[source_column])

hourly_profiles.loc[hourly_profiles["profile_method"] == "imputed_profile", "profile_method"] = hourly_profiles.loc[hourly_profiles["profile_method"] == "imputed_profile", "imputation_method"]
hourly_profiles = hourly_profiles.drop(columns=["imputation_method"])

In [48]:
print(
    hourly_profiles[
        ["ba_code", "fuel_category", "report_date", "profile_method"]
    ]
    .drop_duplicates()
    .drop(columns=["ba_code"])
    .pivot_table(
        index="fuel_category", columns="profile_method", aggfunc="count"
    )
    .fillna(0)
    .astype(int)
)

                report_date                                                            
profile_method DIBA_average assumed_flat cems_profile national_average residual_profile
fuel_category                                                                          
biomass                   0          433           99                0                0
coal                      0           39            1                0              456
geothermal                0          108            0                0                0
hydro                     0           84            0                0              564
natural_gas               0           48            0                0              624
nuclear                   0            0            0                0              216
other                     0           60            0                0              456
petroleum                 0          192           12                0              276
solar                    48     

In [52]:
hourly_profiles = hourly_profiles.drop(columns=["imputation_method"])

In [53]:
output_data.output_intermediate_data(
        hourly_profiles, "hourly_profiles", path_prefix, year
    )

   Exporting hourly_profiles to data/outputs


In [57]:
MONTHLY_GROUP_COLUMNS = [
    "ba_code",
    "fuel_category",
    "report_date",
    "profile_method",
]

monthly_total = hourly_profiles[MONTHLY_GROUP_COLUMNS + ["profile"]]
monthly_total['profile'] = abs(monthly_total['profile'])

monthly_total = (
    monthly_total.groupby(MONTHLY_GROUP_COLUMNS, dropna=False).sum().reset_index()
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  monthly_total['profile'] = abs(monthly_total['profile'])


In [60]:
monthly_total[monthly_total.duplicated(subset=["ba_code","fuel_category","report_date"], keep=False)]

Unnamed: 0,ba_code,fuel_category,report_date,profile_method,profile
2882,NYIS,waste,2020-10-01,assumed_flat,743.0
2883,NYIS,waste,2020-10-01,cems_profile,0.0
3711,SCEG,biomass,2020-10-01,assumed_flat,743.0
3712,SCEG,biomass,2020-10-01,cems_profile,41822.0
4558,TVA,biomass,2020-04-01,assumed_flat,719.0
4559,TVA,biomass,2020-04-01,cems_profile,43.454839
4565,TVA,biomass,2020-10-01,assumed_flat,743.0
4566,TVA,biomass,2020-10-01,cems_profile,36045.8


In [63]:
hourly_profiles[(hourly_profiles["ba_code"] == "TVA") & (hourly_profiles["fuel_category"] == "biomass") & (hourly_profiles["report_date"] == "2020-10-01")]

Unnamed: 0,ba_code,fuel_category,datetime_utc,datetime_local,report_date,eia930_profile,cems_profile,residual_profile,scaled_residual_profile,imputed_profile,profile,profile_method
3522533,TVA,biomass,2020-10-01 05:00:00+00:00,2020-10-01 00:00:00-05:00,2020-10-01,,36045.8,,,1.0,36045.8,cems_profile
3522534,TVA,biomass,2020-10-01 06:00:00+00:00,2020-10-01 01:00:00-05:00,2020-10-01,,,,,1.0,1.0,assumed_flat
3522535,TVA,biomass,2020-10-01 07:00:00+00:00,2020-10-01 02:00:00-05:00,2020-10-01,,,,,1.0,1.0,assumed_flat
3522536,TVA,biomass,2020-10-01 08:00:00+00:00,2020-10-01 03:00:00-05:00,2020-10-01,,,,,1.0,1.0,assumed_flat
3522537,TVA,biomass,2020-10-01 09:00:00+00:00,2020-10-01 04:00:00-05:00,2020-10-01,,,,,1.0,1.0,assumed_flat
...,...,...,...,...,...,...,...,...,...,...,...,...
3523272,TVA,biomass,2020-11-01 00:00:00+00:00,2020-10-31 19:00:00-05:00,2020-10-01,,,,,1.0,1.0,assumed_flat
3523273,TVA,biomass,2020-11-01 01:00:00+00:00,2020-10-31 20:00:00-05:00,2020-10-01,,,,,1.0,1.0,assumed_flat
3523274,TVA,biomass,2020-11-01 02:00:00+00:00,2020-10-31 21:00:00-05:00,2020-10-01,,,,,1.0,1.0,assumed_flat
3523275,TVA,biomass,2020-11-01 03:00:00+00:00,2020-10-31 22:00:00-05:00,2020-10-01,,,,,1.0,1.0,assumed_flat


# Investigate missing data

In [None]:
year = 2020
path_prefix = ''
eia923_allocated = pd.read_csv(f'../data/outputs/{path_prefix}eia923_allocated_{year}.csv', parse_dates=['report_date'])
plant_frame = pd.read_csv(f"../data/outputs/{path_prefix}plant_static_attributes_{year}.csv")
residual_profiles = pd.read_csv(f"../data/outputs/{path_prefix}residual_profiles_{year}.csv")
eia923_allocated = eia923_allocated.merge(plant_frame, how='left', on='plant_id_eia')

In [None]:
# 11. Assign hourly profile to monthly data
print("Assigning hourly profile to monthly EIA-923 data")
# create a separate dataframe containing only the generators for which we do not have CEMS data
monthly_eia_data_to_shape = eia923_allocated[
    (eia923_allocated["hourly_data_source"] == "eia")
    & ~(eia923_allocated["fuel_consumed_mmbtu"].isna())
]

In [None]:
monthly_data_to_shape[monthly_data_to_shape['energy_source_code'] == 'SUN']

In [None]:
hourly_profiles = impute_hourly_profiles.impute_missing_hourly_profiles(
    monthly_eia_data_to_shape, residual_profiles, year
)

In [None]:
hourly_profiles['report_date'] = pd.to_datetime(hourly_profiles['report_date'])

In [None]:
hourly_profiles[(hourly_profiles['fuel_category'] == 'solar') & (hourly_profiles['ba_code'] == 'ISNE')]

In [None]:
px.line(hourly_profiles[hourly_profiles['fuel_category'] == 'solar'], x='datetime_local', y='profile', color='ba_code')

In [None]:
hourly_profiles = impute_hourly_profiles.convert_profile_to_percent(hourly_profiles)

In [None]:
px.line(hourly_profiles[hourly_profiles['fuel_category'] == 'solar'], x='datetime_local', y='profile', color='ba_code')

In [None]:

shaped_eia_data = impute_hourly_profiles.shape_monthly_eia_data_as_hourly(
    monthly_eia_data_to_shape, hourly_profiles
)

In [None]:
# specify columns containing monthly data that should be distributed to hourly
columns_to_shape = [
    "net_generation_mwh",
    "fuel_consumed_mmbtu",
    "fuel_consumed_for_electricity_mmbtu",
    "co2_mass_lb",
    "ch4_mass_lb",
    "n2o_mass_lb",
    "nox_mass_lb",
    "so2_mass_lb",
    "co2_mass_lb_for_electricity",
    "ch4_mass_lb_for_electricity",
    "n2o_mass_lb_for_electricity",
    "nox_mass_lb_for_electricity",
    "so2_mass_lb_for_electricity",
    "co2_mass_lb_adjusted",
    "ch4_mass_lb_adjusted",
    "n2o_mass_lb_adjusted",
    "nox_mass_lb_adjusted",
    "so2_mass_lb_adjusted",
]

# group eia data by plant
shaped_monthly_data = (
    monthly_eia_data_to_shape.groupby(
        [
            "plant_id_eia",
            "subplant_id",
            "report_date",
            "plant_primary_fuel",
            "hourly_data_source",
            "fuel_category",
            "fuel_category_eia930",
            "ba_code",
            "ba_code_physical",
            "state",
            "distribution_flag",
        ],dropna=False)
    .sum()
    .reset_index()
)

In [None]:
monthly_eia_data_to_shape.plant_primary_fuel.unique()

In [None]:
shaped_monthly_data.plant_primary_fuel.unique()

In [None]:


# merge the hourly profiles into each plant-month
shaped_monthly_data = shaped_monthly_data.merge(
    hourly_profiles, how="left", on=["report_date", "fuel_category", "ba_code"]
)

# plant-months where there is negative net generation, assign a flat profile
shaped_monthly_data.loc[
    shaped_monthly_data["net_generation_mwh"] < 0, "profile"
] = 1 / (shaped_monthly_data["report_date"].dt.daysinmonth * 24)
shaped_monthly_data.loc[
    shaped_monthly_data["net_generation_mwh"] < 0, "profile_method"
] = "flat_negative_generation"

# shape the data
for column in columns_to_shape:
    shaped_monthly_data[column] = (
        shaped_monthly_data[column] * shaped_monthly_data["profile"]
    )
shaped_monthly_data = shaped_monthly_data.drop(columns=["profile"])

# re order the columns
column_order = [
    "plant_id_eia",
    "subplant_id",
    "datetime_local",
    "datetime_utc",
    "report_date",
    "net_generation_mwh",
    "fuel_consumed_mmbtu",
    "fuel_consumed_for_electricity_mmbtu",
    "co2_mass_lb",
    "ch4_mass_lb",
    "n2o_mass_lb",
    "nox_mass_lb",
    "so2_mass_lb",
    "co2_mass_lb_for_electricity",
    "ch4_mass_lb_for_electricity",
    "n2o_mass_lb_for_electricity",
    "nox_mass_lb_for_electricity",
    "so2_mass_lb_for_electricity",
    "co2_mass_lb_adjusted",
    "ch4_mass_lb_adjusted",
    "n2o_mass_lb_adjusted",
    "nox_mass_lb_adjusted",
    "so2_mass_lb_adjusted",
    "profile_method",
    "hourly_data_source",
]
shaped_monthly_data = shaped_monthly_data[column_order]

In [None]:
shaped_eia_data[(shaped_eia_data['fuel_category'] == 'solar')]

# Combine all plant data together

In [None]:
# load data from csv
year = 2020
path_prefix = ''
cems = pd.read_csv(f'../data/outputs/{path_prefix}cems_{year}.csv')
partial_cems = pd.read_csv(f'../data/outputs/{path_prefix}partial_cems_scaled_{year}.csv')
shaped_eia_data = pd.read_csv(f'../data/outputs/{path_prefix}shaped_eia923_data{year}.csv')
plant_frame = pd.read_csv(f"../data/outputs/{path_prefix}plant_static_attributes.csv")


In [None]:
# check that none of the sources have overlapping subplant-months
columns_to_check_for_duplicates = ['plant_id_eia','subplant_id','report_date']
cems_subplant_months = cems[columns_to_check_for_duplicates].drop_duplicates()
cems_subplant_months['cems'] = 1
partial_cems_subplant_months = partial_cems[columns_to_check_for_duplicates].drop_duplicates()
partial_cems_subplant_months['partial_cems'] = 1
shaped_eia_subplant_months = shaped_eia_data[columns_to_check_for_duplicates].drop_duplicates()
shaped_eia_subplant_months['shaped_eia'] = 1

data_source_overlap = cems_subplant_months.merge(partial_cems_subplant_months, how='outer',on=columns_to_check_for_duplicates).merge(shaped_eia_subplant_months, how='outer',on=columns_to_check_for_duplicates).fillna(0)
data_source_overlap

# check that there is no overlap between shaped eia and cems data
data_source_overlap[(data_source_overlap.shaped_eia == 1) & ((data_source_overlap.cems == 1) | (data_source_overlap.partial_cems == 1))]

# check for overlap between cems and partial cems data
data_source_overlap[(data_source_overlap.cems == 1) & (data_source_overlap.partial_cems == 1)]

In [None]:
cems = data_cleaning.filter_unique_cems_data(cems, partial_cems)

In [None]:
combined_plant_data = data_cleaning.combine_subplant_data(cems, partial_cems, shaped_eia_data)
combined_plant_data

In [None]:
ba_tz = load_data.load_ba_reference()[["ba_code", "timezone_local"]]


In [None]:
ba_table.columns

In [None]:
ba_tz = load_data.load_ba_reference()[["ba_code", "timezone_local"]]
generated_emission_rate_columns = [
    "generated_co2_rate_lb_per_mwh_for_electricity",
    "generated_ch4_rate_lb_per_mwh_for_electricity",
    "generated_n2o_rate_lb_per_mwh_for_electricity",
    "generated_nox_rate_lb_per_mwh_for_electricity",
    "generated_so2_rate_lb_per_mwh_for_electricity",
    "generated_co2_rate_lb_per_mwh_adjusted",
    "generated_ch4_rate_lb_per_mwh_adjusted",
    "generated_n2o_rate_lb_per_mwh_adjusted",
    "generated_nox_rate_lb_per_mwh_adjusted",
    "generated_so2_rate_lb_per_mwh_adjusted",
]

for ba in list(ba_fuel_data.ba_code.unique()):

    # filter the data for a single BA
    ba_table = ba_fuel_data[ba_fuel_data["ba_code"] == ba].drop(columns="ba_code")

    # convert the datetime_utc column back to a datetime
    ba_table["datetime_utc"] = pd.to_datetime(ba_table["datetime_utc"], utc=True)

    # calculate a total for the BA
    ba_total = ba_table.groupby(["datetime_utc"]).sum()[data_columns].reset_index()
    ba_total["fuel_category"] = "total"

    # concat the totals to the fuel-specific totals
    ba_table = pd.concat([ba_table, ba_total], axis=0, ignore_index=True)

    # round all values to one decimal place
    ba_table = ba_table.round(2)

    for emission_type in ["_for_electricity", "_adjusted"]:
        for emission in ["co2", "ch4", "n2o", "nox", "so2"]:
            ba_table[f"generated_{emission}_rate_lb_per_mwh{emission_type}"] = (
                (
                    ba_table[f"{emission}_mass_lb{emission_type}"]
                    / ba_table["net_generation_mwh"]
                )
                .fillna(0)
                .replace(np.inf, np.NaN)
                .replace(-np.inf, np.NaN)
            )

    # create a local datetime column
    local_tz = ba_tz.loc[ba_tz["ba_code"] == ba, "timezone_local"].item()
    ba_table["datetime_local"] = ba_table["datetime_utc"].dt.tz_convert(local_tz)

    # re-order columns
    ba_table = ba_table[['fuel_category','datetime_local','datetime_utc'] + data_columns + generated_emission_rate_columns]

    # export to a csv
    ba_table.to_csv(
        f"../data/results/{path_prefix}power_sector_data/{ba}.csv", index=False
    )



In [None]:
for ba in list(ba_fuel_data.ba_code.unique()):

    # filter the data for a single BA
    ba_table = ba_fuel_data[ba_fuel_data["ba_code"] == ba].drop(columns="ba_code")

    # convert the datetime_utc column back to a datetime
    ba_table["datetime_utc"] = pd.to_datetime(ba_table["datetime_utc"], utc=True)

    # calculate a total for the BA
    ba_total = (
        ba_table.groupby(["datetime_utc"])
        .sum()[data_columns]
        .reset_index()
    )
    ba_total["fuel_category"] = "total"

    # concat the totals to the fuel-specific totals
    ba_table = pd.concat([ba_table, ba_total], axis=0, ignore_index=True)

    # round all values to one decimal place
    ba_table = ba_table.round(1)

    for emission_type in ['_for_electricity','_adjusted']:
        for emission in ['co2','ch4','n2o','nox','so2']:
            ba_table[f"generated_{emission}_rate_lb_per_mwh{emission_type}"] = (
                    (ba_table[f"{emission}_mass_lb{emission_type}"] / ba_table["net_generation_mwh"])
                    .fillna(0)
                    .replace(np.inf, np.NaN).replace(-np.inf, np.NaN)
                )

    # export to a csv
    ba_table.to_csv(f"../data/results/{path_prefix}power_sector_data/{ba}.csv")

# Test new functions

In [None]:
year = 2020
path_prefix = ''
cems = pd.read_csv(f'../data/outputs/{path_prefix}cems_{year}.csv', parse_dates=['operating_datetime_utc','report_date'])
eia923_allocated = pd.read_csv(f'../data/outputs/{path_prefix}eia923_allocated_{year}.csv', parse_dates=['report_date'])

In [None]:
cems.energy_source_code.unique()

In [None]:
cems[cems['energy_source_code'] == 'MSW']

In [None]:

px.line(cems[cems['energy_source_code'] == 'BLQ'], x='operating_datetime_utc', y='fuel_consumed_mmbtu', color='cems_id')

In [None]:
plant_frame = pd.read_csv(f"../data/outputs/{path_prefix}plant_static_attributes.csv")
eia923_allocated = eia923_allocated.merge(plant_frame, how='left', on='plant_id_eia')

In [None]:
# 11. Assign hourly profile to monthly data
print('Assigning hourly profile to monthly EIA-923 data')
# create a separate dataframe containing only the generators for which we do not have CEMS data
monthly_eia_data_to_distribute = eia923_allocated[
    (eia923_allocated["hourly_data_source"] == "eia")
    & ~(eia923_allocated["fuel_consumed_mmbtu"].isna())
]
# load profile data and format for use in the pipeline
# TODO: once this is in the pipeline (step 10), may not need to read file
hourly_profiles = pd.read_csv(
    "../data/outputs/residual_profiles.csv", parse_dates=["report_date"]
)


In [None]:
available_profiles = hourly_profiles[['ba_code','fuel_category']].drop_duplicates()
ba_fuel_to_distribute = monthly_eia_data_to_distribute[['ba_code','fuel_category']].drop_duplicates().dropna()
missing_profiles = ba_fuel_to_distribute.merge(available_profiles, how='outer', on=['ba_code','fuel_category'], indicator='source')
missing_profiles = missing_profiles[missing_profiles.source == 'left_only']
missing_profiles.sort_values(by=['fuel_category','ba_code'])

In [None]:
hourly_profiles = hourly_profiles.load_hourly_profiles(monthly_eia_data_to_distribute, year)

In [None]:
print(hourly_profiles[['ba_code','fuel_category','profile_method']].drop_duplicates().pivot_table(index='fuel_category',columns='profile_method', aggfunc='count').fillna(0).astype(int))

# investigate profile shapes

In [None]:
import plotly.express as px
import src.eia930 as eia930

In [None]:
hydro_demand = load_data.load_raw_eia930_data(year, 'BALANCE')
hydro_demand = hydro_demand[["Balancing Authority","datetime_utc","Demand (MW)","Net Generation (MW)","Net Generation (MW) from Hydropower and Pumped Storage",]]

In [None]:
bas_with_no_hydro = hydro_demand.groupby("Balancing Authority").sum().reset_index()
bas_with_no_hydro = list(bas_with_no_hydro.loc[(bas_with_no_hydro["Net Generation (MW) from Hydropower and Pumped Storage"] == 0),"Balancing Authority"])

In [None]:
hydro_demand = hydro_demand[~hydro_demand['Balancing Authority'].isin(bas_with_no_hydro)]

In [None]:
px.line(hydro_demand, x='datetime_utc', y='Net Generation (MW) from Hydropower and Pumped Storage', color='Balancing Authority')

In [None]:
hydro_corr = hydro_demand.groupby("Balancing Authority")[["Demand (MW)","Net Generation (MW) from Hydropower and Pumped Storage"]].corr().reset_index()
hydro_corr = hydro_corr[hydro_corr['level_1'] == 'Demand (MW)'].drop(columns=["Demand (MW)","level_1"])
hydro_corr

In [None]:
cleaned_930 = eia930.load_chalendar_for_pipeline(
    "../data/outputs/EBA_adjusted_elec.csv", year=year
)

In [None]:
cleaned_930.fuel_category.unique()

In [None]:
fuel = 'other'

data_to_plot = cleaned_930[cleaned_930['fuel_category'] == fuel]

px.line(data_to_plot, x='datetime_local', y='net_generation_mwh_930', color='ba_code')

In [None]:
data_to_plot

In [None]:
data_to_plot.pivot(index='datetime_local', columns='ba_code', values='net_generation_mwh_930').corr()