In [None]:
import pandas as pd
import numpy as np

# TODO: read from parquet files instead?
pr_gen_fuel = pd.read_pickle("../data/raw_eia923__puerto_rico_generation_fuel")
pr_plant_frame = pd.read_pickle("../data/raw_eia923__puerto_rico_plant_frame")

In [None]:
# Handle EIA null values
pr_gen_fuel = pr_gen_fuel.replace(to_replace = ".", value = pd.NA)

# Convert data types (mmbtu/units to numeric, categories, booleans)
pr_gen_fuel = pr_gen_fuel.convert_dtypes()
pr_gen_fuel["associated_combined_heat_power"] = pr_gen_fuel["associated_combined_heat_power"] == "Y"
pr_gen_fuel = pr_gen_fuel.astype({
    "energy_source_code": "category",
    "fuel_type_code_agg": "category",
    "prime_mover_code": "category",
    "reporting_frequency_code": "category",
    "data_maturity": "category",
    "plant_state": "category"
})

In [26]:
#### monthly pivoting

# set up shared indices
raw_index_cols = ["plant_id_eia", "plant_name_eia", "report_year", "prime_mover_code", "energy_source_code", "fuel_unit"]
clean_index_cols = ["date", "energy_source_code", "prime_mover_code", "plant_id_eia", "plant_name_eia", "fuel_unit"]

In [None]:
# Pivot fuel_consumed_for_electricity MMBTU columns

fuel_elec_mmbtu_cols = raw_index_cols + [col for col in pr_gen_fuel.columns if "fuel_consumed_for_electricity_mmbtu" in col]
fuel_elec_mmbtu = pr_gen_fuel.loc[:, fuel_elec_mmbtu_cols]

## Melt the fuel_consumed columns
fuel_elec_mmbtu_melt = fuel_elec_mmbtu.melt(
    id_vars=raw_index_cols,
    var_name="month",
    value_name="fuel_consumed_for_electricity_mmbtu"
)
## Split the month from the variable
fuel_elec_mmbtu_melt["month"] = fuel_elec_mmbtu_melt["month"].str.replace("fuel_consumed_for_electricity_mmbtu_", "")
## Create date from month and year
fuel_elec_mmbtu_melt["date"] = pd.to_datetime(
    fuel_elec_mmbtu_melt["month"] + fuel_elec_mmbtu_melt["report_year"].astype(str),
    format="%B%Y",
)
## Drop old date columns
fuel_elec_mmbtu_clean = fuel_elec_mmbtu_melt.drop(columns = ["report_year", "month"]).set_index(clean_index_cols)
fuel_elec_mmbtu_clean

In [None]:
# Pivot fuel_consumed_for_electricity UNITS columns
fuel_elec_units_cols = raw_index_cols + [col for col in pr_gen_fuel.columns if "fuel_consumed_for_electricity_units" in col]
fuel_elec_units = pr_gen_fuel.loc[:, fuel_elec_units_cols]

## Melt the fuel_consumed columns
fuel_elec_units_melt = fuel_elec_units.melt(
    id_vars=raw_index_cols,
    var_name="month",
    value_name="fuel_consumed_for_electricity_units"
)
## Split the month from the variable
fuel_elec_units_melt["month"] = fuel_elec_units_melt["month"].str.replace("fuel_consumed_for_electricity_units_", "")
## Create date from month and year
fuel_elec_units_melt["date"] = pd.to_datetime(
    fuel_elec_units_melt["month"] + fuel_elec_units_melt["report_year"].astype(str),
    format="%B%Y",
)
## Drop old date columns
fuel_elec_units_clean = fuel_elec_units_melt.drop(columns = ["report_year", "month"]).set_index(clean_index_cols)
fuel_elec_units_clean

In [None]:
# Pivot fuel_consumed MMBTU columns

fuel_mmbtu_cols = raw_index_cols + [col for col in pr_gen_fuel.columns if "fuel_consumed_mmbtu" in col]
fuel_mmbtu = pr_gen_fuel.loc[:, fuel_mmbtu_cols]

## Melt the fuel_consumed columns
fuel_mmbtu_melt = fuel_mmbtu.melt(
    id_vars=raw_index_cols,
    var_name="month",
    value_name="fuel_consumed_mmbtu"
)
## Split the month from the variable
fuel_mmbtu_melt["month"] = fuel_mmbtu_melt["month"].str.replace("fuel_consumed_mmbtu_", "")
## Create date from month and year
fuel_mmbtu_melt["date"] = pd.to_datetime(
    fuel_mmbtu_melt["month"] + fuel_mmbtu_melt["report_year"].astype(str),
    format="%B%Y",
)
## Drop old date columns
fuel_mmbtu_clean = fuel_mmbtu_melt.drop(columns = ["report_year", "month"]).set_index(clean_index_cols)
fuel_mmbtu_clean

In [None]:
# Pivot fuel_consumed UNITS columns

fuel_units_cols = raw_index_cols + [col for col in pr_gen_fuel.columns if "fuel_consumed_units" in col]
fuel_units = pr_gen_fuel.loc[:, fuel_units_cols]

## Melt the fuel_consumed columns
fuel_units_melt = fuel_units.melt(
    id_vars=raw_index_cols,
    var_name="month",
    value_name="fuel_consumed_units"
)
## Split the month from the variable
fuel_units_melt["month"] = fuel_units_melt["month"].str.replace("fuel_consumed_units_", "")
## Create date from month and year
fuel_units_melt["date"] = pd.to_datetime(
    fuel_units_melt["month"] + fuel_units_melt["report_year"].astype(str),
    format="%B%Y",
)
## Drop old date columns
fuel_units_clean = fuel_units_melt.drop(columns = ["report_year", "month"]).set_index(clean_index_cols)
fuel_units_clean

In [None]:
# Pivot net_generation columns

net_gen_cols = raw_index_cols + [col for col in pr_gen_fuel.columns if col.startswith("net_generation_mwh")]
net_gen = pr_gen_fuel.loc[:, net_gen_cols]

## Melt the fuel_consumed columns
net_gen_melt = net_gen.melt(
    id_vars=raw_index_cols,
    var_name="month",
    value_name="net_generation_mwh"
)
## Split the month from the variable
net_gen_melt["month"] = net_gen_melt["month"].str.replace("net_generation_mwh_", "")
## Create date from month and year
net_gen_melt["date"] = pd.to_datetime(
    net_gen_melt["month"] + net_gen_melt["report_year"].astype(str),
    format="%B%Y",
)
## Drop old date columns
net_gen_clean = net_gen_melt.drop(columns = ["report_year", "month"]).set_index(clean_index_cols)
net_gen_clean

In [None]:
pr_gen_fuel_clean = pd.concat(
    [fuel_elec_mmbtu_clean, fuel_elec_units_clean, fuel_mmbtu_clean, fuel_units_clean, net_gen_clean],
    axis="columns",
).reset_index()
pr_gen_fuel_clean

In [None]:
## Drop a bad plant
pr_gen_fuel_final = pr_gen_fuel_clean.loc[
    ~((pr_gen_fuel_clean.plant_id_eia == 62410) 
    & (pr_gen_fuel_clean.date.dt.year == 2020)
    & (pr_gen_fuel_clean.fuel_consumed_for_electricity_mmbtu.isnull()))
]

# drop after 2025-03-01 (for now)
pr_gen_fuel_final = pr_gen_fuel_final.loc[pr_gen_fuel_clean.date < pd.Timestamp("2025-03-01")]


In [None]:
## some investigation into NA values
pp = pr_gen_fuel_final.set_index(clean_index_cols).isna().all(axis=1).reset_index().rename(columns={0: "isna"}).loc[:, ["date", "plant_name_eia", "isna"]]

pp.plot.scatter(x="date", y="plant_name_eia", c=pp["isna"].astype(int), colormap="viridis", s=10, alpha=0.2, figsize=(10, 20))

# it turns out that after 2025-03-01 there's a pile of NAs, and many plants have multiple generators, some of which report as all-NA while their sibling generators are reporting non-NA values
# though Aguirre Plant and Hewlett Packard Puerto Rico seem to have actual stretches of all-NA time that might be wroth dropping.

In [27]:
# TODO: annual table

pr_gen_fuel.loc[:, raw_index_cols + ["total_fuel_consumption_for_electricity_mmbtu"]]

KeyError: "['total_fuel_consumption_for_electricity_mmbtu'] not in index"