In [1]:
import pandas as pd
import numpy as np

# Load the raw Puerto Rico generation-fuel table from its pickled DataFrame.
# NOTE(review): unpickling can execute arbitrary code, so this file must come
# from a trusted source.
pr_gen_fuel = pd.read_pickle(
    filepath_or_buffer='../data/raw_eia923__puerto_rico_generation_fuel',
)

In [2]:
pd.set_option('future.no_silent_downcasting', True)

In [3]:
# Clean-up pass over the raw frame:
#   1. cells holding exactly "." (the EIA null marker) become pd.NA;
#   2. convert_dtypes() then re-infers a nullable dtype for every column.
# NOTE(review): step 2 is meant to make the mmbtu/units columns numeric; that
# presumes their remaining values are already numbers rather than numeric
# strings -- TODO confirm against the raw data.
pr_gen_fuel = (
    pr_gen_fuel
    .replace(".", pd.NA)
    .convert_dtypes()
)

In [6]:
# create some useful column sets
# Columns used as the row identifier when reshaping the annual table.
primary_key_columns = ['plant_id_eia', 'plant_name_eia', 'report_year', 'prime_mover_code', 'energy_source_code']

# Month suffixes used by the wide monthly columns.
# NOTE(review): assumes every month follows the same lower-case, full-name
# pattern as the "_january" columns -- confirm against pr_gen_fuel.columns.
month_names = (
    "january", "february", "march", "april", "may", "june",
    "july", "august", "september", "october", "november", "december",
)

# A monthly variable is discovered through its "<variable>_january" column.
# removesuffix() strips only the trailing token; str.replace() would also
# remove a "_january" that happened to occur elsewhere in the name.
monthly_variables = [
    col.removesuffix("_january")
    for col in pr_gen_fuel.columns
    if col.endswith("_january")
]

# Every "<variable>_<month>" column, in the frame's own column order.
# Matching the complete name (rather than startswith) lists each column at most
# once, even when one variable name is a prefix of another, and leaves alone
# non-monthly columns that merely begin with a monthly variable's name.
monthly_column_names = {
    f"{var}_{month}" for var in monthly_variables for month in month_names
}
monthly_columns = [col for col in pr_gen_fuel.columns if col in monthly_column_names]

In [8]:
# Reshape every monthly variable from wide (one column per month) to long
# (one row per primary key and month), then put the variables side by side.
monthly_dfs = []
# The long table is keyed on a full date instead of report_year.
monthly_primary_key_columns = ["date"] + [c for c in primary_key_columns if c != "report_year"]
for monthly_var in monthly_variables:
    # Select only this variable's own month columns: the column name with its
    # trailing "_<token>" removed must equal the variable name. A plain
    # startswith() test would also pick up columns of any other variable whose
    # name begins with this one, mixing unrelated values into the melt.
    value_columns = [
        col for col in pr_gen_fuel.columns
        if col.rsplit("_", 1)[0] == monthly_var
    ]
    var_pivot = pr_gen_fuel.loc[:, primary_key_columns + value_columns].melt(
        id_vars=primary_key_columns
    )
    # The month is the last "_"-separated token of the melted column name.
    var_pivot["month"] = var_pivot["variable"].str.rsplit("_", n=1).str[-1]
    # Build a date from the month name and report year; with no day in the
    # format string, each date falls on the first of the month.
    var_pivot["date"] = pd.to_datetime(
        var_pivot["month"] + var_pivot["report_year"].astype(str), format="%B%Y"
    )
    # The year/month/variable columns are now redundant with `date` and the
    # renamed value column.
    monthly_dfs.append(
        var_pivot.drop(columns=["report_year", "month", "variable"])
        .rename(columns={"value": monthly_var})
        # Shared index so the per-variable frames can be concatenated column-wise.
        .set_index(monthly_primary_key_columns)
    )
# NOTE(review): the column-wise concat presumes every variable yields the same
# index (same rows and months in the same order) -- confirm if columns change.
pr_gen_fuel_monthly = pd.concat(monthly_dfs, axis="columns").reset_index()

In [12]:
# Everything that is not a monthly column is treated as annual data.
pr_gen_fuel_annual = pr_gen_fuel.drop(monthly_columns, axis="columns")

In [11]:
# Drop a bad plant: remove the rows for plant 62410 dated in 2020 whose
# fuel_consumed_for_electricity_mmbtu is null; every other row is kept.
pr_gen_fuel_monthly = pr_gen_fuel_monthly.loc[
    lambda monthly: ~(
        monthly["plant_id_eia"].eq(62410)
        & monthly["date"].dt.year.eq(2020)
        & monthly["fuel_consumed_for_electricity_mmbtu"].isna()
    )
]