In [14]:
import pandas as pd
import numpy as np

# Load the raw Puerto Rico generation/fuel table (EIA-923, per the file name) from a local pickle.
# NOTE: unpickling can execute arbitrary code — only load pickle files from a trusted source.
pr_gen_fuel = pd.read_pickle('../data/raw_eia923__puerto_rico_generation_fuel')

In [15]:
# Opt in to pandas' future behaviour in which replace/fillna-style operations
# no longer silently downcast object columns; dtypes are converted explicitly instead.
pd.set_option('future.no_silent_downcasting', True)

In [16]:
# EIA null values are a lone "."; swap them for pd.NA, then let pandas choose
# NA-aware dtypes so the mmbtu/units columns become numeric.
pr_gen_fuel = pr_gen_fuel.replace(".", pd.NA).convert_dtypes()

In [18]:
# Tally how many columns hold each dtype, as a quick check on the dtype conversion.
pr_gen_fuel.dtypes.value_counts()

Float64           73
string[python]    11
Int64             11
object             1
Name: count, dtype: int64

In [19]:
# Interactive lookup of the convert_dtypes docstring; it has no effect on the data.
# NOTE(review): exploratory leftover — consider removing before sharing the notebook.
help(pr_gen_fuel.convert_dtypes)

Help on method convert_dtypes in module pandas.core.generic:

convert_dtypes(infer_objects: 'bool_t' = True, convert_string: 'bool_t' = True, convert_integer: 'bool_t' = True, convert_boolean: 'bool_t' = True, convert_floating: 'bool_t' = True, dtype_backend: 'DtypeBackend' = 'numpy_nullable') -> 'Self' method of pandas.core.frame.DataFrame instance
    Convert columns to the best possible dtypes using dtypes supporting ``pd.NA``.

    Parameters
    ----------
    infer_objects : bool, default True
        Whether object dtypes should be converted to the best possible types.
    convert_string : bool, default True
        Whether object dtypes should be converted to ``StringDtype()``.
    convert_integer : bool, default True
        Whether, if possible, conversion can be done to integer extension types.
    convert_boolean : bool, defaults True
        Whether object dtypes should be converted to ``BooleanDtypes()``.
    convert_floating : bool, defaults True
        Whether, if poss

In [6]:
# Column groupings reused when splitting the table into monthly and annual parts.

# Identifier columns, used as id_vars when the monthly columns are reshaped.
primary_key_columns = [
    'plant_id_eia',
    'plant_name_eia',
    'report_year',
    'prime_mover_code',
    'energy_source_code',
]

# Monthly variables are discovered through their "january" column: removing
# the "_january" part leaves the bare variable name.
monthly_variables = [
    name.replace("_january", "")
    for name in pr_gen_fuel.columns
    if name.endswith("january")
]

# Every column whose name begins with one of the monthly variable names.
monthly_columns = [
    name
    for name in pr_gen_fuel.columns
    for prefix in monthly_variables
    if name.startswith(prefix)
]

In [20]:
# Build the monthly table: the wide per-month columns are reshaped so that
# every row describes a single month.

# The monthly key is the annual key with report_year replaced by a date column.
monthly_primary_key_columns = ["date", *(key for key in primary_key_columns if key != "report_year")]


def _melt_monthly_variable(monthly_var):
    """Return one monthly variable in long form, indexed by the monthly key.

    The result has a single value column named after ``monthly_var`` and is
    indexed by ``monthly_primary_key_columns`` so the per-variable frames can
    be concatenated side by side.
    """
    # Keep only the key columns plus this variable's monthly columns.
    value_columns = [name for name in pr_gen_fuel.columns if name.startswith(monthly_var)]
    long_frame = pr_gen_fuel.loc[:, primary_key_columns + value_columns].melt(
        id_vars=primary_key_columns
    )
    # The month name is whatever follows the last underscore of the melted column name.
    month_names = long_frame["variable"].str.rsplit("_", n=1, expand=True)[1]
    # Combine month name and report year into a date.
    long_frame["date"] = pd.to_datetime(
        month_names + long_frame["report_year"].astype(str), format='%B%Y'
    )
    # The year and the melted column name are no longer needed once the date exists.
    return (
        long_frame.drop(columns=["report_year", "variable"])
        .rename(columns={"value": monthly_var})
        .set_index(monthly_primary_key_columns)
    )


monthly_dfs = [_melt_monthly_variable(monthly_var) for monthly_var in monthly_variables]
pr_gen_fuel_monthly = pd.concat(monthly_dfs, axis="columns").reset_index()

In [21]:
# Preview the reshaped monthly table.
pr_gen_fuel_monthly

Unnamed: 0,date,plant_id_eia,plant_name_eia,prime_mover_code,energy_source_code,fuel_consumed_for_electricity_mmbtu,fuel_consumed_for_electricity_units,fuel_consumed_mmbtu,fuel_consumed_units,fuel_mmbtu_per_unit,net_generation_mwh
0,2017-04-01,61014,Pattern Santa Isabel LLC,WT,WND,101260.0,0.0,101260.0,0.0,0.0,10991.0
1,2017-04-01,61034,EcoElectrica,CA,NG,0.0,0.0,0.0,0.0,0.0,86494.0
2,2017-04-01,61034,EcoElectrica,CT,NG,1976130.0,1976130.0,1976130.0,1976130.0,1.0,189669.0
3,2017-04-01,61036,AES ILUMINA,PV,SUN,31886.0,0.0,31886.0,0.0,0.0,3461.0
4,2017-04-01,61082,AES Puerto Rico,ST,BIT,3258736.0,150103.0,3258736.0,150103.0,21.71,310975.0
...,...,...,...,...,...,...,...,...,...,...,...
5395,2025-09-01,61149,Palo Seco Plant,GT,DFO,,,,,,
5396,2025-09-01,61149,Palo Seco Plant,ST,RFO,,,,,,
5397,2025-09-01,61150,Cambalache Plant,GT,DFO,,,,,,
5398,2025-09-01,61151,Mayaguez Plant,GT,DFO,,,,,,


In [12]:
# Whatever remains after removing the monthly columns forms the annual table.
pr_gen_fuel_annual = pr_gen_fuel.drop(labels=monthly_columns, axis="columns")

In [22]:
# Preview the annual table (all non-monthly columns).
pr_gen_fuel_annual

Unnamed: 0,associated_combined_heat_power,census_region,elec_fuel_consumption_mmbtu,electric_fuel_consumption_quantity,energy_source_code,fuel_type_code_agg,fuel_unit,naics_code,nerc_region,nuclear_unit_id,...,plant_state,prime_mover_code,report_year,reporting_frequency_code,sector_id_eia,sector_name_eia,total_fuel_consumption_mmbtu,total_fuel_consumption_quantity,total_net_generation_mwh,data_maturity
0,N,,1024754,0,WND,WND,,22,,,...,PR,WT,2017,,2,NAICS-22 Non-Cogen,1024754,0,111229.0,final
1,Y,,85845,82921,NG,NG,mcf,2122,,,...,PR,CA,2017,,7,Industrial NAICS Cogen,85845,82921,752988.0,final
2,Y,,22632800,21916433,NG,NG,mcf,2122,,,...,PR,CT,2017,,7,Industrial NAICS Cogen,23660771,22909642,2020310.0,final
3,N,,274243,0,SUN,SUN,,22,,,...,PR,PV,2017,,2,NAICS-22 Non-Cogen,274243,0,29767.0,final
4,N,,24372958,1119122,BIT,COL,short tons,22,,,...,PR,ST,2017,,2,NAICS-22 Non-Cogen,24372958,1119122,2315554.0,final
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
445,N,,323594,55792,DFO,DFO,barrels,22,,,...,PR,GT,2025,,1,Electric Utility,323594,55792,25700.0,incremental_ytd
446,N,,1987852,315532,RFO,RFO,barrels,22,,,...,PR,ST,2025,,1,Electric Utility,1987852,315532,192938.0,incremental_ytd
447,N,,833507,143708,DFO,DFO,barrels,22,,,...,PR,GT,2025,,1,Electric Utility,833507,143708,65800.0,incremental_ytd
448,N,,662563,114235,DFO,DFO,barrels,22,,,...,PR,GT,2025,,1,Electric Utility,662563,114235,60462.0,incremental_ytd


In [11]:
# Drop a bad plant: remove the 2020 rows of plant 62410 that have no
# fuel_consumed_for_electricity_mmbtu value.
# NOTE(review): this filter must run after pr_gen_fuel_monthly is built by the
# pivot cell; re-running that cell afterwards brings the rows back.
is_bad_plant_row = (
    pr_gen_fuel_monthly["plant_id_eia"].eq(62410)
    & pr_gen_fuel_monthly["date"].dt.year.eq(2020)
    & pr_gen_fuel_monthly["fuel_consumed_for_electricity_mmbtu"].isna()
)
pr_gen_fuel_monthly = pr_gen_fuel_monthly.loc[~is_bad_plant_row]

In [23]:
# Write the monthly and annual tables out as parquet files.
for table, parquet_path in (
    (pr_gen_fuel_monthly, "../data/eia923__monthly_puerto_rico_generation_fuel.parquet"),
    (pr_gen_fuel_annual, "../data/eia923__annual_puerto_rico_generation_fuel.parquet"),
):
    table.to_parquet(parquet_path)