In [1]:
import pandas as pd
import numpy as np

# TODO: read from parquet files instead?
# NOTE(review): unpickling can execute arbitrary code, so these files must come from a trusted source.
# Load both raw Puerto Rico EIA-923 tables in one pass (generation fuel first, then plant frame).
pr_gen_fuel, pr_plant_frame = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        '../data/raw_eia923__puerto_rico_generation_fuel',
        '../data/raw_eia923__puerto_rico_plant_frame',
    )
)

In [3]:
# Handle EIA null values ("." cells become pd.NA), then convert data types
# (mmbtu/units to numeric) by letting pandas pick NA-aware dtypes per column.
pr_gen_fuel = pr_gen_fuel.replace(".", pd.NA).convert_dtypes()

# TODO:
#   categoricalize: energy_source_code, fuel_type_code_agg, prime_mover_code, reporting_frequency_code, data_maturity, plant_state
#   bool: associated_combined_heat_power

In [43]:
# Pivot fuel_consumed_for_electricity MMBTU columns (one column per month -> one row per month)

# TODO: add fuel units to the index and split index out into own cell
# Identifier columns carried through the melt, as named in the raw frame.
raw_index_cols = ['plant_id_eia', 'plant_name_eia', 'report_year', 'prime_mover_code', 'energy_source_code']
# Index of the reshaped frame; report_year and the month label are replaced by `date`.
clean_index_cols = ['date', 'energy_source_code', 'prime_mover_code', 'plant_id_eia', 'plant_name_eia']

## Only keep the index and relevant fuel_consumed columns
fuel_elec_mmbtu_cols = [
    *raw_index_cols,
    *(name for name in pr_gen_fuel.columns if "fuel_consumed_for_electricity_mmbtu" in name),
]
fuel_elec_mmbtu = pr_gen_fuel[fuel_elec_mmbtu_cols]

## Melt the fuel_consumed columns, then derive the month label and the date.
## `assign` evaluates its keywords in order, so `date` sees the already-stripped `month`.
fuel_elec_mmbtu_melt = pd.melt(
    fuel_elec_mmbtu,
    id_vars=raw_index_cols,
    var_name="month",
    value_name="fuel_consumed_for_electricity_mmbtu",
).assign(
    # Split the month from the variable name
    month=lambda frame: frame["month"].str.replace("fuel_consumed_for_electricity_mmbtu_", ""),
    # Create date from month and year
    date=lambda frame: pd.to_datetime(
        frame["month"] + frame["report_year"].astype(str),
        format='%B%Y',
    ),
)

## Drop old date columns and index by the clean key
fuel_elec_mmbtu_clean = (
    fuel_elec_mmbtu_melt
    .drop(columns=['report_year', 'month'])
    .set_index(clean_index_cols)
)
fuel_elec_mmbtu_clean

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,fuel_consumed_for_electricity_mmbtu
date,energy_source_code,prime_mover_code,plant_id_eia,plant_name_eia,Unnamed: 5_level_1
2017-04-01,WND,WT,61014,Pattern Santa Isabel LLC,101260.0
2017-04-01,NG,CA,61034,EcoElectrica,0.0
2017-04-01,NG,CT,61034,EcoElectrica,1976130.0
2017-04-01,SUN,PV,61036,AES ILUMINA,31886.0
2017-04-01,BIT,ST,61082,AES Puerto Rico,3258736.0
...,...,...,...,...,...
2025-09-01,DFO,GT,61149,Palo Seco Plant,
2025-09-01,RFO,ST,61149,Palo Seco Plant,
2025-09-01,DFO,GT,61150,Cambalache Plant,
2025-09-01,DFO,GT,61151,Mayaguez Plant,


In [44]:
# Pivot fuel_consumed_for_electricity UNITS columns (one column per month -> one row per month)

## Only keep the index and relevant fuel_consumed columns
fuel_elec_units_cols = [
    *raw_index_cols,
    *(name for name in pr_gen_fuel.columns if "fuel_consumed_for_electricity_units" in name),
]
fuel_elec_units = pr_gen_fuel[fuel_elec_units_cols]

## Melt the fuel_consumed columns, then derive the month label and the date.
## `assign` evaluates its keywords in order, so `date` sees the already-stripped `month`.
fuel_elec_units_melt = pd.melt(
    fuel_elec_units,
    id_vars=raw_index_cols,
    var_name="month",
    value_name="fuel_consumed_for_electricity_units",
).assign(
    # Split the month from the variable name
    month=lambda frame: frame["month"].str.replace("fuel_consumed_for_electricity_units_", ""),
    # Create date from month and year
    date=lambda frame: pd.to_datetime(
        frame["month"] + frame["report_year"].astype(str),
        format='%B%Y',
    ),
)

## Drop old date columns and index by the clean key
fuel_elec_units_clean = (
    fuel_elec_units_melt
    .drop(columns=['report_year', 'month'])
    .set_index(clean_index_cols)
)
fuel_elec_units_clean

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,fuel_consumed_for_electricity_units
date,energy_source_code,prime_mover_code,plant_id_eia,plant_name_eia,Unnamed: 5_level_1
2017-04-01,WND,WT,61014,Pattern Santa Isabel LLC,0.0
2017-04-01,NG,CA,61034,EcoElectrica,0.0
2017-04-01,NG,CT,61034,EcoElectrica,1976130.0
2017-04-01,SUN,PV,61036,AES ILUMINA,0.0
2017-04-01,BIT,ST,61082,AES Puerto Rico,150103.0
...,...,...,...,...,...
2025-09-01,DFO,GT,61149,Palo Seco Plant,
2025-09-01,RFO,ST,61149,Palo Seco Plant,
2025-09-01,DFO,GT,61150,Cambalache Plant,
2025-09-01,DFO,GT,61151,Mayaguez Plant,


In [45]:
# Pivot fuel_consumed MMBTU columns (one column per month -> one row per month)

## Only keep the index and relevant fuel_consumed columns
fuel_mmbtu_cols = [
    *raw_index_cols,
    *(name for name in pr_gen_fuel.columns if "fuel_consumed_mmbtu" in name),
]
fuel_mmbtu = pr_gen_fuel[fuel_mmbtu_cols]

## Melt the fuel_consumed columns, then derive the month label and the date.
## `assign` evaluates its keywords in order, so `date` sees the already-stripped `month`.
fuel_mmbtu_melt = pd.melt(
    fuel_mmbtu,
    id_vars=raw_index_cols,
    var_name="month",
    value_name="fuel_consumed_mmbtu",
).assign(
    # Split the month from the variable name
    month=lambda frame: frame["month"].str.replace("fuel_consumed_mmbtu_", ""),
    # Create date from month and year
    date=lambda frame: pd.to_datetime(
        frame["month"] + frame["report_year"].astype(str),
        format='%B%Y',
    ),
)

## Drop old date columns and index by the clean key
fuel_mmbtu_clean = (
    fuel_mmbtu_melt
    .drop(columns=['report_year', 'month'])
    .set_index(clean_index_cols)
)
fuel_mmbtu_clean

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,fuel_consumed_mmbtu
date,energy_source_code,prime_mover_code,plant_id_eia,plant_name_eia,Unnamed: 5_level_1
2017-04-01,WND,WT,61014,Pattern Santa Isabel LLC,101260.0
2017-04-01,NG,CA,61034,EcoElectrica,0.0
2017-04-01,NG,CT,61034,EcoElectrica,1976130.0
2017-04-01,SUN,PV,61036,AES ILUMINA,31886.0
2017-04-01,BIT,ST,61082,AES Puerto Rico,3258736.0
...,...,...,...,...,...
2025-09-01,DFO,GT,61149,Palo Seco Plant,
2025-09-01,RFO,ST,61149,Palo Seco Plant,
2025-09-01,DFO,GT,61150,Cambalache Plant,
2025-09-01,DFO,GT,61151,Mayaguez Plant,


In [46]:
# Pivot fuel_consumed UNITS columns (one column per month -> one row per month)

## Only keep the index and relevant fuel_consumed columns
fuel_units_cols = [
    *raw_index_cols,
    *(name for name in pr_gen_fuel.columns if "fuel_consumed_units" in name),
]
fuel_units = pr_gen_fuel[fuel_units_cols]

## Melt the fuel_consumed columns, then derive the month label and the date.
## `assign` evaluates its keywords in order, so `date` sees the already-stripped `month`.
fuel_units_melt = pd.melt(
    fuel_units,
    id_vars=raw_index_cols,
    var_name="month",
    value_name="fuel_consumed_units",
).assign(
    # Split the month from the variable name
    month=lambda frame: frame["month"].str.replace("fuel_consumed_units_", ""),
    # Create date from month and year
    date=lambda frame: pd.to_datetime(
        frame["month"] + frame["report_year"].astype(str),
        format='%B%Y',
    ),
)

## Drop old date columns and index by the clean key
fuel_units_clean = (
    fuel_units_melt
    .drop(columns=['report_year', 'month'])
    .set_index(clean_index_cols)
)
fuel_units_clean

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,fuel_consumed_units
date,energy_source_code,prime_mover_code,plant_id_eia,plant_name_eia,Unnamed: 5_level_1
2017-04-01,WND,WT,61014,Pattern Santa Isabel LLC,0.0
2017-04-01,NG,CA,61034,EcoElectrica,0.0
2017-04-01,NG,CT,61034,EcoElectrica,1976130.0
2017-04-01,SUN,PV,61036,AES ILUMINA,0.0
2017-04-01,BIT,ST,61082,AES Puerto Rico,150103.0
...,...,...,...,...,...
2025-09-01,DFO,GT,61149,Palo Seco Plant,
2025-09-01,RFO,ST,61149,Palo Seco Plant,
2025-09-01,DFO,GT,61150,Cambalache Plant,
2025-09-01,DFO,GT,61151,Mayaguez Plant,


In [47]:
# Pivot net_generation columns (one column per month -> one row per month)

## Only keep the index and the columns whose name starts with net_generation_mwh
net_gen_cols = [
    *raw_index_cols,
    *(name for name in pr_gen_fuel.columns if name.startswith("net_generation_mwh")),
]
net_gen = pr_gen_fuel[net_gen_cols]

## Melt the net_generation columns, then derive the month label and the date.
## `assign` evaluates its keywords in order, so `date` sees the already-stripped `month`.
net_gen_melt = pd.melt(
    net_gen,
    id_vars=raw_index_cols,
    var_name="month",
    value_name="net_generation_mwh",
).assign(
    # Split the month from the variable name
    month=lambda frame: frame["month"].str.replace("net_generation_mwh_", ""),
    # Create date from month and year
    date=lambda frame: pd.to_datetime(
        frame["month"] + frame["report_year"].astype(str),
        format='%B%Y',
    ),
)

## Drop old date columns and index by the clean key
net_gen_clean = (
    net_gen_melt
    .drop(columns=['report_year', 'month'])
    .set_index(clean_index_cols)
)
net_gen_clean

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,net_generation_mwh
date,energy_source_code,prime_mover_code,plant_id_eia,plant_name_eia,Unnamed: 5_level_1
2017-04-01,WND,WT,61014,Pattern Santa Isabel LLC,10991.0
2017-04-01,NG,CA,61034,EcoElectrica,86494.0
2017-04-01,NG,CT,61034,EcoElectrica,189669.0
2017-04-01,SUN,PV,61036,AES ILUMINA,3461.0
2017-04-01,BIT,ST,61082,AES Puerto Rico,310975.0
...,...,...,...,...,...
2025-09-01,DFO,GT,61149,Palo Seco Plant,
2025-09-01,RFO,ST,61149,Palo Seco Plant,
2025-09-01,DFO,GT,61150,Cambalache Plant,
2025-09-01,DFO,GT,61151,Mayaguez Plant,


In [57]:
# Join the five long-format frames side by side on their shared index,
# then turn the index levels back into ordinary columns.
monthly_value_frames = [
    fuel_elec_mmbtu_clean,
    fuel_elec_units_clean,
    fuel_mmbtu_clean,
    fuel_units_clean,
    net_gen_clean,
]
pr_gen_fuel_clean = pd.concat(monthly_value_frames, axis=1).reset_index()
pr_gen_fuel_clean

Unnamed: 0,date,energy_source_code,prime_mover_code,plant_id_eia,plant_name_eia,fuel_consumed_for_electricity_mmbtu,fuel_consumed_for_electricity_units,fuel_consumed_mmbtu,fuel_consumed_units,net_generation_mwh
0,2017-04-01,WND,WT,61014,Pattern Santa Isabel LLC,101260.0,0.0,101260.0,0.0,10991.0
1,2017-04-01,NG,CA,61034,EcoElectrica,0.0,0.0,0.0,0.0,86494.0
2,2017-04-01,NG,CT,61034,EcoElectrica,1976130.0,1976130.0,1976130.0,1976130.0,189669.0
3,2017-04-01,SUN,PV,61036,AES ILUMINA,31886.0,0.0,31886.0,0.0,3461.0
4,2017-04-01,BIT,ST,61082,AES Puerto Rico,3258736.0,150103.0,3258736.0,150103.0,310975.0
...,...,...,...,...,...,...,...,...,...,...
5395,2025-09-01,DFO,GT,61149,Palo Seco Plant,,,,,
5396,2025-09-01,RFO,ST,61149,Palo Seco Plant,,,,,
5397,2025-09-01,DFO,GT,61150,Cambalache Plant,,,,,
5398,2025-09-01,DFO,GT,61151,Mayaguez Plant,,,,,


In [58]:
## Drop a bad plant
# Rows to discard: plant 62410, dated in 2020, with no fuel_consumed_for_electricity_mmbtu value.
bad_plant_rows = (
    pr_gen_fuel_clean["plant_id_eia"].eq(62410)
    & pr_gen_fuel_clean["date"].dt.year.eq(2020)
    & pr_gen_fuel_clean["fuel_consumed_for_electricity_mmbtu"].isna()
)
pr_gen_fuel_final = pr_gen_fuel_clean.loc[~bad_plant_rows]

## TODO what about other bad plants / timespans? look for the ones that have lots of NA - what's going on there?

In [60]:
# Display the frame left after the bad-plant rows were filtered out.
pr_gen_fuel_final

Unnamed: 0,date,energy_source_code,prime_mover_code,plant_id_eia,plant_name_eia,fuel_consumed_for_electricity_mmbtu,fuel_consumed_for_electricity_units,fuel_consumed_mmbtu,fuel_consumed_units,net_generation_mwh
0,2017-04-01,WND,WT,61014,Pattern Santa Isabel LLC,101260.0,0.0,101260.0,0.0,10991.0
1,2017-04-01,NG,CA,61034,EcoElectrica,0.0,0.0,0.0,0.0,86494.0
2,2017-04-01,NG,CT,61034,EcoElectrica,1976130.0,1976130.0,1976130.0,1976130.0,189669.0
3,2017-04-01,SUN,PV,61036,AES ILUMINA,31886.0,0.0,31886.0,0.0,3461.0
4,2017-04-01,BIT,ST,61082,AES Puerto Rico,3258736.0,150103.0,3258736.0,150103.0,310975.0
...,...,...,...,...,...,...,...,...,...,...
5395,2025-09-01,DFO,GT,61149,Palo Seco Plant,,,,,
5396,2025-09-01,RFO,ST,61149,Palo Seco Plant,,,,,
5397,2025-09-01,DFO,GT,61150,Cambalache Plant,,,,,
5398,2025-09-01,DFO,GT,61151,Mayaguez Plant,,,,,
