## Preprocessing data

## Libraries

In [1]:
import pandas as pd
import numpy as np

<hr>

## Processing Generation Dataset

### Load dataset

In [2]:
# Load generation dataset
df_gen = pd.read_excel('../../data/in/endog/generation_2020_2025.xlsx')
df_gen.head()

Unnamed: 0,Fecha,Recurso,Tipo Generación,Combustible,Tipo Despacho,Es Menor,Clasificación,0,1,2,...,15,16,17,18,19,20,21,22,23,Version
0,2020-01-01,AGPE - ECOPETROL LA HORMIGA,TERMICA,GAS,NO DESPACHADO CENTRALMENTE,NO,AUTOG PEQ. ESCALA,226.51,223.84,223.12,...,269.53,273.34,273.69,227.1,225.26,256.94,155.93,10.91,11.59,TX5
1,2020-01-01,AGUA FRESCA,HIDRAULICA,AGUA,NO DESPACHADO CENTRALMENTE,SI,NORMAL,7268.4,7268.4,7270.8,...,7024.8,6937.2,6874.8,6802.8,6764.4,6729.6,6714.0,5462.4,5950.8,TX5
2,2020-01-01,ALBAN,HIDRAULICA,AGUA,DESPACHADO CENTRALMENTE,NO,NORMAL,251520.0,251040.0,251280.0,...,238880.0,238880.0,238960.0,244786.0,246944.0,248476.0,250296.0,250590.0,250340.0,TX5
3,2020-01-01,ALEJANDRÍA,HIDRAULICA,AGUA,NO DESPACHADO CENTRALMENTE,SI,NORMAL,14861.0,14608.0,14322.0,...,8536.0,8338.0,8525.0,10065.0,12892.0,14839.0,14927.0,14982.0,14872.0,TX5
4,2020-01-01,ALTO TULUA,HIDRAULICA,AGUA,NO DESPACHADO CENTRALMENTE,SI,NORMAL,0.0,0.0,0.0,...,10908.0,10908.0,10911.6,10908.0,10908.0,10915.2,10915.2,10918.8,10922.4,TX5


In [3]:
print(f"Size dataset: {df_gen.shape[0]} rows and {df_gen.shape[1]} columns")

Size dataset: 448336 rows and 32 columns


### Validate missing values

In [4]:
print("Are there missing values in the dataset?: ", df_gen.isnull().values.any())
print("Missing values by column:")
df_gen.isnull().sum()

Are there missing values in the dataset?:  True
Missing values by column:


Fecha               0
Recurso             0
Tipo Generación     0
Combustible         0
Tipo Despacho       0
Es Menor            0
Clasificación       0
0                  36
1                   5
2                   6
3                   2
4                   0
5                   2
6                   4
7                   5
8                   3
9                   8
10                  2
11                  2
12                  6
13                  3
14                  6
15                  2
16                  0
17                  1
18                  3
19                 50
20                  1
21                  2
22                  3
23                  6
Version             0
dtype: int64

In this case, the missing values represent a contribition of zero (0) to energy generation. Therefore, the missing values will be replaced with 0.

In [5]:
df_gen = df_gen.fillna(0)

In [6]:
print("Are there missing values in the dataset? (after replacement): ", df_gen.isnull().values.any())
print("Missing values by column (after replacement):")
df_gen.isnull().sum()

Are there missing values in the dataset? (after replacement):  False
Missing values by column (after replacement):


Fecha              0
Recurso            0
Tipo Generación    0
Combustible        0
Tipo Despacho      0
Es Menor           0
Clasificación      0
0                  0
1                  0
2                  0
3                  0
4                  0
5                  0
6                  0
7                  0
8                  0
9                  0
10                 0
11                 0
12                 0
13                 0
14                 0
15                 0
16                 0
17                 0
18                 0
19                 0
20                 0
21                 0
22                 0
23                 0
Version            0
dtype: int64

### Standardize "Tipo Generación" values

In [7]:
df_gen['Tipo Generación'].unique()

array(['TERMICA', 'HIDRAULICA', 'SOLAR', 'COGENERADOR', 'EOLICA', 'Solar'],
      dtype=object)

Convert all values ​​in the 'Tipo Generación' column to uppercase

In [8]:
df_gen['Tipo Generación'] = df_gen['Tipo Generación'].str.upper()
df_gen['Tipo Generación'].unique()

array(['TERMICA', 'HIDRAULICA', 'SOLAR', 'COGENERADOR', 'EOLICA'],
      dtype=object)

After validating that there are not missing values in the dataset, the next step is to transform the dataset into the appropiate format.

### Melt columns hours (0-23) -> wide to long format

In [9]:
# Melt columns hours (0-23) -> wide to long format
df_gen_long = df_gen.melt(
    id_vars = ["Fecha", "Tipo Generación"], # Fixed columns
    value_vars = [str(i) for i in range(24)], # Columns to melt (0 to 23)
    var_name = "Hora", # New column name to hours
    value_name="generacion_kwh" # New column name for generation values
)

df_gen_long.head(10)

Unnamed: 0,Fecha,Tipo Generación,Hora,generacion_kwh
0,2020-01-01,TERMICA,0,226.51
1,2020-01-01,HIDRAULICA,0,7268.4
2,2020-01-01,HIDRAULICA,0,251520.0
3,2020-01-01,HIDRAULICA,0,14861.0
4,2020-01-01,HIDRAULICA,0,0.0
5,2020-01-01,HIDRAULICA,0,13917.6
6,2020-01-01,HIDRAULICA,0,625.96
7,2020-01-01,HIDRAULICA,0,72659.88
8,2020-01-01,HIDRAULICA,0,482.71
9,2020-01-01,HIDRAULICA,0,14442.6


### Group and Pivot
Group by Date, Hour and Generation Type (summarized)

In [10]:
# Group by Fecha, Hora and Tipo Generación (summarized)

df_gen_pivot = df_gen_long.groupby(["Fecha", "Hora", "Tipo Generación"])["generacion_kwh"].sum().unstack()
df_gen_pivot.head(25)

Unnamed: 0_level_0,Tipo Generación,COGENERADOR,EOLICA,HIDRAULICA,SOLAR,TERMICA
Fecha,Hora,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-01-01 00:00:00,0,21522.11,3328.1,5244418.15,0.0,1391375.95
2021-01-01 00:00:00,1,21766.38,4063.22,5117544.23,0.0,1325940.54
2021-01-01 00:00:00,10,21301.39,10220.62,5118684.25,102386.27,1017685.76
2021-01-01 00:00:00,11,22571.79,8472.44,5285404.7,110299.13,1044030.63
2021-01-01 00:00:00,12,22660.4,9043.93,5382742.33,102263.57,1096915.81
2021-01-01 00:00:00,13,22550.31,8886.6,5387610.12,89491.0,1161082.1
2021-01-01 00:00:00,14,22865.93,9317.35,5323733.7,79919.87,1180372.44
2021-01-01 00:00:00,15,22454.38,10573.28,5257803.84,64496.33,1160653.89
2021-01-01 00:00:00,16,22528.86,9816.01,5129654.92,30274.2,1280373.91
2021-01-01 00:00:00,17,18137.58,9010.7,5059082.54,4330.74,1435643.03


<hr>

## Processing Spot Prices Dataset

### Load dataset

In [11]:
# Load dataset
df_price = pd.read_excel('../../data/in/endog/spot_prices_2020_2025.xlsx')
df_price.head()

Unnamed: 0,Fecha,0,1,2,3,4,5,6,7,8,...,15,16,17,18,19,20,21,22,23,Versión
0,2020-01-01,72.02,136.71,127.71,127.71,127.71,72.02,72.02,72.02,72.02,...,136.67,127.71,72.02,183.93,406.73,213.73,213.73,213.73,136.71,TX5
1,2020-01-02,71.34,71.34,71.34,71.34,71.34,73.95,73.95,137.88,185.1,...,357.8,357.8,307.9,397.9,387.9,357.8,307.9,307.9,307.9,TX5
2,2020-01-03,116.31,116.31,126.29,126.29,135.29,126.29,123.46,275.31,275.31,...,275.31,275.31,275.31,285.31,285.31,275.31,275.31,275.31,275.31,TX5
3,2020-01-04,144.36,144.36,144.36,144.36,134.34,144.36,134.34,144.36,188.36,...,255.36,255.36,255.36,255.36,255.36,255.36,255.36,255.36,188.36,TX5
4,2020-01-05,209.12,209.12,188.12,188.12,188.12,188.12,180.62,157.12,188.12,...,209.12,188.12,209.12,305.12,305.12,305.12,305.12,305.12,209.12,TX5


In [12]:
print(f"Size dataset: {df_price.shape[0]} rows and {df_price.shape[1]} columns")

Size dataset: 2008 rows and 26 columns


### Validate missing values

In [13]:
print("Are there missing values in the dataset?: ", df_price.isnull().values.any())
print("Missing values by column:")
df_price.isnull().sum()

Are there missing values in the dataset?:  False
Missing values by column:


Fecha      0
0          0
1          0
2          0
3          0
4          0
5          0
6          0
7          0
8          0
9          0
10         0
11         0
12         0
13         0
14         0
15         0
16         0
17         0
18         0
19         0
20         0
21         0
22         0
23         0
Versión    0
dtype: int64

There are not missing values in the dataset. Therefore, no processing is necessary.

### Melt columns hours (0-23) -> wide to long format

In [14]:
# Derretir columnas de horas (0-23)
df_price_long = df_price.melt(
    id_vars = ["Fecha"], # Fixed columns
    value_vars = [str(i) for i in range(24)], # Columns to melt (0 to 23)
    var_name = "Hora", # New column name to hours
    value_name = "Precio" # New column name for price values
)

df_price_long.head()

Unnamed: 0,Fecha,Hora,Precio
0,2020-01-01,0,72.02
1,2020-01-02,0,71.34
2,2020-01-03,0,116.31
3,2020-01-04,0,144.36
4,2020-01-05,0,209.12


<hr>

## Processing fuel consumption dataset

### Load dataset

In [15]:
# Load dataset
df_fuel_comsump = pd.read_excel("../../data/in/exog/fuel_consumption_2020_2025.xlsx", parse_dates=["Fecha"])
df_fuel_comsump.head()

Unnamed: 0,Fecha,Recurso,Código Agente,Combustible,Consumo Combustible (MBTU),Version
0,2020-01-01,BARRANQUILLA 3,TBSG,GAS,0.0,TX5
1,2020-01-01,BARRANQUILLA 3,TBSG,COMBUSTOLEO,0.0,TX5
2,2020-01-01,BARRANQUILLA 3,TBSG,GAS NI,0.0,TX5
3,2020-01-01,BARRANQUILLA 4,TBSG,GAS,0.0,TX5
4,2020-01-01,BARRANQUILLA 4,TBSG,COMBUSTOLEO,0.0,TX5


In [16]:
print(f"Size dataset: {df_fuel_comsump.shape[0]} rows and {df_fuel_comsump.shape[1]} columns")

Size dataset: 92793 rows and 6 columns


### Validate missing values

In [17]:
print("Are there missing values in the dataset?: ", df_fuel_comsump.isnull().values.any())
print("Missing values by column:")
df_fuel_comsump.isnull().sum()

Are there missing values in the dataset?:  False
Missing values by column:


Fecha                         0
Recurso                       0
Código Agente                 0
Combustible                   0
Consumo Combustible (MBTU)    0
Version                       0
dtype: int64

There are not missing values in the dataset. Therefore, no processing is necessary.

### Standardize "Combustible" values

In [18]:
df_fuel_comsump['Combustible'].unique()

array(['GAS', 'COMBUSTOLEO', 'GAS NI', 'CARBON', 'ACPM', 'QUEROSENE',
       'CRUDO', 'GLP'], dtype=object)

The "Combustible" values are standardized.

After validating that there are not missing values in the dataset, the next step is to transform the dataset into the appropiate format.

### Group by date and fuel, and add up the consumption

In [19]:
df_fuel_comsump_group = df_fuel_comsump.groupby(['Fecha', 'Combustible'])['Consumo Combustible (MBTU)'].sum().reset_index()
df_fuel_comsump_group.head(10)

Unnamed: 0,Fecha,Combustible,Consumo Combustible (MBTU)
0,2020-01-01,ACPM,0.0
1,2020-01-01,CARBON,234788.670155
2,2020-01-01,COMBUSTOLEO,0.0
3,2020-01-01,GAS,167262.229
4,2020-01-01,GAS NI,0.0
5,2020-01-01,QUEROSENE,0.0
6,2020-01-02,ACPM,431.0
7,2020-01-02,CARBON,229485.340893
8,2020-01-02,COMBUSTOLEO,0.0
9,2020-01-02,GAS,240709.2757


### Pivot DataFrame to have one column for each fuel type

In [20]:
df_fuel_comsump_pivot = df_fuel_comsump_group.pivot(index='Fecha', columns='Combustible', values='Consumo Combustible (MBTU)')

df_fuel_comsump_pivot = df_fuel_comsump_pivot.fillna(0)

# Reset index
df_fuel_comsump_pivot = df_fuel_comsump_pivot.reset_index()

# Rename columns
df_fuel_comsump_pivot.columns = ["FUEL_CONS_" + col.strip().upper().replace(" ", "_").replace("-", "_") 
                                for col in df_fuel_comsump_pivot.columns]

df_fuel_comsump_pivot = df_fuel_comsump_pivot.rename(columns={"FUEL_CONS_FECHA": "FECHA"})

df_fuel_comsump_pivot.head(10)

Unnamed: 0,FECHA,FUEL_CONS_ACPM,FUEL_CONS_CARBON,FUEL_CONS_COMBUSTOLEO,FUEL_CONS_CRUDO,FUEL_CONS_GAS,FUEL_CONS_GAS_NI,FUEL_CONS_GLP,FUEL_CONS_QUEROSENE
0,2020-01-01,0.0,234788.670155,0.0,0.0,167262.229,0.0,0.0,0.0
1,2020-01-02,431.0,229485.340893,0.0,0.0,240709.2757,0.0,0.0,0.0
2,2020-01-03,624.0,244625.058498,0.0,0.0,176133.1623,0.0,0.0,0.0
3,2020-01-04,270.24,246854.779129,0.0,0.0,207664.8132,0.0,0.0,0.0
4,2020-01-05,227.47,232745.678846,0.0,0.0,181604.2601,0.0,0.0,0.0
5,2020-01-06,62.87,257606.363626,0.0,0.0,177081.8053,0.0,0.0,0.0
6,2020-01-07,0.0,320653.3814,0.0,0.0,187409.5422,0.0,0.0,0.0
7,2020-01-08,0.0,341092.243844,0.0,0.0,196835.9927,0.0,0.0,0.0
8,2020-01-09,0.0,340148.877333,0.0,0.0,187130.0714,322.08,0.0,0.0
9,2020-01-10,0.0,312694.320108,0.0,0.0,166863.192,0.0,0.0,0.0


In [21]:
# Count unique values for each column
print("Total unique values for each column:")
df_fuel_comsump_pivot.nunique()

Total unique values for each column:


FECHA                    2008
FUEL_CONS_ACPM           1708
FUEL_CONS_CARBON         2004
FUEL_CONS_COMBUSTOLEO     506
FUEL_CONS_CRUDO            88
FUEL_CONS_GAS            2008
FUEL_CONS_GAS_NI         1026
FUEL_CONS_GLP              23
FUEL_CONS_QUEROSENE         1
dtype: int64

In [22]:
df_fuel_comsump_pivot['FUEL_CONS_QUEROSENE'].unique()

array([0.])

Because there is only one value "0" for the "QUEROSENE" fuel type, it can be removed.

In [23]:
df_fuel_comsump_f = df_fuel_comsump_pivot.drop(columns=["FUEL_CONS_QUEROSENE"])
df_fuel_comsump_f.head()

Unnamed: 0,FECHA,FUEL_CONS_ACPM,FUEL_CONS_CARBON,FUEL_CONS_COMBUSTOLEO,FUEL_CONS_CRUDO,FUEL_CONS_GAS,FUEL_CONS_GAS_NI,FUEL_CONS_GLP
0,2020-01-01,0.0,234788.670155,0.0,0.0,167262.229,0.0,0.0
1,2020-01-02,431.0,229485.340893,0.0,0.0,240709.2757,0.0,0.0
2,2020-01-03,624.0,244625.058498,0.0,0.0,176133.1623,0.0,0.0
3,2020-01-04,270.24,246854.779129,0.0,0.0,207664.8132,0.0,0.0
4,2020-01-05,227.47,232745.678846,0.0,0.0,181604.2601,0.0,0.0


<hr>

## Processing fuel cost dataset

### Load dataset

In [24]:
# Load dataset
df_fuel_cost = pd.read_excel("../../data/in/exog/fuel_supply_cost_2020_2025.xlsx", parse_dates=["Fecha"])

# Correct columns names (UPPER CASE)
df_fuel_cost.columns = [col.upper() for col in df_fuel_cost.columns]

df_fuel_cost.head()

Unnamed: 0,FECHA,CARBON,GAS,GAS NI,COMBUSTOLEO
0,2020-01-01,130.459523,240.075697,,585.085344
1,2020-01-02,130.459523,240.075697,,585.085344
2,2020-01-03,130.459523,240.075697,,585.085344
3,2020-01-04,122.3708,238.169035,,585.085344
4,2020-01-05,122.3708,238.169035,,585.085344


### Validate missing values

In [25]:
print("Are there missing values in the dataset?: ", df_fuel_cost.isnull().values.any())
print("Missing values by column:")
df_fuel_cost.isnull().sum()

Are there missing values in the dataset?:  True
Missing values by column:


FECHA            0
CARBON          59
GAS              0
GAS NI         119
COMBUSTOLEO      0
dtype: int64

In this case, the missing values represent a contribition of zero (0) to cost fuel. Therefore, the missing values will be replaced with 0.

In [26]:
df_fuel_cost = df_fuel_cost.fillna(0)

In [27]:
print("Are there missing values in the dataset? (after replacement): ", df_fuel_cost.isnull().values.any())
print("Missing values by column (after replacement):")
df_fuel_cost.isnull().sum()

Are there missing values in the dataset? (after replacement):  False
Missing values by column (after replacement):


FECHA          0
CARBON         0
GAS            0
GAS NI         0
COMBUSTOLEO    0
dtype: int64

In [28]:
df_fuel_cost.columns = ["FUEL_COST_" + col.strip().upper().replace(" ", "_").replace("-", "_") for col in df_fuel_cost.columns]

df_fuel_cost = df_fuel_cost.rename(columns={"FUEL_COST_FECHA": "FECHA"})

df_fuel_cost.head()

Unnamed: 0,FECHA,FUEL_COST_CARBON,FUEL_COST_GAS,FUEL_COST_GAS_NI,FUEL_COST_COMBUSTOLEO
0,2020-01-01,130.459523,240.075697,0.0,585.085344
1,2020-01-02,130.459523,240.075697,0.0,585.085344
2,2020-01-03,130.459523,240.075697,0.0,585.085344
3,2020-01-04,122.3708,238.169035,0.0,585.085344
4,2020-01-05,122.3708,238.169035,0.0,585.085344


<hr>

## Processing ENSO dataset

### Load dataset

In [29]:
# Load dataset
df_enso = pd.read_csv("../../data/in/exog/ENSO_2020_2025.csv", parse_dates=["FECHA"])
df_enso.head()

Unnamed: 0,FECHA,ENSO,NIVEL_ENSO
0,2020-01-01,0.5,1
1,2020-02-01,0.5,1
2,2020-03-01,0.4,0
3,2020-04-01,0.2,0
4,2020-05-01,-0.1,0


In [30]:
print(f"Size dataset: {df_enso.shape[0]} rows and {df_enso.shape[1]} columns")

Size dataset: 67 rows and 3 columns


### Validate missing values

In [31]:
print("Are there missing values in the dataset?: ", df_enso.isnull().values.any())
print("Missing values by column:")
df_enso.isnull().sum()

Are there missing values in the dataset?:  False
Missing values by column:


FECHA         0
ENSO          0
NIVEL_ENSO    0
dtype: int64

There are not missing values in the dataset. Therefore, no processing is necessary.

### Drop columns

In [32]:
df_enso = df_enso.drop(columns=["ENSO"])
df_enso.head()

Unnamed: 0,FECHA,NIVEL_ENSO
0,2020-01-01,1
1,2020-02-01,1
2,2020-03-01,0
3,2020-04-01,0
4,2020-05-01,0


### Autofill all days of the month with the ENSO for the 1st of each month

In [33]:
df_enso['FECHA'] = pd.to_datetime(df_enso['FECHA'])         # 1) Convierte a datetime
df_enso['PERIODO'] = df_enso['FECHA'].dt.to_period('M')     # 2) Crea una columna de periodo mensual

# 3) Fijar como índice ese periodo y re-muestrear a diario
df_enso = (
    df_enso.groupby('PERIODO').mean(numeric_only=True)  # o .first(), .last(), según tu caso
          .to_timestamp()
          .resample('D')
          .ffill()
          .reset_index()
)
df_enso['FECHA'] = df_enso['PERIODO']

# 4) Reconstruir la columna de fecha diaria
df_enso['FECHA'] = df_enso['PERIODO']

# 5) Reordenar columnas
cols = ['FECHA'] + [c for c in df_enso.columns if c not in ('FECHA', 'PERIODO')]
df_enso = df_enso[cols]

# Ahora df_enso tiene una fila por cada día del periodo, con los datos mensuales repetidos
df_enso.head(10)


Unnamed: 0,FECHA,NIVEL_ENSO
0,2020-01-01,1.0
1,2020-01-02,1.0
2,2020-01-03,1.0
3,2020-01-04,1.0
4,2020-01-05,1.0
5,2020-01-06,1.0
6,2020-01-07,1.0
7,2020-01-08,1.0
8,2020-01-09,1.0
9,2020-01-10,1.0


In [34]:
df_enso.tail(10)

Unnamed: 0,FECHA,NIVEL_ENSO
1999,2025-06-22,0.0
2000,2025-06-23,0.0
2001,2025-06-24,0.0
2002,2025-06-25,0.0
2003,2025-06-26,0.0
2004,2025-06-27,0.0
2005,2025-06-28,0.0
2006,2025-06-29,0.0
2007,2025-06-30,0.0
2008,2025-07-01,0.0


<hr>

## Processing IPC dataset

### Load dataset

In [35]:
# Load dataset
df_ipc = pd.read_csv("../../data/in/exog/IPC_2020_2025.csv", parse_dates=["FECHA"])
df_ipc.head()

Unnamed: 0,FECHA,IPC_VAR_MOM_PCT,IPC_VAR_YOY_PCT
0,2019-01-01,0.6,3.8
1,2019-02-01,0.57,3.8
2,2019-03-01,0.43,3.8
3,2019-04-01,0.5,3.8
4,2019-05-01,0.31,3.8


### Validate missing values

In [36]:
print("Are there missing values in the dataset?: ", df_ipc.isnull().values.any())
print("Missing values by column:")
df_ipc.isnull().sum()

Are there missing values in the dataset?:  False
Missing values by column:


FECHA              0
IPC_VAR_MOM_PCT    0
IPC_VAR_YOY_PCT    0
dtype: int64

There are not missing values in the dataset. Therefore, no processing is necessary.

### Calculate the percentage of IPC variation by month and year

In [37]:
df_ipc.set_index('FECHA', inplace=True)

# Calculate month-over-month percent change (compared to previous month)
#df_ipc['IPC_MOM_PCT'] = df_ipc['IPC'].pct_change(periods=1) * 100

# Calculate year-over-year percent change (compared to same month last year)
#df_ipc['IPC_YOY_PCT'] = df_ipc['IPC'].pct_change(periods=12) * 100

# Reset index
df_ipc = df_ipc.reset_index()

# Exclude rows from 2019
df_ipc = df_ipc[df_ipc['FECHA'].dt.year != 2019]

# Replace infinite values with NaN
#df_ipc.replace([np.inf, -np.inf], -1, inplace=True)

df_ipc.head(12)

Unnamed: 0,FECHA,IPC_VAR_MOM_PCT,IPC_VAR_YOY_PCT
12,2020-01-01,0.42,1.61
13,2020-02-01,0.67,1.61
14,2020-03-01,0.57,1.61
15,2020-04-01,0.16,1.61
16,2020-05-01,-0.32,1.61
17,2020-06-01,-0.38,1.61
18,2020-07-01,0.0,1.61
19,2020-08-01,-0.01,1.61
20,2020-09-01,0.32,1.61
21,2020-10-01,-0.06,1.61


### Autofill all days of the month with the IPC for the 1st of each month

In [38]:
df_ipc['FECHA'] = pd.to_datetime(df_ipc['FECHA'])         # 1) Convierte a datetime
df_ipc['PERIODO'] = df_ipc['FECHA'].dt.to_period('M')     # 2) Crea una columna de periodo mensual

# 3) Fijar como índice ese periodo y re-muestrear a diario
df_ipc = (
    df_ipc.set_index('PERIODO')
          .to_timestamp()            # Convertir PeriodIndex a DatetimeIndex
          .resample('D')        # pasar a frecuencia diaria
          .ffill()              # rellenar hacia adelante
          .drop(columns=['FECHA'])  # eliminar columna fecha original
          .reset_index()        # volver a tener columna 'PERIODO'
)

# 4) Reconstruir la columna de fecha diaria
df_ipc['FECHA'] = df_ipc['PERIODO']

# 5) Reordenar columnas
cols = ['FECHA'] + [c for c in df_ipc.columns if c not in ('FECHA', 'PERIODO')]
df_ipc = df_ipc[cols]

# Ahora df_ipc tiene una fila por cada día del periodo, con los datos mensuales repetidos
df_ipc.head(10)


Unnamed: 0,FECHA,IPC_VAR_MOM_PCT,IPC_VAR_YOY_PCT
0,2020-01-01,0.42,1.61
1,2020-01-02,0.42,1.61
2,2020-01-03,0.42,1.61
3,2020-01-04,0.42,1.61
4,2020-01-05,0.42,1.61
5,2020-01-06,0.42,1.61
6,2020-01-07,0.42,1.61
7,2020-01-08,0.42,1.61
8,2020-01-09,0.42,1.61
9,2020-01-10,0.42,1.61


In [39]:
df_ipc.tail(10)

Unnamed: 0,FECHA,IPC_VAR_MOM_PCT,IPC_VAR_YOY_PCT
1999,2025-06-22,0.1,3.3
2000,2025-06-23,0.1,3.3
2001,2025-06-24,0.1,3.3
2002,2025-06-25,0.1,3.3
2003,2025-06-26,0.1,3.3
2004,2025-06-27,0.1,3.3
2005,2025-06-28,0.1,3.3
2006,2025-06-29,0.1,3.3
2007,2025-06-30,0.1,3.3
2008,2025-07-01,0.0,0.0


<hr>

## Processing IPP dataset

### Load dataset

In [40]:
# Load dataset
df_ipp = pd.read_csv("../../data/in/exog/IPP_2020_2025.csv", parse_dates=["FECHA"])
df_ipp.head()

Unnamed: 0,FECHA,IPP_PRODUCCION_NACIONAL,IPP_OFERTA_INTERNA
0,2019-01-01,114.53,117.18
1,2019-02-01,115.6,117.38
2,2019-03-01,116.37,117.82
3,2019-04-01,118.03,118.86
4,2019-05-01,119.91,120.19


### Validate missing values

In [41]:
print("Are there missing values in the dataset?: ", df_ipp.isnull().values.any())
print("Missing values by column:")
df_ipp.isnull().sum()

Are there missing values in the dataset?:  False
Missing values by column:


FECHA                      0
IPP_PRODUCCION_NACIONAL    0
IPP_OFERTA_INTERNA         0
dtype: int64

There are not missing values in the dataset. Therefore, no processing is necessary.

### Calculate the percentage of IPP variation by month and year

In [42]:
df_ipp.set_index('FECHA', inplace=True)

# Calculate month-over-month percent change (compared to previous month)
df_ipp['IPP_VAR_PN_MOM_PCT'] = (df_ipp['IPP_PRODUCCION_NACIONAL'].pct_change(periods=1) * 100).round(2)
df_ipp['IPP_VAR_OI_MOM_PCT'] = (df_ipp['IPP_OFERTA_INTERNA'].pct_change(periods=1) * 100).round(2)

# Calculate year-over-year percent change (compared to same month last year)
df_ipp['IPP_VAR_PN_YOY_PCT'] = (df_ipp['IPP_PRODUCCION_NACIONAL'].pct_change(periods=12) * 100).round(2)
df_ipp['IPP_VAR_OI_YOY_PCT'] = (df_ipp['IPP_OFERTA_INTERNA'].pct_change(periods=12) * 100).round(2)

# Reset index
df_ipp = df_ipp.reset_index()

# Exclude rows from 2019
df_ipp = df_ipp[df_ipp['FECHA'].dt.year != 2019]

# Replace infinite values with NaN
df_ipp.replace([np.inf, -np.inf], -9999, inplace=True)

df_ipp.head(12)

Unnamed: 0,FECHA,IPP_PRODUCCION_NACIONAL,IPP_OFERTA_INTERNA,IPP_VAR_PN_MOM_PCT,IPP_VAR_OI_MOM_PCT,IPP_VAR_PN_YOY_PCT,IPP_VAR_OI_YOY_PCT
12,2020-01-01,119.91,122.34,-0.73,-0.02,4.7,4.4
13,2020-02-01,118.69,122.34,-1.02,0.0,2.67,4.23
14,2020-03-01,116.16,123.27,-2.13,0.76,-0.18,4.63
15,2020-04-01,112.33,122.59,-3.3,-0.55,-4.83,3.14
16,2020-05-01,113.73,122.5,1.25,-0.07,-5.15,1.92
17,2020-06-01,115.73,122.59,1.76,0.07,-1.92,2.36
18,2020-07-01,117.49,122.76,1.52,0.14,-1.0,1.68
19,2020-08-01,119.31,123.54,1.55,0.64,0.34,1.6
20,2020-09-01,118.78,123.7,-0.44,0.13,-1.3,1.2
21,2020-10-01,119.52,124.42,0.62,0.58,-0.68,1.52


In [43]:
count_neg_ones = (df_ipp == -9999).sum().sum()
print(f"Total de valores -9999 en el dataset: {count_neg_ones}")

Total de valores -9999 en el dataset: 0


### Autofill all days of the month with the IPP for the 1st of each month

In [44]:
df_ipp['FECHA'] = pd.to_datetime(df_ipp['FECHA'])         # 1) Convierte a datetime
df_ipp['PERIODO'] = df_ipp['FECHA'].dt.to_period('M')     # 2) Crea una columna de periodo mensual

# 3) Fijar como índice ese periodo y re-muestrear a diario
df_ipp = (
    df_ipp.set_index('PERIODO')
          .to_timestamp()            # Convertir PeriodIndex a DatetimeIndex
          .resample('D')        # pasar a frecuencia diaria
          .ffill()              # rellenar hacia adelante
          .drop(columns=['FECHA'])  # eliminar columna fecha original
          .reset_index()        # volver a tener columna 'PERIODO'
)

# 4) Reconstruir la columna de fecha diaria
df_ipp['FECHA'] = df_ipp['PERIODO']

# 5) Reordenar columnas
cols = ['FECHA'] + [c for c in df_ipp.columns if c not in ('FECHA', 'PERIODO', 'IPP_PRODUCCION_NACIONAL', 'IPP_OFERTA_INTERNA')]
df_ipp = df_ipp[cols]

# Ahora df_ipc tiene una fila por cada día del periodo, con los datos mensuales repetidos
df_ipp.head(10)


Unnamed: 0,FECHA,IPP_VAR_PN_MOM_PCT,IPP_VAR_OI_MOM_PCT,IPP_VAR_PN_YOY_PCT,IPP_VAR_OI_YOY_PCT
0,2020-01-01,-0.73,-0.02,4.7,4.4
1,2020-01-02,-0.73,-0.02,4.7,4.4
2,2020-01-03,-0.73,-0.02,4.7,4.4
3,2020-01-04,-0.73,-0.02,4.7,4.4
4,2020-01-05,-0.73,-0.02,4.7,4.4
5,2020-01-06,-0.73,-0.02,4.7,4.4
6,2020-01-07,-0.73,-0.02,4.7,4.4
7,2020-01-08,-0.73,-0.02,4.7,4.4
8,2020-01-09,-0.73,-0.02,4.7,4.4
9,2020-01-10,-0.73,-0.02,4.7,4.4


In [45]:
df_ipp.tail(10)

Unnamed: 0,FECHA,IPP_VAR_PN_MOM_PCT,IPP_VAR_OI_MOM_PCT,IPP_VAR_PN_YOY_PCT,IPP_VAR_OI_YOY_PCT
1999,2025-06-22,-0.28,-0.63,2.09,2.01
2000,2025-06-23,-0.28,-0.63,2.09,2.01
2001,2025-06-24,-0.28,-0.63,2.09,2.01
2002,2025-06-25,-0.28,-0.63,2.09,2.01
2003,2025-06-26,-0.28,-0.63,2.09,2.01
2004,2025-06-27,-0.28,-0.63,2.09,2.01
2005,2025-06-28,-0.28,-0.63,2.09,2.01
2006,2025-06-29,-0.28,-0.63,2.09,2.01
2007,2025-06-30,-0.28,-0.63,2.09,2.01
2008,2025-07-01,-100.0,-100.0,-100.0,-100.0


<hr>

## Merge datasets

### Merge generation and spot prices datasets

In [46]:
# Merge by Fecha & Hour
df = pd.merge(
    df_gen_pivot,
    df_price_long,
    on = ["Fecha", "Hora"],
    how = "inner" # Keep only existing dates-hours presents in both datasets
)

df.head()

Unnamed: 0,Fecha,Hora,COGENERADOR,EOLICA,HIDRAULICA,SOLAR,TERMICA,Precio
0,2021-01-01 00:00:00,0,21522.11,3328.1,5244418.15,0.0,1391375.95,142.31
1,2021-01-01 00:00:00,1,21766.38,4063.22,5117544.23,0.0,1325940.54,142.31
2,2021-01-01 00:00:00,10,21301.39,10220.62,5118684.25,102386.27,1017685.76,142.31
3,2021-01-01 00:00:00,11,22571.79,8472.44,5285404.7,110299.13,1044030.63,142.31
4,2021-01-01 00:00:00,12,22660.4,9043.93,5382742.33,102263.57,1096915.81,153.31


### Order columns

In [47]:
# Order columns (Fecha, Hora, Precio, Tipos de Generación)
column_order = ["Fecha", "Hora", "Precio"] + df_gen["Tipo Generación"].unique().tolist()

# Reeplace NaN with 0 value
df = df[column_order].fillna(0)

# Correct columns names (UPPER CASE) (eg.: "Solar" vs "SOLAR")
df.columns = [col.upper() for col in df.columns]

df.head()

Unnamed: 0,FECHA,HORA,PRECIO,TERMICA,HIDRAULICA,SOLAR,COGENERADOR,EOLICA
0,2021-01-01 00:00:00,0,142.31,1391375.95,5244418.15,0.0,21522.11,3328.1
1,2021-01-01 00:00:00,1,142.31,1325940.54,5117544.23,0.0,21766.38,4063.22
2,2021-01-01 00:00:00,10,142.31,1017685.76,5118684.25,102386.27,21301.39,10220.62
3,2021-01-01 00:00:00,11,142.31,1044030.63,5285404.7,110299.13,22571.79,8472.44
4,2021-01-01 00:00:00,12,153.31,1096915.81,5382742.33,102263.57,22660.4,9043.93


In [48]:
# Order DataFrame by Date and Hour

# FECHA: string to date
df["FECHA"] = pd.to_datetime(df["FECHA"])

# Hora: string to int
df["HORA"] = df["HORA"].astype(int)

# Order by Fecha y Hora
df = df.sort_values(
    by=["FECHA", "HORA"], 
    ascending=[True, True]
).reset_index(drop=True)  # Reset index to maintain order

df.head(25)

Unnamed: 0,FECHA,HORA,PRECIO,TERMICA,HIDRAULICA,SOLAR,COGENERADOR,EOLICA
0,2020-01-01,0,72.02,2109637.93,4418103.51,0.0,28880.74,4291.34
1,2020-01-01,1,136.71,1864989.8,4548853.97,0.0,29107.37,3511.94
2,2020-01-01,2,127.71,1720883.01,4525898.75,0.0,25939.87,2641.9
3,2020-01-01,3,127.71,1649014.61,4458645.56,0.0,25651.57,3032.3
4,2020-01-01,4,127.71,1711745.36,4279657.9,0.0,25245.37,2514.76
5,2020-01-01,5,72.02,1571431.28,4311552.05,0.0,22696.46,980.62
6,2020-01-01,6,72.02,1297795.61,4173921.34,2330.51,19751.12,457.47
7,2020-01-01,7,72.02,1257262.31,4231331.52,23088.0,18429.91,0.0
8,2020-01-01,8,72.02,1339329.04,4334616.11,43158.98,19297.62,526.95
9,2020-01-01,9,72.02,1486541.89,4437534.05,49153.18,19783.46,1515.43


### Merge ENSO dataset

In [49]:
# Merge by FECHA

df = df.merge(
    df_enso,
    on="FECHA",
    how="left"
)

df.head(10)

Unnamed: 0,FECHA,HORA,PRECIO,TERMICA,HIDRAULICA,SOLAR,COGENERADOR,EOLICA,NIVEL_ENSO
0,2020-01-01,0,72.02,2109637.93,4418103.51,0.0,28880.74,4291.34,1.0
1,2020-01-01,1,136.71,1864989.8,4548853.97,0.0,29107.37,3511.94,1.0
2,2020-01-01,2,127.71,1720883.01,4525898.75,0.0,25939.87,2641.9,1.0
3,2020-01-01,3,127.71,1649014.61,4458645.56,0.0,25651.57,3032.3,1.0
4,2020-01-01,4,127.71,1711745.36,4279657.9,0.0,25245.37,2514.76,1.0
5,2020-01-01,5,72.02,1571431.28,4311552.05,0.0,22696.46,980.62,1.0
6,2020-01-01,6,72.02,1297795.61,4173921.34,2330.51,19751.12,457.47,1.0
7,2020-01-01,7,72.02,1257262.31,4231331.52,23088.0,18429.91,0.0,1.0
8,2020-01-01,8,72.02,1339329.04,4334616.11,43158.98,19297.62,526.95,1.0
9,2020-01-01,9,72.02,1486541.89,4437534.05,49153.18,19783.46,1515.43,1.0


In [50]:
# Add column FECHA_HORA (eg.: 2020-01-01 01:00:00)
df["FECHA_HORA"] = pd.to_datetime(df["FECHA"]) + pd.to_timedelta(df["HORA"], unit="h")


# Reorder columns 
column_order = ["FECHA_HORA"] + [col for col in df.columns if col != "FECHA_HORA"]
df = df[column_order]

df.head()

Unnamed: 0,FECHA_HORA,FECHA,HORA,PRECIO,TERMICA,HIDRAULICA,SOLAR,COGENERADOR,EOLICA,NIVEL_ENSO
0,2020-01-01 00:00:00,2020-01-01,0,72.02,2109637.93,4418103.51,0.0,28880.74,4291.34,1.0
1,2020-01-01 01:00:00,2020-01-01,1,136.71,1864989.8,4548853.97,0.0,29107.37,3511.94,1.0
2,2020-01-01 02:00:00,2020-01-01,2,127.71,1720883.01,4525898.75,0.0,25939.87,2641.9,1.0
3,2020-01-01 03:00:00,2020-01-01,3,127.71,1649014.61,4458645.56,0.0,25651.57,3032.3,1.0
4,2020-01-01 04:00:00,2020-01-01,4,127.71,1711745.36,4279657.9,0.0,25245.37,2514.76,1.0


### Merge Fuel Consumption dataset

In [57]:
# 1. Convierte todo a string y elimina espacios
df_fuel_comsump_f['FECHA'] = df_fuel_comsump_f['FECHA'].astype(str).str.strip()

# 2. Convierte a datetime usando formato 'mixed' para adaptarse a todos
df_fuel_comsump_f['FECHA'] = pd.to_datetime(df_fuel_comsump_f['FECHA'], format='mixed', errors='raise')

# Verifica el tipo
print(df_fuel_comsump_f['FECHA'].dtype) 

datetime64[ns]


In [58]:
# Merge by FECHA

df = df.merge(
    df_fuel_comsump_f,
    on="FECHA",
    how="left"
)

df.head(5)

Unnamed: 0,FECHA_HORA,FECHA,HORA,PRECIO,TERMICA,HIDRAULICA,SOLAR,COGENERADOR,EOLICA,NIVEL_ENSO,FUEL_CONS_ACPM,FUEL_CONS_CARBON,FUEL_CONS_COMBUSTOLEO,FUEL_CONS_CRUDO,FUEL_CONS_GAS,FUEL_CONS_GAS_NI,FUEL_CONS_GLP
0,2020-01-01 00:00:00,2020-01-01,0,72.02,2109637.93,4418103.51,0.0,28880.74,4291.34,1.0,0.0,234788.670155,0.0,0.0,167262.229,0.0,0.0
1,2020-01-01 01:00:00,2020-01-01,1,136.71,1864989.8,4548853.97,0.0,29107.37,3511.94,1.0,0.0,234788.670155,0.0,0.0,167262.229,0.0,0.0
2,2020-01-01 02:00:00,2020-01-01,2,127.71,1720883.01,4525898.75,0.0,25939.87,2641.9,1.0,0.0,234788.670155,0.0,0.0,167262.229,0.0,0.0
3,2020-01-01 03:00:00,2020-01-01,3,127.71,1649014.61,4458645.56,0.0,25651.57,3032.3,1.0,0.0,234788.670155,0.0,0.0,167262.229,0.0,0.0
4,2020-01-01 04:00:00,2020-01-01,4,127.71,1711745.36,4279657.9,0.0,25245.37,2514.76,1.0,0.0,234788.670155,0.0,0.0,167262.229,0.0,0.0


### Merge Fuel Cost dataset

In [59]:
# Merge by FECHA

df = df.merge(
    df_fuel_cost,
    on="FECHA",
    how="left"
)

df.head(5)

Unnamed: 0,FECHA_HORA,FECHA,HORA,PRECIO,TERMICA,HIDRAULICA,SOLAR,COGENERADOR,EOLICA,NIVEL_ENSO,...,FUEL_CONS_CARBON,FUEL_CONS_COMBUSTOLEO,FUEL_CONS_CRUDO,FUEL_CONS_GAS,FUEL_CONS_GAS_NI,FUEL_CONS_GLP,FUEL_COST_CARBON,FUEL_COST_GAS,FUEL_COST_GAS_NI,FUEL_COST_COMBUSTOLEO
0,2020-01-01 00:00:00,2020-01-01,0,72.02,2109637.93,4418103.51,0.0,28880.74,4291.34,1.0,...,234788.670155,0.0,0.0,167262.229,0.0,0.0,130.459523,240.075697,0.0,585.085344
1,2020-01-01 01:00:00,2020-01-01,1,136.71,1864989.8,4548853.97,0.0,29107.37,3511.94,1.0,...,234788.670155,0.0,0.0,167262.229,0.0,0.0,130.459523,240.075697,0.0,585.085344
2,2020-01-01 02:00:00,2020-01-01,2,127.71,1720883.01,4525898.75,0.0,25939.87,2641.9,1.0,...,234788.670155,0.0,0.0,167262.229,0.0,0.0,130.459523,240.075697,0.0,585.085344
3,2020-01-01 03:00:00,2020-01-01,3,127.71,1649014.61,4458645.56,0.0,25651.57,3032.3,1.0,...,234788.670155,0.0,0.0,167262.229,0.0,0.0,130.459523,240.075697,0.0,585.085344
4,2020-01-01 04:00:00,2020-01-01,4,127.71,1711745.36,4279657.9,0.0,25245.37,2514.76,1.0,...,234788.670155,0.0,0.0,167262.229,0.0,0.0,130.459523,240.075697,0.0,585.085344


### Merge IPC dataset

In [60]:
# Merge by FECHA

df = df.merge(
    df_ipc,
    on="FECHA",
    how="left"
)

df.head(5)

Unnamed: 0,FECHA_HORA,FECHA,HORA,PRECIO,TERMICA,HIDRAULICA,SOLAR,COGENERADOR,EOLICA,NIVEL_ENSO,...,FUEL_CONS_CRUDO,FUEL_CONS_GAS,FUEL_CONS_GAS_NI,FUEL_CONS_GLP,FUEL_COST_CARBON,FUEL_COST_GAS,FUEL_COST_GAS_NI,FUEL_COST_COMBUSTOLEO,IPC_VAR_MOM_PCT,IPC_VAR_YOY_PCT
0,2020-01-01 00:00:00,2020-01-01,0,72.02,2109637.93,4418103.51,0.0,28880.74,4291.34,1.0,...,0.0,167262.229,0.0,0.0,130.459523,240.075697,0.0,585.085344,0.42,1.61
1,2020-01-01 01:00:00,2020-01-01,1,136.71,1864989.8,4548853.97,0.0,29107.37,3511.94,1.0,...,0.0,167262.229,0.0,0.0,130.459523,240.075697,0.0,585.085344,0.42,1.61
2,2020-01-01 02:00:00,2020-01-01,2,127.71,1720883.01,4525898.75,0.0,25939.87,2641.9,1.0,...,0.0,167262.229,0.0,0.0,130.459523,240.075697,0.0,585.085344,0.42,1.61
3,2020-01-01 03:00:00,2020-01-01,3,127.71,1649014.61,4458645.56,0.0,25651.57,3032.3,1.0,...,0.0,167262.229,0.0,0.0,130.459523,240.075697,0.0,585.085344,0.42,1.61
4,2020-01-01 04:00:00,2020-01-01,4,127.71,1711745.36,4279657.9,0.0,25245.37,2514.76,1.0,...,0.0,167262.229,0.0,0.0,130.459523,240.075697,0.0,585.085344,0.42,1.61


### Merge IPP dataset

In [61]:
# Merge by FECHA

df = df.merge(
    df_ipp,
    on="FECHA",
    how="left"
)

df.head(5)

Unnamed: 0,FECHA_HORA,FECHA,HORA,PRECIO,TERMICA,HIDRAULICA,SOLAR,COGENERADOR,EOLICA,NIVEL_ENSO,...,FUEL_COST_CARBON,FUEL_COST_GAS,FUEL_COST_GAS_NI,FUEL_COST_COMBUSTOLEO,IPC_VAR_MOM_PCT,IPC_VAR_YOY_PCT,IPP_VAR_PN_MOM_PCT,IPP_VAR_OI_MOM_PCT,IPP_VAR_PN_YOY_PCT,IPP_VAR_OI_YOY_PCT
0,2020-01-01 00:00:00,2020-01-01,0,72.02,2109637.93,4418103.51,0.0,28880.74,4291.34,1.0,...,130.459523,240.075697,0.0,585.085344,0.42,1.61,-0.73,-0.02,4.7,4.4
1,2020-01-01 01:00:00,2020-01-01,1,136.71,1864989.8,4548853.97,0.0,29107.37,3511.94,1.0,...,130.459523,240.075697,0.0,585.085344,0.42,1.61,-0.73,-0.02,4.7,4.4
2,2020-01-01 02:00:00,2020-01-01,2,127.71,1720883.01,4525898.75,0.0,25939.87,2641.9,1.0,...,130.459523,240.075697,0.0,585.085344,0.42,1.61,-0.73,-0.02,4.7,4.4
3,2020-01-01 03:00:00,2020-01-01,3,127.71,1649014.61,4458645.56,0.0,25651.57,3032.3,1.0,...,130.459523,240.075697,0.0,585.085344,0.42,1.61,-0.73,-0.02,4.7,4.4
4,2020-01-01 04:00:00,2020-01-01,4,127.71,1711745.36,4279657.9,0.0,25245.37,2514.76,1.0,...,130.459523,240.075697,0.0,585.085344,0.42,1.61,-0.73,-0.02,4.7,4.4


In [64]:
df.tail(5)

Unnamed: 0,FECHA_HORA,FECHA,HORA,PRECIO,TERMICA,HIDRAULICA,SOLAR,COGENERADOR,EOLICA,NIVEL_ENSO,...,FUEL_COST_CARBON,FUEL_COST_GAS,FUEL_COST_GAS_NI,FUEL_COST_COMBUSTOLEO,IPC_VAR_MOM_PCT,IPC_VAR_YOY_PCT,IPP_VAR_PN_MOM_PCT,IPP_VAR_OI_MOM_PCT,IPP_VAR_PN_YOY_PCT,IPP_VAR_OI_YOY_PCT
48187,2025-06-30 19:00:00,2025-06-30,19,112.55,1311491.62,9025510.46,2476.26,88117.23,23357.37,0.0,...,159.420866,817.935238,515.62897,946.61311,0.1,3.3,-0.28,-0.63,2.09,2.01
48188,2025-06-30 20:00:00,2025-06-30,20,112.55,1316147.03,8848494.08,1550.03,93431.69,23411.47,0.0,...,159.420866,817.935238,515.62897,946.61311,0.1,3.3,-0.28,-0.63,2.09,2.01
48189,2025-06-30 21:00:00,2025-06-30,21,106.25,1376475.95,8510792.32,0.0,95167.23,23284.58,0.0,...,159.420866,817.935238,515.62897,946.61311,0.1,3.3,-0.28,-0.63,2.09,2.01
48190,2025-06-30 22:00:00,2025-06-30,22,105.55,1401097.29,8058142.03,0.0,99156.29,23312.85,0.0,...,159.420866,817.935238,515.62897,946.61311,0.1,3.3,-0.28,-0.63,2.09,2.01
48191,2025-06-30 23:00:00,2025-06-30,23,105.55,1398702.31,7567097.74,0.0,94711.76,23189.12,0.0,...,159.420866,817.935238,515.62897,946.61311,0.1,3.3,-0.28,-0.63,2.09,2.01


## Export final Dataset

In [None]:
#df["FECHA"] = pd.to_datetime(df["FECHA"])
#df["FECHA"] = df["FECHA"].dt.strftime('%Y-%m-%d')
#df.to_csv('../../data/out/dataset_gen_spot_exog.csv', index=False)

print("Final Dataset exported...")

Final Dataset exported...
