## Preprocessing data

## Libraries

In [3]:
import pandas as pd

## Processing Generation Dataset

### Load dataset

In [4]:
# Load generation dataset
df_gen = pd.read_excel('../data/merged/generation_2020_2025.xlsx')
df_gen.head()

Unnamed: 0,Fecha,Recurso,Tipo Generación,Combustible,Tipo Despacho,Es Menor,Clasificación,0,1,2,...,15,16,17,18,19,20,21,22,23,Version
0,2020-01-01,AGPE - ECOPETROL LA HORMIGA,TERMICA,GAS,NO DESPACHADO CENTRALMENTE,NO,AUTOG PEQ. ESCALA,226.51,223.84,223.12,...,269.53,273.34,273.69,227.1,225.26,256.94,155.93,10.91,11.59,TX5
1,2020-01-01,AGUA FRESCA,HIDRAULICA,AGUA,NO DESPACHADO CENTRALMENTE,SI,NORMAL,7268.4,7268.4,7270.8,...,7024.8,6937.2,6874.8,6802.8,6764.4,6729.6,6714.0,5462.4,5950.8,TX5
2,2020-01-01,ALBAN,HIDRAULICA,AGUA,DESPACHADO CENTRALMENTE,NO,NORMAL,251520.0,251040.0,251280.0,...,238880.0,238880.0,238960.0,244786.0,246944.0,248476.0,250296.0,250590.0,250340.0,TX5
3,2020-01-01,ALEJANDRÍA,HIDRAULICA,AGUA,NO DESPACHADO CENTRALMENTE,SI,NORMAL,14861.0,14608.0,14322.0,...,8536.0,8338.0,8525.0,10065.0,12892.0,14839.0,14927.0,14982.0,14872.0,TX5
4,2020-01-01,ALTO TULUA,HIDRAULICA,AGUA,NO DESPACHADO CENTRALMENTE,SI,NORMAL,0.0,0.0,0.0,...,10908.0,10908.0,10911.6,10908.0,10908.0,10915.2,10915.2,10918.8,10922.4,TX5


### Validate missing values

In [5]:
print("Are there missing values in the dataset?: ", df_gen.isnull().values.any())
print("Missing values by column:")
df_gen.isnull().sum()

Are there missing values in the dataset?:  True
Missing values by column:


Fecha               0
Recurso             0
Tipo Generación     0
Combustible         0
Tipo Despacho       0
Es Menor            0
Clasificación       0
0                  36
1                   5
2                   6
3                   2
4                   0
5                   2
6                   4
7                   5
8                   3
9                   8
10                  2
11                  2
12                  6
13                  3
14                  6
15                  2
16                  0
17                  1
18                  3
19                 50
20                  1
21                  2
22                  3
23                  6
Version             0
dtype: int64

In this case, the missing values represent a contribition of zero (0) to energy generation. Therefore, the missing values will be replaced with 0.

In [6]:
df_gen = df_gen.fillna(0)

In [7]:
print("Are there missing values in the dataset? (after replacement): ", df_gen.isnull().values.any())
print("Missing values by column (after replacement):")
df_gen.isnull().sum()

Are there missing values in the dataset? (after replacement):  False
Missing values by column (after replacement):


Fecha              0
Recurso            0
Tipo Generación    0
Combustible        0
Tipo Despacho      0
Es Menor           0
Clasificación      0
0                  0
1                  0
2                  0
3                  0
4                  0
5                  0
6                  0
7                  0
8                  0
9                  0
10                 0
11                 0
12                 0
13                 0
14                 0
15                 0
16                 0
17                 0
18                 0
19                 0
20                 0
21                 0
22                 0
23                 0
Version            0
dtype: int64

After validating that there are not missing values in the dataset, the next step is to transform the dataset into the appropiate format.

### Melt columns hours (0-23) -> wide to long format

In [8]:
# Melt columns hours (0-23) -> wide to long format
df_gen_long = df_gen.melt(
    id_vars = ["Fecha", "Tipo Generación"], # Fixed columns
    value_vars = [str(i) for i in range(24)], # Columns to melt (0 to 23)
    var_name = "Hora", # New column name to hours
    value_name="generacion_kwh" # New column name for generation values
)

df_gen_long.head(10)

Unnamed: 0,Fecha,Tipo Generación,Hora,generacion_kwh
0,2020-01-01,TERMICA,0,226.51
1,2020-01-01,HIDRAULICA,0,7268.4
2,2020-01-01,HIDRAULICA,0,251520.0
3,2020-01-01,HIDRAULICA,0,14861.0
4,2020-01-01,HIDRAULICA,0,0.0
5,2020-01-01,HIDRAULICA,0,13917.6
6,2020-01-01,HIDRAULICA,0,625.96
7,2020-01-01,HIDRAULICA,0,72659.88
8,2020-01-01,HIDRAULICA,0,482.71
9,2020-01-01,HIDRAULICA,0,14442.6


### Group by Date, Hour and Generation Type (summarized)

In [9]:
# Group by Fecha, Hora and Tipo Generación (summarized)

df_gen_agg = df_gen_long.groupby(["Fecha", "Hora", "Tipo Generación"])["generacion_kwh"].sum().reset_index()
df_gen_agg.head(10)

Unnamed: 0,Fecha,Hora,Tipo Generación,generacion_kwh
0,2021-01-01 00:00:00,0,COGENERADOR,21522.11
1,2021-01-01 00:00:00,0,EOLICA,3328.1
2,2021-01-01 00:00:00,0,HIDRAULICA,5244418.15
3,2021-01-01 00:00:00,0,SOLAR,0.0
4,2021-01-01 00:00:00,0,TERMICA,1391375.95
5,2021-01-01 00:00:00,1,COGENERADOR,21766.38
6,2021-01-01 00:00:00,1,EOLICA,4063.22
7,2021-01-01 00:00:00,1,HIDRAULICA,5117544.23
8,2021-01-01 00:00:00,1,SOLAR,0.0
9,2021-01-01 00:00:00,1,TERMICA,1325940.54


## Processing Spot Prices Dataset

### Load dataset

In [10]:
# Load generation dataset
df_price = pd.read_excel('../data/merged/spot_prices_2020_2025.xlsx')
df_price.head()

Unnamed: 0,Fecha,0,1,2,3,4,5,6,7,8,...,15,16,17,18,19,20,21,22,23,Versión
0,2020-01-01,72.02,136.71,127.71,127.71,127.71,72.02,72.02,72.02,72.02,...,136.67,127.71,72.02,183.93,406.73,213.73,213.73,213.73,136.71,TX5
1,2020-01-02,71.34,71.34,71.34,71.34,71.34,73.95,73.95,137.88,185.1,...,357.8,357.8,307.9,397.9,387.9,357.8,307.9,307.9,307.9,TX5
2,2020-01-03,116.31,116.31,126.29,126.29,135.29,126.29,123.46,275.31,275.31,...,275.31,275.31,275.31,285.31,285.31,275.31,275.31,275.31,275.31,TX5
3,2020-01-04,144.36,144.36,144.36,144.36,134.34,144.36,134.34,144.36,188.36,...,255.36,255.36,255.36,255.36,255.36,255.36,255.36,255.36,188.36,TX5
4,2020-01-05,209.12,209.12,188.12,188.12,188.12,188.12,180.62,157.12,188.12,...,209.12,188.12,209.12,305.12,305.12,305.12,305.12,305.12,209.12,TX5


### Validate missing values

In [11]:
print("Are there missing values in the dataset?: ", df_price.isnull().values.any())
print("Missing values by column:")
df_price.isnull().sum()

Are there missing values in the dataset?:  False
Missing values by column:


Fecha      0
0          0
1          0
2          0
3          0
4          0
5          0
6          0
7          0
8          0
9          0
10         0
11         0
12         0
13         0
14         0
15         0
16         0
17         0
18         0
19         0
20         0
21         0
22         0
23         0
Versión    0
dtype: int64

There are not missing values in the dataset. Therefore, no processing is necessary.

### Melt columns hours (0-23) -> wide to long format

In [12]:
# Derretir columnas de horas (0-23)
df_price_long = df_price.melt(
    id_vars = ["Fecha"], # Fixed columns
    value_vars = [str(i) for i in range(24)], # Columns to melt (0 to 23)
    var_name = "Hora", # New column name to hours
    value_name = "Precio" # New column name for price values
)

df_price_long.head()

Unnamed: 0,Fecha,Hora,Precio
0,2020-01-01,0,72.02
1,2020-01-02,0,71.34
2,2020-01-03,0,116.31
3,2020-01-04,0,144.36
4,2020-01-05,0,209.12


## Merge datasets

In [32]:
# Merge by Fecha & Hour
df_merge = pd.merge(
    df_gen_agg,
    df_price_long,
    on = ["Fecha", "Hora"],
    how = "inner" # Keep only existing dates-hours presents in both datasets
)

df_merge.head()

Unnamed: 0,Fecha,Hora,Tipo Generación,generacion_kwh,Precio
0,2021-01-01 00:00:00,0,COGENERADOR,21522.11,142.31
1,2021-01-01 00:00:00,0,EOLICA,3328.1,142.31
2,2021-01-01 00:00:00,0,HIDRAULICA,5244418.15,142.31
3,2021-01-01 00:00:00,0,SOLAR,0.0,142.31
4,2021-01-01 00:00:00,0,TERMICA,1391375.95,142.31


In [35]:
# Pivot Generation Types as Columns
df_pivot = df_merge.pivot_table(
    index = ["Fecha", "Hora", "Precio"], # Group data by 'Fecha', 'Hora', and 'Precio' to create rows
    columns = "Tipo Generación", # Use 'Tipo Generación' values as the new columns
    values = "generacion_kwh", # Populate the table with 'generacion_kwh' values
    fill_value = 0 # Replace any missing data (NaN) with 0
)

df_pivot.index = df_pivot.index.set_levels(df_pivot.index.levels[1].astype(int), level=1)

df_pivot = df_pivot.sort_index(level=['Fecha', 'Hora'], ascending=[True, True])

df_pivot.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Tipo Generación,COGENERADOR,EOLICA,HIDRAULICA,SOLAR,Solar,TERMICA
Fecha,Hora,Precio,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2021-01-01 00:00:00,0,142.31,21522.11,3328.1,5244418.15,0.0,0.0,1391375.95
2021-01-01 00:00:00,1,142.31,21766.38,4063.22,5117544.23,0.0,0.0,1325940.54
2021-01-01 00:00:00,2,142.31,20976.76,4139.08,4919389.58,0.0,0.0,1291073.68
2021-01-01 00:00:00,3,142.31,21212.18,2114.45,4702267.19,0.0,0.0,1326887.72
2021-01-01 00:00:00,4,142.31,19867.94,2070.41,4759095.19,0.0,0.0,1149583.09


In [37]:
df_pivot.head(30)

Unnamed: 0_level_0,Unnamed: 1_level_0,Tipo Generación,COGENERADOR,EOLICA,HIDRAULICA,SOLAR,Solar,TERMICA
Fecha,Hora,Precio,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2021-01-01 00:00:00,0,142.31,21522.11,3328.1,5244418.15,0.0,0.0,1391375.95
2021-01-01 00:00:00,1,142.31,21766.38,4063.22,5117544.23,0.0,0.0,1325940.54
2021-01-01 00:00:00,2,142.31,20976.76,4139.08,4919389.58,0.0,0.0,1291073.68
2021-01-01 00:00:00,3,142.31,21212.18,2114.45,4702267.19,0.0,0.0,1326887.72
2021-01-01 00:00:00,4,142.31,19867.94,2070.41,4759095.19,0.0,0.0,1149583.09
2021-01-01 00:00:00,5,142.31,20321.55,2669.7,4808062.23,0.0,0.0,997739.48
2021-01-01 00:00:00,6,128.31,20821.96,2117.35,4465756.48,3941.44,0.0,945300.31
2021-01-01 00:00:00,7,128.31,21389.87,3748.52,4512878.18,27233.95,0.0,906850.42
2021-01-01 00:00:00,8,128.31,21362.03,7018.91,4747889.55,53292.43,0.0,931573.52
2021-01-01 00:00:00,9,142.31,21418.93,9077.05,4962393.02,76981.63,0.0,982809.61
