# Data cleaning

In [84]:
import pandas as pd

In [85]:
# Read the bronze-layer parquet file into `df` and preview the first three rows
df = pd.read_parquet(path='data/bronze/df.parquet')
df.head(n=3)

Unnamed: 0,Forecasted Load,Actual Load
2014-10-24 00:00:00+02:00,,6522.0
2014-10-24 01:00:00+02:00,,6342.0
2014-10-24 02:00:00+02:00,,6269.0


In [86]:
# Currently, the timestamp corresponds to "in the next hour, this is the load",
# whereas we want it to mean "the load 24h from this timestamp is".
# Shift the index back by one day so each row describes the load 24h ahead.
#
# The shift is done with vectorised index arithmetic instead of a per-row
# `.apply`; subtracting a Timedelta moves every timestamp by exactly 24 hours.
# NOTE: this cell is not idempotent - re-running it without reloading the
# bronze data shifts the index by another day.
df = df.set_index(df.index - pd.Timedelta(days=1))

# Rename the columns so they reflect the new meaning of the index
df = df.rename(columns={
    'Forecasted Load': '24h_later_forecast',
    'Actual Load': '24h_later_load',
})
df.head(3)

Unnamed: 0,24h_later_forecast,24h_later_load
2014-10-23 00:00:00+02:00,,6522.0
2014-10-23 01:00:00+02:00,,6342.0
2014-10-23 02:00:00+02:00,,6269.0


In [87]:
# We should not drop NaNs:
# missing data means we might not have the load in 24h, but we could have it right now.
print(f"Data range: {df.index.min()} -> {df.index.max()}")

# Per-column NaN counts. This must be the last expression of the cell:
# Jupyter only displays the value of a cell's final expression, so placing it
# before the `print` call would silently discard the result.
df.isna().sum()

Data range: 2014-10-23 00:00:00+02:00 -> 2024-09-27 16:00:00+02:00


In [88]:
# Persist the cleaned frame as the silver-layer parquet file
df.to_parquet(path='data/silver/df.parquet')