In [2]:
import polars as pl
from pathlib import Path

# Define project root
PROJECT_ROOT = Path("e:/miso-load-forecasting").resolve()
DATA_DIR = PROJECT_ROOT / "data"

# Verify directories exist
print(f"Project root: {PROJECT_ROOT}")
print(f"Data directory: {DATA_DIR}")
print(f"Contents: {[x.name for x in DATA_DIR.iterdir() if x.is_dir()]}")


Project root: E:\miso-load-forecasting
Data directory: E:\miso-load-forecasting\data
Contents: ['miso_load_actual', 'miso_load_forecast', 'weather_kmsp']


In [3]:
## Load MISO Actual Load
actual_files = list((DATA_DIR / "miso_load_actual").rglob("*.parquet"))
actual_df = pl.scan_parquet(actual_files)
## drop duplicates if any
actual_df = actual_df.unique()
print(f"MISO Actual: {actual_df.collect().shape[0]:,} rows")
actual_df.head().collect()

MISO Actual: 26,208 rows


Datetime,Lrz1,Lrz2_7,Lrz3_5,Lrz4,Lrz6,Lrz8_9_10,Miso,load_type
datetime[ns],f64,f64,f64,f64,f64,f64,f64,str
2025-01-28 21:00:00,12134.4,19130.9,11206.9,5588.9,11685.7,19474.1,79220.9,"""actual"""
2025-09-19 19:00:00,11835.2,20758.5,11993.3,6175.0,12619.1,25837.7,89218.8,"""actual"""
2024-03-09 20:00:00,10921.08,16973.77,9492.62,4905.68,9886.28,17872.96,70052.39,"""actual"""
2025-06-04 21:00:00,11493.7,18776.3,10614.6,5097.3,11561.6,24668.8,82212.3,"""actual"""
2024-04-17 17:00:00,10535.44,17551.51,10330.79,5022.43,10506.17,20865.17,74811.51,"""actual"""


In [4]:
## pivot the DataFrame from wide to long format
df_actual_long = (
actual_df
.select(["Datetime","Lrz1"])
.unpivot(
    index="Datetime",  # columns to keep as identifiers
    variable_name="Zone",
    value_name="LoadActual"
    )
)
df_actual_long.head().collect()

Datetime,Zone,LoadActual
datetime[ns],str,f64
2025-12-24 15:00:00,"""Lrz1""",12001.5
2025-08-31 21:00:00,"""Lrz1""",11337.1
2024-09-18 00:00:00,"""Lrz1""",11766.97
2025-07-31 22:00:00,"""Lrz1""",12033.4
2023-08-23 06:00:00,"""Lrz1""",13499.13


In [5]:
## Load MISO Forecasted Load
forecast_files = list((DATA_DIR / "miso_load_forecast").rglob("*.parquet"))
forecast_df = pl.scan_parquet(forecast_files)
## drop duplicates if any
forecast_df = forecast_df.unique()
print(f"MISO Forecasted: {forecast_df.collect().shape[0]:,} rows")
forecast_df.head().collect()

MISO Forecasted: 26,304 rows


Datetime,Lrz1,Lrz2_7,Lrz3_5,Lrz4,Lrz6,Lrz8_9_10,Miso,load_type
datetime[ns],f64,f64,f64,f64,f64,f64,f64,str
2023-11-23 13:00:00,10785.0,16045.0,8842.0,4323.0,8185.0,16414.0,64594.0,"""forecast"""
2023-08-07 03:00:00,9322.0,15961.0,8558.0,4403.0,8746.0,21361.0,68351.0,"""forecast"""
2025-02-11 14:00:00,12741.0,20065.0,11881.0,5384.0,11560.0,18491.0,80122.0,"""forecast"""
2025-10-31 22:00:00,10354.0,15139.0,9124.0,4507.0,9266.0,17095.0,65485.0,"""forecast"""
2023-06-13 12:00:00,12038.0,19128.0,10739.0,5098.0,9888.0,25795.0,82686.0,"""forecast"""


In [6]:
## pivot the DataFrame from wide to long format
df_forecast_long = (
forecast_df
.select(["Datetime","Lrz1"])
.unpivot(
    index="Datetime",  # columns to keep as identifiers
    variable_name="Zone",
    value_name="LoadForecast"
    )
)
df_forecast_long.head().collect()

Datetime,Zone,LoadForecast
datetime[ns],str,f64
2024-04-26 08:00:00,"""Lrz1""",10684.0
2025-05-02 20:00:00,"""Lrz1""",10309.0
2023-06-25 20:00:00,"""Lrz1""",11588.0
2025-12-03 00:00:00,"""Lrz1""",11934.0
2025-02-21 10:00:00,"""Lrz1""",12744.0


In [7]:
## load weather data
weather_files = list((DATA_DIR / "weather_kmsp").rglob("*.parquet"))
weather_df = pl.scan_parquet(weather_files)
print(f"Weather Data: {weather_df.collect().shape[0]:,} rows")
weather_df.head().collect()

Weather Data: 26,616 rows


timestamp,latitude,longitude,station_id,temperature_2m,relative_humidity_2m,dewpoint_2m,apparent_temperature,precipitation,rain,snowfall,pressure_msl,surface_pressure,cloud_cover,wind_speed_10m,wind_direction_10m,wind_gusts_10m,year,month,day
datetime[μs],f64,f64,str,f64,i64,f64,f64,f64,f64,f64,f64,f64,i64,f64,i64,f64,i32,i8,i8
2024-11-24 09:00:00,44.882,-93.2218,"""KMSP""",35.8,86,32.0,29.1,0.0,0.0,0.0,1012.4,981.4,100,6.7,81,13.2,2024,11,24
2024-11-24 00:00:00,44.882,-93.2218,"""KMSP""",36.7,83,32.0,32.0,0.0,0.0,0.0,1015.0,984.0,100,1.9,83,4.9,2024,11,24
2024-11-24 14:00:00,44.882,-93.2218,"""KMSP""",36.0,84,31.7,28.3,0.0,0.0,0.0,1011.2,980.3,100,9.2,89,18.6,2024,11,24
2024-11-24 23:00:00,44.882,-93.2218,"""KMSP""",40.8,72,32.6,33.4,0.0,0.0,0.0,1010.2,979.6,99,8.8,68,18.6,2024,11,24
2024-11-24 05:00:00,44.882,-93.2218,"""KMSP""",35.2,85,31.3,29.0,0.0,0.0,0.0,1013.8,982.8,100,5.5,89,10.3,2024,11,24


In [8]:
## select relevant weather features and convert datetime to proper format
weather_df = weather_df.select([
    pl.col("timestamp").alias("Datetime"),
    pl.col("temperature_2m").alias("Temperature"),
    pl.col("dewpoint_2m").alias("DewPoint"),
    pl.col("relative_humidity_2m").alias("Humidity"),
    pl.col("wind_speed_10m").alias("WindSpeed"),
    pl.col("precipitation").alias("Precipitation")
])
## convert Datetime to proper format
weather_df = weather_df.with_columns(
    pl.col("Datetime").cast(pl.Datetime("ns"))
)
weather_df.head().collect()

Datetime,Temperature,DewPoint,Humidity,WindSpeed,Precipitation
datetime[ns],f64,f64,i64,f64,f64
2024-11-24 09:00:00,35.8,32.0,86,6.7,0.0
2024-11-24 00:00:00,36.7,32.0,83,1.9,0.0
2024-11-24 14:00:00,36.0,31.7,84,9.2,0.0
2024-11-24 23:00:00,40.8,32.6,72,8.8,0.0
2024-11-24 05:00:00,35.2,31.3,85,5.5,0.0


In [9]:
## join load and weather data
df_merged = (
    df_actual_long.join(
        df_forecast_long,
        on=["Datetime", "Zone"],
        how="left",
        suffix="_Forecast"
    )
    .join(
        weather_df,
        on="Datetime",
        how="left"
    )
).sort("Datetime") 
## view merged data
df_merged.head().collect()

Datetime,Zone,LoadActual,LoadForecast,Temperature,DewPoint,Humidity,WindSpeed,Precipitation
datetime[ns],str,f64,f64,f64,f64,i64,f64,f64
2023-01-17 00:00:00,"""Lrz1""",10697.01,10698.0,33.1,32.6,98,5.2,0.043
2023-01-17 01:00:00,"""Lrz1""",10343.69,10349.0,32.9,32.5,99,5.0,0.039
2023-01-17 02:00:00,"""Lrz1""",10096.66,10116.0,32.8,32.5,99,6.1,0.043
2023-01-17 03:00:00,"""Lrz1""",9974.78,10004.0,32.8,32.4,98,5.2,0.035
2023-01-17 04:00:00,"""Lrz1""",10004.15,10002.0,32.7,32.4,99,6.0,0.035


In [18]:
## print mean, median, stddev, min, max, quartiles of LoadActual and LoadForecast and Weather features
summary_stats = df_merged.select(
    pl.col("Datetime"),
    pl.col("LoadActual"),
    pl.col("LoadForecast"),
    pl.col("Temperature"),
    pl.col("DewPoint"),
    pl.col("Humidity"),
    pl.col("WindSpeed"),
    pl.col("Precipitation")
).describe()
summary_stats

statistic,Datetime,LoadActual,LoadForecast,Temperature,DewPoint,Humidity,WindSpeed,Precipitation
str,str,f64,f64,f64,f64,f64,f64,f64
"""count""","""26208""",26208.0,26208.0,26208.0,26208.0,26208.0,26208.0,26208.0
"""null_count""","""0""",0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""mean""","""2024-07-17 00:30:39.560439""",11249.767102,11211.983669,48.175183,37.845559,70.183188,7.033429,0.004544
"""std""",,1510.780407,1511.399091,22.42944,20.884609,18.085522,3.301583,0.027724
"""min""","""2023-01-17 00:00:00""",8036.2,8062.0,-20.7,-28.8,10.0,0.0,0.0
"""25%""","""2023-10-17 00:00:00""",10154.2,10124.0,31.5,23.9,57.0,4.6,0.0
"""50%""","""2024-07-17 00:00:00""",11103.9,11061.0,49.6,38.9,72.0,6.6,0.0
"""75%""","""2025-04-16 23:00:00""",12179.3,12131.0,66.8,55.8,85.0,9.0,0.0
"""max""","""2026-01-15 23:00:00""",17405.14,18117.0,98.1,80.2,100.0,22.2,0.815


In [19]:

## calculate error metrics
df_merged = df_merged.with_columns([
    (pl.col("LoadForecast") - pl.col("LoadActual")).alias("Error"),
    ((pl.col("LoadForecast") - pl.col("LoadActual")).abs()).alias("AbsoluteError"),
    (((pl.col("LoadForecast") - pl.col("LoadActual")).abs()) / pl.col("LoadActual") * 100).alias("AbsolutePercentageError")
])
## print summary of mean, median, extremes, quartiles for features and error metrics
df_merged.select([
    pl.col("Error"),
    pl.col("AbsoluteError"),
    pl.col("AbsolutePercentageError")
]).describe()

statistic,Error,AbsoluteError,AbsolutePercentageError
str,f64,f64,f64
"""count""",26208.0,26208.0,26208.0
"""null_count""",0.0,0.0,0.0
"""mean""",-37.783433,256.45947,2.247828
"""std""",342.580715,230.25034,1.931287
"""min""",-1395.31,0.0,0.0
"""25%""",-233.47,89.32,0.818511
"""50%""",-36.64,193.5,1.756983
"""75%""",149.2,356.4,3.135944
"""max""",2201.8,2201.8,15.566805
