# Spread & Volatility Feature Engineering
We will build features that quantify:
1. Forecast error
2. Imbalance cash‐out cost
3. Intraday vs imbalance price spreads
4. Rolling volatility of prices and spreads

All calculations are done on the merged dataset aligned by settlement‐period `datetime`.


In [4]:
import pandas as pd
from pathlib import Path

# Location of the final merged dataset. The path climbs one directory from the
# current working directory (per the original note, the notebooks directory)
# and then descends into data/processed.
DATA_FILE = Path("..").joinpath("data/processed/final_merged.parquet")

# Read the whole parquet file into a DataFrame and report its size and schema.
df = pd.read_parquet(path=DATA_FILE)
print(f"Loaded {len(df):,} rows with columns:\n{df.columns.tolist()}")

# Preview the first five rows with the notebook's rich renderer.
display(df.head(n=5))

Loaded 23,330 rows with columns:
['datetime', 'SETTLEMENT_DATE', 'SETTLEMENT_PERIOD', 'ND', 'TSD', 'ENGLAND_WALES_DEMAND', 'EMBEDDED_WIND_GENERATION', 'EMBEDDED_WIND_CAPACITY', 'EMBEDDED_SOLAR_GENERATION', 'EMBEDDED_SOLAR_CAPACITY', 'NON_BM_STOR', 'PUMP_STORAGE_PUMPING', 'SCOTTISH_TRANSFER', 'IFA_FLOW', 'IFA2_FLOW', 'BRITNED_FLOW', 'MOYLE_FLOW', 'EAST_WEST_FLOW', 'NEMO_FLOW', 'NSL_FLOW', 'ELECLINK_FLOW', 'VIKING_FLOW', 'GREENLINK_FLOW', 'Settlement Date', 'Settlement Period', 'Market Index Data Provider Id', 'Market Index Volume (MWh)', 'Market Index Price (£/MWh)', 'mip_price', 'Settlement Date_imb', 'Settlement Period_imb', 'System Sell Price(GBP/MWh)', 'System Buy Price(GBP/MWh)', 'Net Imbalance Volume(MWh)', 'sbp', 'ssp', 'niv']


Unnamed: 0,datetime,SETTLEMENT_DATE,SETTLEMENT_PERIOD,ND,TSD,ENGLAND_WALES_DEMAND,EMBEDDED_WIND_GENERATION,EMBEDDED_WIND_CAPACITY,EMBEDDED_SOLAR_GENERATION,EMBEDDED_SOLAR_CAPACITY,...,Market Index Price (£/MWh),mip_price,Settlement Date_imb,Settlement Period_imb,System Sell Price(GBP/MWh),System Buy Price(GBP/MWh),Net Imbalance Volume(MWh),sbp,ssp,niv
0,2024-01-01 00:00:00+00:00,2024-01-01,1,21783,23466,19539,2804,6488,0,16793,...,36.51,36.51,01/01/2024,1,90.0,90.0,103.352,90.0,90.0,103.352
1,2024-01-01 00:30:00+00:00,2024-01-01,2,22521,24103,20286,2834,6488,0,16793,...,45.17,45.17,01/01/2024,2,100.0,100.0,197.535,100.0,100.0,197.535
2,2024-01-01 01:00:00+00:00,2024-01-01,3,22194,24754,20070,2868,6488,0,16793,...,57.43,57.43,01/01/2024,3,57.43,57.43,458.259,57.43,57.43,458.259
3,2024-01-01 01:30:00+00:00,2024-01-01,4,21510,24505,19424,2901,6488,0,16793,...,46.61,46.61,01/01/2024,4,129.83887,129.83887,492.023,129.83887,129.83887,492.023
4,2024-01-01 02:00:00+00:00,2024-01-01,5,20619,23977,18674,2933,6488,0,16793,...,61.98,61.98,01/01/2024,5,110.0,110.0,407.514,110.0,110.0,407.514


In [5]:
# Forecast error features
# -----------------------
# error_MWh = (forecast_MW - actual_MW) * 0.5
#     The 0.5 factor comes from half-hour settlement periods.
# error_pct = error_MWh expressed as a percentage of the forecast energy
#     (forecast_MW * 0.5).
#
# NOTE(review): the recorded run of this cell failed with
# KeyError: 'forecast_MW', and the column list printed by the load cell contains
# neither `forecast_MW` nor `actual_MW`. Either join a forecast/actual series
# into the merged dataset upstream, or point the two constants below at the
# correct existing columns — TODO confirm which columns are intended.
FORECAST_COL = "forecast_MW"
ACTUAL_COL = "actual_MW"
HOURS_PER_PERIOD = 0.5  # length of one settlement period, in hours

# Fail fast with an actionable message instead of a bare KeyError on first use.
missing_cols = [col for col in (FORECAST_COL, ACTUAL_COL) if col not in df.columns]
if missing_cols:
    raise KeyError(
        f"Cannot compute forecast error: column(s) {missing_cols} not found in df. "
        "Merge the forecast/actual series into the dataset or set "
        "FORECAST_COL / ACTUAL_COL to existing columns. "
        f"Available columns: {df.columns.tolist()}"
    )

df["error_MWh"] = (df[FORECAST_COL] - df[ACTUAL_COL]) * HOURS_PER_PERIOD

# Relative error (% of forecast). A zero forecast would otherwise produce
# +/-inf (or 0/0), which corrupts the mean/std reported by describe(); mask
# zero denominators so those rows become NaN instead.
forecast_MWh = df[FORECAST_COL] * HOURS_PER_PERIOD
df["error_pct"] = df["error_MWh"] / forecast_MWh.where(forecast_MWh != 0) * 100

# Sanity check
print("Forecast Error (abs) statistics:")
print(df["error_MWh"].describe())
print("\nForecast Error (%) statistics:")
print(df["error_pct"].describe())

KeyError: 'forecast_MW'