In [None]:
import pandas as pd
from pathlib import Path
import numpy as np
# Get the project root directory (one level up from notebooks)
ROOT_DIR = Path().absolute().parent

# Define all the files to be merged/analyzed with absolute paths
FILES = {
    "INTRADAY": str(ROOT_DIR / "data/processed/intraday_trades_raw.parquet"),
    "IMBALANCE": str(ROOT_DIR / "data/processed/imbalance_prices.parquet"),
    "DEMAND": str(ROOT_DIR / "data/processed/forecast_actual.parquet")
}

# Print the paths to verify
print("Project root:", ROOT_DIR)
for name, path in FILES.items():
    print(f"{name}: {path}")

Project root: c:\Users\alexa\OneDrive\Desktop\GB-Power-Price-Diver-Spread-Radar
INTRADAY: c:\Users\alexa\OneDrive\Desktop\GB-Power-Price-Diver-Spread-Radar\data\processed\intraday_trades_raw.parquet
IMBALANCE: c:\Users\alexa\OneDrive\Desktop\GB-Power-Price-Diver-Spread-Radar\data\processed\imbalance_prices.parquet
DEMAND: c:\Users\alexa\OneDrive\Desktop\GB-Power-Price-Diver-Spread-Radar\data\processed\forecast_actual.parquet


In [9]:
# Load datasets and display basic info
dfs = {}
for name, fpath in FILES.items():
    try:
        dfs[name] = pd.read_parquet(fpath)
        print(f"\n{name} loaded: {len(dfs[name])} rows, {len(dfs[name].columns)} columns")
    except Exception as e:
        print(f"❌ Error loading {name}: {e}")

# Display columns, types, head for each
for name, df in dfs.items():
    print(f"\n==== {name} ====")
    print("Columns:", list(df.columns))
    print("Types:\n", df.dtypes)
    print("First 3 rows:\n", df.head(3))



INTRADAY loaded: 50108 rows, 6 columns

IMBALANCE loaded: 24767 rows, 9 columns

DEMAND loaded: 24238 rows, 23 columns

==== INTRADAY ====
Columns: ['Settlement Date', 'Settlement Period', 'Market Index Data Provider Id', 'Market Index Volume (MWh)', 'Market Index Price (£/MWh)', 'datetime']
Types:
 Settlement Date                               object
Settlement Period                              int64
Market Index Data Provider Id                 object
Market Index Volume (MWh)                    float64
Market Index Price (£/MWh)                   float64
datetime                         datetime64[ns, UTC]
dtype: object
First 3 rows:
    Settlement Date  Settlement Period Market Index Data Provider Id  \
0  01 January 2024                  1                      APXMIDP    
1  01 January 2024                  1                      N2EXMIDP   
2  01 January 2024                  2                      APXMIDP    

   Market Index Volume (MWh)  Market Index Price (£/MWh)  \
0     

In [10]:
for name, df in dfs.items():
    print(f"\n==== {name} ====")
    print("NaN counts per column:")
    print(df.isna().sum())
    print("Rows with any NaN:", df.isna().any(axis=1).sum())
    if "datetime" in df.columns:
        # Gaps in datetime (for half-hourly data)
        sorted_dt = df['datetime'].sort_values()
        deltas = sorted_dt.diff().dropna()
        print("Most common time delta:", deltas.value_counts().idxmax())
        print("Other time deltas (potential gaps):")
        print(deltas.value_counts().head(5))



==== INTRADAY ====
NaN counts per column:
Settlement Date                  0
Settlement Period                0
Market Index Data Provider Id    0
Market Index Volume (MWh)        0
Market Index Price (£/MWh)       0
datetime                         0
dtype: int64
Rows with any NaN: 0
Most common time delta: 0 days 00:00:00
Other time deltas (potential gaps):
datetime
0 days 00:00:00    25056
0 days 00:30:00    25049
0 days 01:30:00        2
Name: count, dtype: int64

==== IMBALANCE ====
NaN counts per column:
Settlement Date               0
Settlement Period             0
System Sell Price(GBP/MWh)    0
System Buy Price(GBP/MWh)     0
Net Imbalance Volume(MWh)     0
datetime                      0
sbp                           0
ssp                           0
niv                           0
dtype: int64
Rows with any NaN: 0
Most common time delta: 0 days 00:30:00
Other time deltas (potential gaps):
datetime
0 days 00:30:00    24762
0 days 01:30:00        2
0 days 00:00:00        2
N

In [None]:
for name, df in dfs.items():
    print(f"\n==== {name} ====")
    num_cols = df.select_dtypes(include=[np.number]).columns
    print("Numerical column stats:")
    print(df[num_cols].describe().T)
    # If there's a price or volume column, show their range
    for col in df.columns:
        if "price" in col.lower() or "volume" in col.lower():
            print(f"{col}: min={df[col].min()}, max={df[col].max()}, median={df[col].median()}")



==== INTRADAY ====
Numerical column stats:
                              count         mean          std    min   25%  \
Settlement Period           50108.0    24.498324    13.852842   1.00  12.0   
Market Index Volume (MWh)   50108.0  1103.630295  1224.813186   0.00   0.0   
Market Index Price (£/MWh)  50108.0    38.203266    46.705199 -68.67   0.0   

                             50%        75%      max  
Settlement Period           24.0    36.0000    50.00  
Market Index Volume (MWh)    0.0  2143.5375  5819.65  
Market Index Price (£/MWh)   0.0    78.1800  1352.90  
Market Index Volume (MWh): min=0.0, max=5819.65, median=0.0
Market Index Price (£/MWh): min=-68.67, max=1352.9, median=0.0

==== IMBALANCE ====
Numerical column stats:
                              count       mean         std       min  \
Settlement Period           24767.0  24.497355   13.853500     1.000   
System Sell Price(GBP/MWh)  24767.0  76.873238   64.527553   -95.000   
System Buy Price(GBP/MWh)   24767.0  76

In [12]:
for name, df in dfs.items():
    print(f"\n==== {name} ====")
    if "datetime" in df.columns:
        # Assume you expect continuous half-hour periods (modify if different)
        dt_index = pd.date_range(
            start=df["datetime"].min(), 
            end=df["datetime"].max(),
            freq="30min",
            tz=df["datetime"].dt.tz if hasattr(df["datetime"].dt, 'tz') else None
        )
        missing = set(dt_index) - set(df["datetime"])
        print(f"Expected periods: {len(dt_index)} | Actual: {df['datetime'].nunique()} | Missing: {len(missing)}")
        if missing:
            print("Sample missing datetimes:", list(sorted(missing))[:5])



==== INTRADAY ====
Expected periods: 25056 | Actual: 25052 | Missing: 4
Sample missing datetimes: [Timestamp('2024-03-31 23:00:00+0000', tz='UTC'), Timestamp('2024-03-31 23:30:00+0000', tz='UTC'), Timestamp('2025-03-30 23:00:00+0000', tz='UTC'), Timestamp('2025-03-30 23:30:00+0000', tz='UTC')]

==== IMBALANCE ====
Expected periods: 24769 | Actual: 24765 | Missing: 4
Sample missing datetimes: [Timestamp('2024-03-31 23:00:00+0000', tz='UTC'), Timestamp('2024-03-31 23:30:00+0000', tz='UTC'), Timestamp('2025-03-30 23:00:00+0000', tz='UTC'), Timestamp('2025-03-30 23:30:00+0000', tz='UTC')]

==== DEMAND ====
Expected periods: 24240 | Actual: 24236 | Missing: 4
Sample missing datetimes: [Timestamp('2024-03-31 23:00:00+0000', tz='UTC'), Timestamp('2024-03-31 23:30:00+0000', tz='UTC'), Timestamp('2025-03-30 23:00:00+0000', tz='UTC'), Timestamp('2025-03-30 23:30:00+0000', tz='UTC')]


In [13]:
print("\n===== Date Range Summary =====")
for name, df in dfs.items():
    print(f"\n{name}:")
    if "datetime" in df.columns:
        print(f"  Start: {df['datetime'].min()}")
        print(f"  End:   {df['datetime'].max()}")
        print(f"  Total unique periods: {df['datetime'].nunique()}")
    else:
        print("  No 'datetime' column detected.")



===== Date Range Summary =====

INTRADAY:
  Start: 2024-01-01 00:00:00+00:00
  End:   2025-06-05 23:30:00+00:00
  Total unique periods: 25052

IMBALANCE:
  Start: 2024-01-01 00:00:00+00:00
  End:   2025-05-31 00:00:00+00:00
  Total unique periods: 24765

DEMAND:
  Start: 2024-01-01 00:00:00+00:00
  End:   2025-05-19 23:30:00+00:00
  Total unique periods: 24236


In [14]:
from pandas import Timestamp
# Define filter boundaries
dt_start = Timestamp("2024-01-01 00:00:00", tz="UTC")
dt_end   = Timestamp("2025-04-30 23:30:00", tz="UTC")

for name, df in dfs.items():
    if "datetime" in df.columns:
        before = len(df)
        # Filter the dataframe
        mask = (df["datetime"] >= dt_start) & (df["datetime"] <= dt_end)
        df_filtered = df[mask].copy()
        print(f"{name}: {before} → {len(df_filtered)} rows in date range {dt_start} to {dt_end}")
        # Overwrite the dictionary with the filtered df (if you want)
        dfs[name] = df_filtered
    else:
        print(f"{name}: No 'datetime' column, not filtered.")

# Optionally, save filtered versions (uncomment if needed)
for name, df in dfs.items():
     outpath = str(ROOT_DIR / f"data/processed/{name.lower()}_filtered.parquet")
     df.to_parquet(outpath)
     print(f"Saved {name} to {outpath}")


INTRADAY: 50108 → 46652 rows in date range 2024-01-01 00:00:00+00:00 to 2025-04-30 23:30:00+00:00
IMBALANCE: 24767 → 23326 rows in date range 2024-01-01 00:00:00+00:00 to 2025-04-30 23:30:00+00:00
DEMAND: 24238 → 23326 rows in date range 2024-01-01 00:00:00+00:00 to 2025-04-30 23:30:00+00:00
Saved INTRADAY to c:\Users\alexa\OneDrive\Desktop\GB-Power-Price-Diver-Spread-Radar\data\processed\intraday_filtered.parquet
Saved IMBALANCE to c:\Users\alexa\OneDrive\Desktop\GB-Power-Price-Diver-Spread-Radar\data\processed\imbalance_filtered.parquet
Saved DEMAND to c:\Users\alexa\OneDrive\Desktop\GB-Power-Price-Diver-Spread-Radar\data\processed\demand_filtered.parquet


In [15]:
intraday = dfs['INTRADAY']

# Check what columns you have:
print("Intraday columns:", list(intraday.columns))

# Defensive check: choose the right column names
price_col = next(c for c in intraday.columns if "price" in c.lower())
volume_col = next(c for c in intraday.columns if "volume" in c.lower())
datetime_col = "datetime"

# Compute VWAP per settlement period
vwap_df = (
    intraday
    .groupby(datetime_col)
    .apply(lambda g: np.average(g[price_col], weights=g[volume_col]) if g[volume_col].sum() > 0 else np.nan)
    .reset_index()
    .rename(columns={0: "vwap_price"})
)

# Merge the VWAP back to the dataframe (optional: keep only the first row per datetime)
intraday_vwap = intraday.drop_duplicates(subset=[datetime_col]).merge(vwap_df, on=datetime_col, how="left")

print(intraday_vwap[[datetime_col, price_col, volume_col, "vwap_price"]].head())

# Replace 'INTRADAY' in dfs with the new version (with VWAP)
dfs['INTRADAY'] = intraday_vwap

Intraday columns: ['Settlement Date', 'Settlement Period', 'Market Index Data Provider Id', 'Market Index Volume (MWh)', 'Market Index Price (£/MWh)', 'datetime']
                   datetime  Market Index Price (£/MWh)  \
0 2024-01-01 00:00:00+00:00                       36.51   
1 2024-01-01 00:30:00+00:00                       45.17   
2 2024-01-01 01:00:00+00:00                       57.43   
3 2024-01-01 01:30:00+00:00                       46.61   
4 2024-01-01 02:00:00+00:00                       61.98   

   Market Index Volume (MWh)  vwap_price  
0                     664.40       36.51  
1                     768.65       45.17  
2                     805.40       57.43  
3                     655.85       46.61  
4                     756.35       61.98  


  .apply(lambda g: np.average(g[price_col], weights=g[volume_col]) if g[volume_col].sum() > 0 else np.nan)


In [16]:
# 1. Confirm all datetimes are identical in content (not just length)
dt_sets = {name: set(df['datetime']) for name, df in dfs.items() if 'datetime' in df.columns}
reference = next(iter(dt_sets.values()))
for name, dt_set in dt_sets.items():
    only_in_ref = reference - dt_set
    only_in_this = dt_set - reference
    print(f"\n{name}:")
    print(f"  Dates only in reference (not in {name}): {len(only_in_ref)}")
    print(f"  Dates only in {name} (not in reference): {len(only_in_this)}")



INTRADAY:
  Dates only in reference (not in INTRADAY): 0
  Dates only in INTRADAY (not in reference): 0

IMBALANCE:
  Dates only in reference (not in IMBALANCE): 0
  Dates only in IMBALANCE (not in reference): 0

DEMAND:
  Dates only in reference (not in DEMAND): 0
  Dates only in DEMAND (not in reference): 0


In [18]:
# 2. Are datetime columns the same dtype?
for name, df in dfs.items():
    if 'datetime' in df.columns:
        print(f"{name} datetime dtype: {df['datetime'].dtype}")

# 3. NaN count per column, just to be sure
for name, df in dfs.items():
    print(f"\n{name} NaN summary:")
    print(df.isna().sum())


INTRADAY datetime dtype: datetime64[ns, UTC]
IMBALANCE datetime dtype: datetime64[ns, UTC]
DEMAND datetime dtype: datetime64[ns, UTC]

INTRADAY NaN summary:
Settlement Date                   0
Settlement Period                 0
Market Index Data Provider Id     0
Market Index Volume (MWh)         0
Market Index Price (£/MWh)        0
datetime                          0
vwap_price                       95
dtype: int64

IMBALANCE NaN summary:
Settlement Date               0
Settlement Period             0
System Sell Price(GBP/MWh)    0
System Buy Price(GBP/MWh)     0
Net Imbalance Volume(MWh)     0
datetime                      0
sbp                           0
ssp                           0
niv                           0
dtype: int64

DEMAND NaN summary:
SETTLEMENT_DATE              0
SETTLEMENT_PERIOD            0
ND                           0
TSD                          0
ENGLAND_WALES_DEMAND         0
EMBEDDED_WIND_GENERATION     0
EMBEDDED_WIND_CAPACITY       0
EMBEDDED_SOLAR_

In [19]:
# 5. Sort by datetime for merge safety
for name, df in dfs.items():
    if 'datetime' in df.columns:
        dfs[name] = df.sort_values('datetime').reset_index(drop=True)

In [21]:
# --- Merge on datetime ---
# For clarity, rename columns before merging so you know the source
intraday = dfs['INTRADAY'].copy()
imbalance = dfs['IMBALANCE'].copy()
demand = dfs['DEMAND'].copy()

# Rename price columns to be explicit
if "vwap_price" in intraday.columns:
    intraday = intraday.rename(columns={"vwap_price": "mip_price"})
if "forecast" in demand.columns:
    demand = demand.rename(columns={"forecast": "forecast_MW"})
if "actual" in demand.columns:
    demand = demand.rename(columns={"actual": "actual_MW"})

# Set index for all DataFrames
for df in [intraday, imbalance, demand]:
    if "datetime" in df.columns:
        df.set_index("datetime", inplace=True)

# Merge step by step (outer join to preserve all periods)
merged = (demand
         .join(intraday, how='outer', lsuffix='_demand', rsuffix='_intraday')
         .join(imbalance, how='outer', rsuffix='_imb'))

print(f"\nMerged dataset: {merged.shape[0]} rows, {merged.shape[1]} columns")
print("\nColumns:", list(merged.columns))
print("\nSample rows:")
print(merged.head(3))

# Save for downstream use (optional)
out_path = ROOT_DIR / "data/processed/final_merged.parquet"
merged.reset_index().to_parquet(out_path, index=False)
print(f"✅ Saved merged dataset to {out_path}")


Merged dataset: 23330 rows, 36 columns

Columns: ['SETTLEMENT_DATE', 'SETTLEMENT_PERIOD', 'ND', 'TSD', 'ENGLAND_WALES_DEMAND', 'EMBEDDED_WIND_GENERATION', 'EMBEDDED_WIND_CAPACITY', 'EMBEDDED_SOLAR_GENERATION', 'EMBEDDED_SOLAR_CAPACITY', 'NON_BM_STOR', 'PUMP_STORAGE_PUMPING', 'SCOTTISH_TRANSFER', 'IFA_FLOW', 'IFA2_FLOW', 'BRITNED_FLOW', 'MOYLE_FLOW', 'EAST_WEST_FLOW', 'NEMO_FLOW', 'NSL_FLOW', 'ELECLINK_FLOW', 'VIKING_FLOW', 'GREENLINK_FLOW', 'Settlement Date', 'Settlement Period', 'Market Index Data Provider Id', 'Market Index Volume (MWh)', 'Market Index Price (£/MWh)', 'mip_price', 'Settlement Date_imb', 'Settlement Period_imb', 'System Sell Price(GBP/MWh)', 'System Buy Price(GBP/MWh)', 'Net Imbalance Volume(MWh)', 'sbp', 'ssp', 'niv']

Sample rows:
                          SETTLEMENT_DATE  SETTLEMENT_PERIOD     ND    TSD  \
datetime                                                                     
2024-01-01 00:00:00+00:00      2024-01-01                  1  21783  23466   
2024

In [22]:
print("==== FINAL MERGED DATASET CHECKS ====\n")

print("1. File shape:")
print(f"Rows: {merged.shape[0]}, Columns: {merged.shape[1]}")

print("\n2. Columns and dtypes:")
print(merged.dtypes)

print("\n3. First and last 3 rows:")
print(merged.head(3))
print(merged.tail(3))

print("\n4. Date range:")
print("Start:", merged.index.min())
print("End:  ", merged.index.max())

print("\n5. NaN count per column:")
print(merged.isna().sum())

print("\n6. Unique datetimes:", merged.index.nunique())


==== FINAL MERGED DATASET CHECKS ====

1. File shape:
Rows: 23330, Columns: 36

2. Columns and dtypes:
SETTLEMENT_DATE                  datetime64[ns]
SETTLEMENT_PERIOD                         int64
ND                                        int64
TSD                                       int64
ENGLAND_WALES_DEMAND                      int64
EMBEDDED_WIND_GENERATION                  int64
EMBEDDED_WIND_CAPACITY                    int64
EMBEDDED_SOLAR_GENERATION                 int64
EMBEDDED_SOLAR_CAPACITY                   int64
NON_BM_STOR                               int64
PUMP_STORAGE_PUMPING                      int64
SCOTTISH_TRANSFER                         int64
IFA_FLOW                                  int64
IFA2_FLOW                                 int64
BRITNED_FLOW                              int64
MOYLE_FLOW                                int64
EAST_WEST_FLOW                            int64
NEMO_FLOW                                 int64
NSL_FLOW                         